From 93d81d1aca26e64a75d06a85f7e128b5f49053e7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 14 Jan 2009 15:32:51 +0100
Subject: mutex: small cleanup

Remove a local variable by combining an assingment and test in one.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4f45d4b658e..357c6d221ef 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -129,7 +129,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 {
 	struct task_struct *task = current;
 	struct mutex_waiter waiter;
-	unsigned int old_val;
 	unsigned long flags;
 
 	spin_lock_mutex(&lock->wait_lock, flags);
@@ -142,8 +141,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	list_add_tail(&waiter.list, &lock->wait_list);
 	waiter.task = task;
 
-	old_val = atomic_xchg(&lock->count, -1);
-	if (old_val == 1)
+	if (atomic_xchg(&lock->count, -1) == 1)
 		goto done;
 
 	lock_contended(&lock->dep_map, ip);
@@ -158,8 +156,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * that when we release the lock, we properly wake up the
 		 * other waiters:
 		 */
-		old_val = atomic_xchg(&lock->count, -1);
-		if (old_val == 1)
+		if (atomic_xchg(&lock->count, -1) == 1)
 			break;
 
 		/*
-- 
cgit v1.2.3


From 41719b03091911028116155deddc5eedf8c45e37 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 14 Jan 2009 15:36:26 +0100
Subject: mutex: preemption fixes

The problem is that dropping the spinlock right before schedule is a voluntary
preemption point and can cause a schedule, right after which we schedule again.

Fix this inefficiency by keeping preemption disabled until we schedule, do this
by explicity disabling preemption and providing a schedule() variant that
assumes preemption is already disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  1 +
 kernel/mutex.c        |  5 ++++-
 kernel/sched.c        | 10 +++++++---
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b81a1f..9f0b372cfa6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -328,6 +328,7 @@ extern signed long schedule_timeout(signed long timeout);
 extern signed long schedule_timeout_interruptible(signed long timeout);
 extern signed long schedule_timeout_killable(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
+asmlinkage void __schedule(void);
 asmlinkage void schedule(void);
 
 struct nsproxy;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 357c6d221ef..524ffc33dc0 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,6 +131,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	struct mutex_waiter waiter;
 	unsigned long flags;
 
+	preempt_disable();
 	spin_lock_mutex(&lock->wait_lock, flags);
 
 	debug_mutex_lock_common(lock, &waiter);
@@ -170,13 +171,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 			spin_unlock_mutex(&lock->wait_lock, flags);
 
 			debug_mutex_free_waiter(&waiter);
+			preempt_enable();
 			return -EINTR;
 		}
 		__set_task_state(task, state);
 
 		/* didnt get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
-		schedule();
+		__schedule();
 		spin_lock_mutex(&lock->wait_lock, flags);
 	}
 
@@ -193,6 +195,7 @@ done:
 	spin_unlock_mutex(&lock->wait_lock, flags);
 
 	debug_mutex_free_waiter(&waiter);
+	preempt_enable();
 
 	return 0;
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 8be2c13b50d..b001c133c35 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4538,15 +4538,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched schedule(void)
+asmlinkage void __sched __schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
-need_resched:
-	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -4603,7 +4601,13 @@ need_resched_nonpreemptible:
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
+}
 
+asmlinkage void __sched schedule(void)
+{
+need_resched:
+	preempt_disable();
+	__schedule();
 	preempt_enable_no_resched();
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
 		goto need_resched;
-- 
cgit v1.2.3


From 0d66bf6d3514b35eb6897629059443132992dbd7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 12 Jan 2009 14:01:47 +0100
Subject: mutex: implement adaptive spinning

Change mutex contention behaviour such that it will sometimes busy wait on
acquisition - moving its behaviour closer to that of spinlocks.

This concept got ported to mainline from the -rt tree, where it was originally
implemented for rtmutexes by Steven Rostedt, based on work by Gregory Haskins.

Testing with Ingo's test-mutex application (http://lkml.org/lkml/2006/1/8/50)
gave a 345% boost for VFS scalability on my testbox:

 # ./test-mutex-shm V 16 10 | grep "^avg ops"
 avg ops/sec:               296604

 # ./test-mutex-shm V 16 10 | grep "^avg ops"
 avg ops/sec:               85870

The key criteria for the busy wait is that the lock owner has to be running on
a (different) cpu. The idea is that as long as the owner is running, there is a
fair chance it'll release the lock soon, and thus we'll be better off spinning
instead of blocking/scheduling.

Since regular mutexes (as opposed to rtmutexes) do not atomically track the
owner, we add the owner in a non-atomic fashion and deal with the races in
the slowpath.

Furthermore, to ease the testing of the performance impact of this new code,
there is means to disable this behaviour runtime (without having to reboot
the system), when scheduler debugging is enabled (CONFIG_SCHED_DEBUG=y),
by issuing the following command:

 # echo NO_OWNER_SPIN > /debug/sched_features

This command re-enables spinning again (this is also the default):

 # echo OWNER_SPIN > /debug/sched_features

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/mutex.h   |   5 ++-
 include/linux/sched.h   |   1 +
 kernel/mutex-debug.c    |   9 +---
 kernel/mutex-debug.h    |  18 ++++----
 kernel/mutex.c          | 115 +++++++++++++++++++++++++++++++++++++++++++-----
 kernel/mutex.h          |  22 ++++++++-
 kernel/sched.c          |  61 +++++++++++++++++++++++++
 kernel/sched_features.h |   1 +
 8 files changed, 201 insertions(+), 31 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 7a0e5c4f807..3069ec7e0ab 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -50,8 +50,10 @@ struct mutex {
 	atomic_t		count;
 	spinlock_t		wait_lock;
 	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_MUTEXES
+#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
 	struct thread_info	*owner;
+#endif
+#ifdef CONFIG_DEBUG_MUTEXES
 	const char 		*name;
 	void			*magic;
 #endif
@@ -68,7 +70,6 @@ struct mutex_waiter {
 	struct list_head	list;
 	struct task_struct	*task;
 #ifdef CONFIG_DEBUG_MUTEXES
-	struct mutex		*lock;
 	void			*magic;
 #endif
 };
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9f0b372cfa6..c34b137cd1e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -330,6 +330,7 @@ extern signed long schedule_timeout_killable(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
 asmlinkage void __schedule(void);
 asmlinkage void schedule(void);
+extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
 
 struct nsproxy;
 struct user_namespace;
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 1d94160eb53..50d022e5a56 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -26,11 +26,6 @@
 /*
  * Must be called with lock->wait_lock held.
  */
-void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
-{
-	lock->owner = new_owner;
-}
-
 void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
 {
 	memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
@@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 
 	/* Mark the current thread as blocked on the lock: */
 	ti->task->blocked_on = waiter;
-	waiter->lock = lock;
 }
 
 void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
@@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock)
 	DEBUG_LOCKS_WARN_ON(lock->magic != lock);
 	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
-	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+	mutex_clear_owner(lock);
 }
 
 void debug_mutex_init(struct mutex *lock, const char *name,
@@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name,
 	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
 	lockdep_init_map(&lock->dep_map, name, key, 0);
 #endif
-	lock->owner = NULL;
 	lock->magic = lock;
 }
 
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index babfbdfc534..6b2d735846a 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -13,14 +13,6 @@
 /*
  * This must be called with lock->wait_lock held.
  */
-extern void
-debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
-
-static inline void debug_mutex_clear_owner(struct mutex *lock)
-{
-	lock->owner = NULL;
-}
-
 extern void debug_mutex_lock_common(struct mutex *lock,
 				    struct mutex_waiter *waiter);
 extern void debug_mutex_wake_waiter(struct mutex *lock,
@@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock);
 extern void debug_mutex_init(struct mutex *lock, const char *name,
 			     struct lock_class_key *key);
 
+static inline void mutex_set_owner(struct mutex *lock)
+{
+	lock->owner = current_thread_info();
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+	lock->owner = NULL;
+}
+
 #define spin_lock_mutex(lock, flags)			\
 	do {						\
 		struct mutex *l = container_of(lock, struct mutex, wait_lock); \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 524ffc33dc0..ff42e975590 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -10,6 +10,11 @@
  * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
  * David Howells for suggestions and improvements.
  *
+ *  - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
+ *    from the -rt tree, where it was originally implemented for rtmutexes
+ *    by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
+ *    and Sven Dietrich.
+ *
  * Also see Documentation/mutex-design.txt.
  */
 #include <linux/mutex.h>
@@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
 	atomic_set(&lock->count, 1);
 	spin_lock_init(&lock->wait_lock);
 	INIT_LIST_HEAD(&lock->wait_list);
+	mutex_clear_owner(lock);
 
 	debug_mutex_init(lock, name, key);
 }
@@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock)
 	 * 'unlocked' into 'locked' state.
 	 */
 	__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+	mutex_set_owner(lock);
 }
 
 EXPORT_SYMBOL(mutex_lock);
@@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock)
 	 * The unlocking fastpath is the 0->1 transition from 'locked'
 	 * into 'unlocked' state:
 	 */
+#ifndef CONFIG_DEBUG_MUTEXES
+	/*
+	 * When debugging is enabled we must not clear the owner before time,
+	 * the slow path will always be taken, and that clears the owner field
+	 * after verifying that it was indeed current.
+	 */
+	mutex_clear_owner(lock);
+#endif
 	__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
 }
 
@@ -132,10 +147,71 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	unsigned long flags;
 
 	preempt_disable();
+	mutex_acquire(&lock->dep_map, subclass, 0, ip);
+#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
+	/*
+	 * Optimistic spinning.
+	 *
+	 * We try to spin for acquisition when we find that there are no
+	 * pending waiters and the lock owner is currently running on a
+	 * (different) CPU.
+	 *
+	 * The rationale is that if the lock owner is running, it is likely to
+	 * release the lock soon.
+	 *
+	 * Since this needs the lock owner, and this mutex implementation
+	 * doesn't track the owner atomically in the lock field, we need to
+	 * track it non-atomically.
+	 *
+	 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
+	 * to serialize everything.
+	 */
+
+	for (;;) {
+		struct thread_info *owner;
+
+		/*
+		 * If there are pending waiters, join them.
+		 */
+		if (!list_empty(&lock->wait_list))
+			break;
+
+		/*
+		 * If there's an owner, wait for it to either
+		 * release the lock or go to sleep.
+		 */
+		owner = ACCESS_ONCE(lock->owner);
+		if (owner && !mutex_spin_on_owner(lock, owner))
+			break;
+
+		/*
+		 * When there's no owner, we might have preempted between the
+		 * owner acquiring the lock and setting the owner field. If
+		 * we're an RT task that will live-lock because we won't let
+		 * the owner complete.
+		 */
+		if (!owner && (need_resched() || rt_task(task)))
+			break;
+
+		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+			lock_acquired(&lock->dep_map, ip);
+			mutex_set_owner(lock);
+			preempt_enable();
+			return 0;
+		}
+
+		/*
+		 * The cpu_relax() call is a compiler barrier which forces
+		 * everything in this loop to be re-loaded. We don't need
+		 * memory barriers as we'll eventually observe the right
+		 * values at the cost of a few extra spins.
+		 */
+		cpu_relax();
+	}
+#endif
 	spin_lock_mutex(&lock->wait_lock, flags);
 
 	debug_mutex_lock_common(lock, &waiter);
-	mutex_acquire(&lock->dep_map, subclass, 0, ip);
 	debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
 
 	/* add waiting tasks to the end of the waitqueue (FIFO): */
@@ -185,8 +261,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 done:
 	lock_acquired(&lock->dep_map, ip);
 	/* got the lock - rejoice! */
-	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
-	debug_mutex_set_owner(lock, task_thread_info(task));
+	mutex_remove_waiter(lock, &waiter, current_thread_info());
+	mutex_set_owner(lock);
 
 	/* set it to 0 if there are no waiters left: */
 	if (likely(list_empty(&lock->wait_list)))
@@ -222,7 +298,8 @@ int __sched
 mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
 {
 	might_sleep();
-	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_);
+	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
+				   subclass, _RET_IP_);
 }
 
 EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -260,8 +337,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
 		wake_up_process(waiter->task);
 	}
 
-	debug_mutex_clear_owner(lock);
-
 	spin_unlock_mutex(&lock->wait_lock, flags);
 }
 
@@ -298,18 +373,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);
  */
 int __sched mutex_lock_interruptible(struct mutex *lock)
 {
+	int ret;
+
 	might_sleep();
-	return __mutex_fastpath_lock_retval
+	ret =  __mutex_fastpath_lock_retval
 			(&lock->count, __mutex_lock_interruptible_slowpath);
+	if (!ret)
+		mutex_set_owner(lock);
+
+	return ret;
 }
 
 EXPORT_SYMBOL(mutex_lock_interruptible);
 
 int __sched mutex_lock_killable(struct mutex *lock)
 {
+	int ret;
+
 	might_sleep();
-	return __mutex_fastpath_lock_retval
+	ret = __mutex_fastpath_lock_retval
 			(&lock->count, __mutex_lock_killable_slowpath);
+	if (!ret)
+		mutex_set_owner(lock);
+
+	return ret;
 }
 EXPORT_SYMBOL(mutex_lock_killable);
 
@@ -352,9 +439,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
 
 	prev = atomic_xchg(&lock->count, -1);
 	if (likely(prev == 1)) {
-		debug_mutex_set_owner(lock, current_thread_info());
+		mutex_set_owner(lock);
 		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
 	}
+
 	/* Set it back to 0 if there are no waiters: */
 	if (likely(list_empty(&lock->wait_list)))
 		atomic_set(&lock->count, 0);
@@ -380,8 +468,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
  */
 int __sched mutex_trylock(struct mutex *lock)
 {
-	return __mutex_fastpath_trylock(&lock->count,
-					__mutex_trylock_slowpath);
+	int ret;
+
+	ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
+	if (ret)
+		mutex_set_owner(lock);
+
+	return ret;
 }
 
 EXPORT_SYMBOL(mutex_trylock);
diff --git a/kernel/mutex.h b/kernel/mutex.h
index a075dafbb29..67578ca48f9 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -16,8 +16,26 @@
 #define mutex_remove_waiter(lock, waiter, ti) \
 		__list_del((waiter)->list.prev, (waiter)->list.next)
 
-#define debug_mutex_set_owner(lock, new_owner)		do { } while (0)
-#define debug_mutex_clear_owner(lock)			do { } while (0)
+#ifdef CONFIG_SMP
+static inline void mutex_set_owner(struct mutex *lock)
+{
+	lock->owner = current_thread_info();
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+	lock->owner = NULL;
+}
+#else
+static inline void mutex_set_owner(struct mutex *lock)
+{
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+}
+#endif
+
 #define debug_mutex_wake_waiter(lock, waiter)		do { } while (0)
 #define debug_mutex_free_waiter(waiter)			do { } while (0)
 #define debug_mutex_add_waiter(lock, waiter, ti)	do { } while (0)
diff --git a/kernel/sched.c b/kernel/sched.c
index b001c133c35..589e7308c61 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4614,6 +4614,67 @@ need_resched:
 }
 EXPORT_SYMBOL(schedule);
 
+#ifdef CONFIG_SMP
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+{
+	unsigned int cpu;
+	struct rq *rq;
+
+	if (!sched_feat(OWNER_SPIN))
+		return 0;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * Need to access the cpu field knowing that
+	 * DEBUG_PAGEALLOC could have unmapped it if
+	 * the mutex owner just released it and exited.
+	 */
+	if (probe_kernel_address(&owner->cpu, cpu))
+		goto out;
+#else
+	cpu = owner->cpu;
+#endif
+
+	/*
+	 * Even if the access succeeded (likely case),
+	 * the cpu field may no longer be valid.
+	 */
+	if (cpu >= nr_cpumask_bits)
+		goto out;
+
+	/*
+	 * We need to validate that we can do a
+	 * get_cpu() and that we have the percpu area.
+	 */
+	if (!cpu_online(cpu))
+		goto out;
+
+	rq = cpu_rq(cpu);
+
+	for (;;) {
+		/*
+		 * Owner changed, break to re-assess state.
+		 */
+		if (lock->owner != owner)
+			break;
+
+		/*
+		 * Is that owner really running on that cpu?
+		 */
+		if (task_thread_info(rq->curr) != owner || need_resched())
+			return 0;
+
+		cpu_relax();
+	}
+out:
+	return 1;
+}
+#endif
+
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index da5d93b5d2c..07bc02e99ab 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -13,3 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
 SCHED_FEAT(LAST_BUDDY, 1)
+SCHED_FEAT(OWNER_SPIN, 1)
-- 
cgit v1.2.3


From ac6e60ee405aa3bf718f7fe4cb01b7ee0b8877ec Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 14 Jan 2009 17:29:31 +0100
Subject: mutex: adaptive spinnning, performance tweaks

Spin more agressively. This is less fair but also markedly faster.

The numbers:

 * dbench 50 (higher is better):
  spin        1282MB/s
  v10         548MB/s
  v10 no wait 1868MB/s

 * 4k creates (numbers in files/second higher is better):
  spin        avg 200.60 median 193.20 std 19.71 high 305.93 low 186.82
  v10         avg 180.94 median 175.28 std 13.91 high 229.31 low 168.73
  v10 no wait avg 232.18 median 222.38 std 22.91 high 314.66 low 209.12

 * File stats (numbers in seconds, lower is better):
  spin        2.27s
  v10         5.1s
  v10 no wait 1.6s

( The source changes are smaller than they look, I just moved the
  need_resched checks in __mutex_lock_common after the cmpxchg. )

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/kernel/mutex.c b/kernel/mutex.c
index ff42e975590..5d79781394a 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -170,12 +170,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	for (;;) {
 		struct thread_info *owner;
 
-		/*
-		 * If there are pending waiters, join them.
-		 */
-		if (!list_empty(&lock->wait_list))
-			break;
-
 		/*
 		 * If there's an owner, wait for it to either
 		 * release the lock or go to sleep.
@@ -184,6 +178,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		if (owner && !mutex_spin_on_owner(lock, owner))
 			break;
 
+		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+			lock_acquired(&lock->dep_map, ip);
+			mutex_set_owner(lock);
+			preempt_enable();
+			return 0;
+		}
+
 		/*
 		 * When there's no owner, we might have preempted between the
 		 * owner acquiring the lock and setting the owner field. If
@@ -193,13 +194,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		if (!owner && (need_resched() || rt_task(task)))
 			break;
 
-		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
-			lock_acquired(&lock->dep_map, ip);
-			mutex_set_owner(lock);
-			preempt_enable();
-			return 0;
-		}
-
 		/*
 		 * The cpu_relax() call is a compiler barrier which forces
 		 * everything in this loop to be re-loaded. We don't need
-- 
cgit v1.2.3


From cf47b8f3d96b0b8b10b557444a28b3ca4024ff82 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 14 Jan 2009 13:40:46 -0500
Subject: Btrfs: stop spinning on mutex_trylock and let the adaptive code spin
 for us

Mutexes now spin internally and the btrfs spin is no longer required for
performance.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/btrfs/locking.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db..40ba8e8962f 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -37,16 +37,6 @@
 
 int btrfs_tree_lock(struct extent_buffer *eb)
 {
-	int i;
-
-	if (mutex_trylock(&eb->mutex))
-		return 0;
-	for (i = 0; i < 512; i++) {
-		cpu_relax();
-		if (mutex_trylock(&eb->mutex))
-			return 0;
-	}
-	cpu_relax();
 	mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
 	return 0;
 }
-- 
cgit v1.2.3


From 074ca44283bf031678e67af7d82668bb03c55a55 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@gmail.com>
Date: Fri, 6 Feb 2009 16:23:37 -0500
Subject: ext4: Remove stale block allocator references from ext4.h

Remove some leftovers from when the old block allocator was removed
(c2ea3fde).  ext4_sb_info is now a bit lighter.  Also remove a dangling
read_block_bitmap() prototype.

Signed-off-by: Mike Snitzer <snitzer@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 10 ----------
 fs/ext4/ext4_i.h  |  3 ---
 fs/ext4/ext4_sb.h |  4 ----
 fs/ext4/mballoc.h |  1 -
 4 files changed, 18 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6083bb38057..0db01421da3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -32,14 +32,6 @@
  */
 #undef EXT4FS_DEBUG
 
-/*
- * Define EXT4_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT4_DEFAULT_RESERVE_BLOCKS	8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT4_MAX_RESERVE_BLOCKS		1027
-#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
-
 /*
  * Debug code
  */
@@ -54,8 +46,6 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
-#define EXT4_MULTIBLOCK_ALLOCATOR	1
-
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE		1
 /* blocks already reserved */
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c..2d516c0a22a 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned int ext4_group_t;
 
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
-
 /*
  * storage for cached extent
  */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a04..e318f486cc2 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -65,10 +65,6 @@ struct ext4_sb_info {
 	struct blockgroup_lock s_blockgroup_lock;
 	struct proc_dir_entry *s_proc;
 
-	/* root of the per fs reservation window tree */
-	spinlock_t s_rsv_window_lock;
-	struct rb_root s_rsv_window_root;
-
 	/* Journaling */
 	struct inode *s_journal_inode;
 	struct journal_s *s_journal;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 10a2921baf1..7acf5ef2453 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -247,7 +247,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
 
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
-struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 					struct ext4_free_extent *fex)
 {
-- 
cgit v1.2.3


From e187c6588d6ef3169db53c389b3de9dfde3b16cc Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 6 Feb 2009 16:23:37 -0500
Subject: ext4: remove call to ext4_group_desc() in
 ext4_group_used_meta_blocks()

The static function ext4_group_used_meta_blocks() only has one caller,
who already has access to the block group's group descriptor.  So it's
better to have ext4_init_block_bitmap() pass the group descriptor to
ext4_group_used_meta_blocks(), so it doesn't need to call
ext4_group_desc().  Previously this function did not check if
ext4_group_desc() returned NULL due to an error, potentially causing a
kernel OOPS report.  This avoids the issue entirely.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38f40d55899..b37b1287558 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 }
 
 static int ext4_group_used_meta_blocks(struct super_block *sb,
-				ext4_group_t block_group)
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
 {
 	ext4_fsblk_t tmp;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
 	int used_blocks = sbi->s_itb_per_group + 2;
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-		struct ext4_group_desc *gdp;
-		struct buffer_head *bh;
-
-		gdp = ext4_get_group_desc(sb, block_group, &bh);
 		if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
 					block_group))
 			used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
 	}
-	return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
 
 
-- 
cgit v1.2.3


From 6f2b9b9a9d750a9175dc79c74bfed5add840983c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 29 Jan 2009 16:03:20 +0100
Subject: timer: implement lockdep deadlock detection

This modifies the timer code in a way to allow lockdep to detect
deadlocks resulting from a lock being taken in the timer function
as well as around the del_timer_sync() call.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/linux/timer.h | 93 ++++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/timer.c        | 68 +++++++++++++++++++++++++++++++------
 2 files changed, 141 insertions(+), 20 deletions(-)

diff --git a/include/linux/timer.h b/include/linux/timer.h
index daf9685b861..51774eb87cc 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -5,6 +5,7 @@
 #include <linux/ktime.h>
 #include <linux/stddef.h>
 #include <linux/debugobjects.h>
+#include <linux/stringify.h>
 
 struct tvec_base;
 
@@ -21,52 +22,126 @@ struct timer_list {
 	char start_comm[16];
 	int start_pid;
 #endif
+#ifdef CONFIG_LOCKDEP
+	struct lockdep_map lockdep_map;
+#endif
 };
 
 extern struct tvec_base boot_tvec_bases;
 
+#ifdef CONFIG_LOCKDEP
+/*
+ * NB: because we have to copy the lockdep_map, setting the lockdep_map key
+ * (second argument) here is required, otherwise it could be initialised to
+ * the copy of the lockdep_map later! We use the pointer to and the string
+ * "<file>:<line>" as the key resp. the name of the lockdep_map.
+ */
+#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)				\
+	.lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn),
+#else
+#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
+#endif
+
 #define TIMER_INITIALIZER(_function, _expires, _data) {		\
 		.entry = { .prev = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
 		.base = &boot_tvec_bases,			\
+		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
+			__FILE__ ":" __stringify(__LINE__))	\
 	}
 
 #define DEFINE_TIMER(_name, _function, _expires, _data)		\
 	struct timer_list _name =				\
 		TIMER_INITIALIZER(_function, _expires, _data)
 
-void init_timer(struct timer_list *timer);
-void init_timer_deferrable(struct timer_list *timer);
+void init_timer_key(struct timer_list *timer,
+		    const char *name,
+		    struct lock_class_key *key);
+void init_timer_deferrable_key(struct timer_list *timer,
+			       const char *name,
+			       struct lock_class_key *key);
+
+#ifdef CONFIG_LOCKDEP
+#define init_timer(timer)						\
+	do {								\
+		static struct lock_class_key __key;			\
+		init_timer_key((timer), #timer, &__key);		\
+	} while (0)
+
+#define init_timer_deferrable(timer)					\
+	do {								\
+		static struct lock_class_key __key;			\
+		init_timer_deferrable_key((timer), #timer, &__key);	\
+	} while (0)
+
+#define init_timer_on_stack(timer)					\
+	do {								\
+		static struct lock_class_key __key;			\
+		init_timer_on_stack_key((timer), #timer, &__key);	\
+	} while (0)
+
+#define setup_timer(timer, fn, data)					\
+	do {								\
+		static struct lock_class_key __key;			\
+		setup_timer_key((timer), #timer, &__key, (fn), (data));\
+	} while (0)
+
+#define setup_timer_on_stack(timer, fn, data)				\
+	do {								\
+		static struct lock_class_key __key;			\
+		setup_timer_on_stack_key((timer), #timer, &__key,	\
+					 (fn), (data));			\
+	} while (0)
+#else
+#define init_timer(timer)\
+	init_timer_key((timer), NULL, NULL)
+#define init_timer_deferrable(timer)\
+	init_timer_deferrable_key((timer), NULL, NULL)
+#define init_timer_on_stack(timer)\
+	init_timer_on_stack_key((timer), NULL, NULL)
+#define setup_timer(timer, fn, data)\
+	setup_timer_key((timer), NULL, NULL, (fn), (data))
+#define setup_timer_on_stack(timer, fn, data)\
+	setup_timer_on_stack_key((timer), NULL, NULL, (fn), (data))
+#endif
 
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
-extern void init_timer_on_stack(struct timer_list *timer);
+extern void init_timer_on_stack_key(struct timer_list *timer,
+				    const char *name,
+				    struct lock_class_key *key);
 extern void destroy_timer_on_stack(struct timer_list *timer);
 #else
 static inline void destroy_timer_on_stack(struct timer_list *timer) { }
-static inline void init_timer_on_stack(struct timer_list *timer)
+static inline void init_timer_on_stack_key(struct timer_list *timer,
+					   const char *name,
+					   struct lock_class_key *key)
 {
-	init_timer(timer);
+	init_timer_key(timer, name, key);
 }
 #endif
 
-static inline void setup_timer(struct timer_list * timer,
+static inline void setup_timer_key(struct timer_list * timer,
+				const char *name,
+				struct lock_class_key *key,
 				void (*function)(unsigned long),
 				unsigned long data)
 {
 	timer->function = function;
 	timer->data = data;
-	init_timer(timer);
+	init_timer_key(timer, name, key);
 }
 
-static inline void setup_timer_on_stack(struct timer_list *timer,
+static inline void setup_timer_on_stack_key(struct timer_list *timer,
+					const char *name,
+					struct lock_class_key *key,
 					void (*function)(unsigned long),
 					unsigned long data)
 {
 	timer->function = function;
 	timer->data = data;
-	init_timer_on_stack(timer);
+	init_timer_on_stack_key(timer, name, key);
 }
 
 /**
diff --git a/kernel/timer.c b/kernel/timer.c
index 13dd64fe143..ef1c385bc57 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -491,14 +491,18 @@ static inline void debug_timer_free(struct timer_list *timer)
 	debug_object_free(timer, &timer_debug_descr);
 }
 
-static void __init_timer(struct timer_list *timer);
+static void __init_timer(struct timer_list *timer,
+			 const char *name,
+			 struct lock_class_key *key);
 
-void init_timer_on_stack(struct timer_list *timer)
+void init_timer_on_stack_key(struct timer_list *timer,
+			     const char *name,
+			     struct lock_class_key *key)
 {
 	debug_object_init_on_stack(timer, &timer_debug_descr);
-	__init_timer(timer);
+	__init_timer(timer, name, key);
 }
-EXPORT_SYMBOL_GPL(init_timer_on_stack);
+EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
 
 void destroy_timer_on_stack(struct timer_list *timer)
 {
@@ -512,7 +516,9 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
 static inline void debug_timer_deactivate(struct timer_list *timer) { }
 #endif
 
-static void __init_timer(struct timer_list *timer)
+static void __init_timer(struct timer_list *timer,
+			 const char *name,
+			 struct lock_class_key *key)
 {
 	timer->entry.next = NULL;
 	timer->base = __raw_get_cpu_var(tvec_bases);
@@ -521,6 +527,7 @@ static void __init_timer(struct timer_list *timer)
 	timer->start_pid = -1;
 	memset(timer->start_comm, 0, TASK_COMM_LEN);
 #endif
+	lockdep_init_map(&timer->lockdep_map, name, key, 0);
 }
 
 /**
@@ -530,19 +537,23 @@ static void __init_timer(struct timer_list *timer)
  * init_timer() must be done to a timer prior calling *any* of the
  * other timer functions.
  */
-void init_timer(struct timer_list *timer)
+void init_timer_key(struct timer_list *timer,
+		    const char *name,
+		    struct lock_class_key *key)
 {
 	debug_timer_init(timer);
-	__init_timer(timer);
+	__init_timer(timer, name, key);
 }
-EXPORT_SYMBOL(init_timer);
+EXPORT_SYMBOL(init_timer_key);
 
-void init_timer_deferrable(struct timer_list *timer)
+void init_timer_deferrable_key(struct timer_list *timer,
+			       const char *name,
+			       struct lock_class_key *key)
 {
-	init_timer(timer);
+	init_timer_key(timer, name, key);
 	timer_set_deferrable(timer);
 }
-EXPORT_SYMBOL(init_timer_deferrable);
+EXPORT_SYMBOL(init_timer_deferrable_key);
 
 static inline void detach_timer(struct timer_list *timer,
 				int clear_pending)
@@ -789,6 +800,15 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  */
 int del_timer_sync(struct timer_list *timer)
 {
+#ifdef CONFIG_LOCKDEP
+	unsigned long flags;
+
+	local_irq_save(flags);
+	lock_map_acquire(&timer->lockdep_map);
+	lock_map_release(&timer->lockdep_map);
+	local_irq_restore(flags);
+#endif
+
 	for (;;) {
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
@@ -861,10 +881,36 @@ static inline void __run_timers(struct tvec_base *base)
 
 			set_running_timer(base, timer);
 			detach_timer(timer, 1);
+
 			spin_unlock_irq(&base->lock);
 			{
 				int preempt_count = preempt_count();
+
+#ifdef CONFIG_LOCKDEP
+				/*
+				 * It is permissible to free the timer from
+				 * inside the function that is called from
+				 * it, this we need to take into account for
+				 * lockdep too. To avoid bogus "held lock
+				 * freed" warnings as well as problems when
+				 * looking into timer->lockdep_map, make a
+				 * copy and use that here.
+				 */
+				struct lockdep_map lockdep_map =
+					timer->lockdep_map;
+#endif
+				/*
+				 * Couple the lock chain with the lock chain at
+				 * del_timer_sync() by acquiring the lock_map
+				 * around the fn() call here and in
+				 * del_timer_sync().
+				 */
+				lock_map_acquire(&lockdep_map);
+
 				fn(data);
+
+				lock_map_release(&lockdep_map);
+
 				if (preempt_count != preempt_count()) {
 					printk(KERN_ERR "huh, entered %p "
 					       "with preempt_count %08x, exited"
-- 
cgit v1.2.3


From cf40bd16fdad42c053040bcd3988f5fdedbb6c57 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 21 Jan 2009 08:12:39 +0100
Subject: lockdep: annotate reclaim context (__GFP_NOFS)

Here is another version, with the incremental patch rolled up, and
added reclaim context annotation to kswapd, and allocation tracing
to slab allocators (which may only ever reach the page allocator
in rare cases, so it is good to put annotations here too).

Haven't tested this version as such, but it should be getting closer
to merge worthy ;)

--
After noticing some code in mm/filemap.c accidentally perform a __GFP_FS
allocation when it should not have been, I thought it might be a good idea to
try to catch this kind of thing with lockdep.

I coded up a little idea that seems to work. Unfortunately the system has to
actually be in __GFP_FS page reclaim, then take the lock, before it will mark
it. But at least that might still be some orders of magnitude more common
(and more debuggable) than an actual deadlock condition, so we have some
improvement I hope (the concept is no less complete than discovery of a lock's
interrupt contexts).

I guess we could even do the same thing with __GFP_IO (normal reclaim), and
even GFP_NOIO locks too... but filesystems will have the most locks and fiddly
code paths, so let's start there and see how it goes.

It *seems* to work. I did a quick test.

=================================
[ INFO: inconsistent lock state ]
2.6.28-rc6-00007-ged31348-dirty #26
---------------------------------
inconsistent {in-reclaim-W} -> {ov-reclaim-W} usage.
modprobe/8526 [HC0[0]:SC0[0]:HE1:SE1] takes:
 (testlock){--..}, at: [<ffffffffa0020055>] brd_init+0x55/0x216 [brd]
{in-reclaim-W} state was registered at:
  [<ffffffff80267bdb>] __lock_acquire+0x75b/0x1a60
  [<ffffffff80268f71>] lock_acquire+0x91/0xc0
  [<ffffffff8070f0e1>] mutex_lock_nested+0xb1/0x310
  [<ffffffffa002002b>] brd_init+0x2b/0x216 [brd]
  [<ffffffff8020903b>] _stext+0x3b/0x170
  [<ffffffff80272ebf>] sys_init_module+0xaf/0x1e0
  [<ffffffff8020c3fb>] system_call_fastpath+0x16/0x1b
  [<ffffffffffffffff>] 0xffffffffffffffff
irq event stamp: 3929
hardirqs last  enabled at (3929): [<ffffffff8070f2b5>] mutex_lock_nested+0x285/0x310
hardirqs last disabled at (3928): [<ffffffff8070f089>] mutex_lock_nested+0x59/0x310
softirqs last  enabled at (3732): [<ffffffff8061f623>] sk_filter+0x83/0xe0
softirqs last disabled at (3730): [<ffffffff8061f5b6>] sk_filter+0x16/0xe0

other info that might help us debug this:
1 lock held by modprobe/8526:
 #0:  (testlock){--..}, at: [<ffffffffa0020055>] brd_init+0x55/0x216 [brd]

stack backtrace:
Pid: 8526, comm: modprobe Not tainted 2.6.28-rc6-00007-ged31348-dirty #26
Call Trace:
 [<ffffffff80265483>] print_usage_bug+0x193/0x1d0
 [<ffffffff80266530>] mark_lock+0xaf0/0xca0
 [<ffffffff80266735>] mark_held_locks+0x55/0xc0
 [<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd]
 [<ffffffff802667ca>] trace_reclaim_fs+0x2a/0x60
 [<ffffffff80285005>] __alloc_pages_internal+0x475/0x580
 [<ffffffff8070f29e>] ? mutex_lock_nested+0x26e/0x310
 [<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd]
 [<ffffffffa002006a>] brd_init+0x6a/0x216 [brd]
 [<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd]
 [<ffffffff8020903b>] _stext+0x3b/0x170
 [<ffffffff8070f8b9>] ? mutex_unlock+0x9/0x10
 [<ffffffff8070f83d>] ? __mutex_unlock_slowpath+0x10d/0x180
 [<ffffffff802669ec>] ? trace_hardirqs_on_caller+0x12c/0x190
 [<ffffffff80272ebf>] sys_init_module+0xaf/0x1e0
 [<ffffffff8020c3fb>] system_call_fastpath+0x16/0x1b

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/lockdep.h    |  17 +++-
 include/linux/sched.h      |   1 +
 kernel/lockdep.c           | 229 ++++++++++++++++++++++++++++++++++++++++++---
 kernel/lockdep_internals.h |   3 +-
 kernel/lockdep_proc.c      |   6 +-
 mm/page_alloc.c            |   5 +
 mm/slab.c                  |   4 +
 mm/slob.c                  |   2 +
 mm/slub.c                  |   1 +
 mm/vmscan.c                |   3 +
 10 files changed, 254 insertions(+), 17 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 23bf02fb124..cc97bdbc796 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -27,12 +27,16 @@ enum lock_usage_bit
 	LOCK_USED = 0,
 	LOCK_USED_IN_HARDIRQ,
 	LOCK_USED_IN_SOFTIRQ,
+	LOCK_USED_IN_RECLAIM_FS,
 	LOCK_ENABLED_SOFTIRQS,
 	LOCK_ENABLED_HARDIRQS,
+	LOCK_HELD_OVER_RECLAIM_FS,
 	LOCK_USED_IN_HARDIRQ_READ,
 	LOCK_USED_IN_SOFTIRQ_READ,
+	LOCK_USED_IN_RECLAIM_FS_READ,
 	LOCK_ENABLED_SOFTIRQS_READ,
 	LOCK_ENABLED_HARDIRQS_READ,
+	LOCK_HELD_OVER_RECLAIM_FS_READ,
 	LOCK_USAGE_STATES
 };
 
@@ -42,16 +46,20 @@ enum lock_usage_bit
 #define LOCKF_USED			(1 << LOCK_USED)
 #define LOCKF_USED_IN_HARDIRQ		(1 << LOCK_USED_IN_HARDIRQ)
 #define LOCKF_USED_IN_SOFTIRQ		(1 << LOCK_USED_IN_SOFTIRQ)
+#define LOCKF_USED_IN_RECLAIM_FS	(1 << LOCK_USED_IN_RECLAIM_FS)
 #define LOCKF_ENABLED_HARDIRQS		(1 << LOCK_ENABLED_HARDIRQS)
 #define LOCKF_ENABLED_SOFTIRQS		(1 << LOCK_ENABLED_SOFTIRQS)
+#define LOCKF_HELD_OVER_RECLAIM_FS	(1 << LOCK_HELD_OVER_RECLAIM_FS)
 
 #define LOCKF_ENABLED_IRQS (LOCKF_ENABLED_HARDIRQS | LOCKF_ENABLED_SOFTIRQS)
 #define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
 
 #define LOCKF_USED_IN_HARDIRQ_READ	(1 << LOCK_USED_IN_HARDIRQ_READ)
 #define LOCKF_USED_IN_SOFTIRQ_READ	(1 << LOCK_USED_IN_SOFTIRQ_READ)
+#define LOCKF_USED_IN_RECLAIM_FS_READ	(1 << LOCK_USED_IN_RECLAIM_FS_READ)
 #define LOCKF_ENABLED_HARDIRQS_READ	(1 << LOCK_ENABLED_HARDIRQS_READ)
 #define LOCKF_ENABLED_SOFTIRQS_READ	(1 << LOCK_ENABLED_SOFTIRQS_READ)
+#define LOCKF_HELD_OVER_RECLAIM_FS_READ	(1 << LOCK_HELD_OVER_RECLAIM_FS_READ)
 
 #define LOCKF_ENABLED_IRQS_READ \
 		(LOCKF_ENABLED_HARDIRQS_READ | LOCKF_ENABLED_SOFTIRQS_READ)
@@ -324,7 +332,11 @@ static inline void lock_set_subclass(struct lockdep_map *lock,
 	lock_set_class(lock, lock->name, lock->key, subclass, ip);
 }
 
-# define INIT_LOCKDEP				.lockdep_recursion = 0,
+extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
+extern void lockdep_clear_current_reclaim_state(void);
+extern void lockdep_trace_alloc(gfp_t mask);
+
+# define INIT_LOCKDEP				.lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
 
 #define lockdep_depth(tsk)	(debug_locks ? (tsk)->lockdep_depth : 0)
 
@@ -342,6 +354,9 @@ static inline void lockdep_on(void)
 # define lock_release(l, n, i)			do { } while (0)
 # define lock_set_class(l, n, k, s, i)		do { } while (0)
 # define lock_set_subclass(l, s, i)		do { } while (0)
+# define lockdep_set_current_reclaim_state(g)	do { } while (0)
+# define lockdep_clear_current_reclaim_state()	do { } while (0)
+# define lockdep_trace_alloc(g)			do { } while (0)
 # define lockdep_init()				do { } while (0)
 # define lockdep_info()				do { } while (0)
 # define lockdep_init_map(lock, name, key, sub) \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4efb552aca4..b00a77f4999 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1313,6 +1313,7 @@ struct task_struct {
 	int lockdep_depth;
 	unsigned int lockdep_recursion;
 	struct held_lock held_locks[MAX_LOCK_DEPTH];
+	gfp_t lockdep_reclaim_gfp;
 #endif
 
 /* journalling filesystem info */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 06b0c3568f0..977f940fd56 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -310,12 +310,14 @@ EXPORT_SYMBOL(lockdep_on);
 #if VERBOSE
 # define HARDIRQ_VERBOSE	1
 # define SOFTIRQ_VERBOSE	1
+# define RECLAIM_VERBOSE	1
 #else
 # define HARDIRQ_VERBOSE	0
 # define SOFTIRQ_VERBOSE	0
+# define RECLAIM_VERBOSE	0
 #endif
 
-#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
 /*
  * Quick filtering for interesting events:
  */
@@ -454,6 +456,10 @@ static const char *usage_str[] =
 	[LOCK_USED_IN_SOFTIRQ_READ] =	"in-softirq-R",
 	[LOCK_ENABLED_SOFTIRQS_READ] =	"softirq-on-R",
 	[LOCK_ENABLED_HARDIRQS_READ] =	"hardirq-on-R",
+	[LOCK_USED_IN_RECLAIM_FS] =	"in-reclaim-W",
+	[LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R",
+	[LOCK_HELD_OVER_RECLAIM_FS] =	"ov-reclaim-W",
+	[LOCK_HELD_OVER_RECLAIM_FS_READ] = "ov-reclaim-R",
 };
 
 const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
@@ -462,9 +468,10 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
 }
 
 void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
+					char *c4, char *c5, char *c6)
 {
-	*c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.';
+	*c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.', *c5 = '.', *c6 = '.';
 
 	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
 		*c1 = '+';
@@ -493,14 +500,29 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
 		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
 			*c4 = '?';
 	}
+
+	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS)
+		*c5 = '+';
+	else
+		if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS)
+			*c5 = '-';
+
+	if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ)
+		*c6 = '-';
+	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS_READ) {
+		*c6 = '+';
+		if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ)
+			*c6 = '?';
+	}
+
 }
 
 static void print_lock_name(struct lock_class *class)
 {
-	char str[KSYM_NAME_LEN], c1, c2, c3, c4;
+	char str[KSYM_NAME_LEN], c1, c2, c3, c4, c5, c6;
 	const char *name;
 
-	get_usage_chars(class, &c1, &c2, &c3, &c4);
+	get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6);
 
 	name = class->name;
 	if (!name) {
@@ -513,7 +535,7 @@ static void print_lock_name(struct lock_class *class)
 		if (class->subclass)
 			printk("/%d", class->subclass);
 	}
-	printk("){%c%c%c%c}", c1, c2, c3, c4);
+	printk("){%c%c%c%c%c%c}", c1, c2, c3, c4, c5, c6);
 }
 
 static void print_lockdep_cache(struct lockdep_map *lock)
@@ -1306,6 +1328,26 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 					LOCK_ENABLED_SOFTIRQS, "soft"))
 		return 0;
 
+	/*
+	 * Prove that the new dependency does not connect a reclaim-fs-safe
+	 * lock with a reclaim-fs-unsafe lock - to achieve this we search
+	 * the backwards-subgraph starting at <prev>, and the
+	 * forwards-subgraph starting at <next>:
+	 */
+	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS,
+					LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+		return 0;
+
+	/*
+	 * Prove that the new dependency does not connect a reclaim-fs-safe-read
+	 * lock with a reclaim-fs-unsafe lock - to achieve this we search
+	 * the backwards-subgraph starting at <prev>, and the
+	 * forwards-subgraph starting at <next>:
+	 */
+	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS_READ,
+					LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs-read"))
+		return 0;
+
 	return 1;
 }
 
@@ -1949,6 +1991,14 @@ static int softirq_verbose(struct lock_class *class)
 	return 0;
 }
 
+static int reclaim_verbose(struct lock_class *class)
+{
+#if RECLAIM_VERBOSE
+	return class_filter(class);
+#endif
+	return 0;
+}
+
 #define STRICT_READ_CHECKS	1
 
 static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
@@ -2007,6 +2057,31 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
+	case LOCK_USED_IN_RECLAIM_FS:
+		if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS))
+			return 0;
+		if (!valid_state(curr, this, new_bit,
+				 LOCK_HELD_OVER_RECLAIM_FS_READ))
+			return 0;
+		/*
+		 * just marked it reclaim-fs-safe, check that this lock
+		 * took no reclaim-fs-unsafe lock in the past:
+		 */
+		if (!check_usage_forwards(curr, this,
+					  LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+			return 0;
+#if STRICT_READ_CHECKS
+		/*
+		 * just marked it reclaim-fs-safe, check that this lock
+		 * took no reclaim-fs-unsafe-read lock in the past:
+		 */
+		if (!check_usage_forwards(curr, this,
+				LOCK_HELD_OVER_RECLAIM_FS_READ, "reclaim-fs-read"))
+			return 0;
+#endif
+		if (reclaim_verbose(hlock_class(this)))
+			ret = 2;
+		break;
 	case LOCK_USED_IN_HARDIRQ_READ:
 		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
 			return 0;
@@ -2033,6 +2108,19 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
+	case LOCK_USED_IN_RECLAIM_FS_READ:
+		if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS))
+			return 0;
+		/*
+		 * just marked it reclaim-fs-read-safe, check that this lock
+		 * took no reclaim-fs-unsafe lock in the past:
+		 */
+		if (!check_usage_forwards(curr, this,
+					  LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+			return 0;
+		if (reclaim_verbose(hlock_class(this)))
+			ret = 2;
+		break;
 	case LOCK_ENABLED_HARDIRQS:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
 			return 0;
@@ -2085,6 +2173,32 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
+	case LOCK_HELD_OVER_RECLAIM_FS:
+		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
+			return 0;
+		if (!valid_state(curr, this, new_bit,
+				 LOCK_USED_IN_RECLAIM_FS_READ))
+			return 0;
+		/*
+		 * just marked it reclaim-fs-unsafe, check that no reclaim-fs-safe
+		 * lock in the system ever took it in the past:
+		 */
+		if (!check_usage_backwards(curr, this,
+					   LOCK_USED_IN_RECLAIM_FS, "reclaim-fs"))
+			return 0;
+#if STRICT_READ_CHECKS
+		/*
+		 * just marked it softirq-unsafe, check that no
+		 * softirq-safe-read lock in the system ever took
+		 * it in the past:
+		 */
+		if (!check_usage_backwards(curr, this,
+				   LOCK_USED_IN_RECLAIM_FS_READ, "reclaim-fs-read"))
+			return 0;
+#endif
+		if (reclaim_verbose(hlock_class(this)))
+			ret = 2;
+		break;
 	case LOCK_ENABLED_HARDIRQS_READ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
 			return 0;
@@ -2115,6 +2229,21 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
+	case LOCK_HELD_OVER_RECLAIM_FS_READ:
+		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
+			return 0;
+#if STRICT_READ_CHECKS
+		/*
+		 * just marked it reclaim-fs-read-unsafe, check that no
+		 * reclaim-fs-safe lock in the system ever took it in the past:
+		 */
+		if (!check_usage_backwards(curr, this,
+					   LOCK_USED_IN_RECLAIM_FS, "reclaim-fs"))
+			return 0;
+#endif
+		if (reclaim_verbose(hlock_class(this)))
+			ret = 2;
+		break;
 	default:
 		WARN_ON(1);
 		break;
@@ -2123,11 +2252,17 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	return ret;
 }
 
+enum mark_type {
+	HARDIRQ,
+	SOFTIRQ,
+	RECLAIM_FS,
+};
+
 /*
  * Mark all held locks with a usage bit:
  */
 static int
-mark_held_locks(struct task_struct *curr, int hardirq)
+mark_held_locks(struct task_struct *curr, enum mark_type mark)
 {
 	enum lock_usage_bit usage_bit;
 	struct held_lock *hlock;
@@ -2136,17 +2271,32 @@ mark_held_locks(struct task_struct *curr, int hardirq)
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		hlock = curr->held_locks + i;
 
-		if (hardirq) {
+		switch (mark) {
+		case HARDIRQ:
 			if (hlock->read)
 				usage_bit = LOCK_ENABLED_HARDIRQS_READ;
 			else
 				usage_bit = LOCK_ENABLED_HARDIRQS;
-		} else {
+			break;
+
+		case SOFTIRQ:
 			if (hlock->read)
 				usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
 			else
 				usage_bit = LOCK_ENABLED_SOFTIRQS;
+			break;
+
+		case RECLAIM_FS:
+			if (hlock->read)
+				usage_bit = LOCK_HELD_OVER_RECLAIM_FS_READ;
+			else
+				usage_bit = LOCK_HELD_OVER_RECLAIM_FS;
+			break;
+
+		default:
+			BUG();
 		}
+
 		if (!mark_lock(curr, hlock, usage_bit))
 			return 0;
 	}
@@ -2200,7 +2350,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
 	 * We are going to turn hardirqs on, so set the
 	 * usage bit for all held locks:
 	 */
-	if (!mark_held_locks(curr, 1))
+	if (!mark_held_locks(curr, HARDIRQ))
 		return;
 	/*
 	 * If we have softirqs enabled, then set the usage
@@ -2208,7 +2358,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
 	 * this bit from being set before)
 	 */
 	if (curr->softirqs_enabled)
-		if (!mark_held_locks(curr, 0))
+		if (!mark_held_locks(curr, SOFTIRQ))
 			return;
 
 	curr->hardirq_enable_ip = ip;
@@ -2288,7 +2438,7 @@ void trace_softirqs_on(unsigned long ip)
 	 * enabled too:
 	 */
 	if (curr->hardirqs_enabled)
-		mark_held_locks(curr, 0);
+		mark_held_locks(curr, SOFTIRQ);
 }
 
 /*
@@ -2317,6 +2467,31 @@ void trace_softirqs_off(unsigned long ip)
 		debug_atomic_inc(&redundant_softirqs_off);
 }
 
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+	struct task_struct *curr = current;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	/* no reclaim without waiting on it */
+	if (!(gfp_mask & __GFP_WAIT))
+		return;
+
+	/* this guy won't enter reclaim */
+	if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+		return;
+
+	/* We're only interested __GFP_FS allocations for now */
+	if (!(gfp_mask & __GFP_FS))
+		return;
+
+	if (DEBUG_LOCKS_WARN_ON(irqs_disabled()))
+		return;
+
+	mark_held_locks(curr, RECLAIM_FS);
+}
+
 static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 {
 	/*
@@ -2362,6 +2537,22 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 		}
 	}
 
+	/*
+	 * We reuse the irq context infrastructure more broadly as a general
+	 * context checking code. This tests GFP_FS recursion (a lock taken
+	 * during reclaim for a GFP_FS allocation is held over a GFP_FS
+	 * allocation).
+	 */
+	if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
+		if (hlock->read) {
+			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
+					return 0;
+		} else {
+			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
+					return 0;
+		}
+	}
+
 	return 1;
 }
 
@@ -2453,6 +2644,10 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 	case LOCK_ENABLED_SOFTIRQS:
 	case LOCK_ENABLED_HARDIRQS_READ:
 	case LOCK_ENABLED_SOFTIRQS_READ:
+	case LOCK_USED_IN_RECLAIM_FS:
+	case LOCK_USED_IN_RECLAIM_FS_READ:
+	case LOCK_HELD_OVER_RECLAIM_FS:
+	case LOCK_HELD_OVER_RECLAIM_FS_READ:
 		ret = mark_lock_irq(curr, this, new_bit);
 		if (!ret)
 			return 0;
@@ -2966,6 +3161,16 @@ void lock_release(struct lockdep_map *lock, int nested,
 }
 EXPORT_SYMBOL_GPL(lock_release);
 
+void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
+{
+	current->lockdep_reclaim_gfp = gfp_mask;
+}
+
+void lockdep_clear_current_reclaim_state(void)
+{
+	current->lockdep_reclaim_gfp = 0;
+}
+
 #ifdef CONFIG_LOCK_STAT
 static int
 print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 56b196932c0..e887b783244 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -32,7 +32,8 @@ extern struct list_head all_lock_classes;
 extern struct lock_chain lock_chains[];
 
 extern void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
+					char *c4, char *c5, char *c6);
 
 extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
 
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 13716b81389..b84a1dfa907 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v)
 {
 	struct lock_class *class = v;
 	struct lock_list *entry;
-	char c1, c2, c3, c4;
+	char c1, c2, c3, c4, c5, c6;
 
 	if (v == SEQ_START_TOKEN) {
 		seq_printf(m, "all lock classes:\n");
@@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v)
 	seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
 #endif
 
-	get_usage_chars(class, &c1, &c2, &c3, &c4);
-	seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
+	get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6);
+	seq_printf(m, " %c%c%c%c%c%c", c1, c2, c3, c4, c5, c6);
 
 	seq_printf(m, ": ");
 	print_name(m, class);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5675b307385..22b15a4cde8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1479,6 +1479,8 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
 	unsigned long did_some_progress;
 	unsigned long pages_reclaimed = 0;
 
+	lockdep_trace_alloc(gfp_mask);
+
 	might_sleep_if(wait);
 
 	if (should_fail_alloc_page(gfp_mask, order))
@@ -1578,12 +1580,15 @@ nofail_alloc:
 	 */
 	cpuset_update_task_memory_state();
 	p->flags |= PF_MEMALLOC;
+
+	lockdep_set_current_reclaim_state(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
 	did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
 
 	p->reclaim_state = NULL;
+	lockdep_clear_current_reclaim_state();
 	p->flags &= ~PF_MEMALLOC;
 
 	cond_resched();
diff --git a/mm/slab.c b/mm/slab.c
index ddc41f337d5..6b61de8543e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3318,6 +3318,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	unsigned long save_flags;
 	void *ptr;
 
+	lockdep_trace_alloc(flags);
+
 	if (slab_should_failslab(cachep, flags))
 		return NULL;
 
@@ -3394,6 +3396,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
 	unsigned long save_flags;
 	void *objp;
 
+	lockdep_trace_alloc(flags);
+
 	if (slab_should_failslab(cachep, flags))
 		return NULL;
 
diff --git a/mm/slob.c b/mm/slob.c
index bf7e8fc3aed..1264799df5d 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -464,6 +464,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 	unsigned int *m;
 	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 
+	lockdep_trace_alloc(flags);
+
 	if (size < PAGE_SIZE - align) {
 		if (!size)
 			return ZERO_SIZE_PTR;
diff --git a/mm/slub.c b/mm/slub.c
index bdc9abb08a2..214eb207c51 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1596,6 +1596,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 	unsigned long flags;
 	unsigned int objsize;
 
+	lockdep_trace_alloc(gfpflags);
 	might_sleep_if(gfpflags & __GFP_WAIT);
 
 	if (should_failslab(s->objsize, gfpflags))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9a27c44aa32..303eb658b50 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1963,6 +1963,9 @@ static int kswapd(void *p)
 	struct reclaim_state reclaim_state = {
 		.reclaimed_slab = 0,
 	};
+
+	lockdep_set_current_reclaim_state(GFP_KERNEL);
+
 	node_to_cpumask_ptr(cpumask, pgdat->node_id);
 
 	if (!cpumask_empty(cpumask))
-- 
cgit v1.2.3


From 4fc95e867f1e75351b89db3c68212dfcce7ea563 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 13:10:52 +0100
Subject: lockdep: sanitize bit names

s/\(LOCKF\?_ENABLED_[^ ]*\)S\(_READ\)\?\>/\1\2/g

So that the USED_IN and ENABLED have the same names.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/lockdep.h | 22 ++++++-------
 kernel/lockdep.c        | 84 ++++++++++++++++++++++++-------------------------
 kernel/lockdep_proc.c   | 12 +++----
 3 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index cc97bdbc796..da2e2b25b3b 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -28,14 +28,14 @@ enum lock_usage_bit
 	LOCK_USED_IN_HARDIRQ,
 	LOCK_USED_IN_SOFTIRQ,
 	LOCK_USED_IN_RECLAIM_FS,
-	LOCK_ENABLED_SOFTIRQS,
-	LOCK_ENABLED_HARDIRQS,
+	LOCK_ENABLED_SOFTIRQ,
+	LOCK_ENABLED_HARDIRQ,
 	LOCK_HELD_OVER_RECLAIM_FS,
 	LOCK_USED_IN_HARDIRQ_READ,
 	LOCK_USED_IN_SOFTIRQ_READ,
 	LOCK_USED_IN_RECLAIM_FS_READ,
-	LOCK_ENABLED_SOFTIRQS_READ,
-	LOCK_ENABLED_HARDIRQS_READ,
+	LOCK_ENABLED_SOFTIRQ_READ,
+	LOCK_ENABLED_HARDIRQ_READ,
 	LOCK_HELD_OVER_RECLAIM_FS_READ,
 	LOCK_USAGE_STATES
 };
@@ -47,22 +47,22 @@ enum lock_usage_bit
 #define LOCKF_USED_IN_HARDIRQ		(1 << LOCK_USED_IN_HARDIRQ)
 #define LOCKF_USED_IN_SOFTIRQ		(1 << LOCK_USED_IN_SOFTIRQ)
 #define LOCKF_USED_IN_RECLAIM_FS	(1 << LOCK_USED_IN_RECLAIM_FS)
-#define LOCKF_ENABLED_HARDIRQS		(1 << LOCK_ENABLED_HARDIRQS)
-#define LOCKF_ENABLED_SOFTIRQS		(1 << LOCK_ENABLED_SOFTIRQS)
+#define LOCKF_ENABLED_HARDIRQ		(1 << LOCK_ENABLED_HARDIRQ)
+#define LOCKF_ENABLED_SOFTIRQ		(1 << LOCK_ENABLED_SOFTIRQ)
 #define LOCKF_HELD_OVER_RECLAIM_FS	(1 << LOCK_HELD_OVER_RECLAIM_FS)
 
-#define LOCKF_ENABLED_IRQS (LOCKF_ENABLED_HARDIRQS | LOCKF_ENABLED_SOFTIRQS)
+#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
 #define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
 
 #define LOCKF_USED_IN_HARDIRQ_READ	(1 << LOCK_USED_IN_HARDIRQ_READ)
 #define LOCKF_USED_IN_SOFTIRQ_READ	(1 << LOCK_USED_IN_SOFTIRQ_READ)
 #define LOCKF_USED_IN_RECLAIM_FS_READ	(1 << LOCK_USED_IN_RECLAIM_FS_READ)
-#define LOCKF_ENABLED_HARDIRQS_READ	(1 << LOCK_ENABLED_HARDIRQS_READ)
-#define LOCKF_ENABLED_SOFTIRQS_READ	(1 << LOCK_ENABLED_SOFTIRQS_READ)
+#define LOCKF_ENABLED_HARDIRQ_READ	(1 << LOCK_ENABLED_HARDIRQ_READ)
+#define LOCKF_ENABLED_SOFTIRQ_READ	(1 << LOCK_ENABLED_SOFTIRQ_READ)
 #define LOCKF_HELD_OVER_RECLAIM_FS_READ	(1 << LOCK_HELD_OVER_RECLAIM_FS_READ)
 
-#define LOCKF_ENABLED_IRQS_READ \
-		(LOCKF_ENABLED_HARDIRQS_READ | LOCKF_ENABLED_SOFTIRQS_READ)
+#define LOCKF_ENABLED_IRQ_READ \
+		(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
 #define LOCKF_USED_IN_IRQ_READ \
 		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
 
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 977f940fd56..32f944752b1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -450,12 +450,12 @@ static const char *usage_str[] =
 	[LOCK_USED] =			"initial-use ",
 	[LOCK_USED_IN_HARDIRQ] =	"in-hardirq-W",
 	[LOCK_USED_IN_SOFTIRQ] =	"in-softirq-W",
-	[LOCK_ENABLED_SOFTIRQS] =	"softirq-on-W",
-	[LOCK_ENABLED_HARDIRQS] =	"hardirq-on-W",
+	[LOCK_ENABLED_SOFTIRQ] =	"softirq-on-W",
+	[LOCK_ENABLED_HARDIRQ] =	"hardirq-on-W",
 	[LOCK_USED_IN_HARDIRQ_READ] =	"in-hardirq-R",
 	[LOCK_USED_IN_SOFTIRQ_READ] =	"in-softirq-R",
-	[LOCK_ENABLED_SOFTIRQS_READ] =	"softirq-on-R",
-	[LOCK_ENABLED_HARDIRQS_READ] =	"hardirq-on-R",
+	[LOCK_ENABLED_SOFTIRQ_READ] =	"softirq-on-R",
+	[LOCK_ENABLED_HARDIRQ_READ] =	"hardirq-on-R",
 	[LOCK_USED_IN_RECLAIM_FS] =	"in-reclaim-W",
 	[LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R",
 	[LOCK_HELD_OVER_RECLAIM_FS] =	"ov-reclaim-W",
@@ -476,28 +476,28 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
 	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
 		*c1 = '+';
 	else
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
+		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
 			*c1 = '-';
 
 	if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
 		*c2 = '+';
 	else
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
+		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
 			*c2 = '-';
 
-	if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+	if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
 		*c3 = '-';
 	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) {
 		*c3 = '+';
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
 			*c3 = '?';
 	}
 
-	if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+	if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
 		*c4 = '-';
 	if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) {
 		*c4 = '+';
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
 			*c4 = '?';
 	}
 
@@ -1296,7 +1296,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
-					LOCK_ENABLED_HARDIRQS, "hard"))
+					LOCK_ENABLED_HARDIRQ, "hard"))
 		return 0;
 
 	/*
@@ -1306,7 +1306,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
-					LOCK_ENABLED_HARDIRQS, "hard-read"))
+					LOCK_ENABLED_HARDIRQ, "hard-read"))
 		return 0;
 
 	/*
@@ -1316,7 +1316,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
-					LOCK_ENABLED_SOFTIRQS, "soft"))
+					LOCK_ENABLED_SOFTIRQ, "soft"))
 		return 0;
 	/*
 	 * Prove that the new dependency does not connect a softirq-safe-read
@@ -1325,7 +1325,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
-					LOCK_ENABLED_SOFTIRQS, "soft"))
+					LOCK_ENABLED_SOFTIRQ, "soft"))
 		return 0;
 
 	/*
@@ -2008,17 +2008,17 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 
 	switch(new_bit) {
 	case LOCK_USED_IN_HARDIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQ))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
-				 LOCK_ENABLED_HARDIRQS_READ))
+				 LOCK_ENABLED_HARDIRQ_READ))
 			return 0;
 		/*
 		 * just marked it hardirq-safe, check that this lock
 		 * took no hardirq-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_HARDIRQS, "hard"))
+					  LOCK_ENABLED_HARDIRQ, "hard"))
 			return 0;
 #if STRICT_READ_CHECKS
 		/*
@@ -2026,24 +2026,24 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		 * took no hardirq-unsafe-read lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-				LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
+				LOCK_ENABLED_HARDIRQ_READ, "hard-read"))
 			return 0;
 #endif
 		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_USED_IN_SOFTIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQ))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
-				 LOCK_ENABLED_SOFTIRQS_READ))
+				 LOCK_ENABLED_SOFTIRQ_READ))
 			return 0;
 		/*
 		 * just marked it softirq-safe, check that this lock
 		 * took no softirq-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_SOFTIRQS, "soft"))
+					  LOCK_ENABLED_SOFTIRQ, "soft"))
 			return 0;
 #if STRICT_READ_CHECKS
 		/*
@@ -2051,7 +2051,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		 * took no softirq-unsafe-read lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-				LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
+				LOCK_ENABLED_SOFTIRQ_READ, "soft-read"))
 			return 0;
 #endif
 		if (softirq_verbose(hlock_class(this)))
@@ -2083,27 +2083,27 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 			ret = 2;
 		break;
 	case LOCK_USED_IN_HARDIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQ))
 			return 0;
 		/*
 		 * just marked it hardirq-read-safe, check that this lock
 		 * took no hardirq-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_HARDIRQS, "hard"))
+					  LOCK_ENABLED_HARDIRQ, "hard"))
 			return 0;
 		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_USED_IN_SOFTIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQ))
 			return 0;
 		/*
 		 * just marked it softirq-read-safe, check that this lock
 		 * took no softirq-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_SOFTIRQS, "soft"))
+					  LOCK_ENABLED_SOFTIRQ, "soft"))
 			return 0;
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
@@ -2121,7 +2121,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (reclaim_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_ENABLED_HARDIRQS:
+	case LOCK_ENABLED_HARDIRQ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
@@ -2147,7 +2147,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_ENABLED_SOFTIRQS:
+	case LOCK_ENABLED_SOFTIRQ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
@@ -2199,7 +2199,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (reclaim_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_ENABLED_HARDIRQS_READ:
+	case LOCK_ENABLED_HARDIRQ_READ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
 			return 0;
 #if STRICT_READ_CHECKS
@@ -2214,7 +2214,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_ENABLED_SOFTIRQS_READ:
+	case LOCK_ENABLED_SOFTIRQ_READ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
 			return 0;
 #if STRICT_READ_CHECKS
@@ -2274,16 +2274,16 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
 		switch (mark) {
 		case HARDIRQ:
 			if (hlock->read)
-				usage_bit = LOCK_ENABLED_HARDIRQS_READ;
+				usage_bit = LOCK_ENABLED_HARDIRQ_READ;
 			else
-				usage_bit = LOCK_ENABLED_HARDIRQS;
+				usage_bit = LOCK_ENABLED_HARDIRQ;
 			break;
 
 		case SOFTIRQ:
 			if (hlock->read)
-				usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
+				usage_bit = LOCK_ENABLED_SOFTIRQ_READ;
 			else
-				usage_bit = LOCK_ENABLED_SOFTIRQS;
+				usage_bit = LOCK_ENABLED_SOFTIRQ;
 			break;
 
 		case RECLAIM_FS:
@@ -2520,19 +2520,19 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 	if (!hlock->hardirqs_off) {
 		if (hlock->read) {
 			if (!mark_lock(curr, hlock,
-					LOCK_ENABLED_HARDIRQS_READ))
+					LOCK_ENABLED_HARDIRQ_READ))
 				return 0;
 			if (curr->softirqs_enabled)
 				if (!mark_lock(curr, hlock,
-						LOCK_ENABLED_SOFTIRQS_READ))
+						LOCK_ENABLED_SOFTIRQ_READ))
 					return 0;
 		} else {
 			if (!mark_lock(curr, hlock,
-					LOCK_ENABLED_HARDIRQS))
+					LOCK_ENABLED_HARDIRQ))
 				return 0;
 			if (curr->softirqs_enabled)
 				if (!mark_lock(curr, hlock,
-						LOCK_ENABLED_SOFTIRQS))
+						LOCK_ENABLED_SOFTIRQ))
 					return 0;
 		}
 	}
@@ -2640,10 +2640,10 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 	case LOCK_USED_IN_SOFTIRQ:
 	case LOCK_USED_IN_HARDIRQ_READ:
 	case LOCK_USED_IN_SOFTIRQ_READ:
-	case LOCK_ENABLED_HARDIRQS:
-	case LOCK_ENABLED_SOFTIRQS:
-	case LOCK_ENABLED_HARDIRQS_READ:
-	case LOCK_ENABLED_SOFTIRQS_READ:
+	case LOCK_ENABLED_HARDIRQ:
+	case LOCK_ENABLED_SOFTIRQ:
+	case LOCK_ENABLED_HARDIRQ_READ:
+	case LOCK_ENABLED_SOFTIRQ_READ:
 	case LOCK_USED_IN_RECLAIM_FS:
 	case LOCK_USED_IN_RECLAIM_FS_READ:
 	case LOCK_HELD_OVER_RECLAIM_FS:
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index b84a1dfa907..bd474fd9df9 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -300,27 +300,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 			nr_uncategorized++;
 		if (class->usage_mask & LOCKF_USED_IN_IRQ)
 			nr_irq_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_IRQS)
+		if (class->usage_mask & LOCKF_ENABLED_IRQ)
 			nr_irq_unsafe++;
 		if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
 			nr_softirq_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
+		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
 			nr_softirq_unsafe++;
 		if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
 			nr_hardirq_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
+		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
 			nr_hardirq_unsafe++;
 		if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
 			nr_irq_read_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_IRQS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_IRQ_READ)
 			nr_irq_read_unsafe++;
 		if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
 			nr_softirq_read_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
 			nr_softirq_read_unsafe++;
 		if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
 			nr_hardirq_read_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
 			nr_hardirq_read_unsafe++;
 
 #ifdef CONFIG_PROVE_LOCKING
-- 
cgit v1.2.3


From a652d7081bc96b3094e85ca30e47f50185d2f717 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 13:13:11 +0100
Subject: lockdep: sanitize reclaim bit names

s/HELD_OVER/ENABLED/g

so that its similar to the hard and soft-irq names.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/lockdep.h |  8 ++++----
 kernel/lockdep.c        | 38 +++++++++++++++++++-------------------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index da2e2b25b3b..6d729c9d1d2 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -30,13 +30,13 @@ enum lock_usage_bit
 	LOCK_USED_IN_RECLAIM_FS,
 	LOCK_ENABLED_SOFTIRQ,
 	LOCK_ENABLED_HARDIRQ,
-	LOCK_HELD_OVER_RECLAIM_FS,
+	LOCK_ENABLED_RECLAIM_FS,
 	LOCK_USED_IN_HARDIRQ_READ,
 	LOCK_USED_IN_SOFTIRQ_READ,
 	LOCK_USED_IN_RECLAIM_FS_READ,
 	LOCK_ENABLED_SOFTIRQ_READ,
 	LOCK_ENABLED_HARDIRQ_READ,
-	LOCK_HELD_OVER_RECLAIM_FS_READ,
+	LOCK_ENABLED_RECLAIM_FS_READ,
 	LOCK_USAGE_STATES
 };
 
@@ -49,7 +49,7 @@ enum lock_usage_bit
 #define LOCKF_USED_IN_RECLAIM_FS	(1 << LOCK_USED_IN_RECLAIM_FS)
 #define LOCKF_ENABLED_HARDIRQ		(1 << LOCK_ENABLED_HARDIRQ)
 #define LOCKF_ENABLED_SOFTIRQ		(1 << LOCK_ENABLED_SOFTIRQ)
-#define LOCKF_HELD_OVER_RECLAIM_FS	(1 << LOCK_HELD_OVER_RECLAIM_FS)
+#define LOCKF_ENABLED_RECLAIM_FS	(1 << LOCK_ENABLED_RECLAIM_FS)
 
 #define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
 #define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
@@ -59,7 +59,7 @@ enum lock_usage_bit
 #define LOCKF_USED_IN_RECLAIM_FS_READ	(1 << LOCK_USED_IN_RECLAIM_FS_READ)
 #define LOCKF_ENABLED_HARDIRQ_READ	(1 << LOCK_ENABLED_HARDIRQ_READ)
 #define LOCKF_ENABLED_SOFTIRQ_READ	(1 << LOCK_ENABLED_SOFTIRQ_READ)
-#define LOCKF_HELD_OVER_RECLAIM_FS_READ	(1 << LOCK_HELD_OVER_RECLAIM_FS_READ)
+#define LOCKF_ENABLED_RECLAIM_FS_READ	(1 << LOCK_ENABLED_RECLAIM_FS_READ)
 
 #define LOCKF_ENABLED_IRQ_READ \
 		(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 32f944752b1..dd4716c0832 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -458,8 +458,8 @@ static const char *usage_str[] =
 	[LOCK_ENABLED_HARDIRQ_READ] =	"hardirq-on-R",
 	[LOCK_USED_IN_RECLAIM_FS] =	"in-reclaim-W",
 	[LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R",
-	[LOCK_HELD_OVER_RECLAIM_FS] =	"ov-reclaim-W",
-	[LOCK_HELD_OVER_RECLAIM_FS_READ] = "ov-reclaim-R",
+	[LOCK_ENABLED_RECLAIM_FS] =	"ov-reclaim-W",
+	[LOCK_ENABLED_RECLAIM_FS_READ] = "ov-reclaim-R",
 };
 
 const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
@@ -504,14 +504,14 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
 	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS)
 		*c5 = '+';
 	else
-		if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS)
+		if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS)
 			*c5 = '-';
 
-	if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ)
+	if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS_READ)
 		*c6 = '-';
 	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS_READ) {
 		*c6 = '+';
-		if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS_READ)
 			*c6 = '?';
 	}
 
@@ -1335,7 +1335,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS,
-					LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+					LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
 		return 0;
 
 	/*
@@ -1345,7 +1345,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS_READ,
-					LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs-read"))
+					LOCK_ENABLED_RECLAIM_FS, "reclaim-fs-read"))
 		return 0;
 
 	return 1;
@@ -2058,17 +2058,17 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 			ret = 2;
 		break;
 	case LOCK_USED_IN_RECLAIM_FS:
-		if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_RECLAIM_FS))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
-				 LOCK_HELD_OVER_RECLAIM_FS_READ))
+				 LOCK_ENABLED_RECLAIM_FS_READ))
 			return 0;
 		/*
 		 * just marked it reclaim-fs-safe, check that this lock
 		 * took no reclaim-fs-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+					  LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
 			return 0;
 #if STRICT_READ_CHECKS
 		/*
@@ -2076,7 +2076,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		 * took no reclaim-fs-unsafe-read lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-				LOCK_HELD_OVER_RECLAIM_FS_READ, "reclaim-fs-read"))
+				LOCK_ENABLED_RECLAIM_FS_READ, "reclaim-fs-read"))
 			return 0;
 #endif
 		if (reclaim_verbose(hlock_class(this)))
@@ -2109,14 +2109,14 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 			ret = 2;
 		break;
 	case LOCK_USED_IN_RECLAIM_FS_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_RECLAIM_FS))
 			return 0;
 		/*
 		 * just marked it reclaim-fs-read-safe, check that this lock
 		 * took no reclaim-fs-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+					  LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
 			return 0;
 		if (reclaim_verbose(hlock_class(this)))
 			ret = 2;
@@ -2173,7 +2173,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_HELD_OVER_RECLAIM_FS:
+	case LOCK_ENABLED_RECLAIM_FS:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
@@ -2229,7 +2229,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_HELD_OVER_RECLAIM_FS_READ:
+	case LOCK_ENABLED_RECLAIM_FS_READ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
 			return 0;
 #if STRICT_READ_CHECKS
@@ -2288,9 +2288,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
 
 		case RECLAIM_FS:
 			if (hlock->read)
-				usage_bit = LOCK_HELD_OVER_RECLAIM_FS_READ;
+				usage_bit = LOCK_ENABLED_RECLAIM_FS_READ;
 			else
-				usage_bit = LOCK_HELD_OVER_RECLAIM_FS;
+				usage_bit = LOCK_ENABLED_RECLAIM_FS;
 			break;
 
 		default:
@@ -2646,8 +2646,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 	case LOCK_ENABLED_SOFTIRQ_READ:
 	case LOCK_USED_IN_RECLAIM_FS:
 	case LOCK_USED_IN_RECLAIM_FS_READ:
-	case LOCK_HELD_OVER_RECLAIM_FS:
-	case LOCK_HELD_OVER_RECLAIM_FS_READ:
+	case LOCK_ENABLED_RECLAIM_FS:
+	case LOCK_ENABLED_RECLAIM_FS_READ:
 		ret = mark_lock_irq(curr, this, new_bit);
 		if (!ret)
 			return 0;
-- 
cgit v1.2.3


From 9fe51abf7a1c787f918f66fa3cef9cd0cedb3791 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:09:46 +0100
Subject: lockdep: lockdep_states.h

Introduce a header file to generate all the states from.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_states.h | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 kernel/lockdep_states.h

diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h
new file mode 100644
index 00000000000..937039ef2dd
--- /dev/null
+++ b/kernel/lockdep_states.h
@@ -0,0 +1,3 @@
+LOCKDEP_STATE(HARDIRQ)
+LOCKDEP_STATE(SOFTIRQ)
+LOCKDEP_STATE(RECLAIM_FS)
-- 
cgit v1.2.3


From 36bfb9bb03db2002a8574600c6aeb4cdd1ba01a6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:12:41 +0100
Subject: lockdep: simplify mark_held_locks

remove the explicit state iteration

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 38 ++++++++++++++------------------------
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index dd4716c0832..18e0990148e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2253,11 +2253,19 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 }
 
 enum mark_type {
-	HARDIRQ,
-	SOFTIRQ,
-	RECLAIM_FS,
+#define LOCKDEP_STATE(__STATE)	__STATE,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
 };
 
+#define MARK_HELD_CASE(__STATE)						\
+	case __STATE:							\
+		if (hlock->read)					\
+			usage_bit = LOCK_ENABLED_##__STATE##_READ;	\
+		else							\
+			usage_bit = LOCK_ENABLED_##__STATE;		\
+		break;
+
 /*
  * Mark all held locks with a usage bit:
  */
@@ -2272,27 +2280,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
 		hlock = curr->held_locks + i;
 
 		switch (mark) {
-		case HARDIRQ:
-			if (hlock->read)
-				usage_bit = LOCK_ENABLED_HARDIRQ_READ;
-			else
-				usage_bit = LOCK_ENABLED_HARDIRQ;
-			break;
-
-		case SOFTIRQ:
-			if (hlock->read)
-				usage_bit = LOCK_ENABLED_SOFTIRQ_READ;
-			else
-				usage_bit = LOCK_ENABLED_SOFTIRQ;
-			break;
-
-		case RECLAIM_FS:
-			if (hlock->read)
-				usage_bit = LOCK_ENABLED_RECLAIM_FS_READ;
-			else
-				usage_bit = LOCK_ENABLED_RECLAIM_FS;
-			break;
-
+#define LOCKDEP_STATE(__STATE) MARK_HELD_CASE(__STATE)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
 		default:
 			BUG();
 		}
-- 
cgit v1.2.3


From 5346417e17daf5a7712e4cf030b45414e46607cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:15:53 +0100
Subject: lockdep: simplify mark_lock()

remove the state iteration

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 18e0990148e..e68bd7d694b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2626,18 +2626,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 		return 0;
 
 	switch (new_bit) {
-	case LOCK_USED_IN_HARDIRQ:
-	case LOCK_USED_IN_SOFTIRQ:
-	case LOCK_USED_IN_HARDIRQ_READ:
-	case LOCK_USED_IN_SOFTIRQ_READ:
-	case LOCK_ENABLED_HARDIRQ:
-	case LOCK_ENABLED_SOFTIRQ:
-	case LOCK_ENABLED_HARDIRQ_READ:
-	case LOCK_ENABLED_SOFTIRQ_READ:
-	case LOCK_USED_IN_RECLAIM_FS:
-	case LOCK_USED_IN_RECLAIM_FS_READ:
-	case LOCK_ENABLED_RECLAIM_FS:
-	case LOCK_ENABLED_RECLAIM_FS_READ:
+#define LOCKDEP_STATE(__STATE)			\
+	case LOCK_USED_IN_##__STATE:		\
+	case LOCK_USED_IN_##__STATE##_READ:	\
+	case LOCK_ENABLED_##__STATE:		\
+	case LOCK_ENABLED_##__STATE##_READ:
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
 		ret = mark_lock_irq(curr, this, new_bit);
 		if (!ret)
 			return 0;
-- 
cgit v1.2.3


From 9851673bc32bc9fcafbbaeffc858ead434bd6d58 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:18:40 +0100
Subject: lockdep: move state bit definitions around

For convenience later.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/lockdep.h    | 49 ++++------------------------------------------
 kernel/lockdep_internals.h | 46 +++++++++++++++++++++++++++++++++++++++++++
 kernel/lockdep_states.h    |  6 ++++++
 3 files changed, 56 insertions(+), 45 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 6d729c9d1d2..5a58ea3e91e 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -20,51 +20,10 @@ struct lockdep_map;
 #include <linux/stacktrace.h>
 
 /*
- * Lock-class usage-state bits:
+ * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
+ * the total number of states... :-(
  */
-enum lock_usage_bit
-{
-	LOCK_USED = 0,
-	LOCK_USED_IN_HARDIRQ,
-	LOCK_USED_IN_SOFTIRQ,
-	LOCK_USED_IN_RECLAIM_FS,
-	LOCK_ENABLED_SOFTIRQ,
-	LOCK_ENABLED_HARDIRQ,
-	LOCK_ENABLED_RECLAIM_FS,
-	LOCK_USED_IN_HARDIRQ_READ,
-	LOCK_USED_IN_SOFTIRQ_READ,
-	LOCK_USED_IN_RECLAIM_FS_READ,
-	LOCK_ENABLED_SOFTIRQ_READ,
-	LOCK_ENABLED_HARDIRQ_READ,
-	LOCK_ENABLED_RECLAIM_FS_READ,
-	LOCK_USAGE_STATES
-};
-
-/*
- * Usage-state bitmasks:
- */
-#define LOCKF_USED			(1 << LOCK_USED)
-#define LOCKF_USED_IN_HARDIRQ		(1 << LOCK_USED_IN_HARDIRQ)
-#define LOCKF_USED_IN_SOFTIRQ		(1 << LOCK_USED_IN_SOFTIRQ)
-#define LOCKF_USED_IN_RECLAIM_FS	(1 << LOCK_USED_IN_RECLAIM_FS)
-#define LOCKF_ENABLED_HARDIRQ		(1 << LOCK_ENABLED_HARDIRQ)
-#define LOCKF_ENABLED_SOFTIRQ		(1 << LOCK_ENABLED_SOFTIRQ)
-#define LOCKF_ENABLED_RECLAIM_FS	(1 << LOCK_ENABLED_RECLAIM_FS)
-
-#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
-#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
-
-#define LOCKF_USED_IN_HARDIRQ_READ	(1 << LOCK_USED_IN_HARDIRQ_READ)
-#define LOCKF_USED_IN_SOFTIRQ_READ	(1 << LOCK_USED_IN_SOFTIRQ_READ)
-#define LOCKF_USED_IN_RECLAIM_FS_READ	(1 << LOCK_USED_IN_RECLAIM_FS_READ)
-#define LOCKF_ENABLED_HARDIRQ_READ	(1 << LOCK_ENABLED_HARDIRQ_READ)
-#define LOCKF_ENABLED_SOFTIRQ_READ	(1 << LOCK_ENABLED_SOFTIRQ_READ)
-#define LOCKF_ENABLED_RECLAIM_FS_READ	(1 << LOCK_ENABLED_RECLAIM_FS_READ)
-
-#define LOCKF_ENABLED_IRQ_READ \
-		(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
-#define LOCKF_USED_IN_IRQ_READ \
-		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
+#define XXX_LOCK_USAGE_STATES		(1+3*4)
 
 #define MAX_LOCKDEP_SUBCLASSES		8UL
 
@@ -105,7 +64,7 @@ struct lock_class {
 	 * IRQ/softirq usage tracking bits:
 	 */
 	unsigned long			usage_mask;
-	struct stack_trace		usage_traces[LOCK_USAGE_STATES];
+	struct stack_trace		usage_traces[XXX_LOCK_USAGE_STATES];
 
 	/*
 	 * These fields represent a directed graph of lock dependencies,
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index e887b783244..1352409cfef 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -6,6 +6,52 @@
  * lockdep subsystem internal functions and variables.
  */
 
+/*
+ * Lock-class usage-state bits:
+ */
+enum lock_usage_bit {
+	LOCK_USED = 0,
+	LOCK_USED_IN_HARDIRQ,
+	LOCK_USED_IN_SOFTIRQ,
+	LOCK_USED_IN_RECLAIM_FS,
+	LOCK_ENABLED_SOFTIRQ,
+	LOCK_ENABLED_HARDIRQ,
+	LOCK_ENABLED_RECLAIM_FS,
+	LOCK_USED_IN_HARDIRQ_READ,
+	LOCK_USED_IN_SOFTIRQ_READ,
+	LOCK_USED_IN_RECLAIM_FS_READ,
+	LOCK_ENABLED_SOFTIRQ_READ,
+	LOCK_ENABLED_HARDIRQ_READ,
+	LOCK_ENABLED_RECLAIM_FS_READ,
+	LOCK_USAGE_STATES
+};
+
+/*
+ * Usage-state bitmasks:
+ */
+#define LOCKF_USED			(1 << LOCK_USED)
+#define LOCKF_USED_IN_HARDIRQ		(1 << LOCK_USED_IN_HARDIRQ)
+#define LOCKF_USED_IN_SOFTIRQ		(1 << LOCK_USED_IN_SOFTIRQ)
+#define LOCKF_USED_IN_RECLAIM_FS	(1 << LOCK_USED_IN_RECLAIM_FS)
+#define LOCKF_ENABLED_HARDIRQ		(1 << LOCK_ENABLED_HARDIRQ)
+#define LOCKF_ENABLED_SOFTIRQ		(1 << LOCK_ENABLED_SOFTIRQ)
+#define LOCKF_ENABLED_RECLAIM_FS	(1 << LOCK_ENABLED_RECLAIM_FS)
+
+#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
+#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
+
+#define LOCKF_USED_IN_HARDIRQ_READ	(1 << LOCK_USED_IN_HARDIRQ_READ)
+#define LOCKF_USED_IN_SOFTIRQ_READ	(1 << LOCK_USED_IN_SOFTIRQ_READ)
+#define LOCKF_USED_IN_RECLAIM_FS_READ	(1 << LOCK_USED_IN_RECLAIM_FS_READ)
+#define LOCKF_ENABLED_HARDIRQ_READ	(1 << LOCK_ENABLED_HARDIRQ_READ)
+#define LOCKF_ENABLED_SOFTIRQ_READ	(1 << LOCK_ENABLED_SOFTIRQ_READ)
+#define LOCKF_ENABLED_RECLAIM_FS_READ	(1 << LOCK_ENABLED_RECLAIM_FS_READ)
+
+#define LOCKF_ENABLED_IRQ_READ \
+		(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
+#define LOCKF_USED_IN_IRQ_READ \
+		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
+
 /*
  * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
  * we track.
diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h
index 937039ef2dd..995b0cc2b84 100644
--- a/kernel/lockdep_states.h
+++ b/kernel/lockdep_states.h
@@ -1,3 +1,9 @@
+/*
+ * Lockdep states,
+ *
+ * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever
+ * you add one, or come up with a nice dynamic solution.
+ */
 LOCKDEP_STATE(HARDIRQ)
 LOCKDEP_STATE(SOFTIRQ)
 LOCKDEP_STATE(RECLAIM_FS)
-- 
cgit v1.2.3


From d7b1b02134272840f4b655136e00c461e1cf1d53 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:38:38 +0100
Subject: lockdep: generate the state bit definitions

Generate the state bit definitions from the lockdep_states.h file.

Also, move LOCK_USED to last, so that the

 USED_IN
 USED_IN_READ
 ENABLED
 ENABLED_READ

states are nicely bit aligned -- we're going to use that property

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_internals.h | 47 ++++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 1352409cfef..7e653e66ce5 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -10,43 +10,36 @@
  * Lock-class usage-state bits:
  */
 enum lock_usage_bit {
-	LOCK_USED = 0,
-	LOCK_USED_IN_HARDIRQ,
-	LOCK_USED_IN_SOFTIRQ,
-	LOCK_USED_IN_RECLAIM_FS,
-	LOCK_ENABLED_SOFTIRQ,
-	LOCK_ENABLED_HARDIRQ,
-	LOCK_ENABLED_RECLAIM_FS,
-	LOCK_USED_IN_HARDIRQ_READ,
-	LOCK_USED_IN_SOFTIRQ_READ,
-	LOCK_USED_IN_RECLAIM_FS_READ,
-	LOCK_ENABLED_SOFTIRQ_READ,
-	LOCK_ENABLED_HARDIRQ_READ,
-	LOCK_ENABLED_RECLAIM_FS_READ,
+#define LOCKDEP_STATE(__STATE)		\
+	LOCK_USED_IN_##__STATE,		\
+	LOCK_USED_IN_##__STATE##_READ,	\
+	LOCK_ENABLED_##__STATE,		\
+	LOCK_ENABLED_##__STATE##_READ,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+	LOCK_USED,
 	LOCK_USAGE_STATES
 };
 
 /*
  * Usage-state bitmasks:
  */
-#define LOCKF_USED			(1 << LOCK_USED)
-#define LOCKF_USED_IN_HARDIRQ		(1 << LOCK_USED_IN_HARDIRQ)
-#define LOCKF_USED_IN_SOFTIRQ		(1 << LOCK_USED_IN_SOFTIRQ)
-#define LOCKF_USED_IN_RECLAIM_FS	(1 << LOCK_USED_IN_RECLAIM_FS)
-#define LOCKF_ENABLED_HARDIRQ		(1 << LOCK_ENABLED_HARDIRQ)
-#define LOCKF_ENABLED_SOFTIRQ		(1 << LOCK_ENABLED_SOFTIRQ)
-#define LOCKF_ENABLED_RECLAIM_FS	(1 << LOCK_ENABLED_RECLAIM_FS)
+#define __LOCKF(__STATE)	LOCKF_##__STATE = (1 << LOCK_##__STATE),
+
+enum {
+#define LOCKDEP_STATE(__STATE)						\
+	__LOCKF(USED_IN_##__STATE)					\
+	__LOCKF(USED_IN_##__STATE##_READ)				\
+	__LOCKF(ENABLED_##__STATE)					\
+	__LOCKF(ENABLED_##__STATE##_READ)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+	__LOCKF(USED)
+};
 
 #define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
 #define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
 
-#define LOCKF_USED_IN_HARDIRQ_READ	(1 << LOCK_USED_IN_HARDIRQ_READ)
-#define LOCKF_USED_IN_SOFTIRQ_READ	(1 << LOCK_USED_IN_SOFTIRQ_READ)
-#define LOCKF_USED_IN_RECLAIM_FS_READ	(1 << LOCK_USED_IN_RECLAIM_FS_READ)
-#define LOCKF_ENABLED_HARDIRQ_READ	(1 << LOCK_ENABLED_HARDIRQ_READ)
-#define LOCKF_ENABLED_SOFTIRQ_READ	(1 << LOCK_ENABLED_SOFTIRQ_READ)
-#define LOCKF_ENABLED_RECLAIM_FS_READ	(1 << LOCK_ENABLED_RECLAIM_FS_READ)
-
 #define LOCKF_ENABLED_IRQ_READ \
 		(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
 #define LOCKF_USED_IN_IRQ_READ \
-- 
cgit v1.2.3


From fabe9c42c6328de314d811887b4752eb3d202291 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:51:01 +0100
Subject: lockdep: generate usage strings

generate the usage strings

XXX capital invasion :-(

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e68bd7d694b..d31f7f836a0 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -445,21 +445,21 @@ atomic_t nr_find_usage_backwards_recursions;
  * Locking printouts:
  */
 
+#define __STR(foo)	#foo
+#define STR(foo)	__STR(foo)
+
+#define __USAGE(__STATE)						\
+	[LOCK_USED_IN_##__STATE] = "IN-"STR(__STATE)"-W",		\
+	[LOCK_ENABLED_##__STATE] = STR(__STATE)"-ON-W",			\
+	[LOCK_USED_IN_##__STATE##_READ] = "IN-"STR(__STATE)"-R",	\
+	[LOCK_ENABLED_##__STATE##_READ] = STR(__STATE)"-ON-R",
+
 static const char *usage_str[] =
 {
-	[LOCK_USED] =			"initial-use ",
-	[LOCK_USED_IN_HARDIRQ] =	"in-hardirq-W",
-	[LOCK_USED_IN_SOFTIRQ] =	"in-softirq-W",
-	[LOCK_ENABLED_SOFTIRQ] =	"softirq-on-W",
-	[LOCK_ENABLED_HARDIRQ] =	"hardirq-on-W",
-	[LOCK_USED_IN_HARDIRQ_READ] =	"in-hardirq-R",
-	[LOCK_USED_IN_SOFTIRQ_READ] =	"in-softirq-R",
-	[LOCK_ENABLED_SOFTIRQ_READ] =	"softirq-on-R",
-	[LOCK_ENABLED_HARDIRQ_READ] =	"hardirq-on-R",
-	[LOCK_USED_IN_RECLAIM_FS] =	"in-reclaim-W",
-	[LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R",
-	[LOCK_ENABLED_RECLAIM_FS] =	"ov-reclaim-W",
-	[LOCK_ENABLED_RECLAIM_FS_READ] = "ov-reclaim-R",
+#define LOCKDEP_STATE(__STATE) __USAGE(__STATE)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+	[LOCK_USED] = "INITIAL USE",
 };
 
 const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
-- 
cgit v1.2.3


From 6a6904d3475cc5e4a506b5c3c3684444e22c4cc4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:07:44 +0100
Subject: lockdep: split up mark_lock_irq()

split mark_lock_irq() into 4 simple helper functions

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 372 ++++++++++++++++++++++---------------------------------
 1 file changed, 147 insertions(+), 225 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index d31f7f836a0..0b863c83a77 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2001,6 +2001,109 @@ static int reclaim_verbose(struct lock_class *class)
 
 #define STRICT_READ_CHECKS	1
 
+static int
+mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
+		      int new_bit, int excl_bit,
+		      const char *name, const char *rname,
+		      int (*verbose)(struct lock_class *class))
+{
+	if (!valid_state(curr, this, new_bit, excl_bit))
+		return 0;
+	if (!valid_state(curr, this, new_bit, excl_bit + 1))
+		return 0;
+	/*
+	 * just marked it hardirq-safe, check that this lock
+	 * took no hardirq-unsafe lock in the past:
+	 */
+	if (!check_usage_forwards(curr, this, excl_bit, name))
+		return 0;
+#if STRICT_READ_CHECKS
+	/*
+	 * just marked it hardirq-safe, check that this lock
+	 * took no hardirq-unsafe-read lock in the past:
+	 */
+	if (!check_usage_forwards(curr, this, excl_bit + 1, rname))
+		return 0;
+#endif
+	if (verbose(hlock_class(this)))
+		return 2;
+
+	return 1;
+}
+
+static int
+mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
+			   int new_bit, int excl_bit,
+			   const char *name, const char *rname,
+			   int (*verbose)(struct lock_class *class))
+{
+	if (!valid_state(curr, this, new_bit, excl_bit))
+		return 0;
+	/*
+	 * just marked it hardirq-read-safe, check that this lock
+	 * took no hardirq-unsafe lock in the past:
+	 */
+	if (!check_usage_forwards(curr, this, excl_bit, name))
+		return 0;
+	if (verbose(hlock_class(this)))
+		return 2;
+
+	return 1;
+}
+
+static int
+mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
+		      int new_bit, int excl_bit,
+		      const char *name, const char *rname,
+		      int (*verbose)(struct lock_class *class))
+{
+	if (!valid_state(curr, this, new_bit, excl_bit))
+		return 0;
+	if (!valid_state(curr, this, new_bit, excl_bit + 1))
+		return 0;
+	/*
+	 * just marked it hardirq-unsafe, check that no hardirq-safe
+	 * lock in the system ever took it in the past:
+	 */
+	if (!check_usage_backwards(curr, this, excl_bit, name))
+		return 0;
+#if STRICT_READ_CHECKS
+	/*
+	 * just marked it hardirq-unsafe, check that no
+	 * hardirq-safe-read lock in the system ever took
+	 * it in the past:
+	 */
+	if (!check_usage_backwards(curr, this, excl_bit + 1, rname))
+		return 0;
+#endif
+	if (verbose(hlock_class(this)))
+		return 2;
+
+	return 1;
+}
+
+static int
+mark_lock_irq_enabled_read(struct task_struct *curr, struct held_lock *this,
+			   int new_bit, int excl_bit,
+			   const char *name, const char *rname,
+			   int (*verbose)(struct lock_class *class))
+{
+	if (!valid_state(curr, this, new_bit, excl_bit))
+		return 0;
+#if STRICT_READ_CHECKS
+	/*
+	 * just marked it hardirq-read-unsafe, check that no
+	 * hardirq-safe lock in the system ever took it in the past:
+	 */
+	if (!check_usage_backwards(curr, this, excl_bit, name))
+		return 0;
+#endif
+	if (verbose(hlock_class(this)))
+		return 2;
+
+	return 1;
+}
+
 static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		enum lock_usage_bit new_bit)
 {
@@ -2008,242 +2111,61 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 
 	switch(new_bit) {
 	case LOCK_USED_IN_HARDIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQ))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_ENABLED_HARDIRQ_READ))
-			return 0;
-		/*
-		 * just marked it hardirq-safe, check that this lock
-		 * took no hardirq-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_HARDIRQ, "hard"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it hardirq-safe, check that this lock
-		 * took no hardirq-unsafe-read lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-				LOCK_ENABLED_HARDIRQ_READ, "hard-read"))
-			return 0;
-#endif
-		if (hardirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in(curr, this, new_bit,
+				LOCK_ENABLED_HARDIRQ,
+				"hard", "hard-read", hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQ))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_ENABLED_SOFTIRQ_READ))
-			return 0;
-		/*
-		 * just marked it softirq-safe, check that this lock
-		 * took no softirq-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_SOFTIRQ, "soft"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it softirq-safe, check that this lock
-		 * took no softirq-unsafe-read lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-				LOCK_ENABLED_SOFTIRQ_READ, "soft-read"))
-			return 0;
-#endif
-		if (softirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in(curr, this, new_bit,
+				LOCK_ENABLED_SOFTIRQ,
+				"soft", "soft-read", softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_RECLAIM_FS))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_ENABLED_RECLAIM_FS_READ))
-			return 0;
-		/*
-		 * just marked it reclaim-fs-safe, check that this lock
-		 * took no reclaim-fs-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it reclaim-fs-safe, check that this lock
-		 * took no reclaim-fs-unsafe-read lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-				LOCK_ENABLED_RECLAIM_FS_READ, "reclaim-fs-read"))
-			return 0;
-#endif
-		if (reclaim_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in(curr, this, new_bit,
+				LOCK_ENABLED_RECLAIM_FS,
+				"reclaim-fs", "reclaim-fs-read",
+				reclaim_verbose);
+
 	case LOCK_USED_IN_HARDIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQ))
-			return 0;
-		/*
-		 * just marked it hardirq-read-safe, check that this lock
-		 * took no hardirq-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_HARDIRQ, "hard"))
-			return 0;
-		if (hardirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in_read(curr, this, new_bit,
+				LOCK_ENABLED_HARDIRQ,
+				"hard", "hard-read", hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQ))
-			return 0;
-		/*
-		 * just marked it softirq-read-safe, check that this lock
-		 * took no softirq-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_SOFTIRQ, "soft"))
-			return 0;
-		if (softirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in_read(curr, this, new_bit,
+				LOCK_ENABLED_SOFTIRQ,
+				"soft", "soft-read", softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_RECLAIM_FS))
-			return 0;
-		/*
-		 * just marked it reclaim-fs-read-safe, check that this lock
-		 * took no reclaim-fs-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
-			return 0;
-		if (reclaim_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in_read(curr, this, new_bit,
+				LOCK_ENABLED_RECLAIM_FS,
+				"reclaim-fs", "reclaim-fs-read",
+				reclaim_verbose);
+
 	case LOCK_ENABLED_HARDIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_USED_IN_HARDIRQ_READ))
-			return 0;
-		/*
-		 * just marked it hardirq-unsafe, check that no hardirq-safe
-		 * lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_HARDIRQ, "hard"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it hardirq-unsafe, check that no
-		 * hardirq-safe-read lock in the system ever took
-		 * it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-				   LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
-			return 0;
-#endif
-		if (hardirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled(curr, this, new_bit,
+				LOCK_USED_IN_HARDIRQ,
+				"hard", "hard-read", hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_USED_IN_SOFTIRQ_READ))
-			return 0;
-		/*
-		 * just marked it softirq-unsafe, check that no softirq-safe
-		 * lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_SOFTIRQ, "soft"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it softirq-unsafe, check that no
-		 * softirq-safe-read lock in the system ever took
-		 * it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-				   LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
-			return 0;
-#endif
-		if (softirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled(curr, this, new_bit,
+				LOCK_USED_IN_SOFTIRQ,
+				"soft", "soft-read", softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_USED_IN_RECLAIM_FS_READ))
-			return 0;
-		/*
-		 * just marked it reclaim-fs-unsafe, check that no reclaim-fs-safe
-		 * lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_RECLAIM_FS, "reclaim-fs"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it softirq-unsafe, check that no
-		 * softirq-safe-read lock in the system ever took
-		 * it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-				   LOCK_USED_IN_RECLAIM_FS_READ, "reclaim-fs-read"))
-			return 0;
-#endif
-		if (reclaim_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled(curr, this, new_bit,
+				LOCK_USED_IN_RECLAIM_FS,
+				"reclaim-fs", "reclaim-fs-read",
+				reclaim_verbose);
+
 	case LOCK_ENABLED_HARDIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it hardirq-read-unsafe, check that no
-		 * hardirq-safe lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_HARDIRQ, "hard"))
-			return 0;
-#endif
-		if (hardirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled_read(curr, this, new_bit,
+				LOCK_USED_IN_HARDIRQ,
+				"hard", "hard-read", hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it softirq-read-unsafe, check that no
-		 * softirq-safe lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_SOFTIRQ, "soft"))
-			return 0;
-#endif
-		if (softirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled_read(curr, this, new_bit,
+				LOCK_USED_IN_SOFTIRQ,
+				"soft", "soft-read", softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it reclaim-fs-read-unsafe, check that no
-		 * reclaim-fs-safe lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_RECLAIM_FS, "reclaim-fs"))
-			return 0;
-#endif
-		if (reclaim_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled_read(curr, this, new_bit,
+				LOCK_USED_IN_RECLAIM_FS,
+				"reclaim-fs", "reclaim-fs-read",
+				reclaim_verbose);
+
 	default:
 		WARN_ON(1);
 		break;
-- 
cgit v1.2.3


From 604de3b5b63ebc33a762c44d9c742f235b010346 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:24:44 +0100
Subject: lockdep: simplify the mark_lock_irq() helpers

In order to unify them, take some arguments away

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 60 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 16 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0b863c83a77..989a60baf97 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2001,12 +2001,38 @@ static int reclaim_verbose(struct lock_class *class)
 
 #define STRICT_READ_CHECKS	1
 
+static const char *state_names[] = {
+#define LOCKDEP_STATE(__STATE) \
+	STR(__STATE),
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline const char *state_name(enum lock_usage_bit bit)
+{
+	return state_names[bit >> 2];
+}
+
+static const char *state_rnames[] = {
+#define LOCKDEP_STATE(__STATE) \
+	STR(__STATE)"-READ",
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline const char *state_rname(enum lock_usage_bit bit)
+{
+	return state_rnames[bit >> 2];
+}
+
 static int
 mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 		      int new_bit, int excl_bit,
-		      const char *name, const char *rname,
 		      int (*verbose)(struct lock_class *class))
 {
+	const char *name = state_name(new_bit);
+	const char *rname = state_rname(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	if (!valid_state(curr, this, new_bit, excl_bit + 1))
@@ -2034,9 +2060,11 @@ mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 static int
 mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
 			   int new_bit, int excl_bit,
-			   const char *name, const char *rname,
 			   int (*verbose)(struct lock_class *class))
 {
+	const char *name = state_name(new_bit);
+	const char *rname = state_rname(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	/*
@@ -2054,9 +2082,11 @@ mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
 static int
 mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 		      int new_bit, int excl_bit,
-		      const char *name, const char *rname,
 		      int (*verbose)(struct lock_class *class))
 {
+	const char *name = state_name(new_bit);
+	const char *rname = state_rname(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	if (!valid_state(curr, this, new_bit, excl_bit + 1))
@@ -2085,9 +2115,11 @@ mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 static int
 mark_lock_irq_enabled_read(struct task_struct *curr, struct held_lock *this,
 			   int new_bit, int excl_bit,
-			   const char *name, const char *rname,
 			   int (*verbose)(struct lock_class *class))
 {
+	const char *name = state_name(new_bit);
+	const char *rname = state_rname(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 #if STRICT_READ_CHECKS
@@ -2113,57 +2145,53 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	case LOCK_USED_IN_HARDIRQ:
 		return mark_lock_irq_used_in(curr, this, new_bit,
 				LOCK_ENABLED_HARDIRQ,
-				"hard", "hard-read", hardirq_verbose);
+				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ:
 		return mark_lock_irq_used_in(curr, this, new_bit,
 				LOCK_ENABLED_SOFTIRQ,
-				"soft", "soft-read", softirq_verbose);
+				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS:
 		return mark_lock_irq_used_in(curr, this, new_bit,
 				LOCK_ENABLED_RECLAIM_FS,
-				"reclaim-fs", "reclaim-fs-read",
 				reclaim_verbose);
 
 	case LOCK_USED_IN_HARDIRQ_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
 				LOCK_ENABLED_HARDIRQ,
-				"hard", "hard-read", hardirq_verbose);
+				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
 				LOCK_ENABLED_SOFTIRQ,
-				"soft", "soft-read", softirq_verbose);
+				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
 				LOCK_ENABLED_RECLAIM_FS,
-				"reclaim-fs", "reclaim-fs-read",
 				reclaim_verbose);
 
 	case LOCK_ENABLED_HARDIRQ:
 		return mark_lock_irq_enabled(curr, this, new_bit,
 				LOCK_USED_IN_HARDIRQ,
-				"hard", "hard-read", hardirq_verbose);
+				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ:
 		return mark_lock_irq_enabled(curr, this, new_bit,
 				LOCK_USED_IN_SOFTIRQ,
-				"soft", "soft-read", softirq_verbose);
+				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS:
 		return mark_lock_irq_enabled(curr, this, new_bit,
 				LOCK_USED_IN_RECLAIM_FS,
-				"reclaim-fs", "reclaim-fs-read",
 				reclaim_verbose);
 
 	case LOCK_ENABLED_HARDIRQ_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
 				LOCK_USED_IN_HARDIRQ,
-				"hard", "hard-read", hardirq_verbose);
+				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
 				LOCK_USED_IN_SOFTIRQ,
-				"soft", "soft-read", softirq_verbose);
+				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
 				LOCK_USED_IN_RECLAIM_FS,
-				"reclaim-fs", "reclaim-fs-read",
 				reclaim_verbose);
 
 	default:
-- 
cgit v1.2.3


From f989209e2f604730888a6daa3b3ff30ed0c9d7c0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:09:59 +0100
Subject: lockdep: further simplify mark_lock_irq() helpers

take away another parameter

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 47 +++++++++++++++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 989a60baf97..306d0b823bd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2025,14 +2025,35 @@ static inline const char *state_rname(enum lock_usage_bit bit)
 	return state_rnames[bit >> 2];
 }
 
+static int exclusive_bit(int new_bit)
+{
+	/*
+	 * USED_IN
+	 * USED_IN_READ
+	 * ENABLED
+	 * ENABLED_READ
+	 *
+	 * bit 0 - write/read
+	 * bit 1 - used_in/enabled
+	 * bit 2+  state
+	 */
+
+	int state = new_bit & ~3;
+	int dir = new_bit & 2;
+
+	return state | (dir ^ 2);
+}
+
 static int
 mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
-		      int new_bit, int excl_bit,
+		      int new_bit,
 		      int (*verbose)(struct lock_class *class))
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
+	int excl_bit = exclusive_bit(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	if (!valid_state(curr, this, new_bit, excl_bit + 1))
@@ -2059,12 +2080,14 @@ mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit, int excl_bit,
+			   int new_bit,
 			   int (*verbose)(struct lock_class *class))
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
+	int excl_bit = exclusive_bit(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	/*
@@ -2081,12 +2104,14 @@ mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
-		      int new_bit, int excl_bit,
+		      int new_bit,
 		      int (*verbose)(struct lock_class *class))
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
+	int excl_bit = exclusive_bit(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	if (!valid_state(curr, this, new_bit, excl_bit + 1))
@@ -2114,12 +2139,14 @@ mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_enabled_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit, int excl_bit,
+			   int new_bit,
 			   int (*verbose)(struct lock_class *class))
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
+	int excl_bit = exclusive_bit(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 #if STRICT_READ_CHECKS
@@ -2144,54 +2171,42 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	switch(new_bit) {
 	case LOCK_USED_IN_HARDIRQ:
 		return mark_lock_irq_used_in(curr, this, new_bit,
-				LOCK_ENABLED_HARDIRQ,
 				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ:
 		return mark_lock_irq_used_in(curr, this, new_bit,
-				LOCK_ENABLED_SOFTIRQ,
 				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS:
 		return mark_lock_irq_used_in(curr, this, new_bit,
-				LOCK_ENABLED_RECLAIM_FS,
 				reclaim_verbose);
 
 	case LOCK_USED_IN_HARDIRQ_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				LOCK_ENABLED_HARDIRQ,
 				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				LOCK_ENABLED_SOFTIRQ,
 				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				LOCK_ENABLED_RECLAIM_FS,
 				reclaim_verbose);
 
 	case LOCK_ENABLED_HARDIRQ:
 		return mark_lock_irq_enabled(curr, this, new_bit,
-				LOCK_USED_IN_HARDIRQ,
 				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ:
 		return mark_lock_irq_enabled(curr, this, new_bit,
-				LOCK_USED_IN_SOFTIRQ,
 				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS:
 		return mark_lock_irq_enabled(curr, this, new_bit,
-				LOCK_USED_IN_RECLAIM_FS,
 				reclaim_verbose);
 
 	case LOCK_ENABLED_HARDIRQ_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				LOCK_USED_IN_HARDIRQ,
 				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				LOCK_USED_IN_SOFTIRQ,
 				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				LOCK_USED_IN_RECLAIM_FS,
 				reclaim_verbose);
 
 	default:
-- 
cgit v1.2.3


From cd95302d255264c5e6ebe1063320d80517bf2f83 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:38:21 +0100
Subject: lockdep: simplify mark_lock_irq() helpers #3

Kill another argument

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 65 +++++++++++++++++++++++---------------------------------
 1 file changed, 27 insertions(+), 38 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 306d0b823bd..e0a027d58da 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1975,7 +1975,7 @@ void print_irqtrace_events(struct task_struct *curr)
 	print_ip_sym(curr->softirq_disable_ip);
 }
 
-static int hardirq_verbose(struct lock_class *class)
+static int HARDIRQ_verbose(struct lock_class *class)
 {
 #if HARDIRQ_VERBOSE
 	return class_filter(class);
@@ -1983,7 +1983,7 @@ static int hardirq_verbose(struct lock_class *class)
 	return 0;
 }
 
-static int softirq_verbose(struct lock_class *class)
+static int SOFTIRQ_verbose(struct lock_class *class)
 {
 #if SOFTIRQ_VERBOSE
 	return class_filter(class);
@@ -1991,7 +1991,7 @@ static int softirq_verbose(struct lock_class *class)
 	return 0;
 }
 
-static int reclaim_verbose(struct lock_class *class)
+static int RECLAIM_FS_verbose(struct lock_class *class)
 {
 #if RECLAIM_VERBOSE
 	return class_filter(class);
@@ -2025,6 +2025,19 @@ static inline const char *state_rname(enum lock_usage_bit bit)
 	return state_rnames[bit >> 2];
 }
 
+static int (*state_verbose_f[])(struct lock_class *class) = {
+#define LOCKDEP_STATE(__STATE) \
+	__STATE##_verbose,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline int state_verbose(enum lock_usage_bit bit,
+				struct lock_class *class)
+{
+	return state_verbose_f[bit >> 2](class);
+}
+
 static int exclusive_bit(int new_bit)
 {
 	/*
@@ -2046,8 +2059,7 @@ static int exclusive_bit(int new_bit)
 
 static int
 mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
-		      int new_bit,
-		      int (*verbose)(struct lock_class *class))
+		      int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
@@ -2072,7 +2084,7 @@ mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 	if (!check_usage_forwards(curr, this, excl_bit + 1, rname))
 		return 0;
 #endif
-	if (verbose(hlock_class(this)))
+	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
 
 	return 1;
@@ -2080,8 +2092,7 @@ mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit,
-			   int (*verbose)(struct lock_class *class))
+			   int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
@@ -2096,7 +2107,7 @@ mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
 	 */
 	if (!check_usage_forwards(curr, this, excl_bit, name))
 		return 0;
-	if (verbose(hlock_class(this)))
+	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
 
 	return 1;
@@ -2104,8 +2115,7 @@ mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
-		      int new_bit,
-		      int (*verbose)(struct lock_class *class))
+		      int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
@@ -2131,7 +2141,7 @@ mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 	if (!check_usage_backwards(curr, this, excl_bit + 1, rname))
 		return 0;
 #endif
-	if (verbose(hlock_class(this)))
+	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
 
 	return 1;
@@ -2139,8 +2149,7 @@ mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_enabled_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit,
-			   int (*verbose)(struct lock_class *class))
+			   int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
@@ -2170,44 +2179,24 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 
 	switch(new_bit) {
 	case LOCK_USED_IN_HARDIRQ:
-		return mark_lock_irq_used_in(curr, this, new_bit,
-				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ:
-		return mark_lock_irq_used_in(curr, this, new_bit,
-				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS:
-		return mark_lock_irq_used_in(curr, this, new_bit,
-				reclaim_verbose);
+		return mark_lock_irq_used_in(curr, this, new_bit);
 
 	case LOCK_USED_IN_HARDIRQ_READ:
-		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ_READ:
-		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS_READ:
-		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				reclaim_verbose);
+		return mark_lock_irq_used_in_read(curr, this, new_bit);
 
 	case LOCK_ENABLED_HARDIRQ:
-		return mark_lock_irq_enabled(curr, this, new_bit,
-				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ:
-		return mark_lock_irq_enabled(curr, this, new_bit,
-				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS:
-		return mark_lock_irq_enabled(curr, this, new_bit,
-				reclaim_verbose);
+		return mark_lock_irq_enabled(curr, this, new_bit);
 
 	case LOCK_ENABLED_HARDIRQ_READ:
-		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ_READ:
-		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS_READ:
-		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				reclaim_verbose);
+		return mark_lock_irq_enabled_read(curr, this, new_bit);
 
 	default:
 		WARN_ON(1);
-- 
cgit v1.2.3


From 780e820b2dfefdfead9243724c2d2b73f379fda6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:51:29 +0100
Subject: lockdep: merge the _READ mark_lock_irq() helpers

The _READ helpers show remarkable similarity, merge them.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 61 +++++++++++++++++++++-----------------------------------
 1 file changed, 23 insertions(+), 38 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e0a027d58da..2d95f9db259 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2091,22 +2091,34 @@ mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 }
 
 static int
-mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
+mark_lock_irq_read(struct task_struct *curr, struct held_lock *this,
 			   int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
 	int excl_bit = exclusive_bit(new_bit);
+	int dir = new_bit & 2;
 
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
-	/*
-	 * just marked it hardirq-read-safe, check that this lock
-	 * took no hardirq-unsafe lock in the past:
-	 */
-	if (!check_usage_forwards(curr, this, excl_bit, name))
-		return 0;
+
+	if (!dir) {
+		/*
+		 * just marked it hardirq-read-safe, check that this lock
+		 * took no hardirq-unsafe lock in the past:
+		 */
+		if (!check_usage_forwards(curr, this, excl_bit, name))
+			return 0;
+	} else if (STRICT_READ_CHECKS) {
+		/*
+		 * just marked it hardirq-read-unsafe, check that no
+		 * hardirq-safe lock in the system ever took it in the past:
+		 */
+		if (!check_usage_backwards(curr, this, excl_bit, name))
+			return 0;
+	}
+
 	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
 
@@ -2147,31 +2159,6 @@ mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 	return 1;
 }
 
-static int
-mark_lock_irq_enabled_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit)
-{
-	const char *name = state_name(new_bit);
-	const char *rname = state_rname(new_bit);
-
-	int excl_bit = exclusive_bit(new_bit);
-
-	if (!valid_state(curr, this, new_bit, excl_bit))
-		return 0;
-#if STRICT_READ_CHECKS
-	/*
-	 * just marked it hardirq-read-unsafe, check that no
-	 * hardirq-safe lock in the system ever took it in the past:
-	 */
-	if (!check_usage_backwards(curr, this, excl_bit, name))
-		return 0;
-#endif
-	if (verbose(hlock_class(this)))
-		return 2;
-
-	return 1;
-}
-
 static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		enum lock_usage_bit new_bit)
 {
@@ -2186,18 +2173,16 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	case LOCK_USED_IN_HARDIRQ_READ:
 	case LOCK_USED_IN_SOFTIRQ_READ:
 	case LOCK_USED_IN_RECLAIM_FS_READ:
-		return mark_lock_irq_used_in_read(curr, this, new_bit);
+	case LOCK_ENABLED_HARDIRQ_READ:
+	case LOCK_ENABLED_SOFTIRQ_READ:
+	case LOCK_ENABLED_RECLAIM_FS_READ:
+		return mark_lock_irq_read(curr, this, new_bit);
 
 	case LOCK_ENABLED_HARDIRQ:
 	case LOCK_ENABLED_SOFTIRQ:
 	case LOCK_ENABLED_RECLAIM_FS:
 		return mark_lock_irq_enabled(curr, this, new_bit);
 
-	case LOCK_ENABLED_HARDIRQ_READ:
-	case LOCK_ENABLED_SOFTIRQ_READ:
-	case LOCK_ENABLED_RECLAIM_FS_READ:
-		return mark_lock_irq_enabled_read(curr, this, new_bit);
-
 	default:
 		WARN_ON(1);
 		break;
-- 
cgit v1.2.3


From 42c50d544e009cd9b1a8e74d7c5ce8d03ca917ad Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:58:16 +0100
Subject: lockdep: merge the !_READ mark_lock_irq() helpers

These two are also remakably similar

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 58 +++++++++++++++-----------------------------------------
 1 file changed, 15 insertions(+), 43 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 2d95f9db259..1ba52e68355 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2057,31 +2057,39 @@ static int exclusive_bit(int new_bit)
 	return state | (dir ^ 2);
 }
 
+typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
+			     enum lock_usage_bit bit, const char *name);
+
 static int
-mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
+mark_lock_irq_write(struct task_struct *curr, struct held_lock *this,
 		      int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
 	int excl_bit = exclusive_bit(new_bit);
+	int dir = new_bit & 2;
+
+	check_usage_f usage = dir ?
+		check_usage_backwards : check_usage_forwards;
 
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	if (!valid_state(curr, this, new_bit, excl_bit + 1))
 		return 0;
+
 	/*
 	 * just marked it hardirq-safe, check that this lock
 	 * took no hardirq-unsafe lock in the past:
 	 */
-	if (!check_usage_forwards(curr, this, excl_bit, name))
+	if (!usage(curr, this, excl_bit, name))
 		return 0;
 #if STRICT_READ_CHECKS
 	/*
 	 * just marked it hardirq-safe, check that this lock
 	 * took no hardirq-unsafe-read lock in the past:
 	 */
-	if (!check_usage_forwards(curr, this, excl_bit + 1, rname))
+	if (!usage(curr, this, excl_bit + 1, rname))
 		return 0;
 #endif
 	if (state_verbose(new_bit, hlock_class(this)))
@@ -2125,40 +2133,6 @@ mark_lock_irq_read(struct task_struct *curr, struct held_lock *this,
 	return 1;
 }
 
-static int
-mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
-		      int new_bit)
-{
-	const char *name = state_name(new_bit);
-	const char *rname = state_rname(new_bit);
-
-	int excl_bit = exclusive_bit(new_bit);
-
-	if (!valid_state(curr, this, new_bit, excl_bit))
-		return 0;
-	if (!valid_state(curr, this, new_bit, excl_bit + 1))
-		return 0;
-	/*
-	 * just marked it hardirq-unsafe, check that no hardirq-safe
-	 * lock in the system ever took it in the past:
-	 */
-	if (!check_usage_backwards(curr, this, excl_bit, name))
-		return 0;
-#if STRICT_READ_CHECKS
-	/*
-	 * just marked it hardirq-unsafe, check that no
-	 * hardirq-safe-read lock in the system ever took
-	 * it in the past:
-	 */
-	if (!check_usage_backwards(curr, this, excl_bit + 1, rname))
-		return 0;
-#endif
-	if (state_verbose(new_bit, hlock_class(this)))
-		return 2;
-
-	return 1;
-}
-
 static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		enum lock_usage_bit new_bit)
 {
@@ -2168,7 +2142,10 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	case LOCK_USED_IN_HARDIRQ:
 	case LOCK_USED_IN_SOFTIRQ:
 	case LOCK_USED_IN_RECLAIM_FS:
-		return mark_lock_irq_used_in(curr, this, new_bit);
+	case LOCK_ENABLED_HARDIRQ:
+	case LOCK_ENABLED_SOFTIRQ:
+	case LOCK_ENABLED_RECLAIM_FS:
+		return mark_lock_irq_write(curr, this, new_bit);
 
 	case LOCK_USED_IN_HARDIRQ_READ:
 	case LOCK_USED_IN_SOFTIRQ_READ:
@@ -2178,11 +2155,6 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	case LOCK_ENABLED_RECLAIM_FS_READ:
 		return mark_lock_irq_read(curr, this, new_bit);
 
-	case LOCK_ENABLED_HARDIRQ:
-	case LOCK_ENABLED_SOFTIRQ:
-	case LOCK_ENABLED_RECLAIM_FS:
-		return mark_lock_irq_enabled(curr, this, new_bit);
-
 	default:
 		WARN_ON(1);
 		break;
-- 
cgit v1.2.3


From 9d3651a23dc1f7ed7d207f9118459d3a73d485a7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 17:18:32 +0100
Subject: lockdep: fully reduce mark_lock_irq()

Now what its only two functions, they again look rather similar.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 86 ++++++--------------------------------------------------
 1 file changed, 8 insertions(+), 78 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1ba52e68355..000d53a2da3 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2061,13 +2061,13 @@ typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
 			     enum lock_usage_bit bit, const char *name);
 
 static int
-mark_lock_irq_write(struct task_struct *curr, struct held_lock *this,
-		      int new_bit)
+mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
 	int excl_bit = exclusive_bit(new_bit);
+	int read = new_bit & 1;
 	int dir = new_bit & 2;
 
 	check_usage_f usage = dir ?
@@ -2075,57 +2075,17 @@ mark_lock_irq_write(struct task_struct *curr, struct held_lock *this,
 
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
-	if (!valid_state(curr, this, new_bit, excl_bit + 1))
-		return 0;
 
-	/*
-	 * just marked it hardirq-safe, check that this lock
-	 * took no hardirq-unsafe lock in the past:
-	 */
-	if (!usage(curr, this, excl_bit, name))
-		return 0;
-#if STRICT_READ_CHECKS
-	/*
-	 * just marked it hardirq-safe, check that this lock
-	 * took no hardirq-unsafe-read lock in the past:
-	 */
-	if (!usage(curr, this, excl_bit + 1, rname))
+	if (!read && !valid_state(curr, this, new_bit, excl_bit + 1))
 		return 0;
-#endif
-	if (state_verbose(new_bit, hlock_class(this)))
-		return 2;
-
-	return 1;
-}
-
-static int
-mark_lock_irq_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit)
-{
-	const char *name = state_name(new_bit);
-	const char *rname = state_rname(new_bit);
-
-	int excl_bit = exclusive_bit(new_bit);
-	int dir = new_bit & 2;
 
-	if (!valid_state(curr, this, new_bit, excl_bit))
+	if ((!read || (!dir || STRICT_READ_CHECKS)) &&
+			!usage(curr, this, excl_bit, name))
 		return 0;
 
-	if (!dir) {
-		/*
-		 * just marked it hardirq-read-safe, check that this lock
-		 * took no hardirq-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this, excl_bit, name))
-			return 0;
-	} else if (STRICT_READ_CHECKS) {
-		/*
-		 * just marked it hardirq-read-unsafe, check that no
-		 * hardirq-safe lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this, excl_bit, name))
-			return 0;
-	}
+	if ((!read && STRICT_READ_CHECKS) &&
+			!usage(curr, this, excl_bit + 1, rname))
+		return 0;
 
 	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
@@ -2133,36 +2093,6 @@ mark_lock_irq_read(struct task_struct *curr, struct held_lock *this,
 	return 1;
 }
 
-static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
-		enum lock_usage_bit new_bit)
-{
-	int ret = 1;
-
-	switch(new_bit) {
-	case LOCK_USED_IN_HARDIRQ:
-	case LOCK_USED_IN_SOFTIRQ:
-	case LOCK_USED_IN_RECLAIM_FS:
-	case LOCK_ENABLED_HARDIRQ:
-	case LOCK_ENABLED_SOFTIRQ:
-	case LOCK_ENABLED_RECLAIM_FS:
-		return mark_lock_irq_write(curr, this, new_bit);
-
-	case LOCK_USED_IN_HARDIRQ_READ:
-	case LOCK_USED_IN_SOFTIRQ_READ:
-	case LOCK_USED_IN_RECLAIM_FS_READ:
-	case LOCK_ENABLED_HARDIRQ_READ:
-	case LOCK_ENABLED_SOFTIRQ_READ:
-	case LOCK_ENABLED_RECLAIM_FS_READ:
-		return mark_lock_irq_read(curr, this, new_bit);
-
-	default:
-		WARN_ON(1);
-		break;
-	}
-
-	return ret;
-}
-
 enum mark_type {
 #define LOCKDEP_STATE(__STATE)	__STATE,
 #include "lockdep_states.h"
-- 
cgit v1.2.3


From cf2ad4d13c4ac6366c730fcf6c6be00db12fb75f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 27 Jan 2009 13:58:08 +0100
Subject: lockdep: remove macro usage from mark_held_locks()

Now that we have nice numerical relations for the states, remove the macro
magics.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 000d53a2da3..f40d916c191 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2099,14 +2099,6 @@ enum mark_type {
 #undef LOCKDEP_STATE
 };
 
-#define MARK_HELD_CASE(__STATE)						\
-	case __STATE:							\
-		if (hlock->read)					\
-			usage_bit = LOCK_ENABLED_##__STATE##_READ;	\
-		else							\
-			usage_bit = LOCK_ENABLED_##__STATE;		\
-		break;
-
 /*
  * Mark all held locks with a usage bit:
  */
@@ -2120,13 +2112,11 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		hlock = curr->held_locks + i;
 
-		switch (mark) {
-#define LOCKDEP_STATE(__STATE) MARK_HELD_CASE(__STATE)
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
-		default:
-			BUG();
-		}
+		usage_bit = 2 + (mark << 2); /* ENABLED */
+		if (hlock->read)
+			usage_bit += 1; /* READ */
+
+		BUG_ON(usage_bit >= LOCK_USAGE_STATES);
 
 		if (!mark_lock(curr, hlock, usage_bit))
 			return 0;
-- 
cgit v1.2.3


From 38aa2714382d886f77f2565277fce293122808b0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 27 Jan 2009 14:53:50 +0100
Subject: lockdep: add comments to mark_lock_irq()

re-add some of the comments that got lost in the refactoring.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f40d916c191..02e6e066d56 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2054,6 +2054,9 @@ static int exclusive_bit(int new_bit)
 	int state = new_bit & ~3;
 	int dir = new_bit & 2;
 
+	/*
+	 * keep state, bit flip the direction and strip read.
+	 */
 	return state | (dir ^ 2);
 }
 
@@ -2070,22 +2073,42 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 	int read = new_bit & 1;
 	int dir = new_bit & 2;
 
+	/*
+	 * mark USED_IN has to look forwards -- to ensure no dependency
+	 * has ENABLED state, which would allow recursion deadlocks.
+	 *
+	 * mark ENABLED has to look backwards -- to ensure no dependee
+	 * has USED_IN state, which, again, would allow  recursion deadlocks.
+	 */
 	check_usage_f usage = dir ?
 		check_usage_backwards : check_usage_forwards;
 
+	/*
+	 * Validate that this particular lock does not have conflicting
+	 * usage states.
+	 */
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 
-	if (!read && !valid_state(curr, this, new_bit, excl_bit + 1))
-		return 0;
-
-	if ((!read || (!dir || STRICT_READ_CHECKS)) &&
+	/*
+	 * Validate that the lock dependencies don't have conflicting usage
+	 * states.
+	 */
+	if ((!read || !dir || STRICT_READ_CHECKS) &&
 			!usage(curr, this, excl_bit, name))
 		return 0;
 
-	if ((!read && STRICT_READ_CHECKS) &&
-			!usage(curr, this, excl_bit + 1, rname))
-		return 0;
+	/*
+	 * Check for read in write conflicts
+	 */
+	if (!read) {
+		if (!valid_state(curr, this, new_bit, excl_bit + 1))
+			return 0;
+
+		if (STRICT_READ_CHECKS &&
+				!usage(curr, this, excl_bit + 1, rname))
+			return 0;
+	}
 
 	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
-- 
cgit v1.2.3


From 3ff176ca47911630d1555f150d36daa2d0819ea9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 17:40:42 +0100
Subject: lockdep: simplify get_user_chars()

there's too much repetition of code..

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 65 +++++++++++++++++++++-----------------------------------
 1 file changed, 24 insertions(+), 41 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 02e6e066d56..1b4ee3c0b78 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -467,54 +467,37 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
 	return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
 }
 
-void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
-					char *c4, char *c5, char *c6)
+static inline unsigned long lock_flag(enum lock_usage_bit bit)
 {
-	*c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.', *c5 = '.', *c6 = '.';
-
-	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
-		*c1 = '+';
-	else
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
-			*c1 = '-';
-
-	if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
-		*c2 = '+';
-	else
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
-			*c2 = '-';
+	return 1UL << bit;
+}
 
-	if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
-		*c3 = '-';
-	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) {
-		*c3 = '+';
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
-			*c3 = '?';
-	}
+static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
+{
+	char c = '.';
 
-	if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
-		*c4 = '-';
-	if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) {
-		*c4 = '+';
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
-			*c4 = '?';
+	if (class->usage_mask & lock_flag(bit + 2))
+		c = '+';
+	if (class->usage_mask & lock_flag(bit)) {
+		c = '-';
+		if (class->usage_mask & lock_flag(bit + 2))
+			c = '?';
 	}
 
-	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS)
-		*c5 = '+';
-	else
-		if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS)
-			*c5 = '-';
+	return c;
+}
 
-	if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS_READ)
-		*c6 = '-';
-	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS_READ) {
-		*c6 = '+';
-		if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS_READ)
-			*c6 = '?';
-	}
+void
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
+					char *c4, char *c5, char *c6)
+{
+	*c1 = get_usage_char(class, LOCK_USED_IN_HARDIRQ);
+	*c2 = get_usage_char(class, LOCK_USED_IN_SOFTITQ);
+	*c3 = get_usage_char(class, LOCK_USED_IN_HARDIRQ_READ);
+	*c4 = get_usage_char(class, LOCK_USED_IN_SOFTITQ_READ);
 
+	*c5 = get_usage_char(class, LOCK_USED_IN_RECLAIM_FS);
+	*c6 = get_usage_char(class, LOCK_USED_IN_RECLAIM_FS_READ);
 }
 
 static void print_lock_name(struct lock_class *class)
-- 
cgit v1.2.3


From f510b233cfc7bfd57b6007071c52aa42e3d16b06 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 17:53:47 +0100
Subject: lockdep: get_user_chars() redo

Generic, states independent, get_user_chars().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/lockdep-design.txt | 30 +++++++++++++++++-------------
 kernel/lockdep.c                 | 24 ++++++++++++------------
 kernel/lockdep_internals.h       |  7 ++++---
 kernel/lockdep_proc.c            |  6 +++---
 4 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt
index 48877301815..938ea22f2cc 100644
--- a/Documentation/lockdep-design.txt
+++ b/Documentation/lockdep-design.txt
@@ -27,33 +27,37 @@ lock-class.
 State
 -----
 
-The validator tracks lock-class usage history into 5 separate state bits:
+The validator tracks lock-class usage history into 4n + 1 separate state bits:
 
-- 'ever held in hardirq context'                    [ == hardirq-safe   ]
-- 'ever held in softirq context'                    [ == softirq-safe   ]
-- 'ever held with hardirqs enabled'                 [ == hardirq-unsafe ]
-- 'ever held with softirqs and hardirqs enabled'    [ == softirq-unsafe ]
+- 'ever held in STATE context'
+- 'ever head as readlock in STATE context'
+- 'ever head with STATE enabled'
+- 'ever head as readlock with STATE enabled'
+
+Where STATE can be either one of (kernel/lockdep_states.h)
+ - hardirq
+ - softirq
+ - reclaim_fs
 
 - 'ever used'                                       [ == !unused        ]
 
-When locking rules are violated, these 4 state bits are presented in the
-locking error messages, inside curlies.  A contrived example:
+When locking rules are violated, these state bits are presented in the
+locking error messages, inside curlies. A contrived example:
 
    modprobe/2287 is trying to acquire lock:
-    (&sio_locks[i].lock){--..}, at: [<c02867fd>] mutex_lock+0x21/0x24
+    (&sio_locks[i].lock){-.-...}, at: [<c02867fd>] mutex_lock+0x21/0x24
 
    but task is already holding lock:
-    (&sio_locks[i].lock){--..}, at: [<c02867fd>] mutex_lock+0x21/0x24
+    (&sio_locks[i].lock){-.-...}, at: [<c02867fd>] mutex_lock+0x21/0x24
 
 
-The bit position indicates hardirq, softirq, hardirq-read,
-softirq-read respectively, and the character displayed in each
-indicates:
+The bit position indicates STATE, STATE-read, for each of the states listed
+above, and the character displayed in each indicates:
 
    '.'  acquired while irqs disabled
    '+'  acquired in irq context
    '-'  acquired with irqs enabled
-   '?' read acquired in irq context with irqs enabled.
+   '?'  acquired in irq context with irqs enabled.
 
 Unused mutexes cannot be part of the cause of an error.
 
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1b4ee3c0b78..22ced8d4912 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -487,25 +487,25 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
 	return c;
 }
 
-void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
-					char *c4, char *c5, char *c6)
+void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
 {
-	*c1 = get_usage_char(class, LOCK_USED_IN_HARDIRQ);
-	*c2 = get_usage_char(class, LOCK_USED_IN_SOFTITQ);
-	*c3 = get_usage_char(class, LOCK_USED_IN_HARDIRQ_READ);
-	*c4 = get_usage_char(class, LOCK_USED_IN_SOFTITQ_READ);
+	int i = 0;
 
-	*c5 = get_usage_char(class, LOCK_USED_IN_RECLAIM_FS);
-	*c6 = get_usage_char(class, LOCK_USED_IN_RECLAIM_FS_READ);
+#define LOCKDEP_STATE(__STATE) 						\
+	usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE);	\
+	usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ);
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+
+	usage[i] = '\0';
 }
 
 static void print_lock_name(struct lock_class *class)
 {
-	char str[KSYM_NAME_LEN], c1, c2, c3, c4, c5, c6;
+	char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
 	const char *name;
 
-	get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6);
+	get_usage_chars(class, usage);
 
 	name = class->name;
 	if (!name) {
@@ -518,7 +518,7 @@ static void print_lock_name(struct lock_class *class)
 		if (class->subclass)
 			printk("/%d", class->subclass);
 	}
-	printk("){%c%c%c%c%c%c}", c1, c2, c3, c4, c5, c6);
+	printk("){%s}", usage);
 }
 
 static void print_lockdep_cache(struct lockdep_map *lock)
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 7e653e66ce5..a2cc7e9a6e8 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -70,9 +70,10 @@ enum {
 extern struct list_head all_lock_classes;
 extern struct lock_chain lock_chains[];
 
-extern void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
-					char *c4, char *c5, char *c6);
+#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
+
+extern void get_usage_chars(struct lock_class *class,
+			    char usage[LOCK_USAGE_CHARS]);
 
 extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
 
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index bd474fd9df9..b51064ce564 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v)
 {
 	struct lock_class *class = v;
 	struct lock_list *entry;
-	char c1, c2, c3, c4, c5, c6;
+	char usage[LOCK_USAGE_CHARS];
 
 	if (v == SEQ_START_TOKEN) {
 		seq_printf(m, "all lock classes:\n");
@@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v)
 	seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
 #endif
 
-	get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6);
-	seq_printf(m, " %c%c%c%c%c%c", c1, c2, c3, c4, c5, c6);
+	get_usage_chars(class, usage);
+	seq_printf(m, " %s", usage);
 
 	seq_printf(m, ": ");
 	print_name(m, class);
-- 
cgit v1.2.3


From 4f367d8adca947bed4385740a13d1efb1a06fba1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 18:10:42 +0100
Subject: lockdep: simplify check_prev_add_irq()

Remove the manual state iteration thingy.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 154 ++++++++++++++++++++++---------------------------------
 1 file changed, 61 insertions(+), 93 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 22ced8d4912..42dfc28d12c 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1268,68 +1268,84 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
 			bit_backwards, bit_forwards, irqclass);
 }
 
-static int
-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
-		struct held_lock *next)
+static const char *state_names[] = {
+#define LOCKDEP_STATE(__STATE) \
+	STR(__STATE),
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static const char *state_rnames[] = {
+#define LOCKDEP_STATE(__STATE) \
+	STR(__STATE)"-READ",
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline const char *state_name(enum lock_usage_bit bit)
 {
-	/*
-	 * Prove that the new dependency does not connect a hardirq-safe
-	 * lock with a hardirq-unsafe lock - to achieve this we search
-	 * the backwards-subgraph starting at <prev>, and the
-	 * forwards-subgraph starting at <next>:
-	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
-					LOCK_ENABLED_HARDIRQ, "hard"))
-		return 0;
+	return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
+}
 
+static int exclusive_bit(int new_bit)
+{
 	/*
-	 * Prove that the new dependency does not connect a hardirq-safe-read
-	 * lock with a hardirq-unsafe lock - to achieve this we search
-	 * the backwards-subgraph starting at <prev>, and the
-	 * forwards-subgraph starting at <next>:
+	 * USED_IN
+	 * USED_IN_READ
+	 * ENABLED
+	 * ENABLED_READ
+	 *
+	 * bit 0 - write/read
+	 * bit 1 - used_in/enabled
+	 * bit 2+  state
 	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
-					LOCK_ENABLED_HARDIRQ, "hard-read"))
-		return 0;
+
+	int state = new_bit & ~3;
+	int dir = new_bit & 2;
 
 	/*
-	 * Prove that the new dependency does not connect a softirq-safe
-	 * lock with a softirq-unsafe lock - to achieve this we search
-	 * the backwards-subgraph starting at <prev>, and the
-	 * forwards-subgraph starting at <next>:
+	 * keep state, bit flip the direction and strip read.
 	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
-					LOCK_ENABLED_SOFTIRQ, "soft"))
-		return 0;
+	return state | (dir ^ 2);
+}
+
+static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
+			   struct held_lock *next, enum lock_usage_bit bit)
+{
 	/*
-	 * Prove that the new dependency does not connect a softirq-safe-read
-	 * lock with a softirq-unsafe lock - to achieve this we search
+	 * Prove that the new dependency does not connect a hardirq-safe
+	 * lock with a hardirq-unsafe lock - to achieve this we search
 	 * the backwards-subgraph starting at <prev>, and the
 	 * forwards-subgraph starting at <next>:
 	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
-					LOCK_ENABLED_SOFTIRQ, "soft"))
+	if (!check_usage(curr, prev, next, bit,
+			   exclusive_bit(bit), state_name(bit)))
 		return 0;
 
+	bit++; /* _READ */
+
 	/*
-	 * Prove that the new dependency does not connect a reclaim-fs-safe
-	 * lock with a reclaim-fs-unsafe lock - to achieve this we search
+	 * Prove that the new dependency does not connect a hardirq-safe-read
+	 * lock with a hardirq-unsafe lock - to achieve this we search
 	 * the backwards-subgraph starting at <prev>, and the
 	 * forwards-subgraph starting at <next>:
 	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS,
-					LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
+	if (!check_usage(curr, prev, next, bit,
+			   exclusive_bit(bit), state_name(bit)))
 		return 0;
 
-	/*
-	 * Prove that the new dependency does not connect a reclaim-fs-safe-read
-	 * lock with a reclaim-fs-unsafe lock - to achieve this we search
-	 * the backwards-subgraph starting at <prev>, and the
-	 * forwards-subgraph starting at <next>:
-	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS_READ,
-					LOCK_ENABLED_RECLAIM_FS, "reclaim-fs-read"))
+	return 1;
+}
+
+static int
+check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
+		struct held_lock *next)
+{
+#define LOCKDEP_STATE(__STATE)						\
+	if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE))	\
 		return 0;
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
 
 	return 1;
 }
@@ -1984,30 +2000,6 @@ static int RECLAIM_FS_verbose(struct lock_class *class)
 
 #define STRICT_READ_CHECKS	1
 
-static const char *state_names[] = {
-#define LOCKDEP_STATE(__STATE) \
-	STR(__STATE),
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
-};
-
-static inline const char *state_name(enum lock_usage_bit bit)
-{
-	return state_names[bit >> 2];
-}
-
-static const char *state_rnames[] = {
-#define LOCKDEP_STATE(__STATE) \
-	STR(__STATE)"-READ",
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
-};
-
-static inline const char *state_rname(enum lock_usage_bit bit)
-{
-	return state_rnames[bit >> 2];
-}
-
 static int (*state_verbose_f[])(struct lock_class *class) = {
 #define LOCKDEP_STATE(__STATE) \
 	__STATE##_verbose,
@@ -2021,37 +2013,12 @@ static inline int state_verbose(enum lock_usage_bit bit,
 	return state_verbose_f[bit >> 2](class);
 }
 
-static int exclusive_bit(int new_bit)
-{
-	/*
-	 * USED_IN
-	 * USED_IN_READ
-	 * ENABLED
-	 * ENABLED_READ
-	 *
-	 * bit 0 - write/read
-	 * bit 1 - used_in/enabled
-	 * bit 2+  state
-	 */
-
-	int state = new_bit & ~3;
-	int dir = new_bit & 2;
-
-	/*
-	 * keep state, bit flip the direction and strip read.
-	 */
-	return state | (dir ^ 2);
-}
-
 typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
 			     enum lock_usage_bit bit, const char *name);
 
 static int
 mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 {
-	const char *name = state_name(new_bit);
-	const char *rname = state_rname(new_bit);
-
 	int excl_bit = exclusive_bit(new_bit);
 	int read = new_bit & 1;
 	int dir = new_bit & 2;
@@ -2078,7 +2045,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 	 * states.
 	 */
 	if ((!read || !dir || STRICT_READ_CHECKS) &&
-			!usage(curr, this, excl_bit, name))
+			!usage(curr, this, excl_bit, state_name(new_bit)))
 		return 0;
 
 	/*
@@ -2089,7 +2056,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 			return 0;
 
 		if (STRICT_READ_CHECKS &&
-				!usage(curr, this, excl_bit + 1, rname))
+			!usage(curr, this, excl_bit + 1,
+				state_name(new_bit + 1)))
 			return 0;
 	}
 
-- 
cgit v1.2.3


From b4b136f44b3b7adb9265fd5566d0ea9b99b1cd5f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 29 Jan 2009 14:50:36 +0100
Subject: lockdep: use stringify.h

Arnd pointed out we have the stringify macro magic already in-kernel.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42dfc28d12c..fc84a30483c 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -41,6 +41,7 @@
 #include <linux/utsname.h>
 #include <linux/hash.h>
 #include <linux/ftrace.h>
+#include <linux/stringify.h>
 
 #include <asm/sections.h>
 
@@ -445,14 +446,11 @@ atomic_t nr_find_usage_backwards_recursions;
  * Locking printouts:
  */
 
-#define __STR(foo)	#foo
-#define STR(foo)	__STR(foo)
-
 #define __USAGE(__STATE)						\
-	[LOCK_USED_IN_##__STATE] = "IN-"STR(__STATE)"-W",		\
-	[LOCK_ENABLED_##__STATE] = STR(__STATE)"-ON-W",			\
-	[LOCK_USED_IN_##__STATE##_READ] = "IN-"STR(__STATE)"-R",	\
-	[LOCK_ENABLED_##__STATE##_READ] = STR(__STATE)"-ON-R",
+	[LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W",	\
+	[LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W",		\
+	[LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\
+	[LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R",
 
 static const char *usage_str[] =
 {
@@ -1270,14 +1268,14 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
 
 static const char *state_names[] = {
 #define LOCKDEP_STATE(__STATE) \
-	STR(__STATE),
+	__stringify(__STATE),
 #include "lockdep_states.h"
 #undef LOCKDEP_STATE
 };
 
 static const char *state_rnames[] = {
 #define LOCKDEP_STATE(__STATE) \
-	STR(__STATE)"-READ",
+	__stringify(__STATE)"-READ",
 #include "lockdep_states.h"
 #undef LOCKDEP_STATE
 };
-- 
cgit v1.2.3


From 9833f8cb952b9aa3f98a71e7bef8820cee3261a0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 14 Feb 2009 16:59:04 +0100
Subject: lockstat: warn about disabled lock debugging

Avoid confusion and clearly state lock debugging got disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index b51064ce564..d7135aa2d2c 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -601,6 +601,10 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
 static void seq_header(struct seq_file *m)
 {
 	seq_printf(m, "lock_stat version 0.3\n");
+
+	if (unlikely(!debug_locks))
+		seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
+
 	seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
 	seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
 			"%14s %14s\n",
-- 
cgit v1.2.3


From 868a23a8043f2a3042dae60105c89bd4680187ba Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sun, 15 Feb 2009 00:25:21 +0100
Subject: lockdep: build fix for !PROVE_LOCKING

The __GFP_FS annotations fail to build with CONFIG_LOCKDEP=y,
CONFIG_PROVE_LOCKING=n, ammend that.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index fc84a30483c..022d2ed7fd8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2396,6 +2396,10 @@ static inline int separate_irq_context(struct task_struct *curr,
 	return 0;
 }
 
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+}
+
 #endif
 
 /*
-- 
cgit v1.2.3


From 8bad4597c2d71365adfa846ea1ca6cf99161a455 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 14 Feb 2009 21:46:54 -0500
Subject: ext4: Use unsigned int for blocksize in dx_make_map() and
 dx_pack_dirents()

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 83410244d3e..04824958cba 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
 				 struct dx_frame *frame,
 				 int *err);
 static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
 		       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
 		struct dx_map_entry *offsets, int count);
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block(struct dx_frame *frame,
 					u32 hash, ext4_lblk_t block);
 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -713,15 +713,15 @@ errout:
  * Create map of hash values, offsets, and sizes, stored at end of block.
  * Returns number of entries mapped.
  */
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
-			struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
+		       struct dx_hash_info *hinfo,
+		       struct dx_map_entry *map_tail)
 {
 	int count = 0;
 	char *base = (char *) de;
 	struct dx_hash_info h = *hinfo;
 
-	while ((char *) de < base + size)
-	{
+	while ((char *) de < base + blocksize) {
 		if (de->name_len && de->inode) {
 			ext4fs_dirhash(de->name, de->name_len, &h);
 			map_tail--;
@@ -1130,13 +1130,13 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
  * Compact each dir entry in the range to the minimal rec_len.
  * Returns pointer to last entry in range.
  */
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
 {
 	struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
 	unsigned rec_len = 0;
 
 	prev = to = de;
-	while ((char*)de < base + size) {
+	while ((char*)de < base + blocksize) {
 		next = ext4_next_entry(de);
 		if (de->inode && de->name_len) {
 			rec_len = EXT4_DIR_REC_LEN(de->name_len);
-- 
cgit v1.2.3


From 3d0518f4758eca4339e75e5b9dbb7e06a5ce08b4 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Sat, 14 Feb 2009 23:01:36 -0500
Subject: ext4: New rec_len encoding for very large blocksizes

The rec_len field in the directory entry is 16 bits, so to encode
blocksizes larger than 64k becomes problematic.  This patch allows us
to supprot block sizes up to 256k, by using the low 2 bits to extend
the range of rec_len to 2**18-1 (since valid rec_len sizes must be a
multiple of 4).  We use the convention that a rec_len of 0 or 65535
means the filesystem block size, for compatibility with older kernels.

It's unlikely we'll see VM pages of up to 256k, but at some point we
might find that the Linux VM has been enhanced to support filesystem
block sizes > than the VM page size, at which point it might be useful
for some applications to allow very large filesystem block sizes.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/dir.c   |  16 ++++---
 fs/ext4/ext4.h  |  21 ++-------
 fs/ext4/namei.c | 130 ++++++++++++++++++++++++++++++++++++--------------------
 3 files changed, 98 insertions(+), 69 deletions(-)

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01a..b64789929a6 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 			 unsigned int offset)
 {
 	const char *error_msg = NULL;
-	const int rlen = ext4_rec_len_from_disk(de->rec_len);
+	const int rlen = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
 
 	if (rlen < EXT4_DIR_REC_LEN(1))
 		error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
 				 * least that it is non-zero.  A
 				 * failure will be detected in the
 				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len)
-						< EXT4_DIR_REC_LEN(1))
+				if (ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
 					break;
-				i += ext4_rec_len_from_disk(de->rec_len);
+				i += ext4_rec_len_from_disk(de->rec_len,
+							    sb->s_blocksize);
 			}
 			offset = i;
 			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
 				ret = stored;
 				goto out;
 			}
-			offset += ext4_rec_len_from_disk(de->rec_len);
+			offset += ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
 				/* We might block in the next section
 				 * if the data destination is
@@ -225,7 +228,8 @@ revalidate:
 					goto revalidate;
 				stored++;
 			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+						sb->s_blocksize);
 		}
 		offset = 0;
 		brelse(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0db01421da3..d5193b55ca9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -855,24 +855,6 @@ struct ext4_dir_entry_2 {
 					 ~EXT4_DIR_ROUND)
 #define EXT4_MAX_REC_LEN		((1<<16)-1)
 
-static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
-{
-	unsigned len = le16_to_cpu(dlen);
-
-	if (len == EXT4_MAX_REC_LEN || len == 0)
-		return 1 << 16;
-	return len;
-}
-
-static inline __le16 ext4_rec_len_to_disk(unsigned len)
-{
-	if (len == (1 << 16))
-		return cpu_to_le16(EXT4_MAX_REC_LEN);
-	else if (len > (1 << 16))
-		BUG();
-	return cpu_to_le16(len);
-}
-
 /*
  * Hash Tree Directory indexing
  * (c) Daniel Phillips, 2001
@@ -1097,7 +1079,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+
 /* namei.c */
+extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
+extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 04824958cba..a5ba1a85809 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -165,7 +165,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
 		       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
-		struct dx_map_entry *offsets, int count);
+		struct dx_map_entry *offsets, int count, unsigned blocksize);
 static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block(struct dx_frame *frame,
 					u32 hash, ext4_lblk_t block);
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode);
 
+unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+	unsigned len = le16_to_cpu(dlen);
+
+	if (len == EXT4_MAX_REC_LEN || len == 0)
+		return blocksize;
+	return (len & 65532) | ((len & 3) << 16);
+}
+  
+__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+	if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+		BUG();
+	if (len < 65536)
+		return cpu_to_le16(len);
+	if (len == blocksize) {
+		if (blocksize == 65536)
+			return cpu_to_le16(EXT4_MAX_REC_LEN);
+		else 
+			return cpu_to_le16(0);
+	}
+	return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+}
+
 /*
  * p is at least 6 bytes before the end of page
  */
 static inline struct ext4_dir_entry_2 *
-ext4_next_entry(struct ext4_dir_entry_2 *p)
+ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
 {
 	return (struct ext4_dir_entry_2 *)((char *)p +
-		ext4_rec_len_from_disk(p->rec_len));
+		ext4_rec_len_from_disk(p->rec_len, blocksize));
 }
 
 /*
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
 			space += EXT4_DIR_REC_LEN(de->name_len);
 			names++;
 		}
-		de = ext4_next_entry(de);
+		de = ext4_next_entry(de, size);
 	}
 	printk("(%i)\n", names);
 	return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 	top = (struct ext4_dir_entry_2 *) ((char *) de +
 					   dir->i_sb->s_blocksize -
 					   EXT4_DIR_REC_LEN(0));
-	for (; de < top; de = ext4_next_entry(de)) {
+	for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
 		if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
 					(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
 						+((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	}
 	if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
 		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-		de = ext4_next_entry(de);
+		de = ext4_next_entry(de, dir->i_sb->s_blocksize);
 		if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
 			goto errout;
 		count++;
@@ -732,7 +756,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
 			cond_resched();
 		}
 		/* XXX: do we need to check rec_len == 0 case? -Chris */
-		de = ext4_next_entry(de);
+		de = ext4_next_entry(de, blocksize);
 	}
 	return count;
 }
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
 			return 1;
 		}
 		/* prevent looping on a bad block */
-		de_len = ext4_rec_len_from_disk(de->rec_len);
+		de_len = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
 		if (de_len <= 0)
 			return -1;
 		offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 		de = (struct ext4_dir_entry_2 *) bh->b_data;
 		top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
 				       EXT4_DIR_REC_LEN(0));
-		for (; de < top; de = ext4_next_entry(de)) {
+		for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
 			int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
 				  + ((char *) de - bh->b_data);
 
@@ -1109,7 +1134,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
  * Returns pointer to last entry moved.
  */
 static struct ext4_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+		unsigned blocksize)
 {
 	unsigned rec_len = 0;
 
@@ -1118,7 +1144,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
 		((struct ext4_dir_entry_2 *) to)->rec_len =
-				ext4_rec_len_to_disk(rec_len);
+				ext4_rec_len_to_disk(rec_len, blocksize);
 		de->inode = 0;
 		map++;
 		to += rec_len;
@@ -1137,12 +1163,12 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
 
 	prev = to = de;
 	while ((char*)de < base + blocksize) {
-		next = ext4_next_entry(de);
+		next = ext4_next_entry(de, blocksize);
 		if (de->inode && de->name_len) {
 			rec_len = EXT4_DIR_REC_LEN(de->name_len);
 			if (de > to)
 				memmove(to, de, rec_len);
-			to->rec_len = ext4_rec_len_to_disk(rec_len);
+			to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
 			prev = to;
 			to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
 		}
@@ -1215,10 +1241,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 					hash2, split, count-split));
 
 	/* Fancy dance to stay within two buffers */
-	de2 = dx_move_dirents(data1, data2, map + split, count - split);
+	de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
 	de = dx_pack_dirents(data1, blocksize);
-	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
-	de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
+	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+					   blocksize);
+	de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+					    blocksize);
 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
 
@@ -1268,6 +1296,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	const char	*name = dentry->d_name.name;
 	int		namelen = dentry->d_name.len;
 	unsigned int	offset = 0;
+	unsigned int	blocksize = dir->i_sb->s_blocksize;
 	unsigned short	reclen;
 	int		nlen, rlen, err;
 	char		*top;
@@ -1275,7 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	reclen = EXT4_DIR_REC_LEN(namelen);
 	if (!de) {
 		de = (struct ext4_dir_entry_2 *)bh->b_data;
-		top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+		top = bh->b_data + blocksize - reclen;
 		while ((char *) de <= top) {
 			if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
 						  bh, offset)) {
@@ -1287,7 +1316,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 				return -EEXIST;
 			}
 			nlen = EXT4_DIR_REC_LEN(de->name_len);
-			rlen = ext4_rec_len_from_disk(de->rec_len);
+			rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
 			if ((de->inode? rlen - nlen: rlen) >= reclen)
 				break;
 			de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1335,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 
 	/* By now the buffer is marked for journaling */
 	nlen = EXT4_DIR_REC_LEN(de->name_len);
-	rlen = ext4_rec_len_from_disk(de->rec_len);
+	rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
 	if (de->inode) {
 		struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
-		de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
-		de->rec_len = ext4_rec_len_to_disk(nlen);
+		de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
+		de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
 		de = de1;
 	}
 	de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1409,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	/* The 0th block becomes the root, move the dirents out */
 	fde = &root->dotdot;
 	de = (struct ext4_dir_entry_2 *)((char *)fde +
-		ext4_rec_len_from_disk(fde->rec_len));
+		ext4_rec_len_from_disk(fde->rec_len, blocksize));
 	if ((char *) de >= (((char *) root) + blocksize)) {
 		ext4_error(dir->i_sb, __func__,
 			   "invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1431,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	memcpy (data1, de, len);
 	de = (struct ext4_dir_entry_2 *) data1;
 	top = data1 + len;
-	while ((char *)(de2 = ext4_next_entry(de)) < top)
+	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
 		de = de2;
-	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
+	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+					   blocksize);
 	/* Initialize the root; the dot dirents already exist */
 	de = (struct ext4_dir_entry_2 *) (&root->dotdot);
-	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
+	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
+					   blocksize);
 	memset (&root->info, 0, sizeof(root->info));
 	root->info.info_length = sizeof(root->info);
 	root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1519,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		return retval;
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
-	de->rec_len = ext4_rec_len_to_disk(blocksize);
+	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
 	return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
 
@@ -1551,7 +1582,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			goto cleanup;
 		node2 = (struct dx_node *)(bh2->b_data);
 		entries2 = node2->entries;
-		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
+		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
+							   sb->s_blocksize);
 		node2->fake.inode = 0;
 		BUFFER_TRACE(frame->bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1671,7 @@ static int ext4_delete_entry(handle_t *handle,
 			     struct buffer_head *bh)
 {
 	struct ext4_dir_entry_2 *de, *pde;
+	unsigned int blocksize = dir->i_sb->s_blocksize;
 	int i;
 
 	i = 0;
@@ -1652,8 +1685,11 @@ static int ext4_delete_entry(handle_t *handle,
 			ext4_journal_get_write_access(handle, bh);
 			if (pde)
 				pde->rec_len = ext4_rec_len_to_disk(
-					ext4_rec_len_from_disk(pde->rec_len) +
-					ext4_rec_len_from_disk(de->rec_len));
+					ext4_rec_len_from_disk(pde->rec_len,
+							       blocksize) +
+					ext4_rec_len_from_disk(de->rec_len,
+							       blocksize),
+					blocksize);
 			else
 				de->inode = 0;
 			dir->i_version++;
@@ -1661,9 +1697,9 @@ static int ext4_delete_entry(handle_t *handle,
 			ext4_handle_dirty_metadata(handle, dir, bh);
 			return 0;
 		}
-		i += ext4_rec_len_from_disk(de->rec_len);
+		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
 		pde = de;
-		de = ext4_next_entry(de);
+		de = ext4_next_entry(de, blocksize);
 	}
 	return -ENOENT;
 }
@@ -1793,6 +1829,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct inode *inode;
 	struct buffer_head *dir_block;
 	struct ext4_dir_entry_2 *de;
+	unsigned int blocksize = dir->i_sb->s_blocksize;
 	int err, retries = 0;
 
 	if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1861,14 @@ retry:
 	de = (struct ext4_dir_entry_2 *) dir_block->b_data;
 	de->inode = cpu_to_le32(inode->i_ino);
 	de->name_len = 1;
-	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
+	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+					   blocksize);
 	strcpy(de->name, ".");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
-	de = ext4_next_entry(de);
+	de = ext4_next_entry(de, blocksize);
 	de->inode = cpu_to_le32(dir->i_ino);
-	de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
-						EXT4_DIR_REC_LEN(1));
+	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
+					   blocksize);
 	de->name_len = 2;
 	strcpy(de->name, "..");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1885,7 +1923,7 @@ static int empty_dir(struct inode *inode)
 		return 1;
 	}
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
-	de1 = ext4_next_entry(de);
+	de1 = ext4_next_entry(de, sb->s_blocksize);
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
 			!le32_to_cpu(de1->inode) ||
 			strcmp(".", de->name) ||
@@ -1896,9 +1934,9 @@ static int empty_dir(struct inode *inode)
 		brelse(bh);
 		return 1;
 	}
-	offset = ext4_rec_len_from_disk(de->rec_len) +
-		 ext4_rec_len_from_disk(de1->rec_len);
-	de = ext4_next_entry(de1);
+	offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
+		 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
+	de = ext4_next_entry(de1, sb->s_blocksize);
 	while (offset < inode->i_size) {
 		if (!bh ||
 			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1965,8 @@ static int empty_dir(struct inode *inode)
 			brelse(bh);
 			return 0;
 		}
-		offset += ext4_rec_len_from_disk(de->rec_len);
-		de = ext4_next_entry(de);
+		offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+		de = ext4_next_entry(de, sb->s_blocksize);
 	}
 	brelse(bh);
 	return 1;
@@ -2297,8 +2335,8 @@ retry:
 	return err;
 }
 
-#define PARENT_INO(buffer) \
-	(ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
+#define PARENT_INO(buffer, size) \
+	(ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
 
 /*
  * Anybody can rename anything with this: the permission checks are left to the
@@ -2358,7 +2396,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
 		if (!dir_bh)
 			goto end_rename;
-		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+		if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
+				old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
 			goto end_rename;
 		retval = -EMLINK;
 		if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2469,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (dir_bh) {
 		BUFFER_TRACE(dir_bh, "get_write_access");
 		ext4_journal_get_write_access(handle, dir_bh);
-		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+		PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
+						cpu_to_le32(new_dir->i_ino);
 		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
 		ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
 		ext4_dec_count(handle, old_dir);
-- 
cgit v1.2.3


From 6700ec65c207068a81a535e9dca616fefac21671 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 15 Feb 2009 21:18:17 +0100
Subject: lockdep: annotate reclaim context (__GFP_NOFS), fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: fix build warning

Fix:

  mm/vmscan.c: In function ‘kswapd’:
  mm/vmscan.c:1969: warning: ISO C90 forbids mixed declarations and code

node_to_cpumask_ptr(cpumask, pgdat->node_id), has a side-effect: it
defines the 'cpumask' local variable as well, so it has to go into
the variable definition section.

Sidenote: it might make sense to make this purpose of these macros
more apparent, by naming them the standard way, such as:

  DEFINE_node_to_cpumask_ptr(cpumask, pgdat->node_id);

(But that is outside the scope of this patch.)

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Mike Travis <travis@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 mm/vmscan.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 303eb658b50..cf844134527 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1963,11 +1963,10 @@ static int kswapd(void *p)
 	struct reclaim_state reclaim_state = {
 		.reclaimed_slab = 0,
 	};
+	node_to_cpumask_ptr(cpumask, pgdat->node_id);
 
 	lockdep_set_current_reclaim_state(GFP_KERNEL);
 
-	node_to_cpumask_ptr(cpumask, pgdat->node_id);
-
 	if (!cpumask_empty(cpumask))
 		set_cpus_allowed_ptr(tsk, cpumask);
 	current->reclaim_state = &reclaim_state;
-- 
cgit v1.2.3


From 705895b61133ef43d106fe6a6bbdb2eec923867e Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sun, 15 Feb 2009 18:07:52 -0500
Subject: ext4: allocate ->s_blockgroup_lock separately

As spotted by kmemtrace, struct ext4_sb_info is 17664 bytes on 64-bit
which makes it a very bad fit for SLAB allocators.  The culprit of the
wasted memory is ->s_blockgroup_lock which can be as big as 16 KB when
NR_CPUS >= 32.

To fix that, allocate ->s_blockgroup_lock, which fits nicely in a order 2
page in the worst case, separately.  This shinks down struct ext4_sb_info
enough to fit a 2 KB slab cache so now we allocate 16 KB + 2 KB instead of
32 KB saving 14 KB of memory.

Acked-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_sb.h |  4 ++--
 fs/ext4/super.c   | 10 +++++++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index e318f486cc2..4e4d9cc3f40 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,7 +62,7 @@ struct ext4_sb_info {
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
 	struct percpu_counter s_dirtyblocks_counter;
-	struct blockgroup_lock s_blockgroup_lock;
+	struct blockgroup_lock *s_blockgroup_lock;
 	struct proc_dir_entry *s_proc;
 
 	/* Journaling */
@@ -149,7 +149,7 @@ struct ext4_sb_info {
 static inline spinlock_t *
 sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
 {
-	return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
 }
 
 #endif	/* _EXT4_SB */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7371a6a923..a3768709ce0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -615,6 +615,7 @@ static void ext4_put_super(struct super_block *sb)
 		ext4_blkdev_remove(sbi);
 	}
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	return;
 }
@@ -2021,6 +2022,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
+	sbi->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+	if (!sbi->s_blockgroup_lock) {
+		kfree(sbi);
+		return -ENOMEM;
+	}
 	sb->s_fs_info = sbi;
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT4_DEF_RESUID;
@@ -2332,7 +2340,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				 &sbi->s_inode_readahead_blks);
 #endif
 
-	bgl_lock_init(&sbi->s_blockgroup_lock);
+	bgl_lock_init(sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logical_sb_block, i);
-- 
cgit v1.2.3


From 8fa43a81b97853fc69417bb6054182e78f95cbeb Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Sun, 15 Feb 2009 18:57:26 -0500
Subject: ext4: don't inherit inappropriate inode flags from parent

At present INDEX and EXTENTS are the only flags that new ext4 inodes do
NOT inherit from their parent.  In addition prevent the flags DIRTY,
ECOMPR, IMAGIC, TOPDIR, HUGE_FILE and EXT_MIGRATE from being inherited.
List inheritable flags explicitly to prevent future flags from
accidentally being inherited.

This fixes the TOPDIR flag inheritance bug reported at
http://bugzilla.kernel.org/show_bug.cgi?id=9866.

Signed-off-by: Duane Griffin <duaneg@dghda.com>
Acked-by: Andreas Dilger <adilger@sun.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h   | 7 +++++++
 fs/ext4/ialloc.c | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d5193b55ca9..d4700d101ea 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -239,6 +239,13 @@ struct flex_groups {
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE		0x000B80FF /* User modifiable flags */
 
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+			   EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
+			   EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
 /*
  * Inode dynamic state flags
  */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb51b40e3e8..1ff3df086e5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -889,7 +889,7 @@ got:
 	 * newly created directory and file only if -o extent mount option is
 	 * specified
 	 */
-	ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL);
+	ei->i_flags = EXT4_I(dir)->i_flags & EXT4_FL_INHERITED;
 	if (S_ISLNK(mode))
 		ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
 	/* dirsync only applies to directories */
-- 
cgit v1.2.3


From 2dc6b0d48ca0599837df21b14bb8393d0804af57 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Sun, 15 Feb 2009 18:09:20 -0500
Subject: ext4: tighten restrictions on inode flags

At the moment there are few restrictions on which flags may be set on
which inodes.  Specifically DIRSYNC may only be set on directories and
IMMUTABLE and APPEND may not be set on links.  Tighten that to disallow
TOPDIR being set on non-directories and only NODUMP and NOATIME to be set
on non-regular file, non-directories.

Introduces a flags masking function which masks flags based on mode and
use it during inode creation and when flags are set via the ioctl to
facilitate future consistency.

Signed-off-by: Duane Griffin <duaneg@dghda.com>
Acked-by: Andreas Dilger <adilger@sun.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h   | 17 +++++++++++++++++
 fs/ext4/ialloc.c | 14 +++++---------
 fs/ext4/ioctl.c  |  3 +--
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d4700d101ea..2cbfc0b04d3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -246,6 +246,23 @@ struct flex_groups {
 			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
 			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
 
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & EXT4_REG_FLMASK;
+	else
+		return flags & EXT4_OTHER_FLMASK;
+}
+
 /*
  * Inode dynamic state flags
  */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1ff3df086e5..ae3eb57dccd 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -885,16 +885,12 @@ got:
 	ei->i_disksize = 0;
 
 	/*
-	 * Don't inherit extent flag from directory. We set extent flag on
-	 * newly created directory and file only if -o extent mount option is
-	 * specified
+	 * Don't inherit extent flag from directory, amongst others. We set
+	 * extent flag on newly created directory and file only if -o extent
+	 * mount option is specified
 	 */
-	ei->i_flags = EXT4_I(dir)->i_flags & EXT4_FL_INHERITED;
-	if (S_ISLNK(mode))
-		ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
-	/* dirsync only applies to directories */
-	if (!S_ISDIR(mode))
-		ei->i_flags &= ~EXT4_DIRSYNC_FL;
+	ei->i_flags =
+		ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
 	ei->i_file_acl = 0;
 	ei->i_dtime = 0;
 	ei->i_block_group = group;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 42dc83fb247..22dd29f3ebc 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (err)
 			return err;
 
-		if (!S_ISDIR(inode->i_mode))
-			flags &= ~EXT4_DIRSYNC_FL;
+		flags = ext4_mask_flags(inode->i_mode, flags);
 
 		err = -EPERM;
 		mutex_lock(&inode->i_mutex);
-- 
cgit v1.2.3


From a4912123b688e057084e6557cef8924f7ae5bbde Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 12 Mar 2009 12:18:34 -0400
Subject: ext4: New inode/block allocation algorithms for flex_bg filesystems

The find_group_flex() inode allocator is now only used if the
filesystem is mounted using the "oldalloc" mount option.  It is
replaced with the original Orlov allocator that has been updated for
flex_bg filesystems (it should behave the same way if flex_bg is
disabled).  The inode allocator now functions by taking into account
each flex_bg group, instead of each block group, when deciding whether
or not it's time to allocate a new directory into a fresh flex_bg.

The block allocator has also been changed so that the first block
group in each flex_bg is preferred for use for storing directory
blocks.  This keeps directory blocks close together, which is good for
speeding up e2fsck since large directories are more likely to look
like this:

debugfs:  stat /home/tytso/Maildir/cur
Inode: 1844562   Type: directory    Mode:  0700   Flags: 0x81000
Generation: 1132745781    Version: 0x00000000:0000ad71
User: 15806   Group: 15806   Size: 1060864
File ACL: 0    Directory ACL: 0
Links: 2   Blockcount: 2072
Fragment:  Address: 0    Number: 0    Size: 0
 ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009
 atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009
 mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009
crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009
Size of extra inode fields: 28
BLOCKS:
(0):7348651, (1-258):7348654-7348911
TOTAL: 259

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |   6 ++
 fs/ext4/ext4_i.h  |   3 +
 fs/ext4/extents.c |  25 ++++++-
 fs/ext4/ialloc.c  | 215 ++++++++++++++++++++++++++++++++++++++++--------------
 fs/ext4/inode.c   |  18 ++++-
 fs/ext4/mballoc.c |   7 ++
 6 files changed, 216 insertions(+), 58 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2cbfc0b04d3..096456c8559 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -827,6 +827,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEF_MIN_BATCH_TIME	0
 #define EXT4_DEF_MAX_BATCH_TIME	15000 /* 15ms */
 
+/*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME	4
+
 /*
  * Structure of a directory entry
  */
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 2d516c0a22a..4ce2187123a 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -122,6 +122,9 @@ struct ext4_inode_info {
 	struct list_head i_prealloc_list;
 	spinlock_t i_prealloc_lock;
 
+	/* ialloc */
+	ext4_group_t	i_last_alloc_group;
+
 	/* allocation reservation info for delalloc */
 	unsigned int i_reserved_data_blocks;
 	unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e0aa4fe4f59..aa3431856c9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
+	ext4_group_t block_group;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 	int depth;
 
 	if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	}
 
 	/* OK. use inode's group */
-	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		/*
+		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+		 * block groups per flexgroup, reserve the first block 
+		 * group for directories and special files.  Regular 
+		 * files will start at the second block group.  This
+		 * tends to speed up directory access and improves 
+		 * fsck times.
+		 */
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
 		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+	/*
+	 * If we are doing delayed allocation, we don't need take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index ae3eb57dccd..617f5a2d800 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -410,6 +410,43 @@ out:
 	return 0;
 }
 
+struct orlov_stats {
+	__u32 free_inodes;
+	__u32 free_blocks;
+	__u32 used_dirs;
+};
+
+/*
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg.  If flex_size is 1, then g
+ * is a block group number; otherwise it is flex_bg number.
+ */
+void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+		       int flex_size, struct orlov_stats *stats)
+{
+	struct ext4_group_desc *desc;
+	ext4_group_t		ngroups = EXT4_SB(sb)->s_groups_count;
+	int			i;
+
+	stats->free_inodes = 0;
+	stats->free_blocks = 0;
+	stats->used_dirs = 0;
+
+	g *= flex_size;
+
+	for (i = 0; i < flex_size; i++) {
+		if (g >= ngroups)
+			break;
+		desc = ext4_get_group_desc(sb, g++, NULL);
+		if (!desc)
+			continue;
+
+		stats->free_inodes += ext4_free_inodes_count(sb, desc);
+		stats->free_blocks += ext4_free_blks_count(sb, desc);
+		stats->used_dirs += ext4_used_dirs_count(sb, desc);
+	}
+}
+
 /*
  * Orlov's allocator for directories.
  *
@@ -425,35 +462,34 @@ out:
  * it has too many directories already (max_dirs) or
  * it has too few free inodes left (min_inodes) or
  * it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
  * Parent's group is preferred, if it doesn't satisfy these
  * conditions we search cyclically through the rest. If none
  * of the groups look good we just look for a group with more
  * free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
  */
 
-#define INODE_COST 64
-#define BLOCK_COST 256
-
 static int find_group_orlov(struct super_block *sb, struct inode *parent,
-				ext4_group_t *group)
+			    ext4_group_t *group, int mode)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_super_block *es = sbi->s_es;
 	ext4_group_t ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
 	unsigned int freei, avefreei;
 	ext4_fsblk_t freeb, avefreeb;
-	ext4_fsblk_t blocks_per_dir;
 	unsigned int ndirs;
-	int max_debt, max_dirs, min_inodes;
+	int max_dirs, min_inodes;
 	ext4_grpblk_t min_blocks;
-	ext4_group_t i;
+	ext4_group_t i, grp, g;
 	struct ext4_group_desc *desc;
+	struct orlov_stats stats;
+	int flex_size = ext4_flex_bg_size(sbi);
+
+	if (flex_size > 1) {
+		ngroups = (ngroups + flex_size - 1) >>
+			sbi->s_log_groups_per_flex;
+		parent_group >>= sbi->s_log_groups_per_flex;
+	}
 
 	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
 	avefreei = freei / ngroups;
@@ -462,71 +498,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	do_div(avefreeb, ngroups);
 	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
 
-	if ((parent == sb->s_root->d_inode) ||
-	    (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+	if (S_ISDIR(mode) &&
+	    ((parent == sb->s_root->d_inode) ||
+	     (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
 		int best_ndir = inodes_per_group;
-		ext4_group_t grp;
 		int ret = -1;
 
 		get_random_bytes(&grp, sizeof(grp));
 		parent_group = (unsigned)grp % ngroups;
 		for (i = 0; i < ngroups; i++) {
-			grp = (parent_group + i) % ngroups;
-			desc = ext4_get_group_desc(sb, grp, NULL);
-			if (!desc || !ext4_free_inodes_count(sb, desc))
+			g = (parent_group + i) % ngroups;
+			get_orlov_stats(sb, g, flex_size, &stats);
+			if (!stats.free_inodes)
 				continue;
-			if (ext4_used_dirs_count(sb, desc) >= best_ndir)
+			if (stats.used_dirs >= best_ndir)
 				continue;
-			if (ext4_free_inodes_count(sb, desc) < avefreei)
+			if (stats.free_inodes < avefreei)
 				continue;
-			if (ext4_free_blks_count(sb, desc) < avefreeb)
+			if (stats.free_blocks < avefreeb)
 				continue;
-			*group = grp;
+			grp = g;
 			ret = 0;
-			best_ndir = ext4_used_dirs_count(sb, desc);
+			best_ndir = stats.used_dirs;
+		}
+		if (ret)
+			goto fallback;
+	found_flex_bg:
+		if (flex_size == 1) {
+			*group = grp;
+			return 0;
+		}
+
+		/*
+		 * We pack inodes at the beginning of the flexgroup's
+		 * inode tables.  Block allocation decisions will do
+		 * something similar, although regular files will
+		 * start at 2nd block group of the flexgroup.  See
+		 * ext4_ext_find_goal() and ext4_find_near().
+		 */
+		grp *= flex_size;
+		for (i = 0; i < flex_size; i++) {
+			if (grp+i >= sbi->s_groups_count)
+				break;
+			desc = ext4_get_group_desc(sb, grp+i, NULL);
+			if (desc && ext4_free_inodes_count(sb, desc)) {
+				*group = grp+i;
+				return 0;
+			}
 		}
-		if (ret == 0)
-			return ret;
 		goto fallback;
 	}
 
-	blocks_per_dir = ext4_blocks_count(es) - freeb;
-	do_div(blocks_per_dir, ndirs);
-
 	max_dirs = ndirs / ngroups + inodes_per_group / 16;
-	min_inodes = avefreei - inodes_per_group / 4;
-	min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
-
-	max_debt = EXT4_BLOCKS_PER_GROUP(sb);
-	max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
-	if (max_debt * INODE_COST > inodes_per_group)
-		max_debt = inodes_per_group / INODE_COST;
-	if (max_debt > 255)
-		max_debt = 255;
-	if (max_debt == 0)
-		max_debt = 1;
+	min_inodes = avefreei - inodes_per_group*flex_size / 4;
+	if (min_inodes < 1)
+		min_inodes = 1;
+	min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+
+	/*
+	 * Start looking in the flex group where we last allocated an
+	 * inode for this parent directory
+	 */
+	if (EXT4_I(parent)->i_last_alloc_group != ~0) {
+		parent_group = EXT4_I(parent)->i_last_alloc_group;
+		if (flex_size > 1)
+			parent_group >>= sbi->s_log_groups_per_flex;
+	}
 
 	for (i = 0; i < ngroups; i++) {
-		*group = (parent_group + i) % ngroups;
-		desc = ext4_get_group_desc(sb, *group, NULL);
-		if (!desc || !ext4_free_inodes_count(sb, desc))
-			continue;
-		if (ext4_used_dirs_count(sb, desc) >= max_dirs)
+		grp = (parent_group + i) % ngroups;
+		get_orlov_stats(sb, grp, flex_size, &stats);
+		if (stats.used_dirs >= max_dirs)
 			continue;
-		if (ext4_free_inodes_count(sb, desc) < min_inodes)
+		if (stats.free_inodes < min_inodes)
 			continue;
-		if (ext4_free_blks_count(sb, desc) < min_blocks)
+		if (stats.free_blocks < min_blocks)
 			continue;
-		return 0;
+		goto found_flex_bg;
 	}
 
 fallback:
+	ngroups = sbi->s_groups_count;
+	avefreei = freei / ngroups;
+	parent_group = EXT4_I(parent)->i_block_group;
 	for (i = 0; i < ngroups; i++) {
-		*group = (parent_group + i) % ngroups;
-		desc = ext4_get_group_desc(sb, *group, NULL);
+		grp = (parent_group + i) % ngroups;
+		desc = ext4_get_group_desc(sb, grp, NULL);
 		if (desc && ext4_free_inodes_count(sb, desc) &&
-			ext4_free_inodes_count(sb, desc) >= avefreei)
+		    ext4_free_inodes_count(sb, desc) >= avefreei) {
+			*group = grp;
 			return 0;
+		}
 	}
 
 	if (avefreei) {
@@ -542,12 +604,51 @@ fallback:
 }
 
 static int find_group_other(struct super_block *sb, struct inode *parent,
-				ext4_group_t *group)
+			    ext4_group_t *group, int mode)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
 	struct ext4_group_desc *desc;
-	ext4_group_t i;
+	ext4_group_t i, last;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+	/*
+	 * Try to place the inode is the same flex group as its
+	 * parent.  If we can't find space, use the Orlov algorithm to
+	 * find another flex group, and store that information in the
+	 * parent directory's inode information so that use that flex
+	 * group for future allocations.
+	 */
+	if (flex_size > 1) {
+		int retry = 0;
+
+	try_again:
+		parent_group &= ~(flex_size-1);
+		last = parent_group + flex_size;
+		if (last > ngroups)
+			last = ngroups;
+		for  (i = parent_group; i < last; i++) {
+			desc = ext4_get_group_desc(sb, i, NULL);
+			if (desc && ext4_free_inodes_count(sb, desc)) {
+				*group = i;
+				return 0;
+			}
+		}
+		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+			retry = 1;
+			parent_group = EXT4_I(parent)->i_last_alloc_group;
+			goto try_again;
+		}
+		/*
+		 * If this didn't work, use the Orlov search algorithm
+		 * to find a new flex group; we pass in the mode to
+		 * avoid the topdir algorithms.
+		 */
+		*group = parent_group + flex_size;
+		if (*group > ngroups)
+			*group = 0;
+		return find_group_orlov(sb, parent, group, mode);
+	}
 
 	/*
 	 * Try to place the inode in its parent directory
@@ -716,10 +817,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 	sbi = EXT4_SB(sb);
 	es = sbi->s_es;
 
-	if (sbi->s_log_groups_per_flex) {
+	if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
 		ret2 = find_group_flex(sb, dir, &group);
 		if (ret2 == -1) {
-			ret2 = find_group_other(sb, dir, &group);
+			ret2 = find_group_other(sb, dir, &group, mode);
 			if (ret2 == 0 && once)
 				once = 0;
 				printk(KERN_NOTICE "ext4: find_group_flex "
@@ -733,11 +834,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		if (test_opt(sb, OLDALLOC))
 			ret2 = find_group_dir(sb, dir, &group);
 		else
-			ret2 = find_group_orlov(sb, dir, &group);
+			ret2 = find_group_orlov(sb, dir, &group, mode);
 	} else
-		ret2 = find_group_other(sb, dir, &group);
+		ret2 = find_group_other(sb, dir, &group, mode);
 
 got_group:
+	EXT4_I(dir)->i_last_alloc_group = group;
 	err = -ENOSPC;
 	if (ret2 == -1)
 		goto out;
@@ -894,6 +996,7 @@ got:
 	ei->i_file_acl = 0;
 	ei->i_dtime = 0;
 	ei->i_block_group = group;
+	ei->i_last_alloc_group = ~0;
 
 	ext4_set_inode_flags(inode);
 	if (IS_DIRSYNC(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71d3ecd5db7..25811507d2b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -459,6 +459,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
+	ext4_group_t block_group;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 
 	/* Try to find previous block */
 	for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +476,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 	 * It is going to be referred to from the inode itself? OK, just put it
 	 * into the same cylinder group then.
 	 */
-	bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+	/*
+	 * If we are doing delayed allocation, we don't need take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -4287,6 +4302,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ei->i_disksize = inode->i_size;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
 	ei->i_block_group = iloc.block_group;
+	ei->i_last_alloc_group = ~0;
 	/*
 	 * NOTE! The in-memory inode i_data array is in little-endian order
 	 * even on big-endian machines: we do NOT byteswap the block numbers!
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b038188bd03..b0d6022eaa6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1726,6 +1726,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 {
 	unsigned free, fragments;
 	unsigned i, bits;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
 	struct ext4_group_desc *desc;
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
@@ -1747,6 +1748,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
 			return 0;
 
+		/* Avoid using the first bg of a flexgroup for data files */
+		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+		    ((group % flex_size) == 0))
+			return 0;
+
 		bits = ac->ac_sb->s_blocksize_bits + 1;
 		for (i = ac->ac_2order; i <= bits; i++)
 			if (grp->bb_counters[i] > 0)
-- 
cgit v1.2.3


From e6f009b0b45220c004672d41a58865e94946104d Mon Sep 17 00:00:00 2001
From: Bryan Donlan <bdonlan@gmail.com>
Date: Sun, 22 Feb 2009 21:20:25 -0500
Subject: ext4: return -EIO not -ESTALE on directory traversal through deleted
 inode

ext4_iget() returns -ESTALE if invoked on a deleted inode, in order to
report errors to NFS properly.  However, in ext4_lookup(), this
-ESTALE can be propagated to userspace if the filesystem is corrupted
such that a directory entry references a deleted inode.  This leads to
a misleading error message - "Stale NFS file handle" - and confusion
on the part of the admin.

The bug can be easily reproduced by creating a new filesystem, making
a link to an unused inode using debugfs, then mounting and attempting
to ls -l said link.

This patch thus changes ext4_lookup to return -EIO if it receives
-ESTALE from ext4_iget(), as ext4 does for other filesystem metadata
corruption; and also invokes the appropriate ext*_error functions when
this case is detected.

Signed-off-by: Bryan Donlan <bdonlan@gmail.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a5ba1a85809..6e1ad68cdc7 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1077,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 			return ERR_PTR(-EIO);
 		}
 		inode = ext4_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+		if (unlikely(IS_ERR(inode))) {
+			if (PTR_ERR(inode) == -ESTALE) {
+				ext4_error(dir->i_sb, __func__,
+						"deleted inode referenced: %u",
+						ino);
+				return ERR_PTR(-EIO);
+			} else {
+				return ERR_CAST(inode);
+			}
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
-- 
cgit v1.2.3


From 56b19868aca856a7d7bf20c3a7a1030e4fd75b2b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 12 Mar 2009 09:51:20 -0400
Subject: ext4: Add checks to validate extent entries.

This patch adds checks to validate the extent entries along with extent
headers, to avoid crashes caused by corrupt filesystems.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 71 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index aa3431856c9..ee40b7c52c8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -324,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 	return max;
 }
 
-static int __ext4_ext_check_header(const char *function, struct inode *inode,
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+	ext4_fsblk_t block = ext_pblock(ext);
+	int len = ext4_ext_get_actual_len(ext);
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+			((block + len) > ext4_blocks_count(es))))
+		return 0;
+	else
+		return 1;
+}
+
+static int ext4_valid_extent_idx(struct inode *inode,
+				struct ext4_extent_idx *ext_idx)
+{
+	ext4_fsblk_t block = idx_pblock(ext_idx);
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+			(block > ext4_blocks_count(es))))
+		return 0;
+	else
+		return 1;
+}
+
+static int ext4_valid_extent_entries(struct inode *inode,
+				struct ext4_extent_header *eh,
+				int depth)
+{
+	struct ext4_extent *ext;
+	struct ext4_extent_idx *ext_idx;
+	unsigned short entries;
+	if (eh->eh_entries == 0)
+		return 1;
+
+	entries = le16_to_cpu(eh->eh_entries);
+
+	if (depth == 0) {
+		/* leaf entries */
+		ext = EXT_FIRST_EXTENT(eh);
+		while (entries) {
+			if (!ext4_valid_extent(inode, ext))
+				return 0;
+			ext++;
+			entries--;
+		}
+	} else {
+		ext_idx = EXT_FIRST_INDEX(eh);
+		while (entries) {
+			if (!ext4_valid_extent_idx(inode, ext_idx))
+				return 0;
+			ext_idx++;
+			entries--;
+		}
+	}
+	return 1;
+}
+
+static int __ext4_ext_check(const char *function, struct inode *inode,
 					struct ext4_extent_header *eh,
 					int depth)
 {
@@ -352,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
 		error_msg = "invalid eh_entries";
 		goto corrupted;
 	}
+	if (!ext4_valid_extent_entries(inode, eh, depth)) {
+		error_msg = "invalid extent entries";
+		goto corrupted;
+	}
 	return 0;
 
 corrupted:
 	ext4_error(inode->i_sb, function,
-			"bad header in inode #%lu: %s - magic %x, "
+			"bad header/extent in inode #%lu: %s - magic %x, "
 			"entries %u, max %u(%u), depth %u(%u)",
 			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
 			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -365,8 +426,8 @@ corrupted:
 	return -EIO;
 }
 
-#define ext4_ext_check_header(inode, eh, depth)	\
-	__ext4_ext_check_header(__func__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth)	\
+	__ext4_ext_check(__func__, inode, eh, depth)
 
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -570,7 +631,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 
 	eh = ext_inode_hdr(inode);
 	depth = ext_depth(inode);
-	if (ext4_ext_check_header(inode, eh, depth))
+	if (ext4_ext_check(inode, eh, depth))
 		return ERR_PTR(-EIO);
 
 
@@ -607,7 +668,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		path[ppos].p_hdr = eh;
 		i--;
 
-		if (ext4_ext_check_header(inode, eh, i))
+		if (ext4_ext_check(inode, eh, i))
 			goto err;
 	}
 
@@ -1204,7 +1265,7 @@ got_index:
 			return -EIO;
 		eh = ext_block_hdr(bh);
 		/* subtract from p_depth to get proper eh_depth */
-		if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
 			put_bh(bh);
 			return -EIO;
 		}
@@ -1217,7 +1278,7 @@ got_index:
 	if (bh == NULL)
 		return -EIO;
 	eh = ext_block_hdr(bh);
-	if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
 		put_bh(bh);
 		return -EIO;
 	}
@@ -2160,7 +2221,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 		return -ENOMEM;
 	}
 	path[0].p_hdr = ext_inode_hdr(inode);
-	if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
+	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
 		err = -EIO;
 		goto out;
 	}
@@ -2214,7 +2275,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 				err = -EIO;
 				break;
 			}
-			if (ext4_ext_check_header(inode, ext_block_hdr(bh),
+			if (ext4_ext_check(inode, ext_block_hdr(bh),
 							depth - i - 1)) {
 				err = -EIO;
 				break;
-- 
cgit v1.2.3


From 7a262f7c69163cd4811f2f838faef5c5b18439c9 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 27 Mar 2009 16:39:58 -0400
Subject: ext4: Validate extent details only when read from the disk

Make sure we validate extent details only when read from the disk.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Thiemo Nagel <thiemo.nagel@ph.tum.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_extents.h |  1 +
 fs/ext4/extents.c      | 25 ++++++++++++++++++-------
 fs/ext4/inode.c        | 10 ++++++++++
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbb..f0c3ec85bd4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
 extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
 						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ee40b7c52c8..ac77d8b8251 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -429,6 +429,11 @@ corrupted:
 #define ext4_ext_check(inode, eh, depth)	\
 	__ext4_ext_check(__func__, inode, eh, depth)
 
+int ext4_ext_check_inode(struct inode *inode)
+{
+	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+}
+
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 {
@@ -631,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 
 	eh = ext_inode_hdr(inode);
 	depth = ext_depth(inode);
-	if (ext4_ext_check(inode, eh, depth))
-		return ERR_PTR(-EIO);
-
 
 	/* account possible depth increase */
 	if (!path) {
@@ -649,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	i = depth;
 	/* walk through the tree */
 	while (i) {
+		int need_to_validate = 0;
+
 		ext_debug("depth %d: num %d, max %d\n",
 			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 
@@ -657,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		path[ppos].p_depth = i;
 		path[ppos].p_ext = NULL;
 
-		bh = sb_bread(inode->i_sb, path[ppos].p_block);
-		if (!bh)
+		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
+		if (unlikely(!bh))
 			goto err;
-
+		if (!bh_uptodate_or_lock(bh)) {
+			if (bh_submit_read(bh) < 0) {
+				put_bh(bh);
+				goto err;
+			}
+			/* validate the extent entries */
+			need_to_validate = 1;
+		}
 		eh = ext_block_hdr(bh);
 		ppos++;
 		BUG_ON(ppos > depth);
@@ -668,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		path[ppos].p_hdr = eh;
 		i--;
 
-		if (ext4_ext_check(inode, eh, i))
+		if (need_to_validate && ext4_ext_check(inode, eh, i))
 			goto err;
 	}
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 25811507d2b..cd0399db0ef 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4345,6 +4345,16 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
 	}
 
+	if (ei->i_flags & EXT4_EXTENTS_FL) {
+		/* Validate extent which is part of inode */
+		ret = ext4_ext_check_inode(inode);
+		if (ret) {
+			brelse(bh);
+			goto bad_inode;
+		}
+
+	}
+
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
-- 
cgit v1.2.3


From 722bde6875bfb49a0c84e5601eb82dd7ac02d27c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 23 Feb 2009 00:51:57 -0500
Subject: ext4: Add fine print for the 32000 subdirectory limit

Some poeple are reading the ext4 feature list too literally and create
dubious test cases involving very long filenames and 1k blocksize and
then complain when they run into an htree-imposed limit.  So add fine
print to the "fix 32000 subdirectory limit" ext4 feature.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 Documentation/filesystems/ext4.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index cec829bc729..5c484aec2ba 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be
 * extent format more robust in face of on-disk corruption due to magics,
 * internal redundancy in tree
 * improved file allocation (multi-block alloc)
-* fix 32000 subdirectory limit
+* lift 32000 subdirectory limit imposed by i_links_count[1]
 * nsec timestamps for mtime, atime, ctime, create time
 * inode version field on disk (NFSv4, Lustre)
 * reduced e2fsck time via uninit_bg feature
@@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be
 * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
   the ordering)
 
+[1] Filesystems with a block size of 1k may see a limit imposed by the
+directory hash tree having a maximum depth of two.
+
 2.2 Candidate features for future inclusion
 
 * Online defrag (patches available but not well tested)
-- 
cgit v1.2.3


From ed5bde0bf8995d7d8c0b5a9c33e624a945f333ef Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 23 Feb 2009 10:48:07 -0500
Subject: ext4: Simplify delalloc implementation by removing mpd.get_block

This parameter was always set to ext4_da_get_block_write().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 125 ++++++++++++++++++++++++++------------------------------
 1 file changed, 58 insertions(+), 67 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cd0399db0ef..924ba8afc22 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1705,7 +1705,6 @@ struct mpage_da_data {
 	struct inode *inode;
 	struct buffer_head lbh;			/* extent of blocks */
 	unsigned long first_page, next_page;	/* extent of pages */
-	get_block_t *get_block;
 	struct writeback_control *wbc;
 	int io_done;
 	int pages_written;
@@ -1719,7 +1718,6 @@ struct mpage_da_data {
  * @mpd->inode: inode
  * @mpd->first_page: first page of the extent
  * @mpd->next_page: page after the last page of the extent
- * @mpd->get_block: the filesystem's block mapper function
  *
  * By the time mpage_da_submit_io() is called we expect all blocks
  * to be allocated. this may be wrong if allocation failed.
@@ -1929,16 +1927,60 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
+#define		EXT4_DELALLOC_RSVED	1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int ret;
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	loff_t disksize = EXT4_I(inode)->i_disksize;
+	handle_t *handle = NULL;
+
+	handle = ext4_journal_current_handle();
+	BUG_ON(!handle);
+	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+				   bh_result, create, 0, EXT4_DELALLOC_RSVED);
+	if (ret <= 0)
+		return ret;
+
+	bh_result->b_size = (ret << inode->i_blkbits);
+
+	if (ext4_should_order_data(inode)) {
+		int retval;
+		retval = ext4_jbd2_file_inode(handle, inode);
+		if (retval)
+			/*
+			 * Failed to add inode for ordered mode. Don't
+			 * update file size
+			 */
+			return retval;
+	}
+
+	/*
+	 * Update on-disk size along with block allocation we don't
+	 * use 'extend_disksize' as size may change within already
+	 * allocated block -bzzz
+	 */
+	disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+	if (disksize > i_size_read(inode))
+		disksize = i_size_read(inode);
+	if (disksize > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, disksize);
+		ret = ext4_mark_inode_dirty(handle, inode);
+		return ret;
+	}
+	return 0;
+}
+
 /*
  * mpage_da_map_blocks - go through given space
  *
  * @mpd->lbh - bh describing space
- * @mpd->get_block - the filesystem's block mapper function
  *
  * The function skips space we know is already mapped to disk blocks.
  *
  */
-static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
 	int err = 0;
 	struct buffer_head new;
@@ -1960,30 +2002,29 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 */
 	if (!new.b_size)
 		return 0;
-	err = mpd->get_block(mpd->inode, next, &new, 1);
-	if (err) {
 
-		/* If get block returns with error
-		 * we simply return. Later writepage
-		 * will redirty the page and writepages
-		 * will find the dirty page again
+	err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
+	if (err) {
+		/*
+		 * If get block returns with error we simply
+		 * return. Later writepage will redirty the page and
+		 * writepages will find the dirty page again
 		 */
 		if (err == -EAGAIN)
 			return 0;
 
 		if (err == -ENOSPC &&
-				ext4_count_free_blocks(mpd->inode->i_sb)) {
+		    ext4_count_free_blocks(mpd->inode->i_sb)) {
 			mpd->retval = err;
 			return 0;
 		}
 
 		/*
-		 * get block failure will cause us
-		 * to loop in writepages. Because
-		 * a_ops->writepage won't be able to
-		 * make progress. The page will be redirtied
-		 * by writepage and writepages will again
-		 * try to write the same.
+		 * get block failure will cause us to loop in
+		 * writepages, because a_ops->writepage won't be able
+		 * to make progress. The page will be redirtied by
+		 * writepage and writepages will again try to write
+		 * the same.
 		 */
 		printk(KERN_EMERG "%s block allocation failed for inode %lu "
 				  "at logical offset %llu with max blocks "
@@ -2212,7 +2253,6 @@ static int __mpage_da_writepage(struct page *page,
  *
  * @mapping: address space structure to write
  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @get_block: the filesystem's block mapper function.
  *
  * This is a library function, which implements the writepages()
  * address_space_operation.
@@ -2223,9 +2263,6 @@ static int mpage_da_writepages(struct address_space *mapping,
 {
 	int ret;
 
-	if (!mpd->get_block)
-		return generic_writepages(mapping, wbc);
-
 	mpd->lbh.b_size = 0;
 	mpd->lbh.b_state = 0;
 	mpd->lbh.b_blocknr = 0;
@@ -2289,51 +2326,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 
 	return ret;
 }
-#define		EXT4_DELALLOC_RSVED	1
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create)
-{
-	int ret;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	loff_t disksize = EXT4_I(inode)->i_disksize;
-	handle_t *handle = NULL;
-
-	handle = ext4_journal_current_handle();
-	BUG_ON(!handle);
-	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-			bh_result, create, 0, EXT4_DELALLOC_RSVED);
-	if (ret > 0) {
-
-		bh_result->b_size = (ret << inode->i_blkbits);
-
-		if (ext4_should_order_data(inode)) {
-			int retval;
-			retval = ext4_jbd2_file_inode(handle, inode);
-			if (retval)
-				/*
-				 * Failed to add inode for ordered
-				 * mode. Don't update file size
-				 */
-				return retval;
-		}
-
-		/*
-		 * Update on-disk size along with block allocation
-		 * we don't use 'extend_disksize' as size may change
-		 * within already allocated block -bzzz
-		 */
-		disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
-		if (disksize > i_size_read(inode))
-			disksize = i_size_read(inode);
-		if (disksize > EXT4_I(inode)->i_disksize) {
-			ext4_update_i_disksize(inode, disksize);
-			ret = ext4_mark_inode_dirty(handle, inode);
-			return ret;
-		}
-		ret = 0;
-	}
-	return ret;
-}
 
 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
 {
@@ -2584,7 +2576,6 @@ retry:
 			dump_stack();
 			goto out_writepages;
 		}
-		mpd.get_block = ext4_da_get_block_write;
 		ret = mpage_da_writepages(mapping, wbc, &mpd);
 
 		ext4_journal_stop(handle);
-- 
cgit v1.2.3


From 8dc207c0e7a259e7122ddfaf56b8bbbc3c92d685 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 23 Feb 2009 06:46:01 -0500
Subject: ext4: Save stack space by removing fake buffer heads

Struct mpage_da_data and mpage_add_bh_to_extent() use a fake struct
buffer_head which is 104 bytes on an x86_64 system, but only use 24
bytes of the structure.  On systems that use a spinlock for atomic_t,
the stack savings will be even greater.

It turns out that using a fake struct buffer_head doesn't even save
that much code, and it makes the code more confusing since it's not
used as a "real" buffer head.  So just store pass b_size and b_state
in mpage_add_bh_to_extent(), and store b_size, b_state, and b_block_nr
in the mpage_da_data structure.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 83 +++++++++++++++++++++++++++------------------------------
 1 file changed, 39 insertions(+), 44 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 924ba8afc22..008b28a859d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1703,7 +1703,9 @@ static void ext4_da_page_release_reservation(struct page *page,
 
 struct mpage_da_data {
 	struct inode *inode;
-	struct buffer_head lbh;			/* extent of blocks */
+	sector_t b_blocknr;		/* start block number of extent */
+	size_t b_size;			/* size of extent */
+	unsigned long b_state;		/* state of the extent */
 	unsigned long first_page, next_page;	/* extent of pages */
 	struct writeback_control *wbc;
 	int io_done;
@@ -1737,7 +1739,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	/*
 	 * We need to start from the first_page to the next_page - 1
 	 * to make sure we also write the mapped dirty buffer_heads.
-	 * If we look at mpd->lbh.b_blocknr we would only be looking
+	 * If we look at mpd->b_blocknr we would only be looking
 	 * at the currently mapped buffer_heads.
 	 */
 	index = mpd->first_page;
@@ -1975,7 +1977,7 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 /*
  * mpage_da_map_blocks - go through given space
  *
- * @mpd->lbh - bh describing space
+ * @mpd - bh describing space
  *
  * The function skips space we know is already mapped to disk blocks.
  *
@@ -1984,18 +1986,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
 	int err = 0;
 	struct buffer_head new;
-	struct buffer_head *lbh = &mpd->lbh;
 	sector_t next;
 
 	/*
 	 * We consider only non-mapped and non-allocated blocks
 	 */
-	if (buffer_mapped(lbh) && !buffer_delay(lbh))
+	if ((mpd->b_state  & (1 << BH_Mapped)) &&
+	    !(mpd->b_state & (1 << BH_Delay)))
 		return 0;
-	new.b_state = lbh->b_state;
+	new.b_state = mpd->b_state;
 	new.b_blocknr = 0;
-	new.b_size = lbh->b_size;
-	next = lbh->b_blocknr;
+	new.b_size = mpd->b_size;
+	next = mpd->b_blocknr;
 	/*
 	 * If we didn't accumulate anything
 	 * to write simply return
@@ -2031,7 +2033,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 				  "%zd with error %d\n",
 				  __func__, mpd->inode->i_ino,
 				  (unsigned long long)next,
-				  lbh->b_size >> mpd->inode->i_blkbits, err);
+				  mpd->b_size >> mpd->inode->i_blkbits, err);
 		printk(KERN_EMERG "This should not happen.!! "
 					"Data will be lost\n");
 		if (err == -ENOSPC) {
@@ -2039,7 +2041,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		}
 		/* invlaidate all the pages */
 		ext4_da_block_invalidatepages(mpd, next,
-				lbh->b_size >> mpd->inode->i_blkbits);
+				mpd->b_size >> mpd->inode->i_blkbits);
 		return err;
 	}
 	BUG_ON(new.b_size == 0);
@@ -2051,7 +2053,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * If blocks are delayed marked, we need to
 	 * put actual blocknr and drop delayed bit
 	 */
-	if (buffer_delay(lbh) || buffer_unwritten(lbh))
+	if ((mpd->b_state & (1 << BH_Delay)) ||
+	    (mpd->b_state & (1 << BH_Unwritten)))
 		mpage_put_bnr_to_bhs(mpd, next, &new);
 
 	return 0;
@@ -2070,12 +2073,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
  * the function is used to collect contig. blocks in same state
  */
 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
-				   sector_t logical, struct buffer_head *bh)
+				   sector_t logical, size_t b_size,
+				   unsigned long b_state)
 {
 	sector_t next;
-	size_t b_size = bh->b_size;
-	struct buffer_head *lbh = &mpd->lbh;
-	int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
+	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
 
 	/* check if thereserved journal credits might overflow */
 	if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2102,19 +2104,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	/*
 	 * First block in the extent
 	 */
-	if (lbh->b_size == 0) {
-		lbh->b_blocknr = logical;
-		lbh->b_size = b_size;
-		lbh->b_state = bh->b_state & BH_FLAGS;
+	if (mpd->b_size == 0) {
+		mpd->b_blocknr = logical;
+		mpd->b_size = b_size;
+		mpd->b_state = b_state & BH_FLAGS;
 		return;
 	}
 
-	next = lbh->b_blocknr + nrblocks;
+	next = mpd->b_blocknr + nrblocks;
 	/*
 	 * Can we merge the block to our big extent?
 	 */
-	if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
-		lbh->b_size += b_size;
+	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
+		mpd->b_size += b_size;
 		return;
 	}
 
@@ -2143,7 +2145,7 @@ static int __mpage_da_writepage(struct page *page,
 {
 	struct mpage_da_data *mpd = data;
 	struct inode *inode = mpd->inode;
-	struct buffer_head *bh, *head, fake;
+	struct buffer_head *bh, *head;
 	sector_t logical;
 
 	if (mpd->io_done) {
@@ -2185,9 +2187,9 @@ static int __mpage_da_writepage(struct page *page,
 		/*
 		 * ... and blocks
 		 */
-		mpd->lbh.b_size = 0;
-		mpd->lbh.b_state = 0;
-		mpd->lbh.b_blocknr = 0;
+		mpd->b_size = 0;
+		mpd->b_state = 0;
+		mpd->b_blocknr = 0;
 	}
 
 	mpd->next_page = page->index + 1;
@@ -2195,16 +2197,8 @@ static int __mpage_da_writepage(struct page *page,
 		  (PAGE_CACHE_SHIFT - inode->i_blkbits);
 
 	if (!page_has_buffers(page)) {
-		/*
-		 * There is no attached buffer heads yet (mmap?)
-		 * we treat the page asfull of dirty blocks
-		 */
-		bh = &fake;
-		bh->b_size = PAGE_CACHE_SIZE;
-		bh->b_state = 0;
-		set_buffer_dirty(bh);
-		set_buffer_uptodate(bh);
-		mpage_add_bh_to_extent(mpd, logical, bh);
+		mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
+				       (1 << BH_Dirty) | (1 << BH_Uptodate));
 		if (mpd->io_done)
 			return MPAGE_DA_EXTENT_TAIL;
 	} else {
@@ -2222,8 +2216,10 @@ static int __mpage_da_writepage(struct page *page,
 			 * with the page in ext4_da_writepage
 			 */
 			if (buffer_dirty(bh) &&
-				(!buffer_mapped(bh) || buffer_delay(bh))) {
-				mpage_add_bh_to_extent(mpd, logical, bh);
+			    (!buffer_mapped(bh) || buffer_delay(bh))) {
+				mpage_add_bh_to_extent(mpd, logical,
+						       bh->b_size,
+						       bh->b_state);
 				if (mpd->io_done)
 					return MPAGE_DA_EXTENT_TAIL;
 			} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2235,9 +2231,8 @@ static int __mpage_da_writepage(struct page *page,
 				 * unmapped buffer_head later we need to
 				 * use the b_state flag of that buffer_head.
 				 */
-				if (mpd->lbh.b_size == 0)
-					mpd->lbh.b_state =
-						bh->b_state & BH_FLAGS;
+				if (mpd->b_size == 0)
+					mpd->b_state = bh->b_state & BH_FLAGS;
 			}
 			logical++;
 		} while ((bh = bh->b_this_page) != head);
@@ -2263,9 +2258,9 @@ static int mpage_da_writepages(struct address_space *mapping,
 {
 	int ret;
 
-	mpd->lbh.b_size = 0;
-	mpd->lbh.b_state = 0;
-	mpd->lbh.b_blocknr = 0;
+	mpd->b_size = 0;
+	mpd->b_state = 0;
+	mpd->b_blocknr = 0;
 	mpd->first_page = 0;
 	mpd->next_page = 0;
 	mpd->io_done = 0;
-- 
cgit v1.2.3


From f63e6005bc63acc0a6bc3bdb8f971dcfbd827185 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 23 Feb 2009 16:42:39 -0500
Subject: ext4: Simplify delalloc code by removing mpage_da_writepages()

The mpage_da_writepages() function is only used in one place, so
inline it to simplify the call stack and make the code easier to
understand.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 74 +++++++++++++++++++++++++--------------------------------
 1 file changed, 32 insertions(+), 42 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 008b28a859d..24d0f9d2b32 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2241,47 +2241,6 @@ static int __mpage_da_writepage(struct page *page,
 	return 0;
 }
 
-/*
- * mpage_da_writepages - walk the list of dirty pages of the given
- * address space, allocates non-allocated blocks, maps newly-allocated
- * blocks to existing bhs and issue IO them
- *
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- */
-static int mpage_da_writepages(struct address_space *mapping,
-			       struct writeback_control *wbc,
-			       struct mpage_da_data *mpd)
-{
-	int ret;
-
-	mpd->b_size = 0;
-	mpd->b_state = 0;
-	mpd->b_blocknr = 0;
-	mpd->first_page = 0;
-	mpd->next_page = 0;
-	mpd->io_done = 0;
-	mpd->pages_written = 0;
-	mpd->retval = 0;
-
-	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-	/*
-	 * Handle last extent of pages
-	 */
-	if (!mpd->io_done && mpd->next_page != mpd->first_page) {
-		if (mpage_da_map_blocks(mpd) == 0)
-			mpage_da_submit_io(mpd);
-
-		mpd->io_done = 1;
-		ret = MPAGE_DA_EXTENT_TAIL;
-	}
-	wbc->nr_to_write -= mpd->pages_written;
-	return ret;
-}
-
 /*
  * this is a special callback for ->write_begin() only
  * it's intention is to return mapped block or reserve space
@@ -2571,7 +2530,38 @@ retry:
 			dump_stack();
 			goto out_writepages;
 		}
-		ret = mpage_da_writepages(mapping, wbc, &mpd);
+
+		/*
+		 * Now call __mpage_da_writepage to find the next
+		 * contiguous region of logical blocks that need
+		 * blocks to be allocated by ext4.  We don't actually
+		 * submit the blocks for I/O here, even though
+		 * write_cache_pages thinks it will, and will set the
+		 * pages as clean for write before calling
+		 * __mpage_da_writepage().
+		 */
+		mpd.b_size = 0;
+		mpd.b_state = 0;
+		mpd.b_blocknr = 0;
+		mpd.first_page = 0;
+		mpd.next_page = 0;
+		mpd.io_done = 0;
+		mpd.pages_written = 0;
+		mpd.retval = 0;
+		ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
+					&mpd);
+		/*
+		 * If we have a contigous extent of pages and we
+		 * haven't done the I/O yet, map the blocks and submit
+		 * them for I/O.
+		 */
+		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
+			if (mpage_da_map_blocks(&mpd) == 0)
+				mpage_da_submit_io(&mpd);
+			mpd.io_done = 1;
+			ret = MPAGE_DA_EXTENT_TAIL;
+		}
+		wbc->nr_to_write -= mpd.pages_written;
 
 		ext4_journal_stop(handle);
 
-- 
cgit v1.2.3


From ccd2506bd43113659aa904d5bea5d1300605e2a6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 26 Feb 2009 01:04:07 -0500
Subject: ext4: add EXT4_IOC_ALLOC_DA_BLKS ioctl

Add an ioctl which forces all of the delay allocated blocks to be
allocated.  This also provides a function ext4_alloc_da_blocks() which
will be used by the following commits to force files to be fully
allocated to preserve application-expected ext3 behaviour.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  3 +++
 fs/ext4/inode.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ioctl.c | 14 ++++++++++++++
 3 files changed, 59 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 096456c8559..b0ea70cc94d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -317,7 +317,9 @@ struct ext4_new_group_data {
 #define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
 #define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
 #define EXT4_IOC_MIGRATE		_IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
  /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 
 /*
  * ioctl commands in 32 bit emulation
@@ -1094,6 +1096,7 @@ extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 24d0f9d2b32..8dd3d5de586 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2837,6 +2837,48 @@ out:
 	return;
 }
 
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+	if (!EXT4_I(inode)->i_reserved_data_blocks &&
+	    !EXT4_I(inode)->i_reserved_meta_blocks)
+		return 0;
+
+	/*
+	 * We do something simple for now.  The filemap_flush() will
+	 * also start triggering a write of the data blocks, which is
+	 * not strictly speaking necessary (and for users of
+	 * laptop_mode, not even desirable).  However, to do otherwise
+	 * would require replicating code paths in:
+	 * 
+	 * ext4_da_writepages() ->
+	 *    write_cache_pages() ---> (via passed in callback function)
+	 *        __mpage_da_writepage() -->
+	 *           mpage_add_bh_to_extent()
+	 *           mpage_da_map_blocks()
+	 *
+	 * The problem is that write_cache_pages(), located in
+	 * mm/page-writeback.c, marks pages clean in preparation for
+	 * doing I/O, which is not desirable if we're not planning on
+	 * doing I/O at all.
+	 *
+	 * We could call write_cache_pages(), and then redirty all of
+	 * the pages by calling redirty_page_for_writeback() but that
+	 * would be ugly in the extreme.  So instead we would need to
+	 * replicate parts of the code in the above functions,
+	 * simplifying them becuase we wouldn't actually intend to
+	 * write out the pages, but rather only collect contiguous
+	 * logical block extents, call the multi-block allocator, and
+	 * then update the buffer heads with the block allocations.
+	 * 
+	 * For now, though, we'll cheat by calling filemap_flush(),
+	 * which will map the blocks, and start the I/O, but not
+	 * actually wait for the I/O to complete.
+	 */
+	return filemap_flush(inode->i_mapping);
+}
 
 /*
  * bmap() is special.  It gets used by applications such as lilo and by
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 22dd29f3ebc..91e75f7a9e7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -262,6 +262,20 @@ setversion_out:
 		return err;
 	}
 
+	case EXT4_IOC_ALLOC_DA_BLKS:
+	{
+		int err;
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+		err = ext4_alloc_da_blocks(inode);
+		mnt_drop_write(filp->f_path.mnt);
+		return err;
+	}
+
 	default:
 		return -ENOTTY;
 	}
-- 
cgit v1.2.3


From 7d8f9f7d150dded7b68e61ca6403a1f166fb4edf Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 24 Feb 2009 08:21:14 -0500
Subject: ext4: Automatically allocate delay allocated blocks on close

When closing a file that had been previously truncated, force any
delay allocated blocks that to be allocated so that if the filesystem
is mounted with data=ordered, the data blocks will be pushed out to
disk along with the journal commit.  Many application programs expect
this, so we do this to avoid zero length files if the system crashes
unexpectedly.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 1 +
 fs/ext4/file.c  | 4 ++++
 fs/ext4/inode.c | 3 +++
 3 files changed, 8 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0ea70cc94d..1b0c1736463 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -270,6 +270,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 #define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
 #define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
+#define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a0..06df8272c63 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,6 +33,10 @@
  */
 static int ext4_release_file(struct inode *inode, struct file *filp)
 {
+	if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+		ext4_alloc_da_blocks(inode);
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8dd3d5de586..80ed6dc9c9d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3901,6 +3901,9 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
+	if (inode->i_size == 0)
+		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
 		ext4_ext_truncate(inode);
 		return;
-- 
cgit v1.2.3


From 8750c6d5fcbd3342b3d908d157f81d345c5325a7 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 23 Feb 2009 23:05:27 -0500
Subject: ext4: Automatically allocate delay allocated blocks on rename

When renaming a file such that a link to another inode is overwritten,
force any delay allocated blocks that to be allocated so that if the
filesystem is mounted with data=ordered, the data blocks will be
pushed out to disk along with the journal commit.  Many application
programs expect this, so we do this to avoid zero length files if the
system crashes unexpectedly.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6e1ad68cdc7..eb20246c896 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2357,7 +2357,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct inode *old_inode, *new_inode;
 	struct buffer_head *old_bh, *new_bh, *dir_bh;
 	struct ext4_dir_entry_2 *old_de, *new_de;
-	int retval;
+	int retval, force_da_alloc = 0;
 
 	old_bh = new_bh = dir_bh = NULL;
 
@@ -2497,6 +2497,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		ext4_mark_inode_dirty(handle, new_inode);
 		if (!new_inode->i_nlink)
 			ext4_orphan_add(handle, new_inode);
+		force_da_alloc = 1;
 	}
 	retval = 0;
 
@@ -2505,6 +2506,8 @@ end_rename:
 	brelse(old_bh);
 	brelse(new_bh);
 	ext4_journal_stop(handle);
+	if (retval == 0 && force_da_alloc)
+		ext4_alloc_da_blocks(old_inode);
 	return retval;
 }
 
-- 
cgit v1.2.3


From d6014301b5599fba395c42a1e96a7fe86f7d0b2d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 27 Mar 2009 22:36:43 -0400
Subject: ext4: Fix discard of inode prealloc space with delayed allocation.

With delayed allocation we should not/cannot discard inode prealloc
space during file close. We would still have dirty pages for which we
haven't allocated blocks yet. With this fix after each get_blocks
request we check whether we have zero reserved blocks and if yes and
we don't have any writers on the file we discard inode prealloc space.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/file.c  | 3 ++-
 fs/ext4/inode.c | 9 ++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 06df8272c63..588af8c7724 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -39,7 +39,8 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
 	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
-			(atomic_read(&inode->i_writecount) == 1))
+			(atomic_read(&inode->i_writecount) == 1) &&
+		        !EXT4_I(inode)->i_reserved_data_blocks)
 	{
 		down_write(&EXT4_I(inode)->i_data_sem);
 		ext4_discard_preallocations(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 80ed6dc9c9d..7dcac9d7e49 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1067,9 +1067,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 	/*
 	 * free those over-booking quota for metadata blocks
 	 */
-
 	if (mdb_free)
 		vfs_dq_release_reservation_block(inode, mdb_free);
+
+	/*
+	 * If we have done all the pending block allocations and if
+	 * there aren't any writers on the inode, we can discard the
+	 * inode's preallocations.
+	 */
+	if (!total && (atomic_read(&inode->i_writecount) == 0))
+		ext4_discard_preallocations(inode);
 }
 
 /*
-- 
cgit v1.2.3


From afc32f7ee9febc020c73da61402351d4c90437f3 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 28 Feb 2009 19:39:58 -0500
Subject: ext4: Track lifetime disk writes

Add a new superblock value which tracks the lifetime amount of writes
to the filesystem.  This is useful in estimating the amount of wear on
solid state drives (SSD's) caused by writes to the filesystem.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 3 ++-
 fs/ext4/ext4_sb.h | 4 ++++
 fs/ext4/super.c   | 7 +++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1b0c1736463..0bd39188531 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -683,7 +683,8 @@ struct ext4_super_block {
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
 	__u8	s_reserved_char_pad2;
 	__le16  s_reserved_pad;
-	__u32   s_reserved[162];        /* Padding to the end of the block */
+	__le64	s_kbytes_written;	/* nr of lifetime kilobytes written */
+	__u32   s_reserved[160];        /* Padding to the end of the block */
 };
 
 #ifdef __KERNEL__
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 4e4d9cc3f40..50ab1169c37 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -142,6 +142,10 @@ struct ext4_sb_info {
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
 
+	/* for write statistics */
+	unsigned long s_sectors_written_start;
+	u64 s_kbytes_written;
+
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
 };
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a3768709ce0..30fc27cdf8f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2035,6 +2035,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_resgid = EXT4_DEF_RESGID;
 	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
 	sbi->s_sb_block = sb_block;
+	sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
+						      sectors[1]);
 
 	unlock_kernel();
 
@@ -2072,6 +2074,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = le16_to_cpu(es->s_magic);
 	if (sb->s_magic != EXT4_SUPER_MAGIC)
 		goto cantfind_ext4;
+	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
 
 	/* Set defaults before we parse the mount options */
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2921,6 +2924,10 @@ static int ext4_commit_super(struct super_block *sb,
 		set_buffer_uptodate(sbh);
 	}
 	es->s_wtime = cpu_to_le32(get_seconds());
+	es->s_kbytes_written =
+		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
+			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
 	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
 					&EXT4_SB(sb)->s_freeblocks_counter));
 	es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
-- 
cgit v1.2.3


From 3197ebdb130473a92760100cbfe0d7e671838f48 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 31 Mar 2009 09:10:09 -0400
Subject: ext4: Add sysfs support

Add basic sysfs support so that information about the mounted
filesystem and various tuning parameters can be accessed via
/sys/fs/ext4/<dev>/*.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 Documentation/ABI/testing/sysfs-fs-ext4 |  81 +++++++++++++
 fs/ext4/ext4_sb.h                       |   2 +
 fs/ext4/super.c                         | 207 ++++++++++++++++++++++++++++++++
 3 files changed, 290 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-fs-ext4

diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
new file mode 100644
index 00000000000..4e79074de28
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -0,0 +1,81 @@
+What:		/sys/fs/ext4/<disk>/mb_stats
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		 Controls whether the multiblock allocator should
+		 collect statistics, which are shown during the unmount.
+		 1 means to collect statistics, 0 means not to collect
+		 statistics
+
+What:		/sys/fs/ext4/<disk>/mb_group_prealloc
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The multiblock allocator will round up allocation
+		requests to a multiple of this tuning parameter if the
+		stripe size is not set in the ext4 superblock
+
+What:		/sys/fs/ext4/<disk>/mb_max_to_scan
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The maximum number of extents the multiblock allocator
+		will search to find the best extent
+
+What:		/sys/fs/ext4/<disk>/mb_min_to_scan
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The minimum number of extents the multiblock allocator
+		will search to find the best extent
+
+What:		/sys/fs/ext4/<disk>/mb_order2_req
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Tuning parameter which controls the minimum size for 
+		requests (as a power of 2) where the buddy cache is
+		used
+
+What:		/sys/fs/ext4/<disk>/mb_stream_req
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Files which have fewer blocks than this tunable
+		parameter will have their blocks allocated out of a
+		block group specific preallocation pool, so that small
+		files are packed closely together.  Each large file
+		 will have its blocks allocated out of its own unique
+		 preallocation pool.
+
+What:		/sys/fs/ext4/<disk>/inode_readahead
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Tuning parameter which controls the maximum number of
+		inode table blocks that ext4's inode table readahead
+		algorithm will pre-read into the buffer cache
+
+What:		/sys/fs/ext4/<disk>/delayed_allocation_blocks
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of blocks
+		that are dirty in the page cache, but which do not
+		have their location in the filesystem allocated yet.
+
+What:		/sys/fs/ext4/<disk>/lifetime_write_kbytes
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of kilobytes
+		of data that have been written to this filesystem since it was
+		created.
+
+What:		/sys/fs/ext4/<disk>/session_write_kbytes
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of
+		kilobytes of data that have been written to this
+		filesystem since it was mounted.
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 50ab1169c37..57b71fefbcc 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -64,6 +64,8 @@ struct ext4_sb_info {
 	struct percpu_counter s_dirtyblocks_counter;
 	struct blockgroup_lock *s_blockgroup_lock;
 	struct proc_dir_entry *s_proc;
+	struct kobject s_kobj;
+	struct completion s_kobj_unregister;
 
 	/* Journaling */
 	struct inode *s_journal_inode;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 30fc27cdf8f..2883d4318c2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/ctype.h>
 #include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
@@ -48,6 +49,7 @@
 #include "group.h"
 
 struct proc_dir_entry *ext4_proc_root;
+static struct kset *ext4_kset;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
@@ -580,6 +582,7 @@ static void ext4_put_super(struct super_block *sb)
 		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 	}
+	kobject_del(&sbi->s_kobj);
 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -615,6 +618,16 @@ static void ext4_put_super(struct super_block *sb)
 		ext4_blkdev_remove(sbi);
 	}
 	sb->s_fs_info = NULL;
+	/*
+	 * Now that we are completely done shutting down the
+	 * superblock, we need to actually destroy the kobject.
+	 */
+	unlock_kernel();
+	unlock_super(sb);
+	kobject_put(&sbi->s_kobj);
+	wait_for_completion(&sbi->s_kobj_unregister);
+	lock_super(sb);
+	lock_kernel();
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	return;
@@ -1464,6 +1477,11 @@ set_qf_format:
 				return 0;
 			if (option < 0 || option > (1 << 30))
 				return 0;
+			if (option & (option - 1)) {
+				printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
+				       " must be a power of 2\n");
+				return 0;
+			}
 			sbi->s_inode_readahead_blks = option;
 			break;
 		case Opt_journal_ioprio:
@@ -1992,6 +2010,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 	return 0;
 }
 
+/* sysfs supprt */
+
+struct ext4_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
+	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 
+			 const char *, size_t);
+	int offset;
+};
+
+static int parse_strtoul(const char *buf,
+		unsigned long max, unsigned long *value)
+{
+	char *endp;
+
+	while (*buf && isspace(*buf))
+		buf++;
+	*value = simple_strtoul(buf, &endp, 0);
+	while (*endp && isspace(*endp))
+		endp++;
+	if (*endp || *value > max)
+		return -EINVAL;
+
+	return 0;
+}
+
+static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+}
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+					 struct ext4_sb_info *sbi, char *buf)
+{
+	struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+	return snprintf(buf, PAGE_SIZE, "%lu\n",
+			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			 sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+					  struct ext4_sb_info *sbi, char *buf)
+{
+	struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			sbi->s_kbytes_written + 
+			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			  EXT4_SB(sb)->s_sectors_written_start) >> 1));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+					  struct ext4_sb_info *sbi,
+					  const char *buf, size_t count)
+{
+	unsigned long t;
+
+	if (parse_strtoul(buf, 0x40000000, &t))
+		return -EINVAL;
+
+	/* inode_readahead_blks must be a power of 2 */
+	if (t & (t-1))
+		return -EINVAL;
+
+	sbi->s_inode_readahead_blks = t;
+	return count;
+}
+
+static ssize_t sbi_ui_show(struct ext4_attr *a,
+				struct ext4_sb_info *sbi, char *buf)
+{
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t sbi_ui_store(struct ext4_attr *a,
+			    struct ext4_sb_info *sbi,
+			    const char *buf, size_t count)
+{
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+	unsigned long t;
+
+	if (parse_strtoul(buf, 0xffffffff, &t))
+		return -EINVAL;
+	*ui = t;
+	return count;
+}
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
+	.show	= _show,					\
+	.store	= _store,					\
+	.offset = offsetof(struct ext4_sb_info, _elname),	\
+}
+#define EXT4_ATTR(name, mode, show, store) \
+static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+
+#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
+#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+#define EXT4_RW_ATTR_SBI_UI(name, elname)	\
+	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+
+EXT4_RO_ATTR(delayed_allocation_blocks);
+EXT4_RO_ATTR(session_write_kbytes);
+EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+		 inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+
+static struct attribute *ext4_attrs[] = {
+	ATTR_LIST(delayed_allocation_blocks),
+	ATTR_LIST(session_write_kbytes),
+	ATTR_LIST(lifetime_write_kbytes),
+	ATTR_LIST(inode_readahead_blks),
+	ATTR_LIST(mb_stats),
+	ATTR_LIST(mb_max_to_scan),
+	ATTR_LIST(mb_min_to_scan),
+	ATTR_LIST(mb_order2_req),
+	ATTR_LIST(mb_stream_req),
+	ATTR_LIST(mb_group_prealloc),
+	NULL,
+};
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+	return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+	return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	complete(&sbi->s_kobj_unregister);
+}
+
+
+static struct sysfs_ops ext4_attr_ops = {
+	.show	= ext4_attr_show,
+	.store	= ext4_attr_store,
+};
+
+static struct kobj_type ext4_ktype = {
+	.default_attrs	= ext4_attrs,
+	.sysfs_ops	= &ext4_attr_ops,
+	.release	= ext4_sb_release,
+};
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
@@ -2575,6 +2768,16 @@ no_journal:
 		goto failed_mount4;
 	}
 
+	sbi->s_kobj.kset = ext4_kset;
+	init_completion(&sbi->s_kobj_unregister);
+	err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
+				   "%s", sb->s_id);
+	if (err) {
+		ext4_mb_release(sb);
+		ext4_ext_release(sb);
+		goto failed_mount4;
+	};
+
 	/*
 	 * akpm: core read_super() calls in here with the superblock locked.
 	 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -3734,6 +3937,9 @@ static int __init init_ext4_fs(void)
 {
 	int err;
 
+	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
+	if (!ext4_kset)
+		return -ENOMEM;
 	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 	err = init_ext4_mballoc();
 	if (err)
@@ -3775,6 +3981,7 @@ static void __exit exit_ext4_fs(void)
 	exit_ext4_xattr();
 	exit_ext4_mballoc();
 	remove_proc_entry("fs/ext4", NULL);
+	kset_unregister(ext4_kset);
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-- 
cgit v1.2.3


From b713a5ec55bf73c833f9883cdd761b20ee61a1ab Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 31 Mar 2009 09:11:14 -0400
Subject: ext4: remove /proc tuning knobs

Remove tuning knobs in /proc/fs/ext4/<dev/* since they have been
replaced by knobs in sysfs at /sys/fs/ext4/<dev>/*.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 Documentation/filesystems/proc.txt |  21 -------
 fs/ext4/ext4.h                     |  16 -----
 fs/ext4/inode.c                    |   7 +--
 fs/ext4/mballoc.c                  | 117 +++++++++----------------------------
 fs/ext4/super.c                    |  46 ---------------
 5 files changed, 30 insertions(+), 177 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 830bad7cce0..efc4fd9f40c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
  File            Content                                        
  mb_groups       details of multiblock allocator buddy cache of free blocks
  mb_history      multiblock allocation history
- stats           controls whether the multiblock allocator should start
-                 collecting statistics, which are shown during the unmount
- group_prealloc  the multiblock allocator will round up allocation
-                 requests to a multiple of this tuning parameter if the
-                 stripe size is not set in the ext4 superblock
- max_to_scan     The maximum number of extents the multiblock allocator
-                 will search to find the best extent
- min_to_scan     The minimum number of extents the multiblock allocator
-                 will search to find the best extent
- order2_req      Tuning parameter which controls the minimum size for 
-                 requests (as a power of 2) where the buddy cache is
-                 used
- stream_req      Files which have fewer blocks than this tunable
-                 parameter will have their blocks allocated out of a
-                 block group specific preallocation pool, so that small
-                 files are packed closely together.  Each large file
-                 will have its blocks allocated out of its own unique
-                 preallocation pool.
-inode_readahead  Tuning parameter which controls the maximum number of
-                 inode table blocks that ext4's inode table readahead
-                 algorithm will pre-read into the buffer cache
 ..............................................................................
 
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0bd39188531..e5c273ff928 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -976,22 +976,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 extern struct proc_dir_entry *ext4_proc_root;
 
-#ifdef CONFIG_PROC_FS
-extern const struct file_operations ext4_ui_proc_fops;
-
-#define	EXT4_PROC_HANDLER(name, var)					\
-do {									\
-	proc = proc_create_data(name, mode, sbi->s_proc,		\
-				&ext4_ui_proc_fops, &sbi->s_##var);	\
-	if (proc == NULL) {						\
-		printk(KERN_ERR "EXT4-fs: can't create %s\n", name);	\
-		goto err_out;						\
-	}								\
-} while (0)
-#else
-#define EXT4_PROC_HANDLER(name, var)
-#endif
-
 /*
  * Function prototypes
  */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7dcac9d7e49..d3118d1acc3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4153,12 +4153,7 @@ make_io:
 			unsigned num;
 
 			table = ext4_inode_table(sb, gdp);
-			/* Make sure s_inode_readahead_blks is a power of 2 */
-			while (EXT4_SB(sb)->s_inode_readahead_blks &
-			       (EXT4_SB(sb)->s_inode_readahead_blks-1))
-				EXT4_SB(sb)->s_inode_readahead_blks = 
-				   (EXT4_SB(sb)->s_inode_readahead_blks &
-				    (EXT4_SB(sb)->s_inode_readahead_blks-1));
+			/* s_inode_readahead_blks is always a power of 2 */
 			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
 			if (table > b)
 				b = table;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b0d6022eaa6..c4c43097762 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -46,22 +46,23 @@
  * The allocation request involve request for multiple number of blocks
  * near to the goal(block) value specified.
  *
- * During initialization phase of the allocator we decide to use the group
- * preallocation or inode preallocation depending on the size file. The
- * size of the file could be the resulting file size we would have after
- * allocation or the current file size which ever is larger. If the size is
- * less that sbi->s_mb_stream_request we select the group
- * preallocation. The default value of s_mb_stream_request is 16
- * blocks. This can also be tuned via
- * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
- * of number of blocks.
+ * During initialization phase of the allocator we decide to use the
+ * group preallocation or inode preallocation depending on the size of
+ * the file. The size of the file could be the resulting file size we
+ * would have after allocation, or the current file size, which ever
+ * is larger. If the size is less than sbi->s_mb_stream_request we
+ * select to use the group preallocation. The default value of
+ * s_mb_stream_request is 16 blocks. This can also be tuned via
+ * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
+ * terms of number of blocks.
  *
  * The main motivation for having small file use group preallocation is to
- * ensure that we have small file closer in the disk.
+ * ensure that we have small files closer together on the disk.
  *
- * First stage the allocator looks at the inode prealloc list
- * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for
- * this particular inode. The inode prealloc space is represented as:
+ * First stage the allocator looks at the inode prealloc list,
+ * ext4_inode_info->i_prealloc_list, which contains list of prealloc
+ * spaces for this particular inode. The inode prealloc space is
+ * represented as:
  *
  * pa_lstart -> the logical start block for this prealloc space
  * pa_pstart -> the physical start block for this prealloc space
@@ -121,29 +122,29 @@
  * list. In case of inode preallocation we follow a list of heuristics
  * based on file size. This can be found in ext4_mb_normalize_request. If
  * we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
+ * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
  * 512 blocks. This can be tuned via
- * /proc/fs/ext4/<partition/group_prealloc. The value is represented in
+ * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in
  * terms of number of blocks. If we have mounted the file system with -O
  * stripe=<value> option the group prealloc request is normalized to the
  * stripe value (sbi->s_stripe)
  *
- * The regular allocator(using the buddy cache) support few tunables.
+ * The regular allocator(using the buddy cache) supports few tunables.
  *
- * /proc/fs/ext4/<partition>/min_to_scan
- * /proc/fs/ext4/<partition>/max_to_scan
- * /proc/fs/ext4/<partition>/order2_req
+ * /sys/fs/ext4/<partition>/mb_min_to_scan
+ * /sys/fs/ext4/<partition>/mb_max_to_scan
+ * /sys/fs/ext4/<partition>/mb_order2_req
  *
- * The regular allocator use buddy scan only if the request len is power of
+ * The regular allocator uses buddy scan only if the request len is power of
  * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
  * value of s_mb_order2_reqs can be tuned via
- * /proc/fs/ext4/<partition>/order2_req.  If the request len is equal to
+ * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
  * stripe size (sbi->s_stripe), we try to search for contigous block in
- * stripe size. This should result in better allocation on RAID setup. If
- * not we search in the specific group using bitmap for best extents. The
- * tunable min_to_scan and max_to_scan controll the behaviour here.
+ * stripe size. This should result in better allocation on RAID setups. If
+ * not, we search in the specific group using bitmap for best extents. The
+ * tunable min_to_scan and max_to_scan control the behaviour here.
  * min_to_scan indicate how long the mballoc __must__ look for a best
- * extent and max_to_scanindicate how long the mballoc __can__ look for a
+ * extent and max_to_scan indicates how long the mballoc __can__ look for a
  * best extent in the found extents. Searching for the blocks starts with
  * the group specified as the goal value in allocation context via
  * ac_g_ex. Each group is first checked based on the criteria whether it
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 						ext4_group_t group);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
 
 
@@ -1978,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	/*
 	 * We search using buddy data only if the order of the request
 	 * is greater than equal to the sbi_s_mb_order2_reqs
-	 * You can tune it via /proc/fs/ext4/<partition>/order2_req
+	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
 	 */
 	if (i >= sbi->s_mb_order2_reqs) {
 		/*
@@ -2753,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 		spin_lock_init(&lg->lg_prealloc_lock);
 	}
 
-	ext4_mb_init_per_dev_proc(sb);
 	ext4_mb_history_init(sb);
 
 	if (sbi->s_journal)
@@ -2836,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
 
 	free_percpu(sbi->s_locality_groups);
 	ext4_mb_history_release(sb);
-	ext4_mb_destroy_per_dev_proc(sb);
 
 	return 0;
 }
@@ -2897,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 	mb_debug("freed %u blocks in %u structures\n", count, count2);
 }
 
-#define EXT4_MB_STATS_NAME		"stats"
-#define EXT4_MB_MAX_TO_SCAN_NAME	"max_to_scan"
-#define EXT4_MB_MIN_TO_SCAN_NAME	"min_to_scan"
-#define EXT4_MB_ORDER2_REQ		"order2_req"
-#define EXT4_MB_STREAM_REQ		"stream_req"
-#define EXT4_MB_GROUP_PREALLOC		"group_prealloc"
-
-static int ext4_mb_init_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
-	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct proc_dir_entry *proc;
-
-	if (sbi->s_proc == NULL)
-		return -EINVAL;
-
-	EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
-	EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
-	EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
-	EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
-	EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
-	EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
-	return 0;
-
-err_out:
-	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-	return -ENOMEM;
-#else
-	return 0;
-#endif
-}
-
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-
-	if (sbi->s_proc == NULL)
-		return -EINVAL;
-
-	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-#endif
-	return 0;
-}
-
 int __init init_ext4_mballoc(void)
 {
 	ext4_pspace_cachep =
@@ -3123,7 +3064,7 @@ out_err:
  * here we normalize request for locality group
  * Group request are normalized to s_strip size if we set the same via mount
  * option. If not we set it to s_mb_group_prealloc which can be configured via
- * /proc/fs/ext4/<partition>/group_prealloc
+ * /sys/fs/ext4/<partition>/mb_group_prealloc
  *
  * XXX: should we try to preallocate more than the group has now?
  */
@@ -4239,7 +4180,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
  * file is determined by the current size or the resulting size after
  * allocation which ever is larger
  *
- * One can tune this size via /proc/fs/ext4/<partition>/stream_req
+ * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
  */
 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2883d4318c2..1ec554cc107 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -579,7 +579,6 @@ static void ext4_put_super(struct super_block *sb)
 		ext4_commit_super(sb, es, 1);
 	}
 	if (sbi->s_proc) {
-		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 	}
 	kobject_del(&sbi->s_kobj);
@@ -2529,11 +2528,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_PROC_FS
 	if (ext4_proc_root)
 		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
-	if (sbi->s_proc)
-		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
-				 &ext4_ui_proc_fops,
-				 &sbi->s_inode_readahead_blks);
 #endif
 
 	bgl_lock_init(sbi->s_blockgroup_lock);
@@ -2832,7 +2826,6 @@ failed_mount2:
 	kfree(sbi->s_group_desc);
 failed_mount:
 	if (sbi->s_proc) {
-		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 	}
 #ifdef CONFIG_QUOTA
@@ -3865,45 +3858,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
 }
 
-#ifdef CONFIG_PROC_FS
-static int ext4_ui_proc_show(struct seq_file *m, void *v)
-{
-	unsigned int *p = m->private;
-
-	seq_printf(m, "%u\n", *p);
-	return 0;
-}
-
-static int ext4_ui_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
-}
-
-static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
-			       size_t cnt, loff_t *ppos)
-{
-	unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
-	char str[32];
-
-	if (cnt >= sizeof(str))
-		return -EINVAL;
-	if (copy_from_user(str, buf, cnt))
-		return -EFAULT;
-
-	*p = simple_strtoul(str, NULL, 0);
-	return cnt;
-}
-
-const struct file_operations ext4_ui_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ext4_ui_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-	.write		= ext4_ui_proc_write,
-};
-#endif
-
 static struct file_system_type ext4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext4",
-- 
cgit v1.2.3


From 9f24e4208f7ee2748f157368b63287dc903fcf60 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 4 Mar 2009 19:09:10 -0500
Subject: ext4: Use atomic_t's in struct flex_groups

Reduce pressure on the sb_bgl_lock family of locks by using atomic_t's
to track the number of free blocks and inodes in each flex_group.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  |  5 ++---
 fs/ext4/ext4.h    |  4 ++--
 fs/ext4/ialloc.c  | 33 +++++++++++++++------------------
 fs/ext4/mballoc.c |  9 +++------
 fs/ext4/resize.c  |  8 ++++----
 fs/ext4/super.c   |  8 ++++----
 6 files changed, 30 insertions(+), 37 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b37b1287558..53c72ad8587 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -470,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_add(blocks_freed,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 	/*
 	 * request to reload the buddy with the
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e5c273ff928..e52b48f86ed 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -170,8 +170,8 @@ struct ext4_group_desc
  */
 
 struct flex_groups {
-	__u32 free_inodes;
-	__u32 free_blocks;
+	atomic_t free_inodes;
+	atomic_t free_blocks;
 };
 
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 617f5a2d800..5f393927fd2 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int fatal = 0, err, count, cleared;
-	ext4_group_t flex_group;
 
 	if (atomic_read(&inode->i_count) > 1) {
 		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -277,10 +276,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 				percpu_counter_dec(&sbi->s_dirs_counter);
 
 			if (sbi->s_log_groups_per_flex) {
-				flex_group = ext4_flex_group(sbi, block_group);
-				spin_lock(sb_bgl_lock(sbi, flex_group));
-				sbi->s_flex_groups[flex_group].free_inodes++;
-				spin_unlock(sb_bgl_lock(sbi, flex_group));
+				ext4_group_t f;
+
+				f = ext4_flex_group(sbi, block_group);
+				atomic_inc(&sbi->s_flex_groups[f].free_inodes);
 			}
 		}
 		BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -360,9 +359,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 		sbi->s_log_groups_per_flex;
 
 find_close_to_parent:
-	flexbg_free_blocks = flex_group[best_flex].free_blocks;
+	flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
 	flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
-	if (flex_group[best_flex].free_inodes &&
+	if (atomic_read(&flex_group[best_flex].free_inodes) &&
 	    flex_freeb_ratio > free_block_ratio)
 		goto found_flexbg;
 
@@ -375,24 +374,24 @@ find_close_to_parent:
 		if (i == parent_fbg_group || i == parent_fbg_group - 1)
 			continue;
 
-		flexbg_free_blocks = flex_group[i].free_blocks;
+		flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
 		flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
 
 		if (flex_freeb_ratio > free_block_ratio &&
-		    flex_group[i].free_inodes) {
+		    (atomic_read(&flex_group[i].free_inodes))) {
 			best_flex = i;
 			goto found_flexbg;
 		}
 
-		if (flex_group[best_flex].free_inodes == 0 ||
-		    (flex_group[i].free_blocks >
-		     flex_group[best_flex].free_blocks &&
-		     flex_group[i].free_inodes))
+		if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
+		    ((atomic_read(&flex_group[i].free_blocks) >
+		      atomic_read(&flex_group[best_flex].free_blocks)) &&
+		     atomic_read(&flex_group[i].free_inodes)))
 			best_flex = i;
 	}
 
-	if (!flex_group[best_flex].free_inodes ||
-	    !flex_group[best_flex].free_blocks)
+	if (!atomic_read(&flex_group[best_flex].free_inodes) ||
+	    !atomic_read(&flex_group[best_flex].free_blocks))
 		return -1;
 
 found_flexbg:
@@ -960,9 +959,7 @@ got:
 
 	if (sbi->s_log_groups_per_flex) {
 		flex_group = ext4_flex_group(sbi, group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_inodes--;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
 	}
 
 	inode->i_uid = current_fsuid();
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c4c43097762..e72c72a0b80 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3044,9 +3044,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
 							  ac->ac_b_ex.fe_group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_sub(ac->ac_b_ex.fe_len,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 
 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4884,9 +4883,7 @@ do_more:
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks += count;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 
 	ext4_mb_release_desc(&e4b);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c06886abd65..546c7dd869e 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
 		ext4_group_t flex_group;
 		flex_group = ext4_flex_group(sbi, input->group);
-		sbi->s_flex_groups[flex_group].free_blocks +=
-			input->free_blocks_count;
-		sbi->s_flex_groups[flex_group].free_inodes +=
-			EXT4_INODES_PER_GROUP(sb);
+		atomic_add(input->free_blocks_count,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
+		atomic_add(EXT4_INODES_PER_GROUP(sb),
+			   &sbi->s_flex_groups[flex_group].free_inodes);
 	}
 
 	ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1ec554cc107..6b5d5c6399f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1630,10 +1630,10 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, &bh);
 
 		flex_group = ext4_flex_group(sbi, i);
-		sbi->s_flex_groups[flex_group].free_inodes +=
-			ext4_free_inodes_count(sb, gdp);
-		sbi->s_flex_groups[flex_group].free_blocks +=
-			ext4_free_blks_count(sb, gdp);
+		atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
+			   ext4_free_inodes_count(sb, gdp));
+		atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
+			   ext4_free_blks_count(sb, gdp));
 	}
 
 	return 1;
-- 
cgit v1.2.3


From 7d39db14a42cbd719c7515b9da8f85a2eb6a0633 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 4 Mar 2009 19:31:53 -0500
Subject: ext4: Use struct flex_groups to calculate get_orlov_stats()

Instead of looping over all of the block groups in a flex group
summing their summary statistics, start tracking used_dirs in struct
flex_groups, and use struct flex_groups instead.  This should save a
bit of CPU for mkdir-heavy workloads.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h   |  1 +
 fs/ext4/ialloc.c | 45 ++++++++++++++++++++++++++++-----------------
 fs/ext4/super.c  |  2 ++
 3 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e52b48f86ed..46aaaa2ed4c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -172,6 +172,7 @@ struct ext4_group_desc
 struct flex_groups {
 	atomic_t free_inodes;
 	atomic_t free_blocks;
+	atomic_t used_dirs;
 };
 
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 5f393927fd2..47b84e8df56 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -267,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 			if (is_directory) {
 				count = ext4_used_dirs_count(sb, gdp) - 1;
 				ext4_used_dirs_set(sb, gdp, count);
+				if (sbi->s_log_groups_per_flex) {
+					ext4_group_t f;
+
+					f = ext4_flex_group(sbi, block_group);
+					atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+				}
+
 			}
 			gdp->bg_checksum = ext4_group_desc_csum(sbi,
 							block_group, gdp);
@@ -424,25 +431,24 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g,
 		       int flex_size, struct orlov_stats *stats)
 {
 	struct ext4_group_desc *desc;
-	ext4_group_t		ngroups = EXT4_SB(sb)->s_groups_count;
-	int			i;
-
-	stats->free_inodes = 0;
-	stats->free_blocks = 0;
-	stats->used_dirs = 0;
+	struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
 
-	g *= flex_size;
-
-	for (i = 0; i < flex_size; i++) {
-		if (g >= ngroups)
-			break;
-		desc = ext4_get_group_desc(sb, g++, NULL);
-		if (!desc)
-			continue;
+	if (flex_size > 1) {
+		stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
+		stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
+		stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+		return;
+	}
 
-		stats->free_inodes += ext4_free_inodes_count(sb, desc);
-		stats->free_blocks += ext4_free_blks_count(sb, desc);
-		stats->used_dirs += ext4_used_dirs_count(sb, desc);
+	desc = ext4_get_group_desc(sb, g, NULL);
+	if (desc) {
+		stats->free_inodes = ext4_free_inodes_count(sb, desc);
+		stats->free_blocks = ext4_free_blks_count(sb, desc);
+		stats->used_dirs = ext4_used_dirs_count(sb, desc);
+	} else {
+		stats->free_inodes = 0;
+		stats->free_blocks = 0;
+		stats->used_dirs = 0;
 	}
 }
 
@@ -765,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb,
 	if (S_ISDIR(mode)) {
 		count = ext4_used_dirs_count(sb, gdp) + 1;
 		ext4_used_dirs_set(sb, gdp, count);
+		if (sbi->s_log_groups_per_flex) {
+			ext4_group_t f = ext4_flex_group(sbi, group);
+
+			atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+		}
 	}
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6b5d5c6399f..b9aefceb41e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1634,6 +1634,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
 			   ext4_free_inodes_count(sb, gdp));
 		atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
 			   ext4_free_blks_count(sb, gdp));
+		atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
+			   ext4_used_dirs_count(sb, gdp));
 	}
 
 	return 1;
-- 
cgit v1.2.3


From fe315e76fc3a3f9f7e1581dc22fec7e7719f0896 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 11 Mar 2009 14:10:21 -0400
Subject: SUNRPC: Avoid spurious wake-up during UDP connect processing

To clear out old state, the UDP connect workers unconditionally invoke
xs_close() before proceeding with a new connect.  Nowadays this causes
a spurious wake-up of the task waiting for the connect to complete.

This is a little racey, but usually harmless.  The waiting task
immediately retries the connect via a call_bind/call_connect sequence,
which usually finds the transport already in the connected state
because the connect worker has finished in the background.

To avoid a spurious wake-up, factor the xs_close() logic that resets
the underlying socket into a helper, and have the UDP connect workers
call that helper instead of xs_close().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 44 ++++++++++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 29c71e645b2..1127eb93413 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -767,23 +767,13 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
 	sk->sk_error_report = transport->old_error_report;
 }
 
-/**
- * xs_close - close a socket
- * @xprt: transport
- *
- * This is used when all requests are complete; ie, no DRC state remains
- * on the server we want to save.
- */
-static void xs_close(struct rpc_xprt *xprt)
+static void xs_reset_transport(struct sock_xprt *transport)
 {
-	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 	struct socket *sock = transport->sock;
 	struct sock *sk = transport->inet;
 
-	if (!sk)
-		goto clear_close_wait;
-
-	dprintk("RPC:       xs_close xprt %p\n", xprt);
+	if (sk == NULL)
+		return;
 
 	write_lock_bh(&sk->sk_callback_lock);
 	transport->inet = NULL;
@@ -797,7 +787,23 @@ static void xs_close(struct rpc_xprt *xprt)
 	sk->sk_no_check = 0;
 
 	sock_release(sock);
-clear_close_wait:
+}
+
+/**
+ * xs_close - close a socket
+ * @xprt: transport
+ *
+ * This is used when all requests are complete; ie, no DRC state remains
+ * on the server we want to save.
+ */
+static void xs_close(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+	dprintk("RPC:       xs_close xprt %p\n", xprt);
+
+	xs_reset_transport(transport);
+
 	smp_mb__before_clear_bit();
 	clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
 	clear_bit(XPRT_CLOSING, &xprt->state);
@@ -1537,9 +1543,10 @@ static void xs_udp_connect_worker4(struct work_struct *work)
 		goto out;
 
 	/* Start by resetting any existing state */
-	xs_close(xprt);
+	xs_reset_transport(transport);
 
-	if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) {
+	err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	if (err < 0) {
 		dprintk("RPC:       can't create UDP transport socket (%d).\n", -err);
 		goto out;
 	}
@@ -1578,9 +1585,10 @@ static void xs_udp_connect_worker6(struct work_struct *work)
 		goto out;
 
 	/* Start by resetting any existing state */
-	xs_close(xprt);
+	xs_reset_transport(transport);
 
-	if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) {
+	err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	if (err < 0) {
 		dprintk("RPC:       can't create UDP transport socket (%d).\n", -err);
 		goto out;
 	}
-- 
cgit v1.2.3


From 2b57dc6cf9bf31edc0df430ea18dd1dbd3028975 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Wed, 11 Mar 2009 14:10:22 -0400
Subject: NFS: Minor __nfs_revalidate_inode cleanup

Remove redundant NFS_STALE() check, a leftover due to the commit
691beb13cdc88358334ef0ba867c080a247a760f

Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0c381686171..acaaa7c7efa 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -670,9 +670,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	if (NFS_STALE(inode))
 		goto out;
 
-	if (NFS_STALE(inode))
-		goto out;
-
 	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
 	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
 	if (status != 0) {
-- 
cgit v1.2.3


From 37d9d76d8b3a2ac5817e1fa3263cfe0fdb439e51 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 11 Mar 2009 14:10:23 -0400
Subject: NFS: flush cached directory information slightly more readily.

If cached directory contents becomes incorrect, there is no way to
flush the contents.  This contrasts with files where file locking is
the recommended way to ensure cache consistency between multiple
applications (a read-lock always flushes the cache).

Also while changes to files often change the size of the file (thus
triggering a cache flush), changes to directories often do not change
the apparent size (as the size is often rounded to a block size).

So it is particularly important with directories to avoid the
possibility of an incorrect cache wherever possible.

When the link count on a directory changes it implies a change in the
number of child directories, and so a change in the contents of this
directory.  So use that as a trigger to flush cached contents.

When the ctime changes but the mtime does not, there are two possible
reasons.
 1/ The owner/mode information has been changed.
 2/ utimes has been used to set the mtime backwards.

In the first case, a data-cache flush is not required.
In the second case it is.

So on the basis that correctness trumps performance, flush the
directory contents cache in this case also.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index acaaa7c7efa..268ce3a4622 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1113,8 +1113,16 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				nfs_force_lookup_revalidate(inode);
 		}
 		/* If ctime has changed we should definitely clear access+acl caches */
-		if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
+		if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			/* and probably clear data for a directory too as utimes can cause
+			 * havoc with our cache.
+			 */
+			if (S_ISDIR(inode->i_mode)) {
+				invalid |= NFS_INO_INVALID_DATA;
+				nfs_force_lookup_revalidate(inode);
+			}
+		}
 	} else if (nfsi->change_attr != fattr->change_attr) {
 		dprintk("NFS: change_attr change on server for file %s/%ld\n",
 				inode->i_sb->s_id, inode->i_ino);
@@ -1148,8 +1156,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	    inode->i_gid != fattr->gid)
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 
-	if (inode->i_nlink != fattr->nlink)
+	if (inode->i_nlink != fattr->nlink) {
 		invalid |= NFS_INO_INVALID_ATTR;
+		if (S_ISDIR(inode->i_mode))
+			invalid |= NFS_INO_INVALID_DATA;
+	}
 
 	inode->i_mode = fattr->mode;
 	inode->i_nlink = fattr->nlink;
-- 
cgit v1.2.3


From 78f945f88ef83dcc7c962614a080e0a9a2db5889 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:10:23 -0400
Subject: NFSv4: Ignore errors on the post-op attributes in SETATTR calls

There is no need to fail or retry a SETATTR call just because the post-op
GETATTR failed.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d1e4c8f8a0a..5f0ee3e2bd8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4078,9 +4078,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
 	status = decode_setattr(&xdr, res);
 	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server);
-	if (status == NFS4ERR_DELAY)
-		status = 0;
+	decode_getfattr(&xdr, res->fattr, res->server);
 out:
 	return status;
 }
-- 
cgit v1.2.3


From 9e6e70f8d8b6698e0017c56b86525aabe9c7cd4c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:10:24 -0400
Subject: NFSv4: Support NFSv4 optional attributes in the struct nfs_fattr

Currently, filling struct nfs_fattr is more or less an all or nothing
operation, since NFSv2 and NFSv3 have only mandatory attributes.
In NFSv4, some attributes are optional, and so we may simply not be able to
fill in those fields. Furthermore, NFSv4 allows you to specify which
attributes you are interested in retrieving, thus permitting you to
optimise away retrieval of attributes that you know will no change...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c          | 243 ++++++++++++++++++++++++++++++------------------
 fs/nfs/nfs2xdr.c        |   2 +-
 fs/nfs/nfs3xdr.c        |   6 +-
 fs/nfs/nfs4xdr.c        |   6 +-
 include/linux/nfs_xdr.h |  48 ++++++++--
 5 files changed, 202 insertions(+), 103 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 268ce3a4622..b7656bd3706 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -249,13 +249,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 	struct inode *inode = ERR_PTR(-ENOENT);
 	unsigned long hash;
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
 		goto out_no_inode;
-
-	if (!fattr->nlink) {
-		printk("NFS: Buggy server - nlink == 0!\n");
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
 		goto out_no_inode;
-	}
 
 	hash = nfs_fattr_to_ino_t(fattr);
 
@@ -291,7 +288,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			/* Deal with crossing mountpoints */
-			if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+			if ((fattr->valid & NFS_ATTR_FATTR_FSID)
+					&& !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
 				if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
 					inode->i_op = &nfs_referral_inode_operations;
 				else
@@ -304,28 +302,45 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		else
 			init_special_inode(inode, inode->i_mode, fattr->rdev);
 
+		memset(&inode->i_atime, 0, sizeof(inode->i_atime));
+		memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+		memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
+		nfsi->change_attr = 0;
+		inode->i_size = 0;
+		inode->i_nlink = 0;
+		inode->i_uid = -2;
+		inode->i_gid = -2;
+		inode->i_blocks = 0;
+		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+
 		nfsi->read_cache_jiffies = fattr->time_start;
 		nfsi->attr_gencount = fattr->gencount;
-		inode->i_atime = fattr->atime;
-		inode->i_mtime = fattr->mtime;
-		inode->i_ctime = fattr->ctime;
-		if (fattr->valid & NFS_ATTR_FATTR_V4)
+		if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+			inode->i_atime = fattr->atime;
+		if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+			inode->i_mtime = fattr->mtime;
+		if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+			inode->i_ctime = fattr->ctime;
+		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			nfsi->change_attr = fattr->change_attr;
-		inode->i_size = nfs_size_to_loff_t(fattr->size);
-		inode->i_nlink = fattr->nlink;
-		inode->i_uid = fattr->uid;
-		inode->i_gid = fattr->gid;
-		if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+		if (fattr->valid & NFS_ATTR_FATTR_SIZE)
+			inode->i_size = nfs_size_to_loff_t(fattr->size);
+		if (fattr->valid & NFS_ATTR_FATTR_NLINK)
+			inode->i_nlink = fattr->nlink;
+		if (fattr->valid & NFS_ATTR_FATTR_OWNER)
+			inode->i_uid = fattr->uid;
+		if (fattr->valid & NFS_ATTR_FATTR_GROUP)
+			inode->i_gid = fattr->gid;
+		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+			inode->i_blocks = fattr->du.nfs2.blocks;
+		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
 			/*
 			 * report the blocks in 512byte units
 			 */
 			inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-		} else {
-			inode->i_blocks = fattr->du.nfs2.blocks;
 		}
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
-		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
 		nfsi->access_cache = RB_ROOT;
 
 		unlock_new_inode(inode);
@@ -812,25 +827,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 &&
-			nfsi->change_attr == fattr->pre_change_attr) {
+	if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
+			&& (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+			&& nfsi->change_attr == fattr->pre_change_attr) {
 		nfsi->change_attr = fattr->change_attr;
 		if (S_ISDIR(inode->i_mode))
 			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 	}
 	/* If we have atomic WCC data, we may update some attributes */
-	if ((fattr->valid & NFS_ATTR_WCC) != 0) {
-		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
+	if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
+			&& (fattr->valid & NFS_ATTR_FATTR_CTIME)
+			&& timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
 			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-		if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
+
+	if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
+			&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
+			&& timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
 			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 			if (S_ISDIR(inode->i_mode))
 				nfsi->cache_validity |= NFS_INO_INVALID_DATA;
-		}
-		if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
-		    nfsi->npages == 0)
-			i_size_write(inode, nfs_size_to_loff_t(fattr->size));
 	}
+	if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
+			&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
+			&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
+			&& nfsi->npages == 0)
+			i_size_write(inode, nfs_size_to_loff_t(fattr->size));
 }
 
 /**
@@ -850,35 +871,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
 
 	/* Has the inode gone and changed behind our back? */
-	if (nfsi->fileid != fattr->fileid
-			|| (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
+		return -EIO;
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		return -EIO;
-	}
 
-	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
 			nfsi->change_attr != fattr->change_attr)
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	/* Verify a few of the more important attributes */
-	if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
+	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
-	cur_size = i_size_read(inode);
- 	new_isize = nfs_size_to_loff_t(fattr->size);
-	if (cur_size != new_isize && nfsi->npages == 0)
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+		cur_size = i_size_read(inode);
+		new_isize = nfs_size_to_loff_t(fattr->size);
+		if (cur_size != new_isize && nfsi->npages == 0)
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	}
 
 	/* Have any file permissions changed? */
-	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
-			|| inode->i_uid != fattr->uid
-			|| inode->i_gid != fattr->gid)
+	if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
+		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
+		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+	if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
 		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
 
 	/* Has the link count changed? */
-	if (inode->i_nlink != fattr->nlink)
+	if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
 		invalid |= NFS_INO_INVALID_ATTR;
 
-	if (!timespec_equal(&inode->i_atime, &fattr->atime))
+	if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
 		invalid |= NFS_INO_INVALID_ATIME;
 
 	if (invalid != 0)
@@ -890,11 +915,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
 static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
 {
+	if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
+		return 0;
 	return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
 }
 
 static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
 {
+	if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+		return 0;
 	return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
 }
 
@@ -1030,20 +1059,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
 	/* Don't do a WCC update if these attributes are already stale */
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
 			!nfs_inode_attrs_need_update(inode, fattr)) {
-		fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
+		fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
+				| NFS_ATTR_FATTR_PRESIZE
+				| NFS_ATTR_FATTR_PREMTIME
+				| NFS_ATTR_FATTR_PRECTIME);
 		goto out_noforce;
 	}
-	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
-			(fattr->valid & NFS_ATTR_WCC_V4) == 0) {
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
 		fattr->pre_change_attr = NFS_I(inode)->change_attr;
-		fattr->valid |= NFS_ATTR_WCC_V4;
+		fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
 	}
-	if ((fattr->valid & NFS_ATTR_FATTR) != 0 &&
-			(fattr->valid & NFS_ATTR_WCC) == 0) {
+	if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
 		memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
+		fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
 		memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
+		fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
 		fattr->pre_size = i_size_read(inode);
-		fattr->valid |= NFS_ATTR_WCC;
+		fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
 	}
 out_noforce:
 	status = nfs_post_op_update_inode_locked(inode, fattr);
@@ -1075,18 +1115,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			__func__, inode->i_sb->s_id, inode->i_ino,
 			atomic_read(&inode->i_count), fattr->valid);
 
-	if (nfsi->fileid != fattr->fileid)
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
 		goto out_fileid;
 
 	/*
 	 * Make sure the inode's type hasn't changed.
 	 */
-	if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		goto out_changed;
 
 	server = NFS_SERVER(inode);
 	/* Update the fsid? */
-	if (S_ISDIR(inode->i_mode) &&
+	if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
 			!nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
 			!test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
 		server->fsid = fattr->fsid;
@@ -1096,14 +1136,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	 */
 	nfsi->read_cache_jiffies = fattr->time_start;
 
-	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
-			| NFS_INO_REVAL_PAGECACHE);
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
+	    nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+		    | NFS_INO_INVALID_ATIME
+		    | NFS_INO_REVAL_PAGECACHE);
 
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
 
 	/* More cache consistency checks */
-	if (!(fattr->valid & NFS_ATTR_FATTR_V4)) {
+	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+		if (nfsi->change_attr != fattr->change_attr) {
+			dprintk("NFS: change_attr change on server for file %s/%ld\n",
+					inode->i_sb->s_id, inode->i_ino);
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			if (S_ISDIR(inode->i_mode))
+				nfs_force_lookup_revalidate(inode);
+			nfsi->change_attr = fattr->change_attr;
+		}
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
 		/* NFSv2/v3: Check if the mtime agrees */
 		if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
 			dprintk("NFS: mtime change on server for file %s/%ld\n",
@@ -1111,7 +1164,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 			if (S_ISDIR(inode->i_mode))
 				nfs_force_lookup_revalidate(inode);
+			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 		}
+	}
+	if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
 		/* If ctime has changed we should definitely clear access+acl caches */
 		if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
@@ -1122,59 +1178,66 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				invalid |= NFS_INO_INVALID_DATA;
 				nfs_force_lookup_revalidate(inode);
 			}
+			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
 		}
-	} else if (nfsi->change_attr != fattr->change_attr) {
-		dprintk("NFS: change_attr change on server for file %s/%ld\n",
-				inode->i_sb->s_id, inode->i_ino);
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-		if (S_ISDIR(inode->i_mode))
-			nfs_force_lookup_revalidate(inode);
 	}
 
 	/* Check if our cached file size is stale */
- 	new_isize = nfs_size_to_loff_t(fattr->size);
-	cur_isize = i_size_read(inode);
-	if (new_isize != cur_isize) {
-		/* Do we perhaps have any outstanding writes, or has
-		 * the file grown beyond our last write? */
-		if (nfsi->npages == 0 || new_isize > cur_isize) {
-			i_size_write(inode, new_isize);
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+		new_isize = nfs_size_to_loff_t(fattr->size);
+		cur_isize = i_size_read(inode);
+		if (new_isize != cur_isize) {
+			/* Do we perhaps have any outstanding writes, or has
+			 * the file grown beyond our last write? */
+			if (nfsi->npages == 0 || new_isize > cur_isize) {
+				i_size_write(inode, new_isize);
+				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+			}
+			dprintk("NFS: isize change on server for file %s/%ld\n",
+					inode->i_sb->s_id, inode->i_ino);
 		}
-		dprintk("NFS: isize change on server for file %s/%ld\n",
-				inode->i_sb->s_id, inode->i_ino);
 	}
 
 
-	memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-	memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-	memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
-	nfsi->change_attr = fattr->change_attr;
+	if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+		memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
 
-	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
-	    inode->i_uid != fattr->uid ||
-	    inode->i_gid != fattr->gid)
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-
-	if (inode->i_nlink != fattr->nlink) {
-		invalid |= NFS_INO_INVALID_ATTR;
-		if (S_ISDIR(inode->i_mode))
-			invalid |= NFS_INO_INVALID_DATA;
+	if (fattr->valid & NFS_ATTR_FATTR_MODE) {
+		if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			inode->i_mode = fattr->mode;
+		}
+	}
+	if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
+		if (inode->i_uid != fattr->uid) {
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			inode->i_uid = fattr->uid;
+		}
+	}
+	if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
+		if (inode->i_gid != fattr->gid) {
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			inode->i_gid = fattr->gid;
+		}
 	}
 
-	inode->i_mode = fattr->mode;
-	inode->i_nlink = fattr->nlink;
-	inode->i_uid = fattr->uid;
-	inode->i_gid = fattr->gid;
+	if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
+		if (inode->i_nlink != fattr->nlink) {
+			invalid |= NFS_INO_INVALID_ATTR;
+			if (S_ISDIR(inode->i_mode))
+				invalid |= NFS_INO_INVALID_DATA;
+			inode->i_nlink = fattr->nlink;
+		}
+	}
 
-	if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+	if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
 		/*
 		 * report the blocks in 512byte units
 		 */
 		inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- 	} else {
- 		inode->i_blocks = fattr->du.nfs2.blocks;
  	}
+	if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+		inode->i_blocks = fattr->du.nfs2.blocks;
 
 	/* Update attrtimeo value if we're out of the unstable period */
 	if (invalid & NFS_INO_INVALID_ATTR) {
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 28bab67d151..bea99992c30 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -136,7 +136,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 	p = xdr_decode_time(p, &fattr->atime);
 	p = xdr_decode_time(p, &fattr->mtime);
 	p = xdr_decode_time(p, &fattr->ctime);
-	fattr->valid |= NFS_ATTR_FATTR;
+	fattr->valid |= NFS_ATTR_FATTR_V2;
 	fattr->rdev = new_decode_dev(rdev);
 	if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) {
 		fattr->type = NFFIFO;
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cdeacffde4..c0f7d02aced 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -177,7 +177,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 	p = xdr_decode_time3(p, &fattr->ctime);
 
 	/* Update the mode bits */
-	fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3);
+	fattr->valid |= NFS_ATTR_FATTR_V3;
 	return p;
 }
 
@@ -233,7 +233,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
 	p = xdr_decode_hyper(p, &fattr->pre_size);
 	p = xdr_decode_time3(p, &fattr->pre_mtime);
 	p = xdr_decode_time3(p, &fattr->pre_ctime);
-	fattr->valid |= NFS_ATTR_WCC;
+	fattr->valid |= NFS_ATTR_FATTR_PRESIZE
+		| NFS_ATTR_FATTR_PREMTIME
+		| NFS_ATTR_FATTR_PRECTIME;
 	return p;
 }
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5f0ee3e2bd8..7d220da3db3 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3012,7 +3012,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 	if ((status = decode_attr_type(xdr, bitmap, &type)) != 0)
 		goto xdr_error;
 	fattr->type = nfs_type2fmt[type].nfs2type;
-	fmode = nfs_type2fmt[type].mode;
+	fattr->mode = nfs_type2fmt[type].mode;
 
 	if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0)
 		goto xdr_error;
@@ -3026,7 +3026,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 						struct nfs4_fs_locations,
 						fattr))) != 0)
 		goto xdr_error;
-	if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
+	if ((status = decode_attr_mode(xdr, bitmap, &fmode)) != 0)
 		goto xdr_error;
 	fattr->mode |= fmode;
 	if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0)
@@ -3050,7 +3050,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 	if (fattr->fileid == 0 && fileid != 0)
 		fattr->fileid = fileid;
 	if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
-		fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
+		fattr->valid = NFS_ATTR_FATTR_V4;
 xdr_error:
 	dprintk("%s: xdr returned %d\n", __func__, -status);
 	return status;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 2e5f00066af..b99295e07cd 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -27,7 +27,7 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid
 }
 
 struct nfs_fattr {
-	unsigned short		valid;		/* which fields are valid */
+	unsigned int		valid;		/* which fields are valid */
 	__u64			pre_size;	/* pre_op_attr.size	  */
 	struct timespec		pre_mtime;	/* pre_op_attr.mtime	  */
 	struct timespec		pre_ctime;	/* pre_op_attr.ctime	  */
@@ -59,12 +59,46 @@ struct nfs_fattr {
 	unsigned long		gencount;
 };
 
-#define NFS_ATTR_WCC		0x0001		/* pre-op WCC data    */
-#define NFS_ATTR_FATTR		0x0002		/* post-op attributes */
-#define NFS_ATTR_FATTR_V3	0x0004		/* NFSv3 attributes */
-#define NFS_ATTR_FATTR_V4	0x0008		/* NFSv4 change attribute */
-#define NFS_ATTR_WCC_V4		0x0010		/* pre-op change attribute */
-#define NFS_ATTR_FATTR_V4_REFERRAL	0x0020		/* NFSv4 referral */
+#define NFS_ATTR_FATTR_TYPE		(1U << 0)
+#define NFS_ATTR_FATTR_MODE		(1U << 1)
+#define NFS_ATTR_FATTR_NLINK		(1U << 2)
+#define NFS_ATTR_FATTR_OWNER		(1U << 3)
+#define NFS_ATTR_FATTR_GROUP		(1U << 4)
+#define NFS_ATTR_FATTR_RDEV		(1U << 5)
+#define NFS_ATTR_FATTR_SIZE		(1U << 6)
+#define NFS_ATTR_FATTR_PRESIZE		(1U << 7)
+#define NFS_ATTR_FATTR_BLOCKS_USED	(1U << 8)
+#define NFS_ATTR_FATTR_SPACE_USED	(1U << 9)
+#define NFS_ATTR_FATTR_FSID		(1U << 10)
+#define NFS_ATTR_FATTR_FILEID		(1U << 11)
+#define NFS_ATTR_FATTR_ATIME		(1U << 12)
+#define NFS_ATTR_FATTR_MTIME		(1U << 13)
+#define NFS_ATTR_FATTR_CTIME		(1U << 14)
+#define NFS_ATTR_FATTR_PREMTIME		(1U << 15)
+#define NFS_ATTR_FATTR_PRECTIME		(1U << 16)
+#define NFS_ATTR_FATTR_CHANGE		(1U << 17)
+#define NFS_ATTR_FATTR_PRECHANGE	(1U << 18)
+#define NFS_ATTR_FATTR_V4_REFERRAL	(1U << 19)	/* NFSv4 referral */
+
+#define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \
+		| NFS_ATTR_FATTR_MODE \
+		| NFS_ATTR_FATTR_NLINK \
+		| NFS_ATTR_FATTR_OWNER \
+		| NFS_ATTR_FATTR_GROUP \
+		| NFS_ATTR_FATTR_RDEV \
+		| NFS_ATTR_FATTR_SIZE \
+		| NFS_ATTR_FATTR_FSID \
+		| NFS_ATTR_FATTR_FILEID \
+		| NFS_ATTR_FATTR_ATIME \
+		| NFS_ATTR_FATTR_MTIME \
+		| NFS_ATTR_FATTR_CTIME)
+#define NFS_ATTR_FATTR_V2 (NFS_ATTR_FATTR \
+		| NFS_ATTR_FATTR_BLOCKS_USED)
+#define NFS_ATTR_FATTR_V3 (NFS_ATTR_FATTR \
+		| NFS_ATTR_FATTR_SPACE_USED)
+#define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \
+		| NFS_ATTR_FATTR_SPACE_USED \
+		| NFS_ATTR_FATTR_CHANGE)
 
 /*
  * Info on the file system
-- 
cgit v1.2.3


From 1ca277d88dafdbc3c5a69d32590e7184b9af6371 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:10:25 -0400
Subject: NFS: Shrink the struct nfs_fattr

We don't need the bitmap[] field anymore, since the 'valid' field tells us
all we need to know about which attributes were filled in...
Also move the pre-op attributes in order to improve the structure packing.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c        | 3 ---
 include/linux/nfs_xdr.h | 7 +++----
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7d220da3db3..9f1df836197 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3002,9 +3002,6 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
 		goto xdr_error;
 
-	fattr->bitmap[0] = bitmap[0];
-	fattr->bitmap[1] = bitmap[1];
-
 	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
 		goto xdr_error;
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b99295e07cd..6013acb0131 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -28,9 +28,6 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid
 
 struct nfs_fattr {
 	unsigned int		valid;		/* which fields are valid */
-	__u64			pre_size;	/* pre_op_attr.size	  */
-	struct timespec		pre_mtime;	/* pre_op_attr.mtime	  */
-	struct timespec		pre_ctime;	/* pre_op_attr.ctime	  */
 	enum nfs_ftype		type;		/* always use NFSv2 types */
 	__u32			mode;
 	__u32			nlink;
@@ -52,9 +49,11 @@ struct nfs_fattr {
 	struct timespec		atime;
 	struct timespec		mtime;
 	struct timespec		ctime;
-	__u32			bitmap[2];	/* NFSv4 returned attribute bitmap */
 	__u64			change_attr;	/* NFSv4 change attribute */
 	__u64			pre_change_attr;/* pre-op NFSv4 change attribute */
+	__u64			pre_size;	/* pre_op_attr.size	  */
+	struct timespec		pre_mtime;	/* pre_op_attr.mtime	  */
+	struct timespec		pre_ctime;	/* pre_op_attr.ctime	  */
 	unsigned long		time_start;
 	unsigned long		gencount;
 };
-- 
cgit v1.2.3


From bca794785c2c12ecddeb09e70165b8ff80baa6ae Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:10:26 -0400
Subject: NFS: Fix the type of struct nfs_fattr->mode

There is no point in using anything other than umode_t, since we copy the
content pretty much directly into inode->i_mode.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/getroot.c        |  4 ++--
 fs/nfs/nfs2xdr.c        |  7 +++----
 fs/nfs/nfs3xdr.c        | 31 +++++++++++++------------------
 fs/nfs/nfs4xdr.c        | 40 +++++++++++++++++++---------------------
 include/linux/nfs_xdr.h |  3 +--
 5 files changed, 38 insertions(+), 47 deletions(-)

diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b7c9b2df1f2..46177cb8706 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server,
 		return ret;
 	}
 
-	if (fattr.type != NFDIR) {
+	if (!S_ISDIR(fattr.mode)) {
 		printk(KERN_ERR "nfs4_get_root:"
 		       " getroot encountered non-directory\n");
 		return -ENOTDIR;
@@ -213,7 +213,7 @@ eat_dot_dir:
 		return ret;
 	}
 
-	if (fattr.type != NFDIR) {
+	if (!S_ISDIR(fattr.mode)) {
 		printk(KERN_ERR "nfs4_get_root:"
 		       " lookupfh encountered non-directory\n");
 		return -ENOTDIR;
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index bea99992c30..c862c9340f9 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep)
 static __be32 *
 xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 {
-	u32 rdev;
-	fattr->type = (enum nfs_ftype) ntohl(*p++);
+	u32 rdev, type;
+	type = ntohl(*p++);
 	fattr->mode = ntohl(*p++);
 	fattr->nlink = ntohl(*p++);
 	fattr->uid = ntohl(*p++);
@@ -138,8 +138,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 	p = xdr_decode_time(p, &fattr->ctime);
 	fattr->valid |= NFS_ATTR_FATTR_V2;
 	fattr->rdev = new_decode_dev(rdev);
-	if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) {
-		fattr->type = NFFIFO;
+	if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
 		fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
 		fattr->rdev = 0;
 	}
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index c0f7d02aced..e6a1932c711 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -91,19 +91,15 @@
 /*
  * Map file type to S_IFMT bits
  */
-static struct {
-	unsigned int	mode;
-	unsigned int	nfs2type;
-} nfs_type2fmt[] = {
-      { 0,		NFNON	},
-      { S_IFREG,	NFREG	},
-      { S_IFDIR,	NFDIR	},
-      { S_IFBLK,	NFBLK	},
-      { S_IFCHR,	NFCHR	},
-      { S_IFLNK,	NFLNK	},
-      { S_IFSOCK,	NFSOCK	},
-      { S_IFIFO,	NFFIFO	},
-      { 0,		NFBAD	}
+static const umode_t nfs_type2fmt[] = {
+	[NF3BAD] = 0,
+	[NF3REG] = S_IFREG,
+	[NF3DIR] = S_IFDIR,
+	[NF3BLK] = S_IFBLK,
+	[NF3CHR] = S_IFCHR,
+	[NF3LNK] = S_IFLNK,
+	[NF3SOCK] = S_IFSOCK,
+	[NF3FIFO] = S_IFIFO,
 };
 
 /*
@@ -148,13 +144,12 @@ static __be32 *
 xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 {
 	unsigned int	type, major, minor;
-	int		fmode;
+	umode_t		fmode;
 
 	type = ntohl(*p++);
-	if (type >= NF3BAD)
-		type = NF3BAD;
-	fmode = nfs_type2fmt[type].mode;
-	fattr->type = nfs_type2fmt[type].nfs2type;
+	if (type > NF3FIFO)
+		type = NF3NON;
+	fmode = nfs_type2fmt[type];
 	fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
 	fattr->nlink = ntohl(*p++);
 	fattr->uid = ntohl(*p++);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 9f1df836197..c1906d2a226 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int);
 				 decode_lookup_maxsz + \
 				 decode_fs_locations_maxsz)
 
-static struct {
-	unsigned int	mode;
-	unsigned int	nfs2type;
-} nfs_type2fmt[] = {
-	{ 0,		NFNON	     },
-	{ S_IFREG,	NFREG	     },
-	{ S_IFDIR,	NFDIR	     },
-	{ S_IFBLK,	NFBLK	     },
-	{ S_IFCHR,	NFCHR	     },
-	{ S_IFLNK,	NFLNK	     },
-	{ S_IFSOCK,	NFSOCK	     },
-	{ S_IFIFO,	NFFIFO	     },
-	{ 0,		NFNON	     },
-	{ 0,		NFNON	     },
+static const umode_t nfs_type2fmt[] = {
+	[NF4BAD] = 0,
+	[NF4REG] = S_IFREG,
+	[NF4DIR] = S_IFDIR,
+	[NF4BLK] = S_IFBLK,
+	[NF4CHR] = S_IFCHR,
+	[NF4LNK] = S_IFLNK,
+	[NF4SOCK] = S_IFSOCK,
+	[NF4FIFO] = S_IFIFO,
+	[NF4ATTRDIR] = 0,
+	[NF4NAMEDATTR] = 0,
 };
 
 struct compound_hdr {
@@ -2173,7 +2170,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 		}
 		bitmap[0] &= ~FATTR4_WORD0_TYPE;
 	}
-	dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type);
+	dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
 	return 0;
 }
 
@@ -2580,8 +2577,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 	return status;
 }
 
-static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode)
+static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
 {
+	uint32_t tmp;
 	__be32 *p;
 
 	*mode = 0;
@@ -2589,8 +2587,8 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
 		READ_BUF(4);
-		READ32(*mode);
-		*mode &= ~S_IFMT;
+		READ32(tmp);
+		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
 	}
 	dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
@@ -2994,7 +2992,8 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 	uint32_t attrlen,
 		 bitmap[2] = {0},
 		 type;
-	int status, fmode = 0;
+	int status;
+	umode_t fmode = 0;
 	uint64_t fileid;
 
 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -3008,8 +3007,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 
 	if ((status = decode_attr_type(xdr, bitmap, &type)) != 0)
 		goto xdr_error;
-	fattr->type = nfs_type2fmt[type].nfs2type;
-	fattr->mode = nfs_type2fmt[type].mode;
+	fattr->mode = nfs_type2fmt[type];
 
 	if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0)
 		goto xdr_error;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6013acb0131..0691b9c188d 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -28,8 +28,7 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid
 
 struct nfs_fattr {
 	unsigned int		valid;		/* which fields are valid */
-	enum nfs_ftype		type;		/* always use NFSv2 types */
-	__u32			mode;
+	umode_t			mode;
 	__u32			nlink;
 	__u32			uid;
 	__u32			gid;
-- 
cgit v1.2.3


From f26c7a78876ccd6c9b477ab4ca127aa1a4ef68c7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:10:26 -0400
Subject: NFSv4: Clean up decode_getfattr()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 78 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 21 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c1906d2a226..43c6e50ff17 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2996,55 +2996,91 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 	umode_t fmode = 0;
 	uint64_t fileid;
 
-	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+	status = decode_op_hdr(xdr, OP_GETATTR);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+
+	status = decode_attr_bitmap(xdr, bitmap);
+	if (status < 0)
 		goto xdr_error;
 
-	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+	status = decode_attr_length(xdr, &attrlen, &savep);
+	if (status < 0)
 		goto xdr_error;
 
 
-	if ((status = decode_attr_type(xdr, bitmap, &type)) != 0)
+	status = decode_attr_type(xdr, bitmap, &type);
+	if (status < 0)
 		goto xdr_error;
 	fattr->mode = nfs_type2fmt[type];
 
-	if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0)
+	status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
+
+	status = decode_attr_size(xdr, bitmap, &fattr->size);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
+
+	status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
+
+	status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
+
+	status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
 						struct nfs4_fs_locations,
-						fattr))) != 0)
+						fattr));
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_mode(xdr, bitmap, &fmode)) != 0)
+
+	status = decode_attr_mode(xdr, bitmap, &fmode);
+	if (status < 0)
 		goto xdr_error;
 	fattr->mode |= fmode;
-	if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0)
+
+	status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0)
+
+	status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0)
+
+	status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0)
+
+	status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0)
+
+	status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0)
+
+	status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0)
+
+	status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
+
+	status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0)
+
+	status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
+	if (status < 0)
 		goto xdr_error;
 	if (fattr->fileid == 0 && fileid != 0)
 		fattr->fileid = fileid;
-	if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
+
+	status = verify_attr_len(xdr, savep, attrlen);
+	if (status == 0)
 		fattr->valid = NFS_ATTR_FATTR_V4;
 xdr_error:
 	dprintk("%s: xdr returned %d\n", __func__, -status);
-- 
cgit v1.2.3


From 409924e4c943072a63c43bb6b77576bf12f1896b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:10:27 -0400
Subject: NFSv4: Make decode_getfattr() set fattr->valid to reflect what was
 decoded

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 92 ++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 73 insertions(+), 19 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 43c6e50ff17..1690f0e44b9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2157,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
 static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*type = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2169,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 			return -EIO;
 		}
 		bitmap[0] &= ~FATTR4_WORD0_TYPE;
+		ret = NFS_ATTR_FATTR_TYPE;
 	}
 	dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*change = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2185,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		READ_BUF(8);
 		READ64(*change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
+		ret = NFS_ATTR_FATTR_CHANGE;
 	}
 	dprintk("%s: change attribute=%Lu\n", __func__,
 			(unsigned long long)*change);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*size = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2202,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 		READ_BUF(8);
 		READ64(*size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
+		ret = NFS_ATTR_FATTR_SIZE;
 	}
 	dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2242,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
 {
 	__be32 *p;
+	int ret = 0;
 
 	fsid->major = 0;
 	fsid->minor = 0;
@@ -2252,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 		READ64(fsid->major);
 		READ64(fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
+		ret = NFS_ATTR_FATTR_FSID;
 	}
 	dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
 			(unsigned long long)fsid->major,
 			(unsigned long long)fsid->minor);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2294,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*fileid = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2302,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		READ_BUF(8);
 		READ64(*fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
+		ret = NFS_ATTR_FATTR_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*fileid = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2318,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 		READ_BUF(8);
 		READ64(*fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+		ret = NFS_ATTR_FATTR_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2476,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
 			res->nlocations++;
 	}
+	if (res->nlocations != 0)
+		status = NFS_ATTR_FATTR_V4_REFERRAL;
 out:
 	dprintk("%s: fs_locations done, error = %d\n", __func__, status);
 	return status;
@@ -2581,6 +2595,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 {
 	uint32_t tmp;
 	__be32 *p;
+	int ret = 0;
 
 	*mode = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
@@ -2590,14 +2605,16 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 		READ32(tmp);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
+		ret = NFS_ATTR_FATTR_MODE;
 	}
 	dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*nlink = 1;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2606,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 		READ_BUF(4);
 		READ32(*nlink);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
+		ret = NFS_ATTR_FATTR_NLINK;
 	}
 	dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
 {
 	uint32_t len;
 	__be32 *p;
+	int ret = 0;
 
 	*uid = -2;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2624,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		READ32(len);
 		READ_BUF(len);
 		if (len < XDR_MAX_NETOBJ) {
-			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0)
+			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
+				ret = NFS_ATTR_FATTR_OWNER;
+			else
 				dprintk("%s: nfs_map_name_to_uid failed!\n",
 						__func__);
 		} else
@@ -2633,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		bitmap[1] &= ~FATTR4_WORD1_OWNER;
 	}
 	dprintk("%s: uid=%d\n", __func__, (int)*uid);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
 {
 	uint32_t len;
 	__be32 *p;
+	int ret = 0;
 
 	*gid = -2;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2649,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		READ32(len);
 		READ_BUF(len);
 		if (len < XDR_MAX_NETOBJ) {
-			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0)
+			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
+				ret = NFS_ATTR_FATTR_GROUP;
+			else
 				dprintk("%s: nfs_map_group_to_gid failed!\n",
 						__func__);
 		} else
@@ -2658,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
 	}
 	dprintk("%s: gid=%d\n", __func__, (int)*gid);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
 {
 	uint32_t major = 0, minor = 0;
 	__be32 *p;
+	int ret = 0;
 
 	*rdev = MKDEV(0,0);
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2679,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 		if (MAJOR(tmp) == major && MINOR(tmp) == minor)
 			*rdev = tmp;
 		bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
+		ret = NFS_ATTR_FATTR_RDEV;
 	}
 	dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2738,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*used = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2746,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		READ_BUF(8);
 		READ64(*used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
+		ret = NFS_ATTR_FATTR_SPACE_USED;
 	}
 	dprintk("%s: space used=%Lu\n", __func__,
 			(unsigned long long)*used);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -2776,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
 		status = decode_attr_time(xdr, time);
+		if (status == 0)
+			status = NFS_ATTR_FATTR_ATIME;
 		bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
 	}
 	dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
@@ -2792,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
 		status = decode_attr_time(xdr, time);
+		if (status == 0)
+			status = NFS_ATTR_FATTR_CTIME;
 		bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
 	}
 	dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
@@ -2808,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
 		status = decode_attr_time(xdr, time);
+		if (status == 0)
+			status = NFS_ATTR_FATTR_MTIME;
 		bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
 	}
 	dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
@@ -3012,76 +3046,96 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 	status = decode_attr_type(xdr, bitmap, &type);
 	if (status < 0)
 		goto xdr_error;
-	fattr->mode = nfs_type2fmt[type];
+	fattr->mode = 0;
+	if (status != 0) {
+		fattr->mode |= nfs_type2fmt[type];
+		fattr->valid |= status;
+	}
 
 	status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_size(xdr, bitmap, &fattr->size);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
 						struct nfs4_fs_locations,
 						fattr));
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_mode(xdr, bitmap, &fmode);
 	if (status < 0)
 		goto xdr_error;
-	fattr->mode |= fmode;
+	if (status != 0) {
+		fattr->mode |= fmode;
+		fattr->valid |= status;
+	}
 
 	status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
 	if (status < 0)
 		goto xdr_error;
+	fattr->valid |= status;
 
 	status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
 	if (status < 0)
 		goto xdr_error;
-	if (fattr->fileid == 0 && fileid != 0)
+	if (status != 0 && !(fattr->valid & status)) {
 		fattr->fileid = fileid;
+		fattr->valid |= status;
+	}
 
 	status = verify_attr_len(xdr, savep, attrlen);
-	if (status == 0)
-		fattr->valid = NFS_ATTR_FATTR_V4;
 xdr_error:
 	dprintk("%s: xdr returned %d\n", __func__, -status);
 	return status;
-- 
cgit v1.2.3


From 69aaaae18f7027d9594bce100378f102926cc0be Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:10:28 -0400
Subject: NFSv4: A referral is assumed to always point to a directory.

Fix a bug whereby we would fail to create a mount point for a referral.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dde84b988d..aa433d07794 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3678,6 +3678,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
 	return len;
 }
 
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
+{
+	if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) &&
+		(fattr->valid & NFS_ATTR_FATTR_FSID) &&
+		(fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
+		return;
+
+	fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
+		NFS_ATTR_FATTR_NLINK;
+	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	fattr->nlink = 2;
+}
+
 int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 		struct nfs4_fs_locations *fs_locations, struct page *page)
 {
@@ -3704,6 +3717,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 	fs_locations->server = server;
 	fs_locations->nlocations = 0;
 	status = rpc_call_sync(server->client, &msg, 0);
+	nfs_fixup_referral_attributes(&fs_locations->fattr);
 	dprintk("%s: returned status = %d\n", __func__, status);
 	return status;
 }
-- 
cgit v1.2.3


From a65318bf3afc93ce49227e849d213799b072c5fd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:10:28 -0400
Subject: NFSv4: Simplify some cache consistency post-op GETATTRs

Certain asynchronous operations such as write() do not expect
(or care) that other metadata such as the file owner, mode, acls, ...
change. All they want to do is update and/or check the change attribute,
ctime, and mtime.
By skipping the file owner and group update, we also avoid having to do a
potential idmapper upcall for these asynchronous RPC calls.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c         | 13 ++++++++-----
 include/linux/nfs_fs_sb.h |  5 +++++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index aa433d07794..101f5f4c304 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1439,7 +1439,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	if (calldata->arg.seqid == NULL)
 		goto out_free_calldata;
 	calldata->arg.fmode = 0;
-	calldata->arg.bitmask = server->attr_bitmask;
+	calldata->arg.bitmask = server->cache_consistency_bitmask;
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
@@ -1600,6 +1600,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 			server->caps |= NFS_CAP_HARDLINKS;
 		if (res.has_symlinks != 0)
 			server->caps |= NFS_CAP_SYMLINKS;
+		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
+		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
+		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
 		server->acl_bitmask = res.acl_bitmask;
 	}
 	return status;
@@ -2079,7 +2082,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 	struct nfs_removeargs *args = msg->rpc_argp;
 	struct nfs_removeres *res = msg->rpc_resp;
 
-	args->bitmask = server->attr_bitmask;
+	args->bitmask = server->cache_consistency_bitmask;
 	res->server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 }
@@ -2323,7 +2326,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		.pages = &page,
 		.pgbase = 0,
 		.count = count,
-		.bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+		.bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
 	};
 	struct nfs4_readdir_res res;
 	struct rpc_message msg = {
@@ -2552,7 +2555,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
-	data->args.bitmask = server->attr_bitmask;
+	data->args.bitmask = server->cache_consistency_bitmask;
 	data->res.server = server;
 	data->timestamp   = jiffies;
 
@@ -2575,7 +2578,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 	
-	data->args.bitmask = server->attr_bitmask;
+	data->args.bitmask = server->cache_consistency_bitmask;
 	data->res.server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
 }
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 9bb81aec91c..29b1e40dce9 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -106,6 +106,11 @@ struct nfs_server {
 	u32			attr_bitmask[2];/* V4 bitmask representing the set
 						   of attributes supported on this
 						   filesystem */
+	u32			cache_consistency_bitmask[2];
+						/* V4 bitmask representing the subset
+						   of change attribute, size, ctime
+						   and mtime attributes supported by
+						   the server */
 	u32			acl_bitmask;	/* V4 bitmask representing the ACEs
 						   that are supported on this
 						   filesystem */
-- 
cgit v1.2.3


From fb8a1f11b64e213d94dfa1cebb2a42a7b8c115c4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:10:29 -0400
Subject: NFS: cleanup - remove struct nfs_inode->ncommit

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         |  1 -
 fs/nfs/write.c         | 25 ++++++++++++++++---------
 include/linux/nfs_fs.h |  3 +--
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b7656bd3706..00f116cdadc 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1345,7 +1345,6 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
 	INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
-	nfsi->ncommit = 0;
 	nfsi->npages = 0;
 	atomic_set(&nfsi->silly_count, 1);
 	INIT_HLIST_HEAD(&nfsi->silly_list);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f9845859fc..1a999939fed 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -404,7 +404,6 @@ nfs_mark_request_commit(struct nfs_page *req)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&inode->i_lock);
-	nfsi->ncommit++;
 	set_bit(PG_CLEAN, &(req)->wb_flags);
 	radix_tree_tag_set(&nfsi->nfs_page_tree,
 			req->wb_index,
@@ -523,6 +522,12 @@ static void nfs_cancel_commit_list(struct list_head *head)
 	}
 }
 
+static int
+nfs_need_commit(struct nfs_inode *nfsi)
+{
+	return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
+}
+
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 /*
  * nfs_scan_commit - Scan an inode for commit requests
@@ -538,16 +543,18 @@ static int
 nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int res = 0;
 
-	if (nfsi->ncommit != 0) {
-		res = nfs_scan_list(nfsi, dst, idx_start, npages,
-				NFS_PAGE_TAG_COMMIT);
-		nfsi->ncommit -= res;
-	}
-	return res;
+	if (!nfs_need_commit(nfsi))
+		return 0;
+
+	return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
 }
 #else
+static inline int nfs_need_commit(struct nfs_inode *nfsi)
+{
+	return 0;
+}
+
 static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
 	return 0;
@@ -820,7 +827,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 	data->args.stable  = NFS_UNSTABLE;
 	if (how & FLUSH_STABLE) {
 		data->args.stable = NFS_DATA_SYNC;
-		if (!NFS_I(inode)->ncommit)
+		if (!nfs_need_commit(NFS_I(inode)))
 			data->args.stable = NFS_FILE_SYNC;
 	}
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index db867b04ac3..c9fecd3e8f0 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -166,8 +166,7 @@ struct nfs_inode {
 	 */
 	struct radix_tree_root	nfs_page_tree;
 
-	unsigned long		ncommit,
-				npages;
+	unsigned long		npages;
 
 	/* Open contexts for shared mmap writes */
 	struct list_head	open_files;
-- 
cgit v1.2.3


From 72cb77f4a5ace37b12dcb47a0e8637a2c28ad881 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:10:30 -0400
Subject: NFS: Throttle page dirtying while we're flushing to disk

The following patch is a combination of a patch by myself and Peter
Staubach.

Trond: If we allow other processes to dirty pages while a process is doing
a consistency sync to disk, we can end up never making progress.

Peter: Attached is a patch which addresses a continuing problem with
the NFS client generating out of order WRITE requests.  While
this is compliant with all of the current protocol
specifications, there are servers in the market which can not
handle out of order WRITE requests very well.  Also, this may
lead to sub-optimal block allocations in the underlying file
system on the server.  This may cause the read throughputs to
be reduced when reading the file from the server.

Peter: There has been a lot of work recently done to address out of
order issues on a systemic level.  However, the NFS client is
still susceptible to the problem.  Out of order WRITE
requests can occur when pdflush is in the middle of writing
out pages while the process dirtying the pages calls
generic_file_buffered_write which calls
generic_perform_write which calls
balance_dirty_pages_rate_limited which ends up calling
writeback_inodes which ends up calling back into the NFS
client to writes out dirty pages for the same file that
pdflush happens to be working with.

Signed-off-by: Peter Staubach <staubach@redhat.com>
[modification by Trond to merge the two similar patches]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c          |  9 +++++++++
 fs/nfs/inode.c         | 12 ++++++++++++
 fs/nfs/internal.h      |  1 +
 fs/nfs/nfs4proc.c      | 10 +---------
 fs/nfs/pagelist.c      | 11 -----------
 fs/nfs/write.c         | 28 +++++++++++++++++++---------
 include/linux/nfs_fs.h |  1 +
 7 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 90f292b520d..404c19c866a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,6 +354,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
+	/*
+	 * Prevent starvation issues if someone is doing a consistency
+	 * sync-to-disk
+	 */
+	ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+	if (ret)
+		return ret;
+
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 00f116cdadc..c40adc5dd60 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -65,6 +65,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 	return nfs_fileid_to_ino_t(fattr->fileid);
 }
 
+/**
+ * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock
+ */
+int nfs_wait_bit_killable(void *word)
+{
+	if (fatal_signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
 /**
  * nfs_compat_user_ino64 - returns the user-visible inode number
  * @fileid: 64-bit fileid
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 340ede8f608..a55e69aa52e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -165,6 +165,7 @@ extern void nfs_clear_inode(struct inode *);
 extern void nfs4_clear_inode(struct inode *);
 #endif
 void nfs_zap_acl_cache(struct inode *inode);
+extern int nfs_wait_bit_killable(void *word);
 
 /* super.c */
 void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 101f5f4c304..95f171e7e05 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 	kunmap_atomic(start, KM_USER0);
 }
 
-static int nfs4_wait_bit_killable(void *word)
-{
-	if (fatal_signal_pending(current))
-		return -ERESTARTSYS;
-	schedule();
-	return 0;
-}
-
 static int nfs4_wait_clnt_recover(struct nfs_client *clp)
 {
 	int res;
@@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp)
 	might_sleep();
 
 	res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
-			nfs4_wait_bit_killable, TASK_KILLABLE);
+			nfs_wait_bit_killable, TASK_KILLABLE);
 	return res;
 }
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7f079209d70..e2975939126 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req)
 	kref_put(&req->wb_kref, nfs_free_request);
 }
 
-static int nfs_wait_bit_killable(void *word)
-{
-	int ret = 0;
-
-	if (fatal_signal_pending(current))
-		ret = -ERESTARTSYS;
-	else
-		schedule();
-	return ret;
-}
-
 /**
  * nfs_wait_on_request - Wait for a request to complete.
  * @req: request to wait upon.
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1a999939fed..36fd35e0de8 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
+	unsigned long *bitlock = &NFS_I(inode)->flags;
 	struct nfs_pageio_descriptor pgio;
 	int err;
 
+	/* Stop dirtying of new pages while we sync */
+	err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+	if (err)
+		goto out_err;
+
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
 	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
 	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
 	nfs_pageio_complete(&pgio);
+
+	clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
+	smp_mb__after_clear_bit();
+	wake_up_bit(bitlock, NFS_INO_FLUSHING);
+
 	if (err < 0)
-		return err;
-	if (pgio.pg_error < 0)
-		return pgio.pg_error;
+		goto out_err;
+	err = pgio.pg_error;
+	if (err < 0)
+		goto out_err;
 	return 0;
+out_err:
+	return err;
 }
 
 /*
@@ -1432,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
 {
 	struct writeback_control wbc = {
 		.bdi = mapping->backing_dev_info,
-		.sync_mode = WB_SYNC_NONE,
+		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
 		.range_start = 0,
 		.range_end = LLONG_MAX,
 		.for_writepages = 1,
 	};
-	int ret;
 
-	ret = __nfs_write_mapping(mapping, &wbc, how);
-	if (ret < 0)
-		return ret;
-	wbc.sync_mode = WB_SYNC_ALL;
 	return __nfs_write_mapping(mapping, &wbc, how);
 }
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c9fecd3e8f0..933bc261c0d 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -206,6 +206,7 @@ struct nfs_inode {
 #define NFS_INO_STALE		(1)		/* possible stale inode */
 #define NFS_INO_ACL_LRU_SET	(2)		/* Inode is on the LRU list */
 #define NFS_INO_MOUNTPOINT	(3)		/* inode is remote mountpoint */
+#define NFS_INO_FLUSHING	(4)		/* inode is flushing out data */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
-- 
cgit v1.2.3


From e1ebfd33be068ec933f8954060a499bd22ad6f69 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:37:54 -0400
Subject: NFS: Kill the "defined but not used" compile error on nommu machines

Bryan Wu reports that when compiling NFS on nommu machines he gets a
"defined but not used" error on nfs_file_mmap().

The easiest fix is simply to get rid of the special casing in NFS, and
just always call generic_file_mmap() to set up the file.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 404c19c866a..1eab9c9ad24 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -64,11 +64,7 @@ const struct file_operations nfs_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= nfs_file_read,
 	.aio_write	= nfs_file_write,
-#ifdef CONFIG_MMU
 	.mmap		= nfs_file_mmap,
-#else
-	.mmap		= generic_file_mmap,
-#endif
 	.open		= nfs_file_open,
 	.flush		= nfs_file_flush,
 	.release	= nfs_file_release,
@@ -304,11 +300,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 	dprintk("NFS: mmap(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	status = nfs_revalidate_mapping(inode, file->f_mapping);
+	/* Note: generic_file_mmap() returns ENOSYS on nommu systems
+	 *       so we call that before revalidating the mapping
+	 */
+	status = generic_file_mmap(file, vma);
 	if (!status) {
 		vma->vm_ops = &nfs_file_vm_ops;
-		vma->vm_flags |= VM_CAN_NONLINEAR;
-		file_accessed(file);
+		status = nfs_revalidate_mapping(inode, file->f_mapping);
 	}
 	return status;
 }
-- 
cgit v1.2.3


From b1e1e158779f1d99c2cc18e466f6bf9099fc0853 Mon Sep 17 00:00:00 2001
From: Tom Talpey <tmtalpey@gmail.com>
Date: Wed, 11 Mar 2009 14:37:55 -0400
Subject: SVCRDMA: remove faulty assertions in rpc/rdma chunk validation.

Certain client-provided RPCRDMA chunk alignments result in an
additional scatter/gather entry, which triggered nfs/rdma server
assertions incorrectly. OpenSolaris nfs/rdma client connectathon
testing was blocked by these in the special/locking section.

Signed-off-by: Tom Talpey <tmtalpey@gmail.com>
Cc: Tom Tucker <tom@opengridcomputing.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/svc_rdma_sendto.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index a3334e3b73c..d0bea987d80 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -191,7 +191,6 @@ static int map_xdr(struct svcxprt_rdma *xprt,
 		   struct xdr_buf *xdr,
 		   struct svc_rdma_req_map *vec)
 {
-	int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
 	int sge_no;
 	u32 sge_bytes;
 	u32 page_bytes;
@@ -235,7 +234,11 @@ static int map_xdr(struct svcxprt_rdma *xprt,
 		sge_no++;
 	}
 
-	BUG_ON(sge_no > sge_max);
+	dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
+		"page_base %zd page_len %zd head_len %d tail_len %d\n",
+		sge_no, page_no, xdr->page_base, xdr->page_len,
+		xdr->head[0].iov_len, xdr->tail[0].iov_len);
+
 	vec->count = sge_no;
 	return 0;
 }
@@ -579,7 +582,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
 			ctxt->sge[page_no+1].length = 0;
 	}
 	BUG_ON(sge_no > rdma->sc_max_sge);
-	BUG_ON(sge_no > ctxt->count);
 	memset(&send_wr, 0, sizeof send_wr);
 	ctxt->wr_op = IB_WR_SEND;
 	send_wr.wr_id = (unsigned long)ctxt;
-- 
cgit v1.2.3


From b38ab40ad58c1fc43ea590d6342f6a6763ac8fb6 Mon Sep 17 00:00:00 2001
From: Tom Talpey <tmtalpey@gmail.com>
Date: Wed, 11 Mar 2009 14:37:55 -0400
Subject: XPRTRDMA: correct an rpc/rdma inline send marshaling error

Certain client rpc's which contain both lengthy page-contained
metadata and a non-empty xdr_tail buffer require careful handling
to avoid overlapped memory copying. Rearranging of existing rpcrdma
marshaling code avoids it; this fixes an NFSv4 symlink creation error
detected with connectathon basic/test8 to multiple servers.

Signed-off-by: Tom Talpey <tmtalpey@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 14106d26bb9..e5e28d1946a 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -310,6 +310,19 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 		__func__, pad, destp, rqst->rq_slen, curlen);
 
 	copy_len = rqst->rq_snd_buf.page_len;
+
+	if (rqst->rq_snd_buf.tail[0].iov_len) {
+		curlen = rqst->rq_snd_buf.tail[0].iov_len;
+		if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
+			memmove(destp + copy_len,
+				rqst->rq_snd_buf.tail[0].iov_base, curlen);
+			r_xprt->rx_stats.pullup_copy_count += curlen;
+		}
+		dprintk("RPC:       %s: tail destp 0x%p len %d\n",
+			__func__, destp + copy_len, curlen);
+		rqst->rq_svec[0].iov_len += curlen;
+	}
+
 	r_xprt->rx_stats.pullup_copy_count += copy_len;
 	npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT;
 	for (i = 0; copy_len && i < npages; i++) {
@@ -332,17 +345,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 		destp += curlen;
 		copy_len -= curlen;
 	}
-	if (rqst->rq_snd_buf.tail[0].iov_len) {
-		curlen = rqst->rq_snd_buf.tail[0].iov_len;
-		if (destp != rqst->rq_snd_buf.tail[0].iov_base) {
-			memcpy(destp,
-				rqst->rq_snd_buf.tail[0].iov_base, curlen);
-			r_xprt->rx_stats.pullup_copy_count += curlen;
-		}
-		dprintk("RPC:       %s: tail destp 0x%p len %d curlen %d\n",
-			__func__, destp, copy_len, curlen);
-		rqst->rq_svec[0].iov_len += curlen;
-	}
 	/* header now contains entire send message */
 	return pad;
 }
@@ -656,7 +658,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 		if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
 			curlen = rqst->rq_rcv_buf.tail[0].iov_len;
 		if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
-			memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
+			memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
 		dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
 			__func__, srcp, copy_len, curlen);
 		rqst->rq_rcv_buf.tail[0].iov_len = curlen;
-- 
cgit v1.2.3


From 441e3e242903f9b190d5764bed73edb58f977413 Mon Sep 17 00:00:00 2001
From: Tom Talpey <tmtalpey@gmail.com>
Date: Wed, 11 Mar 2009 14:37:56 -0400
Subject: SUNRPC: dynamically load RPC transport modules on-demand

Provide an api to attempt to load any necessary kernel RPC
client transport module automatically. By convention, the
desired module name is "xprt"+"transport name". For example,
when NFS mounting with "-o proto=rdma", attempt to load the
"xprtrdma" module.

Signed-off-by: Tom Talpey <tmtalpey@gmail.com>
Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h |  1 +
 net/sunrpc/xprt.c           | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 11fc71d50c1..2b0d960603b 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -235,6 +235,7 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
  */
 int			xprt_register_transport(struct xprt_class *type);
 int			xprt_unregister_transport(struct xprt_class *type);
+int			xprt_load_transport(const char *);
 void			xprt_set_retrans_timeout_def(struct rpc_task *task);
 void			xprt_set_retrans_timeout_rtt(struct rpc_task *task);
 void			xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 62098d101a1..d1afec64039 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -151,6 +151,37 @@ out:
 }
 EXPORT_SYMBOL_GPL(xprt_unregister_transport);
 
+/**
+ * xprt_load_transport - load a transport implementation
+ * @transport_name: transport to load
+ *
+ * Returns:
+ * 0:		transport successfully loaded
+ * -ENOENT:	transport module not available
+ */
+int xprt_load_transport(const char *transport_name)
+{
+	struct xprt_class *t;
+	char module_name[sizeof t->name + 5];
+	int result;
+
+	result = 0;
+	spin_lock(&xprt_list_lock);
+	list_for_each_entry(t, &xprt_list, list) {
+		if (strcmp(t->name, transport_name) == 0) {
+			spin_unlock(&xprt_list_lock);
+			goto out;
+		}
+	}
+	spin_unlock(&xprt_list_lock);
+	strcpy(module_name, "xprt");
+	strncat(module_name, transport_name, sizeof t->name);
+	result = request_module(module_name);
+out:
+	return result;
+}
+EXPORT_SYMBOL_GPL(xprt_load_transport);
+
 /**
  * xprt_reserve_xprt - serialize write access to transports
  * @task: task that is requesting access to the transport
-- 
cgit v1.2.3


From a67d18f89f5782806135aad4ee012ff78d45aae7 Mon Sep 17 00:00:00 2001
From: Tom Talpey <tmtalpey@gmail.com>
Date: Wed, 11 Mar 2009 14:37:56 -0400
Subject: NFS: load the rpc/rdma transport module automatically

When mounting an NFS/RDMA server with the "-o proto=rdma" or
"-o rdma" options, attempt to dynamically load the necessary
"xprtrdma" client transport module. Doing so improves usability,
while avoiding a static module dependency and any unnecesary
resources.

Signed-off-by: Tom Talpey <tmtalpey@gmail.com>
Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d6686f4786d..0942fcbbad3 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1018,6 +1018,7 @@ static int nfs_parse_mount_options(char *raw,
 		case Opt_rdma:
 			mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
 			mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+			xprt_load_transport(p);
 			break;
 		case Opt_acl:
 			mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -1205,12 +1206,14 @@ static int nfs_parse_mount_options(char *raw,
 				/* vector side protocols to TCP */
 				mnt->flags |= NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+				xprt_load_transport(string);
 				break;
 			default:
 				errors++;
 				dfprintk(MOUNT, "NFS:   unrecognized "
 						"transport protocol\n");
 			}
+			kfree(string);
 			break;
 		case Opt_mountproto:
 			string = match_strdup(args);
@@ -1218,7 +1221,6 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
-			kfree(string);
 
 			switch (token) {
 			case Opt_xprt_udp:
-- 
cgit v1.2.3


From 15f081ca8ddfe150fb639c591b18944a539da0fc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:37:57 -0400
Subject: SUNRPC: Avoid an unnecessary task reschedule on ENOTCONN

If the socket is unconnected, and xprt_transmit() returns ENOTCONN, we
currently give up the lock on the transport channel. Doing so means that
the lock automatically gets assigned to the next task in the xprt->sending
queue, and so that task needs to be woken up to do the actual connect.

The following patch aims to avoid that unnecessary task switch.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 836f15c0c4a..07e9b05321e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1105,14 +1105,24 @@ static void
 call_transmit_status(struct rpc_task *task)
 {
 	task->tk_action = call_status;
-	/*
-	 * Special case: if we've been waiting on the socket's write_space()
-	 * callback, then don't call xprt_end_transmit().
-	 */
-	if (task->tk_status == -EAGAIN)
-		return;
-	xprt_end_transmit(task);
-	rpc_task_force_reencode(task);
+	switch (task->tk_status) {
+	case -EAGAIN:
+		break;
+	default:
+		xprt_end_transmit(task);
+		/*
+		 * Special cases: if we've been waiting on the
+		 * socket's write_space() callback, or if the
+		 * socket just returned a connection error,
+		 * then hold onto the transport lock.
+		 */
+	case -ECONNREFUSED:
+	case -ENOTCONN:
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+		rpc_task_force_reencode(task);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From 670f94573104b4a25525d3fcdcd6496c678df172 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:37:58 -0400
Subject: SUNRPC: Ensure we set XPRT_CLOSING only after we've sent a tcp FIN...

...so that we can distinguish between when we need to shutdown and when we
don't. Also remove the call to xs_tcp_shutdown() from xs_tcp_connect(),
since xprt_connect() makes the same test.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 1127eb93413..cb4bd93b921 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1180,7 +1180,6 @@ static void xs_tcp_state_change(struct sock *sk)
 		break;
 	case TCP_CLOSE_WAIT:
 		/* The server initiated a shutdown of the socket */
-		set_bit(XPRT_CLOSING, &xprt->state);
 		xprt_force_disconnect(xprt);
 	case TCP_SYN_SENT:
 		xprt->connect_cookie++;
@@ -1193,6 +1192,7 @@ static void xs_tcp_state_change(struct sock *sk)
 			xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
 		break;
 	case TCP_LAST_ACK:
+		set_bit(XPRT_CLOSING, &xprt->state);
 		smp_mb__before_clear_bit();
 		clear_bit(XPRT_CONNECTED, &xprt->state);
 		smp_mb__after_clear_bit();
@@ -1836,9 +1836,6 @@ static void xs_tcp_connect(struct rpc_task *task)
 {
 	struct rpc_xprt *xprt = task->tk_xprt;
 
-	/* Initiate graceful shutdown of the socket if not already done */
-	if (test_bit(XPRT_CONNECTED, &xprt->state))
-		xs_tcp_shutdown(xprt);
 	/* Exit if we need to wait for socket shutdown to complete */
 	if (test_bit(XPRT_CLOSING, &xprt->state))
 		return;
-- 
cgit v1.2.3


From 40d2549db5f515e415894def98b49db7d4c56714 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:37:58 -0400
Subject: SUNRPC: Don't disconnect if a connection is still in progress.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index cb4bd93b921..9d1898f6ee8 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1613,10 +1613,9 @@ out:
  * We need to preserve the port number so the reply cache on the server can
  * find our cached RPC replies when we get around to reconnecting.
  */
-static void xs_tcp_reuse_connection(struct rpc_xprt *xprt)
+static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transport)
 {
 	int result;
-	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 	struct sockaddr any;
 
 	dprintk("RPC:       disconnecting xprt %p to reuse port\n", xprt);
@@ -1633,6 +1632,17 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt)
 				result);
 }
 
+static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *transport)
+{
+	unsigned int state = transport->inet->sk_state;
+
+	if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED)
+		return;
+	if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT))
+		return;
+	xs_abort_connection(xprt, transport);
+}
+
 static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -1706,7 +1716,7 @@ static void xs_tcp_connect_worker4(struct work_struct *work)
 		}
 	} else
 		/* "close" the socket, preserving the local port */
-		xs_tcp_reuse_connection(xprt);
+		xs_tcp_reuse_connection(xprt, transport);
 
 	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
 			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
@@ -1766,7 +1776,7 @@ static void xs_tcp_connect_worker6(struct work_struct *work)
 		}
 	} else
 		/* "close" the socket, preserving the local port */
-		xs_tcp_reuse_connection(xprt);
+		xs_tcp_reuse_connection(xprt, transport);
 
 	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
 			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
-- 
cgit v1.2.3


From c8485e4d634f6df155040293928707f127f0d06d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:37:59 -0400
Subject: SUNRPC: Handle ECONNREFUSED correctly in xprt_transmit()

If we get an ECONNREFUSED error, we currently go to sleep on the
'xprt->sending' wait queue. The problem is that no timeout is set there,
and there is nothing else that will wake the task up later.

We should deal with ECONNREFUSED in call_status, given that is where we
also deal with -EHOSTDOWN, and friends.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c     |  7 ++++++-
 net/sunrpc/xprt.c     | 38 ++++++++++++++++----------------------
 net/sunrpc/xprtsock.c | 26 ++++++++++++--------------
 3 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 07e9b05321e..145715b5311 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1117,10 +1117,12 @@ call_transmit_status(struct rpc_task *task)
 		 * then hold onto the transport lock.
 		 */
 	case -ECONNREFUSED:
+	case -ECONNRESET:
 	case -ENOTCONN:
 	case -EHOSTDOWN:
 	case -EHOSTUNREACH:
 	case -ENETUNREACH:
+	case -EPIPE:
 		rpc_task_force_reencode(task);
 	}
 }
@@ -1162,9 +1164,12 @@ call_status(struct rpc_task *task)
 			xprt_conditional_disconnect(task->tk_xprt,
 					req->rq_connect_cookie);
 		break;
+	case -ECONNRESET:
 	case -ECONNREFUSED:
-	case -ENOTCONN:
 		rpc_force_rebind(clnt);
+		rpc_delay(task, 3*HZ);
+	case -EPIPE:
+	case -ENOTCONN:
 		task->tk_action = call_bind;
 		break;
 	case -EAGAIN:
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index d1afec64039..d588e755e10 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -901,32 +901,26 @@ void xprt_transmit(struct rpc_task *task)
 	req->rq_connect_cookie = xprt->connect_cookie;
 	req->rq_xtime = jiffies;
 	status = xprt->ops->send_request(task);
-	if (status == 0) {
-		dprintk("RPC: %5u xmit complete\n", task->tk_pid);
-		spin_lock_bh(&xprt->transport_lock);
+	if (status != 0) {
+		task->tk_status = status;
+		return;
+	}
 
-		xprt->ops->set_retrans_timeout(task);
+	dprintk("RPC: %5u xmit complete\n", task->tk_pid);
+	spin_lock_bh(&xprt->transport_lock);
 
-		xprt->stat.sends++;
-		xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
-		xprt->stat.bklog_u += xprt->backlog.qlen;
+	xprt->ops->set_retrans_timeout(task);
 
-		/* Don't race with disconnect */
-		if (!xprt_connected(xprt))
-			task->tk_status = -ENOTCONN;
-		else if (!req->rq_received)
-			rpc_sleep_on(&xprt->pending, task, xprt_timer);
-		spin_unlock_bh(&xprt->transport_lock);
-		return;
-	}
+	xprt->stat.sends++;
+	xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
+	xprt->stat.bklog_u += xprt->backlog.qlen;
 
-	/* Note: at this point, task->tk_sleeping has not yet been set,
-	 *	 hence there is no danger of the waking up task being put on
-	 *	 schedq, and being picked up by a parallel run of rpciod().
-	 */
-	task->tk_status = status;
-	if (status == -ECONNREFUSED)
-		rpc_sleep_on(&xprt->sending, task, NULL);
+	/* Don't race with disconnect */
+	if (!xprt_connected(xprt))
+		task->tk_status = -ENOTCONN;
+	else if (!req->rq_received)
+		rpc_sleep_on(&xprt->pending, task, xprt_timer);
+	spin_unlock_bh(&xprt->transport_lock);
 }
 
 static inline void do_xprt_reserve(struct rpc_task *task)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 9d1898f6ee8..5e8198bede8 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -594,6 +594,8 @@ static int xs_udp_send_request(struct rpc_task *task)
 		/* Still some bytes left; set up for a retry later. */
 		status = -EAGAIN;
 	}
+	if (!transport->sock)
+		goto out;
 
 	switch (status) {
 	case -ENOTSOCK:
@@ -603,19 +605,17 @@ static int xs_udp_send_request(struct rpc_task *task)
 	case -EAGAIN:
 		xs_nospace(task);
 		break;
+	default:
+		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
+			-status);
 	case -ENETUNREACH:
 	case -EPIPE:
 	case -ECONNREFUSED:
 		/* When the server has died, an ICMP port unreachable message
 		 * prompts ECONNREFUSED. */
 		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
-		break;
-	default:
-		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
-		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
-			-status);
 	}
-
+out:
 	return status;
 }
 
@@ -697,6 +697,8 @@ static int xs_tcp_send_request(struct rpc_task *task)
 		status = -EAGAIN;
 		break;
 	}
+	if (!transport->sock)
+		goto out;
 
 	switch (status) {
 	case -ENOTSOCK:
@@ -706,21 +708,17 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	case -EAGAIN:
 		xs_nospace(task);
 		break;
+	default:
+		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
+			-status);
 	case -ECONNRESET:
 		xs_tcp_shutdown(xprt);
 	case -ECONNREFUSED:
 	case -ENOTCONN:
 	case -EPIPE:
-		status = -ENOTCONN;
-		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
-		break;
-	default:
-		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
-			-status);
 		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
-		xs_tcp_shutdown(xprt);
 	}
-
+out:
 	return status;
 }
 
-- 
cgit v1.2.3


From 482f32e65d31cbf88d08306fa5d397cc945c3c26 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:38:00 -0400
Subject: SUNRPC: Handle socket errors correctly

Ensure that we pick up and handle socket errors as they occur.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 5e8198bede8..879af6f27b4 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1208,23 +1208,20 @@ static void xs_tcp_state_change(struct sock *sk)
 }
 
 /**
- * xs_tcp_error_report - callback mainly for catching RST events
+ * xs_error_report - callback mainly for catching socket errors
  * @sk: socket
  */
-static void xs_tcp_error_report(struct sock *sk)
+static void xs_error_report(struct sock *sk)
 {
 	struct rpc_xprt *xprt;
 
 	read_lock(&sk->sk_callback_lock);
-	if (sk->sk_err != ECONNRESET || sk->sk_state != TCP_ESTABLISHED)
-		goto out;
 	if (!(xprt = xprt_from_sock(sk)))
 		goto out;
 	dprintk("RPC:       %s client %p...\n"
 			"RPC:       error %d\n",
 			__func__, xprt, sk->sk_err);
-
-	xprt_force_disconnect(xprt);
+	xprt_wake_pending_tasks(xprt, -EAGAIN);
 out:
 	read_unlock(&sk->sk_callback_lock);
 }
@@ -1509,6 +1506,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		sk->sk_user_data = xprt;
 		sk->sk_data_ready = xs_udp_data_ready;
 		sk->sk_write_space = xs_udp_write_space;
+		sk->sk_error_report = xs_error_report;
 		sk->sk_no_check = UDP_CSUM_NORCV;
 		sk->sk_allocation = GFP_ATOMIC;
 
@@ -1656,7 +1654,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		sk->sk_data_ready = xs_tcp_data_ready;
 		sk->sk_state_change = xs_tcp_state_change;
 		sk->sk_write_space = xs_tcp_write_space;
-		sk->sk_error_report = xs_tcp_error_report;
+		sk->sk_error_report = xs_error_report;
 		sk->sk_allocation = GFP_ATOMIC;
 
 		/* socket options */
-- 
cgit v1.2.3


From 2a4919919a97911b0aa4b9f5ac1eab90ba87652b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:38:00 -0400
Subject: SUNRPC: Return EAGAIN instead of ENOTCONN when waking up
 xprt->pending

While we should definitely return socket errors to the task that is
currently trying to send data, there is no need to propagate the same error
to all the other tasks on xprt->pending. Doing so actually slows down
recovery, since it causes more than one tasks to attempt socket recovery.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c     | 15 ++++---------
 net/sunrpc/xprt.c     | 20 ++++++------------
 net/sunrpc/xprtsock.c | 58 +++++++++++++++++++++++++++------------------------
 3 files changed, 41 insertions(+), 52 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 145715b5311..5abab094441 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1032,27 +1032,20 @@ call_connect_status(struct rpc_task *task)
 	dprint_status(task);
 
 	task->tk_status = 0;
-	if (status >= 0) {
+	if (status >= 0 || status == -EAGAIN) {
 		clnt->cl_stats->netreconn++;
 		task->tk_action = call_transmit;
 		return;
 	}
 
-	/* Something failed: remote service port may have changed */
-	rpc_force_rebind(clnt);
-
 	switch (status) {
-	case -ENOTCONN:
-	case -EAGAIN:
-		task->tk_action = call_bind;
-		if (!RPC_IS_SOFT(task))
-			return;
 		/* if soft mounted, test if we've timed out */
 	case -ETIMEDOUT:
 		task->tk_action = call_timeout;
-		return;
+		break;
+	default:
+		rpc_exit(task, -EIO);
 	}
-	rpc_exit(task, -EIO);
 }
 
 /*
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index d588e755e10..a0bfe53f162 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -611,7 +611,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt)
 	dprintk("RPC:       disconnected transport %p\n", xprt);
 	spin_lock_bh(&xprt->transport_lock);
 	xprt_clear_connected(xprt);
-	xprt_wake_pending_tasks(xprt, -ENOTCONN);
+	xprt_wake_pending_tasks(xprt, -EAGAIN);
 	spin_unlock_bh(&xprt->transport_lock);
 }
 EXPORT_SYMBOL_GPL(xprt_disconnect_done);
@@ -629,7 +629,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
 	/* Try to schedule an autoclose RPC call */
 	if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
 		queue_work(rpciod_workqueue, &xprt->task_cleanup);
-	xprt_wake_pending_tasks(xprt, -ENOTCONN);
+	xprt_wake_pending_tasks(xprt, -EAGAIN);
 	spin_unlock_bh(&xprt->transport_lock);
 }
 
@@ -656,7 +656,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
 	/* Try to schedule an autoclose RPC call */
 	if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
 		queue_work(rpciod_workqueue, &xprt->task_cleanup);
-	xprt_wake_pending_tasks(xprt, -ENOTCONN);
+	xprt_wake_pending_tasks(xprt, -EAGAIN);
 out:
 	spin_unlock_bh(&xprt->transport_lock);
 }
@@ -726,9 +726,8 @@ static void xprt_connect_status(struct rpc_task *task)
 	}
 
 	switch (task->tk_status) {
-	case -ENOTCONN:
-		dprintk("RPC: %5u xprt_connect_status: connection broken\n",
-				task->tk_pid);
+	case -EAGAIN:
+		dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
 		break;
 	case -ETIMEDOUT:
 		dprintk("RPC: %5u xprt_connect_status: connect attempt timed "
@@ -849,15 +848,8 @@ int xprt_prepare_transmit(struct rpc_task *task)
 		err = req->rq_received;
 		goto out_unlock;
 	}
-	if (!xprt->ops->reserve_xprt(task)) {
+	if (!xprt->ops->reserve_xprt(task))
 		err = -EAGAIN;
-		goto out_unlock;
-	}
-
-	if (!xprt_connected(xprt)) {
-		err = -ENOTCONN;
-		goto out_unlock;
-	}
 out_unlock:
 	spin_unlock_bh(&xprt->transport_lock);
 	return err;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 879af6f27b4..8e58b0b5460 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1162,7 +1162,7 @@ static void xs_tcp_state_change(struct sock *sk)
 			transport->tcp_flags =
 				TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
 
-			xprt_wake_pending_tasks(xprt, 0);
+			xprt_wake_pending_tasks(xprt, -EAGAIN);
 		}
 		spin_unlock_bh(&xprt->transport_lock);
 		break;
@@ -1721,20 +1721,22 @@ static void xs_tcp_connect_worker4(struct work_struct *work)
 	dprintk("RPC:       %p connect status %d connected %d sock state %d\n",
 			xprt, -status, xprt_connected(xprt),
 			sock->sk->sk_state);
-	if (status < 0) {
-		switch (status) {
-			case -EINPROGRESS:
-			case -EALREADY:
-				goto out_clear;
-			case -ECONNREFUSED:
-			case -ECONNRESET:
-				/* retry with existing socket, after a delay */
-				break;
-			default:
-				/* get rid of existing socket, and retry */
-				xs_tcp_shutdown(xprt);
-		}
+	switch (status) {
+	case 0:
+	case -EINPROGRESS:
+	case -EALREADY:
+		goto out_clear;
+	case -ECONNREFUSED:
+	case -ECONNRESET:
+		/* retry with existing socket, after a delay */
+		break;
+	default:
+		/* get rid of existing socket, and retry */
+		xs_tcp_shutdown(xprt);
+		printk("%s: connect returned unhandled error %d\n",
+				__func__, status);
 	}
+	status = -EAGAIN;
 out:
 	xprt_wake_pending_tasks(xprt, status);
 out_clear:
@@ -1780,20 +1782,22 @@ static void xs_tcp_connect_worker6(struct work_struct *work)
 	status = xs_tcp_finish_connecting(xprt, sock);
 	dprintk("RPC:       %p connect status %d connected %d sock state %d\n",
 			xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
-	if (status < 0) {
-		switch (status) {
-			case -EINPROGRESS:
-			case -EALREADY:
-				goto out_clear;
-			case -ECONNREFUSED:
-			case -ECONNRESET:
-				/* retry with existing socket, after a delay */
-				break;
-			default:
-				/* get rid of existing socket, and retry */
-				xs_tcp_shutdown(xprt);
-		}
+	switch (status) {
+	case 0:
+	case -EINPROGRESS:
+	case -EALREADY:
+		goto out_clear;
+	case -ECONNREFUSED:
+	case -ECONNRESET:
+		/* retry with existing socket, after a delay */
+		break;
+	default:
+		/* get rid of existing socket, and retry */
+		xs_tcp_shutdown(xprt);
+		printk("%s: connect returned unhandled error %d\n",
+				__func__, status);
 	}
+	status = -EAGAIN;
 out:
 	xprt_wake_pending_tasks(xprt, status);
 out_clear:
-- 
cgit v1.2.3


From 8a2cec295f4499cc9d4452e9b02d4ed071bb42d3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:38:01 -0400
Subject: SUNRPC: Delay, then retry on connection errors.

Enforce the comment in xs_tcp_connect_worker4/xs_tcp_connect_worker6 that
we should delay, then retry on certain connection errors.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8e58b0b5460..9f3e615d3e0 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1722,20 +1722,19 @@ static void xs_tcp_connect_worker4(struct work_struct *work)
 			xprt, -status, xprt_connected(xprt),
 			sock->sk->sk_state);
 	switch (status) {
+	case -ECONNREFUSED:
+	case -ECONNRESET:
+	case -ENETUNREACH:
+		/* retry with existing socket, after a delay */
 	case 0:
 	case -EINPROGRESS:
 	case -EALREADY:
 		goto out_clear;
-	case -ECONNREFUSED:
-	case -ECONNRESET:
-		/* retry with existing socket, after a delay */
-		break;
-	default:
-		/* get rid of existing socket, and retry */
-		xs_tcp_shutdown(xprt);
-		printk("%s: connect returned unhandled error %d\n",
-				__func__, status);
 	}
+	/* get rid of existing socket, and retry */
+	xs_tcp_shutdown(xprt);
+	printk("%s: connect returned unhandled error %d\n",
+			__func__, status);
 	status = -EAGAIN;
 out:
 	xprt_wake_pending_tasks(xprt, status);
@@ -1783,20 +1782,19 @@ static void xs_tcp_connect_worker6(struct work_struct *work)
 	dprintk("RPC:       %p connect status %d connected %d sock state %d\n",
 			xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
 	switch (status) {
+	case -ECONNREFUSED:
+	case -ECONNRESET:
+	case -ENETUNREACH:
+		/* retry with existing socket, after a delay */
 	case 0:
 	case -EINPROGRESS:
 	case -EALREADY:
 		goto out_clear;
-	case -ECONNREFUSED:
-	case -ECONNRESET:
-		/* retry with existing socket, after a delay */
-		break;
-	default:
-		/* get rid of existing socket, and retry */
-		xs_tcp_shutdown(xprt);
-		printk("%s: connect returned unhandled error %d\n",
-				__func__, status);
 	}
+	/* get rid of existing socket, and retry */
+	xs_tcp_shutdown(xprt);
+	printk("%s: connect returned unhandled error %d\n",
+			__func__, status);
 	status = -EAGAIN;
 out:
 	xprt_wake_pending_tasks(xprt, status);
-- 
cgit v1.2.3


From 5e3771ce2d6a69e10fcc870cdf226d121d868491 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:38:01 -0400
Subject: SUNRPC: Ensure that xs_nospace return values are propagated

If xs_nospace() finds that the socket has disconnected, it attempts to
return ENOTCONN, however that value is then squashed by the callers.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 9f3e615d3e0..2e070679ab4 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -521,11 +521,12 @@ static void xs_nospace_callback(struct rpc_task *task)
  * @task: task to put to sleep
  *
  */
-static void xs_nospace(struct rpc_task *task)
+static int xs_nospace(struct rpc_task *task)
 {
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_xprt *xprt = req->rq_xprt;
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	int ret = 0;
 
 	dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
 			task->tk_pid, req->rq_slen - req->rq_bytes_sent,
@@ -537,6 +538,7 @@ static void xs_nospace(struct rpc_task *task)
 	/* Don't race with disconnect */
 	if (xprt_connected(xprt)) {
 		if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) {
+			ret = -EAGAIN;
 			/*
 			 * Notify TCP that we're limited by the application
 			 * window size
@@ -548,10 +550,11 @@ static void xs_nospace(struct rpc_task *task)
 		}
 	} else {
 		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
-		task->tk_status = -ENOTCONN;
+		ret = -ENOTCONN;
 	}
 
 	spin_unlock_bh(&xprt->transport_lock);
+	return ret;
 }
 
 /**
@@ -603,7 +606,7 @@ static int xs_udp_send_request(struct rpc_task *task)
 		/* Should we call xs_close() here? */
 		break;
 	case -EAGAIN:
-		xs_nospace(task);
+		status = xs_nospace(task);
 		break;
 	default:
 		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
@@ -706,7 +709,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
 		/* Should we call xs_close() here? */
 		break;
 	case -EAGAIN:
-		xs_nospace(task);
+		status = xs_nospace(task);
 		break;
 	default:
 		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
-- 
cgit v1.2.3


From d4e3676dba299e24acb66de6da2a0bb44d0d2414 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:12:40 +1030
Subject: cpumask: remove the now-obsoleted pcibus_to_cpumask(): ia64

Impact: reduce stack usage for large NR_CPUS

cpumask_of_pcibus() is the new version.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/ia64/include/asm/topology.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 32f3af1641c..6fbbf8709e1 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -117,11 +117,6 @@ void build_cpu_to_node_map(void);
 
 extern void arch_fix_phys_package_id(int num, u32 slot);
 
-#define pcibus_to_cpumask(bus)	(pcibus_to_node(bus) == -1 ? \
-					CPU_MASK_ALL : \
-					node_to_cpumask(pcibus_to_node(bus)) \
-				)
-
 #define cpumask_of_pcibus(bus)	(pcibus_to_node(bus) == -1 ?		\
 				 cpu_all_mask :				\
 				 cpumask_of_node(pcibus_to_node(bus)))
-- 
cgit v1.2.3


From 40fe697a1759b85f5e06c490599f4f7b03de3be7 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:12:41 +1030
Subject: cpumask: arch_send_call_function_ipi_mask: ia64

We're weaning the core code off handing cpumask's around on-stack.
This introduces arch_send_call_function_ipi_mask().

We also take the chance to wean send_IPI_mask off the obsolescent
for_each_cpu_mask(): making it take the pointer seemed the most
natural way.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/ia64/include/asm/smp.h | 3 ++-
 arch/ia64/kernel/smp.c      | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/ia64/include/asm/smp.h b/arch/ia64/include/asm/smp.h
index 21c402365d0..59840833625 100644
--- a/arch/ia64/include/asm/smp.h
+++ b/arch/ia64/include/asm/smp.h
@@ -126,7 +126,8 @@ extern void identify_siblings (struct cpuinfo_ia64 *);
 extern int is_multithreading_enabled(void);
 
 extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi(cpumask_t mask);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 
 #else /* CONFIG_SMP */
 
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index da8f020d82c..2ea4199d9c5 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -166,11 +166,11 @@ send_IPI_allbutself (int op)
  * Called with preemption disabled.
  */
 static inline void
-send_IPI_mask(cpumask_t mask, int op)
+send_IPI_mask(const struct cpumask *mask, int op)
 {
 	unsigned int cpu;
 
-	for_each_cpu_mask(cpu, mask) {
+	for_each_cpu(cpu, mask) {
 			send_IPI_single(cpu, op);
 	}
 }
@@ -316,7 +316,7 @@ void arch_send_call_function_single_ipi(int cpu)
 	send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE);
 }
 
-void arch_send_call_function_ipi(cpumask_t mask)
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
 	send_IPI_mask(mask, IPI_CALL_FUNC);
 }
-- 
cgit v1.2.3


From 5dd3c9949a3e92ea7fd8c75d888031f7aff1f1d0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:12:42 +1030
Subject: cpumask: prepare for iterators to only go to
 nr_cpu_ids/nr_cpumask_bits.: ia64

Impact: cleanup, futureproof

In fact, all cpumask ops will only be valid (in general) for bit
numbers < nr_cpu_ids.  So use that instead of NR_CPUS in various
places.

This is always safe: no cpu number can be >= nr_cpu_ids, and
nr_cpu_ids is initialized to NR_CPUS at boot.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 arch/ia64/kernel/acpi.c             | 6 +++---
 arch/ia64/kernel/mca.c              | 6 +++---
 arch/ia64/kernel/perfmon.c          | 4 ++--
 arch/ia64/kernel/salinfo.c          | 6 +++---
 arch/ia64/kernel/setup.c            | 4 ++--
 arch/ia64/sn/kernel/setup.c         | 2 +-
 arch/ia64/sn/kernel/sn2/sn2_smp.c   | 6 +++---
 arch/ia64/sn/kernel/sn2/sn_hwperf.c | 2 +-
 8 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index d541671caf4..c4f41aca107 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -924,9 +924,9 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu)
 	buffer.length = ACPI_ALLOCATE_BUFFER;
 	buffer.pointer = NULL;
 
-	cpus_complement(tmp_map, cpu_present_map);
-	cpu = first_cpu(tmp_map);
-	if (cpu >= NR_CPUS)
+	cpumask_complement(&tmp_map, cpu_present_mask);
+	cpu = cpumask_first(&tmp_map);
+	if (cpu >= nr_cpu_ids)
 		return -EINVAL;
 
 	acpi_map_cpu2node(handle, cpu, physid);
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index bab1de2d2f6..8f33a884042 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1456,9 +1456,9 @@ ia64_mca_cmc_int_caller(int cmc_irq, void *arg)
 
 	ia64_mca_cmc_int_handler(cmc_irq, arg);
 
-	for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);
+	cpuid = cpumask_next(cpuid+1, cpu_online_mask);
 
-	if (cpuid < NR_CPUS) {
+	if (cpuid < nr_cpu_ids) {
 		platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
 	} else {
 		/* If no log record, switch out of polling mode */
@@ -1525,7 +1525,7 @@ ia64_mca_cpe_int_caller(int cpe_irq, void *arg)
 
 	ia64_mca_cpe_int_handler(cpe_irq, arg);
 
-	for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);
+	cpuid = cpumask_next(cpuid+1, cpu_online_mask);
 
 	if (cpuid < NR_CPUS) {
 		platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 0e499757309..6fc1e638f0e 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -5603,7 +5603,7 @@ pfm_interrupt_handler(int irq, void *arg)
  * /proc/perfmon interface, for debug only
  */
 
-#define PFM_PROC_SHOW_HEADER	((void *)NR_CPUS+1)
+#define PFM_PROC_SHOW_HEADER	((void *)nr_cpu_ids+1)
 
 static void *
 pfm_proc_start(struct seq_file *m, loff_t *pos)
@@ -5612,7 +5612,7 @@ pfm_proc_start(struct seq_file *m, loff_t *pos)
 		return PFM_PROC_SHOW_HEADER;
 	}
 
-	while (*pos <= NR_CPUS) {
+	while (*pos <= nr_cpu_ids) {
 		if (cpu_online(*pos - 1)) {
 			return (void *)*pos;
 		}
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index ecb9eb78d68..7053c55b764 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -317,7 +317,7 @@ retry:
 	}
 
 	n = data->cpu_check;
-	for (i = 0; i < NR_CPUS; i++) {
+	for (i = 0; i < nr_cpu_ids; i++) {
 		if (cpu_isset(n, data->cpu_event)) {
 			if (!cpu_online(n)) {
 				cpu_clear(n, data->cpu_event);
@@ -326,7 +326,7 @@ retry:
 			cpu = n;
 			break;
 		}
-		if (++n == NR_CPUS)
+		if (++n == nr_cpu_ids)
 			n = 0;
 	}
 
@@ -337,7 +337,7 @@ retry:
 
 	/* for next read, start checking at next CPU */
 	data->cpu_check = cpu;
-	if (++data->cpu_check == NR_CPUS)
+	if (++data->cpu_check == nr_cpu_ids)
 		data->cpu_check = 0;
 
 	snprintf(cmd, sizeof(cmd), "read %d\n", cpu);
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 865af27c773..ae9ec3dc76b 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -730,10 +730,10 @@ static void *
 c_start (struct seq_file *m, loff_t *pos)
 {
 #ifdef CONFIG_SMP
-	while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map))
+	while (*pos < nr_cpu_ids && !cpu_online(*pos))
 		++*pos;
 #endif
-	return *pos < NR_CPUS ? cpu_data(*pos) : NULL;
+	return *pos < nr_cpu_ids ? cpu_data(*pos) : NULL;
 }
 
 static void *
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 02c5b8a9fb6..12097776afc 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -750,7 +750,7 @@ nasid_slice_to_cpuid(int nasid, int slice)
 {
 	long cpu;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
 		if (cpuid_to_nasid(cpu) == nasid &&
 					cpuid_to_slice(cpu) == slice)
 			return cpu;
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index e585f9a2afb..209e1eb467d 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -461,7 +461,7 @@ bool sn_cpu_disable_allowed(int cpu)
 
 static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset)
 {
-	if (*offset < NR_CPUS)
+	if (*offset < nr_cpu_ids)
 		return offset;
 	return NULL;
 }
@@ -469,7 +469,7 @@ static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset)
 static void *sn2_ptc_seq_next(struct seq_file *file, void *data, loff_t * offset)
 {
 	(*offset)++;
-	if (*offset < NR_CPUS)
+	if (*offset < nr_cpu_ids)
 		return offset;
 	return NULL;
 }
@@ -491,7 +491,7 @@ static int sn2_ptc_seq_show(struct seq_file *file, void *data)
 		seq_printf(file, "# ptctest %d, flushopt %d\n", sn2_ptctest, sn2_flush_opt);
 	}
 
-	if (cpu < NR_CPUS && cpu_online(cpu)) {
+	if (cpu < nr_cpu_ids && cpu_online(cpu)) {
 		stat = &per_cpu(ptcstats, cpu);
 		seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l,
 				stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed,
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index be339477f90..45f3c239042 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -612,7 +612,7 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
 	op_info->a->arg &= SN_HWPERF_ARG_OBJID_MASK;
 
 	if (cpu != SN_HWPERF_ARG_ANY_CPU) {
-		if (cpu >= NR_CPUS || !cpu_online(cpu)) {
+		if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
 			r = -EINVAL;
 			goto out;
 		}
-- 
cgit v1.2.3


From 2af51a3f817a22661fcb52da7c96d078a699f40f Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:12:43 +1030
Subject: cpumask: Use accessors code.: ia64

Impact: use new API

Use the accessors rather than frobbing bits directly.  Most of this is
in arch code I haven't even compiled, but is straightforward.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/ia64/kernel/acpi.c    |  2 +-
 arch/ia64/kernel/smpboot.c | 17 +++++++----------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index c4f41aca107..2f19d91b0b8 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -886,7 +886,7 @@ __init void prefill_possible_map(void)
 		possible, max((possible - available_cpus), 0));
 
 	for (i = 0; i < possible; i++)
-		cpu_set(i, cpu_possible_map);
+		set_cpu_possible(i, true);
 }
 
 int acpi_map_lsapic(acpi_handle handle, int *pcpu)
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 52290547c85..7700e23034b 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -581,14 +581,14 @@ smp_build_cpu_map (void)
 
 	ia64_cpu_to_sapicid[0] = boot_cpu_id;
 	cpus_clear(cpu_present_map);
-	cpu_set(0, cpu_present_map);
-	cpu_set(0, cpu_possible_map);
+	set_cpu_present(0, true);
+	set_cpu_possible(0, true);
 	for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) {
 		sapicid = smp_boot_data.cpu_phys_id[i];
 		if (sapicid == boot_cpu_id)
 			continue;
-		cpu_set(cpu, cpu_present_map);
-		cpu_set(cpu, cpu_possible_map);
+		set_cpu_present(cpu, true);
+		set_cpu_possible(cpu, true);
 		ia64_cpu_to_sapicid[cpu] = sapicid;
 		cpu++;
 	}
@@ -626,12 +626,9 @@ smp_prepare_cpus (unsigned int max_cpus)
 	 */
 	if (!max_cpus) {
 		printk(KERN_INFO "SMP mode deactivated.\n");
-		cpus_clear(cpu_online_map);
-		cpus_clear(cpu_present_map);
-		cpus_clear(cpu_possible_map);
-		cpu_set(0, cpu_online_map);
-		cpu_set(0, cpu_present_map);
-		cpu_set(0, cpu_possible_map);
+		init_cpu_online(cpumask_of(0));
+		init_cpu_present(cpumask_of(0));
+		init_cpu_possible(cpumask_of(0));
 		return;
 	}
 }
-- 
cgit v1.2.3


From 5d8c39f68e1dc78c1a958e28bc685a5bac125b21 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:12:48 +1030
Subject: cpumask: use mm_cpumask() wrapper: ia64

Makes code futureproof against the impending change to mm->cpu_vm_mask.

It's also a chance to use the new cpumask_ ops which take a pointer
(the older ones are deprecated, but there's no hurry for arch code).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/ia64/include/asm/mmu_context.h | 6 +++---
 arch/ia64/mm/tlb.c                  | 2 +-
 arch/ia64/sn/kernel/sn2/sn2_smp.c   | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/ia64/include/asm/mmu_context.h b/arch/ia64/include/asm/mmu_context.h
index 040bc87db93..7f2a456603c 100644
--- a/arch/ia64/include/asm/mmu_context.h
+++ b/arch/ia64/include/asm/mmu_context.h
@@ -87,7 +87,7 @@ get_mmu_context (struct mm_struct *mm)
 	/* re-check, now that we've got the lock: */
 	context = mm->context;
 	if (context == 0) {
-		cpus_clear(mm->cpu_vm_mask);
+		cpumask_clear(mm_cpumask(mm));
 		if (ia64_ctx.next >= ia64_ctx.limit) {
 			ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap,
 					ia64_ctx.max_ctx, ia64_ctx.next);
@@ -166,8 +166,8 @@ activate_context (struct mm_struct *mm)
 
 	do {
 		context = get_mmu_context(mm);
-		if (!cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
-			cpu_set(smp_processor_id(), mm->cpu_vm_mask);
+		if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm)))
+			cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
 		reload_context(context);
 		/*
 		 * in the unlikely event of a TLB-flush by another thread,
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
index bd9818a36b4..b9f3d7bbb33 100644
--- a/arch/ia64/mm/tlb.c
+++ b/arch/ia64/mm/tlb.c
@@ -309,7 +309,7 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start,
 
 	preempt_disable();
 #ifdef CONFIG_SMP
-	if (mm != current->active_mm || cpus_weight(mm->cpu_vm_mask) != 1) {
+	if (mm != current->active_mm || cpumask_weight(mm_cpumask(mm)) != 1) {
 		platform_global_tlb_purge(mm, start, end, nbits);
 		preempt_enable();
 		return;
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index 209e1eb467d..3c2f242d90c 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -133,7 +133,7 @@ sn2_ipi_flush_all_tlb(struct mm_struct *mm)
 	unsigned long itc;
 
 	itc = ia64_get_itc();
-	smp_flush_tlb_cpumask(mm->cpu_vm_mask);
+	smp_flush_tlb_cpumask(*mm_cpumask(mm));
 	itc = ia64_get_itc() - itc;
 	__get_cpu_var(ptcstats).shub_ipi_flushes_itc_clocks += itc;
 	__get_cpu_var(ptcstats).shub_ipi_flushes++;
@@ -182,7 +182,7 @@ sn2_global_tlb_purge(struct mm_struct *mm, unsigned long start,
 	nodes_clear(nodes_flushed);
 	i = 0;
 
-	for_each_cpu_mask(cpu, mm->cpu_vm_mask) {
+	for_each_cpu(cpu, mm_cpumask(mm)) {
 		cnode = cpu_to_node(cpu);
 		node_set(cnode, nodes_flushed);
 		lcpu = cpu;
-- 
cgit v1.2.3


From fd8e18e9f486bcbdd8e0d817e6aa8622a5034540 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:40:22 +1030
Subject: cpumask: Use smp_call_function_many(): sparc64

Impact: Use new API

Change smp_call_function_mask() callers to smp_call_function_many().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/sparc/kernel/smp_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 6cd1a5b6506..a4713e77ce4 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -850,7 +850,7 @@ static void tsb_sync(void *info)
 
 void smp_tsb_sync(struct mm_struct *mm)
 {
-	smp_call_function_mask(mm->cpu_vm_mask, tsb_sync, mm, 1);
+	smp_call_function_many(&mm->cpu_vm_mask, tsb_sync, mm, 1);
 }
 
 extern unsigned long xcall_flush_tlb_mm;
-- 
cgit v1.2.3


From f46df02a5799460e74bcb5a3875d4245978f3bd2 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:40:22 +1030
Subject: cpumask: arch_send_call_function_ipi_mask: sparc

We're weaning the core code off handing cpumask's around on-stack.
This introduces arch_send_call_function_ipi_mask(), and by defining
it, the old arch_send_call_function_ipi is defined by the core code.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/sparc/include/asm/smp_64.h | 3 ++-
 arch/sparc/kernel/smp_64.c      | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/sparc/include/asm/smp_64.h b/arch/sparc/include/asm/smp_64.h
index 57224dd37b3..becb6bf353a 100644
--- a/arch/sparc/include/asm/smp_64.h
+++ b/arch/sparc/include/asm/smp_64.h
@@ -35,7 +35,8 @@ extern cpumask_t cpu_core_map[NR_CPUS];
 extern int sparc64_multi_core;
 
 extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi(cpumask_t mask);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 
 /*
  *	General functions that each host system must provide.
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index a4713e77ce4..4e17eec4147 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -808,9 +808,9 @@ static void smp_start_sync_tick_client(int cpu)
 
 extern unsigned long xcall_call_function;
 
-void arch_send_call_function_ipi(cpumask_t mask)
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
-	xcall_deliver((u64) &xcall_call_function, 0, 0, &mask);
+	xcall_deliver((u64) &xcall_call_function, 0, 0, mask);
 }
 
 extern unsigned long xcall_call_function_single;
-- 
cgit v1.2.3


From fe73971cdd9287eba5f834eb3794768c22718581 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:40:22 +1030
Subject: cpumask: Use accessors code: sparc

Impact: use new API

Use the accessors rather than frobbing bits directly.  Most of this is
in arch code I haven't even compiled, but it is mostly straightforward.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/sparc/kernel/smp_32.c    | 8 ++++----
 arch/sparc/kernel/sun4d_smp.c | 2 +-
 arch/sparc/kernel/sun4m_smp.c | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 1e5ac4e282e..88f43c5ba38 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -332,8 +332,8 @@ void __init smp_setup_cpu_possible_map(void)
 	instance = 0;
 	while (!cpu_find_by_instance(instance, NULL, &mid)) {
 		if (mid < NR_CPUS) {
-			cpu_set(mid, cpu_possible_map);
-			cpu_set(mid, cpu_present_map);
+			set_cpu_possible(mid, true);
+			set_cpu_present(mid, true);
 		}
 		instance++;
 	}
@@ -351,8 +351,8 @@ void __init smp_prepare_boot_cpu(void)
 		printk("boot cpu id != 0, this could work but is untested\n");
 
 	current_thread_info()->cpu = cpuid;
-	cpu_set(cpuid, cpu_online_map);
-	cpu_set(cpuid, cpu_possible_map);
+	set_cpu_online(cpuid, true);
+	set_cpu_possible(cpuid, true);
 }
 
 int __cpuinit __cpu_up(unsigned int cpu)
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c
index 50afaed99c8..e85e6aa1abd 100644
--- a/arch/sparc/kernel/sun4d_smp.c
+++ b/arch/sparc/kernel/sun4d_smp.c
@@ -150,7 +150,7 @@ void __cpuinit smp4d_callin(void)
 	spin_lock_irqsave(&sun4d_imsk_lock, flags);
 	cc_set_imsk(cc_get_imsk() & ~0x4000); /* Allow PIL 14 as well */
 	spin_unlock_irqrestore(&sun4d_imsk_lock, flags);
-	cpu_set(cpuid, cpu_online_map);
+	set_cpu_online(cpuid, true);
 
 }
 
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index 8040376c489..2a8f4fc4056 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -113,7 +113,7 @@ void __cpuinit smp4m_callin(void)
 
 	local_irq_enable();
 
-	cpu_set(cpuid, cpu_online_map);
+	set_cpu_online(cpuid, true);
 }
 
 /*
-- 
cgit v1.2.3


From 89229071c049e518668e34b234167d5ed9c94534 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:40:23 +1030
Subject: cpumask: Use accessors code.: sparc64

Impact: use new API

Use the accessors rather than frobbing bits directly.  Most of this is
in arch code I haven't even compiled, but is straightforward.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/sparc/kernel/mdesc.c   | 2 +-
 arch/sparc/kernel/prom_64.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index 3f79f0c23a0..f0e6ed23a46 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -567,7 +567,7 @@ static void __init report_platform_properties(void)
 			max_cpu = NR_CPUS;
 		}
 		for (i = 0; i < max_cpu; i++)
-			cpu_set(i, cpu_possible_map);
+			set_cpu_possible(i, true);
 	}
 #endif
 
diff --git a/arch/sparc/kernel/prom_64.c b/arch/sparc/kernel/prom_64.c
index edecca7b811..ca55c7012f7 100644
--- a/arch/sparc/kernel/prom_64.c
+++ b/arch/sparc/kernel/prom_64.c
@@ -518,8 +518,8 @@ void __init of_fill_in_cpu_data(void)
 		}
 
 #ifdef CONFIG_SMP
-		cpu_set(cpuid, cpu_present_map);
-		cpu_set(cpuid, cpu_possible_map);
+		set_cpu_present(cpuid, true);
+		set_cpu_possible(cpuid, true);
 #endif
 	}
 
-- 
cgit v1.2.3


From e305cb8f09b6e51940f78516f962ea819bc30ccd Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:40:23 +1030
Subject: cpumask: prepare for iterators to only go to
 nr_cpu_ids/nr_cpumask_bits.: sparc64

Impact: cleanup, futureproof

In fact, all cpumask ops will only be valid (in general) for bit
numbers < nr_cpu_ids.  So use that instead of NR_CPUS in various
places.

This is always safe: no cpu number can be >= nr_cpu_ids, and
nr_cpu_ids is initialized to NR_CPUS at boot.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/kernel/ds.c     | 2 +-
 arch/sparc/kernel/irq_64.c | 4 ++--
 arch/sparc/mm/init_64.c    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c
index 57c39843fb2..90350f838f0 100644
--- a/arch/sparc/kernel/ds.c
+++ b/arch/sparc/kernel/ds.c
@@ -653,7 +653,7 @@ static void __cpuinit dr_cpu_data(struct ds_info *dp,
 		if (cpu_list[i] == CPU_SENTINEL)
 			continue;
 
-		if (cpu_list[i] < NR_CPUS)
+		if (cpu_list[i] < nr_cpu_ids)
 			cpu_set(cpu_list[i], mask);
 	}
 
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 1c378d8e90c..640631b170a 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -265,12 +265,12 @@ static int irq_choose_cpu(unsigned int virt_irq)
 		spin_lock_irqsave(&irq_rover_lock, flags);
 
 		while (!cpu_online(irq_rover)) {
-			if (++irq_rover >= NR_CPUS)
+			if (++irq_rover >= nr_cpu_ids)
 				irq_rover = 0;
 		}
 		cpuid = irq_rover;
 		do {
-			if (++irq_rover >= NR_CPUS)
+			if (++irq_rover >= nr_cpu_ids)
 				irq_rover = 0;
 		} while (!cpu_online(irq_rover));
 
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 00373ce2d8f..2c8dfeb7ab0 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1092,7 +1092,7 @@ static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md,
 		if (strcmp(name, "cpu"))
 			continue;
 		id = mdesc_get_property(md, target, "id", NULL);
-		if (*id < NR_CPUS)
+		if (*id < nr_cpu_ids)
 			cpu_set(*id, *mask);
 	}
 }
-- 
cgit v1.2.3


From ec7c14bde80a11e325f26b339b8570a929e87223 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:40:24 +1030
Subject: cpumask: prepare for iterators to only go to
 nr_cpu_ids/nr_cpumask_bits.: sparc

Impact: cleanup, futureproof

In fact, all cpumask ops will only be valid (in general) for bit
numbers < nr_cpu_ids.  So use that instead of NR_CPUS in various
places.

This is always safe: no cpu number can be >= nr_cpu_ids, and
nr_cpu_ids is initialized to NR_CPUS at boot.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/kernel/smp_32.c    | 11 +++++------
 arch/sparc/kernel/sun4d_smp.c |  9 ++++-----
 arch/sparc/kernel/sun4m_smp.c |  8 +++-----
 arch/sparc/mm/srmmu.c         |  2 +-
 4 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 88f43c5ba38..be1ae37e773 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -70,13 +70,12 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	extern void smp4m_smp_done(void);
 	extern void smp4d_smp_done(void);
 	unsigned long bogosum = 0;
-	int cpu, num;
+	int cpu, num = 0;
 
-	for (cpu = 0, num = 0; cpu < NR_CPUS; cpu++)
-		if (cpu_online(cpu)) {
-			num++;
-			bogosum += cpu_data(cpu).udelay_val;
-		}
+	for_each_online_cpu(cpu) {
+		num++;
+		bogosum += cpu_data(cpu).udelay_val;
+	}
 
 	printk("Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
 		num, bogosum/(500000/HZ),
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c
index e85e6aa1abd..54fb02468f0 100644
--- a/arch/sparc/kernel/sun4d_smp.c
+++ b/arch/sparc/kernel/sun4d_smp.c
@@ -228,11 +228,10 @@ void __init smp4d_smp_done(void)
 	/* setup cpu list for irq rotation */
 	first = 0;
 	prev = &first;
-	for (i = 0; i < NR_CPUS; i++)
-		if (cpu_online(i)) {
-			*prev = i;
-			prev = &cpu_data(i).next;
-		}
+	for_each_online_cpu(i) {
+		*prev = i;
+		prev = &cpu_data(i).next;
+	}
 	*prev = first;
 	local_flush_cache_all();
 
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index 2a8f4fc4056..960b113d000 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -186,11 +186,9 @@ void __init smp4m_smp_done(void)
 	/* setup cpu list for irq rotation */
 	first = 0;
 	prev = &first;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_online(i)) {
-			*prev = i;
-			prev = &cpu_data(i).next;
-		}
+	for_each_online_cpu(i) {
+		*prev = i;
+		prev = &cpu_data(i).next;
 	}
 	*prev = first;
 	local_flush_cache_all();
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index fe7ed08390b..06c9a7d9820 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -1425,7 +1425,7 @@ static void __init init_vac_layout(void)
 				min_line_size = vac_line_size;
 			//FIXME: cpus not contiguous!!
 			cpu++;
-			if (cpu >= NR_CPUS || !cpu_online(cpu))
+			if (cpu >= nr_cpu_ids || !cpu_online(cpu))
 				break;
 #else
 			break;
-- 
cgit v1.2.3


From 7b45101d09ab13eafc0c3a5232e1606654d122ea Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:40:25 +1030
Subject: cpumask: remove cpu_coregroup_map: sparc

Impact: cleanup

cpu_coregroup_mask is the New Hotness.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/sparc/include/asm/topology_64.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 5bc0b8fd637..430ce3920f9 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -89,7 +89,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 #define smt_capable()				(sparc64_multi_core)
 #endif /* CONFIG_SMP */
 
-#define cpu_coregroup_map(cpu)			(cpu_core_map[cpu])
 #define cpu_coregroup_mask(cpu)			(&cpu_core_map[cpu])
 
 #endif /* _ASM_SPARC64_TOPOLOGY_H */
-- 
cgit v1.2.3


From cc301d261f5d49cbff66b2f459f58f2652899cdb Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:40:28 +1030
Subject: cpumask: remove the now-obsoleted pcibus_to_cpumask(): sparc

Impact: reduce stack usage for large NR_CPUS

cpumask_of_pcibus() is the new version.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/sparc/include/asm/topology_64.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 430ce3920f9..a5ba4ad1230 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -43,10 +43,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 }
 #endif
 
-#define pcibus_to_cpumask(bus)	\
-	(pcibus_to_node(bus) == -1 ? \
-	 CPU_MASK_ALL : \
-	 node_to_cpumask(pcibus_to_node(bus)))
 #define cpumask_of_pcibus(bus)	\
 	(pcibus_to_node(bus) == -1 ? \
 	 CPU_MASK_ALL_PTR : \
-- 
cgit v1.2.3


From e9b375120b593d3081c2f63e8ccf350cccc8fd68 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:40:38 +1030
Subject: cpumask: remove dangerous CPU_MASK_ALL_PTR, &CPU_MASK_ALL.: sparc

Impact: cleanup

(Thanks to Al Viro for reminding me of this, via Ingo)

CPU_MASK_ALL is the (deprecated) "all bits set" cpumask, defined as so:

	#define CPU_MASK_ALL (cpumask_t) { { ... } }

Taking the address of such a temporary is questionable at best,
unfortunately 321a8e9d (cpumask: add CPU_MASK_ALL_PTR macro) added
CPU_MASK_ALL_PTR:

	#define CPU_MASK_ALL_PTR (&CPU_MASK_ALL)

Which formalizes this practice.  One day gcc could bite us over this
usage (though we seem to have gotten away with it so far).

So replace everywhere which used &CPU_MASK_ALL or CPU_MASK_ALL_PTR
with the modern "cpu_all_mask" (a real struct cpumask *), and remove
CPU_MASK_ALL_PTR altogether.

Also remove the confusing and deprecated large-NR_CPUS-only
"cpu_mask_all".

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Mike Travis <travis@sgi.com>
---
 arch/sparc/include/asm/topology_64.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index a5ba4ad1230..39624abb6a4 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -45,7 +45,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 
 #define cpumask_of_pcibus(bus)	\
 	(pcibus_to_node(bus) == -1 ? \
-	 CPU_MASK_ALL_PTR : \
+	 cpu_all_mask : \
 	 cpumask_of_node(pcibus_to_node(bus)))
 
 #define SD_NODE_INIT (struct sched_domain) {		\
-- 
cgit v1.2.3


From 81f1adf01224f5c0be5f90f43664f799c1f7bb2d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 16 Mar 2009 14:40:39 +1030
Subject: cpumask: use mm_cpumask() wrapper: sparc

Makes code futureproof against the impending change to mm->cpu_vm_mask.

It's also a chance to use the new cpumask_ ops which take a pointer
(the older ones are deprecated, but there's no hurry for arch code).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/sparc/include/asm/mmu_context_64.h |  8 ++++----
 arch/sparc/include/asm/system_32.h      |  2 +-
 arch/sparc/kernel/smp_32.c              | 17 +++++++++--------
 arch/sparc/kernel/smp_64.c              | 10 +++++-----
 4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h
index 5693ab48260..666a73fef28 100644
--- a/arch/sparc/include/asm/mmu_context_64.h
+++ b/arch/sparc/include/asm/mmu_context_64.h
@@ -121,8 +121,8 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
 	 * local TLB.
 	 */
 	cpu = smp_processor_id();
-	if (!ctx_valid || !cpu_isset(cpu, mm->cpu_vm_mask)) {
-		cpu_set(cpu, mm->cpu_vm_mask);
+	if (!ctx_valid || !cpumask_test_cpu(cpu, mm_cpumask(mm))) {
+		cpumask_set_cpu(cpu, mm_cpumask(mm));
 		__flush_tlb_mm(CTX_HWBITS(mm->context),
 			       SECONDARY_CONTEXT);
 	}
@@ -141,8 +141,8 @@ static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm
 	if (!CTX_VALID(mm->context))
 		get_new_mmu_context(mm);
 	cpu = smp_processor_id();
-	if (!cpu_isset(cpu, mm->cpu_vm_mask))
-		cpu_set(cpu, mm->cpu_vm_mask);
+	if (!cpumask_test_cpu(cpu, mm_cpumask(mm)))
+		cpumask_set_cpu(cpu, mm_cpumask(mm));
 
 	load_secondary_context(mm);
 	__flush_tlb_mm(CTX_HWBITS(mm->context), SECONDARY_CONTEXT);
diff --git a/arch/sparc/include/asm/system_32.h b/arch/sparc/include/asm/system_32.h
index 79c1ae2b42a..751c8c17f5a 100644
--- a/arch/sparc/include/asm/system_32.h
+++ b/arch/sparc/include/asm/system_32.h
@@ -126,7 +126,7 @@ extern void flushw_all(void);
 #define switch_to(prev, next, last) do {						\
 	SWITCH_ENTER(prev);								\
 	SWITCH_DO_LAZY_FPU(next);							\
-	cpu_set(smp_processor_id(), next->active_mm->cpu_vm_mask);			\
+	cpumask_set_cpu(smp_processor_id(), mm_cpumask(next->active_mm));		\
 	__asm__ __volatile__(								\
 	"sethi	%%hi(here - 0x8), %%o7\n\t"						\
 	"mov	%%g6, %%g3\n\t"								\
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index be1ae37e773..132d81fb261 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -143,7 +143,7 @@ void smp_flush_tlb_all(void)
 void smp_flush_cache_mm(struct mm_struct *mm)
 {
 	if(mm->context != NO_CONTEXT) {
-		cpumask_t cpu_mask = mm->cpu_vm_mask;
+		cpumask_t cpu_mask = *mm_cpumask(mm);
 		cpu_clear(smp_processor_id(), cpu_mask);
 		if (!cpus_empty(cpu_mask))
 			xc1((smpfunc_t) BTFIXUP_CALL(local_flush_cache_mm), (unsigned long) mm);
@@ -154,12 +154,13 @@ void smp_flush_cache_mm(struct mm_struct *mm)
 void smp_flush_tlb_mm(struct mm_struct *mm)
 {
 	if(mm->context != NO_CONTEXT) {
-		cpumask_t cpu_mask = mm->cpu_vm_mask;
+		cpumask_t cpu_mask = *mm_cpumask(mm);
 		cpu_clear(smp_processor_id(), cpu_mask);
 		if (!cpus_empty(cpu_mask)) {
 			xc1((smpfunc_t) BTFIXUP_CALL(local_flush_tlb_mm), (unsigned long) mm);
 			if(atomic_read(&mm->mm_users) == 1 && current->active_mm == mm)
-				mm->cpu_vm_mask = cpumask_of_cpu(smp_processor_id());
+				cpumask_copy(mm_cpumask(mm),
+					     cpumask_of(smp_processor_id()));
 		}
 		local_flush_tlb_mm(mm);
 	}
@@ -171,7 +172,7 @@ void smp_flush_cache_range(struct vm_area_struct *vma, unsigned long start,
 	struct mm_struct *mm = vma->vm_mm;
 
 	if (mm->context != NO_CONTEXT) {
-		cpumask_t cpu_mask = mm->cpu_vm_mask;
+		cpumask_t cpu_mask = *mm_cpumask(mm);
 		cpu_clear(smp_processor_id(), cpu_mask);
 		if (!cpus_empty(cpu_mask))
 			xc3((smpfunc_t) BTFIXUP_CALL(local_flush_cache_range), (unsigned long) vma, start, end);
@@ -185,7 +186,7 @@ void smp_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 	struct mm_struct *mm = vma->vm_mm;
 
 	if (mm->context != NO_CONTEXT) {
-		cpumask_t cpu_mask = mm->cpu_vm_mask;
+		cpumask_t cpu_mask = *mm_cpumask(mm);
 		cpu_clear(smp_processor_id(), cpu_mask);
 		if (!cpus_empty(cpu_mask))
 			xc3((smpfunc_t) BTFIXUP_CALL(local_flush_tlb_range), (unsigned long) vma, start, end);
@@ -198,7 +199,7 @@ void smp_flush_cache_page(struct vm_area_struct *vma, unsigned long page)
 	struct mm_struct *mm = vma->vm_mm;
 
 	if(mm->context != NO_CONTEXT) {
-		cpumask_t cpu_mask = mm->cpu_vm_mask;
+		cpumask_t cpu_mask = *mm_cpumask(mm);
 		cpu_clear(smp_processor_id(), cpu_mask);
 		if (!cpus_empty(cpu_mask))
 			xc2((smpfunc_t) BTFIXUP_CALL(local_flush_cache_page), (unsigned long) vma, page);
@@ -211,7 +212,7 @@ void smp_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
 	struct mm_struct *mm = vma->vm_mm;
 
 	if(mm->context != NO_CONTEXT) {
-		cpumask_t cpu_mask = mm->cpu_vm_mask;
+		cpumask_t cpu_mask = *mm_cpumask(mm);
 		cpu_clear(smp_processor_id(), cpu_mask);
 		if (!cpus_empty(cpu_mask))
 			xc2((smpfunc_t) BTFIXUP_CALL(local_flush_tlb_page), (unsigned long) vma, page);
@@ -240,7 +241,7 @@ void smp_flush_page_to_ram(unsigned long page)
 
 void smp_flush_sig_insns(struct mm_struct *mm, unsigned long insn_addr)
 {
-	cpumask_t cpu_mask = mm->cpu_vm_mask;
+	cpumask_t cpu_mask = *mm_cpumask(mm);
 	cpu_clear(smp_processor_id(), cpu_mask);
 	if (!cpus_empty(cpu_mask))
 		xc2((smpfunc_t) BTFIXUP_CALL(local_flush_sig_insns), (unsigned long) mm, insn_addr);
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 4e17eec4147..2de937c7232 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -850,7 +850,7 @@ static void tsb_sync(void *info)
 
 void smp_tsb_sync(struct mm_struct *mm)
 {
-	smp_call_function_many(&mm->cpu_vm_mask, tsb_sync, mm, 1);
+	smp_call_function_many(mm_cpumask(mm), tsb_sync, mm, 1);
 }
 
 extern unsigned long xcall_flush_tlb_mm;
@@ -1055,13 +1055,13 @@ void smp_flush_tlb_mm(struct mm_struct *mm)
 	int cpu = get_cpu();
 
 	if (atomic_read(&mm->mm_users) == 1) {
-		mm->cpu_vm_mask = cpumask_of_cpu(cpu);
+		cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
 		goto local_flush_and_out;
 	}
 
 	smp_cross_call_masked(&xcall_flush_tlb_mm,
 			      ctx, 0, 0,
-			      &mm->cpu_vm_mask);
+			      mm_cpumask(mm));
 
 local_flush_and_out:
 	__flush_tlb_mm(ctx, SECONDARY_CONTEXT);
@@ -1075,11 +1075,11 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
 	int cpu = get_cpu();
 
 	if (mm == current->active_mm && atomic_read(&mm->mm_users) == 1)
-		mm->cpu_vm_mask = cpumask_of_cpu(cpu);
+		cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
 	else
 		smp_cross_call_masked(&xcall_flush_tlb_pending,
 				      ctx, nr, (unsigned long) vaddrs,
-				      &mm->cpu_vm_mask);
+				      mm_cpumask(mm));
 
 	__flush_tlb_pending(ctx, nr, vaddrs);
 
-- 
cgit v1.2.3


From afd4672dc7610b7feef5190168aa917cc2e417e4 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 16 Mar 2009 23:12:23 -0400
Subject: ext4: Add auto_da_alloc mount option

Add a mount option which allows the user to disable automatic
allocation of blocks whose allocation by delayed allocation when the
file was originally truncated or when the file is renamed over an
existing file.  This feature is intended to save users from the
effects of naive application writers, but it reduces the effectiveness
of the delayed allocation code.  This mount option disables this
safety feature, which may be desirable for prodcutions systems where
the risk of unclean shutdowns or unexpected system crashes is low.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  2 +-
 fs/ext4/inode.c |  2 +-
 fs/ext4/namei.c |  3 ++-
 fs/ext4/super.c | 25 +++++++++++++------------
 4 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 46aaaa2ed4c..a004699e729 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -549,7 +549,7 @@ do {									       \
 #define EXT4_MOUNT_NO_UID32		0x02000  /* Disable 32-bit UIDs */
 #define EXT4_MOUNT_XATTR_USER		0x04000	/* Extended user attributes */
 #define EXT4_MOUNT_POSIX_ACL		0x08000	/* POSIX Access Control Lists */
-#define EXT4_MOUNT_RESERVATION		0x10000	/* Preallocation */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC	0x10000	/* No auto delalloc mapping */
 #define EXT4_MOUNT_BARRIER		0x20000 /* Use block barriers */
 #define EXT4_MOUNT_NOBH			0x40000 /* No bufferheads */
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d3118d1acc3..bed4a0abd0d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3908,7 +3908,7 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
-	if (inode->i_size == 0)
+	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
 
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index eb20246c896..22098e1cd08 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2497,7 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		ext4_mark_inode_dirty(handle, new_inode);
 		if (!new_inode->i_nlink)
 			ext4_orphan_add(handle, new_inode);
-		force_da_alloc = 1;
+		if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
+			force_da_alloc = 1;
 	}
 	retval = 0;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b9aefceb41e..45d07cf9df3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -816,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
 		seq_puts(seq, ",noacl");
 #endif
-	if (!test_opt(sb, RESERVATION))
-		seq_puts(seq, ",noreservation");
 	if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
 		seq_printf(seq, ",commit=%u",
 			   (unsigned) (sbi->s_commit_interval / HZ));
@@ -868,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, DATA_ERR_ABORT))
 		seq_puts(seq, ",data_err=abort");
 
+	if (test_opt(sb, NO_AUTO_DA_ALLOC))
+		seq_puts(seq, ",auto_da_alloc=0");
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -1017,7 +1018,7 @@ enum {
 	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
 	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
-	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
+	Opt_auto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
 	Opt_journal_update, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
@@ -1052,8 +1053,6 @@ static const match_table_t tokens = {
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
-	{Opt_reservation, "reservation"},
-	{Opt_noreservation, "noreservation"},
 	{Opt_noload, "noload"},
 	{Opt_nobh, "nobh"},
 	{Opt_bh, "bh"},
@@ -1088,6 +1087,7 @@ static const match_table_t tokens = {
 	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_journal_ioprio, "journal_ioprio=%u"},
+	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
 	{Opt_err, NULL},
 };
 
@@ -1220,12 +1220,6 @@ static int parse_options(char *options, struct super_block *sb,
 			       "not supported\n");
 			break;
 #endif
-		case Opt_reservation:
-			set_opt(sbi->s_mount_opt, RESERVATION);
-			break;
-		case Opt_noreservation:
-			clear_opt(sbi->s_mount_opt, RESERVATION);
-			break;
 		case Opt_journal_update:
 			/* @@@ FIXME */
 			/* Eventually we will want to be able to create
@@ -1491,6 +1485,14 @@ set_qf_format:
 			*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
 							    option);
 			break;
+		case Opt_auto_da_alloc:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option)
+				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+			else
+				set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+			break;
 		default:
 			printk(KERN_ERR
 			       "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -2306,7 +2308,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
-	set_opt(sbi->s_mount_opt, RESERVATION);
 	set_opt(sbi->s_mount_opt, BARRIER);
 
 	/*
-- 
cgit v1.2.3


From 7d1e8255cf959fba7ee2317550dfde39f0b936ae Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:38:03 -0400
Subject: SUNRPC: Add the equivalent of the linger and linger2 timeouts to RPC
 sockets

This fixes a regression against FreeBSD servers as reported by Tomas
Kasparek. Apparently when using RPC over a TCP socket, the FreeBSD servers
don't ever react to the client closing the socket, and so commit
e06799f958bf7f9f8fae15f0c6f519953fb0257c (SUNRPC: Use shutdown() instead of
close() when disconnecting a TCP socket) causes the setup to hang forever
whenever the client attempts to close and then reconnect.

We break the deadlock by adding a 'linger2' style timeout to the socket,
after which, the client will abort the connection using a TCP 'RST'.

The default timeout is set to 15 seconds. A subsequent patch will put it
under user control by means of a systctl.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h |  1 +
 net/sunrpc/xprtsock.c       | 98 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 2b0d960603b..1758d9f5b5c 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -260,6 +260,7 @@ void			xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
 #define XPRT_BOUND		(4)
 #define XPRT_BINDING		(5)
 #define XPRT_CLOSING		(6)
+#define XPRT_CONNECTION_ABORT	(7)
 
 static inline void xprt_set_connected(struct rpc_xprt *xprt)
 {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 2e070679ab4..b51f58b95c3 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -49,6 +49,8 @@ unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE;
 unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
 unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
 
+#define XS_TCP_LINGER_TO	(15U * HZ)
+
 /*
  * We can register our own files under /proc/sys/sunrpc by
  * calling register_sysctl_table() again.  The files in that
@@ -806,6 +808,7 @@ static void xs_close(struct rpc_xprt *xprt)
 	xs_reset_transport(transport);
 
 	smp_mb__before_clear_bit();
+	clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
 	clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
 	clear_bit(XPRT_CLOSING, &xprt->state);
 	smp_mb__after_clear_bit();
@@ -1133,6 +1136,47 @@ out:
 	read_unlock(&sk->sk_callback_lock);
 }
 
+/*
+ * Do the equivalent of linger/linger2 handling for dealing with
+ * broken servers that don't close the socket in a timely
+ * fashion
+ */
+static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt,
+		unsigned long timeout)
+{
+	struct sock_xprt *transport;
+
+	if (xprt_test_and_set_connecting(xprt))
+		return;
+	set_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+	transport = container_of(xprt, struct sock_xprt, xprt);
+	queue_delayed_work(rpciod_workqueue, &transport->connect_worker,
+			   timeout);
+}
+
+static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *transport;
+
+	transport = container_of(xprt, struct sock_xprt, xprt);
+
+	if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) ||
+	    !cancel_delayed_work(&transport->connect_worker))
+		return;
+	clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+	xprt_clear_connecting(xprt);
+}
+
+static void xs_sock_mark_closed(struct rpc_xprt *xprt)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+	clear_bit(XPRT_CLOSING, &xprt->state);
+	smp_mb__after_clear_bit();
+	/* Mark transport as closed and wake up all pending tasks */
+	xprt_disconnect_done(xprt);
+}
+
 /**
  * xs_tcp_state_change - callback to handle TCP socket state changes
  * @sk: socket whose state has changed
@@ -1178,6 +1222,7 @@ static void xs_tcp_state_change(struct sock *sk)
 		clear_bit(XPRT_CONNECTED, &xprt->state);
 		clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
 		smp_mb__after_clear_bit();
+		xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO);
 		break;
 	case TCP_CLOSE_WAIT:
 		/* The server initiated a shutdown of the socket */
@@ -1194,17 +1239,14 @@ static void xs_tcp_state_change(struct sock *sk)
 		break;
 	case TCP_LAST_ACK:
 		set_bit(XPRT_CLOSING, &xprt->state);
+		xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO);
 		smp_mb__before_clear_bit();
 		clear_bit(XPRT_CONNECTED, &xprt->state);
 		smp_mb__after_clear_bit();
 		break;
 	case TCP_CLOSE:
-		smp_mb__before_clear_bit();
-		clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
-		clear_bit(XPRT_CLOSING, &xprt->state);
-		smp_mb__after_clear_bit();
-		/* Mark transport as closed and wake up all pending tasks */
-		xprt_disconnect_done(xprt);
+		xs_tcp_cancel_linger_timeout(xprt);
+		xs_sock_mark_closed(xprt);
 	}
  out:
 	read_unlock(&sk->sk_callback_lock);
@@ -1562,8 +1604,8 @@ static void xs_udp_connect_worker4(struct work_struct *work)
 	xs_udp_finish_connecting(xprt, sock);
 	status = 0;
 out:
-	xprt_wake_pending_tasks(xprt, status);
 	xprt_clear_connecting(xprt);
+	xprt_wake_pending_tasks(xprt, status);
 }
 
 /**
@@ -1604,8 +1646,8 @@ static void xs_udp_connect_worker6(struct work_struct *work)
 	xs_udp_finish_connecting(xprt, sock);
 	status = 0;
 out:
-	xprt_wake_pending_tasks(xprt, status);
 	xprt_clear_connecting(xprt);
+	xprt_wake_pending_tasks(xprt, status);
 }
 
 /*
@@ -1626,7 +1668,9 @@ static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transpo
 	memset(&any, 0, sizeof(any));
 	any.sa_family = AF_UNSPEC;
 	result = kernel_connect(transport->sock, &any, sizeof(any), 0);
-	if (result)
+	if (!result)
+		xs_sock_mark_closed(xprt);
+	else
 		dprintk("RPC:       AF_UNSPEC connect return code %d\n",
 				result);
 }
@@ -1702,6 +1746,7 @@ static void xs_tcp_connect_worker4(struct work_struct *work)
 		goto out;
 
 	if (!sock) {
+		clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
 		/* start from scratch */
 		if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
 			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
@@ -1713,10 +1758,18 @@ static void xs_tcp_connect_worker4(struct work_struct *work)
 			sock_release(sock);
 			goto out;
 		}
-	} else
+	} else {
+		int abort_and_exit;
+
+		abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT,
+				&xprt->state);
 		/* "close" the socket, preserving the local port */
 		xs_tcp_reuse_connection(xprt, transport);
 
+		if (abort_and_exit)
+			goto out_eagain;
+	}
+
 	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
 			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
 
@@ -1732,17 +1785,18 @@ static void xs_tcp_connect_worker4(struct work_struct *work)
 	case 0:
 	case -EINPROGRESS:
 	case -EALREADY:
-		goto out_clear;
+		xprt_clear_connecting(xprt);
+		return;
 	}
 	/* get rid of existing socket, and retry */
 	xs_tcp_shutdown(xprt);
 	printk("%s: connect returned unhandled error %d\n",
 			__func__, status);
+out_eagain:
 	status = -EAGAIN;
 out:
-	xprt_wake_pending_tasks(xprt, status);
-out_clear:
 	xprt_clear_connecting(xprt);
+	xprt_wake_pending_tasks(xprt, status);
 }
 
 /**
@@ -1763,6 +1817,7 @@ static void xs_tcp_connect_worker6(struct work_struct *work)
 		goto out;
 
 	if (!sock) {
+		clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
 		/* start from scratch */
 		if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
 			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
@@ -1774,10 +1829,18 @@ static void xs_tcp_connect_worker6(struct work_struct *work)
 			sock_release(sock);
 			goto out;
 		}
-	} else
+	} else {
+		int abort_and_exit;
+
+		abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT,
+				&xprt->state);
 		/* "close" the socket, preserving the local port */
 		xs_tcp_reuse_connection(xprt, transport);
 
+		if (abort_and_exit)
+			goto out_eagain;
+	}
+
 	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
 			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
 
@@ -1792,17 +1855,18 @@ static void xs_tcp_connect_worker6(struct work_struct *work)
 	case 0:
 	case -EINPROGRESS:
 	case -EALREADY:
-		goto out_clear;
+		xprt_clear_connecting(xprt);
+		return;
 	}
 	/* get rid of existing socket, and retry */
 	xs_tcp_shutdown(xprt);
 	printk("%s: connect returned unhandled error %d\n",
 			__func__, status);
+out_eagain:
 	status = -EAGAIN;
 out:
-	xprt_wake_pending_tasks(xprt, status);
-out_clear:
 	xprt_clear_connecting(xprt);
+	xprt_wake_pending_tasks(xprt, status);
 }
 
 /**
-- 
cgit v1.2.3


From 25fe6142a57c720452c5e9ddbc1f32309c1e5c19 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:38:03 -0400
Subject: SUNRPC: Add a sysctl to control the duration of the socket linger
 timeout

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b51f58b95c3..42222b4dd76 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -50,6 +50,7 @@ unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
 unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
 
 #define XS_TCP_LINGER_TO	(15U * HZ)
+static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO;
 
 /*
  * We can register our own files under /proc/sys/sunrpc by
@@ -118,6 +119,14 @@ static ctl_table xs_tunables_table[] = {
 		.extra1		= &xprt_min_resvport_limit,
 		.extra2		= &xprt_max_resvport_limit
 	},
+	{
+		.procname	= "tcp_fin_timeout",
+		.data		= &xs_tcp_fin_timeout,
+		.maxlen		= sizeof(xs_tcp_fin_timeout),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= sysctl_jiffies
+	},
 	{
 		.ctl_name = 0,
 	},
@@ -1222,7 +1231,7 @@ static void xs_tcp_state_change(struct sock *sk)
 		clear_bit(XPRT_CONNECTED, &xprt->state);
 		clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
 		smp_mb__after_clear_bit();
-		xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO);
+		xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
 		break;
 	case TCP_CLOSE_WAIT:
 		/* The server initiated a shutdown of the socket */
@@ -1239,7 +1248,7 @@ static void xs_tcp_state_change(struct sock *sk)
 		break;
 	case TCP_LAST_ACK:
 		set_bit(XPRT_CLOSING, &xprt->state);
-		xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO);
+		xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
 		smp_mb__before_clear_bit();
 		clear_bit(XPRT_CONNECTED, &xprt->state);
 		smp_mb__after_clear_bit();
-- 
cgit v1.2.3


From b61d59fffd3e5b6037c92b4c840605831de8a251 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 14:38:04 -0400
Subject: =?UTF-8?q?SUNRPC:=C2=A0xs=5Ftcp=5Fconnect=5Fworker{4,6}:=20merge?=
 =?UTF-8?q?=20common=20code?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 140 ++++++++++++++++++++++++++------------------------
 1 file changed, 72 insertions(+), 68 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 42222b4dd76..f05a56e597e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1738,33 +1738,29 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 }
 
 /**
- * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint
- * @work: RPC transport to connect
+ * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint
+ * @xprt: RPC transport to connect
+ * @transport: socket transport to connect
+ * @create_sock: function to create a socket of the correct type
  *
  * Invoked by a work queue tasklet.
  */
-static void xs_tcp_connect_worker4(struct work_struct *work)
+static void xs_tcp_setup_socket(struct rpc_xprt *xprt,
+		struct sock_xprt *transport,
+		struct socket *(*create_sock)(struct rpc_xprt *,
+			struct sock_xprt *))
 {
-	struct sock_xprt *transport =
-		container_of(work, struct sock_xprt, connect_worker.work);
-	struct rpc_xprt *xprt = &transport->xprt;
 	struct socket *sock = transport->sock;
-	int err, status = -EIO;
+	int status = -EIO;
 
 	if (xprt->shutdown)
 		goto out;
 
 	if (!sock) {
 		clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
-		/* start from scratch */
-		if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
-			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
-			goto out;
-		}
-		xs_reclassify_socket4(sock);
-
-		if (xs_bind4(transport, sock) < 0) {
-			sock_release(sock);
+		sock = create_sock(xprt, transport);
+		if (IS_ERR(sock)) {
+			status = PTR_ERR(sock);
 			goto out;
 		}
 	} else {
@@ -1808,74 +1804,82 @@ out:
 	xprt_wake_pending_tasks(xprt, status);
 }
 
+static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt,
+		struct sock_xprt *transport)
+{
+	struct socket *sock;
+	int err;
+
+	/* start from scratch */
+	err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0) {
+		dprintk("RPC:       can't create TCP transport socket (%d).\n",
+				-err);
+		goto out_err;
+	}
+	xs_reclassify_socket4(sock);
+
+	if (xs_bind4(transport, sock) < 0) {
+		sock_release(sock);
+		goto out_err;
+	}
+	return sock;
+out_err:
+	return ERR_PTR(-EIO);
+}
+
 /**
- * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint
+ * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint
  * @work: RPC transport to connect
  *
  * Invoked by a work queue tasklet.
  */
-static void xs_tcp_connect_worker6(struct work_struct *work)
+static void xs_tcp_connect_worker4(struct work_struct *work)
 {
 	struct sock_xprt *transport =
 		container_of(work, struct sock_xprt, connect_worker.work);
 	struct rpc_xprt *xprt = &transport->xprt;
-	struct socket *sock = transport->sock;
-	int err, status = -EIO;
 
-	if (xprt->shutdown)
-		goto out;
-
-	if (!sock) {
-		clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
-		/* start from scratch */
-		if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
-			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
-			goto out;
-		}
-		xs_reclassify_socket6(sock);
+	xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock4);
+}
 
-		if (xs_bind6(transport, sock) < 0) {
-			sock_release(sock);
-			goto out;
-		}
-	} else {
-		int abort_and_exit;
+static struct socket *xs_create_tcp_sock6(struct rpc_xprt *xprt,
+		struct sock_xprt *transport)
+{
+	struct socket *sock;
+	int err;
 
-		abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT,
-				&xprt->state);
-		/* "close" the socket, preserving the local port */
-		xs_tcp_reuse_connection(xprt, transport);
+	/* start from scratch */
+	err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0) {
+		dprintk("RPC:       can't create TCP transport socket (%d).\n",
+				-err);
+		goto out_err;
+	}
+	xs_reclassify_socket6(sock);
 
-		if (abort_and_exit)
-			goto out_eagain;
+	if (xs_bind6(transport, sock) < 0) {
+		sock_release(sock);
+		goto out_err;
 	}
+	return sock;
+out_err:
+	return ERR_PTR(-EIO);
+}
 
-	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
-			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
+/**
+ * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint
+ * @work: RPC transport to connect
+ *
+ * Invoked by a work queue tasklet.
+ */
+static void xs_tcp_connect_worker6(struct work_struct *work)
+{
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
+	struct rpc_xprt *xprt = &transport->xprt;
 
-	status = xs_tcp_finish_connecting(xprt, sock);
-	dprintk("RPC:       %p connect status %d connected %d sock state %d\n",
-			xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
-	switch (status) {
-	case -ECONNREFUSED:
-	case -ECONNRESET:
-	case -ENETUNREACH:
-		/* retry with existing socket, after a delay */
-	case 0:
-	case -EINPROGRESS:
-	case -EALREADY:
-		xprt_clear_connecting(xprt);
-		return;
-	}
-	/* get rid of existing socket, and retry */
-	xs_tcp_shutdown(xprt);
-	printk("%s: connect returned unhandled error %d\n",
-			__func__, status);
-out_eagain:
-	status = -EAGAIN;
-out:
-	xprt_clear_connecting(xprt);
-	xprt_wake_pending_tasks(xprt, status);
+	xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock6);
 }
 
 /**
-- 
cgit v1.2.3


From 55420c24a0d4d1fce70ca713f84aa00b6b74a70e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 11 Mar 2009 15:29:24 -0400
Subject: SUNRPC: Ensure we close the socket on EPIPE errors too...

As long as one task is holding the socket lock, then calls to
xprt_force_disconnect(xprt) will not succeed in shutting down the socket.
In particular, this would mean that a server initiated shutdown will not
succeed until the lock is relinquished.
In order to avoid the deadlock, we should ensure that xs_tcp_send_request()
closes the socket on EPIPE errors too.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index f05a56e597e..fbc8725c20c 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -726,10 +726,10 @@ static int xs_tcp_send_request(struct rpc_task *task)
 		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
 			-status);
 	case -ECONNRESET:
+	case -EPIPE:
 		xs_tcp_shutdown(xprt);
 	case -ECONNREFUSED:
 	case -ENOTCONN:
-	case -EPIPE:
 		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
 	}
 out:
-- 
cgit v1.2.3


From 2e3c230bc7149a6af65d26a0c312e230e0c33cc3 Mon Sep 17 00:00:00 2001
From: Tom Talpey <tmtalpey@gmail.com>
Date: Thu, 12 Mar 2009 22:21:21 -0400
Subject: SVCRDMA: fix recent printk format warnings.

printk formats in prior commit were reversed/incorrect.
Compiled without warning on x86 and x86_64, but detected on ppc.

Signed-off-by: Tom Talpey <tmtalpey@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index d0bea987d80..6c26a675435 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -235,7 +235,7 @@ static int map_xdr(struct svcxprt_rdma *xprt,
 	}
 
 	dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
-		"page_base %zd page_len %zd head_len %d tail_len %d\n",
+		"page_base %u page_len %u head_len %zu tail_len %zu\n",
 		sge_no, page_no, xdr->page_base, xdr->page_len,
 		xdr->head[0].iov_len, xdr->tail[0].iov_len);
 
-- 
cgit v1.2.3


From 47c62564200609b6de60f535f61f0c73dd10c7c9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 16 Mar 2009 08:13:41 -0400
Subject: NFS: Fix up a mismerged patch

Move the definition of nfs_need_commit() into the #ifdef CONFIG_NFS_V3
section as originally intended in the patch "NFS: cleanup - remove
struct nfs_inode->ncommit"

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 36fd35e0de8..e560a78995a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -537,13 +537,13 @@ static void nfs_cancel_commit_list(struct list_head *head)
 	}
 }
 
+#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static int
 nfs_need_commit(struct nfs_inode *nfsi)
 {
 	return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
 }
 
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 /*
  * nfs_scan_commit - Scan an inode for commit requests
  * @inode: NFS inode to scan
-- 
cgit v1.2.3


From b1e4adf4ea41bb8b5a7bfc1a7001f137e65495df Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 19 Mar 2009 15:35:49 -0400
Subject: NFS: Fix the notifications when renaming onto an existing file

NFS appears to be returning an unnecessary "delete" notification when
we're doing an atomic rename. See

  http://bugzilla.gnome.org/show_bug.cgi?id=575684

The fix is to get rid of the redundant call to d_delete().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 672368f865c..3b2f6973e7c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		} else if (atomic_read(&new_dentry->d_count) > 1)
 			/* dentry still busy? */
 			goto out;
-	} else
-		nfs_drop_nlink(new_inode);
+	}
 
 go_ahead:
 	/*
@@ -1638,10 +1637,8 @@ go_ahead:
 	}
 	nfs_inode_return_delegation(old_inode);
 
-	if (new_inode != NULL) {
+	if (new_inode != NULL)
 		nfs_inode_return_delegation(new_inode);
-		d_delete(new_dentry);
-	}
 
 	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
 					   new_dir, &new_dentry->d_name);
@@ -1650,6 +1647,8 @@ out:
 	if (rehash)
 		d_rehash(rehash);
 	if (!error) {
+		if (new_inode != NULL)
+			nfs_drop_nlink(new_inode);
 		d_move(old_dentry, new_dentry);
 		nfs_set_verifier(new_dentry,
 					nfs_save_change_attribute(new_dir));
-- 
cgit v1.2.3


From 7fe5c398fc2186ed586db11106a6692d871d0d58 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 19 Mar 2009 15:35:50 -0400
Subject: NFS: Optimise NFS close()

Close-to-open cache consistency rules really only require us to flush out
writes on calls to close(), and require us to revalidate attributes on the
very last close of the file.

Currently we appear to be doing a lot of extra attribute revalidation
and cache flushes.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c           | 11 ++---------
 fs/nfs/inode.c          | 41 +++++++++++++++++++++++++++++------------
 fs/nfs/internal.h       |  3 +++
 fs/nfs/nfs3proc.c       |  1 +
 fs/nfs/nfs4proc.c       | 10 ++++++++++
 fs/nfs/proc.c           |  1 +
 include/linux/nfs_xdr.h |  1 +
 7 files changed, 47 insertions(+), 21 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 1eab9c9ad24..d451073c494 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -137,9 +137,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
 			dentry->d_parent->d_name.name,
 			dentry->d_name.name);
 
-	/* Ensure that dirty pages are flushed out with the right creds */
-	if (filp->f_mode & FMODE_WRITE)
-		nfs_wb_all(dentry->d_inode);
 	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
 	return nfs_release(inode, filp);
 }
@@ -231,7 +228,6 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct dentry	*dentry = file->f_path.dentry;
 	struct inode	*inode = dentry->d_inode;
-	int		status;
 
 	dprintk("NFS: flush(%s/%s)\n",
 			dentry->d_parent->d_name.name,
@@ -241,11 +237,8 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 		return 0;
 	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
 
-	/* Ensure that data+attribute caches are up to date after close() */
-	status = nfs_do_fsync(ctx, inode);
-	if (!status)
-		nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	return status;
+	/* Flush writes to the server and return any errors */
+	return nfs_do_fsync(ctx, inode);
 }
 
 static ssize_t
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c40adc5dd60..a834d1d850b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -541,6 +541,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	return err;
 }
 
+/**
+ * nfs_close_context - Common close_context() routine NFSv2/v3
+ * @ctx: pointer to context
+ * @is_sync: is this a synchronous close
+ *
+ * always ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics
+ */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+	struct inode *inode;
+	struct nfs_server *server;
+
+	if (!(ctx->mode & FMODE_WRITE))
+		return;
+	if (!is_sync)
+		return;
+	inode = ctx->path.dentry->d_inode;
+	if (!list_empty(&NFS_I(inode)->open_files))
+		return;
+	server = NFS_SERVER(inode);
+	if (server->flags & NFS_MOUNT_NOCTO)
+		return;
+	nfs_revalidate_inode(server, inode);
+}
+
 static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
 {
 	struct nfs_open_context *ctx;
@@ -567,24 +593,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 	return ctx;
 }
 
-static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait)
+static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
-	struct inode *inode;
+	struct inode *inode = ctx->path.dentry->d_inode;
 
-	if (ctx == NULL)
-		return;
-
-	inode = ctx->path.dentry->d_inode;
 	if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
 		return;
 	list_del(&ctx->list);
 	spin_unlock(&inode->i_lock);
-	if (ctx->state != NULL) {
-		if (wait)
-			nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
-		else
-			nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
-	}
+	NFS_PROTO(inode)->close_context(ctx, is_sync);
 	if (ctx->cred != NULL)
 		put_rpccred(ctx->cred);
 	path_put(&ctx->path);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index a55e69aa52e..2041f68ff1c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -152,6 +152,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
 extern struct rpc_procinfo nfs4_procedures[];
 #endif
 
+/* proc.c */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+
 /* dir.c */
 extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
 
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c55be7a7679..b82fe6847f1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.commit_done	= nfs3_commit_done,
 	.lock		= nfs3_proc_lock,
 	.clear_acl_cache = nfs3_forget_cached_acls,
+	.close_context	= nfs_close_context,
 };
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 95f171e7e05..97bacccff57 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1572,6 +1572,15 @@ out_drop:
 	return 0;
 }
 
+void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+	if (ctx->state == NULL)
+		return;
+	if (is_sync)
+		nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
+	else
+		nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
+}
 
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
@@ -3776,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.commit_done	= nfs4_commit_done,
 	.lock		= nfs4_proc_lock,
 	.clear_acl_cache = nfs4_zap_acl_attr,
+	.close_context  = nfs4_close_context,
 };
 
 /*
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 193465210d7..7be72d90d49 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.commit_setup	= nfs_proc_commit_setup,
 	.lock		= nfs_proc_lock,
 	.lock_check_bounds = nfs_lock_check_bounds,
+	.close_context	= nfs_close_context,
 };
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0691b9c188d..9708e78a4d4 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -868,6 +868,7 @@ struct nfs_rpc_ops {
 	int	(*lock)(struct file *, int, struct file_lock *);
 	int	(*lock_check_bounds)(const struct file_lock *);
 	void	(*clear_acl_cache)(struct inode *);
+	void	(*close_context)(struct nfs_open_context *ctx, int);
 };
 
 /*
-- 
cgit v1.2.3


From 600914ba524130583fa5acdd00df4aa7aa44b173 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Wed, 14 Jan 2009 10:04:25 -0700
Subject: PCI/x86: make early dump handle multi-function devices

The early "dump PCI config space" code skips many multi-function
devices.  This patch fixes that, so it dumps all devices in PCI
domain 0.

We should not skip the rest of the functions if CLASS_REVISION is
0xffffffff.  Often multi-function devices have gaps in the function ID
space, e.g., 1c.0 and 1c.2 exist but 1c.1 doesn't.  The CLASS_REVISION
of the non-existent 1c.1 function will appear to be 0xffffffff.

We should only look at the HEADER_TYPE of function zero.  Often the
"multi-function" is set in function zero, but not in other functions.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/early.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index f6adf2c6d75..c1a2cd54172 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -96,18 +96,21 @@ void early_dump_pci_devices(void)
 			for (func = 0; func < 8; func++) {
 				u32 class;
 				u8 type;
+
 				class = read_pci_config(bus, slot, func,
 							PCI_CLASS_REVISION);
 				if (class == 0xffffffff)
-					break;
+					continue;
 
 				early_dump_pci_device(bus, slot, func);
 
-				/* No multi-function device? */
-				type = read_pci_config_byte(bus, slot, func,
+				if (func == 0) {
+					type = read_pci_config_byte(bus, slot,
+								    func,
 							       PCI_HEADER_TYPE);
-				if (!(type & 0x80))
-					break;
+					if (!(type & 0x80))
+						break;
+				}
 			}
 		}
 	}
-- 
cgit v1.2.3


From 7bc9e77dcc6edf6ce0b3e4677b1e7f4a05b95b85 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Wed, 14 Jan 2009 10:04:30 -0700
Subject: PCI/x86: format early dump like other PCI output

Use %02x:%02x.%d rather than %02x:%02x:%02x so PCI addresses
look the same as in other parts of the kernel.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/early.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index c1a2cd54172..aaf26ae58cd 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -69,11 +69,12 @@ void early_dump_pci_device(u8 bus, u8 slot, u8 func)
 	int j;
 	u32 val;
 
-	printk(KERN_INFO "PCI: %02x:%02x:%02x", bus, slot, func);
+	printk(KERN_INFO "pci 0000:%02x:%02x.%d config space:",
+	       bus, slot, func);
 
 	for (i = 0; i < 256; i += 4) {
 		if (!(i & 0x0f))
-			printk("\n%04x:",i);
+			printk("\n  %02x:",i);
 
 		val = read_pci_config(bus, slot, func, i);
 		for (j = 0; j < 4; j++) {
@@ -115,4 +116,3 @@ void early_dump_pci_devices(void)
 		}
 	}
 }
-
-- 
cgit v1.2.3


From 1cc0ca26c57221f5aa0f49a311a8fc83413dfe97 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Wed, 14 Jan 2009 10:04:36 -0700
Subject: PCI/x86: document pci=earlydump argument

Document the "pci=earlydump" argument.  This currently only works on x86.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/kernel-parameters.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 54f21a5c262..c7c441e7930 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1659,6 +1659,8 @@ and is between 256 and 4096 characters. It is defined in the file
 			See also Documentation/blockdev/paride.txt.
 
 	pci=option[,option...]	[PCI] various PCI subsystem options:
+		earlydump	[X86] dump PCI config space before the kernel
+			        changes anything
 		off		[X86] don't probe for the PCI bus
 		bios		[X86-32] force use of PCI BIOS, don't access
 				the hardware directly. Use this if your machine
-- 
cgit v1.2.3


From e496b617b40f2abf6d49803f56aa1344ce1b9177 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 7 Jan 2009 16:22:37 -0800
Subject: PCI: __FUNCTION__ is gcc-specific, use __func__

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 6d6120007af..5737b8a9a73 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -657,7 +657,7 @@ static int pci_save_pcie_state(struct pci_dev *dev)
 
 	save_state = pci_find_saved_cap(dev, PCI_CAP_ID_EXP);
 	if (!save_state) {
-		dev_err(&dev->dev, "buffer not found in %s\n", __FUNCTION__);
+		dev_err(&dev->dev, "buffer not found in %s\n", __func__);
 		return -ENOMEM;
 	}
 	cap = (u16 *)&save_state->data[0];
@@ -700,7 +700,7 @@ static int pci_save_pcix_state(struct pci_dev *dev)
 
 	save_state = pci_find_saved_cap(dev, PCI_CAP_ID_PCIX);
 	if (!save_state) {
-		dev_err(&dev->dev, "buffer not found in %s\n", __FUNCTION__);
+		dev_err(&dev->dev, "buffer not found in %s\n", __func__);
 		return -ENOMEM;
 	}
 
-- 
cgit v1.2.3


From 1bf83e558cb29d163f4bc6decbc3800ecf4db195 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 13 Jan 2009 14:38:34 +0100
Subject: PCI: PCIe portdrv: Use driver data to simplify code

PCI Express port driver extension, as defined by struct
pcie_port_device_ext in portdrv.h, is allocated and initialized, but
never used (it also is never freed).  Extend it to hold the PCI Express
port type as well as the port interrupt mode, change its name and use it
to simplify the code in portdrv_core.c .

Additionally, remove the redundant interrupt_mode member of struct
pcie_device defined in include/linux/pcieport_if.h .

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pcie/portdrv.h      |  5 ++-
 drivers/pci/pcie/portdrv_core.c | 95 ++++++++++++++++-------------------------
 include/linux/pcieport_if.h     |  1 -
 3 files changed, 39 insertions(+), 62 deletions(-)

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 2529f3f2ea5..b0dcbc73415 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -28,8 +28,9 @@
 
 #define get_descriptor_id(type, service) (((type - 4) << 4) | service)
 
-struct pcie_port_device_ext {
-	int interrupt_mode;	/* [0:INTx | 1:MSI | 2:MSI-X] */
+struct pcie_port_data {
+	int port_type;		/* Type of the port */
+	int port_irq_mode;	/* [0:INTx | 1:MSI | 2:MSI-X] */
 };
 
 extern struct bus_type pcie_port_bus_type;
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 8b3f8c18032..273e97619bc 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -15,10 +15,9 @@
 #include <linux/slab.h>
 #include <linux/pcieport_if.h>
 
+#include "../pci.h"
 #include "portdrv.h"
 
-extern int pcie_mch_quirk;	/* MSI-quirk Indicator */
-
 /**
  * release_pcie_device - free PCI Express port service device structure
  * @dev: Port service device to release
@@ -31,28 +30,6 @@ static void release_pcie_device(struct device *dev)
 	kfree(to_pcie_device(dev));			
 }
 
-static int is_msi_quirked(struct pci_dev *dev)
-{
-	int port_type, quirk = 0;
-	u16 reg16;
-
-	pci_read_config_word(dev, 
-		pci_find_capability(dev, PCI_CAP_ID_EXP) + 
-		PCIE_CAPABILITIES_REG, &reg16);
-	port_type = (reg16 >> 4) & PORT_TYPE_MASK;
-	switch(port_type) {
-	case PCIE_RC_PORT:
-		if (pcie_mch_quirk == 1)
-			quirk = 1;
-		break;
-	case PCIE_SW_UPSTREAM_PORT:
-	case PCIE_SW_DOWNSTREAM_PORT:
-	default:
-		break;	
-	}
-	return quirk;
-}
-
 /**
  * assign_interrupt_mode - choose interrupt mode for PCI Express port services
  *                         (INTx, MSI-X, MSI) and set up vectors
@@ -64,6 +41,7 @@ static int is_msi_quirked(struct pci_dev *dev)
  */
 static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask)
 {
+	struct pcie_port_data *port_data = pci_get_drvdata(dev);
 	int i, pos, nvec, status = -EINVAL;
 	int interrupt_mode = PCIE_PORT_INTx_MODE;
 
@@ -75,7 +53,7 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask)
 	}
 	
 	/* Check MSI quirk */
-	if (is_msi_quirked(dev))
+	if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk)
 		return interrupt_mode;
 
 	/* Select MSI-X over MSI if supported */		
@@ -132,13 +110,11 @@ static int get_port_device_capability(struct pci_dev *dev)
 			pos + PCIE_SLOT_CAPABILITIES_REG, &reg32);
 		if (reg32 & SLOT_HP_CAPABLE_MASK)
 			services |= PCIE_PORT_SERVICE_HP;
-	} 
-	/* PME Capable - root port capability */
-	if (((reg16 >> 4) & PORT_TYPE_MASK) == PCIE_RC_PORT)
-		services |= PCIE_PORT_SERVICE_PME;
-
+	}
+	/* AER capable */
 	if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR))
 		services |= PCIE_PORT_SERVICE_AER;
+	/* VC support */
 	if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_VC))
 		services |= PCIE_PORT_SERVICE_VC;
 
@@ -152,15 +128,15 @@ static int get_port_device_capability(struct pci_dev *dev)
  * @port_type: Type of the port
  * @service_type: Type of service to associate with the service device
  * @irq: Interrupt vector to associate with the service device
- * @irq_mode: Interrupt mode of the service (INTx, MSI-X, MSI)
  */
 static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, 
-	int port_type, int service_type, int irq, int irq_mode)
+	int service_type, int irq)
 {
+	struct pcie_port_data *port_data = pci_get_drvdata(parent);
 	struct device *device;
+	int port_type = port_data->port_type;
 
 	dev->port = parent;
-	dev->interrupt_mode = irq_mode;
 	dev->irq = irq;
 	dev->id.vendor = parent->vendor;
 	dev->id.device = parent->device;
@@ -185,10 +161,9 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev,
  * @port_type: Type of the port
  * @service_type: Type of service to associate with the service device
  * @irq: Interrupt vector to associate with the service device
- * @irq_mode: Interrupt mode of the service (INTx, MSI-X, MSI)
  */
 static struct pcie_device* alloc_pcie_device(struct pci_dev *parent,
-	int port_type, int service_type, int irq, int irq_mode)
+	int service_type, int irq)
 {
 	struct pcie_device *device;
 
@@ -196,7 +171,7 @@ static struct pcie_device* alloc_pcie_device(struct pci_dev *parent,
 	if (!device)
 		return NULL;
 
-	pcie_device_init(parent, device, port_type, service_type, irq,irq_mode);
+	pcie_device_init(parent, device, service_type, irq);
 	return device;
 }
 
@@ -230,39 +205,36 @@ int pcie_port_device_probe(struct pci_dev *dev)
  */
 int pcie_port_device_register(struct pci_dev *dev)
 {
-	struct pcie_port_device_ext *p_ext;
-	int status, type, capabilities, irq_mode, i;
+	struct pcie_port_data *port_data;
+	int status, capabilities, irq_mode, i;
 	int vectors[PCIE_PORT_DEVICE_MAXSERVICES];
 	u16 reg16;
 
-	/* Allocate port device extension */
-	if (!(p_ext = kmalloc(sizeof(struct pcie_port_device_ext), GFP_KERNEL)))
+	port_data = kzalloc(sizeof(*port_data), GFP_KERNEL);
+	if (!port_data)
 		return -ENOMEM;
-
-	pci_set_drvdata(dev, p_ext);
+	pci_set_drvdata(dev, port_data);
 
 	/* Get port type */
 	pci_read_config_word(dev,
 		pci_find_capability(dev, PCI_CAP_ID_EXP) +
 		PCIE_CAPABILITIES_REG, &reg16);
-	type = (reg16 >> 4) & PORT_TYPE_MASK;
+	port_data->port_type = (reg16 >> 4) & PORT_TYPE_MASK;
 
-	/* Now get port services */
 	capabilities = get_port_device_capability(dev);
+	/* Root ports are capable of generating PME too */
+	if (port_data->port_type == PCIE_RC_PORT)
+		capabilities |= PCIE_PORT_SERVICE_PME;
+
 	irq_mode = assign_interrupt_mode(dev, vectors, capabilities);
-	p_ext->interrupt_mode = irq_mode;
+	port_data->port_irq_mode = irq_mode;
 
 	/* Allocate child services if any */
 	for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) {
 		struct pcie_device *child;
 
 		if (capabilities & (1 << i)) {
-			child = alloc_pcie_device(
-				dev, 		/* parent */
-				type,		/* port type */
-				i,		/* service type */
-				vectors[i],	/* irq */
-				irq_mode	/* interrupt mode */);
+			child = alloc_pcie_device(dev, i, vectors[i]);
 			if (child) {
 				status = device_register(&child->device);
 				if (status) {
@@ -349,25 +321,30 @@ static int remove_iter(struct device *dev, void *data)
  */
 void pcie_port_device_remove(struct pci_dev *dev)
 {
-	struct device *device;
-	unsigned long device_addr;
-	int interrupt_mode = PCIE_PORT_INTx_MODE;
+	struct pcie_port_data *port_data = pci_get_drvdata(dev);
 	int status;
 
 	do {
+		unsigned long device_addr;
+
 		status = device_for_each_child(&dev->dev, &device_addr, remove_iter);
 		if (status) {
-			device = (struct device*)device_addr;
-			interrupt_mode = (to_pcie_device(device))->interrupt_mode;
+			struct device *device = (struct device*)device_addr;
 			put_device(device);
 			device_unregister(device);
 		}
 	} while (status);
-	/* Switch to INTx by default if MSI enabled */
-	if (interrupt_mode == PCIE_PORT_MSIX_MODE)
+
+	switch (port_data->port_irq_mode) {
+	case PCIE_PORT_MSIX_MODE:
 		pci_disable_msix(dev);
-	else if (interrupt_mode == PCIE_PORT_MSI_MODE)
+		break;
+	case PCIE_PORT_MSI_MODE:
 		pci_disable_msi(dev);
+		break;
+	}
+
+	kfree(port_data);
 }
 
 /**
diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h
index 6cd91e3f982..194409af103 100644
--- a/include/linux/pcieport_if.h
+++ b/include/linux/pcieport_if.h
@@ -36,7 +36,6 @@ struct pcie_port_service_id {
 
 struct pcie_device {
 	int 		irq;	    /* Service IRQ/MSI/MSI-X Vector */
-	int 		interrupt_mode;	/* [0:INTx | 1:MSI | 2:MSI-X] */	
 	struct pcie_port_service_id id;	/* Service ID */
 	struct pci_dev	*port;	    /* Root/Upstream/Downstream Port */
 	void		*priv_data; /* Service Private Data */
-- 
cgit v1.2.3


From 90e9cd50f7feeddc911325c8a8c1b7e1fccc6599 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 13 Jan 2009 14:39:39 +0100
Subject: PCI: PCIe portdrv: Aviod using service devices with wrong interrupts

The PCI Express port driver should not attempt to register service
devices that require the ability to generate interrupts if generating
interrupts is not possible.  Namely, if the port has no interrupt pin
configured and we cannot set up MSI or MSI-X for it, there is no way
it can generate interrupts and in such a case the port services that
rely on interrupts (PME, PCIe HP, AER) should not be enabled for it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pcie/portdrv_core.c | 41 ++++++++++++++++++++++++++++-------------
 include/linux/pcieport_if.h     |  1 +
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 273e97619bc..265eba033a4 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -43,7 +43,7 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask)
 {
 	struct pcie_port_data *port_data = pci_get_drvdata(dev);
 	int i, pos, nvec, status = -EINVAL;
-	int interrupt_mode = PCIE_PORT_INTx_MODE;
+	int interrupt_mode = PCIE_PORT_NO_IRQ;
 
 	/* Set INTx as default */
 	for (i = 0, nvec = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) {
@@ -51,7 +51,9 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask)
 			nvec++;
 		vectors[i] = dev->irq;
 	}
-	
+	if (dev->pin)
+		interrupt_mode = PCIE_PORT_INTx_MODE;
+
 	/* Check MSI quirk */
 	if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk)
 		return interrupt_mode;
@@ -141,7 +143,7 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev,
 	dev->id.vendor = parent->vendor;
 	dev->id.device = parent->device;
 	dev->id.port_type = port_type;
-	dev->id.service_type = (1 << service_type);
+	dev->id.service_type = service_type;
 
 	/* Initialize generic device interface */
 	device = &dev->device;
@@ -232,19 +234,32 @@ int pcie_port_device_register(struct pci_dev *dev)
 	/* Allocate child services if any */
 	for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) {
 		struct pcie_device *child;
+		int service = 1 << i;
 
-		if (capabilities & (1 << i)) {
-			child = alloc_pcie_device(dev, i, vectors[i]);
-			if (child) {
-				status = device_register(&child->device);
-				if (status) {
-					kfree(child);
-					continue;
-				}
-				get_device(&child->device);
-			}
+		if (!(capabilities & service))
+			continue;
+
+		/*
+		 * Don't use service devices that require interrupts if there is
+		 * no way to generate them.
+		 */
+		if (irq_mode == PCIE_PORT_NO_IRQ
+		    && service != PCIE_PORT_SERVICE_VC)
+			continue;
+
+		child = alloc_pcie_device(dev, service, vectors[i]);
+		if (!child)
+			continue;
+
+		status = device_register(&child->device);
+		if (status) {
+			kfree(child);
+			continue;
 		}
+
+		get_device(&child->device);
 	}
+
 	return 0;
 }
 
diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h
index 194409af103..8e1ae1fd92f 100644
--- a/include/linux/pcieport_if.h
+++ b/include/linux/pcieport_if.h
@@ -22,6 +22,7 @@
 #define PCIE_PORT_SERVICE_VC		8	/* Virtual Channel */
 
 /* Root/Upstream/Downstream Port's Interrupt Mode */
+#define PCIE_PORT_NO_IRQ		(-1)
 #define PCIE_PORT_INTx_MODE		0
 #define PCIE_PORT_MSI_MODE		1
 #define PCIE_PORT_MSIX_MODE		2
-- 
cgit v1.2.3


From f118c0c3cff4fed39bde1863f9d59850719645cc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 13 Jan 2009 14:42:01 +0100
Subject: PCI: PCIe portdrv: Do not enable port device before setting up
 interrupts

The PCI Express port driver calls pci_enable_device() before setting
up interrupts, which is wrong, because if there is an interrupt pin
configured for the port, pci_enable_device() will likely set up an
interrupt link for it.  However, this shouldn't be done if either
MSI or MSI-X interrupt mode is chosen for the port.

The solution is to call pci_enable_device() after setting up
interrupts, because in that case the interrupt link won't be set up
if MSI or MSI-X are enabled.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pcie/portdrv_core.c | 38 ++++++++++++++++++++++++++++----------
 drivers/pci/pcie/portdrv_pci.c  | 11 +++--------
 2 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 265eba033a4..91ecbc43155 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -208,7 +208,7 @@ int pcie_port_device_probe(struct pci_dev *dev)
 int pcie_port_device_register(struct pci_dev *dev)
 {
 	struct pcie_port_data *port_data;
-	int status, capabilities, irq_mode, i;
+	int status, capabilities, irq_mode, i, nr_serv;
 	int vectors[PCIE_PORT_DEVICE_MAXSERVICES];
 	u16 reg16;
 
@@ -229,24 +229,32 @@ int pcie_port_device_register(struct pci_dev *dev)
 		capabilities |= PCIE_PORT_SERVICE_PME;
 
 	irq_mode = assign_interrupt_mode(dev, vectors, capabilities);
+	if (irq_mode == PCIE_PORT_NO_IRQ) {
+		/*
+		 * Don't use service devices that require interrupts if there is
+		 * no way to generate them.
+		 */
+		if (!(capabilities & PCIE_PORT_SERVICE_VC)) {
+			status = -ENODEV;
+			goto Error;
+		}
+		capabilities = PCIE_PORT_SERVICE_VC;
+	}
 	port_data->port_irq_mode = irq_mode;
 
+	status = pci_enable_device(dev);
+	if (status)
+		goto Error;
+	pci_set_master(dev);
+
 	/* Allocate child services if any */
-	for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) {
+	for (i = 0, nr_serv = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) {
 		struct pcie_device *child;
 		int service = 1 << i;
 
 		if (!(capabilities & service))
 			continue;
 
-		/*
-		 * Don't use service devices that require interrupts if there is
-		 * no way to generate them.
-		 */
-		if (irq_mode == PCIE_PORT_NO_IRQ
-		    && service != PCIE_PORT_SERVICE_VC)
-			continue;
-
 		child = alloc_pcie_device(dev, service, vectors[i]);
 		if (!child)
 			continue;
@@ -258,9 +266,19 @@ int pcie_port_device_register(struct pci_dev *dev)
 		}
 
 		get_device(&child->device);
+		nr_serv++;
+	}
+	if (!nr_serv) {
+		pci_disable_device(dev);
+		status = -ENODEV;
+		goto Error;
 	}
 
 	return 0;
+
+ Error:
+	kfree(port_data);
+	return status;
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 5ea566e20b3..d17611f390b 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -82,18 +82,13 @@ static int __devinit pcie_portdrv_probe (struct pci_dev *dev,
 	if (status)
 		return status;
 
-	if (pci_enable_device(dev) < 0) 
-		return -ENODEV;
-	
-	pci_set_master(dev);
         if (!dev->irq && dev->pin) {
 		dev_warn(&dev->dev, "device [%04x:%04x] has invalid IRQ; "
 			 "check vendor BIOS\n", dev->vendor, dev->device);
 	}
-	if (pcie_port_device_register(dev)) {
-		pci_disable_device(dev);
-		return -ENOMEM;
-	}
+	status = pcie_port_device_register(dev);
+	if (status)
+		return status;
 
 	pcie_portdrv_save_config(dev);
 
-- 
cgit v1.2.3


From 87d2e2ecf6026efa64b01f7f71802b20da736d35 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 13 Jan 2009 14:43:07 +0100
Subject: PCI: PCIe portdrv: Remove unnecessary function

The function pcie_portdrv_save_config() in portdrv_pci.c is not
necessary.  Remove it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pcie/portdrv_pci.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index d17611f390b..94d0e2af9ba 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -32,11 +32,6 @@ MODULE_LICENSE("GPL");
 /* global data */
 static const char device_name[] = "pcieport-driver";
 
-static int pcie_portdrv_save_config(struct pci_dev *dev)
-{
-	return pci_save_state(dev);
-}
-
 static int pcie_portdrv_restore_config(struct pci_dev *dev)
 {
 	int retval;
@@ -90,7 +85,7 @@ static int __devinit pcie_portdrv_probe (struct pci_dev *dev,
 	if (status)
 		return status;
 
-	pcie_portdrv_save_config(dev);
+	pci_save_state(dev);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 0516c8bcd25293f438573101c439ce25a18916ad Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 13 Jan 2009 14:44:19 +0100
Subject: PCI: PCIe portdrv: Simplily probe callback of service drivers

The second argument of the ->probe() callback in
struct pcie_port_service_driver is unnecessary and never used.
Remove it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/pciehp_acpi.c | 3 +--
 drivers/pci/hotplug/pciehp_core.c | 2 +-
 drivers/pci/pcie/aer/aerdrv.c     | 6 ++----
 drivers/pci/pcie/portdrv_core.c   | 2 +-
 include/linux/pcieport_if.h       | 3 +--
 5 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c
index 438d795f9fe..ad8835758a1 100644
--- a/drivers/pci/hotplug/pciehp_acpi.c
+++ b/drivers/pci/hotplug/pciehp_acpi.c
@@ -82,8 +82,7 @@ static int __initdata acpi_slot_detected;
 static struct list_head __initdata dummy_slots = LIST_HEAD_INIT(dummy_slots);
 
 /* Dummy driver for dumplicate name detection */
-static int __init dummy_probe(struct pcie_device *dev,
-			      const struct pcie_port_service_id *id)
+static int __init dummy_probe(struct pcie_device *dev)
 {
 	int pos;
 	u32 slot_cap;
diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
index 681e3912b82..3429b21dbb5 100644
--- a/drivers/pci/hotplug/pciehp_core.c
+++ b/drivers/pci/hotplug/pciehp_core.c
@@ -401,7 +401,7 @@ static int get_cur_bus_speed(struct hotplug_slot *hotplug_slot, enum pci_bus_spe
 	return 0;
 }
 
-static int pciehp_probe(struct pcie_device *dev, const struct pcie_port_service_id *id)
+static int pciehp_probe(struct pcie_device *dev)
 {
 	int rc;
 	struct controller *ctrl;
diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index e390707661d..57c41204c54 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -38,8 +38,7 @@ MODULE_AUTHOR(DRIVER_AUTHOR);
 MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_LICENSE("GPL");
 
-static int __devinit aer_probe (struct pcie_device *dev,
-	const struct pcie_port_service_id *id );
+static int __devinit aer_probe (struct pcie_device *dev);
 static void aer_remove(struct pcie_device *dev);
 static int aer_suspend(struct pcie_device *dev, pm_message_t state)
 {return 0;}
@@ -207,8 +206,7 @@ static void aer_remove(struct pcie_device *dev)
  *
  * Invoked when PCI Express bus loads AER service driver.
  **/
-static int __devinit aer_probe (struct pcie_device *dev,
-				const struct pcie_port_service_id *id )
+static int __devinit aer_probe (struct pcie_device *dev)
 {
 	int status;
 	struct aer_rpc *rpc;
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 91ecbc43155..682524b0c93 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -402,7 +402,7 @@ static int pcie_port_probe_service(struct device *dev)
 		return -ENODEV;
 
 	pciedev = to_pcie_device(dev);
-	status = driver->probe(pciedev, driver->id_table);
+	status = driver->probe(pciedev);
 	if (!status) {
 		dev_printk(KERN_DEBUG, dev, "service driver %s loaded\n",
 			driver->name);
diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h
index 8e1ae1fd92f..59e90b8a783 100644
--- a/include/linux/pcieport_if.h
+++ b/include/linux/pcieport_if.h
@@ -56,8 +56,7 @@ static inline void* get_service_data(struct pcie_device *dev)
 
 struct pcie_port_service_driver {
 	const char *name;
-	int (*probe) (struct pcie_device *dev, 
-		const struct pcie_port_service_id *id);
+	int (*probe) (struct pcie_device *dev);
 	void (*remove) (struct pcie_device *dev);
 	int (*suspend) (struct pcie_device *dev, pm_message_t state);
 	int (*resume) (struct pcie_device *dev);
-- 
cgit v1.2.3


From 22106368c999246c414610dcaacd485e741605b1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 13 Jan 2009 14:46:46 +0100
Subject: PCI: PCIe portdrv: Remove struct pcie_port_service_id

The PCI Express port driver uses 'struct pcie_port_service_id' for
matching port service devices and drivers, but this structure
contains fields that duplicate information from the port device
itself (vendor, device, subvendor, subdevice) and fields that are not
used by any existing port service driver (class, class_mask,
drvier_data).  Also, both existing port service drivers (AER and
PCIe HP) don't even use the vendor and device fields for device
matching.  Therefore 'struct pcie_port_service_id' can be removed
altogether and the only useful members of it (port_type, service) can
be introduced directly into the port service device and port service
driver structures.  That simplifies the code quite a bit and reduces
its size.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/pciehp_acpi.c  | 13 ++-----------
 drivers/pci/hotplug/pciehp_core.c  | 12 ++----------
 drivers/pci/pcie/aer/aerdrv.c      | 16 ++--------------
 drivers/pci/pcie/aer/aerdrv_core.c | 10 +++++-----
 drivers/pci/pcie/portdrv.h         |  5 -----
 drivers/pci/pcie/portdrv_bus.c     | 18 ++++++++++--------
 drivers/pci/pcie/portdrv_core.c    |  5 +----
 include/linux/pcieport_if.h        | 17 ++++++++---------
 8 files changed, 30 insertions(+), 66 deletions(-)

diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c
index ad8835758a1..21734c31152 100644
--- a/drivers/pci/hotplug/pciehp_acpi.c
+++ b/drivers/pci/hotplug/pciehp_acpi.c
@@ -67,16 +67,6 @@ static int __init parse_detect_mode(void)
 	return PCIEHP_DETECT_DEFAULT;
 }
 
-static struct pcie_port_service_id __initdata port_pci_ids[] = {
-	{
-		.vendor = PCI_ANY_ID,
-		.device = PCI_ANY_ID,
-		.port_type = PCIE_ANY_PORT,
-		.service_type = PCIE_PORT_SERVICE_HP,
-		.driver_data =  0,
-        }, { /* end: all zeroes */ }
-};
-
 static int __initdata dup_slot_id;
 static int __initdata acpi_slot_detected;
 static struct list_head __initdata dummy_slots = LIST_HEAD_INIT(dummy_slots);
@@ -110,7 +100,8 @@ static int __init dummy_probe(struct pcie_device *dev)
 
 static struct pcie_port_service_driver __initdata dummy_driver = {
         .name           = "pciehp_dummy",
-        .id_table       = port_pci_ids,
+	.port_type	= PCIE_ANY_PORT,
+	.service	= PCIE_PORT_SERVICE_HP,
         .probe          = dummy_probe,
 };
 
diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
index 3429b21dbb5..3d21bbba330 100644
--- a/drivers/pci/hotplug/pciehp_core.c
+++ b/drivers/pci/hotplug/pciehp_core.c
@@ -505,18 +505,10 @@ static int pciehp_resume (struct pcie_device *dev)
 }
 #endif
 
-static struct pcie_port_service_id port_pci_ids[] = { {
-	.vendor = PCI_ANY_ID,
-	.device = PCI_ANY_ID,
-	.port_type = PCIE_ANY_PORT,
-	.service_type = PCIE_PORT_SERVICE_HP,
-	.driver_data =	0,
-	}, { /* end: all zeroes */ }
-};
-
 static struct pcie_port_service_driver hpdriver_portdrv = {
 	.name		= PCIE_MODULE_NAME,
-	.id_table	= &port_pci_ids[0],
+	.port_type	= PCIE_ANY_PORT,
+	.service	= PCIE_PORT_SERVICE_HP,
 
 	.probe		= pciehp_probe,
 	.remove		= pciehp_remove,
diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 57c41204c54..e11c0319406 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -48,19 +48,6 @@ static pci_ers_result_t aer_error_detected(struct pci_dev *dev,
 static void aer_error_resume(struct pci_dev *dev);
 static pci_ers_result_t aer_root_reset(struct pci_dev *dev);
 
-/*
- * PCI Express bus's AER Root service driver data structure
- */
-static struct pcie_port_service_id aer_id[] = {
-	{
-	.vendor 	= PCI_ANY_ID,
-	.device 	= PCI_ANY_ID,
-	.port_type 	= PCIE_RC_PORT,
-	.service_type 	= PCIE_PORT_SERVICE_AER,
-	},
-	{ /* end: all zeroes */ }
-};
-
 static struct pci_error_handlers aer_error_handlers = {
 	.error_detected = aer_error_detected,
 	.resume = aer_error_resume,
@@ -68,7 +55,8 @@ static struct pci_error_handlers aer_error_handlers = {
 
 static struct pcie_port_service_driver aerdriver = {
 	.name		= "aer",
-	.id_table	= &aer_id[0],
+	.port_type	= PCIE_ANY_PORT,
+	.service	= PCIE_PORT_SERVICE_AER,
 
 	.probe		= aer_probe,
 	.remove		= aer_remove,
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 38257500738..307452f3003 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -351,21 +351,21 @@ static int find_aer_service_iter(struct device *device, void *data)
 {
 	struct device_driver *driver;
 	struct pcie_port_service_driver *service_driver;
-	struct pcie_device *pcie_dev;
 	struct find_aer_service_data *result;
 
 	result = (struct find_aer_service_data *) data;
 
 	if (device->bus == &pcie_port_bus_type) {
-		pcie_dev = to_pcie_device(device);
-		if (pcie_dev->id.port_type == PCIE_SW_DOWNSTREAM_PORT)
+		struct pcie_port_data *port_data;
+
+		port_data = pci_get_drvdata(to_pcie_device(device)->port);
+		if (port_data->port_type == PCIE_SW_DOWNSTREAM_PORT)
 			result->is_downstream = 1;
 
 		driver = device->driver;
 		if (driver) {
 			service_driver = to_service_driver(driver);
-			if (service_driver->id_table->service_type ==
-					PCIE_PORT_SERVICE_AER) {
+			if (service_driver->service == PCIE_PORT_SERVICE_AER) {
 				result->aer_driver = service_driver;
 				return 1;
 			}
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index b0dcbc73415..ad4d082a034 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -28,11 +28,6 @@
 
 #define get_descriptor_id(type, service) (((type - 4) << 4) | service)
 
-struct pcie_port_data {
-	int port_type;		/* Type of the port */
-	int port_irq_mode;	/* [0:INTx | 1:MSI | 2:MSI-X] */
-};
-
 extern struct bus_type pcie_port_bus_type;
 extern int pcie_port_device_probe(struct pci_dev *dev);
 extern int pcie_port_device_register(struct pci_dev *dev);
diff --git a/drivers/pci/pcie/portdrv_bus.c b/drivers/pci/pcie/portdrv_bus.c
index eec89b767f9..ef3a4eeaebb 100644
--- a/drivers/pci/pcie/portdrv_bus.c
+++ b/drivers/pci/pcie/portdrv_bus.c
@@ -26,20 +26,22 @@ EXPORT_SYMBOL_GPL(pcie_port_bus_type);
 static int pcie_port_bus_match(struct device *dev, struct device_driver *drv)
 {
 	struct pcie_device *pciedev;
+	struct pcie_port_data *port_data;
 	struct pcie_port_service_driver *driver;
 
 	if (drv->bus != &pcie_port_bus_type || dev->bus != &pcie_port_bus_type)
 		return 0;
-	
+
 	pciedev = to_pcie_device(dev);
 	driver = to_service_driver(drv);
-	if (   (driver->id_table->vendor != PCI_ANY_ID && 
-		driver->id_table->vendor != pciedev->id.vendor) ||
-	       (driver->id_table->device != PCI_ANY_ID &&
-		driver->id_table->device != pciedev->id.device) ||	
-	       (driver->id_table->port_type != PCIE_ANY_PORT &&
-		driver->id_table->port_type != pciedev->id.port_type) ||
-		driver->id_table->service_type != pciedev->id.service_type )
+
+	if (driver->service != pciedev->service)
+		return 0;
+
+	port_data = pci_get_drvdata(pciedev->port);
+
+	if (driver->port_type != PCIE_ANY_PORT
+	     && driver->port_type != port_data->port_type)
 		return 0;
 
 	return 1;
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 682524b0c93..843d9e30dd3 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -140,10 +140,7 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev,
 
 	dev->port = parent;
 	dev->irq = irq;
-	dev->id.vendor = parent->vendor;
-	dev->id.device = parent->device;
-	dev->id.port_type = port_type;
-	dev->id.service_type = service_type;
+	dev->service = service_type;
 
 	/* Initialize generic device interface */
 	device = &dev->device;
diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h
index 59e90b8a783..a3832079508 100644
--- a/include/linux/pcieport_if.h
+++ b/include/linux/pcieport_if.h
@@ -27,18 +27,15 @@
 #define PCIE_PORT_MSI_MODE		1
 #define PCIE_PORT_MSIX_MODE		2
 
-struct pcie_port_service_id {
-	__u32 vendor, device;		/* Vendor and device ID or PCI_ANY_ID*/
-	__u32 subvendor, subdevice;	/* Subsystem ID's or PCI_ANY_ID */
-	__u32 class, class_mask;	/* (class,subclass,prog-if) triplet */
-	__u32 port_type, service_type;	/* Port Entity */
-	kernel_ulong_t driver_data;
+struct pcie_port_data {
+	int port_type;		/* Type of the port */
+	int port_irq_mode;	/* [0:INTx | 1:MSI | 2:MSI-X] */
 };
 
 struct pcie_device {
 	int 		irq;	    /* Service IRQ/MSI/MSI-X Vector */
-	struct pcie_port_service_id id;	/* Service ID */
-	struct pci_dev	*port;	    /* Root/Upstream/Downstream Port */
+	struct pci_dev *port;	    /* Root/Upstream/Downstream Port */
+	u32		service;    /* Port service this device represents */
 	void		*priv_data; /* Service Private Data */
 	struct device	device;     /* Generic Device Interface */
 };
@@ -67,7 +64,9 @@ struct pcie_port_service_driver {
 	/* Link Reset Capability - AER service driver specific */
 	pci_ers_result_t (*reset_link) (struct pci_dev *dev);
 
-	const struct pcie_port_service_id *id_table;
+	int port_type;  /* Type of the port this driver can handle */
+	u32 service;    /* Port service this device represents */
+
 	struct device_driver driver;
 };
 #define to_service_driver(d) \
-- 
cgit v1.2.3


From a447b772826fde2a3abfd9bb943dee8750994c55 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Sun, 25 Jan 2009 23:53:56 +0100
Subject: PCI: struct device - replace bus_id with dev_name(), dev_set_name()

More dev_set_name conversion.

Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/cpqphp_sysfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/hotplug/cpqphp_sysfs.c b/drivers/pci/hotplug/cpqphp_sysfs.c
index a13abf55d78..8450f4a6568 100644
--- a/drivers/pci/hotplug/cpqphp_sysfs.c
+++ b/drivers/pci/hotplug/cpqphp_sysfs.c
@@ -225,7 +225,8 @@ void cpqhp_shutdown_debugfs(void)
 
 void cpqhp_create_debugfs_files(struct controller *ctrl)
 {
-	ctrl->dentry = debugfs_create_file(ctrl->pci_dev->dev.bus_id, S_IRUGO, root, ctrl, &debug_ops);
+	ctrl->dentry = debugfs_create_file(dev_name(&ctrl->pci_dev->dev),
+					   S_IRUGO, root, ctrl, &debug_ops);
 }
 
 void cpqhp_remove_debugfs_files(struct controller *ctrl)
-- 
cgit v1.2.3


From a52e2e3513d4beafe8fe8699f1519b021c2d05ba Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sat, 24 Jan 2009 00:21:14 +0100
Subject: PCI/MSI: Introduce pci_msix_table_size()

Introduce new function pci_msix_table_size() returning the size of
the MSI-X table of given PCI device or 0 if the device doesn't
support MSI-X.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/msi.c   | 24 +++++++++++++++++++-----
 include/linux/pci.h |  5 +++++
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index baba2eb5367..08aedd5875b 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -674,6 +674,23 @@ static int msi_free_irqs(struct pci_dev* dev)
 	return 0;
 }
 
+/**
+ * pci_msix_table_size - return the number of device's MSI-X table entries
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ */
+int pci_msix_table_size(struct pci_dev *dev)
+{
+	int pos;
+	u16 control;
+
+	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+	if (!pos)
+		return 0;
+
+	pci_read_config_word(dev, msi_control_reg(pos), &control);
+	return multi_msix_capable(control);
+}
+
 /**
  * pci_enable_msix - configure device's MSI-X capability structure
  * @dev: pointer to the pci_dev data structure of MSI-X device function
@@ -691,9 +708,8 @@ static int msi_free_irqs(struct pci_dev* dev)
  **/
 int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec)
 {
-	int status, pos, nr_entries;
+	int status, nr_entries;
 	int i, j;
-	u16 control;
 
 	if (!entries)
  		return -EINVAL;
@@ -702,9 +718,7 @@ int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec)
 	if (status)
 		return status;
 
-	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-	pci_read_config_word(dev, msi_control_reg(pos), &control);
-	nr_entries = multi_msix_capable(control);
+	nr_entries = pci_msix_table_size(dev);
 	if (nvec > nr_entries)
 		return -EINVAL;
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7bd624bfdcf..b5d6d0e0f1c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -799,6 +799,10 @@ static inline void pci_msi_shutdown(struct pci_dev *dev)
 static inline void pci_disable_msi(struct pci_dev *dev)
 { }
 
+static inline int pci_msix_table_size(struct pci_dev *dev)
+{
+	return 0;
+}
 static inline int pci_enable_msix(struct pci_dev *dev,
 				  struct msix_entry *entries, int nvec)
 {
@@ -823,6 +827,7 @@ static inline int pci_msi_enabled(void)
 extern int pci_enable_msi(struct pci_dev *dev);
 extern void pci_msi_shutdown(struct pci_dev *dev);
 extern void pci_disable_msi(struct pci_dev *dev);
+extern int pci_msix_table_size(struct pci_dev *dev);
 extern int pci_enable_msix(struct pci_dev *dev,
 	struct msix_entry *entries, int nvec);
 extern void pci_msix_shutdown(struct pci_dev *dev);
-- 
cgit v1.2.3


From b43d451385ef833e0696032aac2629da04d46c59 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sat, 24 Jan 2009 00:23:22 +0100
Subject: PCI/PCIe portdrv: Fix allocation of interrupts

If MSI-X interrupt mode is used by the PCI Express port driver, too
many vectors are allocated and it is not ensured that the right
vectors will be used for the right services.  Namely, the PCI Express
specification states that both PCI Express native PME and PCI Express
hotplug will always use the same MSI or MSI-X message for signalling
interrupts, which implies that the same vector will be used by both
of them.  Also, the VC service does not use interrupts at all.
Moreover, is not clear which of the vectors allocated by
pci_enable_msix() in the current code will be used for PME and
hotplug and which of them will be used for AER if all of these
services are configured.

For these reasons, rework the allocation of interrupts for PCI
Express ports so that if MSI-X are enabled, the right vectors will be
used for the right purposes.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pcie/portdrv.h      |   6 ++
 drivers/pci/pcie/portdrv_core.c | 206 ++++++++++++++++++++++++++++++++--------
 include/linux/pcieport_if.h     |  12 ++-
 3 files changed, 181 insertions(+), 43 deletions(-)

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index ad4d082a034..5b818bd835e 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -25,6 +25,12 @@
 #define PCIE_CAPABILITIES_REG		0x2
 #define PCIE_SLOT_CAPABILITIES_REG	0x14
 #define PCIE_PORT_DEVICE_MAXSERVICES	4
+#define PCIE_PORT_MSI_VECTOR_MASK	0x1f
+/*
+ * According to the PCI Express Base Specification 2.0, the indices of the MSI-X
+ * table entires used by port services must not exceed 31
+ */
+#define PCIE_PORT_MAX_MSIX_ENTRIES	32
 
 #define get_descriptor_id(type, service) (((type - 4) << 4) | service)
 
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 843d9e30dd3..3aea92a9292 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -30,6 +30,152 @@ static void release_pcie_device(struct device *dev)
 	kfree(to_pcie_device(dev));			
 }
 
+/**
+ * pcie_port_msix_add_entry - add entry to given array of MSI-X entries
+ * @entries: Array of MSI-X entries
+ * @new_entry: Index of the entry to add to the array
+ * @nr_entries: Number of entries aleady in the array
+ *
+ * Return value: Position of the added entry in the array
+ */
+static int pcie_port_msix_add_entry(
+	struct msix_entry *entries, int new_entry, int nr_entries)
+{
+	int j;
+
+	for (j = 0; j < nr_entries; j++)
+		if (entries[j].entry == new_entry)
+			return j;
+
+	entries[j].entry = new_entry;
+	return j;
+}
+
+/**
+ * pcie_port_enable_msix - try to set up MSI-X as interrupt mode for given port
+ * @dev: PCI Express port to handle
+ * @vectors: Array of interrupt vectors to populate
+ * @mask: Bitmask of port capabilities returned by get_port_device_capability()
+ *
+ * Return value: 0 on success, error code on failure
+ */
+static int pcie_port_enable_msix(struct pci_dev *dev, int *vectors, int mask)
+{
+	struct msix_entry *msix_entries;
+	int idx[PCIE_PORT_DEVICE_MAXSERVICES];
+	int nr_entries, status, pos, i, nvec;
+	u16 reg16;
+	u32 reg32;
+
+	nr_entries = pci_msix_table_size(dev);
+	if (!nr_entries)
+		return -EINVAL;
+	if (nr_entries > PCIE_PORT_MAX_MSIX_ENTRIES)
+		nr_entries = PCIE_PORT_MAX_MSIX_ENTRIES;
+
+	msix_entries = kzalloc(sizeof(*msix_entries) * nr_entries, GFP_KERNEL);
+	if (!msix_entries)
+		return -ENOMEM;
+
+	/*
+	 * Allocate as many entries as the port wants, so that we can check
+	 * which of them will be useful.  Moreover, if nr_entries is correctly
+	 * equal to the number of entries this port actually uses, we'll happily
+	 * go through without any tricks.
+	 */
+	for (i = 0; i < nr_entries; i++)
+		msix_entries[i].entry = i;
+
+	status = pci_enable_msix(dev, msix_entries, nr_entries);
+	if (status)
+		goto Exit;
+
+	for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++)
+		idx[i] = -1;
+	status = -EIO;
+	nvec = 0;
+
+	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) {
+		int entry;
+
+		/*
+		 * The code below follows the PCI Express Base Specification 2.0
+		 * stating in Section 6.1.6 that "PME and Hot-Plug Event
+		 * interrupts (when both are implemented) always share the same
+		 * MSI or MSI-X vector, as indicated by the Interrupt Message
+		 * Number field in the PCI Express Capabilities register", where
+		 * according to Section 7.8.2 of the specification "For MSI-X,
+		 * the value in this field indicates which MSI-X Table entry is
+		 * used to generate the interrupt message."
+		 */
+		pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+		pci_read_config_word(dev, pos + PCIE_CAPABILITIES_REG, &reg16);
+		entry = (reg16 >> 9) & PCIE_PORT_MSI_VECTOR_MASK;
+		if (entry >= nr_entries)
+			goto Error;
+
+		i = pcie_port_msix_add_entry(msix_entries, entry, nvec);
+		if (i == nvec)
+			nvec++;
+
+		idx[PCIE_PORT_SERVICE_PME_SHIFT] = i;
+		idx[PCIE_PORT_SERVICE_HP_SHIFT] = i;
+	}
+
+	if (mask & PCIE_PORT_SERVICE_AER) {
+		int entry;
+
+		/*
+		 * The code below follows Section 7.10.10 of the PCI Express
+		 * Base Specification 2.0 stating that bits 31-27 of the Root
+		 * Error Status Register contain a value indicating which of the
+		 * MSI/MSI-X vectors assigned to the port is going to be used
+		 * for AER, where "For MSI-X, the value in this register
+		 * indicates which MSI-X Table entry is used to generate the
+		 * interrupt message."
+		 */
+		pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+		pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &reg32);
+		entry = reg32 >> 27;
+		if (entry >= nr_entries)
+			goto Error;
+
+		i = pcie_port_msix_add_entry(msix_entries, entry, nvec);
+		if (i == nvec)
+			nvec++;
+
+		idx[PCIE_PORT_SERVICE_AER_SHIFT] = i;
+	}
+
+	/*
+	 * If nvec is equal to the allocated number of entries, we can just use
+	 * what we have.  Otherwise, the port has some extra entries not for the
+	 * services we know and we need to work around that.
+	 */
+	if (nvec == nr_entries) {
+		status = 0;
+	} else {
+		/* Drop the temporary MSI-X setup */
+		pci_disable_msix(dev);
+
+		/* Now allocate the MSI-X vectors for real */
+		status = pci_enable_msix(dev, msix_entries, nvec);
+		if (status)
+			goto Exit;
+	}
+
+	for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++)
+		vectors[i] = idx[i] >= 0 ? msix_entries[idx[i]].vector : -1;
+
+ Exit:
+	kfree(msix_entries);
+	return status;
+
+ Error:
+	pci_disable_msix(dev);
+	goto Exit;
+}
+
 /**
  * assign_interrupt_mode - choose interrupt mode for PCI Express port services
  *                         (INTx, MSI-X, MSI) and set up vectors
@@ -42,49 +188,31 @@ static void release_pcie_device(struct device *dev)
 static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask)
 {
 	struct pcie_port_data *port_data = pci_get_drvdata(dev);
-	int i, pos, nvec, status = -EINVAL;
-	int interrupt_mode = PCIE_PORT_NO_IRQ;
-
-	/* Set INTx as default */
-	for (i = 0, nvec = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) {
-		if (mask & (1 << i)) 
-			nvec++;
-		vectors[i] = dev->irq;
-	}
-	if (dev->pin)
-		interrupt_mode = PCIE_PORT_INTx_MODE;
+	int irq, interrupt_mode = PCIE_PORT_NO_IRQ;
+	int i;
 
 	/* Check MSI quirk */
 	if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk)
-		return interrupt_mode;
+		goto Fallback;
+
+	/* Try to use MSI-X if supported */
+	if (!pcie_port_enable_msix(dev, vectors, mask))
+		return PCIE_PORT_MSIX_MODE;
+
+	/* We're not going to use MSI-X, so try MSI and fall back to INTx */
+	if (!pci_enable_msi(dev))
+		interrupt_mode = PCIE_PORT_MSI_MODE;
+
+ Fallback:
+	if (interrupt_mode == PCIE_PORT_NO_IRQ && dev->pin)
+		interrupt_mode = PCIE_PORT_INTx_MODE;
+
+	irq = interrupt_mode != PCIE_PORT_NO_IRQ ? dev->irq : -1;
+	for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++)
+		vectors[i] = irq;
+
+	vectors[PCIE_PORT_SERVICE_VC_SHIFT] = -1;
 
-	/* Select MSI-X over MSI if supported */		
-	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-	if (pos) {
-		struct msix_entry msix_entries[PCIE_PORT_DEVICE_MAXSERVICES] = 
-			{{0, 0}, {0, 1}, {0, 2}, {0, 3}};
-		status = pci_enable_msix(dev, msix_entries, nvec);
-		if (!status) {
-			int j = 0;
-
-			interrupt_mode = PCIE_PORT_MSIX_MODE;
-			for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) {
-				if (mask & (1 << i)) 
-					vectors[i] = msix_entries[j++].vector;
-			}
-		}
-	} 
-	if (status) {
-		pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
-		if (pos) {
-			status = pci_enable_msi(dev);
-			if (!status) {
-				interrupt_mode = PCIE_PORT_MSI_MODE;
-				for (i = 0;i < PCIE_PORT_DEVICE_MAXSERVICES;i++)
-					vectors[i] = dev->irq;
-			}
-		}
-	} 
 	return interrupt_mode;
 }
 
diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h
index a3832079508..5d2afcfa6bc 100644
--- a/include/linux/pcieport_if.h
+++ b/include/linux/pcieport_if.h
@@ -16,10 +16,14 @@
 #define PCIE_ANY_PORT			7
 
 /* Service Type */
-#define PCIE_PORT_SERVICE_PME		1	/* Power Management Event */
-#define PCIE_PORT_SERVICE_AER		2	/* Advanced Error Reporting */
-#define PCIE_PORT_SERVICE_HP		4	/* Native Hotplug */
-#define PCIE_PORT_SERVICE_VC		8	/* Virtual Channel */
+#define PCIE_PORT_SERVICE_PME_SHIFT	0	/* Power Management Event */
+#define PCIE_PORT_SERVICE_PME		(1 << PCIE_PORT_SERVICE_PME_SHIFT)
+#define PCIE_PORT_SERVICE_AER_SHIFT	1	/* Advanced Error Reporting */
+#define PCIE_PORT_SERVICE_AER		(1 << PCIE_PORT_SERVICE_AER_SHIFT)
+#define PCIE_PORT_SERVICE_HP_SHIFT	2	/* Native Hotplug */
+#define PCIE_PORT_SERVICE_HP		(1 << PCIE_PORT_SERVICE_HP_SHIFT)
+#define PCIE_PORT_SERVICE_VC_SHIFT	3	/* Virtual Channel */
+#define PCIE_PORT_SERVICE_VC		(1 << PCIE_PORT_SERVICE_VC_SHIFT)
 
 /* Root/Upstream/Downstream Port's Interrupt Mode */
 #define PCIE_PORT_NO_IRQ		(-1)
-- 
cgit v1.2.3


From 11df1f05514beaf0269484191007dbc8d47e0e6f Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Mon, 19 Jan 2009 11:31:00 +1100
Subject: PCI/MSI: Use #ifdefs instead of weak functions

Weak functions aren't all they're cracked up to be. They lead to
incorrect binaries with some toolchains, they require us to have empty
functions we otherwise wouldn't, and the unused code is not elided
(as of gcc 4.3.2 anyway).

So replace the weak MSI arch hooks with the #define foo foo idiom. We no
longer need empty versions of arch_setup/teardown_msi_irq().

This is less source (by 1 line!), and results in smaller binaries too:

   text	   data	    bss	    dec	    hex	filename
9354300	1693916	 678424	11726640 b2ef30	build/powerpc/vmlinux-before
9354052	1693852	 678424	11726328 b2edf8	build/powerpc/vmlinux-after

Also smaller on x86_64 and arm (iop13xx).

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/powerpc/include/asm/pci.h |  4 ++++
 arch/x86/include/asm/pci.h     |  3 +++
 drivers/pci/msi.c              | 26 +++++++++-----------------
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 3548159a1be..ba17d5d90a4 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -114,6 +114,10 @@ extern int pci_domain_nr(struct pci_bus *bus);
 /* Decide whether to display the domain number in /proc */
 extern int pci_proc_domain(struct pci_bus *bus);
 
+/* MSI arch hooks */
+#define arch_setup_msi_irqs arch_setup_msi_irqs
+#define arch_teardown_msi_irqs arch_teardown_msi_irqs
+#define arch_msi_check_device arch_msi_check_device
 
 struct vm_area_struct;
 /* Map a range of PCI memory or I/O space for a device into user space */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index a977de23cb4..a0301bfeb95 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -86,6 +86,9 @@ static inline void early_quirks(void) { }
 
 extern void pci_iommu_alloc(void);
 
+/* MSI arch hook */
+#define arch_setup_msi_irqs arch_setup_msi_irqs
+
 #endif  /* __KERNEL__ */
 
 #ifdef CONFIG_X86_32
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 08aedd5875b..33adf323f06 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -27,20 +27,15 @@ static int pci_msi_enable = 1;
 
 /* Arch hooks */
 
-int __attribute__ ((weak))
-arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
+#ifndef arch_msi_check_device
+int arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
 {
 	return 0;
 }
+#endif
 
-int __attribute__ ((weak))
-arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
-{
-	return 0;
-}
-
-int __attribute__ ((weak))
-arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+#ifndef arch_setup_msi_irqs
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
 	struct msi_desc *entry;
 	int ret;
@@ -53,14 +48,10 @@ arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 
 	return 0;
 }
+#endif
 
-void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
-{
-	return;
-}
-
-void __attribute__ ((weak))
-arch_teardown_msi_irqs(struct pci_dev *dev)
+#ifndef arch_teardown_msi_irqs
+void arch_teardown_msi_irqs(struct pci_dev *dev)
 {
 	struct msi_desc *entry;
 
@@ -69,6 +60,7 @@ arch_teardown_msi_irqs(struct pci_dev *dev)
 			arch_teardown_msi_irq(entry->irq);
 	}
 }
+#endif
 
 static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
 {
-- 
cgit v1.2.3


From 2b56313448bb8efad3af19f211d988c8352ac04d Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Wed, 28 Jan 2009 18:27:21 +0800
Subject: PCI: check if a bus is added when removing it

When removing a bus, 'is_added' should be checked to make sure the
bus has been successfully added by pci_bus_add_child() who will sets
'is_added'.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/remove.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 042e0892442..caf8e1eae45 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -71,6 +71,9 @@ void pci_remove_bus(struct pci_bus *pci_bus)
 	down_write(&pci_bus_sem);
 	list_del(&pci_bus->node);
 	up_write(&pci_bus_sem);
+	if (!pci_bus->is_added)
+		return;
+
 	pci_remove_legacy_files(pci_bus);
 	device_remove_file(&pci_bus->dev, &dev_attr_cpuaffinity);
 	device_remove_file(&pci_bus->dev, &dev_attr_cpulistaffinity);
-- 
cgit v1.2.3


From 1c35b8e538cb6259accb215099cdb673310cad84 Mon Sep 17 00:00:00 2001
From: Frank Seidel <frank@f-seidel.de>
Date: Fri, 6 Feb 2009 10:23:36 +0100
Subject: PCI: add missing KERN_* constants to printks

According to kerneljanitors todo list all printk calls (beginning
a new line) should have an according KERN_* constant.
Those are the missing pieces here for the pci subsystem.

Signed-off-by: Frank Seidel <frank@f-seidel.de>
Reviewed-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Tested-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/pciehp.h | 10 +++++-----
 drivers/pci/hotplug/shpchp.h | 10 +++++-----
 drivers/pci/intel-iommu.c    |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 39ae37589fd..1f887f64e49 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -46,10 +46,10 @@ extern int pciehp_force;
 extern struct workqueue_struct *pciehp_wq;
 
 #define dbg(format, arg...)						\
-	do {								\
-		if (pciehp_debug)					\
-			printk("%s: " format, MY_NAME , ## arg);	\
-	} while (0)
+do {									\
+	if (pciehp_debug)						\
+		printk(KERN_DEBUG "%s: " format, MY_NAME , ## arg);	\
+} while (0)
 #define err(format, arg...)						\
 	printk(KERN_ERR "%s: " format, MY_NAME , ## arg)
 #define info(format, arg...)						\
@@ -60,7 +60,7 @@ extern struct workqueue_struct *pciehp_wq;
 #define ctrl_dbg(ctrl, format, arg...)					\
 	do {								\
 		if (pciehp_debug)					\
-			dev_printk(, &ctrl->pcie->device,		\
+			dev_printk(KERN_DEBUG, &ctrl->pcie->device,	\
 					format, ## arg);		\
 	} while (0)
 #define ctrl_err(ctrl, format, arg...)					\
diff --git a/drivers/pci/hotplug/shpchp.h b/drivers/pci/hotplug/shpchp.h
index 6aba0b6cf2e..974e924ca96 100644
--- a/drivers/pci/hotplug/shpchp.h
+++ b/drivers/pci/hotplug/shpchp.h
@@ -48,10 +48,10 @@ extern int shpchp_debug;
 extern struct workqueue_struct *shpchp_wq;
 
 #define dbg(format, arg...)						\
-	do {								\
-		if (shpchp_debug)					\
-			printk("%s: " format, MY_NAME , ## arg);	\
-	} while (0)
+do {									\
+	if (shpchp_debug)						\
+		printk(KERN_DEBUG "%s: " format, MY_NAME , ## arg);	\
+} while (0)
 #define err(format, arg...)						\
 	printk(KERN_ERR "%s: " format, MY_NAME , ## arg)
 #define info(format, arg...)						\
@@ -62,7 +62,7 @@ extern struct workqueue_struct *shpchp_wq;
 #define ctrl_dbg(ctrl, format, arg...)					\
 	do {								\
 		if (shpchp_debug)					\
-			dev_printk(, &ctrl->pci_dev->dev,		\
+			dev_printk(KERN_DEBUG, &ctrl->pci_dev->dev,	\
 					format, ## arg);		\
 	} while (0)
 #define ctrl_err(ctrl, format, arg...)					\
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index f3f686581a9..b548937d474 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1970,7 +1970,7 @@ static inline void iommu_prepare_isa(void)
 	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
 
 	if (ret)
-		printk("IOMMU: Failed to create 0-64M identity map, "
+		printk(KERN_ERR "IOMMU: Failed to create 0-64M identity map, "
 			"floppy might not work\n");
 
 }
-- 
cgit v1.2.3


From 0b3e7388e3b438500aaa0630879ce536747a47ca Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Sun, 8 Feb 2009 22:45:24 +0100
Subject: PCI: introduce missing kfree

Error handling code following a kmalloc should free the allocated data.
Since the subsequent code that could provoke an error does not use the
allocated data, the allocation is just moved below it.

The semantic match that finds the problem is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@r exists@
local idexpression x;
statement S;
expression E;
identifier f,l;
position p1,p2;
expression *ptr != NULL;
@@

(
if ((x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...)) == NULL) S
|
x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...);
...
if (x == NULL) S
)
<... when != x
     when != if (...) { <+...x...+> }
x->f = E
...>
(
 return \(0\|<+...x...+>\|ptr\);
|
 return@p2 ...;
)

@script:python@
p1 << r.p1;
p2 << r.p2;
@@

print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Reviewed-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/pciehp_acpi.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c
index 21734c31152..96048010e7d 100644
--- a/drivers/pci/hotplug/pciehp_acpi.c
+++ b/drivers/pci/hotplug/pciehp_acpi.c
@@ -79,14 +79,15 @@ static int __init dummy_probe(struct pcie_device *dev)
 	struct slot *slot, *tmp;
 	struct pci_dev *pdev = dev->port;
 	struct pci_bus *pbus = pdev->subordinate;
-	if (!(slot = kzalloc(sizeof(*slot), GFP_KERNEL)))
-		return -ENOMEM;
 	/* Note: pciehp_detect_mode != PCIEHP_DETECT_ACPI here */
 	if (pciehp_get_hp_hw_control_from_firmware(pdev))
 		return -ENODEV;
 	if (!(pos = pci_find_capability(pdev, PCI_CAP_ID_EXP)))
 		return -ENODEV;
 	pci_read_config_dword(pdev, pos + PCI_EXP_SLTCAP, &slot_cap);
+	slot = kzalloc(sizeof(*slot), GFP_KERNEL);
+	if (!slot)
+		return -ENOMEM;
 	slot->number = slot_cap >> 19;
 	list_for_each_entry(tmp, &dummy_slots, slot_list) {
 		if (tmp->number == slot->number)
-- 
cgit v1.2.3


From 81b840cd27e3ee9af67b6e05a4847868f74fce69 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 3 Feb 2009 15:06:13 +0900
Subject: PCI: pciehp: fix possible endless loop in pcie_isr

Fix possible endless loop in pcie_isr.

Currently, pcie_isr() (interrupt service routine of pciehp) can end up in an
endless loop if the Slot Status register is set again immediately after being
cleared. According to the past discussion (see below URL) this case can happen
if the power fault detected bit is set during handling.

http://sourceforge.net/mailarchive/message.php?msg_id=20051130135409.A14918%40unix-os.sc.intel.com

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/pciehp_hpc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 7a16c6897bb..c1a312f2406 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -672,10 +672,11 @@ static irqreturn_t pcie_isr(int irq, void *dev_id)
 		detected &= (PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD |
 			     PCI_EXP_SLTSTA_MRLSC | PCI_EXP_SLTSTA_PDC |
 			     PCI_EXP_SLTSTA_CC);
+		detected &= ~intr_loc;
 		intr_loc |= detected;
 		if (!intr_loc)
 			return IRQ_NONE;
-		if (detected && pciehp_writew(ctrl, PCI_EXP_SLTSTA, detected)) {
+		if (detected && pciehp_writew(ctrl, PCI_EXP_SLTSTA, intr_loc)) {
 			ctrl_err(ctrl, "%s: Cannot write to SLOTSTATUS\n",
 				 __func__);
 			return IRQ_NONE;
-- 
cgit v1.2.3


From 99f0169c17f334a11b0ace91188501c612f3e1e6 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 3 Feb 2009 15:06:16 +0900
Subject: PCI: pciehp: enable software notification on empty slots

Current pciehp disables software notification of adapter presence
changed event and MRL changed event when slot is turned off. Because
of this, there is no way to detect those events on empty slots in the
current pciehp implementation.

According to the past discussion(*), this behavior was introduced to
prevent endless loop that could happen if pcie_isr() runs after power
fault is detected on a certain platform whose stickey power-fault bit
remains on till the slot is powered on again.

(*) http://sourceforge.net/mailarchive/message.php?msg_id=20051130135409.A14918%40unix-os.sc.intel.com

I think this endless loop can be avoided using one bit flag that
indicates power fault had been detected, instead of disabling software
notification of adapter present changed event and MRL changed event.

With this patch, we can enable software notification mechanism of
presence changed and MRL changed event on the empty slots again.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/pciehp.h     |  1 +
 drivers/pci/hotplug/pciehp_hpc.c | 31 +++++++++++--------------------
 2 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 1f887f64e49..2bf8d28062e 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -112,6 +112,7 @@ struct controller {
 	unsigned int no_cmd_complete:1;
 	unsigned int link_active_reporting:1;
 	unsigned int notification_enabled:1;
+	unsigned int power_fault_detected;
 };
 
 #define INT_BUTTON_IGNORE		0
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index c1a312f2406..07bd3215114 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -548,23 +548,21 @@ static int hpc_power_on_slot(struct slot * slot)
 
 	slot_cmd = POWER_ON;
 	cmd_mask = PCI_EXP_SLTCTL_PCC;
-	/* Enable detection that we turned off at slot power-off time */
 	if (!pciehp_poll_mode) {
-		slot_cmd |= (PCI_EXP_SLTCTL_PFDE | PCI_EXP_SLTCTL_MRLSCE |
-			     PCI_EXP_SLTCTL_PDCE);
-		cmd_mask |= (PCI_EXP_SLTCTL_PFDE | PCI_EXP_SLTCTL_MRLSCE |
-			     PCI_EXP_SLTCTL_PDCE);
+		/* Enable power fault detection turned off at power off time */
+		slot_cmd |= PCI_EXP_SLTCTL_PFDE;
+		cmd_mask |= PCI_EXP_SLTCTL_PFDE;
 	}
 
 	retval = pcie_write_cmd(ctrl, slot_cmd, cmd_mask);
-
 	if (retval) {
 		ctrl_err(ctrl, "Write %x command failed!\n", slot_cmd);
-		return -1;
+		return retval;
 	}
 	ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n",
 		 __func__, ctrl->cap_base + PCI_EXP_SLTCTL, slot_cmd);
 
+	ctrl->power_fault_detected = 0;
 	return retval;
 }
 
@@ -621,18 +619,10 @@ static int hpc_power_off_slot(struct slot * slot)
 
 	slot_cmd = POWER_OFF;
 	cmd_mask = PCI_EXP_SLTCTL_PCC;
-	/*
-	 * If we get MRL or presence detect interrupts now, the isr
-	 * will notice the sticky power-fault bit too and issue power
-	 * indicator change commands. This will lead to an endless loop
-	 * of command completions, since the power-fault bit remains on
-	 * till the slot is powered on again.
-	 */
 	if (!pciehp_poll_mode) {
-		slot_cmd &= ~(PCI_EXP_SLTCTL_PFDE | PCI_EXP_SLTCTL_MRLSCE |
-			      PCI_EXP_SLTCTL_PDCE);
-		cmd_mask |= (PCI_EXP_SLTCTL_PFDE | PCI_EXP_SLTCTL_MRLSCE |
-			     PCI_EXP_SLTCTL_PDCE);
+		/* Disable power fault detection */
+		slot_cmd &= ~PCI_EXP_SLTCTL_PFDE;
+		cmd_mask |= PCI_EXP_SLTCTL_PFDE;
 	}
 
 	retval = pcie_write_cmd(ctrl, slot_cmd, cmd_mask);
@@ -710,9 +700,10 @@ static irqreturn_t pcie_isr(int irq, void *dev_id)
 		pciehp_handle_presence_change(p_slot);
 
 	/* Check Power Fault Detected */
-	if (intr_loc & PCI_EXP_SLTSTA_PFD)
+	if ((intr_loc & PCI_EXP_SLTSTA_PFD) && !ctrl->power_fault_detected) {
+		ctrl->power_fault_detected = 1;
 		pciehp_handle_power_fault(p_slot);
-
+	}
 	return IRQ_HANDLED;
 }
 
-- 
cgit v1.2.3


From 6a82e21823058eea95325005b79f3b8c9492460f Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 3 Feb 2009 15:06:18 +0900
Subject: PCI: pciehp: make cmd_busy flag one bit

The cmd_busy field in struct controller takes only two values 0 or
1. So it should be one bit.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/pciehp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 2bf8d28062e..0a368547e63 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -108,7 +108,7 @@ struct controller {
 	u32 slot_cap;
 	u8 cap_base;
 	struct timer_list poll_timer;
-	int cmd_busy;
+	unsigned int cmd_busy:1;
 	unsigned int no_cmd_complete:1;
 	unsigned int link_active_reporting:1;
 	unsigned int notification_enabled:1;
-- 
cgit v1.2.3


From 62795041418dd63cd9ff6ff7bbdf1d1c513c189b Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Wed, 4 Feb 2009 11:25:22 -0700
Subject: PCI: enhance physical slot debug information

Convert usages of pr_debug to dev_dbg and add physical slot name.

Note that we use dev_dbg on the struct pci_bus and still manually
print out the PCI slot number (instead of calling dev_dbg on a
pci_dev) because a struct pci_bus with empty physical slots will
not have any pci_devs.

Reviewed-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/slot.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index 5a8ccb4f604..21189447e54 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -1,8 +1,8 @@
 /*
  * drivers/pci/slot.c
  * Copyright (C) 2006 Matthew Wilcox <matthew@wil.cx>
- * Copyright (C) 2006-2008 Hewlett-Packard Development Company, L.P.
- * 	Alex Chiang <achiang@hp.com>
+ * Copyright (C) 2006-2009 Hewlett-Packard Development Company, L.P.
+ *	Alex Chiang <achiang@hp.com>
  */
 
 #include <linux/kobject.h>
@@ -52,8 +52,8 @@ static void pci_slot_release(struct kobject *kobj)
 	struct pci_dev *dev;
 	struct pci_slot *slot = to_pci_slot(kobj);
 
-	pr_debug("%s: releasing pci_slot on %x:%d\n", __func__,
-		 slot->bus->number, slot->number);
+	dev_dbg(&slot->bus->dev, "dev %02x, released physical slot %s\n",
+		slot->number, pci_slot_name(slot));
 
 	list_for_each_entry(dev, &slot->bus->devices, bus_list)
 		if (PCI_SLOT(dev->devfn) == slot->number)
@@ -248,9 +248,8 @@ placeholder:
 		if (PCI_SLOT(dev->devfn) == slot_nr)
 			dev->slot = slot;
 
-	/* Don't care if debug printk has a -1 for slot_nr */
-	pr_debug("%s: created pci_slot on %04x:%02x:%02x\n",
-		 __func__, pci_domain_nr(parent), parent->number, slot_nr);
+	dev_dbg(&parent->dev, "dev %02x, created physical slot %s\n",
+		slot_nr, pci_slot_name(slot));
 
 out:
 	kfree(slot_name);
@@ -299,9 +298,8 @@ EXPORT_SYMBOL_GPL(pci_renumber_slot);
  */
 void pci_destroy_slot(struct pci_slot *slot)
 {
-	pr_debug("%s: dec refcount to %d on %04x:%02x:%02x\n", __func__,
-		 atomic_read(&slot->kobj.kref.refcount) - 1,
-		 pci_domain_nr(slot->bus), slot->bus->number, slot->number);
+	dev_dbg(&slot->bus->dev, "dev %02x, dec refcount to %d\n",
+		slot->number, atomic_read(&slot->kobj.kref.refcount) - 1);
 
 	down_write(&pci_bus_sem);
 	kobject_put(&slot->kobj);
-- 
cgit v1.2.3


From 4c9c16867e4980fbd7d1fcc9516c9269ecb4d06f Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Mon, 8 Dec 2008 16:19:14 +0100
Subject: PCI quirk: don't mark one netmos as class other

Let it stay as serial, since it doesn't have subdevice in the form of 0x00PS.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/quirks.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 92b9efe9bca..5aa2afb23ef 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -1664,9 +1664,13 @@ static void __devinit quirk_netmos(struct pci_dev *dev)
 	 * of parallel ports and <S> is the number of serial ports.
 	 */
 	switch (dev->device) {
+	case PCI_DEVICE_ID_NETMOS_9835:
+		/* Well, this rule doesn't hold for the following 9835 device */
+		if (dev->subsystem_vendor == PCI_VENDOR_ID_IBM &&
+				dev->subsystem_device == 0x0299)
+			return;
 	case PCI_DEVICE_ID_NETMOS_9735:
 	case PCI_DEVICE_ID_NETMOS_9745:
-	case PCI_DEVICE_ID_NETMOS_9835:
 	case PCI_DEVICE_ID_NETMOS_9845:
 	case PCI_DEVICE_ID_NETMOS_9855:
 		if ((dev->class >> 8) == PCI_CLASS_COMMUNICATION_SERIAL &&
-- 
cgit v1.2.3


From 5fe5db05f64d0d10b563b1c13b58e4a52b190686 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 9 Feb 2009 14:53:47 +0800
Subject: PCI: Speed up device reset function

For all devices need to do function level reset, currently we need wait for
at least 200ms, which can be too long if we have lots of devices...

The patch checked pending bit before msleep() to skip some unnecessary
sleeping interval.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.c | 46 +++++++++++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 5737b8a9a73..0b3e20f1b6f 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2028,18 +2028,24 @@ static int __pcie_flr(struct pci_dev *dev, int probe)
 	pci_block_user_cfg_access(dev);
 
 	/* Wait for Transaction Pending bit clean */
+	pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status);
+	if (!(status & PCI_EXP_DEVSTA_TRPND))
+		goto transaction_done;
+
 	msleep(100);
 	pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status);
-	if (status & PCI_EXP_DEVSTA_TRPND) {
-		dev_info(&dev->dev, "Busy after 100ms while trying to reset; "
+	if (!(status & PCI_EXP_DEVSTA_TRPND))
+		goto transaction_done;
+
+	dev_info(&dev->dev, "Busy after 100ms while trying to reset; "
 			"sleeping for 1 second\n");
-		ssleep(1);
-		pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status);
-		if (status & PCI_EXP_DEVSTA_TRPND)
-			dev_info(&dev->dev, "Still busy after 1s; "
+	ssleep(1);
+	pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status);
+	if (status & PCI_EXP_DEVSTA_TRPND)
+		dev_info(&dev->dev, "Still busy after 1s; "
 				"proceeding with reset anyway\n");
-	}
 
+transaction_done:
 	pci_write_config_word(dev, exppos + PCI_EXP_DEVCTL,
 				PCI_EXP_DEVCTL_BCR_FLR);
 	mdelay(100);
@@ -2066,18 +2072,24 @@ static int __pci_af_flr(struct pci_dev *dev, int probe)
 	pci_block_user_cfg_access(dev);
 
 	/* Wait for Transaction Pending bit clean */
+	pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status);
+	if (!(status & PCI_AF_STATUS_TP))
+		goto transaction_done;
+
 	msleep(100);
 	pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status);
-	if (status & PCI_AF_STATUS_TP) {
-		dev_info(&dev->dev, "Busy after 100ms while trying to"
-				" reset; sleeping for 1 second\n");
-		ssleep(1);
-		pci_read_config_byte(dev,
-				cappos + PCI_AF_STATUS, &status);
-		if (status & PCI_AF_STATUS_TP)
-			dev_info(&dev->dev, "Still busy after 1s; "
-					"proceeding with reset anyway\n");
-	}
+	if (!(status & PCI_AF_STATUS_TP))
+		goto transaction_done;
+
+	dev_info(&dev->dev, "Busy after 100ms while trying to"
+			" reset; sleeping for 1 second\n");
+	ssleep(1);
+	pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status);
+	if (status & PCI_AF_STATUS_TP)
+		dev_info(&dev->dev, "Still busy after 1s; "
+				"proceeding with reset anyway\n");
+
+transaction_done:
 	pci_write_config_byte(dev, cappos + PCI_AF_CTRL, PCI_AF_CTRL_FLR);
 	mdelay(100);
 
-- 
cgit v1.2.3


From 63f10f0f6df4e4e860b790d64bebfde85b540b0a Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Mon, 9 Feb 2009 15:59:29 +0900
Subject: PCI/ACPI: move _OSC code to pci_root.c

Move PCI _OSC management code from drivers/pci/pci-acpi.c to
drivers/acpi/pci_root.c. The benefits are

- We no longer need struct osc_data and its management code (contents
  are moved to struct acpi_pci_root). This simplify the code, and we
  no longer care about kmalloc() failure.

- We can make pci_acpi_osc_support() be a static function, which is
  called only from drivers/acpi/pci_root.c.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Reviewed-by: Andrew Patterson <andrew.patterson@hp.com>
Tested-by: Andrew Patterson <andrew.patterson@hp.com>
Acked-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/acpi/pci_root.c  | 180 ++++++++++++++++++++++++++++++++++++++-
 drivers/pci/pci-acpi.c   | 215 -----------------------------------------------
 include/linux/pci-acpi.h |   1 -
 3 files changed, 178 insertions(+), 218 deletions(-)

diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 5b38a026d12..979eccc82c5 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -66,11 +66,18 @@ struct acpi_pci_root {
 	struct acpi_device * device;
 	struct acpi_pci_id id;
 	struct pci_bus *bus;
+
+	u32 osc_support_set;	/* _OSC state of support bits */
+	u32 osc_control_set;	/* _OSC state of control bits */
+	u32 osc_control_qry;	/* the latest _OSC query result */
+
+	u32 osc_queried:1;	/* has _OSC control been queried? */
 };
 
 static LIST_HEAD(acpi_pci_roots);
 
 static struct acpi_pci_driver *sub_driver;
+static DEFINE_MUTEX(osc_lock);
 
 int acpi_pci_register_driver(struct acpi_pci_driver *driver)
 {
@@ -185,6 +192,175 @@ static void acpi_pci_bridge_scan(struct acpi_device *device)
 		}
 }
 
+static u8 OSC_UUID[16] = {0x5B, 0x4D, 0xDB, 0x33, 0xF7, 0x1F, 0x1C, 0x40,
+			  0x96, 0x57, 0x74, 0x41, 0xC0, 0x3D, 0xD7, 0x66};
+
+static acpi_status acpi_pci_run_osc(acpi_handle handle,
+				    const u32 *capbuf, u32 *retval)
+{
+	acpi_status status;
+	struct acpi_object_list input;
+	union acpi_object in_params[4];
+	struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
+	union acpi_object *out_obj;
+	u32 errors;
+
+	/* Setting up input parameters */
+	input.count = 4;
+	input.pointer = in_params;
+	in_params[0].type 		= ACPI_TYPE_BUFFER;
+	in_params[0].buffer.length 	= 16;
+	in_params[0].buffer.pointer	= OSC_UUID;
+	in_params[1].type 		= ACPI_TYPE_INTEGER;
+	in_params[1].integer.value 	= 1;
+	in_params[2].type 		= ACPI_TYPE_INTEGER;
+	in_params[2].integer.value	= 3;
+	in_params[3].type		= ACPI_TYPE_BUFFER;
+	in_params[3].buffer.length 	= 12;
+	in_params[3].buffer.pointer 	= (u8 *)capbuf;
+
+	status = acpi_evaluate_object(handle, "_OSC", &input, &output);
+	if (ACPI_FAILURE(status))
+		return status;
+
+	if (!output.length)
+		return AE_NULL_OBJECT;
+
+	out_obj = output.pointer;
+	if (out_obj->type != ACPI_TYPE_BUFFER) {
+		printk(KERN_DEBUG "_OSC evaluation returned wrong type\n");
+		status = AE_TYPE;
+		goto out_kfree;
+	}
+	/* Need to ignore the bit0 in result code */
+	errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
+	if (errors) {
+		if (errors & OSC_REQUEST_ERROR)
+			printk(KERN_DEBUG "_OSC request failed\n");
+		if (errors & OSC_INVALID_UUID_ERROR)
+			printk(KERN_DEBUG "_OSC invalid UUID\n");
+		if (errors & OSC_INVALID_REVISION_ERROR)
+			printk(KERN_DEBUG "_OSC invalid revision\n");
+		if (errors & OSC_CAPABILITIES_MASK_ERROR) {
+			if (capbuf[OSC_QUERY_TYPE] & OSC_QUERY_ENABLE)
+				goto out_success;
+			printk(KERN_DEBUG
+			       "Firmware did not grant requested _OSC control\n");
+			status = AE_SUPPORT;
+			goto out_kfree;
+		}
+		status = AE_ERROR;
+		goto out_kfree;
+	}
+out_success:
+	*retval = *((u32 *)(out_obj->buffer.pointer + 8));
+	status = AE_OK;
+
+out_kfree:
+	kfree(output.pointer);
+	return status;
+}
+
+static acpi_status acpi_pci_query_osc(struct acpi_pci_root *root, u32 flags)
+{
+	acpi_status status;
+	u32 support_set, result, capbuf[3];
+
+	/* do _OSC query for all possible controls */
+	support_set = root->osc_support_set | (flags & OSC_SUPPORT_MASKS);
+	capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE;
+	capbuf[OSC_SUPPORT_TYPE] = support_set;
+	capbuf[OSC_CONTROL_TYPE] = OSC_CONTROL_MASKS;
+
+	status = acpi_pci_run_osc(root->device->handle, capbuf, &result);
+	if (ACPI_SUCCESS(status)) {
+		root->osc_support_set = support_set;
+		root->osc_control_qry = result;
+		root->osc_queried = 1;
+	}
+	return status;
+}
+
+static acpi_status acpi_pci_osc_support(struct acpi_pci_root *root, u32 flags)
+{
+	acpi_status status;
+	acpi_handle tmp;
+
+	status = acpi_get_handle(root->device->handle, "_OSC", &tmp);
+	if (ACPI_FAILURE(status))
+		return status;
+	mutex_lock(&osc_lock);
+	status = acpi_pci_query_osc(root, flags);
+	mutex_unlock(&osc_lock);
+	return status;
+}
+
+static struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle)
+{
+	struct acpi_pci_root *root;
+	list_for_each_entry(root, &acpi_pci_roots, node) {
+		if (root->device->handle == handle)
+			return root;
+	}
+	return NULL;
+}
+
+/**
+ * pci_osc_control_set - commit requested control to Firmware
+ * @handle: acpi_handle for the target ACPI object
+ * @flags: driver's requested control bits
+ *
+ * Attempt to take control from Firmware on requested control bits.
+ **/
+acpi_status pci_osc_control_set(acpi_handle handle, u32 flags)
+{
+	acpi_status status;
+	u32 control_req, result, capbuf[3];
+	acpi_handle tmp;
+	struct acpi_pci_root *root;
+
+	status = acpi_get_handle(handle, "_OSC", &tmp);
+	if (ACPI_FAILURE(status))
+		return status;
+
+	control_req = (flags & OSC_CONTROL_MASKS);
+	if (!control_req)
+		return AE_TYPE;
+
+	root = acpi_pci_find_root(handle);
+	if (!root)
+		return AE_NOT_EXIST;
+
+	mutex_lock(&osc_lock);
+	/* No need to evaluate _OSC if the control was already granted. */
+	if ((root->osc_control_set & control_req) == control_req)
+		goto out;
+
+	/* Need to query controls first before requesting them */
+	if (!root->osc_queried) {
+		status = acpi_pci_query_osc(root, root->osc_support_set);
+		if (ACPI_FAILURE(status))
+			goto out;
+	}
+	if ((root->osc_control_qry & control_req) != control_req) {
+		printk(KERN_DEBUG
+		       "Firmware did not grant requested _OSC control\n");
+		status = AE_SUPPORT;
+		goto out;
+	}
+
+	capbuf[OSC_QUERY_TYPE] = 0;
+	capbuf[OSC_SUPPORT_TYPE] = root->osc_support_set;
+	capbuf[OSC_CONTROL_TYPE] = root->osc_control_set | control_req;
+	status = acpi_pci_run_osc(handle, capbuf, &result);
+	if (ACPI_SUCCESS(status))
+		root->osc_control_set = result;
+out:
+	mutex_unlock(&osc_lock);
+	return status;
+}
+EXPORT_SYMBOL(pci_osc_control_set);
+
 static int __devinit acpi_pci_root_add(struct acpi_device *device)
 {
 	int result = 0;
@@ -217,7 +393,7 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device)
 	 * PCI domains, so we indicate this in _OSC support capabilities.
 	 */
 	flags = base_flags = OSC_PCI_SEGMENT_GROUPS_SUPPORT;
-	pci_acpi_osc_support(device->handle, flags);
+	acpi_pci_osc_support(root, flags);
 
 	/* 
 	 * Segment
@@ -353,7 +529,7 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device)
 	if (pci_msi_enabled())
 		flags |= OSC_MSI_SUPPORT;
 	if (flags != base_flags)
-		pci_acpi_osc_support(device->handle, flags);
+		acpi_pci_osc_support(root, flags);
 
       end:
 	if (result) {
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index deea8a187eb..fac5eddcefd 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -18,221 +18,6 @@
 #include <linux/pci-acpi.h>
 #include "pci.h"
 
-struct acpi_osc_data {
-	acpi_handle handle;
-	u32 support_set;
-	u32 control_set;
-	u32 control_query;
-	int is_queried;
-	struct list_head sibiling;
-};
-static LIST_HEAD(acpi_osc_data_list);
-
-struct acpi_osc_args {
-	u32 capbuf[3];
-};
-
-static DEFINE_MUTEX(pci_acpi_lock);
-
-static struct acpi_osc_data *acpi_get_osc_data(acpi_handle handle)
-{
-	struct acpi_osc_data *data;
-
-	list_for_each_entry(data, &acpi_osc_data_list, sibiling) {
-		if (data->handle == handle)
-			return data;
-	}
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		return NULL;
-	INIT_LIST_HEAD(&data->sibiling);
-	data->handle = handle;
-	list_add_tail(&data->sibiling, &acpi_osc_data_list);
-	return data;
-}
-
-static u8 OSC_UUID[16] = {0x5B, 0x4D, 0xDB, 0x33, 0xF7, 0x1F, 0x1C, 0x40,
-			  0x96, 0x57, 0x74, 0x41, 0xC0, 0x3D, 0xD7, 0x66};
-
-static acpi_status acpi_run_osc(acpi_handle handle,
-				struct acpi_osc_args *osc_args, u32 *retval)
-{
-	acpi_status status;
-	struct acpi_object_list input;
-	union acpi_object in_params[4];
-	struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
-	union acpi_object *out_obj;
-	u32 errors, flags = osc_args->capbuf[OSC_QUERY_TYPE];
-
-	/* Setting up input parameters */
-	input.count = 4;
-	input.pointer = in_params;
-	in_params[0].type 		= ACPI_TYPE_BUFFER;
-	in_params[0].buffer.length 	= 16;
-	in_params[0].buffer.pointer	= OSC_UUID;
-	in_params[1].type 		= ACPI_TYPE_INTEGER;
-	in_params[1].integer.value 	= 1;
-	in_params[2].type 		= ACPI_TYPE_INTEGER;
-	in_params[2].integer.value	= 3;
-	in_params[3].type		= ACPI_TYPE_BUFFER;
-	in_params[3].buffer.length 	= 12;
-	in_params[3].buffer.pointer 	= (u8 *)osc_args->capbuf;
-
-	status = acpi_evaluate_object(handle, "_OSC", &input, &output);
-	if (ACPI_FAILURE(status))
-		return status;
-
-	if (!output.length)
-		return AE_NULL_OBJECT;
-
-	out_obj = output.pointer;
-	if (out_obj->type != ACPI_TYPE_BUFFER) {
-		printk(KERN_DEBUG "Evaluate _OSC returns wrong type\n");
-		status = AE_TYPE;
-		goto out_kfree;
-	}
-	/* Need to ignore the bit0 in result code */
-	errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
-	if (errors) {
-		if (errors & OSC_REQUEST_ERROR)
-			printk(KERN_DEBUG "_OSC request fails\n"); 
-		if (errors & OSC_INVALID_UUID_ERROR)
-			printk(KERN_DEBUG "_OSC invalid UUID\n"); 
-		if (errors & OSC_INVALID_REVISION_ERROR)
-			printk(KERN_DEBUG "_OSC invalid revision\n"); 
-		if (errors & OSC_CAPABILITIES_MASK_ERROR) {
-			if (flags & OSC_QUERY_ENABLE)
-				goto out_success;
-			printk(KERN_DEBUG "_OSC FW not grant req. control\n");
-			status = AE_SUPPORT;
-			goto out_kfree;
-		}
-		status = AE_ERROR;
-		goto out_kfree;
-	}
-out_success:
-	*retval = *((u32 *)(out_obj->buffer.pointer + 8));
-	status = AE_OK;
-
-out_kfree:
-	kfree(output.pointer);
-	return status;
-}
-
-static acpi_status __acpi_query_osc(u32 flags, struct acpi_osc_data *osc_data)
-{
-	acpi_status status;
-	u32 support_set, result;
-	struct acpi_osc_args osc_args;
-
-	/* do _OSC query for all possible controls */
-	support_set = osc_data->support_set | (flags & OSC_SUPPORT_MASKS);
-	osc_args.capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE;
-	osc_args.capbuf[OSC_SUPPORT_TYPE] = support_set;
-	osc_args.capbuf[OSC_CONTROL_TYPE] = OSC_CONTROL_MASKS;
-
-	status = acpi_run_osc(osc_data->handle, &osc_args, &result);
-	if (ACPI_SUCCESS(status)) {
-		osc_data->support_set = support_set;
-		osc_data->control_query = result;
-		osc_data->is_queried = 1;
-	}
-
-	return status;
-}
-
-/*
- * pci_acpi_osc_support: Invoke _OSC indicating support for the given feature
- * @flags: Bitmask of flags to support
- *
- * See the ACPI spec for the definition of the flags
- */
-int pci_acpi_osc_support(acpi_handle handle, u32 flags)
-{
-	acpi_status status;
-	acpi_handle tmp;
-	struct acpi_osc_data *osc_data;
-	int rc = 0;
-
-	status = acpi_get_handle(handle, "_OSC", &tmp);
-	if (ACPI_FAILURE(status))
-		return -ENOTTY;
-
-	mutex_lock(&pci_acpi_lock);
-	osc_data = acpi_get_osc_data(handle);
-	if (!osc_data) {
-		printk(KERN_ERR "acpi osc data array is full\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	__acpi_query_osc(flags, osc_data);
-out:
-	mutex_unlock(&pci_acpi_lock);
-	return rc;
-}
-
-/**
- * pci_osc_control_set - commit requested control to Firmware
- * @handle: acpi_handle for the target ACPI object
- * @flags: driver's requested control bits
- *
- * Attempt to take control from Firmware on requested control bits.
- **/
-acpi_status pci_osc_control_set(acpi_handle handle, u32 flags)
-{
-	acpi_status status;
-	u32 control_req, control_set, result;
-	acpi_handle tmp;
-	struct acpi_osc_data *osc_data;
-	struct acpi_osc_args osc_args;
-
-	status = acpi_get_handle(handle, "_OSC", &tmp);
-	if (ACPI_FAILURE(status))
-		return status;
-
-	mutex_lock(&pci_acpi_lock);
-	osc_data = acpi_get_osc_data(handle);
-	if (!osc_data) {
-		printk(KERN_ERR "acpi osc data array is full\n");
-		status = AE_ERROR;
-		goto out;
-	}
-
-	control_req = (flags & OSC_CONTROL_MASKS);
-	if (!control_req) {
-		status = AE_TYPE;
-		goto out;
-	}
-
-	/* No need to evaluate _OSC if the control was already granted. */
-	if ((osc_data->control_set & control_req) == control_req)
-		goto out;
-
-	if (!osc_data->is_queried) {
-		status = __acpi_query_osc(osc_data->support_set, osc_data);
-		if (ACPI_FAILURE(status))
-			goto out;
-	}
-
-	if ((osc_data->control_query & control_req) != control_req) {
-		status = AE_SUPPORT;
-		goto out;
-	}
-
-	control_set = osc_data->control_set | control_req;
-	osc_args.capbuf[OSC_QUERY_TYPE] = 0;
-	osc_args.capbuf[OSC_SUPPORT_TYPE] = osc_data->support_set;
-	osc_args.capbuf[OSC_CONTROL_TYPE] = control_set;
-	status = acpi_run_osc(handle, &osc_args, &result);
-	if (ACPI_SUCCESS(status))
-		osc_data->control_set = result;
-out:
-	mutex_unlock(&pci_acpi_lock);
-	return status;
-}
-EXPORT_SYMBOL(pci_osc_control_set);
-
 /*
  * _SxD returns the D-state with the highest power
  * (lowest D-state number) supported in the S-state "x".
diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index 042c166f65d..65cb103b21d 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -50,7 +50,6 @@
 
 #ifdef CONFIG_ACPI
 extern acpi_status pci_osc_control_set(acpi_handle handle, u32 flags);
-int pci_acpi_osc_support(acpi_handle handle, u32 flags);
 static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev)
 {
 	/* Find root host bridge */
-- 
cgit v1.2.3


From 9f5404d8ea90bfa4d58a3936e5a3d0d28cecf60f Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Mon, 9 Feb 2009 16:00:04 +0900
Subject: PCI/ACPI: rename pci_osc_control_set()

- Rename pci_osc_control_set() to acpi_pci_osc_control_set() according
  to the other API names in drivers/acpi/pci_root.c.

- Move _OSC related definitions to include/linux/acpi.h because _OSC
  related API is implemented in drivers/acpi/pci_root.c now.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Reviewed-by: Andrew Patterson <andrew.patterson@hp.com>
Tested-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/acpi/pci_root.c            |  6 ++---
 drivers/pci/hotplug/acpi_pcihp.c   |  5 ++---
 drivers/pci/pcie/aer/aerdrv_acpi.c |  2 +-
 include/linux/acpi.h               | 34 ++++++++++++++++++++++++++++
 include/linux/pci-acpi.h           | 45 --------------------------------------
 5 files changed, 40 insertions(+), 52 deletions(-)

diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 979eccc82c5..196f97d0095 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -306,13 +306,13 @@ static struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle)
 }
 
 /**
- * pci_osc_control_set - commit requested control to Firmware
+ * acpi_pci_osc_control_set - commit requested control to Firmware
  * @handle: acpi_handle for the target ACPI object
  * @flags: driver's requested control bits
  *
  * Attempt to take control from Firmware on requested control bits.
  **/
-acpi_status pci_osc_control_set(acpi_handle handle, u32 flags)
+acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags)
 {
 	acpi_status status;
 	u32 control_req, result, capbuf[3];
@@ -359,7 +359,7 @@ out:
 	mutex_unlock(&osc_lock);
 	return status;
 }
-EXPORT_SYMBOL(pci_osc_control_set);
+EXPORT_SYMBOL(acpi_pci_osc_control_set);
 
 static int __devinit acpi_pci_root_add(struct acpi_device *device)
 {
diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c
index 1c114180106..f47bc74be56 100644
--- a/drivers/pci/hotplug/acpi_pcihp.c
+++ b/drivers/pci/hotplug/acpi_pcihp.c
@@ -30,9 +30,8 @@
 #include <linux/types.h>
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
+#include <linux/acpi.h>
 #include <linux/pci-acpi.h>
-#include <acpi/acpi.h>
-#include <acpi/acpi_bus.h>
 
 #define MY_NAME	"acpi_pcihp"
 
@@ -408,7 +407,7 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags)
 		acpi_get_name(handle, ACPI_FULL_PATHNAME, &string);
 		dbg("Trying to get hotplug control for %s\n",
 				(char *)string.pointer);
-		status = pci_osc_control_set(handle, flags);
+		status = acpi_pci_osc_control_set(handle, flags);
 		if (ACPI_SUCCESS(status))
 			goto got_one;
 		kfree(string.pointer);
diff --git a/drivers/pci/pcie/aer/aerdrv_acpi.c b/drivers/pci/pcie/aer/aerdrv_acpi.c
index ebce26c3704..8edb2f300e8 100644
--- a/drivers/pci/pcie/aer/aerdrv_acpi.c
+++ b/drivers/pci/pcie/aer/aerdrv_acpi.c
@@ -38,7 +38,7 @@ int aer_osc_setup(struct pcie_device *pciedev)
 
 	handle = acpi_find_root_bridge_handle(pdev);
 	if (handle) {
-		status = pci_osc_control_set(handle,
+		status = acpi_pci_osc_control_set(handle,
 					OSC_PCI_EXPRESS_AER_CONTROL |
 					OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL);
 	}
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 6fce2fc2d12..2a3b189e3e2 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -256,6 +256,40 @@ void __init acpi_no_s4_hw_signature(void);
 void __init acpi_old_suspend_ordering(void);
 void __init acpi_s4_no_nvs(void);
 #endif /* CONFIG_PM_SLEEP */
+
+#define OSC_QUERY_TYPE			0
+#define OSC_SUPPORT_TYPE 		1
+#define OSC_CONTROL_TYPE		2
+#define OSC_SUPPORT_MASKS		0x1f
+
+/* _OSC DW0 Definition */
+#define OSC_QUERY_ENABLE		1
+#define OSC_REQUEST_ERROR		2
+#define OSC_INVALID_UUID_ERROR		4
+#define OSC_INVALID_REVISION_ERROR	8
+#define OSC_CAPABILITIES_MASK_ERROR	16
+
+/* _OSC DW1 Definition (OS Support Fields) */
+#define OSC_EXT_PCI_CONFIG_SUPPORT		1
+#define OSC_ACTIVE_STATE_PWR_SUPPORT 		2
+#define OSC_CLOCK_PWR_CAPABILITY_SUPPORT	4
+#define OSC_PCI_SEGMENT_GROUPS_SUPPORT		8
+#define OSC_MSI_SUPPORT				16
+
+/* _OSC DW1 Definition (OS Control Fields) */
+#define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL	1
+#define OSC_SHPC_NATIVE_HP_CONTROL 		2
+#define OSC_PCI_EXPRESS_PME_CONTROL		4
+#define OSC_PCI_EXPRESS_AER_CONTROL		8
+#define OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL	16
+
+#define OSC_CONTROL_MASKS 	(OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | 	\
+				OSC_SHPC_NATIVE_HP_CONTROL | 		\
+				OSC_PCI_EXPRESS_PME_CONTROL |		\
+				OSC_PCI_EXPRESS_AER_CONTROL |		\
+				OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL)
+
+extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags);
 #else	/* CONFIG_ACPI */
 
 static inline int early_acpi_boot_init(void)
diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index 65cb103b21d..20480b9f10c 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -10,46 +10,7 @@
 
 #include <linux/acpi.h>
 
-#define OSC_QUERY_TYPE			0
-#define OSC_SUPPORT_TYPE 		1
-#define OSC_CONTROL_TYPE		2
-#define OSC_SUPPORT_MASKS		0x1f
-
-/*
- * _OSC DW0 Definition 
- */
-#define OSC_QUERY_ENABLE		1
-#define OSC_REQUEST_ERROR		2
-#define OSC_INVALID_UUID_ERROR		4
-#define OSC_INVALID_REVISION_ERROR	8
-#define OSC_CAPABILITIES_MASK_ERROR	16
-
-/*
- * _OSC DW1 Definition (OS Support Fields)
- */
-#define OSC_EXT_PCI_CONFIG_SUPPORT		1
-#define OSC_ACTIVE_STATE_PWR_SUPPORT 		2
-#define OSC_CLOCK_PWR_CAPABILITY_SUPPORT	4
-#define OSC_PCI_SEGMENT_GROUPS_SUPPORT		8
-#define OSC_MSI_SUPPORT				16
-
-/*
- * _OSC DW1 Definition (OS Control Fields)
- */
-#define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL	1
-#define OSC_SHPC_NATIVE_HP_CONTROL 		2
-#define OSC_PCI_EXPRESS_PME_CONTROL		4
-#define OSC_PCI_EXPRESS_AER_CONTROL		8
-#define OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL	16
-
-#define OSC_CONTROL_MASKS 	(OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | 	\
-				OSC_SHPC_NATIVE_HP_CONTROL | 		\
-				OSC_PCI_EXPRESS_PME_CONTROL |		\
-				OSC_PCI_EXPRESS_AER_CONTROL |		\
-				OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL)
-
 #ifdef CONFIG_ACPI
-extern acpi_status pci_osc_control_set(acpi_handle handle, u32 flags);
 static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev)
 {
 	/* Find root host bridge */
@@ -69,12 +30,6 @@ static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus)
 	return acpi_get_pci_rootbridge_handle(seg, busnr);
 }
 #else
-#if !defined(AE_ERROR)
-typedef u32 		acpi_status;
-#define AE_ERROR      	(acpi_status) (0x0001)
-#endif    
-static inline acpi_status pci_osc_control_set(acpi_handle handle, u32 flags)
-{return AE_ERROR;}
 static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev)
 { return NULL; }
 #endif
-- 
cgit v1.2.3


From 35e1801ea637810830e653ffe7ff62c7048ae03a Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 11 Feb 2009 21:13:45 +0100
Subject: PCI hotplug: shpchp: fix bus number check to avoid false positive

With for (busnr = 0; busnr <= end; busnr++) { ... } busnr reaches end + 1
after the loop.  So fix the "no busses available" check to look for just
busnr > end rather than >=.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/shpchp_pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/hotplug/shpchp_pci.c b/drivers/pci/hotplug/shpchp_pci.c
index 138f161becc..aa315e52529 100644
--- a/drivers/pci/hotplug/shpchp_pci.c
+++ b/drivers/pci/hotplug/shpchp_pci.c
@@ -137,7 +137,7 @@ int __ref shpchp_configure_device(struct slot *p_slot)
 							busnr))
 					break;
 			}
-			if (busnr >= end) {
+			if (busnr > end) {
 				ctrl_err(ctrl,
 					 "No free bus for hot-added bridge\n");
 				pci_dev_put(dev);
-- 
cgit v1.2.3


From b5fbf53324f65646154e172af350674d5a2a1629 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Wed, 11 Feb 2009 22:27:02 +1100
Subject: PCI/MSI: Allow arch code to return the number of MSI-X available

There is code in msix_capability_init() which, when the requested number
of MSI-X couldn't be allocated, calculates how many MSI-X /could/ be
allocated and returns that to the driver. That allows the driver to then
make a second request, with a number of MSIs that should succeed.

The current code requires the arch code to setup as many msi_descs as it
can, and then return to the generic code. On some platforms the arch
code may already know how many MSI-X it can allocate, before it sets up
any of the msi_descs.

So change the logic such that if the arch code returns a positive error
code, that is taken to be the number of MSI-X that could be allocated.
If the error code is negative we still calculate the number available
using the old method.

Because it's a little subtle, make sure the error return code from
arch_setup_msi_irq() is always negative. That way only implementations
of arch_setup_msi_irqs() need to be careful about returning a positive
error code.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/msi.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 33adf323f06..dceea56f734 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -42,8 +42,10 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
 		ret = arch_setup_msi_irq(dev, entry);
-		if (ret)
+		if (ret < 0)
 			return ret;
+		if (ret > 0)
+			return -ENOSPC;
 	}
 
 	return 0;
@@ -487,7 +489,9 @@ static int msix_capability_init(struct pci_dev *dev,
 	}
 
 	ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX);
-	if (ret) {
+	if (ret < 0) {
+		/* If we had some success report the number of irqs
+		 * we succeeded in setting up. */
 		int avail = 0;
 		list_for_each_entry(entry, &dev->msi_list, list) {
 			if (entry->irq != 0) {
@@ -495,14 +499,13 @@ static int msix_capability_init(struct pci_dev *dev,
 			}
 		}
 
-		msi_free_irqs(dev);
+		if (avail != 0)
+			ret = avail;
+	}
 
-		/* If we had some success report the number of irqs
-		 * we succeeded in setting up.
-		 */
-		if (avail == 0)
-			avail = ret;
-		return avail;
+	if (ret) {
+		msi_free_irqs(dev);
+		return ret;
 	}
 
 	i = 0;
-- 
cgit v1.2.3


From c48f1670f42b71f39f4a3bfba01ffb691cc9206c Mon Sep 17 00:00:00 2001
From: "akpm@linux-foundation.org" <akpm@linux-foundation.org>
Date: Tue, 3 Feb 2009 15:45:26 -0800
Subject: PCI: constify pci_bus_add_devices()

drivers/pci/hotplug/fakephp.c:283: warning: passing argument 1 of 'pci_bus_add_devices' discards qualifiers from pointer target type

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/bus.c   | 2 +-
 include/linux/pci.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 52b54f053be..118c77778d2 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -133,7 +133,7 @@ int pci_bus_add_child(struct pci_bus *bus)
  *
  * Call hotplug for each new devices.
  */
-void pci_bus_add_devices(struct pci_bus *bus)
+void pci_bus_add_devices(const struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 	struct pci_bus *child;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b5d6d0e0f1c..a1af2fe0063 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -528,7 +528,7 @@ void pcibios_update_irq(struct pci_dev *, int irq);
 /* Generic PCI functions used internally */
 
 extern struct pci_bus *pci_find_bus(int domain, int busnr);
-void pci_bus_add_devices(struct pci_bus *bus);
+void pci_bus_add_devices(const struct pci_bus *bus);
 struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus,
 				      struct pci_ops *ops, void *sysdata);
 static inline struct pci_bus * __devinit pci_scan_bus(int bus, struct pci_ops *ops,
-- 
cgit v1.2.3


From ea7415512a07add2b09c070c9a5d1950833cf9b3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 18 Feb 2009 10:44:29 -0800
Subject: PCI: constify pci_bus_assign_resources()

drivers/pci/hotplug/fakephp.c: In function 'pci_rescan_bus':
drivers/pci/hotplug/fakephp.c:271: warning: passing argument 1 of 'pci_bus_assign_resources' discards qualifiers from pointer target type

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/setup-bus.c | 4 ++--
 include/linux/pci.h     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 70460894578..170a3eda9dd 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -27,7 +27,7 @@
 #include <linux/slab.h>
 
 
-static void pbus_assign_resources_sorted(struct pci_bus *bus)
+static void pbus_assign_resources_sorted(const struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 	struct resource *res;
@@ -495,7 +495,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 }
 EXPORT_SYMBOL(pci_bus_size_bridges);
 
-void __ref pci_bus_assign_resources(struct pci_bus *bus)
+void __ref pci_bus_assign_resources(const struct pci_bus *bus)
 {
 	struct pci_bus *b;
 	struct pci_dev *dev;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index a1af2fe0063..7baf2a5db12 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -708,7 +708,7 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void
 int pci_vpd_truncate(struct pci_dev *dev, size_t size);
 
 /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */
-void pci_bus_assign_resources(struct pci_bus *bus);
+void pci_bus_assign_resources(const struct pci_bus *bus);
 void pci_bus_size_bridges(struct pci_bus *bus);
 int pci_claim_resource(struct pci_dev *, int);
 void pci_assign_unassigned_resources(void);
-- 
cgit v1.2.3


From 10a0ef39fbd1d484c2bbc1ffd83d57ecef209140 Mon Sep 17 00:00:00 2001
From: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Date: Tue, 17 Feb 2009 13:46:53 +0300
Subject: PCI/alpha: pci sysfs resources

This closes http://bugzilla.kernel.org/show_bug.cgi?id=10893
which is a showstopper for X development on alpha.

The generic HAVE_PCI_MMAP code (drivers/pci-sysfs.c) is not
very useful since we have to deal with three different types
of MMIO address spaces: sparse and dense mappings for old
ev4/ev5 machines and "normal" 1:1 MMIO space (bwx) for ev56 and
later.
Also "write combine" mappings are meaningless on alpha - roughly
speaking, alpha does write combining, IO reordering and other
optimizations by default, unless user splits IO accesses
with memory barriers.

I think the cleanest way to deal with resource files on alpha
is to convert the default no-op pci_create_resource_files() and
pci_remove_resource_files() for !HAVE_PCI_MMAP case into __weak
functions and override them with alpha specific ones.

Another alpha hook is needed for "legacy_" resource files
to handle sparse addressing (pci_adjust_legacy_attr).

With the "standard" resourceN files on ev56/ev6 libpciaccess
works "out of the box". Handling of resourceN_sparse/resourceN_dense
files on older machines obviously requires some userland work.

Sparse/dense stuff has been tested on sx164 (pca56/pyxis, normally
uses bwx IO) with the kernel hacked into "cia compatible" mode.

Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/alpha/include/asm/pci.h  |  14 ++
 arch/alpha/kernel/Makefile    |   2 +-
 arch/alpha/kernel/pci-sysfs.c | 366 ++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci-sysfs.c       |  19 ++-
 4 files changed, 398 insertions(+), 3 deletions(-)
 create mode 100644 arch/alpha/kernel/pci-sysfs.c

diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h
index 2a14302c17a..cb04eaa6ba3 100644
--- a/arch/alpha/include/asm/pci.h
+++ b/arch/alpha/include/asm/pci.h
@@ -273,4 +273,18 @@ struct pci_dev *alpha_gendev_to_pci(struct device *dev);
 
 extern struct pci_dev *isa_bridge;
 
+extern int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val,
+			   size_t count);
+extern int pci_legacy_write(struct pci_bus *bus, loff_t port, u32 val,
+			    size_t count);
+extern int pci_mmap_legacy_page_range(struct pci_bus *bus,
+				      struct vm_area_struct *vma,
+				      enum pci_mmap_state mmap_state);
+extern void pci_adjust_legacy_attr(struct pci_bus *bus,
+				   enum pci_mmap_state mmap_type);
+#define HAVE_PCI_LEGACY	1
+
+extern int pci_create_resource_files(struct pci_dev *dev);
+extern void pci_remove_resource_files(struct pci_dev *dev);
+
 #endif /* __ALPHA_PCI_H */
diff --git a/arch/alpha/kernel/Makefile b/arch/alpha/kernel/Makefile
index b4697759a12..a427538252f 100644
--- a/arch/alpha/kernel/Makefile
+++ b/arch/alpha/kernel/Makefile
@@ -12,7 +12,7 @@ obj-y    := entry.o traps.o process.o init_task.o osf_sys.o irq.o \
 
 obj-$(CONFIG_VGA_HOSE)	+= console.o
 obj-$(CONFIG_SMP)	+= smp.o
-obj-$(CONFIG_PCI)	+= pci.o pci_iommu.o
+obj-$(CONFIG_PCI)	+= pci.o pci_iommu.o pci-sysfs.o
 obj-$(CONFIG_SRM_ENV)	+= srm_env.o
 obj-$(CONFIG_MODULES)	+= module.o
 
diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c
new file mode 100644
index 00000000000..6ea822e7f72
--- /dev/null
+++ b/arch/alpha/kernel/pci-sysfs.c
@@ -0,0 +1,366 @@
+/*
+ * arch/alpha/kernel/pci-sysfs.c
+ *
+ * Copyright (C) 2009 Ivan Kokshaysky
+ *
+ * Alpha PCI resource files.
+ *
+ * Loosely based on generic HAVE_PCI_MMAP implementation in
+ * drivers/pci/pci-sysfs.c
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+
+static int hose_mmap_page_range(struct pci_controller *hose,
+				struct vm_area_struct *vma,
+				enum pci_mmap_state mmap_type, int sparse)
+{
+	unsigned long base;
+
+	if (mmap_type == pci_mmap_mem)
+		base = sparse ? hose->sparse_mem_base : hose->dense_mem_base;
+	else
+		base = sparse ? hose->sparse_io_base : hose->dense_io_base;
+
+	vma->vm_pgoff += base >> PAGE_SHIFT;
+	vma->vm_flags |= (VM_IO | VM_RESERVED);
+
+	return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+				  vma->vm_end - vma->vm_start,
+				  vma->vm_page_prot);
+}
+
+static int __pci_mmap_fits(struct pci_dev *pdev, int num,
+			   struct vm_area_struct *vma, int sparse)
+{
+	unsigned long nr, start, size;
+	int shift = sparse ? 5 : 0;
+
+	nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	start = vma->vm_pgoff;
+	size = ((pci_resource_len(pdev, num) - 1) >> (PAGE_SHIFT - shift)) + 1;
+
+	if (start < size && size - start >= nr)
+		return 1;
+	WARN(1, "process \"%s\" tried to map%s 0x%08lx-0x%08lx on %s BAR %d "
+		"(size 0x%08lx)\n",
+		current->comm, sparse ? " sparse" : "", start, start + nr,
+		pci_name(pdev), num, size);
+	return 0;
+}
+
+/**
+ * pci_mmap_resource - map a PCI resource into user memory space
+ * @kobj: kobject for mapping
+ * @attr: struct bin_attribute for the file being mapped
+ * @vma: struct vm_area_struct passed into the mmap
+ * @sparse: address space type
+ *
+ * Use the bus mapping routines to map a PCI resource into userspace.
+ */
+static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr,
+			     struct vm_area_struct *vma, int sparse)
+{
+	struct pci_dev *pdev = to_pci_dev(container_of(kobj,
+						       struct device, kobj));
+	struct resource *res = (struct resource *)attr->private;
+	enum pci_mmap_state mmap_type;
+	struct pci_bus_region bar;
+	int i;
+
+	for (i = 0; i < PCI_ROM_RESOURCE; i++)
+		if (res == &pdev->resource[i])
+			break;
+	if (i >= PCI_ROM_RESOURCE)
+		return -ENODEV;
+
+	if (!__pci_mmap_fits(pdev, i, vma, sparse))
+		return -EINVAL;
+
+	if (iomem_is_exclusive(res->start))
+		return -EINVAL;
+
+	pcibios_resource_to_bus(pdev, &bar, res);
+	vma->vm_pgoff += bar.start >> (PAGE_SHIFT - (sparse ? 5 : 0));
+	mmap_type = res->flags & IORESOURCE_MEM ? pci_mmap_mem : pci_mmap_io;
+
+	return hose_mmap_page_range(pdev->sysdata, vma, mmap_type, sparse);
+}
+
+static int pci_mmap_resource_sparse(struct kobject *kobj,
+				    struct bin_attribute *attr,
+				    struct vm_area_struct *vma)
+{
+	return pci_mmap_resource(kobj, attr, vma, 1);
+}
+
+static int pci_mmap_resource_dense(struct kobject *kobj,
+				   struct bin_attribute *attr,
+				   struct vm_area_struct *vma)
+{
+	return pci_mmap_resource(kobj, attr, vma, 0);
+}
+
+/**
+ * pci_remove_resource_files - cleanup resource files
+ * @dev: dev to cleanup
+ *
+ * If we created resource files for @dev, remove them from sysfs and
+ * free their resources.
+ */
+void pci_remove_resource_files(struct pci_dev *pdev)
+{
+	int i;
+
+	for (i = 0; i < PCI_ROM_RESOURCE; i++) {
+		struct bin_attribute *res_attr;
+
+		res_attr = pdev->res_attr[i];
+		if (res_attr) {
+			sysfs_remove_bin_file(&pdev->dev.kobj, res_attr);
+			kfree(res_attr);
+		}
+
+		res_attr = pdev->res_attr_wc[i];
+		if (res_attr) {
+			sysfs_remove_bin_file(&pdev->dev.kobj, res_attr);
+			kfree(res_attr);
+		}
+	}
+}
+
+static int sparse_mem_mmap_fits(struct pci_dev *pdev, int num)
+{
+	struct pci_bus_region bar;
+	struct pci_controller *hose = pdev->sysdata;
+	long dense_offset;
+	unsigned long sparse_size;
+
+	pcibios_resource_to_bus(pdev, &bar, &pdev->resource[num]);
+
+	/* All core logic chips have 4G sparse address space, except
+	   CIA which has 16G (see xxx_SPARSE_MEM and xxx_DENSE_MEM
+	   definitions in asm/core_xxx.h files). This corresponds
+	   to 128M or 512M of the bus space. */
+	dense_offset = (long)(hose->dense_mem_base - hose->sparse_mem_base);
+	sparse_size = dense_offset >= 0x400000000UL ? 0x20000000 : 0x8000000;
+
+	return bar.end < sparse_size;
+}
+
+static int pci_create_one_attr(struct pci_dev *pdev, int num, char *name,
+			       char *suffix, struct bin_attribute *res_attr,
+			       unsigned long sparse)
+{
+	size_t size = pci_resource_len(pdev, num);
+
+	sprintf(name, "resource%d%s", num, suffix);
+	res_attr->mmap = sparse ? pci_mmap_resource_sparse :
+				  pci_mmap_resource_dense;
+	res_attr->attr.name = name;
+	res_attr->attr.mode = S_IRUSR | S_IWUSR;
+	res_attr->size = sparse ? size << 5 : size;
+	res_attr->private = &pdev->resource[num];
+	return sysfs_create_bin_file(&pdev->dev.kobj, res_attr);
+}
+
+static int pci_create_attr(struct pci_dev *pdev, int num)
+{
+	/* allocate attribute structure, piggyback attribute name */
+	int retval, nlen1, nlen2 = 0, res_count = 1;
+	unsigned long sparse_base, dense_base;
+	struct bin_attribute *attr;
+	struct pci_controller *hose = pdev->sysdata;
+	char *suffix, *attr_name;
+
+	suffix = "";	/* Assume bwx machine, normal resourceN files. */
+	nlen1 = 10;
+
+	if (pdev->resource[num].flags & IORESOURCE_MEM) {
+		sparse_base = hose->sparse_mem_base;
+		dense_base = hose->dense_mem_base;
+		if (sparse_base && !sparse_mem_mmap_fits(pdev, num)) {
+			sparse_base = 0;
+			suffix = "_dense";
+			nlen1 = 16;	/* resourceN_dense */
+		}
+	} else {
+		sparse_base = hose->sparse_io_base;
+		dense_base = hose->dense_io_base;
+	}
+
+	if (sparse_base) {
+		suffix = "_sparse";
+		nlen1 = 17;
+		if (dense_base) {
+			nlen2 = 16;	/* resourceN_dense */
+			res_count = 2;
+		}
+	}
+
+	attr = kzalloc(sizeof(*attr) * res_count + nlen1 + nlen2, GFP_ATOMIC);
+	if (!attr)
+		return -ENOMEM;
+
+	/* Create bwx, sparse or single dense file */
+	attr_name = (char *)(attr + res_count);
+	pdev->res_attr[num] = attr;
+	retval = pci_create_one_attr(pdev, num, attr_name, suffix, attr,
+				     sparse_base);
+	if (retval || res_count == 1)
+		return retval;
+
+	/* Create dense file */
+	attr_name += nlen1;
+	attr++;
+	pdev->res_attr_wc[num] = attr;
+	return pci_create_one_attr(pdev, num, attr_name, "_dense", attr, 0);
+}
+
+/**
+ * pci_create_resource_files - create resource files in sysfs for @dev
+ * @dev: dev in question
+ *
+ * Walk the resources in @dev creating files for each resource available.
+ */
+int pci_create_resource_files(struct pci_dev *pdev)
+{
+	int i;
+	int retval;
+
+	/* Expose the PCI resources from this device as files */
+	for (i = 0; i < PCI_ROM_RESOURCE; i++) {
+
+		/* skip empty resources */
+		if (!pci_resource_len(pdev, i))
+			continue;
+
+		retval = pci_create_attr(pdev, i);
+		if (retval) {
+			pci_remove_resource_files(pdev);
+			return retval;
+		}
+	}
+	return 0;
+}
+
+/* Legacy I/O bus mapping stuff. */
+
+static int __legacy_mmap_fits(struct pci_controller *hose,
+			      struct vm_area_struct *vma,
+			      unsigned long res_size, int sparse)
+{
+	unsigned long nr, start, size;
+
+	nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	start = vma->vm_pgoff;
+	size = ((res_size - 1) >> PAGE_SHIFT) + 1;
+
+	if (start < size && size - start >= nr)
+		return 1;
+	WARN(1, "process \"%s\" tried to map%s 0x%08lx-0x%08lx on hose %d "
+		"(size 0x%08lx)\n",
+		current->comm, sparse ? " sparse" : "", start, start + nr,
+		hose->index, size);
+	return 0;
+}
+
+static inline int has_sparse(struct pci_controller *hose,
+			     enum pci_mmap_state mmap_type)
+{
+	unsigned long base;
+
+	base = (mmap_type == pci_mmap_mem) ? hose->sparse_mem_base :
+					     hose->sparse_io_base;
+
+	return base != 0;
+}
+
+int pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma,
+			       enum pci_mmap_state mmap_type)
+{
+	struct pci_controller *hose = bus->sysdata;
+	int sparse = has_sparse(hose, mmap_type);
+	unsigned long res_size;
+
+	res_size = (mmap_type == pci_mmap_mem) ? bus->legacy_mem->size :
+						 bus->legacy_io->size;
+	if (!__legacy_mmap_fits(hose, vma, res_size, sparse))
+		return -EINVAL;
+
+	return hose_mmap_page_range(hose, vma, mmap_type, sparse);
+}
+
+/**
+ * pci_adjust_legacy_attr - adjustment of legacy file attributes
+ * @b: bus to create files under
+ * @mmap_type: I/O port or memory
+ *
+ * Adjust file name and size for sparse mappings.
+ */
+void pci_adjust_legacy_attr(struct pci_bus *bus, enum pci_mmap_state mmap_type)
+{
+	struct pci_controller *hose = bus->sysdata;
+
+	if (!has_sparse(hose, mmap_type))
+		return;
+
+	if (mmap_type == pci_mmap_mem) {
+		bus->legacy_mem->attr.name = "legacy_mem_sparse";
+		bus->legacy_mem->size <<= 5;
+	} else {
+		bus->legacy_io->attr.name = "legacy_io_sparse";
+		bus->legacy_io->size <<= 5;
+	}
+	return;
+}
+
+/* Legacy I/O bus read/write functions */
+int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, size_t size)
+{
+	struct pci_controller *hose = bus->sysdata;
+
+	port += hose->io_space->start;
+
+	switch(size) {
+	case 1:
+		*((u8 *)val) = inb(port);
+		return 1;
+	case 2:
+		if (port & 1)
+			return -EINVAL;
+		*((u16 *)val) = inw(port);
+		return 2;
+	case 4:
+		if (port & 3)
+			return -EINVAL;
+		*((u32 *)val) = inl(port);
+		return 4;
+	}
+	return -EINVAL;
+}
+
+int pci_legacy_write(struct pci_bus *bus, loff_t port, u32 val, size_t size)
+{
+	struct pci_controller *hose = bus->sysdata;
+
+	port += hose->io_space->start;
+
+	switch(size) {
+	case 1:
+		outb(port, val);
+		return 1;
+	case 2:
+		if (port & 1)
+			return -EINVAL;
+		outw(port, val);
+		return 2;
+	case 4:
+		if (port & 3)
+			return -EINVAL;
+		outl(port, val);
+		return 4;
+	}
+	return -EINVAL;
+}
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index dfc4e0ddf24..1c892980140 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -492,6 +492,19 @@ pci_mmap_legacy_io(struct kobject *kobj, struct bin_attribute *attr,
         return pci_mmap_legacy_page_range(bus, vma, pci_mmap_io);
 }
 
+/**
+ * pci_adjust_legacy_attr - adjustment of legacy file attributes
+ * @b: bus to create files under
+ * @mmap_type: I/O port or memory
+ *
+ * Stub implementation. Can be overridden by arch if necessary.
+ */
+void __weak
+pci_adjust_legacy_attr(struct pci_bus *b, enum pci_mmap_state mmap_type)
+{
+	return;
+}
+
 /**
  * pci_create_legacy_files - create legacy I/O port and memory files
  * @b: bus to create files under
@@ -518,6 +531,7 @@ void pci_create_legacy_files(struct pci_bus *b)
 	b->legacy_io->read = pci_read_legacy_io;
 	b->legacy_io->write = pci_write_legacy_io;
 	b->legacy_io->mmap = pci_mmap_legacy_io;
+	pci_adjust_legacy_attr(b, pci_mmap_io);
 	error = device_create_bin_file(&b->dev, b->legacy_io);
 	if (error)
 		goto legacy_io_err;
@@ -528,6 +542,7 @@ void pci_create_legacy_files(struct pci_bus *b)
 	b->legacy_mem->size = 1024*1024;
 	b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR;
 	b->legacy_mem->mmap = pci_mmap_legacy_mem;
+	pci_adjust_legacy_attr(b, pci_mmap_mem);
 	error = device_create_bin_file(&b->dev, b->legacy_mem);
 	if (error)
 		goto legacy_mem_err;
@@ -719,8 +734,8 @@ static int pci_create_resource_files(struct pci_dev *pdev)
 	return 0;
 }
 #else /* !HAVE_PCI_MMAP */
-static inline int pci_create_resource_files(struct pci_dev *dev) { return 0; }
-static inline void pci_remove_resource_files(struct pci_dev *dev) { return; }
+int __weak pci_create_resource_files(struct pci_dev *dev) { return 0; }
+void __weak pci_remove_resource_files(struct pci_dev *dev) { return; }
 #endif /* HAVE_PCI_MMAP */
 
 /**
-- 
cgit v1.2.3


From ae40582e9959cdb7bfe4b918be8e3d19f9511798 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 20 Feb 2009 20:16:07 -0800
Subject: PCI: pcie_portdriver: fix pcie_port_device_remove

pcie_port_device_remove currently calls the remove method of port
drivers twice.  Ouch!

We are calling device_for_each_child multiple times for no apparent
reason.

So make it simple. Place put_device and device_unregister into
remove_iter, and throw out the rest.  Only call device_for_each_child
once.

The code is simpler and actually works!

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pcie/portdrv_core.c | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 3aea92a9292..569af0015fc 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -456,16 +456,9 @@ int pcie_port_device_resume(struct pci_dev *dev)
 
 static int remove_iter(struct device *dev, void *data)
 {
-	struct pcie_port_service_driver *service_driver;
-
 	if (dev->bus == &pcie_port_bus_type) {
-		if (dev->driver) {
-			service_driver = to_service_driver(dev->driver);
-			if (service_driver->remove)
-				service_driver->remove(to_pcie_device(dev));
-		}
-		*(unsigned long*)data = (unsigned long)dev;
-		return 1;
+		put_device(dev);
+		device_unregister(dev);
 	}
 	return 0;
 }
@@ -480,18 +473,8 @@ static int remove_iter(struct device *dev, void *data)
 void pcie_port_device_remove(struct pci_dev *dev)
 {
 	struct pcie_port_data *port_data = pci_get_drvdata(dev);
-	int status;
-
-	do {
-		unsigned long device_addr;
 
-		status = device_for_each_child(&dev->dev, &device_addr, remove_iter);
-		if (status) {
-			struct device *device = (struct device*)device_addr;
-			put_device(device);
-			device_unregister(device);
-		}
-	} while (status);
+	device_for_each_child(&dev->dev, NULL, remove_iter);
 
 	switch (port_data->port_irq_mode) {
 	case PCIE_PORT_MSIX_MODE:
-- 
cgit v1.2.3


From 3a3c244c9a355105bc193fde873c73727bf87192 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 15 Feb 2009 22:32:48 +0100
Subject: PCI: PCIe portdrv: Implement pm object

Implement pm object for the PCI Express port driver in order to use
the new power management framework and reduce the code size.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/pciehp_core.c |  4 ++--
 drivers/pci/pcie/aer/aerdrv.c     |  6 ------
 drivers/pci/pcie/portdrv.h        |  4 ++--
 drivers/pci/pcie/portdrv_core.c   | 14 ++++++--------
 drivers/pci/pcie/portdrv_pci.c    | 31 +++++++++++++++----------------
 include/linux/pcieport_if.h       |  2 +-
 6 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
index 3d21bbba330..fb254b2454d 100644
--- a/drivers/pci/hotplug/pciehp_core.c
+++ b/drivers/pci/hotplug/pciehp_core.c
@@ -475,7 +475,7 @@ static void pciehp_remove (struct pcie_device *dev)
 }
 
 #ifdef CONFIG_PM
-static int pciehp_suspend (struct pcie_device *dev, pm_message_t state)
+static int pciehp_suspend (struct pcie_device *dev)
 {
 	dev_info(&dev->device, "%s ENTRY\n", __func__);
 	return 0;
@@ -503,7 +503,7 @@ static int pciehp_resume (struct pcie_device *dev)
 	}
 	return 0;
 }
-#endif
+#endif /* PM */
 
 static struct pcie_port_service_driver hpdriver_portdrv = {
 	.name		= PCIE_MODULE_NAME,
diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index e11c0319406..32ade5af927 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -40,9 +40,6 @@ MODULE_LICENSE("GPL");
 
 static int __devinit aer_probe (struct pcie_device *dev);
 static void aer_remove(struct pcie_device *dev);
-static int aer_suspend(struct pcie_device *dev, pm_message_t state)
-{return 0;}
-static int aer_resume(struct pcie_device *dev) {return 0;}
 static pci_ers_result_t aer_error_detected(struct pci_dev *dev,
 	enum pci_channel_state error);
 static void aer_error_resume(struct pci_dev *dev);
@@ -61,9 +58,6 @@ static struct pcie_port_service_driver aerdriver = {
 	.probe		= aer_probe,
 	.remove		= aer_remove,
 
-	.suspend	= aer_suspend,
-	.resume		= aer_resume,
-
 	.err_handler	= &aer_error_handlers,
 
 	.reset_link	= aer_root_reset,
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 5b818bd835e..17ad53868f9 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -38,8 +38,8 @@ extern struct bus_type pcie_port_bus_type;
 extern int pcie_port_device_probe(struct pci_dev *dev);
 extern int pcie_port_device_register(struct pci_dev *dev);
 #ifdef CONFIG_PM
-extern int pcie_port_device_suspend(struct pci_dev *dev, pm_message_t state);
-extern int pcie_port_device_resume(struct pci_dev *dev);
+extern int pcie_port_device_suspend(struct device *dev);
+extern int pcie_port_device_resume(struct device *dev);
 #endif
 extern void pcie_port_device_remove(struct pci_dev *dev);
 extern int __must_check pcie_port_bus_register(void);
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 569af0015fc..5a5bfe7cdf5 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -410,13 +410,12 @@ int pcie_port_device_register(struct pci_dev *dev)
 static int suspend_iter(struct device *dev, void *data)
 {
 	struct pcie_port_service_driver *service_driver;
-	pm_message_t state = * (pm_message_t *) data;
 
  	if ((dev->bus == &pcie_port_bus_type) &&
  	    (dev->driver)) {
  		service_driver = to_service_driver(dev->driver);
  		if (service_driver->suspend)
- 			service_driver->suspend(to_pcie_device(dev), state);
+ 			service_driver->suspend(to_pcie_device(dev));
   	}
 	return 0;
 }
@@ -424,11 +423,10 @@ static int suspend_iter(struct device *dev, void *data)
 /**
  * pcie_port_device_suspend - suspend port services associated with a PCIe port
  * @dev: PCI Express port to handle
- * @state: Representation of system power management transition in progress
  */
-int pcie_port_device_suspend(struct pci_dev *dev, pm_message_t state)
+int pcie_port_device_suspend(struct device *dev)
 {
-	return device_for_each_child(&dev->dev, &state, suspend_iter);
+	return device_for_each_child(dev, NULL, suspend_iter);
 }
 
 static int resume_iter(struct device *dev, void *data)
@@ -448,11 +446,11 @@ static int resume_iter(struct device *dev, void *data)
  * pcie_port_device_suspend - resume port services associated with a PCIe port
  * @dev: PCI Express port to handle
  */
-int pcie_port_device_resume(struct pci_dev *dev)
+int pcie_port_device_resume(struct device *dev)
 {
-	return device_for_each_child(&dev->dev, NULL, resume_iter);
+	return device_for_each_child(dev, NULL, resume_iter);
 }
-#endif
+#endif /* PM */
 
 static int remove_iter(struct device *dev, void *data)
 {
diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 94d0e2af9ba..a61f4930d67 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -44,21 +44,21 @@ static int pcie_portdrv_restore_config(struct pci_dev *dev)
 }
 
 #ifdef CONFIG_PM
-static int pcie_portdrv_suspend(struct pci_dev *dev, pm_message_t state)
-{
-	return pcie_port_device_suspend(dev, state);
+static struct dev_pm_ops pcie_portdrv_pm_ops = {
+	.suspend	= pcie_port_device_suspend,
+	.resume		= pcie_port_device_resume,
+	.freeze		= pcie_port_device_suspend,
+	.thaw		= pcie_port_device_resume,
+	.poweroff	= pcie_port_device_suspend,
+	.restore	= pcie_port_device_resume,
+};
 
-}
+#define PCIE_PORTDRV_PM_OPS	(&pcie_portdrv_pm_ops)
 
-static int pcie_portdrv_resume(struct pci_dev *dev)
-{
-	pci_set_master(dev);
-	return pcie_port_device_resume(dev);
-}
-#else
-#define pcie_portdrv_suspend NULL
-#define pcie_portdrv_resume NULL
-#endif
+#else /* !PM */
+
+#define PCIE_PORTDRV_PM_OPS	NULL
+#endif /* !PM */
 
 /*
  * pcie_portdrv_probe - Probe PCI-Express port devices
@@ -268,10 +268,9 @@ static struct pci_driver pcie_portdriver = {
 	.probe		= pcie_portdrv_probe,
 	.remove		= pcie_portdrv_remove,
 
-	.suspend	= pcie_portdrv_suspend,
-	.resume		= pcie_portdrv_resume,
-
 	.err_handler 	= &pcie_portdrv_err_handler,
+
+	.driver.pm 	= PCIE_PORTDRV_PM_OPS,
 };
 
 static int __init pcie_portdrv_init(void)
diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h
index 5d2afcfa6bc..b4c79545330 100644
--- a/include/linux/pcieport_if.h
+++ b/include/linux/pcieport_if.h
@@ -59,7 +59,7 @@ struct pcie_port_service_driver {
 	const char *name;
 	int (*probe) (struct pcie_device *dev);
 	void (*remove) (struct pcie_device *dev);
-	int (*suspend) (struct pcie_device *dev, pm_message_t state);
+	int (*suspend) (struct pcie_device *dev);
 	int (*resume) (struct pcie_device *dev);
 
 	/* Service Error Recovery Handler */
-- 
cgit v1.2.3


From 0747aaf42d78d26684c6f6b34a4103ff81f571f8 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 17 Feb 2009 14:11:56 +0900
Subject: PCI/ACPI: fix wrong assumption in acpi_pci_get_bridge_handle

Current acpi_pci_get_bridge_handle() has an assumption that
pci_bus->self is NULL on the root pci bus. But it might not true on
some platforms. Because of this wrong assumption, current
acpi_pci_get_bridge_handle() might return improper ACPI handle. We
must check pci_bus->parent instead.

This bug is the root cause of the following kernel panic reported by
James Bottomley. This problem was introduced by the commit
e8c331e963c58b83db24b7d0e39e8c07f687dbc6. The immediate cause was
acpi_pci_get_bridge_handle() returned NULL unexpectedly and it was
passed as the second argument of acpi_walk_namespace().

pci_hotplug: PCI Hot Plug PCI Core version: 0.5
acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5
BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
IP: [<ffffffff8039646f>] acpi_ns_get_next_node+0xb/0x3c
PGD 0
Oops: 0000 [#1] SMP
last sysfs file:
CPU 0
Modules linked in:
Pid: 1, comm: swapper Not tainted 2.6.28 #1
RIP: 0010:[<ffffffff8039646f>]  [<ffffffff8039646f>] acpi_ns_get_next_node+0xb/0x3c
RSP: 0018:ffff88007f87fd30  EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: 0000000000000000 R08: ffffffff8037d260 R09: ffff88007f87fdfc
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
FS:  0000000000000000(0000) GS:ffffffff80742040(0000) knlGS:0000000000000000
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000010 CR3: 0000000000201000 CR4: 00000000000006a0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 1, threadinfo ffff88007f87e000, task ffff88007f875040)
Stack:
 0000000000000000 ffffffff803964f5 ffff88007f81b728 0000000000001001
 ffff88007f87fdfc ffffffff8037d260 0000000600000001 0000000000000000
 ffffffff8037d260 0000000000000000 0000000000000001 ffff88007f87fdfc
Call Trace:
 [<ffffffff803964f5>] acpi_ns_walk_namespace+0x55/0x138
 [<ffffffff8037d260>] is_pci_dock_device+0x0/0x20
 [<ffffffff8037d260>] is_pci_dock_device+0x0/0x20
 [<ffffffff80394a9e>] acpi_walk_namespace+0x5f/0x83
 [<ffffffff8037dd33>] detect_ejectable_slots+0x53/0x70
 [<ffffffff8037de38>] add_bridge+0xe8/0x200
 [<ffffffff80394aaa>] acpi_walk_namespace+0x6b/0x83
 [<ffffffff803a4ad1>] acpi_pci_register_driver+0x48/0x61
 [<ffffffff806fc5df>] acpiphp_init+0x0/0x58
 [<ffffffff806fc732>] acpiphp_glue_init+0x4c/0x5a
 [<ffffffff806fc616>] acpiphp_init+0x37/0x58
 [<ffffffff8020903b>] _stext+0x3b/0x180
 [<ffffffff80312598>] create_proc_entry+0x58/0xa0
 [<ffffffff802815d1>] register_irq_proc+0xc1/0xe0
 [<ffffffff806db64b>] kernel_init+0x152/0x1ac
 [<ffffffff8023d970>] finish_task_switch+0x0/0x110
 [<ffffffff8020ca7a>] child_rip+0xa/0x20
 [<ffffffff8020c47c>] restore_args+0x0/0x30
 [<ffffffff806db4f9>] kernel_init+0x0/0x1ac
 [<ffffffff8020ca70>] child_rip+0x0/0x20
Code: 89 c2 48 8b 00 48 85 c0 75 f5 48 8b 45 00 48 89 02 44 88 65 09 48 89 5d 00 31 c0 5b 5d 41 5c c3 53 48 85 d2 89 fb 48 89 d7 75 06 <48> 8b 56 10 eb 08 e8 73 f1 ff ff 48 89 c2 85 db 74 1a eb 13 0f
RIP  [<ffffffff8039646f>] acpi_ns_get_next_node+0xb/0x3c
 RSP <ffff88007f87fd30>
CR2: 0000000000000010
---[ end trace a7919e7f17c0a725 ]---

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci-acpi.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index 20480b9f10c..3cee2367459 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -23,11 +23,10 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev)
 
 static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus)
 {
-	int seg = pci_domain_nr(pbus), busnr = pbus->number;
-	struct pci_dev *bridge = pbus->self;
-	if (bridge)
-		return DEVICE_ACPI_HANDLE(&(bridge->dev));
-	return acpi_get_pci_rootbridge_handle(seg, busnr);
+	if (pbus->parent)
+		return DEVICE_ACPI_HANDLE(&(pbus->self->dev));
+	return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus),
+					      pbus->number);
 }
 #else
 static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev)
-- 
cgit v1.2.3


From d18690af626b83fef1d1953b9f70e09497060586 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 17 Feb 2009 14:12:36 +0900
Subject: PCI/ACPI: fix wrong assumption in acpi_find_root_bridge_handle

Current acpi_find_root_bridge_handle() has a assumption that
pci_bus->self is NULL on the root pci bus. But it might not be true on
some platforms. Because of this wrong assumption, current
acpi_find_root_bridge_handle() might cause endless loop. We must check
pci_bus->parent instead.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci-acpi.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index 3cee2367459..092e82e0048 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -13,12 +13,12 @@
 #ifdef CONFIG_ACPI
 static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev)
 {
-	/* Find root host bridge */
-	while (pdev->bus->self)
-		pdev = pdev->bus->self;
-
-	return acpi_get_pci_rootbridge_handle(pci_domain_nr(pdev->bus),
-			pdev->bus->number);
+	struct pci_bus *pbus = pdev->bus;
+	/* Find a PCI root bus */
+	while (pbus->parent)
+		pbus = pbus->parent;
+	return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus),
+					      pbus->number);
 }
 
 static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus)
-- 
cgit v1.2.3


From 267efd7eec5eca62f32f8c9bc1721b578d5da963 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 17 Feb 2009 14:13:20 +0900
Subject: PCI hotplug: fix wrong assumption in acpi_get_hp_params_from_firmware

Current acpi_get_hp_params_from_firmware() has a assumption that
pci_bus->self is NULL on the root pci bus. But it might not true on
some platforms. Because of this wrong assumption, current
acpi_get_hp_params_from_firmware() might cause endless loop. We must
check pci_bus->parent instead.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/acpi_pcihp.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c
index f47bc74be56..09a84402986 100644
--- a/drivers/pci/hotplug/acpi_pcihp.c
+++ b/drivers/pci/hotplug/acpi_pcihp.c
@@ -332,19 +332,14 @@ acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus,
 {
 	acpi_status status = AE_NOT_FOUND;
 	acpi_handle handle, phandle;
-	struct pci_bus *pbus = bus;
-	struct pci_dev *pdev;
+	struct pci_bus *pbus;
 
-	do {
-		pdev = pbus->self;
-		if (!pdev) {
-			handle = acpi_get_pci_rootbridge_handle(
-				pci_domain_nr(pbus), pbus->number);
+	handle = NULL;
+	for (pbus = bus; pbus; pbus = pbus->parent) {
+		handle = acpi_pci_get_bridge_handle(pbus);
+		if (handle)
 			break;
-		}
-		handle = DEVICE_ACPI_HANDLE(&(pdev->dev));
-		pbus = pbus->parent;
-	} while (!handle);
+	}
 
 	/*
 	 * _HPP settings apply to all child buses, until another _HPP is
-- 
cgit v1.2.3


From d391f00f0e7fb6d883c6724b31a1799e19a584c5 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 17 Feb 2009 14:13:59 +0900
Subject: PCI hotplug: fix wrong assumption in
 acpi_get_hp_hw_control_from_firmware

Current acpi_get_hp_hw_control_from_firmware() has a assumption that
pci_bus->self is NULL on a PCI root bus. But it might not be true on
some platforms. Because of this wrong assumption, current
acpi_get_hp_hw_control_from_firmware() might cause endless loop. We
must check pci_bus->parent instead.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/acpi_pcihp.c | 34 ++++++++++++----------------------
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c
index 09a84402986..fbc63d5e459 100644
--- a/drivers/pci/hotplug/acpi_pcihp.c
+++ b/drivers/pci/hotplug/acpi_pcihp.c
@@ -372,12 +372,10 @@ EXPORT_SYMBOL_GPL(acpi_get_hp_params_from_firmware);
  *
  * Attempt to take hotplug control from firmware.
  */
-int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags)
+int acpi_get_hp_hw_control_from_firmware(struct pci_dev *pdev, u32 flags)
 {
 	acpi_status status;
 	acpi_handle chandle, handle;
-	struct pci_dev *pdev = dev;
-	struct pci_bus *parent;
 	struct acpi_buffer string = { ACPI_ALLOCATE_BUFFER, NULL };
 
 	flags &= (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL |
@@ -409,26 +407,18 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags)
 		string = (struct acpi_buffer){ ACPI_ALLOCATE_BUFFER, NULL };
 	}
 
-	pdev = dev;
-	handle = DEVICE_ACPI_HANDLE(&dev->dev);
-	while (!handle) {
+	handle = DEVICE_ACPI_HANDLE(&pdev->dev);
+	if (!handle) {
 		/*
 		 * This hotplug controller was not listed in the ACPI name
 		 * space at all. Try to get acpi handle of parent pci bus.
 		 */
-		if (!pdev || !pdev->bus->parent)
-			break;
-		parent = pdev->bus->parent;
-		dbg("Could not find %s in acpi namespace, trying parent\n",
-		    pci_name(pdev));
-		if (!parent->self)
-			/* Parent must be a host bridge */
-			handle = acpi_get_pci_rootbridge_handle(
-					pci_domain_nr(parent),
-					parent->number);
-		else
-			handle = DEVICE_ACPI_HANDLE(&(parent->self->dev));
-		pdev = parent->self;
+		struct pci_bus *pbus;
+		for (pbus = pdev->bus; pbus; pbus = pbus->parent) {
+			handle = acpi_pci_get_bridge_handle(pbus);
+			if (handle)
+				break;
+		}
 	}
 
 	while (handle) {
@@ -447,13 +437,13 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags)
 	}
 
 	dbg("Cannot get control of hotplug hardware for pci %s\n",
-	    pci_name(dev));
+	    pci_name(pdev));
 
 	kfree(string.pointer);
 	return -ENODEV;
 got_one:
-	dbg("Gained control for hotplug HW for pci %s (%s)\n", pci_name(dev),
-			(char *)string.pointer);
+	dbg("Gained control for hotplug HW for pci %s (%s)\n",
+	    pci_name(pdev), (char *)string.pointer);
 	kfree(string.pointer);
 	return 0;
 }
-- 
cgit v1.2.3


From 151ab36a2ea0b3181d103f7244636e0d16e685de Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 17 Feb 2009 14:14:36 +0900
Subject: PCI: fix wrong assumption in pci_find_upstream_pcie_bridge

Current pci_find_upstream_pcie_bridge() has a wrong assumption that
pci_bus->self is NULL on the root pci bus. But it might not true on
some platforms. Because of this wrong assumption, current
pci_find_upstream_pcie_bridge() might cause endless loop. We must
check pci_bus->parent instead.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/search.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/search.c b/drivers/pci/search.c
index 5af8bd53814..710d4ea6956 100644
--- a/drivers/pci/search.c
+++ b/drivers/pci/search.c
@@ -29,7 +29,7 @@ pci_find_upstream_pcie_bridge(struct pci_dev *pdev)
 	if (pdev->is_pcie)
 		return NULL;
 	while (1) {
-		if (!pdev->bus->self)
+		if (!pdev->bus->parent)
 			break;
 		pdev = pdev->bus->self;
 		/* a p2p bridge */
-- 
cgit v1.2.3


From f92d4e29d785f1d4217dee7f1ae6ff7140547ed5 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 17 Feb 2009 14:15:16 +0900
Subject: PCI: fix wrong assumption in pci_read_bridge_bases

Current pci_read_bridge_bases() has an assumption that pci_bus->self
is NULL on the pci root bus (It checks pci_bus->self to see if the pci
bus is root bus). But is might not true on some platforms. We must
check pci_bus->parent instead.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/probe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 55ec44a27e8..23362e8c696 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -287,7 +287,7 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child)
 	struct resource *res;
 	int i;
 
-	if (!dev)		/* It's a host bus, nothing to read */
+	if (!child->parent)	/* It's a host bus, nothing to read */
 		return;
 
 	if (dev->transparent) {
-- 
cgit v1.2.3


From c2a3072e010943ac749794622f26b3ef54de25be Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 17 Feb 2009 14:15:45 +0900
Subject: PCI: fix wrong assumption in pci_get_interrupt_pin

Current pci_get_interrupt_pin() seems to have an assumption that
pci_bus->self is NULL on the root pci bus. But it might not be true on
some platforms. Because of this wrong assumption, current
pci_get_interrupt_pin() might cause endless loop. We must check
pci_bus->parent instead.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 0b3e20f1b6f..0cfed9e28ea 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1484,7 +1484,7 @@ pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge)
 	if (!pin)
 		return -1;
 
-	while (dev->bus->self) {
+	while (dev->bus->parent) {
 		pin = pci_swizzle_interrupt_pin(dev, pin);
 		dev = dev->bus->self;
 	}
-- 
cgit v1.2.3


From c74d724462d1845535667f4d3f720e02e3432e53 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 17 Feb 2009 14:16:13 +0900
Subject: PCI: fix wrong assumption in pci_common_swizzle

Current pci_common_swizzle() seems to have a assumption that
pci_bus->self is NULL on the pci root bus. But it might not be true on
some platforms. Because of this wrong assumption, pci_common_swizzle()
might cause endless loop. We must check pci_bus->parent instead.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 0cfed9e28ea..8310dc2f943 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1504,7 +1504,7 @@ u8 pci_common_swizzle(struct pci_dev *dev, u8 *pinp)
 {
 	u8 pin = *pinp;
 
-	while (dev->bus->self) {
+	while (dev->bus->parent) {
 		pin = pci_swizzle_interrupt_pin(dev, pin);
 		dev = dev->bus->self;
 	}
-- 
cgit v1.2.3


From 998dd7c719f62dcfa91d7bf7f4eb9c160e03d817 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Wed, 25 Feb 2009 13:15:52 +0800
Subject: PCI: fix incorrect mask of PM No_Soft_Reset bit

Reviewed-by: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci_regs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index 027815b4635..b647a4df59f 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -235,7 +235,7 @@
 #define  PCI_PM_CAP_PME_SHIFT	11	/* Start of the PME Mask in PMC */
 #define PCI_PM_CTRL		4	/* PM control and status register */
 #define  PCI_PM_CTRL_STATE_MASK	0x0003	/* Current power state (D0 to D3) */
-#define  PCI_PM_CTRL_NO_SOFT_RESET	0x0004	/* No reset for D3hot->D0 */
+#define  PCI_PM_CTRL_NO_SOFT_RESET	0x0008	/* No reset for D3hot->D0 */
 #define  PCI_PM_CTRL_PME_ENABLE	0x0100	/* PME pin enable */
 #define  PCI_PM_CTRL_DATA_SEL_MASK	0x1e00	/* Data select (??) */
 #define  PCI_PM_CTRL_DATA_SCALE_MASK	0x6000	/* Data scale (??) */
-- 
cgit v1.2.3


From 13bf75766966e1bcc71fae536988caec312eef8f Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Tue, 24 Feb 2009 10:38:22 -0700
Subject: x86: use dev_printk in quirk message

This patch changes a VIA PCI quirk to use dev_info() rather than printk().

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgek.org>
---
 arch/x86/kernel/pci-dma.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index b2542853314..022833bb9ff 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -290,8 +290,7 @@ fs_initcall(pci_iommu_init);
 static __devinit void via_no_dac(struct pci_dev *dev)
 {
 	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
-		printk(KERN_INFO
-			"PCI: VIA PCI bridge detected. Disabling DAC.\n");
+		dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
 		forbid_dac = 1;
 	}
 }
-- 
cgit v1.2.3


From 0994375e9614f78657031e04e30019b9cdb62795 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Mon, 23 Feb 2009 21:52:23 -0800
Subject: PCI: add remove_id sysfs entry

This adds a remove_id sysfs entry to allow users of new_id to later
remove the added dynid.  One use case is management tools that want to
dynamically bind/unbind devices to pci-stub driver while devices are
assigned to KVM guests.  Rather than having to track which driver was
originally bound to the driver, a mangement tool can simply:

Guest uses device

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/ABI/testing/sysfs-bus-pci | 16 +++++++
 drivers/pci/pci-driver.c                | 80 ++++++++++++++++++++++++++++++++-
 2 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index e638e15a889..3d29793a0ea 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -41,6 +41,22 @@ Description:
 		for the device and attempt to bind to it.  For example:
 		# echo "8086 10f5" > /sys/bus/pci/drivers/foo/new_id
 
+What:		/sys/bus/pci/drivers/.../remove_id
+Date:		February 2009
+Contact:	Chris Wright <chrisw@sous-sol.org>
+Description:
+		Writing a device ID to this file will remove an ID
+		that was dynamically added via the new_id sysfs entry.
+		The format for the device ID is:
+		VVVV DDDD SVVV SDDD CCCC MMMM.	That is Vendor ID, Device
+		ID, Subsystem Vendor ID, Subsystem Device ID, Class,
+		and Class Mask.  The Vendor ID and Device ID fields are
+		required, the rest are optional.  After successfully
+		removing an ID, the driver will no longer support the
+		device.  This is useful to ensure auto probing won't
+		match the driver to the device.  For example:
+		# echo "8086 10f5" > /sys/bus/pci/drivers/foo/remove_id
+
 What:		/sys/bus/pci/devices/.../vpd
 Date:		February 2008
 Contact:	Ben Hutchings <bhutchings@solarflare.com>
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 93eac142358..87a5ddbb324 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -99,6 +99,52 @@ store_new_id(struct device_driver *driver, const char *buf, size_t count)
 }
 static DRIVER_ATTR(new_id, S_IWUSR, NULL, store_new_id);
 
+/**
+ * store_remove_id - remove a PCI device ID from this driver
+ * @driver: target device driver
+ * @buf: buffer for scanning device ID data
+ * @count: input size
+ *
+ * Removes a dynamic pci device ID to this driver.
+ */
+static ssize_t
+store_remove_id(struct device_driver *driver, const char *buf, size_t count)
+{
+	struct pci_dynid *dynid, *n;
+	struct pci_driver *pdrv = to_pci_driver(driver);
+	__u32 vendor, device, subvendor = PCI_ANY_ID,
+		subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
+	int fields = 0;
+	int retval = -ENODEV;
+
+	fields = sscanf(buf, "%x %x %x %x %x %x",
+			&vendor, &device, &subvendor, &subdevice,
+			&class, &class_mask);
+	if (fields < 2)
+		return -EINVAL;
+
+	spin_lock(&pdrv->dynids.lock);
+	list_for_each_entry_safe(dynid, n, &pdrv->dynids.list, node) {
+		struct pci_device_id *id = &dynid->id;
+		if ((id->vendor == vendor) &&
+		    (id->device == device) &&
+		    (subvendor == PCI_ANY_ID || id->subvendor == subvendor) &&
+		    (subdevice == PCI_ANY_ID || id->subdevice == subdevice) &&
+		    !((id->class ^ class) & class_mask)) {
+			list_del(&dynid->node);
+			kfree(dynid);
+			retval = 0;
+			break;
+		}
+	}
+	spin_unlock(&pdrv->dynids.lock);
+
+	if (retval)
+		return retval;
+	return count;
+}
+static DRIVER_ATTR(remove_id, S_IWUSR, NULL, store_remove_id);
+
 static void
 pci_free_dynids(struct pci_driver *drv)
 {
@@ -125,6 +171,20 @@ static void pci_remove_newid_file(struct pci_driver *drv)
 {
 	driver_remove_file(&drv->driver, &driver_attr_new_id);
 }
+
+static int
+pci_create_removeid_file(struct pci_driver *drv)
+{
+	int error = 0;
+	if (drv->probe != NULL)
+		error = driver_create_file(&drv->driver,&driver_attr_remove_id);
+	return error;
+}
+
+static void pci_remove_removeid_file(struct pci_driver *drv)
+{
+	driver_remove_file(&drv->driver, &driver_attr_remove_id);
+}
 #else /* !CONFIG_HOTPLUG */
 static inline void pci_free_dynids(struct pci_driver *drv) {}
 static inline int pci_create_newid_file(struct pci_driver *drv)
@@ -132,6 +192,11 @@ static inline int pci_create_newid_file(struct pci_driver *drv)
 	return 0;
 }
 static inline void pci_remove_newid_file(struct pci_driver *drv) {}
+static inline int pci_create_removeid_file(struct pci_driver *drv)
+{
+	return 0;
+}
+static inline void pci_remove_removeid_file(struct pci_driver *drv) {}
 #endif
 
 /**
@@ -852,13 +917,23 @@ int __pci_register_driver(struct pci_driver *drv, struct module *owner,
 	/* register with core */
 	error = driver_register(&drv->driver);
 	if (error)
-		return error;
+		goto out;
 
 	error = pci_create_newid_file(drv);
 	if (error)
-		driver_unregister(&drv->driver);
+		goto out_newid;
 
+	error = pci_create_removeid_file(drv);
+	if (error)
+		goto out_removeid;
+out:
 	return error;
+
+out_removeid:
+	pci_remove_newid_file(drv);
+out_newid:
+	driver_unregister(&drv->driver);
+	goto out;
 }
 
 /**
@@ -874,6 +949,7 @@ int __pci_register_driver(struct pci_driver *drv, struct module *owner,
 void
 pci_unregister_driver(struct pci_driver *drv)
 {
+	pci_remove_removeid_file(drv);
 	pci_remove_newid_file(drv);
 	driver_unregister(&drv->driver);
 	pci_free_dynids(drv);
-- 
cgit v1.2.3


From c41ade2ee1dc146d2de2ee470a87cd6b878a08f4 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 17 Mar 2009 08:54:05 -0400
Subject: Rewrite MSI-HOWTO

I didn't find the previous version very useful, so I rewrote it.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Reviewed-by: Randy Dunlap <randy.dunlap@oracle.com>
Reviewed-by: Grant Grundler <grundler@parisc-linunx.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/PCI/MSI-HOWTO.txt | 758 +++++++++++++++-------------------------
 1 file changed, 277 insertions(+), 481 deletions(-)

diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
index 256defd7e17..1c02431f1d1 100644
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -4,506 +4,302 @@
 	Revised Feb 12, 2004 by Martine Silbermann
 		email: Martine.Silbermann@hp.com
 	Revised Jun 25, 2004 by Tom L Nguyen
+	Revised Jul  9, 2008 by Matthew Wilcox <willy@linux.intel.com>
+		Copyright 2003, 2008 Intel Corporation
 
 1. About this guide
 
-This guide describes the basics of Message Signaled Interrupts (MSI),
-the advantages of using MSI over traditional interrupt mechanisms,
-and how to enable your driver to use MSI or MSI-X. Also included is
-a Frequently Asked Questions (FAQ) section.
-
-1.1 Terminology
-
-PCI devices can be single-function or multi-function.  In either case,
-when this text talks about enabling or disabling MSI on a "device
-function," it is referring to one specific PCI device and function and
-not to all functions on a PCI device (unless the PCI device has only
-one function).
-
-2. Copyright 2003 Intel Corporation
-
-3. What is MSI/MSI-X?
-
-Message Signaled Interrupt (MSI), as described in the PCI Local Bus
-Specification Revision 2.3 or later, is an optional feature, and a
-required feature for PCI Express devices. MSI enables a device function
-to request service by sending an Inbound Memory Write on its PCI bus to
-the FSB as a Message Signal Interrupt transaction. Because MSI is
-generated in the form of a Memory Write, all transaction conditions,
-such as a Retry, Master-Abort, Target-Abort or normal completion, are
-supported.
-
-A PCI device that supports MSI must also support pin IRQ assertion
-interrupt mechanism to provide backward compatibility for systems that
-do not support MSI. In systems which support MSI, the bus driver is
-responsible for initializing the message address and message data of
-the device function's MSI/MSI-X capability structure during device
-initial configuration.
-
-An MSI capable device function indicates MSI support by implementing
-the MSI/MSI-X capability structure in its PCI capability list. The
-device function may implement both the MSI capability structure and
-the MSI-X capability structure; however, the bus driver should not
-enable both.
-
-The MSI capability structure contains Message Control register,
-Message Address register and Message Data register. These registers
-provide the bus driver control over MSI. The Message Control register
-indicates the MSI capability supported by the device. The Message
-Address register specifies the target address and the Message Data
-register specifies the characteristics of the message. To request
-service, the device function writes the content of the Message Data
-register to the target address. The device and its software driver
-are prohibited from writing to these registers.
-
-The MSI-X capability structure is an optional extension to MSI. It
-uses an independent and separate capability structure. There are
-some key advantages to implementing the MSI-X capability structure
-over the MSI capability structure as described below.
-
-	- Support a larger maximum number of vectors per function.
-
-	- Provide the ability for system software to configure
-	each vector with an independent message address and message
-	data, specified by a table that resides in Memory Space.
-
-        - MSI and MSI-X both support per-vector masking. Per-vector
-	masking is an optional extension of MSI but a required
-	feature for MSI-X. Per-vector masking provides the kernel the
-	ability to mask/unmask a single MSI while running its
-	interrupt service routine. If per-vector masking is
-	not supported, then the device driver should provide the
-	hardware/software synchronization to ensure that the device
-	generates MSI when the driver wants it to do so.
-
-4. Why use MSI?
-
-As a benefit to the simplification of board design, MSI allows board
-designers to remove out-of-band interrupt routing. MSI is another
-step towards a legacy-free environment.
-
-Due to increasing pressure on chipset and processor packages to
-reduce pin count, the need for interrupt pins is expected to
-diminish over time. Devices, due to pin constraints, may implement
-messages to increase performance.
-
-PCI Express endpoints uses INTx emulation (in-band messages) instead
-of IRQ pin assertion. Using INTx emulation requires interrupt
-sharing among devices connected to the same node (PCI bridge) while
-MSI is unique (non-shared) and does not require BIOS configuration
-support. As a result, the PCI Express technology requires MSI
-support for better interrupt performance.
-
-Using MSI enables the device functions to support two or more
-vectors, which can be configured to target different CPUs to
-increase scalability.
-
-5. Configuring a driver to use MSI/MSI-X
-
-By default, the kernel will not enable MSI/MSI-X on all devices that
-support this capability. The CONFIG_PCI_MSI kernel option
-must be selected to enable MSI/MSI-X support.
-
-5.1 Including MSI/MSI-X support into the kernel
-
-To allow MSI/MSI-X capable device drivers to selectively enable
-MSI/MSI-X (using pci_enable_msi()/pci_enable_msix() as described
-below), the VECTOR based scheme needs to be enabled by setting
-CONFIG_PCI_MSI during kernel config.
-
-Since the target of the inbound message is the local APIC, providing
-CONFIG_X86_LOCAL_APIC must be enabled as well as CONFIG_PCI_MSI.
-
-5.2 Configuring for MSI support
-
-Due to the non-contiguous fashion in vector assignment of the
-existing Linux kernel, this version does not support multiple
-messages regardless of a device function is capable of supporting
-more than one vector. To enable MSI on a device function's MSI
-capability structure requires a device driver to call the function
-pci_enable_msi() explicitly.
-
-5.2.1 API pci_enable_msi
+This guide describes the basics of Message Signaled Interrupts (MSIs),
+the advantages of using MSI over traditional interrupt mechanisms, how
+to change your driver to use MSI or MSI-X and some basic diagnostics to
+try if a device doesn't support MSIs.
+
+
+2. What are MSIs?
+
+A Message Signaled Interrupt is a write from the device to a special
+address which causes an interrupt to be received by the CPU.
+
+The MSI capability was first specified in PCI 2.2 and was later enhanced
+in PCI 3.0 to allow each interrupt to be masked individually.  The MSI-X
+capability was also introduced with PCI 3.0.  It supports more interrupts
+per device than MSI and allows interrupts to be independently configured.
+
+Devices may support both MSI and MSI-X, but only one can be enabled at
+a time.
+
+
+3. Why use MSIs?
+
+There are three reasons why using MSIs can give an advantage over
+traditional pin-based interrupts.
+
+Pin-based PCI interrupts are often shared amongst several devices.
+To support this, the kernel must call each interrupt handler associated
+with an interrupt, which leads to reduced performance for the system as
+a whole.  MSIs are never shared, so this problem cannot arise.
+
+When a device writes data to memory, then raises a pin-based interrupt,
+it is possible that the interrupt may arrive before all the data has
+arrived in memory (this becomes more likely with devices behind PCI-PCI
+bridges).  In order to ensure that all the data has arrived in memory,
+the interrupt handler must read a register on the device which raised
+the interrupt.  PCI transaction ordering rules require that all the data
+arrives in memory before the value can be returned from the register.
+Using MSIs avoids this problem as the interrupt-generating write cannot
+pass the data writes, so by the time the interrupt is raised, the driver
+knows that all the data has arrived in memory.
+
+PCI devices can only support a single pin-based interrupt per function.
+Often drivers have to query the device to find out what event has
+occurred, slowing down interrupt handling for the common case.  With
+MSIs, a device can support more interrupts, allowing each interrupt
+to be specialised to a different purpose.  One possible design gives
+infrequent conditions (such as errors) their own interrupt which allows
+the driver to handle the normal interrupt handling path more efficiently.
+Other possible designs include giving one interrupt to each packet queue
+in a network card or each port in a storage controller.
+
+
+4. How to use MSIs
+
+PCI devices are initialised to use pin-based interrupts.  The device
+driver has to set up the device to use MSI or MSI-X.  Not all machines
+support MSIs correctly, and for those machines, the APIs described below
+will simply fail and the device will continue to use pin-based interrupts.
+
+4.1 Include kernel support for MSIs
+
+To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI
+option enabled.  This option is only available on some architectures,
+and it may depend on some other options also being set.  For example,
+on x86, you must also enable X86_UP_APIC or SMP in order to see the
+CONFIG_PCI_MSI option.
+
+4.2 Using MSI
+
+Most of the hard work is done for the driver in the PCI layer.  It simply
+has to request that the PCI layer set up the MSI capability for this
+device.
+
+4.2.1 pci_enable_msi
 
 int pci_enable_msi(struct pci_dev *dev)
 
-With this new API, a device driver that wants to have MSI
-enabled on its device function must call this API to enable MSI.
-A successful call will initialize the MSI capability structure
-with ONE vector, regardless of whether a device function is
-capable of supporting multiple messages. This vector replaces the
-pre-assigned dev->irq with a new MSI vector. To avoid a conflict
-of the new assigned vector with existing pre-assigned vector requires
-a device driver to call this API before calling request_irq().
+A successful call will allocate ONE interrupt to the device, regardless
+of how many MSIs the device supports.  The device will be switched from
+pin-based interrupt mode to MSI mode.  The dev->irq number is changed
+to a new number which represents the message signaled interrupt.
+This function should be called before the driver calls request_irq()
+since enabling MSIs disables the pin-based IRQ and the driver will not
+receive interrupts on the old interrupt.
 
-5.2.2 API pci_disable_msi
+4.2.2 pci_disable_msi
 
 void pci_disable_msi(struct pci_dev *dev)
 
-This API should always be used to undo the effect of pci_enable_msi()
-when a device driver is unloading. This API restores dev->irq with
-the pre-assigned IOAPIC vector and switches a device's interrupt
-mode to PCI pin-irq assertion/INTx emulation mode.
-
-Note that a device driver should always call free_irq() on the MSI vector
-that it has done request_irq() on before calling this API. Failure to do
-so results in a BUG_ON() and a device will be left with MSI enabled and
-leaks its vector.
-
-5.2.3 MSI mode vs. legacy mode diagram
-
-The below diagram shows the events which switch the interrupt
-mode on the MSI-capable device function between MSI mode and
-PIN-IRQ assertion mode.
-
-	 ------------   pci_enable_msi 	 ------------------------
-	|	     | <===============	| 			 |
-	| MSI MODE   |	  	     	| PIN-IRQ ASSERTION MODE |
-	| 	     | ===============>	|			 |
- 	 ------------	pci_disable_msi  ------------------------
-
-
-Figure 1. MSI Mode vs. Legacy Mode
-
-In Figure 1, a device operates by default in legacy mode. Legacy
-in this context means PCI pin-irq assertion or PCI-Express INTx
-emulation. A successful MSI request (using pci_enable_msi()) switches
-a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector
-stored in dev->irq will be saved by the PCI subsystem and a new
-assigned MSI vector will replace dev->irq.
-
-To return back to its default mode, a device driver should always call
-pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a
-device driver should always call free_irq() on the MSI vector it has
-done request_irq() on before calling pci_disable_msi(). Failure to do
-so results in a BUG_ON() and a device will be left with MSI enabled and
-leaks its vector. Otherwise, the PCI subsystem restores a device's
-dev->irq with a pre-assigned IOAPIC vector and marks the released
-MSI vector as unused.
-
-Once being marked as unused, there is no guarantee that the PCI
-subsystem will reserve this MSI vector for a device. Depending on
-the availability of current PCI vector resources and the number of
-MSI/MSI-X requests from other drivers, this MSI may be re-assigned.
-
-For the case where the PCI subsystem re-assigns this MSI vector to
-another driver, a request to switch back to MSI mode may result
-in being assigned a different MSI vector or a failure if no more
-vectors are available.
-
-5.3 Configuring for MSI-X support
-
-Due to the ability of the system software to configure each vector of
-the MSI-X capability structure with an independent message address
-and message data, the non-contiguous fashion in vector assignment of
-the existing Linux kernel has no impact on supporting multiple
-messages on an MSI-X capable device functions. To enable MSI-X on
-a device function's MSI-X capability structure requires its device
-driver to call the function pci_enable_msix() explicitly.
-
-The function pci_enable_msix(), once invoked, enables either
-all or nothing, depending on the current availability of PCI vector
-resources. If the PCI vector resources are available for the number
-of vectors requested by a device driver, this function will configure
-the MSI-X table of the MSI-X capability structure of a device with
-requested messages. To emphasize this reason, for example, a device
-may be capable for supporting the maximum of 32 vectors while its
-software driver usually may request 4 vectors. It is recommended
-that the device driver should call this function once during the
-initialization phase of the device driver.
-
-Unlike the function pci_enable_msi(), the function pci_enable_msix()
-does not replace the pre-assigned IOAPIC dev->irq with a new MSI
-vector because the PCI subsystem writes the 1:1 vector-to-entry mapping
-into the field vector of each element contained in a second argument.
-Note that the pre-assigned IOAPIC dev->irq is valid only if the device
-operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at
-using dev->irq by the device driver to request for interrupt service
-may result in unpredictable behavior.
-
-For each MSI-X vector granted, a device driver is responsible for calling
-other functions like request_irq(), enable_irq(), etc. to enable
-this vector with its corresponding interrupt service handler. It is
-a device driver's choice to assign all vectors with the same
-interrupt service handler or each vector with a unique interrupt
-service handler.
-
-5.3.1 Handling MMIO address space of MSI-X Table
-
-The PCI 3.0 specification has implementation notes that MMIO address
-space for a device's MSI-X structure should be isolated so that the
-software system can set different pages for controlling accesses to the
-MSI-X structure. The implementation of MSI support requires the PCI
-subsystem, not a device driver, to maintain full control of the MSI-X
-table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X
-table/MSI-X PBA.  A device driver should not access the MMIO address
-space of the MSI-X table/MSI-X PBA.
-
-5.3.2 API pci_enable_msix
-
-int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
+This function should be used to undo the effect of pci_enable_msi().
+Calling it restores dev->irq to the pin-based interrupt number and frees
+the previously allocated message signaled interrupt(s).  The interrupt
+may subsequently be assigned to another device, so drivers should not
+cache the value of dev->irq.
 
-This API enables a device driver to request the PCI subsystem
-to enable MSI-X messages on its hardware device. Depending on
-the availability of PCI vectors resources, the PCI subsystem enables
-either all or none of the requested vectors.
+A device driver must always call free_irq() on the interrupt(s)
+for which it has called request_irq() before calling this function.
+Failure to do so will result in a BUG_ON(), the device will be left with
+MSI enabled and will leak its vector.
 
-Argument 'dev' points to the device (pci_dev) structure.
+4.3 Using MSI-X
 
-Argument 'entries' is a pointer to an array of msix_entry structs.
-The number of entries is indicated in argument 'nvec'.
-struct msix_entry is defined in /driver/pci/msi.h:
+The MSI-X capability is much more flexible than the MSI capability.
+It supports up to 2048 interrupts, each of which can be controlled
+independently.  To support this flexibility, drivers must use an array of
+`struct msix_entry':
 
 struct msix_entry {
 	u16 	vector; /* kernel uses to write alloc vector */
 	u16	entry; /* driver uses to specify entry */
 };
 
-A device driver is responsible for initializing the field 'entry' of
-each element with a unique entry supported by MSI-X table. Otherwise,
--EINVAL will be returned as a result. A successful return of zero
-indicates the PCI subsystem completed initializing each of the requested
-entries of the MSI-X table with message address and message data.
-Last but not least, the PCI subsystem will write the 1:1
-vector-to-entry mapping into the field 'vector' of each element. A
-device driver is responsible for keeping track of allocated MSI-X
-vectors in its internal data structure.
-
-A return of zero indicates that the number of MSI-X vectors was
-successfully allocated. A return of greater than zero indicates
-MSI-X vector shortage. Or a return of less than zero indicates
-a failure. This failure may be a result of duplicate entries
-specified in second argument, or a result of no available vector,
-or a result of failing to initialize MSI-X table entries.
-
-5.3.3 API pci_disable_msix
+This allows for the device to use these interrupts in a sparse fashion;
+for example it could use interrupts 3 and 1027 and allocate only a
+two-element array.  The driver is expected to fill in the 'entry' value
+in each element of the array to indicate which entries it wants the kernel
+to assign interrupts for.  It is invalid to fill in two entries with the
+same number.
+
+4.3.1 pci_enable_msix
+
+int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
+
+Calling this function asks the PCI subsystem to allocate 'nvec' MSIs.
+The 'entries' argument is a pointer to an array of msix_entry structs
+which should be at least 'nvec' entries in size.  On success, the
+function will return 0 and the device will have been switched into
+MSI-X interrupt mode.  The 'vector' elements in each entry will have
+been filled in with the interrupt number.  The driver should then call
+request_irq() for each 'vector' that it decides to use.
+
+If this function returns a negative number, it indicates an error and
+the driver should not attempt to allocate any more MSI-X interrupts for
+this device.  If it returns a positive number, it indicates the maximum
+number of interrupt vectors that could have been allocated.
+
+This function, in contrast with pci_enable_msi(), does not adjust
+dev->irq.  The device will not generate interrupts for this interrupt
+number once MSI-X is enabled.  The device driver is responsible for
+keeping track of the interrupts assigned to the MSI-X vectors so it can
+free them again later.
+
+Device drivers should normally call this function once per device
+during the initialization phase.
+
+4.3.2 pci_disable_msix
 
 void pci_disable_msix(struct pci_dev *dev)
 
-This API should always be used to undo the effect of pci_enable_msix()
-when a device driver is unloading. Note that a device driver should
-always call free_irq() on all MSI-X vectors it has done request_irq()
-on before calling this API. Failure to do so results in a BUG_ON() and
-a device will be left with MSI-X enabled and leaks its vectors.
-
-5.3.4 MSI-X mode vs. legacy mode diagram
-
-The below diagram shows the events which switch the interrupt
-mode on the MSI-X capable device function between MSI-X mode and
-PIN-IRQ assertion mode (legacy).
-
-	 ------------   pci_enable_msix(,,n) ------------------------
-	|	     | <===============	    | 			     |
-	| MSI-X MODE |	  	     	    | PIN-IRQ ASSERTION MODE |
-	| 	     | ===============>	    |			     |
- 	 ------------	pci_disable_msix     ------------------------
-
-Figure 2. MSI-X Mode vs. Legacy Mode
-
-In Figure 2, a device operates by default in legacy mode. A
-successful MSI-X request (using pci_enable_msix()) switches a
-device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector
-stored in dev->irq will be saved by the PCI subsystem; however,
-unlike MSI mode, the PCI subsystem will not replace dev->irq with
-assigned MSI-X vector because the PCI subsystem already writes the 1:1
-vector-to-entry mapping into the field 'vector' of each element
-specified in second argument.
-
-To return back to its default mode, a device driver should always call
-pci_disable_msix() to undo the effect of pci_enable_msix(). Note that
-a device driver should always call free_irq() on all MSI-X vectors it
-has done request_irq() on before calling pci_disable_msix(). Failure
-to do so results in a BUG_ON() and a device will be left with MSI-X
-enabled and leaks its vectors. Otherwise, the PCI subsystem switches a
-device function's interrupt mode from MSI-X mode to legacy mode and
-marks all allocated MSI-X vectors as unused.
-
-Once being marked as unused, there is no guarantee that the PCI
-subsystem will reserve these MSI-X vectors for a device. Depending on
-the availability of current PCI vector resources and the number of
-MSI/MSI-X requests from other drivers, these MSI-X vectors may be
-re-assigned.
-
-For the case where the PCI subsystem re-assigned these MSI-X vectors
-to other drivers, a request to switch back to MSI-X mode may result
-being assigned with another set of MSI-X vectors or a failure if no
-more vectors are available.
-
-5.4 Handling function implementing both MSI and MSI-X capabilities
-
-For the case where a function implements both MSI and MSI-X
-capabilities, the PCI subsystem enables a device to run either in MSI
-mode or MSI-X mode but not both. A device driver determines whether it
-wants MSI or MSI-X enabled on its hardware device. Once a device
-driver requests for MSI, for example, it is prohibited from requesting
-MSI-X; in other words, a device driver is not permitted to ping-pong
-between MSI mod MSI-X mode during a run-time.
-
-5.5 Hardware requirements for MSI/MSI-X support
-
-MSI/MSI-X support requires support from both system hardware and
-individual hardware device functions.
-
-5.5.1 Required x86 hardware support
-
-Since the target of MSI address is the local APIC CPU, enabling
-MSI/MSI-X support in the Linux kernel is dependent on whether existing
-system hardware supports local APIC. Users should verify that their
-system supports local APIC operation by testing that it runs when
-CONFIG_X86_LOCAL_APIC=y.
-
-In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set;
-however, in UP environment, users must manually set
-CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting
-CONFIG_PCI_MSI enables the VECTOR based scheme and the option for
-MSI-capable device drivers to selectively enable MSI/MSI-X.
-
-Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X
-vector is allocated new during runtime and MSI/MSI-X support does not
-depend on BIOS support. This key independency enables MSI/MSI-X
-support on future IOxAPIC free platforms.
-
-5.5.2 Device hardware support
-
-The hardware device function supports MSI by indicating the
-MSI/MSI-X capability structure on its PCI capability list. By
-default, this capability structure will not be initialized by
-the kernel to enable MSI during the system boot. In other words,
-the device function is running on its default pin assertion mode.
-Note that in many cases the hardware supporting MSI have bugs,
-which may result in system hangs. The software driver of specific
-MSI-capable hardware is responsible for deciding whether to call
-pci_enable_msi or not. A return of zero indicates the kernel
-successfully initialized the MSI/MSI-X capability structure of the
-device function. The device function is now running on MSI/MSI-X mode.
-
-5.6 How to tell whether MSI/MSI-X is enabled on device function
-
-At the driver level, a return of zero from the function call of
-pci_enable_msi()/pci_enable_msix() indicates to a device driver that
-its device function is initialized successfully and ready to run in
-MSI/MSI-X mode.
-
-At the user level, users can use the command 'cat /proc/interrupts'
-to display the vectors allocated for devices and their interrupt
-MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is
-enabled on a SCSI Adaptec 39320D Ultra320 controller.
-
-           CPU0       CPU1
-  0:     324639          0    IO-APIC-edge  timer
-  1:       1186          0    IO-APIC-edge  i8042
-  2:          0          0          XT-PIC  cascade
- 12:       2797          0    IO-APIC-edge  i8042
- 14:       6543          0    IO-APIC-edge  ide0
- 15:          1          0    IO-APIC-edge  ide1
-169:          0          0   IO-APIC-level  uhci-hcd
-185:          0          0   IO-APIC-level  uhci-hcd
-193:        138         10         PCI-MSI  aic79xx
-201:         30          0         PCI-MSI  aic79xx
-225:         30          0   IO-APIC-level  aic7xxx
-233:         30          0   IO-APIC-level  aic7xxx
-NMI:          0          0
-LOC:     324553     325068
-ERR:          0
-MIS:          0
-
-6. MSI quirks
-
-Several PCI chipsets or devices are known to not support MSI.
-The PCI stack provides 3 possible levels of MSI disabling:
-* on a single device
-* on all devices behind a specific bridge
-* globally
-
-6.1. Disabling MSI on a single device
-
-Under some circumstances it might be required to disable MSI on a
-single device.  This may be achieved by either not calling pci_enable_msi()
-or all, or setting the pci_dev->no_msi flag before (most of the time
-in a quirk).
-
-6.2. Disabling MSI below a bridge
-
-The vast majority of MSI quirks are required by PCI bridges not
-being able to route MSI between busses. In this case, MSI have to be
-disabled on all devices behind this bridge. It is achieves by setting
-the PCI_BUS_FLAGS_NO_MSI flag in the pci_bus->bus_flags of the bridge
-subordinate bus. There is no need to set the same flag on bridges that
-are below the broken bridge. When pci_enable_msi() is called to enable
-MSI on a device, pci_msi_supported() takes care of checking the NO_MSI
-flag in all parent busses of the device.
-
-Some bridges actually support dynamic MSI support enabling/disabling
-by changing some bits in their PCI configuration space (especially
-the Hypertransport chipsets such as the nVidia nForce and Serverworks
-HT2000). It may then be required to update the NO_MSI flag on the
-corresponding devices in the sysfs hierarchy. To enable MSI support
-on device "0000:00:0e", do:
-
-	echo 1 > /sys/bus/pci/devices/0000:00:0e/msi_bus
-
-To disable MSI support, echo 0 instead of 1. Note that it should be
-used with caution since changing this value might break interrupts.
-
-6.3. Disabling MSI globally
-
-Some extreme cases may require to disable MSI globally on the system.
-For now, the only known case is a Serverworks PCI-X chipsets (MSI are
-not supported on several busses that are not all connected to the
-chipset in the Linux PCI hierarchy). In the vast majority of other
-cases, disabling only behind a specific bridge is enough.
-
-For debugging purpose, the user may also pass pci=nomsi on the kernel
-command-line to explicitly disable MSI globally. But, once the appro-
-priate quirks are added to the kernel, this option should not be
-required anymore.
-
-6.4. Finding why MSI cannot be enabled on a device
-
-Assuming that MSI are not enabled on a device, you should look at
-dmesg to find messages that quirks may output when disabling MSI
-on some devices, some bridges or even globally.
-Then, lspci -t gives the list of bridges above a device. Reading
-/sys/bus/pci/devices/0000:00:0e/msi_bus will tell you whether MSI
-are enabled (1) or disabled (0). In 0 is found in a single bridge
-msi_bus file above the device, MSI cannot be enabled.
-
-7. FAQ
-
-Q1. Are there any limitations on using the MSI?
-
-A1. If the PCI device supports MSI and conforms to the
-specification and the platform supports the APIC local bus,
-then using MSI should work.
-
-Q2. Will it work on all the Pentium processors (P3, P4, Xeon,
-AMD processors)? In P3 IPI's are transmitted on the APIC local
-bus and in P4 and Xeon they are transmitted on the system
-bus. Are there any implications with this?
-
-A2. MSI support enables a PCI device sending an inbound
-memory write (0xfeexxxxx as target address) on its PCI bus
-directly to the FSB. Since the message address has a
-redirection hint bit cleared, it should work.
-
-Q3. The target address 0xfeexxxxx will be translated by the
-Host Bridge into an interrupt message. Are there any
-limitations on the chipsets such as Intel 8xx, Intel e7xxx,
-or VIA?
-
-A3. If these chipsets support an inbound memory write with
-target address set as 0xfeexxxxx, as conformed to PCI
-specification 2.3 or latest, then it should work.
-
-Q4. From the driver point of view, if the MSI is lost because
-of errors occurring during inbound memory write, then it may
-wait forever. Is there a mechanism for it to recover?
-
-A4. Since the target of the transaction is an inbound memory
-write, all transaction termination conditions (Retry,
-Master-Abort, Target-Abort, or normal completion) are
-supported. A device sending an MSI must abide by all the PCI
-rules and conditions regarding that inbound memory write. So,
-if a retry is signaled it must retry, etc... We believe that
-the recommendation for Abort is also a retry (refer to PCI
-specification 2.3 or latest).
+This API should be used to undo the effect of pci_enable_msix().  It frees
+the previously allocated message signaled interrupts.  The interrupts may
+subsequently be assigned to another device, so drivers should not cache
+the value of the 'vector' elements over a call to pci_disable_msix().
+
+A device driver must always call free_irq() on the interrupt(s)
+for which it has called request_irq() before calling this function.
+Failure to do so will result in a BUG_ON(), the device will be left with
+MSI enabled and will leak its vector.
+
+4.3.3 The MSI-X Table
+
+The MSI-X capability specifies a BAR and offset within that BAR for the
+MSI-X Table.  This address is mapped by the PCI subsystem, and should not
+be accessed directly by the device driver.  If the driver wishes to
+mask or unmask an interrupt, it should call disable_irq() / enable_irq().
+
+4.4 Handling devices implementing both MSI and MSI-X capabilities
+
+If a device implements both MSI and MSI-X capabilities, it can
+run in either MSI mode or MSI-X mode but not both simultaneously.
+This is a requirement of the PCI spec, and it is enforced by the
+PCI layer.  Calling pci_enable_msi() when MSI-X is already enabled or
+pci_enable_msix() when MSI is already enabled will result in an error.
+If a device driver wishes to switch between MSI and MSI-X at runtime,
+it must first quiesce the device, then switch it back to pin-interrupt
+mode, before calling pci_enable_msi() or pci_enable_msix() and resuming
+operation.  This is not expected to be a common operation but may be
+useful for debugging or testing during development.
+
+4.5 Considerations when using MSIs
+
+4.5.1 Choosing between MSI-X and MSI
+
+If your device supports both MSI-X and MSI capabilities, you should use
+the MSI-X facilities in preference to the MSI facilities.  As mentioned
+above, MSI-X supports any number of interrupts between 1 and 2048.
+In constrast, MSI is restricted to a maximum of 32 interrupts (and
+must be a power of two).  In addition, the MSI interrupt vectors must
+be allocated consecutively, so the system may not be able to allocate
+as many vectors for MSI as it could for MSI-X.  On some platforms, MSI
+interrupts must all be targetted at the same set of CPUs whereas MSI-X
+interrupts can all be targetted at different CPUs.
+
+4.5.2 Spinlocks
+
+Most device drivers have a per-device spinlock which is taken in the
+interrupt handler.  With pin-based interrupts or a single MSI, it is not
+necessary to disable interrupts (Linux guarantees the same interrupt will
+not be re-entered).  If a device uses multiple interrupts, the driver
+must disable interrupts while the lock is held.  If the device sends
+a different interrupt, the driver will deadlock trying to recursively
+acquire the spinlock.
+
+There are two solutions.  The first is to take the lock with
+spin_lock_irqsave() or spin_lock_irq() (see
+Documentation/DocBook/kernel-locking).  The second is to specify
+IRQF_DISABLED to request_irq() so that the kernel runs the entire
+interrupt routine with interrupts disabled.
+
+If your MSI interrupt routine does not hold the lock for the whole time
+it is running, the first solution may be best.  The second solution is
+normally preferred as it avoids making two transitions from interrupt
+disabled to enabled and back again.
+
+4.6 How to tell whether MSI/MSI-X is enabled on a device
+
+Using 'lspci -v' (as root) may show some devices with "MSI", "Message
+Signalled Interrupts" or "MSI-X" capabilities.  Each of these capabilities
+has an 'Enable' flag which will be followed with either "+" (enabled)
+or "-" (disabled).
+
+
+5. MSI quirks
+
+Several PCI chipsets or devices are known not to support MSIs.
+The PCI stack provides three ways to disable MSIs:
+
+1. globally
+2. on all devices behind a specific bridge
+3. on a single device
+
+5.1. Disabling MSIs globally
+
+Some host chipsets simply don't support MSIs properly.  If we're
+lucky, the manufacturer knows this and has indicated it in the ACPI
+FADT table.  In this case, Linux will automatically disable MSIs.
+Some boards don't include this information in the table and so we have
+to detect them ourselves.  The complete list of these is found near the
+quirk_disable_all_msi() function in drivers/pci/quirks.c.
+
+If you have a board which has problems with MSIs, you can pass pci=nomsi
+on the kernel command line to disable MSIs on all devices.  It would be
+in your best interests to report the problem to linux-pci@vger.kernel.org
+including a full 'lspci -v' so we can add the quirks to the kernel.
+
+5.2. Disabling MSIs below a bridge
+
+Some PCI bridges are not able to route MSIs between busses properly.
+In this case, MSIs must be disabled on all devices behind the bridge.
+
+Some bridges allow you to enable MSIs by changing some bits in their
+PCI configuration space (especially the Hypertransport chipsets such
+as the nVidia nForce and Serverworks HT2000).  As with host chipsets,
+Linux mostly knows about them and automatically enables MSIs if it can.
+If you have a bridge which Linux doesn't yet know about, you can enable
+MSIs in configuration space using whatever method you know works, then
+enable MSIs on that bridge by doing:
+
+       echo 1 > /sys/bus/pci/devices/$bridge/msi_bus
+
+where $bridge is the PCI address of the bridge you've enabled (eg
+0000:00:0e.0).
+
+To disable MSIs, echo 0 instead of 1.  Changing this value should be
+done with caution as it can break interrupt handling for all devices
+below this bridge.
+
+Again, please notify linux-pci@vger.kernel.org of any bridges that need
+special handling.
+
+5.3. Disabling MSIs on a single device
+
+Some devices are known to have faulty MSI implementations.  Usually this
+is handled in the individual device driver but occasionally it's necessary
+to handle this with a quirk.  Some drivers have an option to disable use
+of MSI.  While this is a convenient workaround for the driver author,
+it is not good practise, and should not be emulated.
+
+5.4. Finding why MSIs are disabled on a device
+
+From the above three sections, you can see that there are many reasons
+why MSIs may not be enabled for a given device.  Your first step should
+be to examine your dmesg carefully to determine whether MSIs are enabled
+for your machine.  You should also check your .config to be sure you
+have enabled CONFIG_PCI_MSI.
+
+Then, 'lspci -t' gives the list of bridges above a device.  Reading
+/sys/bus/pci/devices/*/msi_bus will tell you whether MSI are enabled (1)
+or disabled (0).  If 0 is found in any of the msi_bus files belonging
+to bridges between the PCI root and the device, MSIs are disabled.
+
+It is also worth checking the device driver to see whether it supports MSIs.
+For example, it may contain calls to pci_enable_msi(), pci_enable_msix() or
+pci_enable_msi_block().
-- 
cgit v1.2.3


From 24d27553390c69d11cdbd930d635193956fc295f Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 17 Mar 2009 08:54:06 -0400
Subject: PCI MSI: Replace 'type' with 'is_msix'

By changing from a 5-bit field to a 1-bit field, we free up some bits
that can be used by a later patch.  Also rearrange the fields for better
packing on 64-bit platforms (reducing the size of msi_desc from 72 bytes
to 64 bytes).

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/msi.c   | 115 ++++++++++++++++++----------------------------------
 include/linux/msi.h |   4 +-
 2 files changed, 41 insertions(+), 78 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index dceea56f734..b3db4388f97 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -111,20 +111,10 @@ static void msix_flush_writes(struct irq_desc *desc)
 
 	entry = get_irq_desc_msi(desc);
 	BUG_ON(!entry || !entry->dev);
-	switch (entry->msi_attrib.type) {
-	case PCI_CAP_ID_MSI:
-		/* nothing to do */
-		break;
-	case PCI_CAP_ID_MSIX:
-	{
+	if (entry->msi_attrib.is_msix) {
 		int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
 			PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
 		readl(entry->mask_base + offset);
-		break;
-	}
-	default:
-		BUG();
-		break;
 	}
 }
 
@@ -143,32 +133,23 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
 
 	entry = get_irq_desc_msi(desc);
 	BUG_ON(!entry || !entry->dev);
-	switch (entry->msi_attrib.type) {
-	case PCI_CAP_ID_MSI:
-		if (entry->msi_attrib.maskbit) {
-			int pos;
-			u32 mask_bits;
-
-			pos = (long)entry->mask_base;
-			pci_read_config_dword(entry->dev, pos, &mask_bits);
-			mask_bits &= ~(mask);
-			mask_bits |= flag & mask;
-			pci_write_config_dword(entry->dev, pos, mask_bits);
-		} else {
-			return 0;
-		}
-		break;
-	case PCI_CAP_ID_MSIX:
-	{
+	if (entry->msi_attrib.is_msix) {
 		int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
 			PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
 		writel(flag, entry->mask_base + offset);
 		readl(entry->mask_base + offset);
-		break;
-	}
-	default:
-		BUG();
-		break;
+	} else {
+		int pos;
+		u32 mask_bits;
+
+		if (!entry->msi_attrib.maskbit)
+			return 0;
+
+		pos = (long)entry->mask_base;
+		pci_read_config_dword(entry->dev, pos, &mask_bits);
+		mask_bits &= ~mask;
+		mask_bits |= flag & mask;
+		pci_write_config_dword(entry->dev, pos, mask_bits);
 	}
 	entry->msi_attrib.masked = !!flag;
 	return 1;
@@ -177,9 +158,14 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
 void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 {
 	struct msi_desc *entry = get_irq_desc_msi(desc);
-	switch(entry->msi_attrib.type) {
-	case PCI_CAP_ID_MSI:
-	{
+	if (entry->msi_attrib.is_msix) {
+		void __iomem *base = entry->mask_base +
+			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+		msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+		msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+		msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
+	} else {
 		struct pci_dev *dev = entry->dev;
 		int pos = entry->msi_attrib.pos;
 		u16 data;
@@ -195,21 +181,6 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 			pci_read_config_word(dev, msi_data_reg(pos, 0), &data);
 		}
 		msg->data = data;
-		break;
-	}
-	case PCI_CAP_ID_MSIX:
-	{
-		void __iomem *base;
-		base = entry->mask_base +
-			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
-
-		msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
-		msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
-		msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
- 		break;
- 	}
- 	default:
-		BUG();
 	}
 }
 
@@ -223,9 +194,17 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
 void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 {
 	struct msi_desc *entry = get_irq_desc_msi(desc);
-	switch (entry->msi_attrib.type) {
-	case PCI_CAP_ID_MSI:
-	{
+	if (entry->msi_attrib.is_msix) {
+		void __iomem *base;
+		base = entry->mask_base +
+			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+		writel(msg->address_lo,
+			base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+		writel(msg->address_hi,
+			base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+		writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
+	} else {
 		struct pci_dev *dev = entry->dev;
 		int pos = entry->msi_attrib.pos;
 
@@ -240,23 +219,6 @@ void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 			pci_write_config_word(dev, msi_data_reg(pos, 0),
 						msg->data);
 		}
-		break;
-	}
-	case PCI_CAP_ID_MSIX:
-	{
-		void __iomem *base;
-		base = entry->mask_base +
-			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
-
-		writel(msg->address_lo,
-			base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
-		writel(msg->address_hi,
-			base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
-		writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
-		break;
-	}
-	default:
-		BUG();
 	}
 	entry->msg = *msg;
 }
@@ -393,7 +355,7 @@ static int msi_capability_init(struct pci_dev *dev)
 	if (!entry)
 		return -ENOMEM;
 
-	entry->msi_attrib.type = PCI_CAP_ID_MSI;
+	entry->msi_attrib.is_msix = 0;
 	entry->msi_attrib.is_64 = is_64bit_address(control);
 	entry->msi_attrib.entry_nr = 0;
 	entry->msi_attrib.maskbit = is_mask_bit_support(control);
@@ -475,7 +437,7 @@ static int msix_capability_init(struct pci_dev *dev,
 			break;
 
  		j = entries[i].entry;
-		entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+		entry->msi_attrib.is_msix = 1;
 		entry->msi_attrib.is_64 = 1;
 		entry->msi_attrib.entry_nr = j;
 		entry->msi_attrib.maskbit = 1;
@@ -619,12 +581,13 @@ void pci_msi_shutdown(struct pci_dev* dev)
 		struct irq_desc *desc = irq_to_desc(dev->irq);
 		msi_set_mask_bits(desc, mask, ~mask);
 	}
-	if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
+	if (!entry->dev || entry->msi_attrib.is_msix)
 		return;
 
 	/* Restore dev->irq to its default pin-assertion irq */
 	dev->irq = entry->msi_attrib.default_irq;
 }
+
 void pci_disable_msi(struct pci_dev* dev)
 {
 	struct msi_desc *entry;
@@ -635,7 +598,7 @@ void pci_disable_msi(struct pci_dev* dev)
 	pci_msi_shutdown(dev);
 
 	entry = list_entry(dev->msi_list.next, struct msi_desc, list);
-	if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
+	if (!entry->dev || entry->msi_attrib.is_msix)
 		return;
 
 	msi_free_irqs(dev);
@@ -654,7 +617,7 @@ static int msi_free_irqs(struct pci_dev* dev)
 	arch_teardown_msi_irqs(dev);
 
 	list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
-		if (entry->msi_attrib.type == PCI_CAP_ID_MSIX) {
+		if (entry->msi_attrib.is_msix) {
 			writel(1, entry->mask_base + entry->msi_attrib.entry_nr
 				  * PCI_MSIX_ENTRY_SIZE
 				  + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index d2b8a1e8ca1..9c5ce214fbf 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -20,13 +20,13 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
 
 struct msi_desc {
 	struct {
-		__u8	type	: 5; 	/* {0: unused, 5h:MSI, 11h:MSI-X} */
+		__u8	is_msix	: 1;
 		__u8	maskbit	: 1; 	/* mask-pending bit supported ?   */
 		__u8	masked	: 1;
 		__u8	is_64	: 1;	/* Address size: 0=32bit 1=64bit  */
 		__u8	pos;	 	/* Location of the msi capability */
-		__u32	maskbits_mask;  /* mask bits mask */
 		__u16	entry_nr;    	/* specific enabled entry 	  */
+		__u32	maskbits_mask;  /* mask bits mask */
 		unsigned default_irq;	/* default pre-assigned irq	  */
 	}msi_attrib;
 
-- 
cgit v1.2.3


From 379f5327a86f7822a51ec7d088a085167724df75 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 17 Mar 2009 08:54:07 -0400
Subject: PCI MSI: msi_desc->dev is always initialised

By passing the pci_dev into alloc_msi_entry() we can be sure that
the ->dev entry is always assigned and so we don't need to check it.
Also, we used kzalloc() so we don't need to initialise ->irq to 0.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/msi.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index b3db4388f97..a658c0f34e1 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -110,7 +110,7 @@ static void msix_flush_writes(struct irq_desc *desc)
 	struct msi_desc *entry;
 
 	entry = get_irq_desc_msi(desc);
-	BUG_ON(!entry || !entry->dev);
+	BUG_ON(!entry);
 	if (entry->msi_attrib.is_msix) {
 		int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
 			PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
@@ -132,7 +132,7 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
 	struct msi_desc *entry;
 
 	entry = get_irq_desc_msi(desc);
-	BUG_ON(!entry || !entry->dev);
+	BUG_ON(!entry);
 	if (entry->msi_attrib.is_msix) {
 		int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
 			PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
@@ -248,19 +248,16 @@ void unmask_msi_irq(unsigned int irq)
 
 static int msi_free_irqs(struct pci_dev* dev);
 
-static struct msi_desc* alloc_msi_entry(void)
+static struct msi_desc *alloc_msi_entry(struct pci_dev *dev)
 {
-	struct msi_desc *entry;
-
-	entry = kzalloc(sizeof(struct msi_desc), GFP_KERNEL);
-	if (!entry)
+	struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	if (!desc)
 		return NULL;
 
-	INIT_LIST_HEAD(&entry->list);
-	entry->irq = 0;
-	entry->dev = NULL;
+	INIT_LIST_HEAD(&desc->list);
+	desc->dev = dev;
 
-	return entry;
+	return desc;
 }
 
 static void pci_intx_for_msi(struct pci_dev *dev, int enable)
@@ -351,7 +348,7 @@ static int msi_capability_init(struct pci_dev *dev)
    	pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
 	pci_read_config_word(dev, msi_control_reg(pos), &control);
 	/* MSI Entry Initialization */
-	entry = alloc_msi_entry();
+	entry = alloc_msi_entry(dev);
 	if (!entry)
 		return -ENOMEM;
 
@@ -362,7 +359,6 @@ static int msi_capability_init(struct pci_dev *dev)
 	entry->msi_attrib.masked = 1;
 	entry->msi_attrib.default_irq = dev->irq;	/* Save IOAPIC IRQ */
 	entry->msi_attrib.pos = pos;
-	entry->dev = dev;
 	if (entry->msi_attrib.maskbit) {
 		unsigned int base, maskbits, temp;
 
@@ -432,7 +428,7 @@ static int msix_capability_init(struct pci_dev *dev,
 
 	/* MSI-X Table Initialization */
 	for (i = 0; i < nvec; i++) {
-		entry = alloc_msi_entry();
+		entry = alloc_msi_entry(dev);
 		if (!entry)
 			break;
 
@@ -444,7 +440,6 @@ static int msix_capability_init(struct pci_dev *dev,
 		entry->msi_attrib.masked = 1;
 		entry->msi_attrib.default_irq = dev->irq;
 		entry->msi_attrib.pos = pos;
-		entry->dev = dev;
 		entry->mask_base = base;
 
 		list_add_tail(&entry->list, &dev->msi_list);
@@ -581,7 +576,7 @@ void pci_msi_shutdown(struct pci_dev* dev)
 		struct irq_desc *desc = irq_to_desc(dev->irq);
 		msi_set_mask_bits(desc, mask, ~mask);
 	}
-	if (!entry->dev || entry->msi_attrib.is_msix)
+	if (entry->msi_attrib.is_msix)
 		return;
 
 	/* Restore dev->irq to its default pin-assertion irq */
@@ -598,7 +593,7 @@ void pci_disable_msi(struct pci_dev* dev)
 	pci_msi_shutdown(dev);
 
 	entry = list_entry(dev->msi_list.next, struct msi_desc, list);
-	if (!entry->dev || entry->msi_attrib.is_msix)
+	if (entry->msi_attrib.is_msix)
 		return;
 
 	msi_free_irqs(dev);
-- 
cgit v1.2.3


From 264d9caaa1c574c0274b019a810abfe957391005 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 17 Mar 2009 08:54:08 -0400
Subject: PCI MSI: Use mask_pos instead of mask_base when appropriate

MSI interrupts have a mask_pos where MSI-X have a mask_base.  Use a
transparent union to get rid of some ugly casts.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/msi.c   | 5 ++---
 include/linux/msi.h | 5 ++++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index a658c0f34e1..fcde04df6df 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -145,7 +145,7 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
 		if (!entry->msi_attrib.maskbit)
 			return 0;
 
-		pos = (long)entry->mask_base;
+		pos = entry->mask_pos;
 		pci_read_config_dword(entry->dev, pos, &mask_bits);
 		mask_bits &= ~mask;
 		mask_bits |= flag & mask;
@@ -363,8 +363,7 @@ static int msi_capability_init(struct pci_dev *dev)
 		unsigned int base, maskbits, temp;
 
 		base = msi_mask_bits_reg(pos, entry->msi_attrib.is_64);
-		entry->mask_base = (void __iomem *)(long)base;
-
+		entry->mask_pos = base;
 		/* All MSIs are unmasked by default, Mask them all */
 		pci_read_config_dword(dev, base, &maskbits);
 		temp = msi_mask((control & PCI_MSI_FLAGS_QMASK) >> 1);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 9c5ce214fbf..5025ca4d91e 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -33,7 +33,10 @@ struct msi_desc {
 	unsigned int irq;
 	struct list_head list;
 
-	void __iomem *mask_base;
+	union {
+		void __iomem *mask_base;
+		u8 mask_pos;
+	};
 	struct pci_dev *dev;
 
 	/* Last set MSI message */
-- 
cgit v1.2.3


From f2440d9acbe866b917b16cc0f927366341ce9215 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 17 Mar 2009 08:54:09 -0400
Subject: PCI MSI: Refactor interrupt masking code

Since most of the callers already know whether they have an MSI or
an MSI-X capability, split msi_set_mask_bits() into msi_mask_irq()
and msix_mask_irq().  The only callers which don't (mask_msi_irq()
and unmask_msi_irq()) can share code in msi_set_mask_bit().  This then
becomes the only caller of msix_flush_writes(), so we can inline it.
The flushing read can be to any address that belongs to the device,
so we can eliminate the calculation too.

We can also get rid of maskbits_mask from struct msi_desc and simply
recalculate it on the rare occasion that we need it.  The single-bit
'masked' element is replaced by a copy of the 32-bit 'masked' register,
so this patch does not affect the size of msi_desc.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/msi.c   | 155 +++++++++++++++++++++++++---------------------------
 include/linux/msi.h |   5 +-
 2 files changed, 77 insertions(+), 83 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index fcde04df6df..adcc7824257 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -105,17 +105,14 @@ static inline __attribute_const__ u32 msi_mask(unsigned x)
 	return (1 << (1 << x)) - 1;
 }
 
-static void msix_flush_writes(struct irq_desc *desc)
+static inline __attribute_const__ u32 msi_capable_mask(u16 control)
 {
-	struct msi_desc *entry;
+	return msi_mask((control >> 1) & 7);
+}
 
-	entry = get_irq_desc_msi(desc);
-	BUG_ON(!entry);
-	if (entry->msi_attrib.is_msix) {
-		int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
-			PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
-		readl(entry->mask_base + offset);
-	}
+static inline __attribute_const__ u32 msi_enabled_mask(u16 control)
+{
+	return msi_mask((control >> 4) & 7);
 }
 
 /*
@@ -127,32 +124,57 @@ static void msix_flush_writes(struct irq_desc *desc)
  * Returns 1 if it succeeded in masking the interrupt and 0 if the device
  * doesn't support MSI masking.
  */
-static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
+static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
 {
-	struct msi_desc *entry;
+	u32 mask_bits = desc->masked;
 
-	entry = get_irq_desc_msi(desc);
-	BUG_ON(!entry);
-	if (entry->msi_attrib.is_msix) {
-		int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
-			PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
-		writel(flag, entry->mask_base + offset);
-		readl(entry->mask_base + offset);
-	} else {
-		int pos;
-		u32 mask_bits;
+	if (!desc->msi_attrib.maskbit)
+		return;
+
+	mask_bits &= ~mask;
+	mask_bits |= flag;
+	pci_write_config_dword(desc->dev, desc->mask_pos, mask_bits);
+	desc->masked = mask_bits;
+}
+
+/*
+ * This internal function does not flush PCI writes to the device.
+ * All users must ensure that they read from the device before either
+ * assuming that the device state is up to date, or returning out of this
+ * file.  This saves a few milliseconds when initialising devices with lots
+ * of MSI-X interrupts.
+ */
+static void msix_mask_irq(struct msi_desc *desc, u32 flag)
+{
+	u32 mask_bits = desc->masked;
+	unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+					PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+	mask_bits &= ~1;
+	mask_bits |= flag;
+	writel(mask_bits, desc->mask_base + offset);
+	desc->masked = mask_bits;
+}
 
-		if (!entry->msi_attrib.maskbit)
-			return 0;
+static void msi_set_mask_bit(unsigned irq, u32 flag)
+{
+	struct msi_desc *desc = get_irq_msi(irq);
 
-		pos = entry->mask_pos;
-		pci_read_config_dword(entry->dev, pos, &mask_bits);
-		mask_bits &= ~mask;
-		mask_bits |= flag & mask;
-		pci_write_config_dword(entry->dev, pos, mask_bits);
+	if (desc->msi_attrib.is_msix) {
+		msix_mask_irq(desc, flag);
+		readl(desc->mask_base);		/* Flush write to device */
+	} else {
+		msi_mask_irq(desc, 1, flag);
 	}
-	entry->msi_attrib.masked = !!flag;
-	return 1;
+}
+
+void mask_msi_irq(unsigned int irq)
+{
+	msi_set_mask_bit(irq, 1);
+}
+
+void unmask_msi_irq(unsigned int irq)
+{
+	msi_set_mask_bit(irq, 0);
 }
 
 void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
@@ -230,22 +252,6 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
 	write_msi_msg_desc(desc, msg);
 }
 
-void mask_msi_irq(unsigned int irq)
-{
-	struct irq_desc *desc = irq_to_desc(irq);
-
-	msi_set_mask_bits(desc, 1, 1);
-	msix_flush_writes(desc);
-}
-
-void unmask_msi_irq(unsigned int irq)
-{
-	struct irq_desc *desc = irq_to_desc(irq);
-
-	msi_set_mask_bits(desc, 1, 0);
-	msix_flush_writes(desc);
-}
-
 static int msi_free_irqs(struct pci_dev* dev);
 
 static struct msi_desc *alloc_msi_entry(struct pci_dev *dev)
@@ -281,13 +287,9 @@ static void __pci_restore_msi_state(struct pci_dev *dev)
 	pci_intx_for_msi(dev, 0);
 	msi_set_enable(dev, 0);
 	write_msi_msg(dev->irq, &entry->msg);
-	if (entry->msi_attrib.maskbit) {
-		struct irq_desc *desc = irq_to_desc(dev->irq);
-		msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask,
-				  entry->msi_attrib.masked);
-	}
 
 	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
+	msi_mask_irq(entry, msi_capable_mask(control), entry->masked);
 	control &= ~PCI_MSI_FLAGS_QSIZE;
 	control |= PCI_MSI_FLAGS_ENABLE;
 	pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control);
@@ -307,9 +309,8 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
 	msix_set_enable(dev, 0);
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
-		struct irq_desc *desc = irq_to_desc(entry->irq);
 		write_msi_msg(entry->irq, &entry->msg);
-		msi_set_mask_bits(desc, 1, entry->msi_attrib.masked);
+		msix_mask_irq(entry, entry->masked);
 	}
 
 	BUG_ON(list_empty(&dev->msi_list));
@@ -342,6 +343,7 @@ static int msi_capability_init(struct pci_dev *dev)
 	struct msi_desc *entry;
 	int pos, ret;
 	u16 control;
+	unsigned mask;
 
 	msi_set_enable(dev, 0);	/* Ensure msi is disabled as I set it up */
 
@@ -356,21 +358,16 @@ static int msi_capability_init(struct pci_dev *dev)
 	entry->msi_attrib.is_64 = is_64bit_address(control);
 	entry->msi_attrib.entry_nr = 0;
 	entry->msi_attrib.maskbit = is_mask_bit_support(control);
-	entry->msi_attrib.masked = 1;
 	entry->msi_attrib.default_irq = dev->irq;	/* Save IOAPIC IRQ */
 	entry->msi_attrib.pos = pos;
-	if (entry->msi_attrib.maskbit) {
-		unsigned int base, maskbits, temp;
-
-		base = msi_mask_bits_reg(pos, entry->msi_attrib.is_64);
-		entry->mask_pos = base;
-		/* All MSIs are unmasked by default, Mask them all */
-		pci_read_config_dword(dev, base, &maskbits);
-		temp = msi_mask((control & PCI_MSI_FLAGS_QMASK) >> 1);
-		maskbits |= temp;
-		pci_write_config_dword(dev, base, maskbits);
-		entry->msi_attrib.maskbits_mask = temp;
-	}
+
+	entry->mask_pos = msi_mask_bits_reg(pos, entry->msi_attrib.is_64);
+	/* All MSIs are unmasked by default, Mask them all */
+	if (entry->msi_attrib.maskbit)
+		pci_read_config_dword(dev, entry->mask_pos, &entry->masked);
+	mask = msi_capable_mask(control);
+	msi_mask_irq(entry, mask, mask);
+
 	list_add_tail(&entry->list, &dev->msi_list);
 
 	/* Configure MSI capability structure */
@@ -435,11 +432,12 @@ static int msix_capability_init(struct pci_dev *dev,
 		entry->msi_attrib.is_msix = 1;
 		entry->msi_attrib.is_64 = 1;
 		entry->msi_attrib.entry_nr = j;
-		entry->msi_attrib.maskbit = 1;
-		entry->msi_attrib.masked = 1;
 		entry->msi_attrib.default_irq = dev->irq;
 		entry->msi_attrib.pos = pos;
 		entry->mask_base = base;
+		entry->masked = readl(base + j * PCI_MSIX_ENTRY_SIZE +
+					PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+		msix_mask_irq(entry, 1);
 
 		list_add_tail(&entry->list, &dev->msi_list);
 	}
@@ -556,9 +554,11 @@ int pci_enable_msi(struct pci_dev* dev)
 }
 EXPORT_SYMBOL(pci_enable_msi);
 
-void pci_msi_shutdown(struct pci_dev* dev)
+void pci_msi_shutdown(struct pci_dev *dev)
 {
-	struct msi_desc *entry;
+	struct msi_desc *desc;
+	u32 mask;
+	u16 ctrl;
 
 	if (!pci_msi_enable || !dev || !dev->msi_enabled)
 		return;
@@ -568,18 +568,13 @@ void pci_msi_shutdown(struct pci_dev* dev)
 	dev->msi_enabled = 0;
 
 	BUG_ON(list_empty(&dev->msi_list));
-	entry = list_entry(dev->msi_list.next, struct msi_desc, list);
-	/* Return the the pci reset with msi irqs unmasked */
-	if (entry->msi_attrib.maskbit) {
-		u32 mask = entry->msi_attrib.maskbits_mask;
-		struct irq_desc *desc = irq_to_desc(dev->irq);
-		msi_set_mask_bits(desc, mask, ~mask);
-	}
-	if (entry->msi_attrib.is_msix)
-		return;
+	desc = list_first_entry(&dev->msi_list, struct msi_desc, list);
+	pci_read_config_word(dev, desc->msi_attrib.pos + PCI_MSI_FLAGS, &ctrl);
+	mask = msi_capable_mask(ctrl);
+	msi_mask_irq(desc, mask, ~mask);
 
 	/* Restore dev->irq to its default pin-assertion irq */
-	dev->irq = entry->msi_attrib.default_irq;
+	dev->irq = desc->msi_attrib.default_irq;
 }
 
 void pci_disable_msi(struct pci_dev* dev)
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 5025ca4d91e..37c1bbe546e 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -22,14 +22,13 @@ struct msi_desc {
 	struct {
 		__u8	is_msix	: 1;
 		__u8	maskbit	: 1; 	/* mask-pending bit supported ?   */
-		__u8	masked	: 1;
 		__u8	is_64	: 1;	/* Address size: 0=32bit 1=64bit  */
 		__u8	pos;	 	/* Location of the msi capability */
 		__u16	entry_nr;    	/* specific enabled entry 	  */
-		__u32	maskbits_mask;  /* mask bits mask */
 		unsigned default_irq;	/* default pre-assigned irq	  */
-	}msi_attrib;
+	} msi_attrib;
 
+	u32 masked;			/* mask bits */
 	unsigned int irq;
 	struct list_head list;
 
-- 
cgit v1.2.3


From 1c8d7b0a562da06d3ebe83f01b1ed553205d1ae4 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Tue, 17 Mar 2009 08:54:10 -0400
Subject: PCI MSI: Add support for multiple MSI

Add the new API pci_enable_msi_block() to allow drivers to
request multiple MSI and reimplement pci_enable_msi in terms of
pci_enable_msi_block.  Ensure that the architecture back ends don't
have to know about multiple MSI.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/PCI/MSI-HOWTO.txt | 45 +++++++++++++++++---
 arch/powerpc/kernel/msi.c       |  4 ++
 arch/x86/kernel/io_apic.c       |  4 ++
 drivers/pci/msi.c               | 91 +++++++++++++++++++++++++++++------------
 drivers/pci/msi.h               |  6 ---
 include/linux/msi.h             |  1 +
 include/linux/pci.h             |  6 ++-
 7 files changed, 116 insertions(+), 41 deletions(-)

diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
index 1c02431f1d1..9494f6dc38e 100644
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -94,15 +94,48 @@ This function should be called before the driver calls request_irq()
 since enabling MSIs disables the pin-based IRQ and the driver will not
 receive interrupts on the old interrupt.
 
-4.2.2 pci_disable_msi
+4.2.2 pci_enable_msi_block
+
+int pci_enable_msi_block(struct pci_dev *dev, int count)
+
+This variation on the above call allows a device driver to request multiple
+MSIs.  The MSI specification only allows interrupts to be allocated in
+powers of two, up to a maximum of 2^5 (32).
+
+If this function returns 0, it has succeeded in allocating at least as many
+interrupts as the driver requested (it may have allocated more in order
+to satisfy the power-of-two requirement).  In this case, the function
+enables MSI on this device and updates dev->irq to be the lowest of
+the new interrupts assigned to it.  The other interrupts assigned to
+the device are in the range dev->irq to dev->irq + count - 1.
+
+If this function returns a negative number, it indicates an error and
+the driver should not attempt to request any more MSI interrupts for
+this device.  If this function returns a positive number, it will be
+less than 'count' and indicate the number of interrupts that could have
+been allocated.  In neither case will the irq value have been
+updated, nor will the device have been switched into MSI mode.
+
+The device driver must decide what action to take if
+pci_enable_msi_block() returns a value less than the number asked for.
+Some devices can make use of fewer interrupts than the maximum they
+request; in this case the driver should call pci_enable_msi_block()
+again.  Note that it is not guaranteed to succeed, even when the
+'count' has been reduced to the value returned from a previous call to
+pci_enable_msi_block().  This is because there are multiple constraints
+on the number of vectors that can be allocated; pci_enable_msi_block()
+will return as soon as it finds any constraint that doesn't allow the
+call to succeed.
+
+4.2.3 pci_disable_msi
 
 void pci_disable_msi(struct pci_dev *dev)
 
-This function should be used to undo the effect of pci_enable_msi().
-Calling it restores dev->irq to the pin-based interrupt number and frees
-the previously allocated message signaled interrupt(s).  The interrupt
-may subsequently be assigned to another device, so drivers should not
-cache the value of dev->irq.
+This function should be used to undo the effect of pci_enable_msi() or
+pci_enable_msi_block().  Calling it restores dev->irq to the pin-based
+interrupt number and frees the previously allocated message signaled
+interrupt(s).  The interrupt may subsequently be assigned to another
+device, so drivers should not cache the value of dev->irq.
 
 A device driver must always call free_irq() on the interrupt(s)
 for which it has called request_irq() before calling this function.
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
index 3bb7d3dd28b..0c16e2a854e 100644
--- a/arch/powerpc/kernel/msi.c
+++ b/arch/powerpc/kernel/msi.c
@@ -19,6 +19,10 @@ int arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
 		return -ENOSYS;
 	}
 
+	/* PowerPC doesn't support multiple MSI yet */
+	if (type == PCI_CAP_ID_MSI && nvec > 1)
+		return 1;
+
 	if (ppc_md.msi_check_device) {
 		pr_debug("msi: Using platform check routine.\n");
 		return ppc_md.msi_check_device(dev, nvec, type);
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index bc7ac4da90d..a09549a6321 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -3510,6 +3510,10 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	int index = 0;
 #endif
 
+	/* x86 doesn't support multiple MSI yet */
+	if (type == PCI_CAP_ID_MSI && nvec > 1)
+		return 1;
+
 	irq_want = nr_irqs_gsi;
 	sub_handle = 0;
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index adcc7824257..6f2e6295e77 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -40,6 +40,13 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	struct msi_desc *entry;
 	int ret;
 
+	/*
+	 * If an architecture wants to support multiple MSI, it needs to
+	 * override arch_setup_msi_irqs()
+	 */
+	if (type == PCI_CAP_ID_MSI && nvec > 1)
+		return 1;
+
 	list_for_each_entry(entry, &dev->msi_list, list) {
 		ret = arch_setup_msi_irq(dev, entry);
 		if (ret < 0)
@@ -58,8 +65,12 @@ void arch_teardown_msi_irqs(struct pci_dev *dev)
 	struct msi_desc *entry;
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
-		if (entry->irq != 0)
-			arch_teardown_msi_irq(entry->irq);
+		int i, nvec;
+		if (entry->irq == 0)
+			continue;
+		nvec = 1 << entry->msi_attrib.multiple;
+		for (i = 0; i < nvec; i++)
+			arch_teardown_msi_irq(entry->irq + i);
 	}
 }
 #endif
@@ -163,7 +174,8 @@ static void msi_set_mask_bit(unsigned irq, u32 flag)
 		msix_mask_irq(desc, flag);
 		readl(desc->mask_base);		/* Flush write to device */
 	} else {
-		msi_mask_irq(desc, 1, flag);
+		unsigned offset = irq - desc->dev->irq;
+		msi_mask_irq(desc, 1 << offset, flag << offset);
 	}
 }
 
@@ -229,6 +241,12 @@ void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 	} else {
 		struct pci_dev *dev = entry->dev;
 		int pos = entry->msi_attrib.pos;
+		u16 msgctl;
+
+		pci_read_config_word(dev, msi_control_reg(pos), &msgctl);
+		msgctl &= ~PCI_MSI_FLAGS_QSIZE;
+		msgctl |= entry->msi_attrib.multiple << 4;
+		pci_write_config_word(dev, msi_control_reg(pos), msgctl);
 
 		pci_write_config_dword(dev, msi_lower_address_reg(pos),
 					msg->address_lo);
@@ -291,7 +309,7 @@ static void __pci_restore_msi_state(struct pci_dev *dev)
 	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
 	msi_mask_irq(entry, msi_capable_mask(control), entry->masked);
 	control &= ~PCI_MSI_FLAGS_QSIZE;
-	control |= PCI_MSI_FLAGS_ENABLE;
+	control |= (entry->msi_attrib.multiple << 4) | PCI_MSI_FLAGS_ENABLE;
 	pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control);
 }
 
@@ -332,13 +350,15 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state);
 /**
  * msi_capability_init - configure device's MSI capability structure
  * @dev: pointer to the pci_dev data structure of MSI device function
+ * @nvec: number of interrupts to allocate
  *
- * Setup the MSI capability structure of device function with a single
- * MSI irq, regardless of device function is capable of handling
- * multiple messages. A return of zero indicates the successful setup
- * of an entry zero with the new MSI irq or non-zero for otherwise.
- **/
-static int msi_capability_init(struct pci_dev *dev)
+ * Setup the MSI capability structure of the device with the requested
+ * number of interrupts.  A return value of zero indicates the successful
+ * setup of an entry with the new MSI irq.  A negative return value indicates
+ * an error, and a positive return value indicates the number of interrupts
+ * which could have been allocated.
+ */
+static int msi_capability_init(struct pci_dev *dev, int nvec)
 {
 	struct msi_desc *entry;
 	int pos, ret;
@@ -371,7 +391,7 @@ static int msi_capability_init(struct pci_dev *dev)
 	list_add_tail(&entry->list, &dev->msi_list);
 
 	/* Configure MSI capability structure */
-	ret = arch_setup_msi_irqs(dev, 1, PCI_CAP_ID_MSI);
+	ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI);
 	if (ret) {
 		msi_free_irqs(dev);
 		return ret;
@@ -524,35 +544,48 @@ static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type)
 }
 
 /**
- * pci_enable_msi - configure device's MSI capability structure
- * @dev: pointer to the pci_dev data structure of MSI device function
+ * pci_enable_msi_block - configure device's MSI capability structure
+ * @dev: device to configure
+ * @nvec: number of interrupts to configure
  *
- * Setup the MSI capability structure of device function with
- * a single MSI irq upon its software driver call to request for
- * MSI mode enabled on its hardware device function. A return of zero
- * indicates the successful setup of an entry zero with the new MSI
- * irq or non-zero for otherwise.
- **/
-int pci_enable_msi(struct pci_dev* dev)
+ * Allocate IRQs for a device with the MSI capability.
+ * This function returns a negative errno if an error occurs.  If it
+ * is unable to allocate the number of interrupts requested, it returns
+ * the number of interrupts it might be able to allocate.  If it successfully
+ * allocates at least the number of interrupts requested, it returns 0 and
+ * updates the @dev's irq member to the lowest new interrupt number; the
+ * other interrupt numbers allocated to this device are consecutive.
+ */
+int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
 {
-	int status;
+	int status, pos, maxvec;
+	u16 msgctl;
+
+	pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+	if (!pos)
+		return -EINVAL;
+	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
+	maxvec = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1);
+	if (nvec > maxvec)
+		return maxvec;
 
-	status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
+	status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI);
 	if (status)
 		return status;
 
 	WARN_ON(!!dev->msi_enabled);
 
-	/* Check whether driver already requested for MSI-X irqs */
+	/* Check whether driver already requested MSI-X irqs */
 	if (dev->msix_enabled) {
 		dev_info(&dev->dev, "can't enable MSI "
 			 "(MSI-X already enabled)\n");
 		return -EINVAL;
 	}
-	status = msi_capability_init(dev);
+
+	status = msi_capability_init(dev, nvec);
 	return status;
 }
-EXPORT_SYMBOL(pci_enable_msi);
+EXPORT_SYMBOL(pci_enable_msi_block);
 
 void pci_msi_shutdown(struct pci_dev *dev)
 {
@@ -599,8 +632,12 @@ static int msi_free_irqs(struct pci_dev* dev)
 	struct msi_desc *entry, *tmp;
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
-		if (entry->irq)
-			BUG_ON(irq_has_action(entry->irq));
+		int i, nvec;
+		if (!entry->irq)
+			continue;
+		nvec = 1 << entry->msi_attrib.multiple;
+		for (i = 0; i < nvec; i++)
+			BUG_ON(irq_has_action(entry->irq + i));
 	}
 
 	arch_teardown_msi_irqs(dev);
diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
index 3898f523714..71f4df2ef65 100644
--- a/drivers/pci/msi.h
+++ b/drivers/pci/msi.h
@@ -20,14 +20,8 @@
 #define msi_mask_bits_reg(base, is64bit) \
 	( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
 #define msi_disable(control)		control &= ~PCI_MSI_FLAGS_ENABLE
-#define multi_msi_capable(control) \
-	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
-#define multi_msi_enable(control, num) \
-	control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
 #define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
 #define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
-#define msi_enable(control, num) multi_msi_enable(control, num); \
-	control |= PCI_MSI_FLAGS_ENABLE
 
 #define msix_table_offset_reg(base)	(base + 0x04)
 #define msix_pba_offset_reg(base)	(base + 0x08)
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 37c1bbe546e..6991ab5b24d 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -21,6 +21,7 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
 struct msi_desc {
 	struct {
 		__u8	is_msix	: 1;
+		__u8	multiple: 3;	/* log2 number of messages */
 		__u8	maskbit	: 1; 	/* mask-pending bit supported ?   */
 		__u8	is_64	: 1;	/* Address size: 0=32bit 1=64bit  */
 		__u8	pos;	 	/* Location of the msi capability */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7baf2a5db12..1f6c5ddaae3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -789,7 +789,7 @@ struct msix_entry {
 
 
 #ifndef CONFIG_PCI_MSI
-static inline int pci_enable_msi(struct pci_dev *dev)
+static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
 {
 	return -1;
 }
@@ -824,7 +824,7 @@ static inline int pci_msi_enabled(void)
 	return 0;
 }
 #else
-extern int pci_enable_msi(struct pci_dev *dev);
+extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec);
 extern void pci_msi_shutdown(struct pci_dev *dev);
 extern void pci_disable_msi(struct pci_dev *dev);
 extern int pci_msix_table_size(struct pci_dev *dev);
@@ -846,6 +846,8 @@ static inline int pcie_aspm_enabled(void)
 extern int pcie_aspm_enabled(void);
 #endif
 
+#define pci_enable_msi(pdev)	pci_enable_msi_block(pdev, 1)
+
 #ifdef CONFIG_HT_IRQ
 /* The functions a driver should call */
 int  ht_create_irq(struct pci_dev *dev, int idx);
-- 
cgit v1.2.3


From 32a9a682bef2f6fce7026bd94d1ce20028b0e52d Mon Sep 17 00:00:00 2001
From: Yuji Shimada <shimada-yxb@necst.nec.co.jp>
Date: Mon, 16 Mar 2009 17:13:39 +0900
Subject: PCI: allow assignment of memory resources with a specified alignment

This patch allows memory resources to be assigned with a specified
alignment at boot-time or run-time. The patch is useful when we use PCI
pass-through, because page-aligned memory resources are required to
securely share PCI resources with guest drivers.

If you want to assign the resource at boot time, please set
"pci=resource_alignment=" boot parameter.

This is format of "pci=resource_alignment=" boot parameter:

        [<order of align>@][<domain>:]<bus>:<slot>.<func>[; ...]
                Specifies alignment and device to reassign
                aligned memory resources.
                If <order of align> is not specified, PAGE_SIZE is
                used as alignment.
                PCI-PCI bridge can be specified, if resource
                windows need to be expanded.

This is example:

        pci=resource_alignment=20@07:00.0;18@0f:00.0;00:1d.7

If you want to assign the resource at run-time, please set
"/sys/bus/pci/resource_alignment" file, and hot-remove the device and
hot-add the device.  For this purpose, fakephp or PCI hotplug interfaces
can be used.

The format of "/sys/bus/pci/resource_alignment" file is the same with
boot parameter. You can use "," instead of ";".

For example:

        # cd /sys/bus/pci
        # echo -n 20@12:00.0 > resource_alignment
        # echo 1 > devices/0000:12:00.0/remove
        # echo 1 > rescan

Reviewed-by: Alex Chiang <achiang@hp.com>
Reviewed-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Yuji Shimada <shimada-yxb@necst.nec.co.jp>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/kernel-parameters.txt |   9 +++
 drivers/pci/pci.c                   | 120 ++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.h                   |   6 ++
 drivers/pci/quirks.c                |  60 ++++++++++++++++++
 drivers/pci/setup-res.c             |  15 +++++
 5 files changed, 210 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c7c441e7930..1754fedc531 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1760,6 +1760,15 @@ and is between 256 and 4096 characters. It is defined in the file
 		cbmemsize=nn[KMG]	The fixed amount of bus space which is
 				reserved for the CardBus bridge's memory
 				window. The default value is 64 megabytes.
+		resource_alignment=
+				Format:
+				[<order of align>@][<domain>:]<bus>:<slot>.<func>[; ...]
+				Specifies alignment and device to reassign
+				aligned memory resources.
+				If <order of align> is not specified,
+				PAGE_SIZE is used as alignment.
+				PCI-PCI bridge can be specified, if resource
+				windows need to be expanded.
 
 	pcie_aspm=	[PCIE] Forcibly enable or disable PCIe Active State Power
 			Management.
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8310dc2f943..a35a8b2ba63 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -20,6 +20,8 @@
 #include <linux/pm_wakeup.h>
 #include <linux/interrupt.h>
 #include <asm/dma.h>	/* isa_dma_bridge_buggy */
+#include <linux/device.h>
+#include <asm/setup.h>
 #include "pci.h"
 
 unsigned int pci_pm_d3_delay = PCI_PM_D3_WAIT;
@@ -2370,6 +2372,121 @@ int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type)
 	return 0;
 }
 
+#define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE
+static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0};
+spinlock_t resource_alignment_lock = SPIN_LOCK_UNLOCKED;
+
+/**
+ * pci_specified_resource_alignment - get resource alignment specified by user.
+ * @dev: the PCI device to get
+ *
+ * RETURNS: Resource alignment if it is specified.
+ *          Zero if it is not specified.
+ */
+resource_size_t pci_specified_resource_alignment(struct pci_dev *dev)
+{
+	int seg, bus, slot, func, align_order, count;
+	resource_size_t align = 0;
+	char *p;
+
+	spin_lock(&resource_alignment_lock);
+	p = resource_alignment_param;
+	while (*p) {
+		count = 0;
+		if (sscanf(p, "%d%n", &align_order, &count) == 1 &&
+							p[count] == '@') {
+			p += count + 1;
+		} else {
+			align_order = -1;
+		}
+		if (sscanf(p, "%x:%x:%x.%x%n",
+			&seg, &bus, &slot, &func, &count) != 4) {
+			seg = 0;
+			if (sscanf(p, "%x:%x.%x%n",
+					&bus, &slot, &func, &count) != 3) {
+				/* Invalid format */
+				printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: %s\n",
+					p);
+				break;
+			}
+		}
+		p += count;
+		if (seg == pci_domain_nr(dev->bus) &&
+			bus == dev->bus->number &&
+			slot == PCI_SLOT(dev->devfn) &&
+			func == PCI_FUNC(dev->devfn)) {
+			if (align_order == -1) {
+				align = PAGE_SIZE;
+			} else {
+				align = 1 << align_order;
+			}
+			/* Found */
+			break;
+		}
+		if (*p != ';' && *p != ',') {
+			/* End of param or invalid format */
+			break;
+		}
+		p++;
+	}
+	spin_unlock(&resource_alignment_lock);
+	return align;
+}
+
+/**
+ * pci_is_reassigndev - check if specified PCI is target device to reassign
+ * @dev: the PCI device to check
+ *
+ * RETURNS: non-zero for PCI device is a target device to reassign,
+ *          or zero is not.
+ */
+int pci_is_reassigndev(struct pci_dev *dev)
+{
+	return (pci_specified_resource_alignment(dev) != 0);
+}
+
+ssize_t pci_set_resource_alignment_param(const char *buf, size_t count)
+{
+	if (count > RESOURCE_ALIGNMENT_PARAM_SIZE - 1)
+		count = RESOURCE_ALIGNMENT_PARAM_SIZE - 1;
+	spin_lock(&resource_alignment_lock);
+	strncpy(resource_alignment_param, buf, count);
+	resource_alignment_param[count] = '\0';
+	spin_unlock(&resource_alignment_lock);
+	return count;
+}
+
+ssize_t pci_get_resource_alignment_param(char *buf, size_t size)
+{
+	size_t count;
+	spin_lock(&resource_alignment_lock);
+	count = snprintf(buf, size, "%s", resource_alignment_param);
+	spin_unlock(&resource_alignment_lock);
+	return count;
+}
+
+static ssize_t pci_resource_alignment_show(struct bus_type *bus, char *buf)
+{
+	return pci_get_resource_alignment_param(buf, PAGE_SIZE);
+}
+
+static ssize_t pci_resource_alignment_store(struct bus_type *bus,
+					const char *buf, size_t count)
+{
+	return pci_set_resource_alignment_param(buf, count);
+}
+
+BUS_ATTR(resource_alignment, 0644, pci_resource_alignment_show,
+					pci_resource_alignment_store);
+
+static int __init pci_resource_alignment_sysfs_init(void)
+{
+	return bus_create_file(&pci_bus_type,
+					&bus_attr_resource_alignment);
+}
+
+late_initcall(pci_resource_alignment_sysfs_init);
+
 static void __devinit pci_no_domains(void)
 {
 #ifdef CONFIG_PCI_DOMAINS
@@ -2418,6 +2535,9 @@ static int __init pci_setup(char *str)
 				pci_cardbus_io_size = memparse(str + 9, &str);
 			} else if (!strncmp(str, "cbmemsize=", 10)) {
 				pci_cardbus_mem_size = memparse(str + 10, &str);
+			} else if (!strncmp(str, "resource_alignment=", 19)) {
+				pci_set_resource_alignment_param(str + 19,
+							strlen(str + 19));
 			} else {
 				printk(KERN_ERR "PCI: Unknown option `%s'\n",
 						str);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 07c0aa5275e..2cd1cba7236 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -195,4 +195,10 @@ static inline int pci_ari_enabled(struct pci_bus *bus)
 	return bus->self && bus->self->ari_enabled;
 }
 
+#ifdef CONFIG_PCI_QUIRKS
+extern int pci_is_reassigndev(struct pci_dev *dev);
+resource_size_t pci_specified_resource_alignment(struct pci_dev *dev);
+extern void pci_disable_bridge_window(struct pci_dev *dev);
+#endif
+
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 5aa2afb23ef..50233818a76 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -24,6 +24,7 @@
 #include <linux/kallsyms.h>
 #include <linux/dmi.h>
 #include <linux/pci-aspm.h>
+#include <linux/ioport.h>
 #include "pci.h"
 
 int isa_dma_bridge_buggy;
@@ -34,6 +35,65 @@ int pcie_mch_quirk;
 EXPORT_SYMBOL(pcie_mch_quirk);
 
 #ifdef CONFIG_PCI_QUIRKS
+/*
+ * This quirk function disables the device and releases resources
+ * which is specified by kernel's boot parameter 'pci=resource_alignment='.
+ * It also rounds up size to specified alignment.
+ * Later on, the kernel will assign page-aligned memory resource back
+ * to that device.
+ */
+static void __devinit quirk_resource_alignment(struct pci_dev *dev)
+{
+	int i;
+	struct resource *r;
+	resource_size_t align, size;
+
+	if (!pci_is_reassigndev(dev))
+		return;
+
+	if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL &&
+	    (dev->class >> 8) == PCI_CLASS_BRIDGE_HOST) {
+		dev_warn(&dev->dev,
+			"Can't reassign resources to host bridge.\n");
+		return;
+	}
+
+	dev_info(&dev->dev, "Disabling device and release resources.\n");
+	pci_disable_device(dev);
+
+	align = pci_specified_resource_alignment(dev);
+	for (i=0; i < PCI_BRIDGE_RESOURCES; i++) {
+		r = &dev->resource[i];
+		if (!(r->flags & IORESOURCE_MEM))
+			continue;
+		size = resource_size(r);
+		if (size < align) {
+			size = align;
+			dev_info(&dev->dev,
+				"Rounding up size of resource #%d to %#llx.\n",
+				i, (unsigned long long)size);
+		}
+		r->end = size - 1;
+		r->start = 0;
+	}
+	/* Need to disable bridge's resource window,
+	 * to enable the kernel to reassign new resource
+	 * window later on.
+	 */
+	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE &&
+	    (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
+		for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) {
+			r = &dev->resource[i];
+			if (!(r->flags & IORESOURCE_MEM))
+				continue;
+			r->end = resource_size(r) - 1;
+			r->start = 0;
+		}
+		pci_disable_bridge_window(dev);
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, quirk_resource_alignment);
+
 /* The Mellanox Tavor device gives false positive parity errors
  * Mark this device with a broken_parity_status, to allow
  * PCI scanning code to "skip" this now blacklisted device.
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 32e8d88a461..3039fcb86af 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -120,6 +120,21 @@ int pci_claim_resource(struct pci_dev *dev, int resource)
 	return err;
 }
 
+#ifdef CONFIG_PCI_QUIRKS
+void pci_disable_bridge_window(struct pci_dev *dev)
+{
+	dev_dbg(&dev->dev, "Disabling bridge window.\n");
+
+	/* MMIO Base/Limit */
+	pci_write_config_dword(dev, PCI_MEMORY_BASE, 0x0000fff0);
+
+	/* Prefetchable MMIO Base/Limit */
+	pci_write_config_dword(dev, PCI_PREF_LIMIT_UPPER32, 0);
+	pci_write_config_dword(dev, PCI_PREF_MEMORY_BASE, 0x0000fff0);
+	pci_write_config_dword(dev, PCI_PREF_BASE_UPPER32, 0xffffffff);
+}
+#endif	/* CONFIG_PCI_QUIRKS */
+
 int pci_assign_resource(struct pci_dev *dev, int resno)
 {
 	struct pci_bus *bus = dev->bus;
-- 
cgit v1.2.3


From 6a3b3e26803fc823058fbb05abb5e0d92a52e1bd Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sun, 15 Mar 2009 20:14:37 +0100
Subject: PCI: Use kzalloc() in pci_create_bus()

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/probe.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 23362e8c696..9e7d642e66b 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1114,7 +1114,7 @@ struct pci_bus * pci_create_bus(struct device *parent,
 	if (!b)
 		return NULL;
 
-	dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	if (!dev){
 		kfree(b);
 		return NULL;
@@ -1133,7 +1133,6 @@ struct pci_bus * pci_create_bus(struct device *parent,
 	list_add_tail(&b->node, &pci_root_buses);
 	up_write(&pci_bus_sem);
 
-	memset(dev, 0, sizeof(*dev));
 	dev->parent = parent;
 	dev->release = pci_release_bus_bridge_dev;
 	dev_set_name(dev, "pci%04x:%02x", pci_domain_nr(b), bus);
-- 
cgit v1.2.3


From 9efb5fe1b80a45130ba659ba755f24534c83301b Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Mon, 9 Mar 2009 12:08:15 -0600
Subject: PCI: PCIe portdrv: eliminate double kfree in remove path

Commit 55633af3 (PCIe portdrv: Use driver data to simplify code)
added a kfree of the driver private data in pcie_port_device_remove
but forgot to remove the old kfree from pcie_portdrv_remove.

Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pcie/portdrv_pci.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index a61f4930d67..b924e2463f8 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -94,7 +94,6 @@ static void pcie_portdrv_remove (struct pci_dev *dev)
 {
 	pcie_port_device_remove(dev);
 	pci_disable_device(dev);
-	kfree(pci_get_drvdata(dev));
 }
 
 static int error_detected_iter(struct device *device, void *data)
-- 
cgit v1.2.3


From 745be2e700cdddd5da4e402854a484242c3628df Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Sat, 7 Mar 2009 21:46:49 -0700
Subject: PCIe: portdrv: call pci_disable_device during remove

The PCIe port driver calls pci_enable_device when registering
ports, but never calls pci_disable_device during removal.

Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pcie/portdrv_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 5a5bfe7cdf5..e3998250386 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -473,6 +473,7 @@ void pcie_port_device_remove(struct pci_dev *dev)
 	struct pcie_port_data *port_data = pci_get_drvdata(dev);
 
 	device_for_each_child(&dev->dev, NULL, remove_iter);
+	pci_disable_device(dev);
 
 	switch (port_data->port_irq_mode) {
 	case PCIE_PORT_MSIX_MODE:
-- 
cgit v1.2.3


From dfadd9edff498d767008edc6b2a6e86a7a19934d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 8 Mar 2009 21:35:37 -0700
Subject: PCI/x86: detect host bridge config space size w/o using quirks

Many host bridges support a 4k config space, so check them directy
instead of using quirks to add them.

We only need to do this extra check for host bridges at this point,
because only host bridges are known to have extended address space
without also having a PCI-X/PCI-E caps.  Other devices with this
property could be done with quirks (if there are any).

As a bonus, we can remove the quirks for AMD host bridges with family
10h and 11h since they're not needed any more.

With this patch, we can get correct pci cfg size of new Intel CPUs/IOHs
with host bridges.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Cc: <stable@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/fixup.c | 20 --------------------
 drivers/pci/probe.c  |  9 ++++++++-
 2 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 7d388d5cf54..096b0ed0713 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -494,26 +494,6 @@ static void __devinit pci_siemens_interrupt_controller(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015,
 			  pci_siemens_interrupt_controller);
 
-/*
- * Regular PCI devices have 256 bytes, but AMD Family 10h/11h CPUs have
- * 4096 bytes configuration space for each function of their processor
- * configuration space.
- */
-static void amd_cpu_pci_cfg_space_size(struct pci_dev *dev)
-{
-	dev->cfg_size = pci_cfg_space_size_ext(dev);
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1300, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1301, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1302, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1303, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1304, amd_cpu_pci_cfg_space_size);
-
 /*
  * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from
  * confusing the PCI engine:
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 9e7d642e66b..579a56c8181 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -847,6 +847,11 @@ int pci_cfg_space_size(struct pci_dev *dev)
 {
 	int pos;
 	u32 status;
+	u16 class;
+
+	class = dev->class >> 8;
+	if (class == PCI_CLASS_BRIDGE_HOST)
+		return pci_cfg_space_size_ext(dev);
 
 	pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
 	if (!pos) {
@@ -936,7 +941,6 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
 	dev->multifunction = !!(hdr_type & 0x80);
 	dev->vendor = l & 0xffff;
 	dev->device = (l >> 16) & 0xffff;
-	dev->cfg_size = pci_cfg_space_size(dev);
 	dev->error_state = pci_channel_io_normal;
 	set_pcie_port_type(dev);
 
@@ -952,6 +956,9 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
 		return NULL;
 	}
 
+	/* need to have dev->class ready */
+	dev->cfg_size = pci_cfg_space_size(dev);
+
 	return dev;
 }
 
-- 
cgit v1.2.3


From 217f45de3d2f68b6d0b646bb4f2a155a71648396 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@linux.ie>
Date: Wed, 4 Mar 2009 05:57:05 +0000
Subject: PCI: expose boot VGA device via sysfs.

X really would like to know which VGA device was considered the boot
device by the system. The x86 PCI fixups have support for discovering
this but we provide no way to expose it to userspace.

This adds a sysfs file per VGA class device which has the value 0 for
non the boot device or unknown, and 1 if the VGA device is the boot
device.

Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Dave Airlie <airlied@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci-sysfs.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 1c892980140..ec7a175dcba 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -240,6 +240,17 @@ struct device_attribute pci_dev_attrs[] = {
 	__ATTR_NULL,
 };
 
+static ssize_t
+boot_vga_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	return sprintf(buf, "%u\n",
+		!!(pdev->resource[PCI_ROM_RESOURCE].flags &
+		   IORESOURCE_ROM_SHADOW));
+}
+struct device_attribute vga_attr = __ATTR_RO(boot_vga);
+
 static ssize_t
 pci_read_config(struct kobject *kobj, struct bin_attribute *bin_attr,
 		char *buf, loff_t off, size_t count)
@@ -899,18 +910,27 @@ int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev)
 		pdev->rom_attr = attr;
 	}
 
+	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) {
+		retval = device_create_file(&pdev->dev, &vga_attr);
+		if (retval)
+			goto err_rom_file;
+	}
+
 	/* add platform-specific attributes */
 	retval = pcibios_add_platform_entries(pdev);
 	if (retval)
-		goto err_rom_file;
+		goto err_vga_file;
 
 	/* add sysfs entries for various capabilities */
 	retval = pci_create_capabilities_sysfs(pdev);
 	if (retval)
-		goto err_rom_file;
+		goto err_vga_file;
 
 	return 0;
 
+err_vga_file:
+	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
+		device_remove_file(&pdev->dev, &vga_attr);
 err_rom_file:
 	if (rom_size) {
 		sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr);
-- 
cgit v1.2.3


From 8293b0f629095efbe7c7e3f9b437f8c040c19eb5 Mon Sep 17 00:00:00 2001
From: David O'Shea <dcoshea@hotmail.com>
Date: Mon, 2 Mar 2009 09:51:13 +0100
Subject: PCI: Compaq Evo D510 SMBus quirk using USB instead of VGA

On the Compaq Evo D510 SFF/CMT, a PCI quirk activated the SMBus device
based on detection of the on-board VGA controller, but the on-board
VGA is disabled if an AGP card is inserted, so look for one of the USB
controllers instead.

Signed-off-by: David O'Shea <dcoshea@hotmail.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/quirks.c    | 9 +++++++--
 include/linux/pci_ids.h | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 50233818a76..7ddcfc65e79 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -1186,10 +1186,15 @@ static void __init asus_hides_smbus_hostbridge(struct pci_dev *dev)
 				 * its on-board VGA controller */
 				asus_hides_smbus = 1;
 			}
-		else if (dev->device == PCI_DEVICE_ID_INTEL_82845G_IG)
+		else if (dev->device == PCI_DEVICE_ID_INTEL_82801DB_2)
 			switch(dev->subsystem_device) {
 			case 0x00b8: /* Compaq Evo D510 CMT */
 			case 0x00b9: /* Compaq Evo D510 SFF */
+				/* Motherboard doesn't have Host bridge
+				 * subvendor/subdevice IDs and on-board VGA
+				 * controller is disabled if an AGP card is
+				 * inserted, therefore checking USB UHCI
+				 * Controller #1 */
 				asus_hides_smbus = 1;
 			}
 		else if (dev->device == PCI_DEVICE_ID_INTEL_82815_CGC)
@@ -1214,7 +1219,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82855GM_HB,	as
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82915GM_HB, asus_hides_smbus_hostbridge);
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82810_IG3,	asus_hides_smbus_hostbridge);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82845G_IG,	asus_hides_smbus_hostbridge);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82801DB_2,	asus_hides_smbus_hostbridge);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82815_CGC,	asus_hides_smbus_hostbridge);
 
 static void asus_hides_smbus_lpc(struct pci_dev *dev)
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index aca8c458aa8..3ddf8beabdf 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2373,6 +2373,7 @@
 #define PCI_DEVICE_ID_INTEL_82801CA_12	0x248c
 #define PCI_DEVICE_ID_INTEL_82801DB_0	0x24c0
 #define PCI_DEVICE_ID_INTEL_82801DB_1	0x24c1
+#define PCI_DEVICE_ID_INTEL_82801DB_2	0x24c2
 #define PCI_DEVICE_ID_INTEL_82801DB_3	0x24c3
 #define PCI_DEVICE_ID_INTEL_82801DB_5	0x24c5
 #define PCI_DEVICE_ID_INTEL_82801DB_6	0x24c6
-- 
cgit v1.2.3


From d1b054da8f599905f3c18a218961dcf17f9d5f13 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Fri, 20 Mar 2009 11:25:11 +0800
Subject: PCI: initialize and release SR-IOV capability

If a device has the SR-IOV capability, initialize it (set the ARI
Capable Hierarchy in the lowest numbered PF if necessary; calculate
the System Page Size for the VF MMIO, probe the VF Offset, Stride
and BARs). A lock for the VF bus allocation is also initialized if
a PF is the lowest numbered PF.

Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/Kconfig      |  10 +++
 drivers/pci/Makefile     |   2 +
 drivers/pci/iov.c        | 182 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.c        |   7 ++
 drivers/pci/pci.h        |  37 ++++++++++
 drivers/pci/probe.c      |   4 ++
 include/linux/pci.h      |  11 +++
 include/linux/pci_regs.h |  33 +++++++++
 8 files changed, 286 insertions(+)
 create mode 100644 drivers/pci/iov.c

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 2a4501dd251..fdc864f9cf2 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -59,3 +59,13 @@ config HT_IRQ
 	   This allows native hypertransport devices to use interrupts.
 
 	   If unsure say Y.
+
+config PCI_IOV
+	bool "PCI IOV support"
+	depends on PCI
+	help
+	  I/O Virtualization is a PCI feature supported by some devices
+	  which allows them to create virtual devices which share their
+	  physical resources.
+
+	  If unsure, say N.
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 3d07ce24f6a..ba6af162fd3 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -29,6 +29,8 @@ obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
 
 obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
 
+obj-$(CONFIG_PCI_IOV) += iov.o
+
 #
 # Some architectures use the generic PCI setup functions
 #
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
new file mode 100644
index 00000000000..66cc414ed15
--- /dev/null
+++ b/drivers/pci/iov.c
@@ -0,0 +1,182 @@
+/*
+ * drivers/pci/iov.c
+ *
+ * Copyright (C) 2009 Intel Corporation, Yu Zhao <yu.zhao@intel.com>
+ *
+ * PCI Express I/O Virtualization (IOV) support.
+ *   Single Root IOV 1.0
+ */
+
+#include <linux/pci.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include "pci.h"
+
+
+static int sriov_init(struct pci_dev *dev, int pos)
+{
+	int i;
+	int rc;
+	int nres;
+	u32 pgsz;
+	u16 ctrl, total, offset, stride;
+	struct pci_sriov *iov;
+	struct resource *res;
+	struct pci_dev *pdev;
+
+	if (dev->pcie_type != PCI_EXP_TYPE_RC_END &&
+	    dev->pcie_type != PCI_EXP_TYPE_ENDPOINT)
+		return -ENODEV;
+
+	pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
+	if (ctrl & PCI_SRIOV_CTRL_VFE) {
+		pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
+		ssleep(1);
+	}
+
+	pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total);
+	if (!total)
+		return 0;
+
+	ctrl = 0;
+	list_for_each_entry(pdev, &dev->bus->devices, bus_list)
+		if (pdev->is_physfn)
+			goto found;
+
+	pdev = NULL;
+	if (pci_ari_enabled(dev->bus))
+		ctrl |= PCI_SRIOV_CTRL_ARI;
+
+found:
+	pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
+	pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, total);
+	pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset);
+	pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride);
+	if (!offset || (total > 1 && !stride))
+		return -EIO;
+
+	pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
+	i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
+	pgsz &= ~((1 << i) - 1);
+	if (!pgsz)
+		return -EIO;
+
+	pgsz &= ~(pgsz - 1);
+	pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);
+
+	nres = 0;
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = dev->resource + PCI_IOV_RESOURCES + i;
+		i += __pci_read_base(dev, pci_bar_unknown, res,
+				     pos + PCI_SRIOV_BAR + i * 4);
+		if (!res->flags)
+			continue;
+		if (resource_size(res) & (PAGE_SIZE - 1)) {
+			rc = -EIO;
+			goto failed;
+		}
+		res->end = res->start + resource_size(res) * total - 1;
+		nres++;
+	}
+
+	iov = kzalloc(sizeof(*iov), GFP_KERNEL);
+	if (!iov) {
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	iov->pos = pos;
+	iov->nres = nres;
+	iov->ctrl = ctrl;
+	iov->total = total;
+	iov->offset = offset;
+	iov->stride = stride;
+	iov->pgsz = pgsz;
+	iov->self = dev;
+	pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap);
+	pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link);
+
+	if (pdev)
+		iov->dev = pci_dev_get(pdev);
+	else {
+		iov->dev = dev;
+		mutex_init(&iov->lock);
+	}
+
+	dev->sriov = iov;
+	dev->is_physfn = 1;
+
+	return 0;
+
+failed:
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = dev->resource + PCI_IOV_RESOURCES + i;
+		res->flags = 0;
+	}
+
+	return rc;
+}
+
+static void sriov_release(struct pci_dev *dev)
+{
+	if (dev == dev->sriov->dev)
+		mutex_destroy(&dev->sriov->lock);
+	else
+		pci_dev_put(dev->sriov->dev);
+
+	kfree(dev->sriov);
+	dev->sriov = NULL;
+}
+
+/**
+ * pci_iov_init - initialize the IOV capability
+ * @dev: the PCI device
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+int pci_iov_init(struct pci_dev *dev)
+{
+	int pos;
+
+	if (!dev->is_pcie)
+		return -ENODEV;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV);
+	if (pos)
+		return sriov_init(dev, pos);
+
+	return -ENODEV;
+}
+
+/**
+ * pci_iov_release - release resources used by the IOV capability
+ * @dev: the PCI device
+ */
+void pci_iov_release(struct pci_dev *dev)
+{
+	if (dev->is_physfn)
+		sriov_release(dev);
+}
+
+/**
+ * pci_iov_resource_bar - get position of the SR-IOV BAR
+ * @dev: the PCI device
+ * @resno: the resource number
+ * @type: the BAR type to be filled in
+ *
+ * Returns position of the BAR encapsulated in the SR-IOV capability.
+ */
+int pci_iov_resource_bar(struct pci_dev *dev, int resno,
+			 enum pci_bar_type *type)
+{
+	if (resno < PCI_IOV_RESOURCES || resno > PCI_IOV_RESOURCE_END)
+		return 0;
+
+	BUG_ON(!dev->is_physfn);
+
+	*type = pci_bar_unknown;
+
+	return dev->sriov->pos + PCI_SRIOV_BAR +
+		4 * (resno - PCI_IOV_RESOURCES);
+}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index a35a8b2ba63..2b3201ec2b0 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2360,12 +2360,19 @@ int pci_select_bars(struct pci_dev *dev, unsigned long flags)
  */
 int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type)
 {
+	int reg;
+
 	if (resno < PCI_ROM_RESOURCE) {
 		*type = pci_bar_unknown;
 		return PCI_BASE_ADDRESS_0 + 4 * resno;
 	} else if (resno == PCI_ROM_RESOURCE) {
 		*type = pci_bar_mem32;
 		return dev->rom_base_reg;
+	} else if (resno < PCI_BRIDGE_RESOURCES) {
+		/* device specific resource */
+		reg = pci_iov_resource_bar(dev, resno, type);
+		if (reg)
+			return reg;
 	}
 
 	dev_err(&dev->dev, "BAR: invalid resource #%d\n", resno);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 2cd1cba7236..7d5327c986f 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -201,4 +201,41 @@ resource_size_t pci_specified_resource_alignment(struct pci_dev *dev);
 extern void pci_disable_bridge_window(struct pci_dev *dev);
 #endif
 
+/* Single Root I/O Virtualization */
+struct pci_sriov {
+	int pos;		/* capability position */
+	int nres;		/* number of resources */
+	u32 cap;		/* SR-IOV Capabilities */
+	u16 ctrl;		/* SR-IOV Control */
+	u16 total;		/* total VFs associated with the PF */
+	u16 offset;		/* first VF Routing ID offset */
+	u16 stride;		/* following VF stride */
+	u32 pgsz;		/* page size for BAR alignment */
+	u8 link;		/* Function Dependency Link */
+	struct pci_dev *dev;	/* lowest numbered PF */
+	struct pci_dev *self;	/* this PF */
+	struct mutex lock;	/* lock for VF bus */
+};
+
+#ifdef CONFIG_PCI_IOV
+extern int pci_iov_init(struct pci_dev *dev);
+extern void pci_iov_release(struct pci_dev *dev);
+extern int pci_iov_resource_bar(struct pci_dev *dev, int resno,
+				enum pci_bar_type *type);
+#else
+static inline int pci_iov_init(struct pci_dev *dev)
+{
+	return -ENODEV;
+}
+static inline void pci_iov_release(struct pci_dev *dev)
+
+{
+}
+static inline int pci_iov_resource_bar(struct pci_dev *dev, int resno,
+				       enum pci_bar_type *type)
+{
+	return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 579a56c8181..0471f6ea146 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -785,6 +785,7 @@ static int pci_setup_device(struct pci_dev * dev)
 static void pci_release_capabilities(struct pci_dev *dev)
 {
 	pci_vpd_release(dev);
+	pci_iov_release(dev);
 }
 
 /**
@@ -979,6 +980,9 @@ static void pci_init_capabilities(struct pci_dev *dev)
 
 	/* Alternative Routing-ID Forwarding */
 	pci_enable_ari(dev);
+
+	/* Single Root I/O Virtualization */
+	pci_iov_init(dev);
 }
 
 void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 1f6c5ddaae3..8ce2f2d9ab6 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -93,6 +93,12 @@ enum {
 	/* #6: expansion ROM resource */
 	PCI_ROM_RESOURCE,
 
+	/* device specific resources */
+#ifdef CONFIG_PCI_IOV
+	PCI_IOV_RESOURCES,
+	PCI_IOV_RESOURCE_END = PCI_IOV_RESOURCES + PCI_SRIOV_NUM_BARS - 1,
+#endif
+
 	/* resources assigned to buses behind the bridge */
 #define PCI_BRIDGE_RESOURCE_NUM 4
 
@@ -180,6 +186,7 @@ struct pci_cap_saved_state {
 
 struct pcie_link_state;
 struct pci_vpd;
+struct pci_sriov;
 
 /*
  * The pci_dev structure is used to describe PCI devices.
@@ -257,6 +264,7 @@ struct pci_dev {
 	unsigned int	is_managed:1;
 	unsigned int	is_pcie:1;
 	unsigned int	state_saved:1;
+	unsigned int	is_physfn:1;
 	pci_dev_flags_t dev_flags;
 	atomic_t	enable_cnt;	/* pci_enable_device has been called */
 
@@ -270,6 +278,9 @@ struct pci_dev {
 	struct list_head msi_list;
 #endif
 	struct pci_vpd *vpd;
+#ifdef CONFIG_PCI_IOV
+	struct pci_sriov *sriov;	/* SR-IOV capability related */
+#endif
 };
 
 extern struct pci_dev *alloc_pci_dev(void);
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index b647a4df59f..d4e663877f4 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -375,6 +375,7 @@
 #define  PCI_EXP_TYPE_UPSTREAM	0x5	/* Upstream Port */
 #define  PCI_EXP_TYPE_DOWNSTREAM 0x6	/* Downstream Port */
 #define  PCI_EXP_TYPE_PCI_BRIDGE 0x7	/* PCI/PCI-X Bridge */
+#define  PCI_EXP_TYPE_RC_END	0x9	/* Root Complex Integrated Endpoint */
 #define PCI_EXP_FLAGS_SLOT	0x0100	/* Slot implemented */
 #define PCI_EXP_FLAGS_IRQ	0x3e00	/* Interrupt message number */
 #define PCI_EXP_DEVCAP		4	/* Device capabilities */
@@ -498,6 +499,7 @@
 #define PCI_EXT_CAP_ID_DSN	3
 #define PCI_EXT_CAP_ID_PWR	4
 #define PCI_EXT_CAP_ID_ARI	14
+#define PCI_EXT_CAP_ID_SRIOV	16
 
 /* Advanced Error Reporting */
 #define PCI_ERR_UNCOR_STATUS	4	/* Uncorrectable Error Status */
@@ -615,4 +617,35 @@
 #define  PCI_ARI_CTRL_ACS	0x0002	/* ACS Function Groups Enable */
 #define  PCI_ARI_CTRL_FG(x)	(((x) >> 4) & 7) /* Function Group */
 
+/* Single Root I/O Virtualization */
+#define PCI_SRIOV_CAP		0x04	/* SR-IOV Capabilities */
+#define  PCI_SRIOV_CAP_VFM	0x01	/* VF Migration Capable */
+#define  PCI_SRIOV_CAP_INTR(x)	((x) >> 21) /* Interrupt Message Number */
+#define PCI_SRIOV_CTRL		0x08	/* SR-IOV Control */
+#define  PCI_SRIOV_CTRL_VFE	0x01	/* VF Enable */
+#define  PCI_SRIOV_CTRL_VFM	0x02	/* VF Migration Enable */
+#define  PCI_SRIOV_CTRL_INTR	0x04	/* VF Migration Interrupt Enable */
+#define  PCI_SRIOV_CTRL_MSE	0x08	/* VF Memory Space Enable */
+#define  PCI_SRIOV_CTRL_ARI	0x10	/* ARI Capable Hierarchy */
+#define PCI_SRIOV_STATUS	0x0a	/* SR-IOV Status */
+#define  PCI_SRIOV_STATUS_VFM	0x01	/* VF Migration Status */
+#define PCI_SRIOV_INITIAL_VF	0x0c	/* Initial VFs */
+#define PCI_SRIOV_TOTAL_VF	0x0e	/* Total VFs */
+#define PCI_SRIOV_NUM_VF	0x10	/* Number of VFs */
+#define PCI_SRIOV_FUNC_LINK	0x12	/* Function Dependency Link */
+#define PCI_SRIOV_VF_OFFSET	0x14	/* First VF Offset */
+#define PCI_SRIOV_VF_STRIDE	0x16	/* Following VF Stride */
+#define PCI_SRIOV_VF_DID	0x1a	/* VF Device ID */
+#define PCI_SRIOV_SUP_PGSIZE	0x1c	/* Supported Page Sizes */
+#define PCI_SRIOV_SYS_PGSIZE	0x20	/* System Page Size */
+#define PCI_SRIOV_BAR		0x24	/* VF BAR0 */
+#define  PCI_SRIOV_NUM_BARS	6	/* Number of VF BARs */
+#define PCI_SRIOV_VFM		0x3c	/* VF Migration State Array Offset*/
+#define  PCI_SRIOV_VFM_BIR(x)	((x) & 7)	/* State BIR */
+#define  PCI_SRIOV_VFM_OFFSET(x) ((x) & ~7)	/* State Offset */
+#define  PCI_SRIOV_VFM_UA	0x0	/* Inactive.Unavailable */
+#define  PCI_SRIOV_VFM_MI	0x1	/* Dormant.MigrateIn */
+#define  PCI_SRIOV_VFM_MO	0x2	/* Active.MigrateOut */
+#define  PCI_SRIOV_VFM_AV	0x3	/* Active.Available */
+
 #endif /* LINUX_PCI_REGS_H */
-- 
cgit v1.2.3


From 8c5cdb6adc6688b9b8fd82ea4a5cf4674dabad79 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Fri, 20 Mar 2009 11:25:12 +0800
Subject: PCI: restore saved SR-IOV state

Restore the volatile registers in the SR-IOV capability after the
D3->D0 transition.

Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/iov.c | 29 +++++++++++++++++++++++++++++
 drivers/pci/pci.c |  1 +
 drivers/pci/pci.h |  4 ++++
 3 files changed, 34 insertions(+)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 66cc414ed15..b121e47402f 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -129,6 +129,25 @@ static void sriov_release(struct pci_dev *dev)
 	dev->sriov = NULL;
 }
 
+static void sriov_restore_state(struct pci_dev *dev)
+{
+	int i;
+	u16 ctrl;
+	struct pci_sriov *iov = dev->sriov;
+
+	pci_read_config_word(dev, iov->pos + PCI_SRIOV_CTRL, &ctrl);
+	if (ctrl & PCI_SRIOV_CTRL_VFE)
+		return;
+
+	for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++)
+		pci_update_resource(dev, i);
+
+	pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+	if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
+		msleep(100);
+}
+
 /**
  * pci_iov_init - initialize the IOV capability
  * @dev: the PCI device
@@ -180,3 +199,13 @@ int pci_iov_resource_bar(struct pci_dev *dev, int resno,
 	return dev->sriov->pos + PCI_SRIOV_BAR +
 		4 * (resno - PCI_IOV_RESOURCES);
 }
+
+/**
+ * pci_restore_iov_state - restore the state of the IOV capability
+ * @dev: the PCI device
+ */
+void pci_restore_iov_state(struct pci_dev *dev)
+{
+	if (dev->is_physfn)
+		sriov_restore_state(dev);
+}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 2b3201ec2b0..676bbcbc272 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -775,6 +775,7 @@ pci_restore_state(struct pci_dev *dev)
 	}
 	pci_restore_pcix_state(dev);
 	pci_restore_msi_state(dev);
+	pci_restore_iov_state(dev);
 
 	return 0;
 }
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 7d5327c986f..fd5ea4d445e 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -222,6 +222,7 @@ extern int pci_iov_init(struct pci_dev *dev);
 extern void pci_iov_release(struct pci_dev *dev);
 extern int pci_iov_resource_bar(struct pci_dev *dev, int resno,
 				enum pci_bar_type *type);
+extern void pci_restore_iov_state(struct pci_dev *dev);
 #else
 static inline int pci_iov_init(struct pci_dev *dev)
 {
@@ -236,6 +237,9 @@ static inline int pci_iov_resource_bar(struct pci_dev *dev, int resno,
 {
 	return 0;
 }
+static inline void pci_restore_iov_state(struct pci_dev *dev)
+{
+}
 #endif /* CONFIG_PCI_IOV */
 
 #endif /* DRIVERS_PCI_H */
-- 
cgit v1.2.3


From a28724b0fb909d247229a70761c90bb37b13366a Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Fri, 20 Mar 2009 11:25:13 +0800
Subject: PCI: reserve bus range for SR-IOV device

Reserve the bus number range used by the Virtual Function when
pcibios_assign_all_busses() returns true.

Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/iov.c   | 36 ++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.h   |  5 +++++
 drivers/pci/probe.c |  3 +++
 3 files changed, 44 insertions(+)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index b121e47402f..5ddfc09a8d3 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -14,6 +14,18 @@
 #include "pci.h"
 
 
+static inline u8 virtfn_bus(struct pci_dev *dev, int id)
+{
+	return dev->bus->number + ((dev->devfn + dev->sriov->offset +
+				    dev->sriov->stride * id) >> 8);
+}
+
+static inline u8 virtfn_devfn(struct pci_dev *dev, int id)
+{
+	return (dev->devfn + dev->sriov->offset +
+		dev->sriov->stride * id) & 0xff;
+}
+
 static int sriov_init(struct pci_dev *dev, int pos)
 {
 	int i;
@@ -209,3 +221,27 @@ void pci_restore_iov_state(struct pci_dev *dev)
 	if (dev->is_physfn)
 		sriov_restore_state(dev);
 }
+
+/**
+ * pci_iov_bus_range - find bus range used by Virtual Function
+ * @bus: the PCI bus
+ *
+ * Returns max number of buses (exclude current one) used by Virtual
+ * Functions.
+ */
+int pci_iov_bus_range(struct pci_bus *bus)
+{
+	int max = 0;
+	u8 busnr;
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		if (!dev->is_physfn)
+			continue;
+		busnr = virtfn_bus(dev, dev->sriov->total - 1);
+		if (busnr > max)
+			max = busnr;
+	}
+
+	return max ? max - bus->number : 0;
+}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index fd5ea4d445e..5c29cb2b8e6 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -223,6 +223,7 @@ extern void pci_iov_release(struct pci_dev *dev);
 extern int pci_iov_resource_bar(struct pci_dev *dev, int resno,
 				enum pci_bar_type *type);
 extern void pci_restore_iov_state(struct pci_dev *dev);
+extern int pci_iov_bus_range(struct pci_bus *bus);
 #else
 static inline int pci_iov_init(struct pci_dev *dev)
 {
@@ -240,6 +241,10 @@ static inline int pci_iov_resource_bar(struct pci_dev *dev, int resno,
 static inline void pci_restore_iov_state(struct pci_dev *dev)
 {
 }
+static inline int pci_iov_bus_range(struct pci_bus *bus)
+{
+	return 0;
+}
 #endif /* CONFIG_PCI_IOV */
 
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0471f6ea146..0ecdaea3915 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1085,6 +1085,9 @@ unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
 	for (devfn = 0; devfn < 0x100; devfn += 8)
 		pci_scan_slot(bus, devfn);
 
+	/* Reserve buses for SR-IOV capability. */
+	max += pci_iov_bus_range(bus);
+
 	/*
 	 * After performing arch-dependent fixup of the bus, look behind
 	 * all PCI-to-PCI bridges on this bus.
-- 
cgit v1.2.3


From 480b93b7837fb3cf0579a42f4953ac463a5b9e1e Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Fri, 20 Mar 2009 11:25:14 +0800
Subject: PCI: centralize device setup code

Move the device setup stuff into pci_setup_device() which will be used
to setup the Virtual Function later.

Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.h   |  1 +
 drivers/pci/probe.c | 78 +++++++++++++++++++++++++++--------------------------
 2 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 5c29cb2b8e6..f4fc10fc587 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -178,6 +178,7 @@ enum pci_bar_type {
 	pci_bar_mem64,		/* A 64-bit memory BAR */
 };
 
+extern int pci_setup_device(struct pci_dev *dev);
 extern int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
 				struct resource *res, unsigned int reg);
 extern int pci_resource_bar(struct pci_dev *dev, int resno,
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0ecdaea3915..943c49a7842 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -674,6 +674,19 @@ static void pci_read_irq(struct pci_dev *dev)
 	dev->irq = irq;
 }
 
+static void set_pcie_port_type(struct pci_dev *pdev)
+{
+	int pos;
+	u16 reg16;
+
+	pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+	if (!pos)
+		return;
+	pdev->is_pcie = 1;
+	pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
+	pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
+}
+
 #define LEGACY_IO_RESOURCE	(IORESOURCE_IO | IORESOURCE_PCI_FIXED)
 
 /**
@@ -683,12 +696,34 @@ static void pci_read_irq(struct pci_dev *dev)
  * Initialize the device structure with information about the device's 
  * vendor,class,memory and IO-space addresses,IRQ lines etc.
  * Called at initialisation of the PCI subsystem and by CardBus services.
- * Returns 0 on success and -1 if unknown type of device (not normal, bridge
- * or CardBus).
+ * Returns 0 on success and negative if unknown type of device (not normal,
+ * bridge or CardBus).
  */
-static int pci_setup_device(struct pci_dev * dev)
+int pci_setup_device(struct pci_dev *dev)
 {
 	u32 class;
+	u8 hdr_type;
+	struct pci_slot *slot;
+
+	if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type))
+		return -EIO;
+
+	dev->sysdata = dev->bus->sysdata;
+	dev->dev.parent = dev->bus->bridge;
+	dev->dev.bus = &pci_bus_type;
+	dev->hdr_type = hdr_type & 0x7f;
+	dev->multifunction = !!(hdr_type & 0x80);
+	dev->cfg_size = pci_cfg_space_size(dev);
+	dev->error_state = pci_channel_io_normal;
+	set_pcie_port_type(dev);
+
+	list_for_each_entry(slot, &dev->bus->slots, list)
+		if (PCI_SLOT(dev->devfn) == slot->number)
+			dev->slot = slot;
+
+	/* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer)
+	   set this higher, assuming the system even supports it.  */
+	dev->dma_mask = 0xffffffff;
 
 	dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(dev->bus),
 		     dev->bus->number, PCI_SLOT(dev->devfn),
@@ -708,7 +743,6 @@ static int pci_setup_device(struct pci_dev * dev)
 
 	/* Early fixups, before probing the BARs */
 	pci_fixup_device(pci_fixup_early, dev);
-	class = dev->class >> 8;
 
 	switch (dev->hdr_type) {		    /* header type */
 	case PCI_HEADER_TYPE_NORMAL:		    /* standard header */
@@ -770,7 +804,7 @@ static int pci_setup_device(struct pci_dev * dev)
 	default:				    /* unknown header */
 		dev_err(&dev->dev, "unknown header type %02x, "
 			"ignoring device\n", dev->hdr_type);
-		return -1;
+		return -EIO;
 
 	bad:
 		dev_err(&dev->dev, "ignoring class %02x (doesn't match header "
@@ -804,19 +838,6 @@ static void pci_release_dev(struct device *dev)
 	kfree(pci_dev);
 }
 
-static void set_pcie_port_type(struct pci_dev *pdev)
-{
-	int pos;
-	u16 reg16;
-
-	pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
-	if (!pos)
-		return;
-	pdev->is_pcie = 1;
-	pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
-	pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
-}
-
 /**
  * pci_cfg_space_size - get the configuration space size of the PCI device.
  * @dev: PCI device
@@ -897,9 +918,7 @@ EXPORT_SYMBOL(alloc_pci_dev);
 static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
 {
 	struct pci_dev *dev;
-	struct pci_slot *slot;
 	u32 l;
-	u8 hdr_type;
 	int delay = 1;
 
 	if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, &l))
@@ -926,33 +945,16 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
 		}
 	}
 
-	if (pci_bus_read_config_byte(bus, devfn, PCI_HEADER_TYPE, &hdr_type))
-		return NULL;
-
 	dev = alloc_pci_dev();
 	if (!dev)
 		return NULL;
 
 	dev->bus = bus;
-	dev->sysdata = bus->sysdata;
-	dev->dev.parent = bus->bridge;
-	dev->dev.bus = &pci_bus_type;
 	dev->devfn = devfn;
-	dev->hdr_type = hdr_type & 0x7f;
-	dev->multifunction = !!(hdr_type & 0x80);
 	dev->vendor = l & 0xffff;
 	dev->device = (l >> 16) & 0xffff;
-	dev->error_state = pci_channel_io_normal;
-	set_pcie_port_type(dev);
-
-	list_for_each_entry(slot, &bus->slots, list)
-		if (PCI_SLOT(devfn) == slot->number)
-			dev->slot = slot;
 
-	/* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer)
-	   set this higher, assuming the system even supports it.  */
-	dev->dma_mask = 0xffffffff;
-	if (pci_setup_device(dev) < 0) {
+	if (pci_setup_device(dev)) {
 		kfree(dev);
 		return NULL;
 	}
-- 
cgit v1.2.3


From dd7cc44d0bcec5e9c42fe52e88dc254ae62eac8d Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Fri, 20 Mar 2009 11:25:15 +0800
Subject: PCI: add SR-IOV API for Physical Function driver

Add or remove the Virtual Function when the SR-IOV is enabled or
disabled by the device driver. This can happen anytime rather than
only at the device probe stage.

Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/iov.c   | 314 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.h   |   2 +
 include/linux/pci.h |  19 +++-
 3 files changed, 334 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 5ddfc09a8d3..d0ff8ad8f7b 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -13,6 +13,7 @@
 #include <linux/delay.h>
 #include "pci.h"
 
+#define VIRTFN_ID_LEN	16
 
 static inline u8 virtfn_bus(struct pci_dev *dev, int id)
 {
@@ -26,6 +27,284 @@ static inline u8 virtfn_devfn(struct pci_dev *dev, int id)
 		dev->sriov->stride * id) & 0xff;
 }
 
+static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
+{
+	int rc;
+	struct pci_bus *child;
+
+	if (bus->number == busnr)
+		return bus;
+
+	child = pci_find_bus(pci_domain_nr(bus), busnr);
+	if (child)
+		return child;
+
+	child = pci_add_new_bus(bus, NULL, busnr);
+	if (!child)
+		return NULL;
+
+	child->subordinate = busnr;
+	child->dev.parent = bus->bridge;
+	rc = pci_bus_add_child(child);
+	if (rc) {
+		pci_remove_bus(child);
+		return NULL;
+	}
+
+	return child;
+}
+
+static void virtfn_remove_bus(struct pci_bus *bus, int busnr)
+{
+	struct pci_bus *child;
+
+	if (bus->number == busnr)
+		return;
+
+	child = pci_find_bus(pci_domain_nr(bus), busnr);
+	BUG_ON(!child);
+
+	if (list_empty(&child->devices))
+		pci_remove_bus(child);
+}
+
+static int virtfn_add(struct pci_dev *dev, int id, int reset)
+{
+	int i;
+	int rc;
+	u64 size;
+	char buf[VIRTFN_ID_LEN];
+	struct pci_dev *virtfn;
+	struct resource *res;
+	struct pci_sriov *iov = dev->sriov;
+
+	virtfn = alloc_pci_dev();
+	if (!virtfn)
+		return -ENOMEM;
+
+	mutex_lock(&iov->dev->sriov->lock);
+	virtfn->bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id));
+	if (!virtfn->bus) {
+		kfree(virtfn);
+		mutex_unlock(&iov->dev->sriov->lock);
+		return -ENOMEM;
+	}
+	virtfn->devfn = virtfn_devfn(dev, id);
+	virtfn->vendor = dev->vendor;
+	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device);
+	pci_setup_device(virtfn);
+	virtfn->dev.parent = dev->dev.parent;
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = dev->resource + PCI_IOV_RESOURCES + i;
+		if (!res->parent)
+			continue;
+		virtfn->resource[i].name = pci_name(virtfn);
+		virtfn->resource[i].flags = res->flags;
+		size = resource_size(res);
+		do_div(size, iov->total);
+		virtfn->resource[i].start = res->start + size * id;
+		virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
+		rc = request_resource(res, &virtfn->resource[i]);
+		BUG_ON(rc);
+	}
+
+	if (reset)
+		pci_execute_reset_function(virtfn);
+
+	pci_device_add(virtfn, virtfn->bus);
+	mutex_unlock(&iov->dev->sriov->lock);
+
+	virtfn->physfn = pci_dev_get(dev);
+	virtfn->is_virtfn = 1;
+
+	rc = pci_bus_add_device(virtfn);
+	if (rc)
+		goto failed1;
+	sprintf(buf, "virtfn%u", id);
+	rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf);
+	if (rc)
+		goto failed1;
+	rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
+	if (rc)
+		goto failed2;
+
+	kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);
+
+	return 0;
+
+failed2:
+	sysfs_remove_link(&dev->dev.kobj, buf);
+failed1:
+	pci_dev_put(dev);
+	mutex_lock(&iov->dev->sriov->lock);
+	pci_remove_bus_device(virtfn);
+	virtfn_remove_bus(dev->bus, virtfn_bus(dev, id));
+	mutex_unlock(&iov->dev->sriov->lock);
+
+	return rc;
+}
+
+static void virtfn_remove(struct pci_dev *dev, int id, int reset)
+{
+	char buf[VIRTFN_ID_LEN];
+	struct pci_bus *bus;
+	struct pci_dev *virtfn;
+	struct pci_sriov *iov = dev->sriov;
+
+	bus = pci_find_bus(pci_domain_nr(dev->bus), virtfn_bus(dev, id));
+	if (!bus)
+		return;
+
+	virtfn = pci_get_slot(bus, virtfn_devfn(dev, id));
+	if (!virtfn)
+		return;
+
+	pci_dev_put(virtfn);
+
+	if (reset) {
+		device_release_driver(&virtfn->dev);
+		pci_execute_reset_function(virtfn);
+	}
+
+	sprintf(buf, "virtfn%u", id);
+	sysfs_remove_link(&dev->dev.kobj, buf);
+	sysfs_remove_link(&virtfn->dev.kobj, "physfn");
+
+	mutex_lock(&iov->dev->sriov->lock);
+	pci_remove_bus_device(virtfn);
+	virtfn_remove_bus(dev->bus, virtfn_bus(dev, id));
+	mutex_unlock(&iov->dev->sriov->lock);
+
+	pci_dev_put(dev);
+}
+
+static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
+{
+	int rc;
+	int i, j;
+	int nres;
+	u16 offset, stride, initial;
+	struct resource *res;
+	struct pci_dev *pdev;
+	struct pci_sriov *iov = dev->sriov;
+
+	if (!nr_virtfn)
+		return 0;
+
+	if (iov->nr_virtfn)
+		return -EINVAL;
+
+	pci_read_config_word(dev, iov->pos + PCI_SRIOV_INITIAL_VF, &initial);
+	if (initial > iov->total ||
+	    (!(iov->cap & PCI_SRIOV_CAP_VFM) && (initial != iov->total)))
+		return -EIO;
+
+	if (nr_virtfn < 0 || nr_virtfn > iov->total ||
+	    (!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial)))
+		return -EINVAL;
+
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
+	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset);
+	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride);
+	if (!offset || (nr_virtfn > 1 && !stride))
+		return -EIO;
+
+	nres = 0;
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = dev->resource + PCI_IOV_RESOURCES + i;
+		if (res->parent)
+			nres++;
+	}
+	if (nres != iov->nres) {
+		dev_err(&dev->dev, "not enough MMIO resources for SR-IOV\n");
+		return -ENOMEM;
+	}
+
+	iov->offset = offset;
+	iov->stride = stride;
+
+	if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->subordinate) {
+		dev_err(&dev->dev, "SR-IOV: bus number out of range\n");
+		return -ENOMEM;
+	}
+
+	if (iov->link != dev->devfn) {
+		pdev = pci_get_slot(dev->bus, iov->link);
+		if (!pdev)
+			return -ENODEV;
+
+		pci_dev_put(pdev);
+
+		if (!pdev->is_physfn)
+			return -ENODEV;
+
+		rc = sysfs_create_link(&dev->dev.kobj,
+					&pdev->dev.kobj, "dep_link");
+		if (rc)
+			return rc;
+	}
+
+	iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
+	pci_block_user_cfg_access(dev);
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+	msleep(100);
+	pci_unblock_user_cfg_access(dev);
+
+	iov->initial = initial;
+	if (nr_virtfn < initial)
+		initial = nr_virtfn;
+
+	for (i = 0; i < initial; i++) {
+		rc = virtfn_add(dev, i, 0);
+		if (rc)
+			goto failed;
+	}
+
+	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
+	iov->nr_virtfn = nr_virtfn;
+
+	return 0;
+
+failed:
+	for (j = 0; j < i; j++)
+		virtfn_remove(dev, j, 0);
+
+	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
+	pci_block_user_cfg_access(dev);
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+	ssleep(1);
+	pci_unblock_user_cfg_access(dev);
+
+	if (iov->link != dev->devfn)
+		sysfs_remove_link(&dev->dev.kobj, "dep_link");
+
+	return rc;
+}
+
+static void sriov_disable(struct pci_dev *dev)
+{
+	int i;
+	struct pci_sriov *iov = dev->sriov;
+
+	if (!iov->nr_virtfn)
+		return;
+
+	for (i = 0; i < iov->nr_virtfn; i++)
+		virtfn_remove(dev, i, 0);
+
+	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
+	pci_block_user_cfg_access(dev);
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+	ssleep(1);
+	pci_unblock_user_cfg_access(dev);
+
+	if (iov->link != dev->devfn)
+		sysfs_remove_link(&dev->dev.kobj, "dep_link");
+
+	iov->nr_virtfn = 0;
+}
+
 static int sriov_init(struct pci_dev *dev, int pos)
 {
 	int i;
@@ -132,6 +411,8 @@ failed:
 
 static void sriov_release(struct pci_dev *dev)
 {
+	BUG_ON(dev->sriov->nr_virtfn);
+
 	if (dev == dev->sriov->dev)
 		mutex_destroy(&dev->sriov->lock);
 	else
@@ -155,6 +436,7 @@ static void sriov_restore_state(struct pci_dev *dev)
 		pci_update_resource(dev, i);
 
 	pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, iov->nr_virtfn);
 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
 	if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
 		msleep(100);
@@ -245,3 +527,35 @@ int pci_iov_bus_range(struct pci_bus *bus)
 
 	return max ? max - bus->number : 0;
 }
+
+/**
+ * pci_enable_sriov - enable the SR-IOV capability
+ * @dev: the PCI device
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
+{
+	might_sleep();
+
+	if (!dev->is_physfn)
+		return -ENODEV;
+
+	return sriov_enable(dev, nr_virtfn);
+}
+EXPORT_SYMBOL_GPL(pci_enable_sriov);
+
+/**
+ * pci_disable_sriov - disable the SR-IOV capability
+ * @dev: the PCI device
+ */
+void pci_disable_sriov(struct pci_dev *dev)
+{
+	might_sleep();
+
+	if (!dev->is_physfn)
+		return;
+
+	sriov_disable(dev);
+}
+EXPORT_SYMBOL_GPL(pci_disable_sriov);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index f4fc10fc587..0f1c7d10350 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -209,6 +209,8 @@ struct pci_sriov {
 	u32 cap;		/* SR-IOV Capabilities */
 	u16 ctrl;		/* SR-IOV Control */
 	u16 total;		/* total VFs associated with the PF */
+	u16 initial;		/* initial VFs associated with the PF */
+	u16 nr_virtfn;		/* number of VFs available */
 	u16 offset;		/* first VF Routing ID offset */
 	u16 stride;		/* following VF stride */
 	u32 pgsz;		/* page size for BAR alignment */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8ce2f2d9ab6..c2e491e0406 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -265,6 +265,7 @@ struct pci_dev {
 	unsigned int	is_pcie:1;
 	unsigned int	state_saved:1;
 	unsigned int	is_physfn:1;
+	unsigned int	is_virtfn:1;
 	pci_dev_flags_t dev_flags;
 	atomic_t	enable_cnt;	/* pci_enable_device has been called */
 
@@ -279,7 +280,10 @@ struct pci_dev {
 #endif
 	struct pci_vpd *vpd;
 #ifdef CONFIG_PCI_IOV
-	struct pci_sriov *sriov;	/* SR-IOV capability related */
+	union {
+		struct pci_sriov *sriov;	/* SR-IOV capability related */
+		struct pci_dev *physfn;	/* the PF this VF is associated with */
+	};
 #endif
 };
 
@@ -1212,5 +1216,18 @@ int pci_ext_cfg_avail(struct pci_dev *dev);
 
 void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);
 
+#ifdef CONFIG_PCI_IOV
+extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
+extern void pci_disable_sriov(struct pci_dev *dev);
+#else
+static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
+{
+	return -ENODEV;
+}
+static inline void pci_disable_sriov(struct pci_dev *dev)
+{
+}
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */
-- 
cgit v1.2.3


From 74bb1bcc7dbbc9ddef773bf3395d7ff92aaaad2e Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Fri, 20 Mar 2009 11:25:16 +0800
Subject: PCI: handle SR-IOV Virtual Function Migration

Add or remove a Virtual Function after receiving a Migrate In or Out
Request.

Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/iov.c   | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.h   |   4 ++
 include/linux/pci.h |   6 +++
 3 files changed, 129 insertions(+)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index d0ff8ad8f7b..7227efc760d 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -179,6 +179,97 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset)
 	pci_dev_put(dev);
 }
 
+static int sriov_migration(struct pci_dev *dev)
+{
+	u16 status;
+	struct pci_sriov *iov = dev->sriov;
+
+	if (!iov->nr_virtfn)
+		return 0;
+
+	if (!(iov->cap & PCI_SRIOV_CAP_VFM))
+		return 0;
+
+	pci_read_config_word(dev, iov->pos + PCI_SRIOV_STATUS, &status);
+	if (!(status & PCI_SRIOV_STATUS_VFM))
+		return 0;
+
+	schedule_work(&iov->mtask);
+
+	return 1;
+}
+
+static void sriov_migration_task(struct work_struct *work)
+{
+	int i;
+	u8 state;
+	u16 status;
+	struct pci_sriov *iov = container_of(work, struct pci_sriov, mtask);
+
+	for (i = iov->initial; i < iov->nr_virtfn; i++) {
+		state = readb(iov->mstate + i);
+		if (state == PCI_SRIOV_VFM_MI) {
+			writeb(PCI_SRIOV_VFM_AV, iov->mstate + i);
+			state = readb(iov->mstate + i);
+			if (state == PCI_SRIOV_VFM_AV)
+				virtfn_add(iov->self, i, 1);
+		} else if (state == PCI_SRIOV_VFM_MO) {
+			virtfn_remove(iov->self, i, 1);
+			writeb(PCI_SRIOV_VFM_UA, iov->mstate + i);
+			state = readb(iov->mstate + i);
+			if (state == PCI_SRIOV_VFM_AV)
+				virtfn_add(iov->self, i, 0);
+		}
+	}
+
+	pci_read_config_word(iov->self, iov->pos + PCI_SRIOV_STATUS, &status);
+	status &= ~PCI_SRIOV_STATUS_VFM;
+	pci_write_config_word(iov->self, iov->pos + PCI_SRIOV_STATUS, status);
+}
+
+static int sriov_enable_migration(struct pci_dev *dev, int nr_virtfn)
+{
+	int bir;
+	u32 table;
+	resource_size_t pa;
+	struct pci_sriov *iov = dev->sriov;
+
+	if (nr_virtfn <= iov->initial)
+		return 0;
+
+	pci_read_config_dword(dev, iov->pos + PCI_SRIOV_VFM, &table);
+	bir = PCI_SRIOV_VFM_BIR(table);
+	if (bir > PCI_STD_RESOURCE_END)
+		return -EIO;
+
+	table = PCI_SRIOV_VFM_OFFSET(table);
+	if (table + nr_virtfn > pci_resource_len(dev, bir))
+		return -EIO;
+
+	pa = pci_resource_start(dev, bir) + table;
+	iov->mstate = ioremap(pa, nr_virtfn);
+	if (!iov->mstate)
+		return -ENOMEM;
+
+	INIT_WORK(&iov->mtask, sriov_migration_task);
+
+	iov->ctrl |= PCI_SRIOV_CTRL_VFM | PCI_SRIOV_CTRL_INTR;
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+
+	return 0;
+}
+
+static void sriov_disable_migration(struct pci_dev *dev)
+{
+	struct pci_sriov *iov = dev->sriov;
+
+	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFM | PCI_SRIOV_CTRL_INTR);
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+
+	cancel_work_sync(&iov->mtask);
+	iounmap(iov->mstate);
+}
+
 static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
 {
 	int rc;
@@ -261,6 +352,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
 			goto failed;
 	}
 
+	if (iov->cap & PCI_SRIOV_CAP_VFM) {
+		rc = sriov_enable_migration(dev, nr_virtfn);
+		if (rc)
+			goto failed;
+	}
+
 	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
 	iov->nr_virtfn = nr_virtfn;
 
@@ -290,6 +387,9 @@ static void sriov_disable(struct pci_dev *dev)
 	if (!iov->nr_virtfn)
 		return;
 
+	if (iov->cap & PCI_SRIOV_CAP_VFM)
+		sriov_disable_migration(dev);
+
 	for (i = 0; i < iov->nr_virtfn; i++)
 		virtfn_remove(dev, i, 0);
 
@@ -559,3 +659,22 @@ void pci_disable_sriov(struct pci_dev *dev)
 	sriov_disable(dev);
 }
 EXPORT_SYMBOL_GPL(pci_disable_sriov);
+
+/**
+ * pci_sriov_migration - notify SR-IOV core of Virtual Function Migration
+ * @dev: the PCI device
+ *
+ * Returns IRQ_HANDLED if the IRQ is handled, or IRQ_NONE if not.
+ *
+ * Physical Function driver is responsible to register IRQ handler using
+ * VF Migration Interrupt Message Number, and call this function when the
+ * interrupt is generated by the hardware.
+ */
+irqreturn_t pci_sriov_migration(struct pci_dev *dev)
+{
+	if (!dev->is_physfn)
+		return IRQ_NONE;
+
+	return sriov_migration(dev) ? IRQ_HANDLED : IRQ_NONE;
+}
+EXPORT_SYMBOL_GPL(pci_sriov_migration);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 0f1c7d10350..22dcfdb75d9 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -1,6 +1,8 @@
 #ifndef DRIVERS_PCI_H
 #define DRIVERS_PCI_H
 
+#include <linux/workqueue.h>
+
 #define PCI_CFG_SPACE_SIZE	256
 #define PCI_CFG_SPACE_EXP_SIZE	4096
 
@@ -218,6 +220,8 @@ struct pci_sriov {
 	struct pci_dev *dev;	/* lowest numbered PF */
 	struct pci_dev *self;	/* this PF */
 	struct mutex lock;	/* lock for VF bus */
+	struct work_struct mtask; /* VF Migration task */
+	u8 __iomem *mstate;	/* VF Migration State Array */
 };
 
 #ifdef CONFIG_PCI_IOV
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c2e491e0406..1216843412d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -52,6 +52,7 @@
 #include <asm/atomic.h>
 #include <linux/device.h>
 #include <linux/io.h>
+#include <linux/irqreturn.h>
 
 /* Include the ID list */
 #include <linux/pci_ids.h>
@@ -1219,6 +1220,7 @@ void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);
 #ifdef CONFIG_PCI_IOV
 extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
 extern void pci_disable_sriov(struct pci_dev *dev);
+extern irqreturn_t pci_sriov_migration(struct pci_dev *dev);
 #else
 static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
 {
@@ -1227,6 +1229,10 @@ static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
 static inline void pci_disable_sriov(struct pci_dev *dev)
 {
 }
+static inline irqreturn_t pci_sriov_migration(struct pci_dev *dev)
+{
+	return IRQ_NONE;
+}
 #endif
 
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3


From 01db4957179c92fda7d9a06e49b7ae56fb7c925b Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Fri, 20 Mar 2009 11:25:17 +0800
Subject: PCI: document SR-IOV sysfs entries

Reviewed-by: Randy Dunlap <rdunlap@xenotime.net>
Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/ABI/testing/sysfs-bus-pci | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 3d29793a0ea..d175a2a5fac 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -68,3 +68,30 @@ Description:
 		that some devices may have malformatted data.  If the
 		underlying VPD has a writable section then the
 		corresponding section of this file will be writable.
+
+What:		/sys/bus/pci/devices/.../virtfnN
+Date:		March 2009
+Contact:	Yu Zhao <yu.zhao@intel.com>
+Description:
+		This symbolic link appears when hardware supports the SR-IOV
+		capability and the Physical Function driver has enabled it.
+		The symbolic link points to the PCI device sysfs entry of the
+		Virtual Function whose index is N (0...MaxVFs-1).
+
+What:		/sys/bus/pci/devices/.../dep_link
+Date:		March 2009
+Contact:	Yu Zhao <yu.zhao@intel.com>
+Description:
+		This symbolic link appears when hardware supports the SR-IOV
+		capability and the Physical Function driver has enabled it,
+		and this device has vendor specific dependencies with others.
+		The symbolic link points to the PCI device sysfs entry of
+		Physical Function this device depends on.
+
+What:		/sys/bus/pci/devices/.../physfn
+Date:		March 2009
+Contact:	Yu Zhao <yu.zhao@intel.com>
+Description:
+		This symbolic link appears when a device is a Virtual Function.
+		The symbolic link points to the PCI device sysfs entry of the
+		Physical Function this device associates with.
-- 
cgit v1.2.3


From 15b49bee3a2b228370194f1b3ebc3db427cc9c94 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Fri, 20 Mar 2009 11:25:18 +0800
Subject: PCI: manual for SR-IOV user and driver developer

Reviewed-by: Randy Dunlap <rdunlap@xenotime.net>
Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/DocBook/kernel-api.tmpl |  1 +
 Documentation/PCI/pci-iov-howto.txt   | 99 +++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)
 create mode 100644 Documentation/PCI/pci-iov-howto.txt

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index bc962cda650..58c194572c7 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -199,6 +199,7 @@ X!Edrivers/pci/hotplug.c
 -->
 !Edrivers/pci/probe.c
 !Edrivers/pci/rom.c
+!Edrivers/pci/iov.c
      </sect1>
      <sect1><title>PCI Hotplug Support Library</title>
 !Edrivers/pci/hotplug/pci_hotplug_core.c
diff --git a/Documentation/PCI/pci-iov-howto.txt b/Documentation/PCI/pci-iov-howto.txt
new file mode 100644
index 00000000000..fc73ef5d65b
--- /dev/null
+++ b/Documentation/PCI/pci-iov-howto.txt
@@ -0,0 +1,99 @@
+		PCI Express I/O Virtualization Howto
+		Copyright (C) 2009 Intel Corporation
+		    Yu Zhao <yu.zhao@intel.com>
+
+
+1. Overview
+
+1.1 What is SR-IOV
+
+Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended
+capability which makes one physical device appear as multiple virtual
+devices. The physical device is referred to as Physical Function (PF)
+while the virtual devices are referred to as Virtual Functions (VF).
+Allocation of the VF can be dynamically controlled by the PF via
+registers encapsulated in the capability. By default, this feature is
+not enabled and the PF behaves as traditional PCIe device. Once it's
+turned on, each VF's PCI configuration space can be accessed by its own
+Bus, Device and Function Number (Routing ID). And each VF also has PCI
+Memory Space, which is used to map its register set. VF device driver
+operates on the register set so it can be functional and appear as a
+real existing PCI device.
+
+2. User Guide
+
+2.1 How can I enable SR-IOV capability
+
+The device driver (PF driver) will control the enabling and disabling
+of the capability via API provided by SR-IOV core. If the hardware
+has SR-IOV capability, loading its PF driver would enable it and all
+VFs associated with the PF.
+
+2.2 How can I use the Virtual Functions
+
+The VF is treated as hot-plugged PCI devices in the kernel, so they
+should be able to work in the same way as real PCI devices. The VF
+requires device driver that is same as a normal PCI device's.
+
+3. Developer Guide
+
+3.1 SR-IOV API
+
+To enable SR-IOV capability:
+	int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
+	'nr_virtfn' is number of VFs to be enabled.
+
+To disable SR-IOV capability:
+	void pci_disable_sriov(struct pci_dev *dev);
+
+To notify SR-IOV core of Virtual Function Migration:
+	irqreturn_t pci_sriov_migration(struct pci_dev *dev);
+
+3.2 Usage example
+
+Following piece of code illustrates the usage of the SR-IOV API.
+
+static int __devinit dev_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	pci_enable_sriov(dev, NR_VIRTFN);
+
+	...
+
+	return 0;
+}
+
+static void __devexit dev_remove(struct pci_dev *dev)
+{
+	pci_disable_sriov(dev);
+
+	...
+}
+
+static int dev_suspend(struct pci_dev *dev, pm_message_t state)
+{
+	...
+
+	return 0;
+}
+
+static int dev_resume(struct pci_dev *dev)
+{
+	...
+
+	return 0;
+}
+
+static void dev_shutdown(struct pci_dev *dev)
+{
+	...
+}
+
+static struct pci_driver dev_driver = {
+	.name =		"SR-IOV Physical Function driver",
+	.id_table =	dev_id_table,
+	.probe =	dev_probe,
+	.remove =	__devexit_p(dev_remove),
+	.suspend =	dev_suspend,
+	.resume =	dev_resume,
+	.shutdown =	dev_shutdown,
+};
-- 
cgit v1.2.3


From 5546d6f56807115a035d140f7364ce5807dbcc87 Mon Sep 17 00:00:00 2001
From: Ed Swierk <eswierk@aristanetworks.com>
Date: Thu, 19 Mar 2009 20:57:56 -0700
Subject: x86/PCI: Detect mmconfig on nVidia MCP55

Detect and enable memory-mapped PCI configuration space on the nVidia
MCP55 southbridge.  Tested against 2.6.27.4 on an Arista Networks
development board with one MCP55, Coreboot firmware, no ACPI.

Signed-off-by: Ed Swierk <eswierk@aristanetworks.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/mmconfig-shared.c | 64 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 89bf9242c80..d68dc1bb01b 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -154,6 +154,68 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
 	return "AMD Family 10h NB";
 }
 
+static bool __initdata mcp55_checked;
+static const char __init *pci_mmcfg_nvidia_mcp55(void)
+{
+	int bus;
+	int mcp55_mmconf_found = 0;
+
+	static const u32 extcfg_regnum		= 0x90;
+	static const u32 extcfg_regsize		= 4;
+	static const u32 extcfg_enable_mask	= 1<<31;
+	static const u32 extcfg_start_mask	= 0xff<<16;
+	static const int extcfg_start_shift	= 16;
+	static const u32 extcfg_size_mask	= 0x3<<28;
+	static const int extcfg_size_shift	= 28;
+	static const int extcfg_sizebus[]	= {0x100, 0x80, 0x40, 0x20};
+	static const u32 extcfg_base_mask[]	= {0x7ff8, 0x7ffc, 0x7ffe, 0x7fff};
+	static const int extcfg_base_lshift	= 25;
+
+	/*
+	 * do check if amd fam10h already took over
+	 */
+	if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked)
+		return NULL;
+
+	mcp55_checked = true;
+	for (bus = 0; bus < 256; bus++) {
+		u64 base;
+		u32 l, extcfg;
+		u16 vendor, device;
+		int start, size_index, end;
+
+		raw_pci_ops->read(0, bus, PCI_DEVFN(0, 0), 0, 4, &l);
+		vendor = l & 0xffff;
+		device = (l >> 16) & 0xffff;
+
+		if (PCI_VENDOR_ID_NVIDIA != vendor || 0x0369 != device)
+			continue;
+
+		raw_pci_ops->read(0, bus, PCI_DEVFN(0, 0), extcfg_regnum,
+				  extcfg_regsize, &extcfg);
+
+		if (!(extcfg & extcfg_enable_mask))
+			continue;
+
+		if (extend_mmcfg(1) == -1)
+			continue;
+
+		size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift;
+		base = extcfg & extcfg_base_mask[size_index];
+		/* base could > 4G */
+		base <<= extcfg_base_lshift;
+		start = (extcfg & extcfg_start_mask) >> extcfg_start_shift;
+		end = start + extcfg_sizebus[size_index] - 1;
+		fill_one_mmcfg(base, 0, start, end);
+		mcp55_mmconf_found++;
+	}
+
+	if (!mcp55_mmconf_found)
+		return NULL;
+
+	return "nVidia MCP55";
+}
+
 struct pci_mmcfg_hostbridge_probe {
 	u32 bus;
 	u32 devfn;
@@ -171,6 +233,8 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
 	  0x1200, pci_mmcfg_amd_fam10h },
 	{ 0xff, PCI_DEVFN(0, 0), PCI_VENDOR_ID_AMD,
 	  0x1200, pci_mmcfg_amd_fam10h },
+	{ 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_NVIDIA,
+	  0x0369, pci_mmcfg_nvidia_mcp55 },
 };
 
 static int __init pci_mmcfg_check_hostbridge(void)
-- 
cgit v1.2.3


From fafad5bf06c3a3bb8b24b28b6f065367e7411872 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 20 Mar 2009 15:22:12 +1100
Subject: PCI MSI: Add example request loop to MSI-HOWTO.txt

Encourage driver writers to think about supporting a variable number
of MSI-X interrupts, and give an example of how to do such a
request.

Acked-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/PCI/MSI-HOWTO.txt | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
index 9494f6dc38e..dcf7acc720e 100644
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -176,7 +176,8 @@ request_irq() for each 'vector' that it decides to use.
 If this function returns a negative number, it indicates an error and
 the driver should not attempt to allocate any more MSI-X interrupts for
 this device.  If it returns a positive number, it indicates the maximum
-number of interrupt vectors that could have been allocated.
+number of interrupt vectors that could have been allocated. See example
+below.
 
 This function, in contrast with pci_enable_msi(), does not adjust
 dev->irq.  The device will not generate interrupts for this interrupt
@@ -187,6 +188,26 @@ free them again later.
 Device drivers should normally call this function once per device
 during the initialization phase.
 
+It is ideal if drivers can cope with a variable number of MSI-X interrupts,
+there are many reasons why the platform may not be able to provide the
+exact number a driver asks for.
+
+A request loop to achieve that might look like:
+
+static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec)
+{
+	while (nvec >= FOO_DRIVER_MINIMUM_NVEC) {
+		rc = pci_enable_msix(adapter->pdev,
+				     adapter->msix_entries, nvec);
+		if (rc > 0)
+			nvec = rc;
+		else
+			return rc;
+	}
+
+	return -ENOSPC;
+}
+
 4.3.2 pci_disable_msix
 
 void pci_disable_msix(struct pci_dev *dev)
-- 
cgit v1.2.3


From 068258bc15439c11a966e873f931cc8e513dca61 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 19 Mar 2009 20:55:35 -0700
Subject: x86/PCI: host mmconfig detect clean up

Fix mmconfig detection to not assume a single mmconfig space in the
northbridge, paving the way for AMD fam10h + mcp55 CPUs.  On those, the
MSR has some range, but the mcp55 pci config will have another one.

Also helps the mcp55 + io55 case, where every one will have one range.

If it is mcp55, exclude the range that is used by CPU MSR, in other
words , if the CPU claims busses 0-255, the range in mcp55 is dropped,
because CPU HW will not route those ranges to mcp55 mmconfig to handle
it.

Signed-off-by: Yinghai Lu <yinghai.lu@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/mmconfig-shared.c | 163 +++++++++++++++++++++++++++--------------
 arch/x86/pci/mmconfig_64.c     |  17 +++--
 2 files changed, 120 insertions(+), 60 deletions(-)

diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index d68dc1bb01b..905bb526b13 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
+#include <linux/sort.h>
 #include <asm/e820.h>
 #include <asm/pci_x86.h>
 
@@ -24,24 +25,49 @@
 /* Indicate if the mmcfg resources have been placed into the resource table. */
 static int __initdata pci_mmcfg_resources_inserted;
 
+static __init int extend_mmcfg(int num)
+{
+	struct acpi_mcfg_allocation *new;
+	int new_num = pci_mmcfg_config_num + num;
+
+	new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL);
+	if (!new)
+		return -1;
+
+	if (pci_mmcfg_config) {
+		memcpy(new, pci_mmcfg_config,
+			 sizeof(pci_mmcfg_config[0]) * new_num);
+		kfree(pci_mmcfg_config);
+	}
+	pci_mmcfg_config = new;
+
+	return 0;
+}
+
+static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end)
+{
+	int i = pci_mmcfg_config_num;
+
+	pci_mmcfg_config_num++;
+	pci_mmcfg_config[i].address = addr;
+	pci_mmcfg_config[i].pci_segment = segment;
+	pci_mmcfg_config[i].start_bus_number = start;
+	pci_mmcfg_config[i].end_bus_number = end;
+}
+
 static const char __init *pci_mmcfg_e7520(void)
 {
 	u32 win;
 	raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0xce, 2, &win);
 
 	win = win & 0xf000;
-	if(win == 0x0000 || win == 0xf000)
-		pci_mmcfg_config_num = 0;
-	else {
-		pci_mmcfg_config_num = 1;
-		pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
-		if (!pci_mmcfg_config)
-			return NULL;
-		pci_mmcfg_config[0].address = win << 16;
-		pci_mmcfg_config[0].pci_segment = 0;
-		pci_mmcfg_config[0].start_bus_number = 0;
-		pci_mmcfg_config[0].end_bus_number = 255;
-	}
+	if (win == 0x0000 || win == 0xf000)
+		return NULL;
+
+	if (extend_mmcfg(1) == -1)
+		return NULL;
+
+	fill_one_mmcfg(win << 16, 0, 0, 255);
 
 	return "Intel Corporation E7520 Memory Controller Hub";
 }
@@ -50,13 +76,11 @@ static const char __init *pci_mmcfg_intel_945(void)
 {
 	u32 pciexbar, mask = 0, len = 0;
 
-	pci_mmcfg_config_num = 1;
-
 	raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0x48, 4, &pciexbar);
 
 	/* Enable bit */
 	if (!(pciexbar & 1))
-		pci_mmcfg_config_num = 0;
+		return NULL;
 
 	/* Size bits */
 	switch ((pciexbar >> 1) & 3) {
@@ -73,28 +97,23 @@ static const char __init *pci_mmcfg_intel_945(void)
 		len  = 0x04000000U;
 		break;
 	default:
-		pci_mmcfg_config_num = 0;
+		return NULL;
 	}
 
 	/* Errata #2, things break when not aligned on a 256Mb boundary */
 	/* Can only happen in 64M/128M mode */
 
 	if ((pciexbar & mask) & 0x0fffffffU)
-		pci_mmcfg_config_num = 0;
+		return NULL;
 
 	/* Don't hit the APIC registers and their friends */
 	if ((pciexbar & mask) >= 0xf0000000U)
-		pci_mmcfg_config_num = 0;
-
-	if (pci_mmcfg_config_num) {
-		pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
-		if (!pci_mmcfg_config)
-			return NULL;
-		pci_mmcfg_config[0].address = pciexbar & mask;
-		pci_mmcfg_config[0].pci_segment = 0;
-		pci_mmcfg_config[0].start_bus_number = 0;
-		pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1;
-	}
+		return NULL;
+
+	if (extend_mmcfg(1) == -1)
+		return NULL;
+
+	fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1);
 
 	return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
 }
@@ -138,18 +157,11 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
 		busnbits = 8;
 	}
 
-	pci_mmcfg_config_num = (1 << segnbits);
-	pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]) *
-				   pci_mmcfg_config_num, GFP_KERNEL);
-	if (!pci_mmcfg_config)
+	if (extend_mmcfg(1 << segnbits) == -1)
 		return NULL;
 
-	for (i = 0; i < (1 << segnbits); i++) {
-		pci_mmcfg_config[i].address = base + (1<<28) * i;
-		pci_mmcfg_config[i].pci_segment = i;
-		pci_mmcfg_config[i].start_bus_number = 0;
-		pci_mmcfg_config[i].end_bus_number = (1 << busnbits) - 1;
-	}
+	for (i = 0; i < (1 << segnbits); i++)
+		fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1);
 
 	return "AMD Family 10h NB";
 }
@@ -237,6 +249,48 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
 	  0x0369, pci_mmcfg_nvidia_mcp55 },
 };
 
+static int __init cmp_mmcfg(const void *x1, const void *x2)
+{
+	const typeof(pci_mmcfg_config[0]) *m1 = x1;
+	const typeof(pci_mmcfg_config[0]) *m2 = x2;
+	int start1, start2;
+
+	start1 = m1->start_bus_number;
+	start2 = m2->start_bus_number;
+
+	return start1 - start2;
+}
+
+static void __init pci_mmcfg_check_end_bus_number(void)
+{
+	int i;
+	typeof(pci_mmcfg_config[0]) *cfg, *cfgx;
+
+	/* sort them at first */
+	sort(pci_mmcfg_config, pci_mmcfg_config_num,
+		 sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL);
+
+	/* last one*/
+	if (pci_mmcfg_config_num > 0) {
+		i = pci_mmcfg_config_num - 1;
+		cfg = &pci_mmcfg_config[i];
+		if (cfg->end_bus_number < cfg->start_bus_number)
+			cfg->end_bus_number = 255;
+	}
+
+	/* don't overlap please */
+	for (i = 0; i < pci_mmcfg_config_num - 1; i++) {
+		cfg = &pci_mmcfg_config[i];
+		cfgx = &pci_mmcfg_config[i+1];
+
+		if (cfg->end_bus_number < cfg->start_bus_number)
+			cfg->end_bus_number = 255;
+
+		if (cfg->end_bus_number >= cfgx->start_bus_number)
+			cfg->end_bus_number = cfgx->start_bus_number - 1;
+	}
+}
+
 static int __init pci_mmcfg_check_hostbridge(void)
 {
 	u32 l;
@@ -250,31 +304,33 @@ static int __init pci_mmcfg_check_hostbridge(void)
 
 	pci_mmcfg_config_num = 0;
 	pci_mmcfg_config = NULL;
-	name = NULL;
 
-	for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
+	for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
 		bus =  pci_mmcfg_probes[i].bus;
 		devfn = pci_mmcfg_probes[i].devfn;
 		raw_pci_ops->read(0, bus, devfn, 0, 4, &l);
 		vendor = l & 0xffff;
 		device = (l >> 16) & 0xffff;
 
+		name = NULL;
 		if (pci_mmcfg_probes[i].vendor == vendor &&
 		    pci_mmcfg_probes[i].device == device)
 			name = pci_mmcfg_probes[i].probe();
-	}
 
-	if (name) {
-		printk(KERN_INFO "PCI: Found %s %s MMCONFIG support.\n",
-		       name, pci_mmcfg_config_num ? "with" : "without");
+		if (name)
+			printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n",
+			       name);
 	}
 
-	return name != NULL;
+	/* some end_bus_number is crazy, fix it */
+	pci_mmcfg_check_end_bus_number();
+
+	return pci_mmcfg_config_num != 0;
 }
 
 static void __init pci_mmcfg_insert_resources(void)
 {
-#define PCI_MMCFG_RESOURCE_NAME_LEN 19
+#define PCI_MMCFG_RESOURCE_NAME_LEN 24
 	int i;
 	struct resource *res;
 	char *names;
@@ -292,9 +348,10 @@ static void __init pci_mmcfg_insert_resources(void)
 		struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
 		num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
 		res->name = names;
-		snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
-			 cfg->pci_segment);
-		res->start = cfg->address;
+		snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN,
+			 "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment,
+			 cfg->start_bus_number, cfg->end_bus_number);
+		res->start = cfg->address + (cfg->start_bus_number << 20);
 		res->end = res->start + (num_buses << 20) - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		insert_resource(&iomem_resource, res);
@@ -418,8 +475,6 @@ static void __init pci_mmcfg_reject_broken(int early)
 	    (pci_mmcfg_config[0].address == 0))
 		return;
 
-	cfg = &pci_mmcfg_config[0];
-
 	for (i = 0; i < pci_mmcfg_config_num; i++) {
 		int valid = 0;
 		u64 addr, size;
@@ -487,10 +542,10 @@ static void __init __pci_mmcfg_init(int early)
 			known_bridge = 1;
 	}
 
-	if (!known_bridge) {
+	if (!known_bridge)
 		acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
-		pci_mmcfg_reject_broken(early);
-	}
+
+	pci_mmcfg_reject_broken(early);
 
 	if ((pci_mmcfg_config_num == 0) ||
 	    (pci_mmcfg_config == NULL) ||
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 30007ffc8e1..94349f8b2f9 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -112,13 +112,18 @@ static struct pci_raw_ops pci_mmcfg = {
 static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
 {
 	void __iomem *addr;
-	u32 size;
-
-	size = (cfg->end_bus_number + 1) << 20;
-	addr = ioremap_nocache(cfg->address, size);
+	u64 start, size;
+
+	start = cfg->start_bus_number;
+	start <<= 20;
+	start += cfg->address;
+	size = cfg->end_bus_number + 1 - cfg->start_bus_number;
+	size <<= 20;
+	addr = ioremap_nocache(start, size);
 	if (addr) {
 		printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n",
-		       cfg->address, cfg->address + size - 1);
+		       start, start + size - 1);
+		addr -= cfg->start_bus_number << 20;
 	}
 	return addr;
 }
@@ -157,7 +162,7 @@ void __init pci_mmcfg_arch_free(void)
 
 	for (i = 0; i < pci_mmcfg_config_num; ++i) {
 		if (pci_mmcfg_virt[i].virt) {
-			iounmap(pci_mmcfg_virt[i].virt);
+			iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20));
 			pci_mmcfg_virt[i].virt = NULL;
 			pci_mmcfg_virt[i].cfg = NULL;
 		}
-- 
cgit v1.2.3


From 79af72d716cf1bb13b175429cf181a6c4d063ee8 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Fri, 20 Mar 2009 14:55:55 -0600
Subject: PCI: pci_is_root_bus helper

Introduce pci_is_root_bus helper function. This will help make code
more consistent, as well as prevent incorrect assumptions (such as
pci_bus->self == NULL on a root bus, which is not always true).

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 1216843412d..50d94388e87 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -357,6 +357,15 @@ struct pci_bus {
 #define pci_bus_b(n)	list_entry(n, struct pci_bus, node)
 #define to_pci_bus(n)	container_of(n, struct pci_bus, dev)
 
+/*
+ * Returns true if the pci bus is root (behind host-pci bridge),
+ * false otherwise
+ */
+static inline bool pci_is_root_bus(struct pci_bus *pbus)
+{
+	return !(pbus->parent);
+}
+
 #ifdef CONFIG_PCI_MSI
 static inline bool pci_dev_msi_enabled(struct pci_dev *pci_dev)
 {
-- 
cgit v1.2.3


From 90bdb3117f4209baa6d712b126f0e7791b24dc3f Mon Sep 17 00:00:00 2001
From: Trent Piepho <xyzzy@speakeasy.org>
Date: Fri, 20 Mar 2009 14:56:00 -0600
Subject: PCI: don't scan existing devices

pci_scan_single_device is supposed to add newly discovered
devices to pci_bus->devices, but doesn't check to see if the
device has already been added. This can cause problems if we ever
want to use this interface to rescan the PCI bus.

If the device is already added, just return it.

Signed-off-by: Trent Piepho <xyzzy@speakeasy.org>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/probe.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 943c49a7842..140b9deb482 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1019,6 +1019,12 @@ struct pci_dev *__ref pci_scan_single_device(struct pci_bus *bus, int devfn)
 {
 	struct pci_dev *dev;
 
+	dev = pci_get_slot(bus, devfn);
+	if (dev) {
+		pci_dev_put(dev);
+		return dev;
+	}
+
 	dev = pci_scan_device(bus, devfn);
 	if (!dev)
 		return NULL;
-- 
cgit v1.2.3


From 1b69dfc649e6658fc38499cf704750d74cabc73d Mon Sep 17 00:00:00 2001
From: Trent Piepho <xyzzy@speakeasy.org>
Date: Fri, 20 Mar 2009 14:56:05 -0600
Subject: PCI: pci_scan_slot() returns newly found devices

pci_scan_slot() has been rewritten to be less complex and will now
return the number of *new* devices found.

Existing callers need not worry because they already assume that
they can't call pci_scan_slot() on an already-scanned slot.

Thus, there is no semantic change for existing callers: returning
newly found devices (this patch) is exactly equal to returning all
found devices (before this patch).

This patch adds some more groundwork to allow us to rescan the
PCI bus during runtime to discover newly added devices.

Signed-off-by: Trent Piepho <xyzzy@speakeasy.org>
Reviewed-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/probe.c | 40 ++++++++++++++++------------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 140b9deb482..f3aabdf28f8 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1043,35 +1043,27 @@ EXPORT_SYMBOL(pci_scan_single_device);
  * Scan a PCI slot on the specified PCI bus for devices, adding
  * discovered devices to the @bus->devices list.  New devices
  * will not have is_added set.
+ *
+ * Returns the number of new devices found.
  */
 int pci_scan_slot(struct pci_bus *bus, int devfn)
 {
-	int func, nr = 0;
-	int scan_all_fns;
-
-	scan_all_fns = pcibios_scan_all_fns(bus, devfn);
-
-	for (func = 0; func < 8; func++, devfn++) {
-		struct pci_dev *dev;
-
-		dev = pci_scan_single_device(bus, devfn);
-		if (dev) {
-			nr++;
+	int fn, nr = 0;
+	struct pci_dev *dev;
 
-			/*
-		 	 * If this is a single function device,
-		 	 * don't scan past the first function.
-		 	 */
-			if (!dev->multifunction) {
-				if (func > 0) {
-					dev->multifunction = 1;
-				} else {
- 					break;
-				}
+	dev = pci_scan_single_device(bus, devfn);
+	if (dev && !dev->is_added)	/* new device? */
+		nr++;
+
+	if ((dev && dev->multifunction) ||
+	    (!dev && pcibios_scan_all_fns(bus, devfn))) {
+		for (fn = 1; fn < 8; fn++) {
+			dev = pci_scan_single_device(bus, devfn + fn);
+			if (dev) {
+				if (!dev->is_added)
+					nr++;
+				dev->multifunction = 1;
 			}
-		} else {
-			if (func == 0 && !scan_all_fns)
-				break;
 		}
 	}
 
-- 
cgit v1.2.3


From 74710ded8e16fc8dacbb702a5bac1a493d88549a Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Fri, 20 Mar 2009 14:56:10 -0600
Subject: PCI: always scan child buses

While scanning bridges, we stop our scan if we encounter a bus
that we've seen before, to work around some buggy chipsets. This
is a good idea, but prevents us from fully scanning the PCI bus
at a future time (to find newly hot-added devices, for example).

Change the logic so that we skip _re-adding_ an existing bus
that we've seen before, but also allow the scan to descend to
all child buses.

Now that we're potentially scanning our child buses again, we
also need to be sure not to attempt re-initializing their BARs
so we avoid that.

This patch lays the groundwork to allow the user to issue a
rescan of the PCI bus at any time.

Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/probe.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index f3aabdf28f8..f69256c63b2 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -511,21 +511,21 @@ int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max,
 
 		/*
 		 * If we already got to this bus through a different bridge,
-		 * ignore it.  This can happen with the i450NX chipset.
+		 * don't re-add it. This can happen with the i450NX chipset.
+		 *
+		 * However, we continue to descend down the hierarchy and
+		 * scan remaining child buses.
 		 */
-		if (pci_find_bus(pci_domain_nr(bus), busnr)) {
-			dev_info(&dev->dev, "bus %04x:%02x already known\n",
-				 pci_domain_nr(bus), busnr);
-			goto out;
+		child = pci_find_bus(pci_domain_nr(bus), busnr);
+		if (!child) {
+			child = pci_add_new_bus(bus, dev, busnr);
+			if (!child)
+				goto out;
+			child->primary = buses & 0xFF;
+			child->subordinate = (buses >> 16) & 0xFF;
+			child->bridge_ctl = bctl;
 		}
 
-		child = pci_add_new_bus(bus, dev, busnr);
-		if (!child)
-			goto out;
-		child->primary = buses & 0xFF;
-		child->subordinate = (buses >> 16) & 0xFF;
-		child->bridge_ctl = bctl;
-
 		cmax = pci_scan_child_bus(child);
 		if (cmax > max)
 			max = cmax;
@@ -1092,8 +1092,14 @@ unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
 	 * After performing arch-dependent fixup of the bus, look behind
 	 * all PCI-to-PCI bridges on this bus.
 	 */
-	pr_debug("PCI: Fixups for bus %04x:%02x\n", pci_domain_nr(bus), bus->number);
-	pcibios_fixup_bus(bus);
+	if (!bus->is_added) {
+		pr_debug("PCI: Fixups for bus %04x:%02x\n",
+			 pci_domain_nr(bus), bus->number);
+		pcibios_fixup_bus(bus);
+		if (pci_is_root_bus(bus))
+			bus->is_added = 1;
+	}
+
 	for (pass=0; pass < 2; pass++)
 		list_for_each_entry(dev, &bus->devices, bus_list) {
 			if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
-- 
cgit v1.2.3


From b73e97d95c168cbc19bd1208c894077f25931ba1 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Fri, 20 Mar 2009 14:56:15 -0600
Subject: PCI: do not initialize bridges more than once

In preparation for PCI core hotplug, we need to ensure that we do
not attempt to re-initialize bridges that have already been initialized.

We only need to worry about non-root buses, since we will not allow
root bus removal.

Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/setup-bus.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 170a3eda9dd..334285a8e23 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -144,6 +144,9 @@ static void pci_setup_bridge(struct pci_bus *bus)
 	struct pci_bus_region region;
 	u32 l, bu, lu, io_upper16;
 
+	if (!pci_is_root_bus(bus) && bus->is_added)
+		return;
+
 	dev_info(&bridge->dev, "PCI bridge, secondary bus %04x:%02x\n",
 		 pci_domain_nr(bus), bus->number);
 
-- 
cgit v1.2.3


From 9dd90cafa7a712d283e2e0c625b022e19f746762 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Fri, 20 Mar 2009 14:56:20 -0600
Subject: PCI: do not enable bridges more than once

In preparation for PCI core hotplug, we need to ensure that we do
not attempt to re-enable bridges that have already been enabled.

Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/bus.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 118c77778d2..68f91a25259 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -184,8 +184,10 @@ void pci_enable_bridges(struct pci_bus *bus)
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		if (dev->subordinate) {
-			retval = pci_enable_device(dev);
-			pci_set_master(dev);
+			if (atomic_read(&dev->enable_cnt) == 0) {
+				retval = pci_enable_device(dev);
+				pci_set_master(dev);
+			}
 			pci_enable_bridges(dev->subordinate);
 		}
 	}
-- 
cgit v1.2.3


From 3ed4fd96b3188406ac5357d9290bcffa08c65cf6 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Fri, 20 Mar 2009 14:56:25 -0600
Subject: PCI: Introduce pci_rescan_bus()

This API is used by the PCI core to rescan a bus and rediscover
newly added devices.

Over time, it is expected that the various PCI hotplug drivers
will migrate to this interface and away from the old
pci_do_scan_bus() interface.

Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/fakephp.c |  6 +++---
 drivers/pci/probe.c           | 32 ++++++++++++++++++++++++++++++++
 include/linux/pci.h           |  3 +++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c
index d8649e12729..16063745766 100644
--- a/drivers/pci/hotplug/fakephp.c
+++ b/drivers/pci/hotplug/fakephp.c
@@ -245,12 +245,12 @@ static int pci_rescan_slot(struct pci_dev *temp)
 
 
 /**
- * pci_rescan_bus - Rescan PCI bus
+ * pci_rescan_bus_local - fakephp version of rescan PCI bus
  * @bus: the PCI bus to rescan
  *
  * Call pci_rescan_slot for each possible function of the bus.
  */
-static void pci_rescan_bus(const struct pci_bus *bus)
+static void pci_rescan_bus_local(const struct pci_bus *bus)
 {
 	unsigned int devfn;
 	struct pci_dev *dev;
@@ -291,7 +291,7 @@ static void pci_rescan_buses(const struct list_head *list)
 	const struct list_head *l;
 	list_for_each(l,list) {
 		const struct pci_bus *b = pci_bus_b(l);
-		pci_rescan_bus(b);
+		pci_rescan_bus_local(b);
 		pci_rescan_buses(&b->children);
 	}
 }
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index f69256c63b2..60a8e5fec6c 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1212,6 +1212,38 @@ struct pci_bus * __devinit pci_scan_bus_parented(struct device *parent,
 EXPORT_SYMBOL(pci_scan_bus_parented);
 
 #ifdef CONFIG_HOTPLUG
+/**
+ * pci_rescan_bus - scan a PCI bus for devices.
+ * @bus: PCI bus to scan
+ *
+ * Scan a PCI bus and child buses for new devices, adds them,
+ * and enables them.
+ *
+ * Returns the max number of subordinate bus discovered.
+ */
+unsigned int __devinit pci_rescan_bus(struct pci_bus *bus)
+{
+	unsigned int max;
+	struct pci_dev *dev;
+
+	max = pci_scan_child_bus(bus);
+
+	up_read(&pci_bus_sem);
+	list_for_each_entry(dev, &bus->devices, bus_list)
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
+		    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
+			if (dev->subordinate)
+				pci_bus_size_bridges(dev->subordinate);
+	down_read(&pci_bus_sem);
+
+	pci_bus_assign_resources(bus);
+	pci_enable_bridges(bus);
+	pci_bus_add_devices(bus);
+
+	return max;
+}
+EXPORT_SYMBOL_GPL(pci_rescan_bus);
+
 EXPORT_SYMBOL(pci_add_new_bus);
 EXPORT_SYMBOL(pci_scan_slot);
 EXPORT_SYMBOL(pci_scan_bridge);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 50d94388e87..6fb335b0d74 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -726,6 +726,9 @@ int pci_back_from_sleep(struct pci_dev *dev);
 
 /* Functions for PCI Hotplug drivers to use */
 int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap);
+#ifdef CONFIG_HOTPLUG
+unsigned int pci_rescan_bus(struct pci_bus *bus);
+#endif
 
 /* Vital product data routines */
 ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
-- 
cgit v1.2.3


From 705b1aaa823e800490f157cd9366ad8cff385f5f Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Fri, 20 Mar 2009 14:56:31 -0600
Subject: PCI: Introduce /sys/bus/pci/rescan

This interface allows the user to force a rescan of all PCI buses
in system, and rediscover devices that have been removed earlier.

pci_bus_attrs implementation from Trent Piepho.

Thanks to Vegard Nossum for discovering locking issues with the
sysfs interface.

Cc: Trent Piepho <xyzzy@speakeasy.org>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/ABI/testing/sysfs-bus-pci |  9 +++++++++
 drivers/pci/pci-driver.c                |  1 +
 drivers/pci/pci-sysfs.c                 | 26 ++++++++++++++++++++++++++
 drivers/pci/pci.h                       |  6 ++++++
 drivers/pci/probe.c                     |  4 ++--
 5 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index d175a2a5fac..e6ad047cb3c 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -57,6 +57,15 @@ Description:
 		match the driver to the device.  For example:
 		# echo "8086 10f5" > /sys/bus/pci/drivers/foo/remove_id
 
+What:		/sys/bus/pci/rescan
+Date:		January 2009
+Contact:	Linux PCI developers <linux-pci@vger.kernel.org>
+Description:
+		Writing a non-zero value to this attribute will
+		force a rescan of all PCI buses in the system, and
+		re-discover previously removed devices.
+		Depends on CONFIG_HOTPLUG.
+
 What:		/sys/bus/pci/devices/.../vpd
 Date:		February 2008
 Contact:	Ben Hutchings <bhutchings@solarflare.com>
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 87a5ddbb324..95d19857029 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -1049,6 +1049,7 @@ struct bus_type pci_bus_type = {
 	.remove		= pci_device_remove,
 	.shutdown	= pci_device_shutdown,
 	.dev_attrs	= pci_dev_attrs,
+	.bus_attrs	= pci_bus_attrs,
 	.pm		= PCI_PM_OPS_PTR,
 };
 
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index ec7a175dcba..be7468a5eb7 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -219,6 +219,32 @@ msi_bus_store(struct device *dev, struct device_attribute *attr,
 	return count;
 }
 
+#ifdef CONFIG_HOTPLUG
+static DEFINE_MUTEX(pci_remove_rescan_mutex);
+static ssize_t bus_rescan_store(struct bus_type *bus, const char *buf,
+				size_t count)
+{
+	unsigned long val;
+	struct pci_bus *b = NULL;
+
+	if (strict_strtoul(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (val) {
+		mutex_lock(&pci_remove_rescan_mutex);
+		while ((b = pci_find_next_bus(b)) != NULL)
+			pci_rescan_bus(b);
+		mutex_unlock(&pci_remove_rescan_mutex);
+	}
+	return count;
+}
+
+struct bus_attribute pci_bus_attrs[] = {
+	__ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, bus_rescan_store),
+	__ATTR_NULL
+};
+#endif
+
 struct device_attribute pci_dev_attrs[] = {
 	__ATTR_RO(resource),
 	__ATTR_RO(vendor),
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 22dcfdb75d9..45833a5bca6 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -138,6 +138,12 @@ extern int pcie_mch_quirk;
 extern struct device_attribute pci_dev_attrs[];
 extern struct device_attribute dev_attr_cpuaffinity;
 extern struct device_attribute dev_attr_cpulistaffinity;
+#ifdef CONFIG_HOTPLUG
+extern struct bus_attribute pci_bus_attrs[];
+#else
+#define pci_bus_attrs	NULL
+#endif
+
 
 /**
  * pci_match_one_device - Tell if a PCI device structure has a matching
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 60a8e5fec6c..56c71e585f3 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1228,13 +1228,13 @@ unsigned int __devinit pci_rescan_bus(struct pci_bus *bus)
 
 	max = pci_scan_child_bus(bus);
 
-	up_read(&pci_bus_sem);
+	down_read(&pci_bus_sem);
 	list_for_each_entry(dev, &bus->devices, bus_list)
 		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
 		    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
 			if (dev->subordinate)
 				pci_bus_size_bridges(dev->subordinate);
-	down_read(&pci_bus_sem);
+	up_read(&pci_bus_sem);
 
 	pci_bus_assign_resources(bus);
 	pci_enable_bridges(bus);
-- 
cgit v1.2.3


From 77c27c7b49d69d45ccb94e481653f024f1ac6650 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Fri, 20 Mar 2009 14:56:36 -0600
Subject: PCI: Introduce /sys/bus/pci/devices/.../remove

This patch adds an attribute named "remove" to a PCI device's sysfs
directory.  Writing a non-zero value to this attribute will remove the PCI
device and any children of it.

Trent Piepho wrote the original implementation and documentation.

Thanks to Vegard Nossum for testing under kmemcheck and finding locking
issues with the sysfs interface.

Cc: Trent Piepho <xyzzy@speakeasy.org>
Tested-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/ABI/testing/sysfs-bus-pci |  8 ++++++++
 Documentation/filesystems/sysfs-pci.txt | 10 +++++++++
 drivers/pci/pci-sysfs.c                 | 36 +++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index e6ad047cb3c..daa791a9e85 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -66,6 +66,14 @@ Description:
 		re-discover previously removed devices.
 		Depends on CONFIG_HOTPLUG.
 
+What:		/sys/bus/pci/devices/.../remove
+Date:		January 2009
+Contact:	Linux PCI developers <linux-pci@vger.kernel.org>
+Description:
+		Writing a non-zero value to this attribute will
+		hot-remove the PCI device and any of its children.
+		Depends on CONFIG_HOTPLUG.
+
 What:		/sys/bus/pci/devices/.../vpd
 Date:		February 2008
 Contact:	Ben Hutchings <bhutchings@solarflare.com>
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt
index 9f8740ca3f3..26e4b8bc53e 100644
--- a/Documentation/filesystems/sysfs-pci.txt
+++ b/Documentation/filesystems/sysfs-pci.txt
@@ -12,6 +12,7 @@ that support it.  For example, a given bus might look like this:
      |   |-- enable
      |   |-- irq
      |   |-- local_cpus
+     |   |-- remove
      |   |-- resource
      |   |-- resource0
      |   |-- resource1
@@ -36,6 +37,7 @@ files, each with their own function.
        enable	           Whether the device is enabled (ascii, rw)
        irq		   IRQ number (ascii, ro)
        local_cpus	   nearby CPU mask (cpumask, ro)
+       remove		   remove device from kernel's list (ascii, wo)
        resource		   PCI resource host addresses (ascii, ro)
        resource0..N	   PCI resource N, if present (binary, mmap)
        resource0_wc..N_wc  PCI WC map resource N, if prefetchable (binary, mmap)
@@ -46,6 +48,7 @@ files, each with their own function.
 
   ro - read only file
   rw - file is readable and writable
+  wo - write only file
   mmap - file is mmapable
   ascii - file contains ascii text
   binary - file contains binary data
@@ -73,6 +76,13 @@ that the device must be enabled for a rom read to return data succesfully.
 In the event a driver is not bound to the device, it can be enabled using the
 'enable' file, documented above.
 
+The 'remove' file is used to remove the PCI device, by writing a non-zero
+integer to the file.  This does not involve any kind of hot-plug functionality,
+e.g. powering off the device.  The device is removed from the kernel's list of
+PCI devices, the sysfs directory for it is removed, and the device will be
+removed from any drivers attached to it. Removal of PCI root buses is
+disallowed.
+
 Accessing legacy resources through sysfs
 ----------------------------------------
 
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index be7468a5eb7..e16990ecc02 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -243,6 +243,39 @@ struct bus_attribute pci_bus_attrs[] = {
 	__ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, bus_rescan_store),
 	__ATTR_NULL
 };
+
+static void remove_callback(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	mutex_lock(&pci_remove_rescan_mutex);
+	pci_remove_bus_device(pdev);
+	mutex_unlock(&pci_remove_rescan_mutex);
+}
+
+static ssize_t
+remove_store(struct device *dev, struct device_attribute *dummy,
+	     const char *buf, size_t count)
+{
+	int ret = 0;
+	unsigned long val;
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (strict_strtoul(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (pci_is_root_bus(pdev->bus))
+		return -EBUSY;
+
+	/* An attribute cannot be unregistered by one of its own methods,
+	 * so we have to use this roundabout approach.
+	 */
+	if (val)
+		ret = device_schedule_callback(dev, remove_callback);
+	if (ret)
+		count = ret;
+	return count;
+}
 #endif
 
 struct device_attribute pci_dev_attrs[] = {
@@ -263,6 +296,9 @@ struct device_attribute pci_dev_attrs[] = {
 	__ATTR(broken_parity_status,(S_IRUGO|S_IWUSR),
 		broken_parity_status_show,broken_parity_status_store),
 	__ATTR(msi_bus, 0644, msi_bus_show, msi_bus_store),
+#ifdef CONFIG_HOTPLUG
+	__ATTR(remove, (S_IWUSR|S_IWGRP), NULL, remove_store),
+#endif
 	__ATTR_NULL,
 };
 
-- 
cgit v1.2.3


From 738a6396c223b486304dda778119dbbca563f019 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Fri, 20 Mar 2009 14:56:41 -0600
Subject: PCI: Introduce /sys/bus/pci/devices/.../rescan

This interface allows the user to force a rescan of the device's
parent bus and all subordinate buses, and rediscover devices removed
earlier from this part of the device tree.

Cc: Trent Piepho <xyzzy@speakeasy.org>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/ABI/testing/sysfs-bus-pci | 10 ++++++++++
 drivers/pci/pci-sysfs.c                 | 19 +++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index daa791a9e85..97ad190e13a 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -74,6 +74,16 @@ Description:
 		hot-remove the PCI device and any of its children.
 		Depends on CONFIG_HOTPLUG.
 
+What:		/sys/bus/pci/devices/.../rescan
+Date:		January 2009
+Contact:	Linux PCI developers <linux-pci@vger.kernel.org>
+Description:
+		Writing a non-zero value to this attribute will
+		force a rescan of the device's parent bus and all
+		child buses, and re-discover devices removed earlier
+		from this part of the device tree.
+		Depends on CONFIG_HOTPLUG.
+
 What:		/sys/bus/pci/devices/.../vpd
 Date:		February 2008
 Contact:	Ben Hutchings <bhutchings@solarflare.com>
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index e16990ecc02..e9a8706a640 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -244,6 +244,24 @@ struct bus_attribute pci_bus_attrs[] = {
 	__ATTR_NULL
 };
 
+static ssize_t
+dev_rescan_store(struct device *dev, struct device_attribute *attr,
+		 const char *buf, size_t count)
+{
+	unsigned long val;
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (strict_strtoul(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (val) {
+		mutex_lock(&pci_remove_rescan_mutex);
+		pci_rescan_bus(pdev->bus);
+		mutex_unlock(&pci_remove_rescan_mutex);
+	}
+	return count;
+}
+
 static void remove_callback(struct device *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
@@ -298,6 +316,7 @@ struct device_attribute pci_dev_attrs[] = {
 	__ATTR(msi_bus, 0644, msi_bus_show, msi_bus_store),
 #ifdef CONFIG_HOTPLUG
 	__ATTR(remove, (S_IWUSR|S_IWGRP), NULL, remove_store),
+	__ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, dev_rescan_store),
 #endif
 	__ATTR_NULL,
 };
-- 
cgit v1.2.3


From 83dbf66f04b96e65c6c18436c16d40f9cf8630aa Mon Sep 17 00:00:00 2001
From: Trent Piepho <xyzzy@speakeasy.org>
Date: Fri, 20 Mar 2009 14:56:46 -0600
Subject: PCI Hotplug: restore fakephp interface with complete reimplementation

A complete re-implementation of fakephp is necessary if it is to
present its former interface (pre-2.6.27, when it broke). The
reason is that PCI hotplug drivers call pci_hp_register(), which
enforces the rule that only one /sys/bus/pci/slots/ file may be
created per physical slot.

The change breaks the old fakephp's assumption that it could
create a file per function. So we re-implement fakephp to avoid
using the standard PCI hotplug API so that we can restore the old
fakephp user interface.

It puts entries in /sys/bus/pci/slots with the names of all PCI
devices/functions, exactly symmetrical to what is shown in
/sys/bus/pci/devices. Each slots/ entry has a "power" attribute,
which works the same way as the fakephp driver's power attribute
has worked.

There are a few improvements over old fakephp, which couldn't handle
PCI devices being added or removed via a means outside of
fakephp's knowledge.  If a device was added another way, old fakephp
didn't notice and didn't create the fake slot for it.  If a
device was removed another way, old fakephp didn't delete the fake
slot for it (and accessing the stale slot caused an oops).

The new implementation overcomes these limitations. As a
consequence, removing a bridge with other devices behind it now
works as well, which is something else old fakephp couldn't do
previously.

This duplicates a tiny bit of the code in the PCI core that does
this same function.  Re-using that code ends up being more
complex than duplicating it, and it makes code in the PCI core
more ugly just to support this legacy fakephp interface
compatibility layer.

Reviewed-by: James Cameron <qz@hp.com>
Signed-off-by: Trent Piepho <xyzzy@speakeasy.org>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/Makefile         |   2 +-
 drivers/pci/hotplug/fakephp.c        | 395 -----------------------------------
 drivers/pci/hotplug/legacy_fakephp.c | 162 ++++++++++++++
 3 files changed, 163 insertions(+), 396 deletions(-)
 delete mode 100644 drivers/pci/hotplug/fakephp.c
 create mode 100644 drivers/pci/hotplug/legacy_fakephp.c

diff --git a/drivers/pci/hotplug/Makefile b/drivers/pci/hotplug/Makefile
index 2aa117c8cd8..3314fe88150 100644
--- a/drivers/pci/hotplug/Makefile
+++ b/drivers/pci/hotplug/Makefile
@@ -20,7 +20,7 @@ obj-$(CONFIG_HOTPLUG_PCI_RPA_DLPAR)	+= rpadlpar_io.o
 obj-$(CONFIG_HOTPLUG_PCI_SGI)		+= sgi_hotplug.o
 
 # Link this last so it doesn't claim devices that have a real hotplug driver
-obj-$(CONFIG_HOTPLUG_PCI_FAKE)		+= fakephp.o
+obj-$(CONFIG_HOTPLUG_PCI_FAKE)		+= legacy_fakephp.o
 
 pci_hotplug-objs	:=	pci_hotplug_core.o
 
diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c
deleted file mode 100644
index 16063745766..00000000000
--- a/drivers/pci/hotplug/fakephp.c
+++ /dev/null
@@ -1,395 +0,0 @@
-/*
- * Fake PCI Hot Plug Controller Driver
- *
- * Copyright (C) 2003 Greg Kroah-Hartman <greg@kroah.com>
- * Copyright (C) 2003 IBM Corp.
- * Copyright (C) 2003 Rolf Eike Beer <eike-kernel@sf-tec.de>
- *
- * Based on ideas and code from:
- * 	Vladimir Kondratiev <vladimir.kondratiev@intel.com>
- *	Rolf Eike Beer <eike-kernel@sf-tec.de>
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 2 of the License.
- *
- * Send feedback to <greg@kroah.com>
- */
-
-/*
- *
- * This driver will "emulate" removing PCI devices from the system.  If
- * the "power" file is written to with "0" then the specified PCI device
- * will be completely removed from the kernel.
- *
- * WARNING, this does NOT turn off the power to the PCI device.  This is
- * a "logical" removal, not a physical or electrical removal.
- *
- * Use this module at your own risk, you have been warned!
- *
- * Enabling PCI devices is left as an exercise for the reader...
- *
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/pci_hotplug.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>
-#include "../pci.h"
-
-#if !defined(MODULE)
-	#define MY_NAME	"fakephp"
-#else
-	#define MY_NAME	THIS_MODULE->name
-#endif
-
-#define dbg(format, arg...)					\
-	do {							\
-		if (debug)					\
-			printk(KERN_DEBUG "%s: " format,	\
-				MY_NAME , ## arg); 		\
-	} while (0)
-#define err(format, arg...) printk(KERN_ERR "%s: " format, MY_NAME , ## arg)
-#define info(format, arg...) printk(KERN_INFO "%s: " format, MY_NAME , ## arg)
-
-#define DRIVER_AUTHOR	"Greg Kroah-Hartman <greg@kroah.com>"
-#define DRIVER_DESC	"Fake PCI Hot Plug Controller Driver"
-
-struct dummy_slot {
-	struct list_head node;
-	struct hotplug_slot *slot;
-	struct pci_dev *dev;
-	struct work_struct remove_work;
-	unsigned long removed;
-};
-
-static int debug;
-static int dup_slots;
-static LIST_HEAD(slot_list);
-static struct workqueue_struct *dummyphp_wq;
-
-static void pci_rescan_worker(struct work_struct *work);
-static DECLARE_WORK(pci_rescan_work, pci_rescan_worker);
-
-static int enable_slot (struct hotplug_slot *slot);
-static int disable_slot (struct hotplug_slot *slot);
-
-static struct hotplug_slot_ops dummy_hotplug_slot_ops = {
-	.owner			= THIS_MODULE,
-	.enable_slot		= enable_slot,
-	.disable_slot		= disable_slot,
-};
-
-static void dummy_release(struct hotplug_slot *slot)
-{
-	struct dummy_slot *dslot = slot->private;
-
-	list_del(&dslot->node);
-	kfree(dslot->slot->info);
-	kfree(dslot->slot);
-	pci_dev_put(dslot->dev);
-	kfree(dslot);
-}
-
-#define SLOT_NAME_SIZE	8
-
-static int add_slot(struct pci_dev *dev)
-{
-	struct dummy_slot *dslot;
-	struct hotplug_slot *slot;
-	char name[SLOT_NAME_SIZE];
-	int retval = -ENOMEM;
-	static int count = 1;
-
-	slot = kzalloc(sizeof(struct hotplug_slot), GFP_KERNEL);
-	if (!slot)
-		goto error;
-
-	slot->info = kzalloc(sizeof(struct hotplug_slot_info), GFP_KERNEL);
-	if (!slot->info)
-		goto error_slot;
-
-	slot->info->power_status = 1;
-	slot->info->max_bus_speed = PCI_SPEED_UNKNOWN;
-	slot->info->cur_bus_speed = PCI_SPEED_UNKNOWN;
-
-	dslot = kzalloc(sizeof(struct dummy_slot), GFP_KERNEL);
-	if (!dslot)
-		goto error_info;
-
-	if (dup_slots)
-		snprintf(name, SLOT_NAME_SIZE, "fake");
-	else
-		snprintf(name, SLOT_NAME_SIZE, "fake%d", count++);
-	dbg("slot->name = %s\n", name);
-	slot->ops = &dummy_hotplug_slot_ops;
-	slot->release = &dummy_release;
-	slot->private = dslot;
-
-	retval = pci_hp_register(slot, dev->bus, PCI_SLOT(dev->devfn), name);
-	if (retval) {
-		err("pci_hp_register failed with error %d\n", retval);
-		goto error_dslot;
-	}
-
-	dbg("slot->name = %s\n", hotplug_slot_name(slot));
-	dslot->slot = slot;
-	dslot->dev = pci_dev_get(dev);
-	list_add (&dslot->node, &slot_list);
-	return retval;
-
-error_dslot:
-	kfree(dslot);
-error_info:
-	kfree(slot->info);
-error_slot:
-	kfree(slot);
-error:
-	return retval;
-}
-
-static int __init pci_scan_buses(void)
-{
-	struct pci_dev *dev = NULL;
-	int lastslot = 0;
-
-	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-		if (PCI_FUNC(dev->devfn) > 0 &&
-				lastslot == PCI_SLOT(dev->devfn))
-			continue;
-		lastslot = PCI_SLOT(dev->devfn);
-		add_slot(dev);
-	}
-
-	return 0;
-}
-
-static void remove_slot(struct dummy_slot *dslot)
-{
-	int retval;
-
-	dbg("removing slot %s\n", hotplug_slot_name(dslot->slot));
-	retval = pci_hp_deregister(dslot->slot);
-	if (retval)
-		err("Problem unregistering a slot %s\n",
-			hotplug_slot_name(dslot->slot));
-}
-
-/* called from the single-threaded workqueue handler to remove a slot */
-static void remove_slot_worker(struct work_struct *work)
-{
-	struct dummy_slot *dslot =
-		container_of(work, struct dummy_slot, remove_work);
-	remove_slot(dslot);
-}
-
-/**
- * pci_rescan_slot - Rescan slot
- * @temp: Device template. Should be set: bus and devfn.
- *
- * Tries hard not to re-enable already existing devices;
- * also handles scanning of subfunctions.
- */
-static int pci_rescan_slot(struct pci_dev *temp)
-{
-	struct pci_bus *bus = temp->bus;
-	struct pci_dev *dev;
-	int func;
-	u8 hdr_type;
-	int count = 0;
-
-	if (!pci_read_config_byte(temp, PCI_HEADER_TYPE, &hdr_type)) {
-		temp->hdr_type = hdr_type & 0x7f;
-		if ((dev = pci_get_slot(bus, temp->devfn)) != NULL)
-			pci_dev_put(dev);
-		else {
-			dev = pci_scan_single_device(bus, temp->devfn);
-			if (dev) {
-				dbg("New device on %s function %x:%x\n",
-					bus->name, temp->devfn >> 3,
-					temp->devfn & 7);
-				count++;
-			}
-		}
-		/* multifunction device? */
-		if (!(hdr_type & 0x80))
-			return count;
-
-		/* continue scanning for other functions */
-		for (func = 1, temp->devfn++; func < 8; func++, temp->devfn++) {
-			if (pci_read_config_byte(temp, PCI_HEADER_TYPE, &hdr_type))
-				continue;
-			temp->hdr_type = hdr_type & 0x7f;
-
-			if ((dev = pci_get_slot(bus, temp->devfn)) != NULL)
-				pci_dev_put(dev);
-			else {
-				dev = pci_scan_single_device(bus, temp->devfn);
-				if (dev) {
-					dbg("New device on %s function %x:%x\n",
-						bus->name, temp->devfn >> 3,
-						temp->devfn & 7);
-					count++;
-				}
-			}
-		}
-	}
-
-	return count;
-}
-
-
-/**
- * pci_rescan_bus_local - fakephp version of rescan PCI bus
- * @bus: the PCI bus to rescan
- *
- * Call pci_rescan_slot for each possible function of the bus.
- */
-static void pci_rescan_bus_local(const struct pci_bus *bus)
-{
-	unsigned int devfn;
-	struct pci_dev *dev;
-	int retval;
-	int found = 0;
-	dev = alloc_pci_dev();
-	if (!dev)
-		return;
-
-	dev->bus = (struct pci_bus*)bus;
-	dev->sysdata = bus->sysdata;
-	for (devfn = 0; devfn < 0x100; devfn += 8) {
-		dev->devfn = devfn;
-		found += pci_rescan_slot(dev);
-	}
-
-	if (found) {
-		pci_bus_assign_resources(bus);
-		list_for_each_entry(dev, &bus->devices, bus_list) {
-			/* Skip already-added devices */
-			if (dev->is_added)
-					continue;
-			retval = pci_bus_add_device(dev);
-			if (retval)
-				dev_err(&dev->dev,
-					"Error adding device, continuing\n");
-			else
-				add_slot(dev);
-		}
-		pci_bus_add_devices(bus);
-	}
-	kfree(dev);
-}
-
-/* recursively scan all buses */
-static void pci_rescan_buses(const struct list_head *list)
-{
-	const struct list_head *l;
-	list_for_each(l,list) {
-		const struct pci_bus *b = pci_bus_b(l);
-		pci_rescan_bus_local(b);
-		pci_rescan_buses(&b->children);
-	}
-}
-
-/* initiate rescan of all pci buses */
-static inline void pci_rescan(void) {
-	pci_rescan_buses(&pci_root_buses);
-}
-
-/* called from the single-threaded workqueue handler to rescan all pci buses */
-static void pci_rescan_worker(struct work_struct *work)
-{
-	pci_rescan();
-}
-
-static int enable_slot(struct hotplug_slot *hotplug_slot)
-{
-	/* mis-use enable_slot for rescanning of the pci bus */
-	cancel_work_sync(&pci_rescan_work);
-	queue_work(dummyphp_wq, &pci_rescan_work);
-	return 0;
-}
-
-static int disable_slot(struct hotplug_slot *slot)
-{
-	struct dummy_slot *dslot;
-	struct pci_dev *dev;
-	int func;
-
-	if (!slot)
-		return -ENODEV;
-	dslot = slot->private;
-
-	dbg("%s - physical_slot = %s\n", __func__, hotplug_slot_name(slot));
-
-	for (func = 7; func >= 0; func--) {
-		dev = pci_get_slot(dslot->dev->bus, dslot->dev->devfn + func);
-		if (!dev)
-			continue;
-
-		if (test_and_set_bit(0, &dslot->removed)) {
-			dbg("Slot already scheduled for removal\n");
-			pci_dev_put(dev);
-			return -ENODEV;
-		}
-
-		/* remove the device from the pci core */
-		pci_remove_bus_device(dev);
-
-		/* queue work item to blow away this sysfs entry and other
-		 * parts.
-		 */
-		INIT_WORK(&dslot->remove_work, remove_slot_worker);
-		queue_work(dummyphp_wq, &dslot->remove_work);
-
-		pci_dev_put(dev);
-	}
-	return 0;
-}
-
-static void cleanup_slots (void)
-{
-	struct list_head *tmp;
-	struct list_head *next;
-	struct dummy_slot *dslot;
-
-	destroy_workqueue(dummyphp_wq);
-	list_for_each_safe (tmp, next, &slot_list) {
-		dslot = list_entry (tmp, struct dummy_slot, node);
-		remove_slot(dslot);
-	}
-	
-}
-
-static int __init dummyphp_init(void)
-{
-	info(DRIVER_DESC "\n");
-
-	dummyphp_wq = create_singlethread_workqueue(MY_NAME);
-	if (!dummyphp_wq)
-		return -ENOMEM;
-
-	return pci_scan_buses();
-}
-
-
-static void __exit dummyphp_exit(void)
-{
-	cleanup_slots();
-}
-
-module_init(dummyphp_init);
-module_exit(dummyphp_exit);
-
-MODULE_AUTHOR(DRIVER_AUTHOR);
-MODULE_DESCRIPTION(DRIVER_DESC);
-MODULE_LICENSE("GPL");
-module_param(debug, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(debug, "Debugging mode enabled or not");
-module_param(dup_slots, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(dup_slots, "Force duplicate slot names for debugging");
diff --git a/drivers/pci/hotplug/legacy_fakephp.c b/drivers/pci/hotplug/legacy_fakephp.c
new file mode 100644
index 00000000000..2dc7828df48
--- /dev/null
+++ b/drivers/pci/hotplug/legacy_fakephp.c
@@ -0,0 +1,162 @@
+/* Works like the fakephp driver used to, except a little better.
+ *
+ * - It's possible to remove devices with subordinate busses.
+ * - New PCI devices that appear via any method, not just a fakephp triggered
+ *   rescan, will be noticed.
+ * - Devices that are removed via any method, not just a fakephp triggered
+ *   removal, will also be noticed.
+ *
+ * Uses nothing from the pci-hotplug subsystem.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include "../pci.h"
+
+struct legacy_slot {
+	struct kobject		kobj;
+	struct pci_dev		*dev;
+	struct list_head	list;
+};
+
+static LIST_HEAD(legacy_list);
+
+static ssize_t legacy_show(struct kobject *kobj, struct attribute *attr,
+			   char *buf)
+{
+	struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj);
+	strcpy(buf, "1\n");
+	return 2;
+}
+
+static void remove_callback(void *data)
+{
+	pci_remove_bus_device((struct pci_dev *)data);
+}
+
+static ssize_t legacy_store(struct kobject *kobj, struct attribute *attr,
+			    const char *buf, size_t len)
+{
+	struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj);
+	unsigned long val;
+
+	if (strict_strtoul(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (val)
+		pci_rescan_bus(slot->dev->bus);
+	else
+		sysfs_schedule_callback(&slot->dev->dev.kobj, remove_callback,
+					slot->dev, THIS_MODULE);
+	return len;
+}
+
+static struct attribute *legacy_attrs[] = {
+	&(struct attribute){ .name = "power", .mode = 0644 },
+	NULL,
+};
+
+static void legacy_release(struct kobject *kobj)
+{
+	struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj);
+
+	pci_dev_put(slot->dev);
+	kfree(slot);
+}
+
+static struct kobj_type legacy_ktype = {
+	.sysfs_ops = &(struct sysfs_ops){
+		.store = legacy_store, .show = legacy_show
+	},
+	.release = &legacy_release,
+	.default_attrs = legacy_attrs,
+};
+
+static int legacy_add_slot(struct pci_dev *pdev)
+{
+	struct legacy_slot *slot = kzalloc(sizeof(*slot), GFP_KERNEL);
+
+	if (!slot)
+		return -ENOMEM;
+
+	if (kobject_init_and_add(&slot->kobj, &legacy_ktype,
+				 &pci_slots_kset->kobj, "%s",
+				 pdev->dev.bus_id)) {
+		dev_warn(&pdev->dev, "Failed to created legacy fake slot\n");
+		return -EINVAL;
+	}
+	slot->dev = pci_dev_get(pdev);
+
+	list_add(&slot->list, &legacy_list);
+
+	return 0;
+}
+
+static int legacy_notify(struct notifier_block *nb,
+			 unsigned long action, void *data)
+{
+	struct pci_dev *pdev = to_pci_dev(data);
+
+	if (action == BUS_NOTIFY_ADD_DEVICE) {
+		legacy_add_slot(pdev);
+	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
+		struct legacy_slot *slot;
+
+		list_for_each_entry(slot, &legacy_list, list)
+			if (slot->dev == pdev)
+				goto found;
+
+		dev_warn(&pdev->dev, "Missing legacy fake slot?");
+		return -ENODEV;
+found:
+		kobject_del(&slot->kobj);
+		list_del(&slot->list);
+		kobject_put(&slot->kobj);
+	}
+
+	return 0;
+}
+
+static struct notifier_block legacy_notifier = {
+	.notifier_call = legacy_notify
+};
+
+static int __init init_legacy(void)
+{
+	struct pci_dev *pdev = NULL;
+
+	/* Add existing devices */
+	while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)))
+		legacy_add_slot(pdev);
+
+	/* Be alerted of any new ones */
+	bus_register_notifier(&pci_bus_type, &legacy_notifier);
+	return 0;
+}
+module_init(init_legacy);
+
+static void __exit remove_legacy(void)
+{
+	struct legacy_slot *slot, *tmp;
+
+	bus_unregister_notifier(&pci_bus_type, &legacy_notifier);
+
+	list_for_each_entry_safe(slot, tmp, &legacy_list, list) {
+		list_del(&slot->list);
+		kobject_del(&slot->kobj);
+		kobject_put(&slot->kobj);
+	}
+}
+module_exit(remove_legacy);
+
+
+MODULE_AUTHOR("Trent Piepho <xyzzy@speakeasy.org>");
+MODULE_DESCRIPTION("Legacy version of the fakephp interface");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 8ffd25454738fb9ed76ee18cc0f180fb0b360401 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Fri, 20 Mar 2009 14:56:51 -0600
Subject: PCI Hotplug: rename legacy_fakephp to fakephp

We wanted to replace fakephp wholesale, so rename legacy_fakephp back
to fakephp. Yes, this is a silly commit, but it produces a much easier
patch to read and review.

Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/Makefile         |   2 +-
 drivers/pci/hotplug/fakephp.c        | 162 +++++++++++++++++++++++++++++++++++
 drivers/pci/hotplug/legacy_fakephp.c | 162 -----------------------------------
 3 files changed, 163 insertions(+), 163 deletions(-)
 create mode 100644 drivers/pci/hotplug/fakephp.c
 delete mode 100644 drivers/pci/hotplug/legacy_fakephp.c

diff --git a/drivers/pci/hotplug/Makefile b/drivers/pci/hotplug/Makefile
index 3314fe88150..2aa117c8cd8 100644
--- a/drivers/pci/hotplug/Makefile
+++ b/drivers/pci/hotplug/Makefile
@@ -20,7 +20,7 @@ obj-$(CONFIG_HOTPLUG_PCI_RPA_DLPAR)	+= rpadlpar_io.o
 obj-$(CONFIG_HOTPLUG_PCI_SGI)		+= sgi_hotplug.o
 
 # Link this last so it doesn't claim devices that have a real hotplug driver
-obj-$(CONFIG_HOTPLUG_PCI_FAKE)		+= legacy_fakephp.o
+obj-$(CONFIG_HOTPLUG_PCI_FAKE)		+= fakephp.o
 
 pci_hotplug-objs	:=	pci_hotplug_core.o
 
diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c
new file mode 100644
index 00000000000..2dc7828df48
--- /dev/null
+++ b/drivers/pci/hotplug/fakephp.c
@@ -0,0 +1,162 @@
+/* Works like the fakephp driver used to, except a little better.
+ *
+ * - It's possible to remove devices with subordinate busses.
+ * - New PCI devices that appear via any method, not just a fakephp triggered
+ *   rescan, will be noticed.
+ * - Devices that are removed via any method, not just a fakephp triggered
+ *   removal, will also be noticed.
+ *
+ * Uses nothing from the pci-hotplug subsystem.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include "../pci.h"
+
+struct legacy_slot {
+	struct kobject		kobj;
+	struct pci_dev		*dev;
+	struct list_head	list;
+};
+
+static LIST_HEAD(legacy_list);
+
+static ssize_t legacy_show(struct kobject *kobj, struct attribute *attr,
+			   char *buf)
+{
+	struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj);
+	strcpy(buf, "1\n");
+	return 2;
+}
+
+static void remove_callback(void *data)
+{
+	pci_remove_bus_device((struct pci_dev *)data);
+}
+
+static ssize_t legacy_store(struct kobject *kobj, struct attribute *attr,
+			    const char *buf, size_t len)
+{
+	struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj);
+	unsigned long val;
+
+	if (strict_strtoul(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (val)
+		pci_rescan_bus(slot->dev->bus);
+	else
+		sysfs_schedule_callback(&slot->dev->dev.kobj, remove_callback,
+					slot->dev, THIS_MODULE);
+	return len;
+}
+
+static struct attribute *legacy_attrs[] = {
+	&(struct attribute){ .name = "power", .mode = 0644 },
+	NULL,
+};
+
+static void legacy_release(struct kobject *kobj)
+{
+	struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj);
+
+	pci_dev_put(slot->dev);
+	kfree(slot);
+}
+
+static struct kobj_type legacy_ktype = {
+	.sysfs_ops = &(struct sysfs_ops){
+		.store = legacy_store, .show = legacy_show
+	},
+	.release = &legacy_release,
+	.default_attrs = legacy_attrs,
+};
+
+static int legacy_add_slot(struct pci_dev *pdev)
+{
+	struct legacy_slot *slot = kzalloc(sizeof(*slot), GFP_KERNEL);
+
+	if (!slot)
+		return -ENOMEM;
+
+	if (kobject_init_and_add(&slot->kobj, &legacy_ktype,
+				 &pci_slots_kset->kobj, "%s",
+				 pdev->dev.bus_id)) {
+		dev_warn(&pdev->dev, "Failed to created legacy fake slot\n");
+		return -EINVAL;
+	}
+	slot->dev = pci_dev_get(pdev);
+
+	list_add(&slot->list, &legacy_list);
+
+	return 0;
+}
+
+static int legacy_notify(struct notifier_block *nb,
+			 unsigned long action, void *data)
+{
+	struct pci_dev *pdev = to_pci_dev(data);
+
+	if (action == BUS_NOTIFY_ADD_DEVICE) {
+		legacy_add_slot(pdev);
+	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
+		struct legacy_slot *slot;
+
+		list_for_each_entry(slot, &legacy_list, list)
+			if (slot->dev == pdev)
+				goto found;
+
+		dev_warn(&pdev->dev, "Missing legacy fake slot?");
+		return -ENODEV;
+found:
+		kobject_del(&slot->kobj);
+		list_del(&slot->list);
+		kobject_put(&slot->kobj);
+	}
+
+	return 0;
+}
+
+static struct notifier_block legacy_notifier = {
+	.notifier_call = legacy_notify
+};
+
+static int __init init_legacy(void)
+{
+	struct pci_dev *pdev = NULL;
+
+	/* Add existing devices */
+	while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)))
+		legacy_add_slot(pdev);
+
+	/* Be alerted of any new ones */
+	bus_register_notifier(&pci_bus_type, &legacy_notifier);
+	return 0;
+}
+module_init(init_legacy);
+
+static void __exit remove_legacy(void)
+{
+	struct legacy_slot *slot, *tmp;
+
+	bus_unregister_notifier(&pci_bus_type, &legacy_notifier);
+
+	list_for_each_entry_safe(slot, tmp, &legacy_list, list) {
+		list_del(&slot->list);
+		kobject_del(&slot->kobj);
+		kobject_put(&slot->kobj);
+	}
+}
+module_exit(remove_legacy);
+
+
+MODULE_AUTHOR("Trent Piepho <xyzzy@speakeasy.org>");
+MODULE_DESCRIPTION("Legacy version of the fakephp interface");
+MODULE_LICENSE("GPL");
diff --git a/drivers/pci/hotplug/legacy_fakephp.c b/drivers/pci/hotplug/legacy_fakephp.c
deleted file mode 100644
index 2dc7828df48..00000000000
--- a/drivers/pci/hotplug/legacy_fakephp.c
+++ /dev/null
@@ -1,162 +0,0 @@
-/* Works like the fakephp driver used to, except a little better.
- *
- * - It's possible to remove devices with subordinate busses.
- * - New PCI devices that appear via any method, not just a fakephp triggered
- *   rescan, will be noticed.
- * - Devices that are removed via any method, not just a fakephp triggered
- *   removal, will also be noticed.
- *
- * Uses nothing from the pci-hotplug subsystem.
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include "../pci.h"
-
-struct legacy_slot {
-	struct kobject		kobj;
-	struct pci_dev		*dev;
-	struct list_head	list;
-};
-
-static LIST_HEAD(legacy_list);
-
-static ssize_t legacy_show(struct kobject *kobj, struct attribute *attr,
-			   char *buf)
-{
-	struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj);
-	strcpy(buf, "1\n");
-	return 2;
-}
-
-static void remove_callback(void *data)
-{
-	pci_remove_bus_device((struct pci_dev *)data);
-}
-
-static ssize_t legacy_store(struct kobject *kobj, struct attribute *attr,
-			    const char *buf, size_t len)
-{
-	struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj);
-	unsigned long val;
-
-	if (strict_strtoul(buf, 0, &val) < 0)
-		return -EINVAL;
-
-	if (val)
-		pci_rescan_bus(slot->dev->bus);
-	else
-		sysfs_schedule_callback(&slot->dev->dev.kobj, remove_callback,
-					slot->dev, THIS_MODULE);
-	return len;
-}
-
-static struct attribute *legacy_attrs[] = {
-	&(struct attribute){ .name = "power", .mode = 0644 },
-	NULL,
-};
-
-static void legacy_release(struct kobject *kobj)
-{
-	struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj);
-
-	pci_dev_put(slot->dev);
-	kfree(slot);
-}
-
-static struct kobj_type legacy_ktype = {
-	.sysfs_ops = &(struct sysfs_ops){
-		.store = legacy_store, .show = legacy_show
-	},
-	.release = &legacy_release,
-	.default_attrs = legacy_attrs,
-};
-
-static int legacy_add_slot(struct pci_dev *pdev)
-{
-	struct legacy_slot *slot = kzalloc(sizeof(*slot), GFP_KERNEL);
-
-	if (!slot)
-		return -ENOMEM;
-
-	if (kobject_init_and_add(&slot->kobj, &legacy_ktype,
-				 &pci_slots_kset->kobj, "%s",
-				 pdev->dev.bus_id)) {
-		dev_warn(&pdev->dev, "Failed to created legacy fake slot\n");
-		return -EINVAL;
-	}
-	slot->dev = pci_dev_get(pdev);
-
-	list_add(&slot->list, &legacy_list);
-
-	return 0;
-}
-
-static int legacy_notify(struct notifier_block *nb,
-			 unsigned long action, void *data)
-{
-	struct pci_dev *pdev = to_pci_dev(data);
-
-	if (action == BUS_NOTIFY_ADD_DEVICE) {
-		legacy_add_slot(pdev);
-	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
-		struct legacy_slot *slot;
-
-		list_for_each_entry(slot, &legacy_list, list)
-			if (slot->dev == pdev)
-				goto found;
-
-		dev_warn(&pdev->dev, "Missing legacy fake slot?");
-		return -ENODEV;
-found:
-		kobject_del(&slot->kobj);
-		list_del(&slot->list);
-		kobject_put(&slot->kobj);
-	}
-
-	return 0;
-}
-
-static struct notifier_block legacy_notifier = {
-	.notifier_call = legacy_notify
-};
-
-static int __init init_legacy(void)
-{
-	struct pci_dev *pdev = NULL;
-
-	/* Add existing devices */
-	while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)))
-		legacy_add_slot(pdev);
-
-	/* Be alerted of any new ones */
-	bus_register_notifier(&pci_bus_type, &legacy_notifier);
-	return 0;
-}
-module_init(init_legacy);
-
-static void __exit remove_legacy(void)
-{
-	struct legacy_slot *slot, *tmp;
-
-	bus_unregister_notifier(&pci_bus_type, &legacy_notifier);
-
-	list_for_each_entry_safe(slot, tmp, &legacy_list, list) {
-		list_del(&slot->list);
-		kobject_del(&slot->kobj);
-		kobject_put(&slot->kobj);
-	}
-}
-module_exit(remove_legacy);
-
-
-MODULE_AUTHOR("Trent Piepho <xyzzy@speakeasy.org>");
-MODULE_DESCRIPTION("Legacy version of the fakephp interface");
-MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From f110ca489c9b7cf3f6c9656e383e787f3aee217f Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Fri, 20 Mar 2009 14:56:56 -0600
Subject: PCI Hotplug: schedule fakephp for feature removal

Now that the PCI core is capable of function-level remove and rescan
as well as bus-level rescan, there's no functional need to keep fakephp
anymore.

We keep it around for userspace compatibility reasons, schedule removal
in three years.

Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/feature-removal-schedule.txt | 33 ++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 20d3b94703a..8851eea06a0 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -335,6 +335,7 @@ Why:	In 2.6.18 the Secmark concept was introduced to replace the "compat_net"
 	Secmark, it is time to deprecate the older mechanism and start the
 	process of removing the old code.
 Who:	Paul Moore <paul.moore@hp.com>
+
 ---------------------------
 
 What:	sysfs ui for changing p4-clockmod parameters
@@ -344,3 +345,35 @@ Why:	See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and
 	Removal is subject to fixing any remaining bugs in ACPI which may
 	cause the thermal throttling not to happen at the right time.
 Who:	Dave Jones <davej@redhat.com>, Matthew Garrett <mjg@redhat.com>
+
+---------------------------
+
+What:	fakephp and associated sysfs files in /sys/bus/pci/slots/
+When:	2011
+Why:	In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to
+	represent a machine's physical PCI slots. The change in semantics
+	had userspace implications, as the hotplug core no longer allowed
+	drivers to create multiple sysfs files per physical slot (required
+	for multi-function devices, e.g.). fakephp was seen as a developer's
+	tool only, and its interface changed. Too late, we learned that
+	there were some users of the fakephp interface.
+
+	In 2.6.30, the original fakephp interface was restored. At the same
+	time, the PCI core gained the ability that fakephp provided, namely
+	function-level hot-remove and hot-add.
+
+	Since the PCI core now provides the same functionality, exposed in:
+
+		/sys/bus/pci/rescan
+		/sys/bus/pci/devices/.../remove
+		/sys/bus/pci/devices/.../rescan
+
+	there is no functional reason to maintain fakephp as well.
+
+	We will keep the existing module so that 'modprobe fakephp' will
+	present the old /sys/bus/pci/slots/... interface for compatibility,
+	but users are urged to migrate their applications to the API above.
+
+	After a reasonable transition period, we will remove the legacy
+	fakephp interface.
+Who:	Alex Chiang <achiang@hp.com>
-- 
cgit v1.2.3


From 9fa8cfe706f9c20067c042a064999d5825a35330 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 13 Mar 2009 10:24:59 -0400
Subject: Btrfs: don't preallocate metadata blocks during btrfs_search_slot

In order to avoid doing expensive extent management with tree locks held,
btrfs_search_slot will preallocate tree blocks for use by COW without
any tree locks held.

A later commit moves all of the extent allocation work for COW into
a delayed update mechanism, and this preallocation will no longer be
required.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 100 +++++++++----------------------------------------
 fs/btrfs/ctree.h       |   2 +-
 fs/btrfs/transaction.c |   4 +-
 3 files changed, 21 insertions(+), 85 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 37f31b5529a..87c90387283 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
  * empty_size -- a hint that you plan on doing more cow.  This is the size in
  * bytes the allocator should try to find free next to the block it returns.
  * This is just a hint and may be ignored by the allocator.
- *
- * prealloc_dest -- if you have already reserved a destination for the cow,
- * this uses that block instead of allocating a new one.
- * btrfs_alloc_reserved_extent is used to finish the allocation.
  */
 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
 			     struct extent_buffer **cow_ret,
-			     u64 search_start, u64 empty_size,
-			     u64 prealloc_dest)
+			     u64 search_start, u64 empty_size)
 {
 	u64 parent_start;
 	struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
 
-	if (prealloc_dest) {
-		struct btrfs_key ins;
-
-		ins.objectid = prealloc_dest;
-		ins.offset = buf->len;
-		ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-		ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
-						  root->root_key.objectid,
-						  trans->transid, level, &ins);
-		BUG_ON(ret);
-		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
-					    buf->len, level);
-	} else {
-		cow = btrfs_alloc_free_block(trans, root, buf->len,
-					     parent_start,
-					     root->root_key.objectid,
-					     trans->transid, level,
-					     search_start, empty_size);
-	}
+	cow = btrfs_alloc_free_block(trans, root, buf->len,
+				     parent_start, root->root_key.objectid,
+				     trans->transid, level,
+				     search_start, empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
-		    struct extent_buffer **cow_ret, u64 prealloc_dest)
+		    struct extent_buffer **cow_ret)
 {
 	u64 search_start;
 	int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	    btrfs_header_owner(buf) == root->root_key.objectid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 		*cow_ret = buf;
-		WARN_ON(prealloc_dest);
 		return 0;
 	}
 
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	btrfs_set_lock_blocking(buf);
 
 	ret = __btrfs_cow_block(trans, root, buf, parent,
-				 parent_slot, cow_ret, search_start, 0,
-				 prealloc_dest);
+				 parent_slot, cow_ret, search_start, 0);
 	return ret;
 }
 
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		err = __btrfs_cow_block(trans, root, cur, parent, i,
 					&cur, search_start,
 					min(16 * blocksize,
-					    (end_slot - i) * blocksize), 0);
+					    (end_slot - i) * blocksize));
 		if (err) {
 			btrfs_tree_unlock(cur);
 			free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		BUG_ON(!child);
 		btrfs_tree_lock(child);
 		btrfs_set_lock_blocking(child);
-		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
+		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
 		BUG_ON(ret);
 
 		spin_lock(&root->node_lock);
@@ -979,7 +956,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_lock(left);
 		btrfs_set_lock_blocking(left);
 		wret = btrfs_cow_block(trans, root, left,
-				       parent, pslot - 1, &left, 0);
+				       parent, pslot - 1, &left);
 		if (wret) {
 			ret = wret;
 			goto enospc;
@@ -990,7 +967,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_lock(right);
 		btrfs_set_lock_blocking(right);
 		wret = btrfs_cow_block(trans, root, right,
-				       parent, pslot + 1, &right, 0);
+				       parent, pslot + 1, &right);
 		if (wret) {
 			ret = wret;
 			goto enospc;
@@ -1171,7 +1148,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 			wret = 1;
 		} else {
 			ret = btrfs_cow_block(trans, root, left, parent,
-					      pslot - 1, &left, 0);
+					      pslot - 1, &left);
 			if (ret)
 				wret = 1;
 			else {
@@ -1222,7 +1199,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		} else {
 			ret = btrfs_cow_block(trans, root, right,
 					      parent, pslot + 1,
-					      &right, 0);
+					      &right);
 			if (ret)
 				wret = 1;
 			else {
@@ -1492,7 +1469,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	u8 lowest_level = 0;
 	u64 blocknr;
 	u64 gen;
-	struct btrfs_key prealloc_block;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1477,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ins_len < 0)
 		lowest_unlock = 2;
 
-	prealloc_block.objectid = 0;
-
 again:
 	if (p->skip_locking)
 		b = btrfs_root_node(root);
@@ -1529,44 +1503,11 @@ again:
 			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
 				goto cow_done;
 			}
-
-			/* ok, we have to cow, is our old prealloc the right
-			 * size?
-			 */
-			if (prealloc_block.objectid &&
-			    prealloc_block.offset != b->len) {
-				btrfs_release_path(root, p);
-				btrfs_free_reserved_extent(root,
-					   prealloc_block.objectid,
-					   prealloc_block.offset);
-				prealloc_block.objectid = 0;
-				goto again;
-			}
-
-			/*
-			 * for higher level blocks, try not to allocate blocks
-			 * with the block and the parent locks held.
-			 */
-			if (level > 0 && !prealloc_block.objectid) {
-				u32 size = b->len;
-				u64 hint = b->start;
-
-				btrfs_release_path(root, p);
-				ret = btrfs_reserve_extent(trans, root,
-							   size, size, 0,
-							   hint, (u64)-1,
-							   &prealloc_block, 0);
-				BUG_ON(ret);
-				goto again;
-			}
-
 			btrfs_set_path_blocking(p);
 
 			wret = btrfs_cow_block(trans, root, b,
 					       p->nodes[level + 1],
-					       p->slots[level + 1],
-					       &b, prealloc_block.objectid);
-			prealloc_block.objectid = 0;
+					       p->slots[level + 1], &b);
 			if (wret) {
 				free_extent_buffer(b);
 				ret = wret;
@@ -1743,11 +1684,6 @@ done:
 	 * from here on, so for now just mark it as blocking
 	 */
 	btrfs_set_path_blocking(p);
-	if (prealloc_block.objectid) {
-		btrfs_free_reserved_extent(root,
-			   prealloc_block.objectid,
-			   prealloc_block.offset);
-	}
 	return ret;
 }
 
@@ -1768,7 +1704,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 	int ret;
 
 	eb = btrfs_lock_root_node(root);
-	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
 	BUG_ON(ret);
 
 	btrfs_set_lock_blocking(eb);
@@ -1826,7 +1762,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 			}
 
 			ret = btrfs_cow_block(trans, root, eb, parent, slot,
-					      &eb, 0);
+					      &eb);
 			BUG_ON(ret);
 
 			if (root->root_key.objectid ==
@@ -2377,7 +2313,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	/* cow and double check */
 	ret = btrfs_cow_block(trans, root, right, upper,
-			      slot + 1, &right, 0);
+			      slot + 1, &right);
 	if (ret)
 		goto out_unlock;
 
@@ -2576,7 +2512,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	/* cow and double check */
 	ret = btrfs_cow_block(trans, root, left,
-			      path->nodes[1], slot - 1, &left, 0);
+			      path->nodes[1], slot - 1, &left);
 	if (ret) {
 		/* we hit -ENOSPC, but it isn't fatal here */
 		ret = 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e1d4e30e9d..3a37ba7a8d6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1838,7 +1838,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
-		    struct extent_buffer **cow_ret, u64 prealloc_dest);
+		    struct extent_buffer **cow_ret);
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4112d53d4f4..d638c54d39e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -463,7 +463,7 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	btrfs_extent_post_op(trans, fs_info->tree_root);
 
 	eb = btrfs_lock_root_node(fs_info->tree_root);
-	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
+	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 
@@ -766,7 +766,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
 	old = btrfs_lock_root_node(root);
-	btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
+	btrfs_cow_block(trans, root, old, NULL, 0, &old);
 
 	btrfs_copy_root(trans, root, old, &tmp, objectid);
 	btrfs_tree_unlock(old);
-- 
cgit v1.2.3


From 56bec294dea971335d4466b30f2d959f28f6e36d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 13 Mar 2009 10:10:06 -0400
Subject: Btrfs: do extent allocation and reference count updates in the
 background

The extent allocation tree maintains a reference count and full
back reference information for every extent allocated in the
filesystem.  For subvolume and snapshot trees, every time
a block goes through COW, the new copy of the block adds a reference
on every block it points to.

If a btree node points to 150 leaves, then the COW code needs to go
and add backrefs on 150 different extents, which might be spread all
over the extent allocation tree.

These updates currently happen during btrfs_cow_block, and most COWs
happen during btrfs_search_slot.  btrfs_search_slot has locks held
on both the parent and the node we are COWing, and so we really want
to avoid IO during the COW if we can.

This commit adds an rbtree of pending reference count updates and extent
allocations.  The tree is ordered by byte number of the extent and byte number
of the parent for the back reference.  The tree allows us to:

1) Modify back references in something close to disk order, reducing seeks
2) Significantly reduce the number of modifications made as block pointers
are balanced around
3) Do all of the extent insertion and back reference modifications outside
of the performance critical btrfs_search_slot code.

#3 has the added benefit of greatly reducing the btrfs stack footprint.
The extent allocation tree modifications are done without the deep
(and somewhat recursive) call chains used in the past.

These delayed back reference updates must be done before the transaction
commits, and so the rbtree is tied to the transaction.  Throttling is
implemented to help keep the queue of backrefs at a reasonable size.

Since there was a similar mechanism in place for the extent tree
extents, that is removed and replaced by the delayed reference tree.

Yan Zheng <yan.zheng@oracle.com> helped review and fixup this code.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |    2 +-
 fs/btrfs/ctree.c       |    3 +-
 fs/btrfs/ctree.h       |   12 +-
 fs/btrfs/delayed-ref.c |  585 ++++++++++++++++++
 fs/btrfs/delayed-ref.h |  182 ++++++
 fs/btrfs/disk-io.c     |   29 +-
 fs/btrfs/extent-tree.c | 1558 +++++++++++++-----------------------------------
 fs/btrfs/file.c        |    6 +-
 fs/btrfs/transaction.c |   54 +-
 fs/btrfs/transaction.h |    3 +
 fs/btrfs/tree-defrag.c |    2 -
 11 files changed, 1265 insertions(+), 1171 deletions(-)
 create mode 100644 fs/btrfs/delayed-ref.c
 create mode 100644 fs/btrfs/delayed-ref.h

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b..9adf5e4f7e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
-	   compression.o
+	   compression.o delayed-ref.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 87c90387283..bebc9fd1666 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -922,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		spin_unlock(&root->node_lock);
 
 		ret = btrfs_update_extent_ref(trans, root, child->start,
+					      child->len,
 					      mid->start, child->start,
 					      root->root_key.objectid,
 					      trans->transid, level - 1);
@@ -2075,7 +2076,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	spin_unlock(&root->node_lock);
 
 	ret = btrfs_update_extent_ref(trans, root, lower->start,
-				      lower->start, c->start,
+				      lower->len, lower->start, c->start,
 				      root->root_key.objectid,
 				      trans->transid, level - 1);
 	BUG_ON(ret);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3a37ba7a8d6..ced5fd85dc3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -688,8 +688,6 @@ struct btrfs_fs_info {
 	struct rb_root block_group_cache_tree;
 
 	struct extent_io_tree pinned_extents;
-	struct extent_io_tree pending_del;
-	struct extent_io_tree extent_ins;
 
 	/* logical->physical extent mapping */
 	struct btrfs_mapping_tree mapping_tree;
@@ -717,7 +715,6 @@ struct btrfs_fs_info {
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
-	struct mutex extent_ins_mutex;
 	struct mutex pinned_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
@@ -1704,18 +1701,15 @@ static inline struct dentry *fdentry(struct file *file)
 }
 
 /* extent-tree.c */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u32 *refs);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, u64 objectid, u64 bytenr);
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 						 struct btrfs_fs_info *info,
@@ -1777,7 +1771,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 u64 root_objectid, u64 ref_generation,
 			 u64 owner_objectid);
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
+			    struct btrfs_root *root, u64 bytenr, u64 num_bytes,
 			    u64 orig_parent, u64 parent,
 			    u64 root_objectid, u64 ref_generation,
 			    u64 owner_objectid);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 00000000000..874565a1f63
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,585 @@
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/sort.h>
+#include <linux/ftrace.h>
+#include "ctree.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+
+/*
+ * delayed back reference update tracking.  For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing.   This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ *
+ * Right now this code is only used for reference counted trees, but
+ * the long term goal is to get rid of the similar code for delayed
+ * extent tree modifications.
+ */
+
+/*
+ * entries in the rb tree are ordered by the byte number of the extent
+ * and by the byte number of the parent block.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref,
+		      u64 bytenr, u64 parent)
+{
+	if (bytenr < ref->bytenr)
+		return -1;
+	if (bytenr > ref->bytenr)
+		return 1;
+	if (parent < ref->parent)
+		return -1;
+	if (parent > ref->parent)
+		return 1;
+	return 0;
+}
+
+/*
+ * insert a new ref into the rbtree.  This returns any existing refs
+ * for the same (bytenr,parent) tuple, or NULL if the new node was properly
+ * inserted.
+ */
+static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+						  u64 bytenr, u64 parent,
+						  struct rb_node *node)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent_node = NULL;
+	struct btrfs_delayed_ref_node *entry;
+	int cmp;
+
+	while (*p) {
+		parent_node = *p;
+		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+				 rb_node);
+
+		cmp = comp_entry(entry, bytenr, parent);
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else if (cmp > 0)
+			p = &(*p)->rb_right;
+		else
+			return entry;
+	}
+
+	entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+	rb_link_node(node, parent_node, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/*
+ * find an entry based on (bytenr,parent).  This returns the delayed
+ * ref if it was able to find one, or NULL if nothing was in that spot
+ */
+static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
+						  u64 bytenr, u64 parent)
+{
+	struct rb_node *n = root->rb_node;
+	struct btrfs_delayed_ref_node *entry;
+	int cmp;
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		cmp = comp_entry(entry, bytenr, parent);
+		if (cmp < 0)
+			n = n->rb_left;
+		else if (cmp > 0)
+			n = n->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * Locking on delayed refs is done by taking a lock on the head node,
+ * which has the (impossible) parent id of (u64)-1.  Once a lock is held
+ * on the head node, you're allowed (and required) to process all the
+ * delayed refs for a given byte number in the tree.
+ *
+ * This will walk forward in the rbtree until it finds a head node it
+ * is able to lock.  It might not lock the delayed ref you asked for,
+ * and so it will return the one it did lock in next_ret and return 0.
+ *
+ * If no locks are taken, next_ret is set to null and 1 is returned.  This
+ * means there are no more unlocked head nodes in the rbtree.
+ */
+int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_delayed_ref_node *ref,
+			   struct btrfs_delayed_ref_head **next_ret)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_head *head;
+	int ret = 0;
+
+	while (1) {
+		if (btrfs_delayed_ref_is_head(ref)) {
+			head = btrfs_delayed_node_to_head(ref);
+			if (mutex_trylock(&head->mutex)) {
+				*next_ret = head;
+				ret = 0;
+				break;
+			}
+		}
+		node = rb_next(&ref->rb_node);
+		if (!node) {
+			ret = 1;
+			*next_ret = NULL;
+			break;
+		}
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+	}
+	return ret;
+}
+
+/*
+ * This checks to see if there are any delayed refs in the
+ * btree for a given bytenr.  It returns one if it finds any
+ * and zero otherwise.
+ *
+ * If it only finds a head node, it returns 0.
+ *
+ * The idea is to use this when deciding if you can safely delete an
+ * extent from the extent allocation tree.  There may be a pending
+ * ref in the rbtree that adds or removes references, so as long as this
+ * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
+ * allocation tree.
+ */
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct rb_node *prev_node;
+	int ret = 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+	if (ref) {
+		prev_node = rb_prev(&ref->rb_node);
+		if (!prev_node)
+			goto out;
+		ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
+			       rb_node);
+		if (ref->bytenr == bytenr)
+			ret = 1;
+	}
+out:
+	spin_unlock(&delayed_refs->lock);
+	return ret;
+}
+
+/*
+ * helper function to lookup reference count
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree.  This way you
+ * can check to see what the reference count would be if all of the
+ * delayed refs are processed.
+ */
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 num_bytes, u32 *refs)
+{
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	struct btrfs_key key;
+	u32 num_refs;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+	delayed_refs = &trans->transaction->delayed_refs;
+again:
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+				&key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	if (ret == 0) {
+		leaf = path->nodes[0];
+		ei = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_extent_item);
+		num_refs = btrfs_extent_refs(leaf, ei);
+	} else {
+		num_refs = 0;
+		ret = 0;
+	}
+
+	spin_lock(&delayed_refs->lock);
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+	if (ref) {
+		head = btrfs_delayed_node_to_head(ref);
+		if (mutex_trylock(&head->mutex)) {
+			num_refs += ref->ref_mod;
+			mutex_unlock(&head->mutex);
+			*refs = num_refs;
+			goto out;
+		}
+
+		atomic_inc(&ref->refs);
+		spin_unlock(&delayed_refs->lock);
+
+		btrfs_release_path(root->fs_info->extent_root, path);
+
+		mutex_lock(&head->mutex);
+		mutex_unlock(&head->mutex);
+		btrfs_put_delayed_ref(ref);
+		goto again;
+	} else {
+		*refs = num_refs;
+	}
+out:
+	spin_unlock(&delayed_refs->lock);
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper function to update an extent delayed ref in the
+ * rbtree.  existing and update must both have the same
+ * bytenr and parent
+ *
+ * This may free existing if the update cancels out whatever
+ * operation it was doing.
+ */
+static noinline void
+update_existing_ref(struct btrfs_trans_handle *trans,
+		    struct btrfs_delayed_ref_root *delayed_refs,
+		    struct btrfs_delayed_ref_node *existing,
+		    struct btrfs_delayed_ref_node *update)
+{
+	struct btrfs_delayed_ref *existing_ref;
+	struct btrfs_delayed_ref *ref;
+
+	existing_ref = btrfs_delayed_node_to_ref(existing);
+	ref = btrfs_delayed_node_to_ref(update);
+
+	if (ref->pin)
+		existing_ref->pin = 1;
+
+	if (ref->action != existing_ref->action) {
+		/*
+		 * this is effectively undoing either an add or a
+		 * drop.  We decrement the ref_mod, and if it goes
+		 * down to zero we just delete the entry without
+		 * every changing the extent allocation tree.
+		 */
+		existing->ref_mod--;
+		if (existing->ref_mod == 0) {
+			rb_erase(&existing->rb_node,
+				 &delayed_refs->root);
+			existing->in_tree = 0;
+			btrfs_put_delayed_ref(existing);
+			delayed_refs->num_entries--;
+			if (trans->delayed_ref_updates)
+				trans->delayed_ref_updates--;
+		}
+	} else {
+		if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
+			/* if we're adding refs, make sure all the
+			 * details match up.  The extent could
+			 * have been totally freed and reallocated
+			 * by a different owner before the delayed
+			 * ref entries were removed.
+			 */
+			existing_ref->owner_objectid = ref->owner_objectid;
+			existing_ref->generation = ref->generation;
+			existing_ref->root = ref->root;
+			existing->num_bytes = update->num_bytes;
+		}
+		/*
+		 * the action on the existing ref matches
+		 * the action on the ref we're trying to add.
+		 * Bump the ref_mod by one so the backref that
+		 * is eventually added/removed has the correct
+		 * reference count
+		 */
+		existing->ref_mod += update->ref_mod;
+	}
+}
+
+/*
+ * helper function to update the accounting in the head ref
+ * existing and update must have the same bytenr
+ */
+static noinline void
+update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+			 struct btrfs_delayed_ref_node *update)
+{
+	struct btrfs_delayed_ref_head *existing_ref;
+	struct btrfs_delayed_ref_head *ref;
+
+	existing_ref = btrfs_delayed_node_to_head(existing);
+	ref = btrfs_delayed_node_to_head(update);
+
+	if (ref->must_insert_reserved) {
+		/* if the extent was freed and then
+		 * reallocated before the delayed ref
+		 * entries were processed, we can end up
+		 * with an existing head ref without
+		 * the must_insert_reserved flag set.
+		 * Set it again here
+		 */
+		existing_ref->must_insert_reserved = ref->must_insert_reserved;
+
+		/*
+		 * update the num_bytes so we make sure the accounting
+		 * is done correctly
+		 */
+		existing->num_bytes = update->num_bytes;
+
+	}
+
+	/*
+	 * update the reference mod on the head to reflect this new operation
+	 */
+	existing->ref_mod += update->ref_mod;
+}
+
+/*
+ * helper function to actually insert a delayed ref into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count in the head node and properly dealing
+ * with updating existing nodes as new modifications are queued.
+ */
+static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  struct btrfs_delayed_ref_node *ref,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin)
+{
+	struct btrfs_delayed_ref_node *existing;
+	struct btrfs_delayed_ref *full_ref;
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int count_mod = 1;
+	int must_insert_reserved = 0;
+
+	/*
+	 * the head node stores the sum of all the mods, so dropping a ref
+	 * should drop the sum in the head node by one.
+	 */
+	if (parent == (u64)-1 && action == BTRFS_DROP_DELAYED_REF)
+		count_mod = -1;
+
+	/*
+	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
+	 * the reserved accounting when the extent is finally added, or
+	 * if a later modification deletes the delayed ref without ever
+	 * inserting the extent into the extent allocation tree.
+	 * ref->must_insert_reserved is the flag used to record
+	 * that accounting mods are required.
+	 *
+	 * Once we record must_insert_reserved, switch the action to
+	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
+	 */
+	if (action == BTRFS_ADD_DELAYED_EXTENT) {
+		must_insert_reserved = 1;
+		action = BTRFS_ADD_DELAYED_REF;
+	} else {
+		must_insert_reserved = 0;
+	}
+
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	/* first set the basic ref node struct up */
+	atomic_set(&ref->refs, 1);
+	ref->bytenr = bytenr;
+	ref->parent = parent;
+	ref->ref_mod = count_mod;
+	ref->in_tree = 1;
+	ref->num_bytes = num_bytes;
+
+	if (btrfs_delayed_ref_is_head(ref)) {
+		head_ref = btrfs_delayed_node_to_head(ref);
+		head_ref->must_insert_reserved = must_insert_reserved;
+		mutex_init(&head_ref->mutex);
+	} else {
+		full_ref = btrfs_delayed_node_to_ref(ref);
+		full_ref->root = ref_root;
+		full_ref->generation = ref_generation;
+		full_ref->owner_objectid = owner_objectid;
+		full_ref->pin = pin;
+		full_ref->action = action;
+	}
+
+	existing = tree_insert(&delayed_refs->root, bytenr,
+			       parent, &ref->rb_node);
+
+	if (existing) {
+		if (btrfs_delayed_ref_is_head(ref))
+			update_existing_head_ref(existing, ref);
+		else
+			update_existing_ref(trans, delayed_refs, existing, ref);
+
+		/*
+		 * we've updated the existing ref, free the newly
+		 * allocated ref
+		 */
+		kfree(ref);
+	} else {
+		delayed_refs->num_entries++;
+		trans->delayed_ref_updates++;
+	}
+	return 0;
+}
+
+/*
+ * add a delayed ref to the tree.  This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ */
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin)
+{
+	struct btrfs_delayed_ref *ref;
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int ret;
+
+	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
+
+	/*
+	 * the parent = 0 case comes from cases where we don't actually
+	 * know the parent yet.  It will get updated later via a add/drop
+	 * pair.
+	 */
+	if (parent == 0)
+		parent = bytenr;
+
+	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	if (!head_ref) {
+		kfree(ref);
+		return -ENOMEM;
+	}
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
+	/*
+	 * insert both the head node and the new ref without dropping
+	 * the spin lock
+	 */
+	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+				      (u64)-1, 0, 0, 0, action, pin);
+	BUG_ON(ret);
+
+	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+				      parent, ref_root, ref_generation,
+				      owner_objectid, action, pin);
+	BUG_ON(ret);
+	spin_unlock(&delayed_refs->lock);
+	return 0;
+}
+
+/*
+ * add a delayed ref to the tree.  This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ *
+ * The main point of this call is to add and remove a backreference in a single
+ * shot, taking the lock only once, and only searching for the head node once.
+ *
+ * It is the same as doing a ref add and delete in two separate calls.
+ */
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 orig_parent,
+			  u64 parent, u64 orig_ref_root, u64 ref_root,
+			  u64 orig_ref_generation, u64 ref_generation,
+			  u64 owner_objectid, int pin)
+{
+	struct btrfs_delayed_ref *ref;
+	struct btrfs_delayed_ref *old_ref;
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int ret;
+
+	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
+
+	old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
+	if (!old_ref) {
+		kfree(ref);
+		return -ENOMEM;
+	}
+
+	/*
+	 * the parent = 0 case comes from cases where we don't actually
+	 * know the parent yet.  It will get updated later via a add/drop
+	 * pair.
+	 */
+	if (parent == 0)
+		parent = bytenr;
+	if (orig_parent == 0)
+		orig_parent = bytenr;
+
+	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	if (!head_ref) {
+		kfree(ref);
+		kfree(old_ref);
+		return -ENOMEM;
+	}
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
+	/*
+	 * insert both the head node and the new ref without dropping
+	 * the spin lock
+	 */
+	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+				      (u64)-1, 0, 0, 0,
+				      BTRFS_ADD_DELAYED_REF, 0);
+	BUG_ON(ret);
+
+	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+				      parent, ref_root, ref_generation,
+				      owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
+	BUG_ON(ret);
+
+	ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
+				      orig_parent, orig_ref_root,
+				      orig_ref_generation, owner_objectid,
+				      BTRFS_DROP_DELAYED_REF, pin);
+	BUG_ON(ret);
+	spin_unlock(&delayed_refs->lock);
+	return 0;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 00000000000..37919e5c007
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __DELAYED_REF__
+#define __DELAYED_REF__
+
+/* these are the possible values of struct btrfs_delayed_ref->action */
+#define BTRFS_ADD_DELAYED_REF    1 /* add one backref to the tree */
+#define BTRFS_DROP_DELAYED_REF   2 /* delete one backref from the tree */
+#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+
+struct btrfs_delayed_ref_node {
+	struct rb_node rb_node;
+
+	/* the starting bytenr of the extent */
+	u64 bytenr;
+
+	/* the parent our backref will point to */
+	u64 parent;
+
+	/* the size of the extent */
+	u64 num_bytes;
+
+	/* ref count on this data structure */
+	atomic_t refs;
+
+	/*
+	 * how many refs is this entry adding or deleting.  For
+	 * head refs, this may be a negative number because it is keeping
+	 * track of the total mods done to the reference count.
+	 * For individual refs, this will always be a positive number
+	 *
+	 * It may be more than one, since it is possible for a single
+	 * parent to have more than one ref on an extent
+	 */
+	int ref_mod;
+
+	/* is this node still in the rbtree? */
+	unsigned int in_tree:1;
+};
+
+/*
+ * the head refs are used to hold a lock on a given extent, which allows us
+ * to make sure that only one process is running the delayed refs
+ * at a time for a single extent.  They also store the sum of all the
+ * reference count modifications we've queued up.
+ */
+struct btrfs_delayed_ref_head {
+	struct btrfs_delayed_ref_node node;
+
+	/*
+	 * the mutex is held while running the refs, and it is also
+	 * held when checking the sum of reference modifications.
+	 */
+	struct mutex mutex;
+
+	/*
+	 * when a new extent is allocated, it is just reserved in memory
+	 * The actual extent isn't inserted into the extent allocation tree
+	 * until the delayed ref is processed.  must_insert_reserved is
+	 * used to flag a delayed ref so the accounting can be updated
+	 * when a full insert is done.
+	 *
+	 * It is possible the extent will be freed before it is ever
+	 * inserted into the extent allocation tree.  In this case
+	 * we need to update the in ram accounting to properly reflect
+	 * the free has happened.
+	 */
+	unsigned int must_insert_reserved:1;
+};
+
+struct btrfs_delayed_ref {
+	struct btrfs_delayed_ref_node node;
+
+	/* the root objectid our ref will point to */
+	u64 root;
+
+	/* the generation for the backref */
+	u64 generation;
+
+	/* owner_objectid of the backref  */
+	u64 owner_objectid;
+
+	/* operation done by this entry in the rbtree */
+	u8 action;
+
+	/* if pin == 1, when the extent is freed it will be pinned until
+	 * transaction commit
+	 */
+	unsigned int pin:1;
+};
+
+struct btrfs_delayed_ref_root {
+	struct rb_root root;
+
+	/* this spin lock protects the rbtree and the entries inside */
+	spinlock_t lock;
+
+	/* how many delayed ref updates we've queued, used by the
+	 * throttling code
+	 */
+	unsigned long num_entries;
+
+	/*
+	 * set when the tree is flushing before a transaction commit,
+	 * used by the throttling code to decide if new updates need
+	 * to be run right away
+	 */
+	int flushing;
+};
+
+static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+	WARN_ON(atomic_read(&ref->refs) == 0);
+	if (atomic_dec_and_test(&ref->refs)) {
+		WARN_ON(ref->in_tree);
+		kfree(ref);
+	}
+}
+
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin);
+
+struct btrfs_delayed_ref *
+btrfs_find_delayed_ref(struct btrfs_trans_handle *trans, u64 bytenr,
+		       u64 parent);
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_delayed_ref_node *ref,
+			   struct btrfs_delayed_ref_head **next_ret);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 num_bytes, u32 *refs);
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 orig_parent,
+			  u64 parent, u64 orig_ref_root, u64 ref_root,
+			  u64 orig_ref_generation, u64 ref_generation,
+			  u64 owner_objectid, int pin);
+/*
+ * a node might live in a head or a regular ref, this lets you
+ * test for the proper type to use.
+ */
+static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
+{
+	return node->parent == (u64)-1;
+}
+
+/*
+ * helper functions to cast a node into its container
+ */
+static inline struct btrfs_delayed_ref *
+btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
+{
+	WARN_ON(btrfs_delayed_ref_is_head(node));
+	return container_of(node, struct btrfs_delayed_ref, node);
+
+}
+
+static inline struct btrfs_delayed_ref_head *
+btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
+{
+	WARN_ON(!btrfs_delayed_ref_is_head(node));
+	return container_of(node, struct btrfs_delayed_ref_head, node);
+
+}
+#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3e18175248e..4f43e227a29 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1458,6 +1458,7 @@ static int transaction_kthread(void *arg)
 	struct btrfs_root *root = arg;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_transaction *cur;
+	struct btrfs_fs_info *info = root->fs_info;
 	unsigned long now;
 	unsigned long delay;
 	int ret;
@@ -1471,12 +1472,6 @@ static int transaction_kthread(void *arg)
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-		if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
-			printk(KERN_INFO "btrfs: total reference cache "
-			       "size %llu\n",
-			       root->fs_info->total_ref_cache_size);
-		}
-
 		mutex_lock(&root->fs_info->trans_mutex);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
@@ -1486,13 +1481,30 @@ static int transaction_kthread(void *arg)
 
 		now = get_seconds();
 		if (now < cur->start_time || now - cur->start_time < 30) {
+			unsigned long num_delayed;
+			num_delayed = cur->delayed_refs.num_entries;
 			mutex_unlock(&root->fs_info->trans_mutex);
 			delay = HZ * 5;
+
+			/*
+			 * we may have been woken up early to start
+			 * processing the delayed extent ref updates
+			 * If so, run some of them and then loop around again
+			 * to see if we need to force a commit
+			 */
+			if (num_delayed > 64) {
+				mutex_unlock(&info->transaction_kthread_mutex);
+				trans = btrfs_start_transaction(root, 1);
+				btrfs_run_delayed_refs(trans, root, 256);
+				btrfs_end_transaction(trans, root);
+				continue;
+			}
 			goto sleep;
 		}
 		mutex_unlock(&root->fs_info->trans_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		ret = btrfs_commit_transaction(trans, root);
+
 sleep:
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1611,10 +1623,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	extent_io_tree_init(&fs_info->pinned_extents,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&fs_info->pending_del,
-			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&fs_info->extent_ins,
-			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
 	INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1629,7 +1637,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
-	mutex_init(&fs_info->extent_ins_mutex);
 	mutex_init(&fs_info->pinned_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fefe83ad205..9b5da2b013e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -49,10 +49,13 @@ struct pending_extent_op {
 	int del;
 };
 
-static int finish_current_insert(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *extent_root, int all);
-static int del_pending_extents(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *extent_root, int all);
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root, u64 parent,
+					 u64 root_objectid, u64 ref_generation,
+					 u64 owner, struct btrfs_key *ins,
+					 int ref_mod);
+static int update_reserved_extents(struct btrfs_root *root,
+				   u64 bytenr, u64 num, int reserve);
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 bytenr, u64 num_bytes, int is_data);
@@ -60,6 +63,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
 			      int mark_free);
+static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					u64 bytenr, u64 num_bytes, u64 parent,
+					u64 root_objectid, u64 ref_generation,
+					u64 owner_objectid, int pin,
+					int ref_to_drop);
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -554,262 +563,13 @@ out:
 	return ret;
 }
 
-/*
- * updates all the backrefs that are pending on update_list for the
- * extent_root
- */
-static noinline int update_backrefs(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *extent_root,
-				    struct btrfs_path *path,
-				    struct list_head *update_list)
-{
-	struct btrfs_key key;
-	struct btrfs_extent_ref *ref;
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct pending_extent_op *op;
-	struct extent_buffer *leaf;
-	int ret = 0;
-	struct list_head *cur = update_list->next;
-	u64 ref_objectid;
-	u64 ref_root = extent_root->root_key.objectid;
-
-	op = list_entry(cur, struct pending_extent_op, list);
-
-search:
-	key.objectid = op->bytenr;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	key.offset = op->orig_parent;
-
-	ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
-	BUG_ON(ret);
-
-	leaf = path->nodes[0];
-
-loop:
-	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-
-	ref_objectid = btrfs_ref_objectid(leaf, ref);
-
-	if (btrfs_ref_root(leaf, ref) != ref_root ||
-	    btrfs_ref_generation(leaf, ref) != op->orig_generation ||
-	    (ref_objectid != op->level &&
-	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
-		printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
-		       "root %llu, owner %u\n",
-		       (unsigned long long)op->bytenr,
-		       (unsigned long long)op->orig_parent,
-		       (unsigned long long)ref_root, op->level);
-		btrfs_print_leaf(extent_root, leaf);
-		BUG();
-	}
-
-	key.objectid = op->bytenr;
-	key.offset = op->parent;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
-	BUG_ON(ret);
-	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-	btrfs_set_ref_generation(leaf, ref, op->generation);
-
-	cur = cur->next;
-
-	list_del_init(&op->list);
-	unlock_extent(&info->extent_ins, op->bytenr,
-		      op->bytenr + op->num_bytes - 1, GFP_NOFS);
-	kfree(op);
-
-	if (cur == update_list) {
-		btrfs_mark_buffer_dirty(path->nodes[0]);
-		btrfs_release_path(extent_root, path);
-		goto out;
-	}
-
-	op = list_entry(cur, struct pending_extent_op, list);
-
-	path->slots[0]++;
-	while (path->slots[0] < btrfs_header_nritems(leaf)) {
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid == op->bytenr &&
-		    key.type == BTRFS_EXTENT_REF_KEY)
-			goto loop;
-		path->slots[0]++;
-	}
-
-	btrfs_mark_buffer_dirty(path->nodes[0]);
-	btrfs_release_path(extent_root, path);
-	goto search;
-
-out:
-	return 0;
-}
-
-static noinline int insert_extents(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *extent_root,
-				   struct btrfs_path *path,
-				   struct list_head *insert_list, int nr)
-{
-	struct btrfs_key *keys;
-	u32 *data_size;
-	struct pending_extent_op *op;
-	struct extent_buffer *leaf;
-	struct list_head *cur = insert_list->next;
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	u64 ref_root = extent_root->root_key.objectid;
-	int i = 0, last = 0, ret;
-	int total = nr * 2;
-
-	if (!nr)
-		return 0;
-
-	keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
-	if (!keys)
-		return -ENOMEM;
-
-	data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
-	if (!data_size) {
-		kfree(keys);
-		return -ENOMEM;
-	}
-
-	list_for_each_entry(op, insert_list, list) {
-		keys[i].objectid = op->bytenr;
-		keys[i].offset = op->num_bytes;
-		keys[i].type = BTRFS_EXTENT_ITEM_KEY;
-		data_size[i] = sizeof(struct btrfs_extent_item);
-		i++;
-
-		keys[i].objectid = op->bytenr;
-		keys[i].offset = op->parent;
-		keys[i].type = BTRFS_EXTENT_REF_KEY;
-		data_size[i] = sizeof(struct btrfs_extent_ref);
-		i++;
-	}
-
-	op = list_entry(cur, struct pending_extent_op, list);
-	i = 0;
-	while (i < total) {
-		int c;
-		ret = btrfs_insert_some_items(trans, extent_root, path,
-					      keys+i, data_size+i, total-i);
-		BUG_ON(ret < 0);
-
-		if (last && ret > 1)
-			BUG();
-
-		leaf = path->nodes[0];
-		for (c = 0; c < ret; c++) {
-			int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
-
-			/*
-			 * if the first item we inserted was a backref, then
-			 * the EXTENT_ITEM will be the odd c's, else it will
-			 * be the even c's
-			 */
-			if ((ref_first && (c % 2)) ||
-			    (!ref_first && !(c % 2))) {
-				struct btrfs_extent_item *itm;
-
-				itm = btrfs_item_ptr(leaf, path->slots[0] + c,
-						     struct btrfs_extent_item);
-				btrfs_set_extent_refs(path->nodes[0], itm, 1);
-				op->del++;
-			} else {
-				struct btrfs_extent_ref *ref;
-
-				ref = btrfs_item_ptr(leaf, path->slots[0] + c,
-						     struct btrfs_extent_ref);
-				btrfs_set_ref_root(leaf, ref, ref_root);
-				btrfs_set_ref_generation(leaf, ref,
-							 op->generation);
-				btrfs_set_ref_objectid(leaf, ref, op->level);
-				btrfs_set_ref_num_refs(leaf, ref, 1);
-				op->del++;
-			}
-
-			/*
-			 * using del to see when its ok to free up the
-			 * pending_extent_op.  In the case where we insert the
-			 * last item on the list in order to help do batching
-			 * we need to not free the extent op until we actually
-			 * insert the extent_item
-			 */
-			if (op->del == 2) {
-				unlock_extent(&info->extent_ins, op->bytenr,
-					      op->bytenr + op->num_bytes - 1,
-					      GFP_NOFS);
-				cur = cur->next;
-				list_del_init(&op->list);
-				kfree(op);
-				if (cur != insert_list)
-					op = list_entry(cur,
-						struct pending_extent_op,
-						list);
-			}
-		}
-		btrfs_mark_buffer_dirty(leaf);
-		btrfs_release_path(extent_root, path);
-
-		/*
-		 * Ok backref's and items usually go right next to eachother,
-		 * but if we could only insert 1 item that means that we
-		 * inserted on the end of a leaf, and we have no idea what may
-		 * be on the next leaf so we just play it safe.  In order to
-		 * try and help this case we insert the last thing on our
-		 * insert list so hopefully it will end up being the last
-		 * thing on the leaf and everything else will be before it,
-		 * which will let us insert a whole bunch of items at the same
-		 * time.
-		 */
-		if (ret == 1 && !last && (i + ret < total)) {
-			/*
-			 * last: where we will pick up the next time around
-			 * i: our current key to insert, will be total - 1
-			 * cur: the current op we are screwing with
-			 * op: duh
-			 */
-			last = i + ret;
-			i = total - 1;
-			cur = insert_list->prev;
-			op = list_entry(cur, struct pending_extent_op, list);
-		} else if (last) {
-			/*
-			 * ok we successfully inserted the last item on the
-			 * list, lets reset everything
-			 *
-			 * i: our current key to insert, so where we left off
-			 *    last time
-			 * last: done with this
-			 * cur: the op we are messing with
-			 * op: duh
-			 * total: since we inserted the last key, we need to
-			 *        decrement total so we dont overflow
-			 */
-			i = last;
-			last = 0;
-			total--;
-			if (i < total) {
-				cur = insert_list->next;
-				op = list_entry(cur, struct pending_extent_op,
-						list);
-			}
-		} else {
-			i += ret;
-		}
-
-		cond_resched();
-	}
-	ret = 0;
-	kfree(keys);
-	kfree(data_size);
-	return ret;
-}
-
 static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
 					  u64 bytenr, u64 parent,
 					  u64 ref_root, u64 ref_generation,
-					  u64 owner_objectid)
+					  u64 owner_objectid,
+					  int refs_to_add)
 {
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
@@ -829,9 +589,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 		btrfs_set_ref_root(leaf, ref, ref_root);
 		btrfs_set_ref_generation(leaf, ref, ref_generation);
 		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
-		btrfs_set_ref_num_refs(leaf, ref, 1);
+		btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
 	} else if (ret == -EEXIST) {
 		u64 existing_owner;
+
 		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
 		leaf = path->nodes[0];
 		ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -845,7 +606,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 
 		num_refs = btrfs_ref_num_refs(leaf, ref);
 		BUG_ON(num_refs == 0);
-		btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+		btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
 
 		existing_owner = btrfs_ref_objectid(leaf, ref);
 		if (existing_owner != owner_objectid &&
@@ -865,7 +626,8 @@ out:
 
 static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
-					  struct btrfs_path *path)
+					  struct btrfs_path *path,
+					  int refs_to_drop)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_extent_ref *ref;
@@ -875,8 +637,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
 	num_refs = btrfs_ref_num_refs(leaf, ref);
-	BUG_ON(num_refs == 0);
-	num_refs -= 1;
+	BUG_ON(num_refs < refs_to_drop);
+	num_refs -= refs_to_drop;
 	if (num_refs == 0) {
 		ret = btrfs_del_item(trans, root, path);
 	} else {
@@ -927,332 +689,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 #endif
 }
 
-static noinline int free_extents(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *extent_root,
-				 struct list_head *del_list)
-{
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct btrfs_path *path;
-	struct btrfs_key key, found_key;
-	struct extent_buffer *leaf;
-	struct list_head *cur;
-	struct pending_extent_op *op;
-	struct btrfs_extent_item *ei;
-	int ret, num_to_del, extent_slot = 0, found_extent = 0;
-	u32 refs;
-	u64 bytes_freed = 0;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->reada = 1;
-
-search:
-	/* search for the backref for the current ref we want to delete */
-	cur = del_list->next;
-	op = list_entry(cur, struct pending_extent_op, list);
-	ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
-				    op->orig_parent,
-				    extent_root->root_key.objectid,
-				    op->orig_generation, op->level, 1);
-	if (ret) {
-		printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
-		       "root %llu gen %llu owner %u\n",
-		       (unsigned long long)op->bytenr,
-		       (unsigned long long)extent_root->root_key.objectid,
-		       (unsigned long long)op->orig_generation, op->level);
-		btrfs_print_leaf(extent_root, path->nodes[0]);
-		WARN_ON(1);
-		goto out;
-	}
-
-	extent_slot = path->slots[0];
-	num_to_del = 1;
-	found_extent = 0;
-
-	/*
-	 * if we aren't the first item on the leaf we can move back one and see
-	 * if our ref is right next to our extent item
-	 */
-	if (likely(extent_slot)) {
-		extent_slot--;
-		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-				      extent_slot);
-		if (found_key.objectid == op->bytenr &&
-		    found_key.type == BTRFS_EXTENT_ITEM_KEY &&
-		    found_key.offset == op->num_bytes) {
-			num_to_del++;
-			found_extent = 1;
-		}
-	}
-
-	/*
-	 * if we didn't find the extent we need to delete the backref and then
-	 * search for the extent item key so we can update its ref count
-	 */
-	if (!found_extent) {
-		key.objectid = op->bytenr;
-		key.type = BTRFS_EXTENT_ITEM_KEY;
-		key.offset = op->num_bytes;
-
-		ret = remove_extent_backref(trans, extent_root, path);
-		BUG_ON(ret);
-		btrfs_release_path(extent_root, path);
-		ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
-		BUG_ON(ret);
-		extent_slot = path->slots[0];
-	}
-
-	/* this is where we update the ref count for the extent */
-	leaf = path->nodes[0];
-	ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
-	refs = btrfs_extent_refs(leaf, ei);
-	BUG_ON(refs == 0);
-	refs--;
-	btrfs_set_extent_refs(leaf, ei, refs);
-
-	btrfs_mark_buffer_dirty(leaf);
-
-	/*
-	 * This extent needs deleting.  The reason cur_slot is extent_slot +
-	 * num_to_del is because extent_slot points to the slot where the extent
-	 * is, and if the backref was not right next to the extent we will be
-	 * deleting at least 1 item, and will want to start searching at the
-	 * slot directly next to extent_slot.  However if we did find the
-	 * backref next to the extent item them we will be deleting at least 2
-	 * items and will want to start searching directly after the ref slot
-	 */
-	if (!refs) {
-		struct list_head *pos, *n, *end;
-		int cur_slot = extent_slot+num_to_del;
-		u64 super_used;
-		u64 root_used;
-
-		path->slots[0] = extent_slot;
-		bytes_freed = op->num_bytes;
-
-		mutex_lock(&info->pinned_mutex);
-		ret = pin_down_bytes(trans, extent_root, op->bytenr,
-				     op->num_bytes, op->level >=
-				     BTRFS_FIRST_FREE_OBJECTID);
-		mutex_unlock(&info->pinned_mutex);
-		BUG_ON(ret < 0);
-		op->del = ret;
-
-		/*
-		 * we need to see if we can delete multiple things at once, so
-		 * start looping through the list of extents we are wanting to
-		 * delete and see if their extent/backref's are right next to
-		 * eachother and the extents only have 1 ref
-		 */
-		for (pos = cur->next; pos != del_list; pos = pos->next) {
-			struct pending_extent_op *tmp;
-
-			tmp = list_entry(pos, struct pending_extent_op, list);
-
-			/* we only want to delete extent+ref at this stage */
-			if (cur_slot >= btrfs_header_nritems(leaf) - 1)
-				break;
-
-			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
-			if (found_key.objectid != tmp->bytenr ||
-			    found_key.type != BTRFS_EXTENT_ITEM_KEY ||
-			    found_key.offset != tmp->num_bytes)
-				break;
-
-			/* check to make sure this extent only has one ref */
-			ei = btrfs_item_ptr(leaf, cur_slot,
-					    struct btrfs_extent_item);
-			if (btrfs_extent_refs(leaf, ei) != 1)
-				break;
-
-			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
-			if (found_key.objectid != tmp->bytenr ||
-			    found_key.type != BTRFS_EXTENT_REF_KEY ||
-			    found_key.offset != tmp->orig_parent)
-				break;
-
-			/*
-			 * the ref is right next to the extent, we can set the
-			 * ref count to 0 since we will delete them both now
-			 */
-			btrfs_set_extent_refs(leaf, ei, 0);
-
-			/* pin down the bytes for this extent */
-			mutex_lock(&info->pinned_mutex);
-			ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
-					     tmp->num_bytes, tmp->level >=
-					     BTRFS_FIRST_FREE_OBJECTID);
-			mutex_unlock(&info->pinned_mutex);
-			BUG_ON(ret < 0);
-
-			/*
-			 * use the del field to tell if we need to go ahead and
-			 * free up the extent when we delete the item or not.
-			 */
-			tmp->del = ret;
-			bytes_freed += tmp->num_bytes;
-
-			num_to_del += 2;
-			cur_slot += 2;
-		}
-		end = pos;
-
-		/* update the free space counters */
-		spin_lock(&info->delalloc_lock);
-		super_used = btrfs_super_bytes_used(&info->super_copy);
-		btrfs_set_super_bytes_used(&info->super_copy,
-					   super_used - bytes_freed);
-
-		root_used = btrfs_root_used(&extent_root->root_item);
-		btrfs_set_root_used(&extent_root->root_item,
-				    root_used - bytes_freed);
-		spin_unlock(&info->delalloc_lock);
-
-		/* delete the items */
-		ret = btrfs_del_items(trans, extent_root, path,
-				      path->slots[0], num_to_del);
-		BUG_ON(ret);
-
-		/*
-		 * loop through the extents we deleted and do the cleanup work
-		 * on them
-		 */
-		for (pos = cur, n = pos->next; pos != end;
-		     pos = n, n = pos->next) {
-			struct pending_extent_op *tmp;
-			tmp = list_entry(pos, struct pending_extent_op, list);
-
-			/*
-			 * remember tmp->del tells us wether or not we pinned
-			 * down the extent
-			 */
-			ret = update_block_group(trans, extent_root,
-						 tmp->bytenr, tmp->num_bytes, 0,
-						 tmp->del);
-			BUG_ON(ret);
-
-			list_del_init(&tmp->list);
-			unlock_extent(&info->extent_ins, tmp->bytenr,
-				      tmp->bytenr + tmp->num_bytes - 1,
-				      GFP_NOFS);
-			kfree(tmp);
-		}
-	} else if (refs && found_extent) {
-		/*
-		 * the ref and extent were right next to eachother, but the
-		 * extent still has a ref, so just free the backref and keep
-		 * going
-		 */
-		ret = remove_extent_backref(trans, extent_root, path);
-		BUG_ON(ret);
-
-		list_del_init(&op->list);
-		unlock_extent(&info->extent_ins, op->bytenr,
-			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
-		kfree(op);
-	} else {
-		/*
-		 * the extent has multiple refs and the backref we were looking
-		 * for was not right next to it, so just unlock and go next,
-		 * we're good to go
-		 */
-		list_del_init(&op->list);
-		unlock_extent(&info->extent_ins, op->bytenr,
-			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
-		kfree(op);
-	}
-
-	btrfs_release_path(extent_root, path);
-	if (!list_empty(del_list))
-		goto search;
-
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, u64 bytenr,
+				     u64 num_bytes,
 				     u64 orig_parent, u64 parent,
 				     u64 orig_root, u64 ref_root,
 				     u64 orig_generation, u64 ref_generation,
 				     u64 owner_objectid)
 {
 	int ret;
-	struct btrfs_root *extent_root = root->fs_info->extent_root;
-	struct btrfs_path *path;
-
-	if (root == root->fs_info->extent_root) {
-		struct pending_extent_op *extent_op;
-		u64 num_bytes;
-
-		BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
-		num_bytes = btrfs_level_size(root, (int)owner_objectid);
-		mutex_lock(&root->fs_info->extent_ins_mutex);
-		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-			u64 priv;
-			ret = get_state_private(&root->fs_info->extent_ins,
-						bytenr, &priv);
-			BUG_ON(ret);
-			extent_op = (struct pending_extent_op *)
-							(unsigned long)priv;
-			BUG_ON(extent_op->parent != orig_parent);
-			BUG_ON(extent_op->generation != orig_generation);
-
-			extent_op->parent = parent;
-			extent_op->generation = ref_generation;
-		} else {
-			extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-			BUG_ON(!extent_op);
-
-			extent_op->type = PENDING_BACKREF_UPDATE;
-			extent_op->bytenr = bytenr;
-			extent_op->num_bytes = num_bytes;
-			extent_op->parent = parent;
-			extent_op->orig_parent = orig_parent;
-			extent_op->generation = ref_generation;
-			extent_op->orig_generation = orig_generation;
-			extent_op->level = (int)owner_objectid;
-			INIT_LIST_HEAD(&extent_op->list);
-			extent_op->del = 0;
-
-			set_extent_bits(&root->fs_info->extent_ins,
-					bytenr, bytenr + num_bytes - 1,
-					EXTENT_WRITEBACK, GFP_NOFS);
-			set_state_private(&root->fs_info->extent_ins,
-					  bytenr, (unsigned long)extent_op);
-		}
-		mutex_unlock(&root->fs_info->extent_ins_mutex);
-		return 0;
-	}
+	int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	ret = lookup_extent_backref(trans, extent_root, path,
-				    bytenr, orig_parent, orig_root,
-				    orig_generation, owner_objectid, 1);
-	if (ret)
-		goto out;
-	ret = remove_extent_backref(trans, extent_root, path);
-	if (ret)
-		goto out;
-	ret = insert_extent_backref(trans, extent_root, path, bytenr,
-				    parent, ref_root, ref_generation,
-				    owner_objectid);
+	ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
+				       orig_parent, parent, orig_root,
+				       ref_root, orig_generation,
+				       ref_generation, owner_objectid, pin);
 	BUG_ON(ret);
-	finish_current_insert(trans, extent_root, 0);
-	del_pending_extents(trans, extent_root, 0);
-out:
-	btrfs_free_path(path);
 	return ret;
 }
 
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 bytenr,
-			    u64 orig_parent, u64 parent,
+			    u64 num_bytes, u64 orig_parent, u64 parent,
 			    u64 ref_root, u64 ref_generation,
 			    u64 owner_objectid)
 {
@@ -1260,19 +718,35 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		return 0;
-	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
-					parent, ref_root, ref_root,
-					ref_generation, ref_generation,
-					owner_objectid);
+
+	ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
+					orig_parent, parent, ref_root,
+					ref_root, ref_generation,
+					ref_generation, owner_objectid);
 	return ret;
 }
-
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root, u64 bytenr,
+				  u64 num_bytes,
 				  u64 orig_parent, u64 parent,
 				  u64 orig_root, u64 ref_root,
 				  u64 orig_generation, u64 ref_generation,
 				  u64 owner_objectid)
+{
+	int ret;
+
+	ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
+				    ref_generation, owner_objectid,
+				    BTRFS_ADD_DELAYED_REF, 0);
+	BUG_ON(ret);
+	return ret;
+}
+
+static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 bytenr,
+			  u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid,
+			  int refs_to_add)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -1288,15 +762,19 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	path->reada = 1;
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = (u64)-1;
+	key.offset = num_bytes;
 
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
-				0, 1);
+	/* first find the extent item and update its reference count */
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+				path, 0, 1);
 	if (ret < 0)
 		return ret;
-	BUG_ON(ret == 0 || path->slots[0] == 0);
 
-	path->slots[0]--;
+	if (ret > 0) {
+		WARN_ON(1);
+		btrfs_free_path(path);
+		return -EIO;
+	}
 	l = path->nodes[0];
 
 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1309,98 +787,274 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
 
-	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-	refs = btrfs_extent_refs(l, item);
-	btrfs_set_extent_refs(l, item, refs + 1);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+
+	refs = btrfs_extent_refs(l, item);
+	btrfs_set_extent_refs(l, item, refs + refs_to_add);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
+	btrfs_release_path(root->fs_info->extent_root, path);
+
+	path->reada = 1;
+	/* now insert the actual backref */
+	ret = insert_extent_backref(trans, root->fs_info->extent_root,
+				    path, bytenr, parent,
+				    ref_root, ref_generation,
+				    owner_objectid, refs_to_add);
+	BUG_ON(ret);
+	btrfs_free_path(path);
+	return 0;
+}
+
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 u64 bytenr, u64 num_bytes, u64 parent,
+			 u64 ref_root, u64 ref_generation,
+			 u64 owner_objectid)
+{
+	int ret;
+	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
+	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+		return 0;
+
+	ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
+				     0, ref_root, 0, ref_generation,
+				     owner_objectid);
+	return ret;
+}
+
+static int drop_delayed_ref(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_delayed_ref_node *node)
+{
+	int ret = 0;
+	struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
+
+	BUG_ON(node->ref_mod == 0);
+	ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
+				  node->parent, ref->root, ref->generation,
+				  ref->owner_objectid, ref->pin, node->ref_mod);
+
+	return ret;
+}
+
+/* helper function to actually process a single delayed ref entry */
+static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_delayed_ref_node *node,
+					int insert_reserved)
+{
+	int ret;
+	struct btrfs_delayed_ref *ref;
+
+	if (node->parent == (u64)-1) {
+		struct btrfs_delayed_ref_head *head;
+		/*
+		 * we've hit the end of the chain and we were supposed
+		 * to insert this extent into the tree.  But, it got
+		 * deleted before we ever needed to insert it, so all
+		 * we have to do is clean up the accounting
+		 */
+		if (insert_reserved) {
+			update_reserved_extents(root, node->bytenr,
+						node->num_bytes, 0);
+		}
+		head = btrfs_delayed_node_to_head(node);
+		mutex_unlock(&head->mutex);
+		return 0;
+	}
+
+	ref = btrfs_delayed_node_to_ref(node);
+	if (ref->action == BTRFS_ADD_DELAYED_REF) {
+		if (insert_reserved) {
+			struct btrfs_key ins;
+
+			ins.objectid = node->bytenr;
+			ins.offset = node->num_bytes;
+			ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+			/* record the full extent allocation */
+			ret = __btrfs_alloc_reserved_extent(trans, root,
+					node->parent, ref->root,
+					ref->generation, ref->owner_objectid,
+					&ins, node->ref_mod);
+			update_reserved_extents(root, node->bytenr,
+						node->num_bytes, 0);
+		} else {
+			/* just add one backref */
+			ret = add_extent_ref(trans, root, node->bytenr,
+				     node->num_bytes,
+				     node->parent, ref->root, ref->generation,
+				     ref->owner_objectid, node->ref_mod);
+		}
+		BUG_ON(ret);
+	} else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+		WARN_ON(insert_reserved);
+		ret = drop_delayed_ref(trans, root, node);
+	}
+	return 0;
+}
+
+static noinline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_node *ref;
+	int action = BTRFS_ADD_DELAYED_REF;
+again:
+	/*
+	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+	 * this prevents ref count from going down to zero when
+	 * there still are pending delayed ref.
+	 */
+	node = rb_prev(&head->node.rb_node);
+	while (1) {
+		if (!node)
+			break;
+		ref = rb_entry(node, struct btrfs_delayed_ref_node,
+				rb_node);
+		if (ref->bytenr != head->node.bytenr)
+			break;
+		if (btrfs_delayed_node_to_ref(ref)->action == action)
+			return ref;
+		node = rb_prev(node);
+	}
+	if (action == BTRFS_ADD_DELAYED_REF) {
+		action = BTRFS_DROP_DELAYED_REF;
+		goto again;
+	}
+	return NULL;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far.  count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, unsigned long count)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_ref_head *locked_ref = NULL;
+	int ret;
+	int must_insert_reserved = 0;
+	int run_all = count == (unsigned long)-1;
+
+	if (root == root->fs_info->extent_root)
+		root = root->fs_info->tree_root;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+again:
+	spin_lock(&delayed_refs->lock);
+	if (count == 0)
+		count = delayed_refs->num_entries;
+	while (1) {
+		if (!locked_ref) {
+			/*
+			 * no locked ref, go find something we can
+			 * process in the rbtree.  We start at
+			 * the beginning of the tree, there may be less
+			 * lock contention if we do something smarter here.
+			 */
+			node = rb_first(&delayed_refs->root);
+			if (!node) {
+				spin_unlock(&delayed_refs->lock);
+				break;
+			}
+
+			ref = rb_entry(node, struct btrfs_delayed_ref_node,
+				       rb_node);
+			ret = btrfs_lock_delayed_ref(trans, ref, &locked_ref);
+			if (ret) {
+				spin_unlock(&delayed_refs->lock);
+				break;
+			}
+		}
 
-	btrfs_release_path(root->fs_info->extent_root, path);
+		/*
+		 * record the must insert reserved flag before we
+		 * drop the spin lock.
+		 */
+		must_insert_reserved = locked_ref->must_insert_reserved;
+		locked_ref->must_insert_reserved = 0;
 
-	path->reada = 1;
-	ret = insert_extent_backref(trans, root->fs_info->extent_root,
-				    path, bytenr, parent,
-				    ref_root, ref_generation,
-				    owner_objectid);
-	BUG_ON(ret);
-	finish_current_insert(trans, root->fs_info->extent_root, 0);
-	del_pending_extents(trans, root->fs_info->extent_root, 0);
+		/*
+		 * locked_ref is the head node, so we have to go one
+		 * node back for any delayed ref updates
+		 */
 
-	btrfs_free_path(path);
-	return 0;
-}
+		ref = select_delayed_ref(locked_ref);
+		if (!ref) {
+			/* All delayed refs have been processed, Go ahead
+			 * and send the head node to run_one_delayed_ref,
+			 * so that any accounting fixes can happen
+			 */
+			ref = &locked_ref->node;
+			locked_ref = NULL;
+		}
 
-int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
-			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 ref_root, u64 ref_generation,
-			 u64 owner_objectid)
-{
-	int ret;
-	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
-	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
-		return 0;
-	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
-				     0, ref_root, 0, ref_generation,
-				     owner_objectid);
-	return ret;
-}
+		ref->in_tree = 0;
+		rb_erase(&ref->rb_node, &delayed_refs->root);
+		delayed_refs->num_entries--;
+		spin_unlock(&delayed_refs->lock);
 
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root)
-{
-	u64 start;
-	u64 end;
-	int ret;
+		ret = run_one_delayed_ref(trans, root, ref,
+					  must_insert_reserved);
+		BUG_ON(ret);
+		btrfs_put_delayed_ref(ref);
 
-	while(1) {
-		finish_current_insert(trans, root->fs_info->extent_root, 1);
-		del_pending_extents(trans, root->fs_info->extent_root, 1);
+		/* once we lock the head ref, we have to process all the
+		 * entries for it.  So, we might end up doing more entries
+		 * that count was asking us to do.
+		 */
+		if (count > 0)
+			count--;
 
-		/* is there more work to do? */
-		ret = find_first_extent_bit(&root->fs_info->pending_del,
-					    0, &start, &end, EXTENT_WRITEBACK);
-		if (!ret)
-			continue;
-		ret = find_first_extent_bit(&root->fs_info->extent_ins,
-					    0, &start, &end, EXTENT_WRITEBACK);
-		if (!ret)
-			continue;
-		break;
+		/*
+		 * we set locked_ref to null above if we're all done
+		 * with this bytenr
+		 */
+		if (!locked_ref && count == 0)
+			break;
+
+		spin_lock(&delayed_refs->lock);
 	}
-	return 0;
-}
+	if (run_all) {
+		spin_lock(&delayed_refs->lock);
+		node = rb_first(&delayed_refs->root);
+		if (!node) {
+			spin_unlock(&delayed_refs->lock);
+			goto out;
+		}
 
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u32 *refs)
-{
-	struct btrfs_path *path;
-	int ret;
-	struct btrfs_key key;
-	struct extent_buffer *l;
-	struct btrfs_extent_item *item;
+		while (node) {
+			ref = rb_entry(node, struct btrfs_delayed_ref_node,
+				       rb_node);
+			if (btrfs_delayed_ref_is_head(ref)) {
+				struct btrfs_delayed_ref_head *head;
 
-	WARN_ON(num_bytes < root->sectorsize);
-	path = btrfs_alloc_path();
-	path->reada = 1;
-	key.objectid = bytenr;
-	key.offset = num_bytes;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
-				0, 0);
-	if (ret < 0)
-		goto out;
-	if (ret != 0) {
-		btrfs_print_leaf(root, path->nodes[0]);
-		printk(KERN_INFO "btrfs failed to find block number %llu\n",
-		       (unsigned long long)bytenr);
-		BUG();
+				head = btrfs_delayed_node_to_head(ref);
+				atomic_inc(&ref->refs);
+
+				spin_unlock(&delayed_refs->lock);
+				mutex_lock(&head->mutex);
+				mutex_unlock(&head->mutex);
+
+				btrfs_put_delayed_ref(ref);
+				goto again;
+			}
+			node = rb_next(node);
+		}
+		spin_unlock(&delayed_refs->lock);
+		count = (unsigned long)-1;
+		schedule_timeout(1);
+		goto again;
 	}
-	l = path->nodes[0];
-	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-	*refs = btrfs_extent_refs(l, item);
 out:
-	btrfs_free_path(path);
 	return 0;
 }
 
@@ -1624,7 +1278,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
 	int refi = 0;
 	int slot;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-			    u64, u64, u64, u64, u64, u64, u64, u64);
+			    u64, u64, u64, u64, u64, u64, u64, u64, u64);
 
 	ref_root = btrfs_header_owner(buf);
 	ref_generation = btrfs_header_generation(buf);
@@ -1696,12 +1350,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
 
 		if (level == 0) {
 			btrfs_item_key_to_cpu(buf, &key, slot);
+			fi = btrfs_item_ptr(buf, slot,
+					    struct btrfs_file_extent_item);
+
+			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (bytenr == 0)
+				continue;
 
 			ret = process_func(trans, root, bytenr,
-					   orig_buf->start, buf->start,
-					   orig_root, ref_root,
-					   orig_generation, ref_generation,
-					   key.objectid);
+				   btrfs_file_extent_disk_num_bytes(buf, fi),
+				   orig_buf->start, buf->start,
+				   orig_root, ref_root,
+				   orig_generation, ref_generation,
+				   key.objectid);
 
 			if (ret) {
 				faili = slot;
@@ -1709,7 +1370,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
 				goto fail;
 			}
 		} else {
-			ret = process_func(trans, root, bytenr,
+			ret = process_func(trans, root, bytenr, buf->len,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
@@ -1786,17 +1447,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 			if (bytenr == 0)
 				continue;
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
-					    orig_buf->start, buf->start,
-					    orig_root, ref_root,
-					    orig_generation, ref_generation,
-					    key.objectid);
+				    btrfs_file_extent_disk_num_bytes(buf, fi),
+				    orig_buf->start, buf->start,
+				    orig_root, ref_root, orig_generation,
+				    ref_generation, key.objectid);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, slot);
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
-					    orig_buf->start, buf->start,
-					    orig_root, ref_root,
+					    buf->len, orig_buf->start,
+					    buf->start, orig_root, ref_root,
 					    orig_generation, ref_generation,
 					    level - 1);
 			if (ret)
@@ -1815,7 +1476,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 				 struct btrfs_block_group_cache *cache)
 {
 	int ret;
-	int pending_ret;
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	unsigned long bi;
 	struct extent_buffer *leaf;
@@ -1831,12 +1491,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(extent_root, path);
 fail:
-	finish_current_insert(trans, extent_root, 0);
-	pending_ret = del_pending_extents(trans, extent_root, 0);
 	if (ret)
 		return ret;
-	if (pending_ret)
-		return pending_ret;
 	return 0;
 
 }
@@ -2474,193 +2130,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int finish_current_insert(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *extent_root, int all)
-{
-	u64 start;
-	u64 end;
-	u64 priv;
-	u64 search = 0;
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct btrfs_path *path;
-	struct pending_extent_op *extent_op, *tmp;
-	struct list_head insert_list, update_list;
-	int ret;
-	int num_inserts = 0, max_inserts, restart = 0;
-
-	path = btrfs_alloc_path();
-	INIT_LIST_HEAD(&insert_list);
-	INIT_LIST_HEAD(&update_list);
-
-	max_inserts = extent_root->leafsize /
-		(2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
-		 sizeof(struct btrfs_extent_ref) +
-		 sizeof(struct btrfs_extent_item));
-again:
-	mutex_lock(&info->extent_ins_mutex);
-	while (1) {
-		ret = find_first_extent_bit(&info->extent_ins, search, &start,
-					    &end, EXTENT_WRITEBACK);
-		if (ret) {
-			if (restart && !num_inserts &&
-			    list_empty(&update_list)) {
-				restart = 0;
-				search = 0;
-				continue;
-			}
-			break;
-		}
-
-		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
-		if (!ret) {
-			if (all)
-				restart = 1;
-			search = end + 1;
-			if (need_resched()) {
-				mutex_unlock(&info->extent_ins_mutex);
-				cond_resched();
-				mutex_lock(&info->extent_ins_mutex);
-			}
-			continue;
-		}
-
-		ret = get_state_private(&info->extent_ins, start, &priv);
-		BUG_ON(ret);
-		extent_op = (struct pending_extent_op *)(unsigned long) priv;
-
-		if (extent_op->type == PENDING_EXTENT_INSERT) {
-			num_inserts++;
-			list_add_tail(&extent_op->list, &insert_list);
-			search = end + 1;
-			if (num_inserts == max_inserts) {
-				restart = 1;
-				break;
-			}
-		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
-			list_add_tail(&extent_op->list, &update_list);
-			search = end + 1;
-		} else {
-			BUG();
-		}
-	}
-
-	/*
-	 * process the update list, clear the writeback bit for it, and if
-	 * somebody marked this thing for deletion then just unlock it and be
-	 * done, the free_extents will handle it
-	 */
-	list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
-		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
-				  extent_op->bytenr + extent_op->num_bytes - 1,
-				  EXTENT_WRITEBACK, GFP_NOFS);
-		if (extent_op->del) {
-			list_del_init(&extent_op->list);
-			unlock_extent(&info->extent_ins, extent_op->bytenr,
-				      extent_op->bytenr + extent_op->num_bytes
-				      - 1, GFP_NOFS);
-			kfree(extent_op);
-		}
-	}
-	mutex_unlock(&info->extent_ins_mutex);
-
-	/*
-	 * still have things left on the update list, go ahead an update
-	 * everything
-	 */
-	if (!list_empty(&update_list)) {
-		ret = update_backrefs(trans, extent_root, path, &update_list);
-		BUG_ON(ret);
-
-		/* we may have COW'ed new blocks, so lets start over */
-		if (all)
-			restart = 1;
-	}
-
-	/*
-	 * if no inserts need to be done, but we skipped some extents and we
-	 * need to make sure everything is cleaned then reset everything and
-	 * go back to the beginning
-	 */
-	if (!num_inserts && restart) {
-		search = 0;
-		restart = 0;
-		INIT_LIST_HEAD(&update_list);
-		INIT_LIST_HEAD(&insert_list);
-		goto again;
-	} else if (!num_inserts) {
-		goto out;
-	}
-
-	/*
-	 * process the insert extents list.  Again if we are deleting this
-	 * extent, then just unlock it, pin down the bytes if need be, and be
-	 * done with it.  Saves us from having to actually insert the extent
-	 * into the tree and then subsequently come along and delete it
-	 */
-	mutex_lock(&info->extent_ins_mutex);
-	list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
-		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
-				  extent_op->bytenr + extent_op->num_bytes - 1,
-				  EXTENT_WRITEBACK, GFP_NOFS);
-		if (extent_op->del) {
-			u64 used;
-			list_del_init(&extent_op->list);
-			unlock_extent(&info->extent_ins, extent_op->bytenr,
-				      extent_op->bytenr + extent_op->num_bytes
-				      - 1, GFP_NOFS);
-
-			mutex_lock(&extent_root->fs_info->pinned_mutex);
-			ret = pin_down_bytes(trans, extent_root,
-					     extent_op->bytenr,
-					     extent_op->num_bytes, 0);
-			mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
-			spin_lock(&info->delalloc_lock);
-			used = btrfs_super_bytes_used(&info->super_copy);
-			btrfs_set_super_bytes_used(&info->super_copy,
-					used - extent_op->num_bytes);
-			used = btrfs_root_used(&extent_root->root_item);
-			btrfs_set_root_used(&extent_root->root_item,
-					used - extent_op->num_bytes);
-			spin_unlock(&info->delalloc_lock);
-
-			ret = update_block_group(trans, extent_root,
-						 extent_op->bytenr,
-						 extent_op->num_bytes,
-						 0, ret > 0);
-			BUG_ON(ret);
-			kfree(extent_op);
-			num_inserts--;
-		}
-	}
-	mutex_unlock(&info->extent_ins_mutex);
-
-	ret = insert_extents(trans, extent_root, path, &insert_list,
-			     num_inserts);
-	BUG_ON(ret);
-
-	/*
-	 * if restart is set for whatever reason we need to go back and start
-	 * searching through the pending list again.
-	 *
-	 * We just inserted some extents, which could have resulted in new
-	 * blocks being allocated, which would result in new blocks needing
-	 * updates, so if all is set we _must_ restart to get the updated
-	 * blocks.
-	 */
-	if (restart || all) {
-		INIT_LIST_HEAD(&insert_list);
-		INIT_LIST_HEAD(&update_list);
-		search = 0;
-		restart = 0;
-		num_inserts = 0;
-		goto again;
-	}
-out:
-	btrfs_free_path(path);
-	return 0;
-}
-
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 bytenr, u64 num_bytes, int is_data)
@@ -2686,6 +2155,7 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 		u64 header_transid = btrfs_header_generation(buf);
 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
 		    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
+		    header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
 		    header_transid == trans->transid &&
 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 			clean_tree_block(NULL, root, buf);
@@ -2710,7 +2180,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 ref_generation,
-			 u64 owner_objectid, int pin, int mark_free)
+			 u64 owner_objectid, int pin, int mark_free,
+			 int refs_to_drop)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -2753,7 +2224,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 				break;
 		}
 		if (!found_extent) {
-			ret = remove_extent_backref(trans, extent_root, path);
+			ret = remove_extent_backref(trans, extent_root, path,
+						    refs_to_drop);
 			BUG_ON(ret);
 			btrfs_release_path(extent_root, path);
 			ret = btrfs_search_slot(trans, extent_root,
@@ -2771,8 +2243,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
-		       "root %llu gen %llu owner %llu\n",
+		       "parent %llu root %llu gen %llu owner %llu\n",
 		       (unsigned long long)bytenr,
+		       (unsigned long long)parent,
 		       (unsigned long long)root_objectid,
 		       (unsigned long long)ref_generation,
 		       (unsigned long long)owner_objectid);
@@ -2782,17 +2255,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	ei = btrfs_item_ptr(leaf, extent_slot,
 			    struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, ei);
-	BUG_ON(refs == 0);
-	refs -= 1;
-	btrfs_set_extent_refs(leaf, ei, refs);
 
+	/*
+	 * we're not allowed to delete the extent item if there
+	 * are other delayed ref updates pending
+	 */
+
+	BUG_ON(refs < refs_to_drop);
+	refs -= refs_to_drop;
+	btrfs_set_extent_refs(leaf, ei, refs);
 	btrfs_mark_buffer_dirty(leaf);
 
-	if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+	if (refs == 0 && found_extent &&
+	    path->slots[0] == extent_slot + 1) {
 		struct btrfs_extent_ref *ref;
 		ref = btrfs_item_ptr(leaf, path->slots[0],
 				     struct btrfs_extent_ref);
-		BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
+		BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
 		/* if the back ref and the extent are next to each other
 		 * they get deleted below in one shot
 		 */
@@ -2800,7 +2279,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		num_to_del = 2;
 	} else if (found_extent) {
 		/* otherwise delete the extent back ref */
-		ret = remove_extent_backref(trans, extent_root, path);
+		ret = remove_extent_backref(trans, extent_root, path,
+					    refs_to_drop);
 		BUG_ON(ret);
 		/* if refs are 0, we need to setup the path for deletion */
 		if (refs == 0) {
@@ -2850,218 +2330,35 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		BUG_ON(ret);
 	}
 	btrfs_free_path(path);
-	finish_current_insert(trans, extent_root, 0);
 	return ret;
 }
 
-/*
- * find all the blocks marked as pending in the radix tree and remove
- * them from the extent map
- */
-static int del_pending_extents(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *extent_root, int all)
-{
-	int ret;
-	int err = 0;
-	u64 start;
-	u64 end;
-	u64 priv;
-	u64 search = 0;
-	int nr = 0, skipped = 0;
-	struct extent_io_tree *pending_del;
-	struct extent_io_tree *extent_ins;
-	struct pending_extent_op *extent_op;
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct list_head delete_list;
-
-	INIT_LIST_HEAD(&delete_list);
-	extent_ins = &extent_root->fs_info->extent_ins;
-	pending_del = &extent_root->fs_info->pending_del;
-
-again:
-	mutex_lock(&info->extent_ins_mutex);
-	while (1) {
-		ret = find_first_extent_bit(pending_del, search, &start, &end,
-					    EXTENT_WRITEBACK);
-		if (ret) {
-			if (all && skipped && !nr) {
-				search = 0;
-				skipped = 0;
-				continue;
-			}
-			mutex_unlock(&info->extent_ins_mutex);
-			break;
-		}
-
-		ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
-		if (!ret) {
-			search = end+1;
-			skipped = 1;
-
-			if (need_resched()) {
-				mutex_unlock(&info->extent_ins_mutex);
-				cond_resched();
-				mutex_lock(&info->extent_ins_mutex);
-			}
-
-			continue;
-		}
-		BUG_ON(ret < 0);
-
-		ret = get_state_private(pending_del, start, &priv);
-		BUG_ON(ret);
-		extent_op = (struct pending_extent_op *)(unsigned long)priv;
-
-		clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
-				  GFP_NOFS);
-		if (!test_range_bit(extent_ins, start, end,
-				    EXTENT_WRITEBACK, 0)) {
-			list_add_tail(&extent_op->list, &delete_list);
-			nr++;
-		} else {
-			kfree(extent_op);
-
-			ret = get_state_private(&info->extent_ins, start,
-						&priv);
-			BUG_ON(ret);
-			extent_op = (struct pending_extent_op *)
-						(unsigned long)priv;
-
-			clear_extent_bits(&info->extent_ins, start, end,
-					  EXTENT_WRITEBACK, GFP_NOFS);
-
-			if (extent_op->type == PENDING_BACKREF_UPDATE) {
-				list_add_tail(&extent_op->list, &delete_list);
-				search = end + 1;
-				nr++;
-				continue;
-			}
-
-			mutex_lock(&extent_root->fs_info->pinned_mutex);
-			ret = pin_down_bytes(trans, extent_root, start,
-					     end + 1 - start, 0);
-			mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
-			ret = update_block_group(trans, extent_root, start,
-						end + 1 - start, 0, ret > 0);
-
-			unlock_extent(extent_ins, start, end, GFP_NOFS);
-			BUG_ON(ret);
-			kfree(extent_op);
-		}
-		if (ret)
-			err = ret;
-
-		search = end + 1;
-
-		if (need_resched()) {
-			mutex_unlock(&info->extent_ins_mutex);
-			cond_resched();
-			mutex_lock(&info->extent_ins_mutex);
-		}
-	}
-
-	if (nr) {
-		ret = free_extents(trans, extent_root, &delete_list);
-		BUG_ON(ret);
-	}
-
-	if (all && skipped) {
-		INIT_LIST_HEAD(&delete_list);
-		search = 0;
-		nr = 0;
-		goto again;
-	}
-
-	if (!err)
-		finish_current_insert(trans, extent_root, 0);
-	return err;
-}
-
 /*
  * remove an extent from the root, returns 0 on success
  */
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       u64 bytenr, u64 num_bytes, u64 parent,
-			       u64 root_objectid, u64 ref_generation,
-			       u64 owner_objectid, int pin)
+					struct btrfs_root *root,
+					u64 bytenr, u64 num_bytes, u64 parent,
+					u64 root_objectid, u64 ref_generation,
+					u64 owner_objectid, int pin,
+					int refs_to_drop)
 {
-	struct btrfs_root *extent_root = root->fs_info->extent_root;
-	int pending_ret;
-	int ret;
-
 	WARN_ON(num_bytes < root->sectorsize);
-	if (root == extent_root) {
-		struct pending_extent_op *extent_op = NULL;
-
-		mutex_lock(&root->fs_info->extent_ins_mutex);
-		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-			u64 priv;
-			ret = get_state_private(&root->fs_info->extent_ins,
-						bytenr, &priv);
-			BUG_ON(ret);
-			extent_op = (struct pending_extent_op *)
-						(unsigned long)priv;
-
-			extent_op->del = 1;
-			if (extent_op->type == PENDING_EXTENT_INSERT) {
-				mutex_unlock(&root->fs_info->extent_ins_mutex);
-				return 0;
-			}
-		}
-
-		if (extent_op) {
-			ref_generation = extent_op->orig_generation;
-			parent = extent_op->orig_parent;
-		}
 
-		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-		BUG_ON(!extent_op);
-
-		extent_op->type = PENDING_EXTENT_DELETE;
-		extent_op->bytenr = bytenr;
-		extent_op->num_bytes = num_bytes;
-		extent_op->parent = parent;
-		extent_op->orig_parent = parent;
-		extent_op->generation = ref_generation;
-		extent_op->orig_generation = ref_generation;
-		extent_op->level = (int)owner_objectid;
-		INIT_LIST_HEAD(&extent_op->list);
-		extent_op->del = 0;
-
-		set_extent_bits(&root->fs_info->pending_del,
-				bytenr, bytenr + num_bytes - 1,
-				EXTENT_WRITEBACK, GFP_NOFS);
-		set_state_private(&root->fs_info->pending_del,
-				  bytenr, (unsigned long)extent_op);
-		mutex_unlock(&root->fs_info->extent_ins_mutex);
-		return 0;
-	}
-	/* if metadata always pin */
-	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
-		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-			mutex_lock(&root->fs_info->pinned_mutex);
-			btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
-			mutex_unlock(&root->fs_info->pinned_mutex);
-			update_reserved_extents(root, bytenr, num_bytes, 0);
-			return 0;
-		}
+	/*
+	 * if metadata always pin
+	 * if data pin when any transaction has committed this
+	 */
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
+	    ref_generation != trans->transid)
 		pin = 1;
-	}
 
-	/* if data pin when any transaction has committed this */
 	if (ref_generation != trans->transid)
 		pin = 1;
 
-	ret = __free_extent(trans, root, bytenr, num_bytes, parent,
+	return __free_extent(trans, root, bytenr, num_bytes, parent,
 			    root_objectid, ref_generation,
-			    owner_objectid, pin, pin == 0);
-
-	finish_current_insert(trans, root->fs_info->extent_root, 0);
-	pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
-	return ret ? ret : pending_ret;
+			    owner_objectid, pin, pin == 0, refs_to_drop);
 }
 
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -3072,9 +2369,26 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
-				  root_objectid, ref_generation,
-				  owner_objectid, pin);
+	/*
+	 * tree log blocks never actually go into the extent allocation
+	 * tree, just update pinning info and exit early.
+	 *
+	 * data extents referenced by the tree log do need to have
+	 * their reference counts bumped.
+	 */
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
+	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+		mutex_lock(&root->fs_info->pinned_mutex);
+		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+		mutex_unlock(&root->fs_info->pinned_mutex);
+		update_reserved_extents(root, bytenr, num_bytes, 0);
+		ret = 0;
+	} else {
+		ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
+				       root_objectid, ref_generation,
+				       owner_objectid,
+				       BTRFS_DROP_DELAYED_REF, 1);
+	}
 	return ret;
 }
 
@@ -3475,10 +2789,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 					 struct btrfs_root *root, u64 parent,
 					 u64 root_objectid, u64 ref_generation,
-					 u64 owner, struct btrfs_key *ins)
+					 u64 owner, struct btrfs_key *ins,
+					 int ref_mod)
 {
 	int ret;
-	int pending_ret;
 	u64 super_used;
 	u64 root_used;
 	u64 num_bytes = ins->offset;
@@ -3503,33 +2817,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
 	spin_unlock(&info->delalloc_lock);
 
-	if (root == extent_root) {
-		struct pending_extent_op *extent_op;
-
-		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-		BUG_ON(!extent_op);
-
-		extent_op->type = PENDING_EXTENT_INSERT;
-		extent_op->bytenr = ins->objectid;
-		extent_op->num_bytes = ins->offset;
-		extent_op->parent = parent;
-		extent_op->orig_parent = 0;
-		extent_op->generation = ref_generation;
-		extent_op->orig_generation = 0;
-		extent_op->level = (int)owner;
-		INIT_LIST_HEAD(&extent_op->list);
-		extent_op->del = 0;
-
-		mutex_lock(&root->fs_info->extent_ins_mutex);
-		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
-				ins->objectid + ins->offset - 1,
-				EXTENT_WRITEBACK, GFP_NOFS);
-		set_state_private(&root->fs_info->extent_ins,
-				  ins->objectid, (unsigned long)extent_op);
-		mutex_unlock(&root->fs_info->extent_ins_mutex);
-		goto update_block;
-	}
-
 	memcpy(&keys[0], ins, sizeof(*ins));
 	keys[1].objectid = ins->objectid;
 	keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3546,31 +2833,24 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 
 	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_extent_item);
-	btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+	btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 			     struct btrfs_extent_ref);
 
 	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
 	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
 	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
-	btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
+	btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
 	trans->alloc_exclude_start = 0;
 	trans->alloc_exclude_nr = 0;
 	btrfs_free_path(path);
-	finish_current_insert(trans, extent_root, 0);
-	pending_ret = del_pending_extents(trans, extent_root, 0);
 
 	if (ret)
 		goto out;
-	if (pending_ret) {
-		ret = pending_ret;
-		goto out;
-	}
 
-update_block:
 	ret = update_block_group(trans, root, ins->objectid,
 				 ins->offset, 1, 0);
 	if (ret) {
@@ -3592,9 +2872,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
 		return 0;
-	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-					    ref_generation, owner, ins);
-	update_reserved_extents(root, ins->objectid, ins->offset, 0);
+
+	ret = btrfs_add_delayed_ref(trans, ins->objectid,
+				    ins->offset, parent, root_objectid,
+				    ref_generation, owner,
+				    BTRFS_ADD_DELAYED_EXTENT, 0);
+	BUG_ON(ret);
 	return ret;
 }
 
@@ -3621,7 +2904,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	put_block_group(block_group);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-					    ref_generation, owner, ins);
+					    ref_generation, owner, ins, 1);
 	return ret;
 }
 
@@ -3640,20 +2923,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       u64 search_end, struct btrfs_key *ins, u64 data)
 {
 	int ret;
-
 	ret = __btrfs_reserve_extent(trans, root, num_bytes,
 				     min_alloc_size, empty_size, hint_byte,
 				     search_end, ins, data);
 	BUG_ON(ret);
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
-					root_objectid, ref_generation,
-					owner_objectid, ins);
+		ret = btrfs_add_delayed_ref(trans, ins->objectid,
+					    ins->offset, parent, root_objectid,
+					    ref_generation, owner_objectid,
+					    BTRFS_ADD_DELAYED_EXTENT, 0);
 		BUG_ON(ret);
-
-	} else {
-		update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	}
+	update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	return ret;
 }
 
@@ -3789,7 +3070,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 
 		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 
-		ret = __btrfs_free_extent(trans, root, disk_bytenr,
+		ret = btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
 				key.objectid, 0);
@@ -3829,7 +3110,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	 */
 	for (i = 0; i < ref->nritems; i++) {
 		info = ref->extents + sorted[i].slot;
-		ret = __btrfs_free_extent(trans, root, info->bytenr,
+		ret = btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
 					  info->objectid, 0);
@@ -3846,12 +3127,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root, u64 start,
 				     u64 len, u32 *refs)
 {
 	int ret;
 
-	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
+	ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
 	BUG_ON(ret);
 
 #if 0 /* some debugging code in case we see problems here */
@@ -3959,7 +3241,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
 		 * we just decrement it below and don't update any
 		 * of the refs the leaf points to.
 		 */
-		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		ret = drop_snap_lookup_refcount(trans, root, bytenr,
+						blocksize, &refs);
 		BUG_ON(ret);
 		if (refs != 1)
 			continue;
@@ -4010,7 +3293,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
 	 */
 	for (i = 0; i < refi; i++) {
 		bytenr = sorted[i].bytenr;
-		ret = __btrfs_free_extent(trans, root, bytenr,
+		ret = btrfs_free_extent(trans, root, bytenr,
 					blocksize, eb->start,
 					root_owner, root_gen, 0, 1);
 		BUG_ON(ret);
@@ -4053,7 +3336,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-	ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
+	ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
 				path->nodes[*level]->len, &refs);
 	BUG_ON(ret);
 	if (refs > 1)
@@ -4104,7 +3387,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
 
-		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		ret = drop_snap_lookup_refcount(trans, root, bytenr,
+						blocksize, &refs);
 		BUG_ON(ret);
 
 		/*
@@ -4119,7 +3403,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 			root_gen = btrfs_header_generation(parent);
 			path->slots[*level]++;
 
-			ret = __btrfs_free_extent(trans, root, bytenr,
+			ret = btrfs_free_extent(trans, root, bytenr,
 						blocksize, parent->start,
 						root_owner, root_gen,
 						*level - 1, 1);
@@ -4165,7 +3449,7 @@ out:
 	 * cleanup and free the reference on the last node
 	 * we processed
 	 */
-	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
 				  *level, 1);
 	free_extent_buffer(path->nodes[*level]);
@@ -5457,6 +4741,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 					root->root_key.objectid,
 					trans->transid, key.objectid);
 		BUG_ON(ret);
+
 		ret = btrfs_free_extent(trans, root,
 					bytenr, num_bytes, leaf->start,
 					btrfs_header_owner(leaf),
@@ -5768,9 +5053,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
 				ref_path, NULL, NULL);
 	BUG_ON(ret);
 
-	if (root == root->fs_info->extent_root)
-		btrfs_extent_post_op(trans, root);
-
 	return 0;
 }
 
@@ -6208,6 +5490,9 @@ again:
 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
 
+	trans = btrfs_start_transaction(info->tree_root, 1);
+	btrfs_commit_transaction(trans, info->tree_root);
+
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
@@ -6500,9 +5785,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 				sizeof(cache->item));
 	BUG_ON(ret);
 
-	finish_current_insert(trans, extent_root, 0);
-	ret = del_pending_extents(trans, extent_root, 0);
-	BUG_ON(ret);
 	set_avail_alloc_bits(extent_root->fs_info, type);
 
 	return 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc78954861b..c8007549764 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -643,7 +643,9 @@ next_slot:
 
 			if (disk_bytenr != 0) {
 				ret = btrfs_update_extent_ref(trans, root,
-						disk_bytenr, orig_parent,
+						disk_bytenr,
+						le64_to_cpu(old.disk_num_bytes),
+						orig_parent,
 						leaf->start,
 						root->root_key.objectid,
 						trans->transid, ins.objectid);
@@ -912,7 +914,7 @@ again:
 	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
 
 	if (orig_parent != leaf->start) {
-		ret = btrfs_update_extent_ref(trans, root, bytenr,
+		ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
 					      orig_parent, leaf->start,
 					      root->root_key.objectid,
 					      trans->transid, inode->i_ino);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d638c54d39e..f94c2ad8996 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -65,6 +65,12 @@ static noinline int join_transaction(struct btrfs_root *root)
 		cur_trans->use_count = 1;
 		cur_trans->commit_done = 0;
 		cur_trans->start_time = get_seconds();
+
+		cur_trans->delayed_refs.root.rb_node = NULL;
+		cur_trans->delayed_refs.num_entries = 0;
+		cur_trans->delayed_refs.flushing = 0;
+		spin_lock_init(&cur_trans->delayed_refs.lock);
+
 		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
 		extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +188,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->block_group = 0;
 	h->alloc_exclude_nr = 0;
 	h->alloc_exclude_start = 0;
+	h->delayed_ref_updates = 0;
 	root->fs_info->running_transaction->use_count++;
 	mutex_unlock(&root->fs_info->trans_mutex);
 	return h;
@@ -281,6 +288,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_fs_info *info = root->fs_info;
 
+	if (trans->delayed_ref_updates &&
+	    (trans->transaction->delayed_refs.flushing ||
+	    trans->transaction->delayed_refs.num_entries > 16384)) {
+		btrfs_run_delayed_refs(trans, root, trans->delayed_ref_updates);
+	} else if (trans->transaction->delayed_refs.num_entries > 64) {
+		wake_up_process(root->fs_info->transaction_kthread);
+	}
+
 	mutex_lock(&info->trans_mutex);
 	cur_trans = info->running_transaction;
 	WARN_ON(cur_trans != trans->transaction);
@@ -424,9 +439,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 	u64 old_root_bytenr;
 	struct btrfs_root *tree_root = root->fs_info->tree_root;
 
-	btrfs_extent_post_op(trans, root);
 	btrfs_write_dirty_block_groups(trans, root);
-	btrfs_extent_post_op(trans, root);
+
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	BUG_ON(ret);
 
 	while (1) {
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +454,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 				     btrfs_header_level(root->node));
 		btrfs_set_root_generation(&root->root_item, trans->transid);
 
-		btrfs_extent_post_op(trans, root);
-
 		ret = btrfs_update_root(trans, tree_root,
 					&root->root_key,
 					&root->root_item);
 		BUG_ON(ret);
 		btrfs_write_dirty_block_groups(trans, root);
-		btrfs_extent_post_op(trans, root);
+
+		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+		BUG_ON(ret);
 	}
 	return 0;
 }
@@ -459,15 +475,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct list_head *next;
 	struct extent_buffer *eb;
+	int ret;
 
-	btrfs_extent_post_op(trans, fs_info->tree_root);
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	BUG_ON(ret);
 
 	eb = btrfs_lock_root_node(fs_info->tree_root);
 	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 
-	btrfs_extent_post_op(trans, fs_info->tree_root);
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	BUG_ON(ret);
 
 	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +494,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 		root = list_entry(next, struct btrfs_root, dirty_list);
 
 		update_cowonly_root(trans, root);
+
+		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+		BUG_ON(ret);
 	}
 	return 0;
 }
@@ -895,6 +917,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	DEFINE_WAIT(wait);
 	int ret;
 
+	/* make a pass through all the delayed refs we have so far
+	 * any runnings procs may add more while we are here
+	 */
+	ret = btrfs_run_delayed_refs(trans, root, 0);
+	BUG_ON(ret);
+
+	/*
+	 * set the flushing flag so procs in this transaction have to
+	 * start sending their work down.
+	 */
+	trans->transaction->delayed_refs.flushing = 1;
+
+	ret = btrfs_run_delayed_refs(trans, root, (u64)-1);
+	BUG_ON(ret);
+
 	INIT_LIST_HEAD(&dirty_fs_roots);
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (trans->transaction->in_commit) {
@@ -969,6 +1006,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = create_pending_snapshots(trans, root->fs_info);
 	BUG_ON(ret);
 
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	BUG_ON(ret);
+
 	WARN_ON(cur_trans != trans->transaction);
 
 	/* btrfs_commit_tree_roots is responsible for getting the
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f88..94876709217 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,6 +19,7 @@
 #ifndef __BTRFS_TRANSACTION__
 #define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
+#include "delayed-ref.h"
 
 struct btrfs_transaction {
 	u64 transid;
@@ -34,6 +35,7 @@ struct btrfs_transaction {
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 	struct list_head pending_snapshots;
+	struct btrfs_delayed_ref_root delayed_refs;
 };
 
 struct btrfs_trans_handle {
@@ -44,6 +46,7 @@ struct btrfs_trans_handle {
 	u64 block_group;
 	u64 alloc_exclude_start;
 	u64 alloc_exclude_nr;
+	unsigned long delayed_ref_updates;
 };
 
 struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570..b10eacdb162 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	}
 
 	btrfs_release_path(root, path);
-	if (is_extent)
-		btrfs_extent_post_op(trans, root);
 out:
 	if (path)
 		btrfs_free_path(path);
-- 
cgit v1.2.3


From 44871b1b24b593996db43495cf4484cc580bdc10 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 13 Mar 2009 10:04:31 -0400
Subject: Btrfs: reduce stack usage in some crucial tree balancing functions

Many of the tree balancing functions follow the same pattern.

1) cow a block
2) do something to the result

This commit breaks them up into two functions so the variables and
code required for part two don't suck down stack during part one.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 458 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 278 insertions(+), 180 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index bebc9fd1666..3764248bdc0 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2266,66 +2266,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
 	return ret;
 }
 
-/*
- * push some data in the path leaf to the right, trying to free up at
- * least data_size bytes.  returns zero if the push worked, nonzero otherwise
- *
- * returns 1 if the push failed because the other node didn't have enough
- * room, 0 if everything worked out and < 0 if there were major errors.
- */
-static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct btrfs_path *path, int data_size,
-			   int empty)
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      int data_size, int empty,
+				      struct extent_buffer *right,
+				      int free_space, u32 left_nritems)
 {
 	struct extent_buffer *left = path->nodes[0];
-	struct extent_buffer *right;
-	struct extent_buffer *upper;
+	struct extent_buffer *upper = path->nodes[1];
 	struct btrfs_disk_key disk_key;
 	int slot;
 	u32 i;
-	int free_space;
 	int push_space = 0;
 	int push_items = 0;
 	struct btrfs_item *item;
-	u32 left_nritems;
 	u32 nr;
 	u32 right_nritems;
 	u32 data_end;
 	u32 this_item_size;
 	int ret;
 
-	slot = path->slots[1];
-	if (!path->nodes[1])
-		return 1;
-
-	upper = path->nodes[1];
-	if (slot >= btrfs_header_nritems(upper) - 1)
-		return 1;
-
-	btrfs_assert_tree_locked(path->nodes[1]);
-
-	right = read_node_slot(root, upper, slot + 1);
-	btrfs_tree_lock(right);
-	btrfs_set_lock_blocking(right);
-
-	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size)
-		goto out_unlock;
-
-	/* cow and double check */
-	ret = btrfs_cow_block(trans, root, right, upper,
-			      slot + 1, &right);
-	if (ret)
-		goto out_unlock;
-
-	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size)
-		goto out_unlock;
-
-	left_nritems = btrfs_header_nritems(left);
-	if (left_nritems == 0)
-		goto out_unlock;
-
 	if (empty)
 		nr = 0;
 	else
@@ -2334,6 +2295,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (path->slots[0] >= left_nritems)
 		push_space += data_size;
 
+	slot = path->slots[1];
 	i = left_nritems - 1;
 	while (i >= nr) {
 		item = btrfs_item_nr(left, i);
@@ -2464,25 +2426,83 @@ out_unlock:
 	return 1;
 }
 
+/*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+			   *root, struct btrfs_path *path, int data_size,
+			   int empty)
+{
+	struct extent_buffer *left = path->nodes[0];
+	struct extent_buffer *right;
+	struct extent_buffer *upper;
+	int slot;
+	int free_space;
+	u32 left_nritems;
+	int ret;
+
+	if (!path->nodes[1])
+		return 1;
+
+	slot = path->slots[1];
+	upper = path->nodes[1];
+	if (slot >= btrfs_header_nritems(upper) - 1)
+		return 1;
+
+	btrfs_assert_tree_locked(path->nodes[1]);
+
+	right = read_node_slot(root, upper, slot + 1);
+	btrfs_tree_lock(right);
+	btrfs_set_lock_blocking(right);
+
+	free_space = btrfs_leaf_free_space(root, right);
+	if (free_space < data_size)
+		goto out_unlock;
+
+	/* cow and double check */
+	ret = btrfs_cow_block(trans, root, right, upper,
+			      slot + 1, &right);
+	if (ret)
+		goto out_unlock;
+
+	free_space = btrfs_leaf_free_space(root, right);
+	if (free_space < data_size)
+		goto out_unlock;
+
+	left_nritems = btrfs_header_nritems(left);
+	if (left_nritems == 0)
+		goto out_unlock;
+
+	return __push_leaf_right(trans, root, path, data_size, empty,
+				right, free_space, left_nritems);
+out_unlock:
+	btrfs_tree_unlock(right);
+	free_extent_buffer(right);
+	return 1;
+}
+
 /*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
  */
-static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, int data_size,
-			  int empty)
+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct btrfs_path *path, int data_size,
+				     int empty, struct extent_buffer *left,
+				     int free_space, int right_nritems)
 {
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *right = path->nodes[0];
-	struct extent_buffer *left;
 	int slot;
 	int i;
-	int free_space;
 	int push_space = 0;
 	int push_items = 0;
 	struct btrfs_item *item;
 	u32 old_left_nritems;
-	u32 right_nritems;
 	u32 nr;
 	int ret = 0;
 	int wret;
@@ -2490,41 +2510,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	u32 old_left_item_size;
 
 	slot = path->slots[1];
-	if (slot == 0)
-		return 1;
-	if (!path->nodes[1])
-		return 1;
-
-	right_nritems = btrfs_header_nritems(right);
-	if (right_nritems == 0)
-		return 1;
-
-	btrfs_assert_tree_locked(path->nodes[1]);
-
-	left = read_node_slot(root, path->nodes[1], slot - 1);
-	btrfs_tree_lock(left);
-	btrfs_set_lock_blocking(left);
-
-	free_space = btrfs_leaf_free_space(root, left);
-	if (free_space < data_size) {
-		ret = 1;
-		goto out;
-	}
-
-	/* cow and double check */
-	ret = btrfs_cow_block(trans, root, left,
-			      path->nodes[1], slot - 1, &left);
-	if (ret) {
-		/* we hit -ENOSPC, but it isn't fatal here */
-		ret = 1;
-		goto out;
-	}
-
-	free_space = btrfs_leaf_free_space(root, left);
-	if (free_space < data_size) {
-		ret = 1;
-		goto out;
-	}
 
 	if (empty)
 		nr = right_nritems;
@@ -2691,6 +2676,154 @@ out:
 	return ret;
 }
 
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, struct btrfs_path *path, int data_size,
+			  int empty)
+{
+	struct extent_buffer *right = path->nodes[0];
+	struct extent_buffer *left;
+	int slot;
+	int free_space;
+	u32 right_nritems;
+	int ret = 0;
+
+	slot = path->slots[1];
+	if (slot == 0)
+		return 1;
+	if (!path->nodes[1])
+		return 1;
+
+	right_nritems = btrfs_header_nritems(right);
+	if (right_nritems == 0)
+		return 1;
+
+	btrfs_assert_tree_locked(path->nodes[1]);
+
+	left = read_node_slot(root, path->nodes[1], slot - 1);
+	btrfs_tree_lock(left);
+	btrfs_set_lock_blocking(left);
+
+	free_space = btrfs_leaf_free_space(root, left);
+	if (free_space < data_size) {
+		ret = 1;
+		goto out;
+	}
+
+	/* cow and double check */
+	ret = btrfs_cow_block(trans, root, left,
+			      path->nodes[1], slot - 1, &left);
+	if (ret) {
+		/* we hit -ENOSPC, but it isn't fatal here */
+		ret = 1;
+		goto out;
+	}
+
+	free_space = btrfs_leaf_free_space(root, left);
+	if (free_space < data_size) {
+		ret = 1;
+		goto out;
+	}
+
+	return __push_leaf_left(trans, root, path, data_size,
+			       empty, left, free_space, right_nritems);
+out:
+	btrfs_tree_unlock(left);
+	free_extent_buffer(left);
+	return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int copy_for_split(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_path *path,
+			       struct extent_buffer *l,
+			       struct extent_buffer *right,
+			       int slot, int mid, int nritems)
+{
+	int data_copy_size;
+	int rt_data_off;
+	int i;
+	int ret = 0;
+	int wret;
+	struct btrfs_disk_key disk_key;
+
+	nritems = nritems - mid;
+	btrfs_set_header_nritems(right, nritems);
+	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+
+	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+			   btrfs_item_nr_offset(mid),
+			   nritems * sizeof(struct btrfs_item));
+
+	copy_extent_buffer(right, l,
+		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+		     data_copy_size, btrfs_leaf_data(l) +
+		     leaf_data_end(root, l), data_copy_size);
+
+	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+		      btrfs_item_end_nr(l, mid);
+
+	for (i = 0; i < nritems; i++) {
+		struct btrfs_item *item = btrfs_item_nr(right, i);
+		u32 ioff;
+
+		if (!right->map_token) {
+			map_extent_buffer(right, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&right->map_token, &right->kaddr,
+					&right->map_start, &right->map_len,
+					KM_USER1);
+		}
+
+		ioff = btrfs_item_offset(right, item);
+		btrfs_set_item_offset(right, item, ioff + rt_data_off);
+	}
+
+	if (right->map_token) {
+		unmap_extent_buffer(right, right->map_token, KM_USER1);
+		right->map_token = NULL;
+	}
+
+	btrfs_set_header_nritems(l, mid);
+	ret = 0;
+	btrfs_item_key(right, &disk_key, 0);
+	wret = insert_ptr(trans, root, path, &disk_key, right->start,
+			  path->slots[1] + 1, 1);
+	if (wret)
+		ret = wret;
+
+	btrfs_mark_buffer_dirty(right);
+	btrfs_mark_buffer_dirty(l);
+	BUG_ON(path->slots[0] != slot);
+
+	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+	BUG_ON(ret);
+
+	if (mid <= slot) {
+		btrfs_tree_unlock(path->nodes[0]);
+		free_extent_buffer(path->nodes[0]);
+		path->nodes[0] = right;
+		path->slots[0] -= mid;
+		path->slots[1] += 1;
+	} else {
+		btrfs_tree_unlock(right);
+		free_extent_buffer(right);
+	}
+
+	BUG_ON(path->slots[0] < 0);
+
+	return ret;
+}
+
 /*
  * split the path's leaf in two, making sure there is at least data_size
  * available for the resulting leaf level of the path.
@@ -2708,14 +2841,10 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int mid;
 	int slot;
 	struct extent_buffer *right;
-	int data_copy_size;
-	int rt_data_off;
-	int i;
 	int ret = 0;
 	int wret;
 	int double_split;
 	int num_doubles = 0;
-	struct btrfs_disk_key disk_key;
 
 	/* first try to make some room by pushing left and right */
 	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
@@ -2767,11 +2896,14 @@ again:
 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
 			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
 			    BTRFS_UUID_SIZE);
+
 	if (mid <= slot) {
 		if (nritems == 1 ||
 		    leaf_space_used(l, mid, nritems - mid) + data_size >
 			BTRFS_LEAF_DATA_SIZE(root)) {
 			if (slot >= nritems) {
+				struct btrfs_disk_key disk_key;
+
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
 				btrfs_set_header_nritems(right, 0);
 				wret = insert_ptr(trans, root, path,
@@ -2799,6 +2931,8 @@ again:
 		if (leaf_space_used(l, 0, mid) + data_size >
 			BTRFS_LEAF_DATA_SIZE(root)) {
 			if (!extend && data_size && slot == 0) {
+				struct btrfs_disk_key disk_key;
+
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
 				btrfs_set_header_nritems(right, 0);
 				wret = insert_ptr(trans, root, path,
@@ -2831,76 +2965,16 @@ again:
 			}
 		}
 	}
-	nritems = nritems - mid;
-	btrfs_set_header_nritems(right, nritems);
-	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
-
-	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
-			   btrfs_item_nr_offset(mid),
-			   nritems * sizeof(struct btrfs_item));
-
-	copy_extent_buffer(right, l,
-		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
-		     data_copy_size, btrfs_leaf_data(l) +
-		     leaf_data_end(root, l), data_copy_size);
-
-	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
-		      btrfs_item_end_nr(l, mid);
 
-	for (i = 0; i < nritems; i++) {
-		struct btrfs_item *item = btrfs_item_nr(right, i);
-		u32 ioff;
-
-		if (!right->map_token) {
-			map_extent_buffer(right, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&right->map_token, &right->kaddr,
-					&right->map_start, &right->map_len,
-					KM_USER1);
-		}
-
-		ioff = btrfs_item_offset(right, item);
-		btrfs_set_item_offset(right, item, ioff + rt_data_off);
-	}
-
-	if (right->map_token) {
-		unmap_extent_buffer(right, right->map_token, KM_USER1);
-		right->map_token = NULL;
-	}
-
-	btrfs_set_header_nritems(l, mid);
-	ret = 0;
-	btrfs_item_key(right, &disk_key, 0);
-	wret = insert_ptr(trans, root, path, &disk_key, right->start,
-			  path->slots[1] + 1, 1);
-	if (wret)
-		ret = wret;
-
-	btrfs_mark_buffer_dirty(right);
-	btrfs_mark_buffer_dirty(l);
-	BUG_ON(path->slots[0] != slot);
-
-	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+	ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
 	BUG_ON(ret);
 
-	if (mid <= slot) {
-		btrfs_tree_unlock(path->nodes[0]);
-		free_extent_buffer(path->nodes[0]);
-		path->nodes[0] = right;
-		path->slots[0] -= mid;
-		path->slots[1] += 1;
-	} else {
-		btrfs_tree_unlock(right);
-		free_extent_buffer(right);
-	}
-
-	BUG_ON(path->slots[0] < 0);
-
 	if (double_split) {
 		BUG_ON(num_doubles != 0);
 		num_doubles++;
 		goto again;
 	}
+
 	return ret;
 }
 
@@ -3382,39 +3456,27 @@ out:
 }
 
 /*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
+ * this is a helper for btrfs_insert_empty_items, the main goal here is
+ * to save stack depth by doing the bulk of the work in a function
+ * that doesn't call btrfs_search_slot
  */
-int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct btrfs_path *path,
-			    struct btrfs_key *cpu_key, u32 *data_size,
-			    int nr)
+static noinline_for_stack int
+setup_items_for_insert(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, struct btrfs_path *path,
+		      struct btrfs_key *cpu_key, u32 *data_size,
+		      u32 total_data, u32 total_size, int nr)
 {
-	struct extent_buffer *leaf;
 	struct btrfs_item *item;
-	int ret = 0;
-	int slot;
-	int slot_orig;
 	int i;
 	u32 nritems;
-	u32 total_size = 0;
-	u32 total_data = 0;
 	unsigned int data_end;
 	struct btrfs_disk_key disk_key;
+	int ret;
+	struct extent_buffer *leaf;
+	int slot;
 
-	for (i = 0; i < nr; i++)
-		total_data += data_size[i];
-
-	total_size = total_data + (nr * sizeof(struct btrfs_item));
-	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
-	if (ret == 0)
-		return -EEXIST;
-	if (ret < 0)
-		goto out;
-
-	slot_orig = path->slots[0];
 	leaf = path->nodes[0];
+	slot = path->slots[0];
 
 	nritems = btrfs_header_nritems(leaf);
 	data_end = leaf_data_end(root, leaf);
@@ -3426,9 +3488,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		BUG();
 	}
 
-	slot = path->slots[0];
-	BUG_ON(slot < 0);
-
 	if (slot != nritems) {
 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
 
@@ -3484,11 +3543,13 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		data_end -= data_size[i];
 		btrfs_set_item_size(leaf, item, data_size[i]);
 	}
+
 	btrfs_set_header_nritems(leaf, nritems + nr);
 	btrfs_mark_buffer_dirty(leaf);
 
 	ret = 0;
 	if (slot == 0) {
+		struct btrfs_disk_key disk_key;
 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
 	}
@@ -3497,6 +3558,43 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		btrfs_print_leaf(root, leaf);
 		BUG();
 	}
+	return ret;
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 *data_size,
+			    int nr)
+{
+	struct extent_buffer *leaf;
+	int ret = 0;
+	int slot;
+	int i;
+	u32 total_size = 0;
+	u32 total_data = 0;
+
+	for (i = 0; i < nr; i++)
+		total_data += data_size[i];
+
+	total_size = total_data + (nr * sizeof(struct btrfs_item));
+	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+	if (ret == 0)
+		return -EEXIST;
+	if (ret < 0)
+		goto out;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	BUG_ON(slot < 0);
+
+	ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+			       total_data, total_size, nr);
+
 out:
 	btrfs_unlock_up_safe(path, 1);
 	return ret;
-- 
cgit v1.2.3


From 1887be66dcc3140a81d1299958a41fc0eedfa64f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 13 Mar 2009 10:11:24 -0400
Subject: Btrfs: try to cleanup delayed refs while freeing extents

When extents are freed, it is likely that we've removed the last
delayed reference update for the extent.  This checks the delayed
ref tree when things are freed, and if no ref updates area left it
immediately processes the delayed ref.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/delayed-ref.c | 18 ++++++++++++++
 fs/btrfs/delayed-ref.h |  5 ++--
 fs/btrfs/extent-tree.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 874565a1f63..3e7eeaf8640 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -510,6 +510,24 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * this does a simple search for the head node for a given extent.
+ * It must be called with the delayed ref spinlock held, and it returns
+ * the head node if any where found, or NULL if not.
+ */
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+	if (ref)
+		return btrfs_delayed_node_to_head(ref);
+	return NULL;
+}
+
 /*
  * add a delayed ref to the tree.  This does all of the accounting required
  * to make sure the delayed ref is eventually processed before this
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 37919e5c007..c345fee9f96 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -137,9 +137,8 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 			  u64 ref_generation, u64 owner_objectid, int action,
 			  int pin);
 
-struct btrfs_delayed_ref *
-btrfs_find_delayed_ref(struct btrfs_trans_handle *trans, u64 bytenr,
-		       u64 parent);
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_delayed_ref_node *ref,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9b5da2b013e..8471c79b087 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1021,6 +1021,7 @@ again:
 		if (!locked_ref && count == 0)
 			break;
 
+		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
 	if (run_all) {
@@ -1045,6 +1046,7 @@ again:
 				mutex_unlock(&head->mutex);
 
 				btrfs_put_delayed_ref(ref);
+				cond_resched();
 				goto again;
 			}
 			node = rb_next(node);
@@ -2361,6 +2363,68 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			    owner_objectid, pin, pin == 0, refs_to_drop);
 }
 
+/*
+ * when we free an extent, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well.  This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
+ */
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root, u64 bytenr)
+{
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	struct rb_node *node;
+	int ret;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (!head)
+		goto out;
+
+	node = rb_prev(&head->node.rb_node);
+	if (!node)
+		goto out;
+
+	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+
+	/* there are still entries for this ref, we can't drop it */
+	if (ref->bytenr == bytenr)
+		goto out;
+
+	/*
+	 * waiting for the lock here would deadlock.  If someone else has it
+	 * locked they are already in the process of dropping it anyway
+	 */
+	if (!mutex_trylock(&head->mutex))
+		goto out;
+
+	/*
+	 * at this point we have a head with no other entries.  Go
+	 * ahead and process it.
+	 */
+	head->node.in_tree = 0;
+	rb_erase(&head->node.rb_node, &delayed_refs->root);
+	delayed_refs->num_entries--;
+
+	/*
+	 * we don't take a ref on the node because we're removing it from the
+	 * tree, so we just steal the ref the tree was holding.
+	 */
+	spin_unlock(&delayed_refs->lock);
+
+	ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+				  &head->node, head->must_insert_reserved);
+	BUG_ON(ret);
+	btrfs_put_delayed_ref(&head->node);
+	return 0;
+out:
+	spin_unlock(&delayed_refs->lock);
+	return 0;
+}
+
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
@@ -2388,6 +2452,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 				       root_objectid, ref_generation,
 				       owner_objectid,
 				       BTRFS_DROP_DELAYED_REF, 1);
+		BUG_ON(ret);
+		ret = check_ref_cleanup(trans, root, bytenr);
+		BUG_ON(ret);
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From c3e69d58e86c3917ae4e9e31b4acf490a7cafe60 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 13 Mar 2009 10:17:05 -0400
Subject: Btrfs: process the delayed reference queue in clusters

The delayed reference queue maintains pending operations that need to
be done to the extent allocation tree.  These are processed by
finding records in the tree that are not currently being processed one at
a time.

This is slow because it uses lots of time searching through the rbtree
and because it creates lock contention on the extent allocation tree
when lots of different procs are running delayed refs at the same time.

This commit changes things to grab a cluster of refs for processing,
using a cursor into the rbtree as the starting point of the next search.
This way we walk smoothly through the rbtree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   1 +
 fs/btrfs/delayed-ref.c | 130 +++++++++++++++++++++++++++++++-----------
 fs/btrfs/delayed-ref.h |  17 +++++-
 fs/btrfs/disk-io.c     |  17 ------
 fs/btrfs/extent-tree.c | 149 ++++++++++++++++++++++++++++++++-----------------
 fs/btrfs/transaction.c |  25 ++++++---
 6 files changed, 226 insertions(+), 113 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ced5fd85dc3..08d9f8d1553 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -720,6 +720,7 @@ struct btrfs_fs_info {
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
 	struct mutex tree_reloc_mutex;
+
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 3e7eeaf8640..759fa247ced 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -93,7 +93,8 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
  * ref if it was able to find one, or NULL if nothing was in that spot
  */
 static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
-						  u64 bytenr, u64 parent)
+				  u64 bytenr, u64 parent,
+				  struct btrfs_delayed_ref_node **last)
 {
 	struct rb_node *n = root->rb_node;
 	struct btrfs_delayed_ref_node *entry;
@@ -102,6 +103,8 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
 	while (n) {
 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
 		WARN_ON(!entry->in_tree);
+		if (last)
+			*last = entry;
 
 		cmp = comp_entry(entry, bytenr, parent);
 		if (cmp < 0)
@@ -114,45 +117,99 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
 	return NULL;
 }
 
-/*
- * Locking on delayed refs is done by taking a lock on the head node,
- * which has the (impossible) parent id of (u64)-1.  Once a lock is held
- * on the head node, you're allowed (and required) to process all the
- * delayed refs for a given byte number in the tree.
- *
- * This will walk forward in the rbtree until it finds a head node it
- * is able to lock.  It might not lock the delayed ref you asked for,
- * and so it will return the one it did lock in next_ret and return 0.
- *
- * If no locks are taken, next_ret is set to null and 1 is returned.  This
- * means there are no more unlocked head nodes in the rbtree.
- */
-int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
-			   struct btrfs_delayed_ref_node *ref,
-			   struct btrfs_delayed_ref_head **next_ret)
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+			   struct btrfs_delayed_ref_head *head)
 {
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	assert_spin_locked(&delayed_refs->lock);
+	if (mutex_trylock(&head->mutex))
+		return 0;
+
+	atomic_inc(&head->node.refs);
+	spin_unlock(&delayed_refs->lock);
+
+	mutex_lock(&head->mutex);
+	spin_lock(&delayed_refs->lock);
+	if (!head->node.in_tree) {
+		mutex_unlock(&head->mutex);
+		btrfs_put_delayed_ref(&head->node);
+		return -EAGAIN;
+	}
+	btrfs_put_delayed_ref(&head->node);
+	return 0;
+}
+
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+			   struct list_head *cluster, u64 start)
+{
+	int count = 0;
+	struct btrfs_delayed_ref_root *delayed_refs;
 	struct rb_node *node;
+	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *head;
-	int ret = 0;
 
-	while (1) {
+	delayed_refs = &trans->transaction->delayed_refs;
+	if (start == 0) {
+		node = rb_first(&delayed_refs->root);
+	} else {
+		ref = NULL;
+		tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+		if (ref) {
+			struct btrfs_delayed_ref_node *tmp;
+
+			node = rb_prev(&ref->rb_node);
+			while (node) {
+				tmp = rb_entry(node,
+					       struct btrfs_delayed_ref_node,
+					       rb_node);
+				if (tmp->bytenr < start)
+					break;
+				ref = tmp;
+				node = rb_prev(&ref->rb_node);
+			}
+			node = &ref->rb_node;
+		} else
+			node = rb_first(&delayed_refs->root);
+	}
+again:
+	while (node && count < 32) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 		if (btrfs_delayed_ref_is_head(ref)) {
 			head = btrfs_delayed_node_to_head(ref);
-			if (mutex_trylock(&head->mutex)) {
-				*next_ret = head;
-				ret = 0;
+			if (list_empty(&head->cluster)) {
+				list_add_tail(&head->cluster, cluster);
+				delayed_refs->run_delayed_start =
+					head->node.bytenr;
+				count++;
+
+				WARN_ON(delayed_refs->num_heads_ready == 0);
+				delayed_refs->num_heads_ready--;
+			} else if (count) {
+				/* the goal of the clustering is to find extents
+				 * that are likely to end up in the same extent
+				 * leaf on disk.  So, we don't want them spread
+				 * all over the tree.  Stop now if we've hit
+				 * a head that was already in use
+				 */
 				break;
 			}
 		}
-		node = rb_next(&ref->rb_node);
-		if (!node) {
-			ret = 1;
-			*next_ret = NULL;
-			break;
-		}
-		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_next(node);
 	}
-	return ret;
+	if (count) {
+		return 0;
+	} else if (start) {
+		/*
+		 * we've gone to the end of the rbtree without finding any
+		 * clusters.  start from the beginning and try again
+		 */
+		start = 0;
+		node = rb_first(&delayed_refs->root);
+		goto again;
+	}
+	return 1;
 }
 
 /*
@@ -178,7 +235,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
-	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
 	if (ref) {
 		prev_node = rb_prev(&ref->rb_node);
 		if (!prev_node)
@@ -240,7 +297,7 @@ again:
 	}
 
 	spin_lock(&delayed_refs->lock);
-	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
 	if (ref) {
 		head = btrfs_delayed_node_to_head(ref);
 		if (mutex_trylock(&head->mutex)) {
@@ -384,7 +441,7 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_ref *full_ref;
-	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_head *head_ref = NULL;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int count_mod = 1;
 	int must_insert_reserved = 0;
@@ -428,6 +485,7 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 	if (btrfs_delayed_ref_is_head(ref)) {
 		head_ref = btrfs_delayed_node_to_head(ref);
 		head_ref->must_insert_reserved = must_insert_reserved;
+		INIT_LIST_HEAD(&head_ref->cluster);
 		mutex_init(&head_ref->mutex);
 	} else {
 		full_ref = btrfs_delayed_node_to_ref(ref);
@@ -453,6 +511,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 		 */
 		kfree(ref);
 	} else {
+		if (btrfs_delayed_ref_is_head(ref)) {
+			delayed_refs->num_heads++;
+			delayed_refs->num_heads_ready++;
+		}
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
 	}
@@ -522,7 +584,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 	struct btrfs_delayed_ref_root *delayed_refs;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
 	if (ref)
 		return btrfs_delayed_node_to_head(ref);
 	return NULL;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c345fee9f96..57153fcc347 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -68,6 +68,8 @@ struct btrfs_delayed_ref_head {
 	 */
 	struct mutex mutex;
 
+	struct list_head cluster;
+
 	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
@@ -115,12 +117,20 @@ struct btrfs_delayed_ref_root {
 	 */
 	unsigned long num_entries;
 
+	/* total number of head nodes in tree */
+	unsigned long num_heads;
+
+	/* total number of head nodes ready for processing */
+	unsigned long num_heads_ready;
+
 	/*
 	 * set when the tree is flushing before a transaction commit,
 	 * used by the throttling code to decide if new updates need
 	 * to be run right away
 	 */
 	int flushing;
+
+	u64 run_delayed_start;
 };
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -140,9 +150,6 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
-			   struct btrfs_delayed_ref_node *ref,
-			   struct btrfs_delayed_ref_head **next_ret);
 int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 bytenr,
 			    u64 num_bytes, u32 *refs);
@@ -151,6 +158,10 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 			  u64 parent, u64 orig_ref_root, u64 ref_root,
 			  u64 orig_ref_generation, u64 ref_generation,
 			  u64 owner_objectid, int pin);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+			   struct btrfs_delayed_ref_head *head);
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+			   struct list_head *cluster, u64 search_start);
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4f43e227a29..1f1d89b1881 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1458,7 +1458,6 @@ static int transaction_kthread(void *arg)
 	struct btrfs_root *root = arg;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_transaction *cur;
-	struct btrfs_fs_info *info = root->fs_info;
 	unsigned long now;
 	unsigned long delay;
 	int ret;
@@ -1481,24 +1480,8 @@ static int transaction_kthread(void *arg)
 
 		now = get_seconds();
 		if (now < cur->start_time || now - cur->start_time < 30) {
-			unsigned long num_delayed;
-			num_delayed = cur->delayed_refs.num_entries;
 			mutex_unlock(&root->fs_info->trans_mutex);
 			delay = HZ * 5;
-
-			/*
-			 * we may have been woken up early to start
-			 * processing the delayed extent ref updates
-			 * If so, run some of them and then loop around again
-			 * to see if we need to force a commit
-			 */
-			if (num_delayed > 64) {
-				mutex_unlock(&info->transaction_kthread_mutex);
-				trans = btrfs_start_transaction(root, 1);
-				btrfs_run_delayed_refs(trans, root, 256);
-				btrfs_end_transaction(trans, root);
-				continue;
-			}
 			goto sleep;
 		}
 		mutex_unlock(&root->fs_info->trans_mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8471c79b087..3b8b6c21270 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -926,52 +926,41 @@ again:
 	return NULL;
 }
 
-/*
- * this starts processing the delayed reference count updates and
- * extent insertions we have queued up so far.  count can be
- * 0, which means to process everything in the tree at the start
- * of the run (but not newly added entries), or it can be some target
- * number you'd like to process.
- */
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, unsigned long count)
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct list_head *cluster)
 {
-	struct rb_node *node;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
 	int ret;
+	int count = 0;
 	int must_insert_reserved = 0;
-	int run_all = count == (unsigned long)-1;
-
-	if (root == root->fs_info->extent_root)
-		root = root->fs_info->tree_root;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-again:
-	spin_lock(&delayed_refs->lock);
-	if (count == 0)
-		count = delayed_refs->num_entries;
 	while (1) {
 		if (!locked_ref) {
-			/*
-			 * no locked ref, go find something we can
-			 * process in the rbtree.  We start at
-			 * the beginning of the tree, there may be less
-			 * lock contention if we do something smarter here.
-			 */
-			node = rb_first(&delayed_refs->root);
-			if (!node) {
-				spin_unlock(&delayed_refs->lock);
+			/* pick a new head ref from the cluster list */
+			if (list_empty(cluster))
 				break;
-			}
 
-			ref = rb_entry(node, struct btrfs_delayed_ref_node,
-				       rb_node);
-			ret = btrfs_lock_delayed_ref(trans, ref, &locked_ref);
-			if (ret) {
-				spin_unlock(&delayed_refs->lock);
-				break;
+			locked_ref = list_entry(cluster->next,
+				     struct btrfs_delayed_ref_head, cluster);
+
+			/* grab the lock that says we are going to process
+			 * all the refs for this head */
+			ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+			/*
+			 * we may have dropped the spin lock to get the head
+			 * mutex lock, and that might have given someone else
+			 * time to free the head.  If that's true, it has been
+			 * removed from our list and we can move on.
+			 */
+			if (ret == -EAGAIN) {
+				locked_ref = NULL;
+				count++;
+				continue;
 			}
 		}
 
@@ -986,7 +975,6 @@ again:
 		 * locked_ref is the head node, so we have to go one
 		 * node back for any delayed ref updates
 		 */
-
 		ref = select_delayed_ref(locked_ref);
 		if (!ref) {
 			/* All delayed refs have been processed, Go ahead
@@ -994,6 +982,7 @@ again:
 			 * so that any accounting fixes can happen
 			 */
 			ref = &locked_ref->node;
+			list_del_init(&locked_ref->cluster);
 			locked_ref = NULL;
 		}
 
@@ -1007,30 +996,72 @@ again:
 		BUG_ON(ret);
 		btrfs_put_delayed_ref(ref);
 
-		/* once we lock the head ref, we have to process all the
-		 * entries for it.  So, we might end up doing more entries
-		 * that count was asking us to do.
-		 */
-		if (count > 0)
-			count--;
+		count++;
+		cond_resched();
+		spin_lock(&delayed_refs->lock);
+	}
+	return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far.  count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, unsigned long count)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	struct list_head cluster;
+	int ret;
+	int run_all = count == (unsigned long)-1;
+	int run_most = 0;
+
+	if (root == root->fs_info->extent_root)
+		root = root->fs_info->tree_root;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	INIT_LIST_HEAD(&cluster);
+again:
+	spin_lock(&delayed_refs->lock);
+	if (count == 0) {
+		count = delayed_refs->num_entries * 2;
+		run_most = 1;
+	}
+	while (1) {
+		if (!(run_all || run_most) &&
+		    delayed_refs->num_heads_ready < 64)
+			break;
 
 		/*
-		 * we set locked_ref to null above if we're all done
-		 * with this bytenr
+		 * go find something we can process in the rbtree.  We start at
+		 * the beginning of the tree, and then build a cluster
+		 * of refs to process starting at the first one we are able to
+		 * lock
 		 */
-		if (!locked_ref && count == 0)
+		ret = btrfs_find_ref_cluster(trans, &cluster,
+					     delayed_refs->run_delayed_start);
+		if (ret)
 			break;
 
-		cond_resched();
-		spin_lock(&delayed_refs->lock);
+		ret = run_clustered_refs(trans, root, &cluster);
+		BUG_ON(ret < 0);
+
+		count -= min_t(unsigned long, ret, count);
+
+		if (count == 0)
+			break;
 	}
+
 	if (run_all) {
-		spin_lock(&delayed_refs->lock);
 		node = rb_first(&delayed_refs->root);
-		if (!node) {
-			spin_unlock(&delayed_refs->lock);
+		if (!node)
 			goto out;
-		}
+		count = (unsigned long)-1;
 
 		while (node) {
 			ref = rb_entry(node, struct btrfs_delayed_ref_node,
@@ -1052,11 +1083,11 @@ again:
 			node = rb_next(node);
 		}
 		spin_unlock(&delayed_refs->lock);
-		count = (unsigned long)-1;
 		schedule_timeout(1);
 		goto again;
 	}
 out:
+	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
 
@@ -2407,12 +2438,18 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	 */
 	head->node.in_tree = 0;
 	rb_erase(&head->node.rb_node, &delayed_refs->root);
+
 	delayed_refs->num_entries--;
 
 	/*
 	 * we don't take a ref on the node because we're removing it from the
 	 * tree, so we just steal the ref the tree was holding.
 	 */
+	delayed_refs->num_heads--;
+	if (list_empty(&head->cluster))
+		delayed_refs->num_heads_ready--;
+
+	list_del_init(&head->cluster);
 	spin_unlock(&delayed_refs->lock);
 
 	ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
@@ -3705,6 +3742,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_path *path;
 	int i;
 	int orig_level;
+	int update_count;
 	struct btrfs_root_item *root_item = &root->root_item;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -3746,6 +3784,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 	}
 	while (1) {
+		unsigned long update;
 		wret = walk_down_tree(trans, root, path, &level);
 		if (wret > 0)
 			break;
@@ -3764,6 +3803,14 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 		atomic_inc(&root->fs_info->throttle_gen);
 		wake_up(&root->fs_info->transaction_throttle);
+		for (update_count = 0; update_count < 16; update_count++) {
+			update = trans->delayed_ref_updates;
+			trans->delayed_ref_updates = 0;
+			if (update)
+				btrfs_run_delayed_refs(trans, root, update);
+			else
+				break;
+		}
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f94c2ad8996..903edab3659 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -68,7 +68,10 @@ static noinline int join_transaction(struct btrfs_root *root)
 
 		cur_trans->delayed_refs.root.rb_node = NULL;
 		cur_trans->delayed_refs.num_entries = 0;
+		cur_trans->delayed_refs.num_heads_ready = 0;
+		cur_trans->delayed_refs.num_heads = 0;
 		cur_trans->delayed_refs.flushing = 0;
+		cur_trans->delayed_refs.run_delayed_start = 0;
 		spin_lock_init(&cur_trans->delayed_refs.lock);
 
 		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
@@ -287,13 +290,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_fs_info *info = root->fs_info;
-
-	if (trans->delayed_ref_updates &&
-	    (trans->transaction->delayed_refs.flushing ||
-	    trans->transaction->delayed_refs.num_entries > 16384)) {
-		btrfs_run_delayed_refs(trans, root, trans->delayed_ref_updates);
-	} else if (trans->transaction->delayed_refs.num_entries > 64) {
-		wake_up_process(root->fs_info->transaction_kthread);
+	int count = 0;
+
+	while (count < 4) {
+		unsigned long cur = trans->delayed_ref_updates;
+		trans->delayed_ref_updates = 0;
+		if (cur &&
+		    trans->transaction->delayed_refs.num_heads_ready > 64) {
+			trans->delayed_ref_updates = 0;
+			btrfs_run_delayed_refs(trans, root, cur);
+		} else {
+			break;
+		}
+		count++;
 	}
 
 	mutex_lock(&info->trans_mutex);
@@ -929,7 +938,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 */
 	trans->transaction->delayed_refs.flushing = 1;
 
-	ret = btrfs_run_delayed_refs(trans, root, (u64)-1);
+	ret = btrfs_run_delayed_refs(trans, root, 0);
 	BUG_ON(ret);
 
 	INIT_LIST_HEAD(&dirty_fs_roots);
-- 
cgit v1.2.3


From b7ec40d7845bffca8bb3af2ea3f192d6257bbe21 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Mar 2009 20:12:45 -0400
Subject: Btrfs: reduce stalls during transaction commit

To avoid deadlocks and reduce latencies during some critical operations, some
transaction writers are allowed to jump into the running transaction and make
it run a little longer, while others sit around and wait for the commit to
finish.

This is a bit unfair, especially when the callers that jump in do a bunch
of IO that makes all the others procs on the box wait.  This commit
reduces the stalls this produces by pre-reading file extent pointers
during btrfs_finish_ordered_io before the transaction is joined.

It also tunes the drop_snapshot code to politely wait for transactions
that have started writing out their delayed refs to finish.  This avoids
new delayed refs being flooded into the queue while we're trying to
close off the transaction.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  3 ++-
 fs/btrfs/inode.c       | 18 +++++++++++++++
 fs/btrfs/transaction.c | 62 ++++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/transaction.h |  5 ++++
 4 files changed, 80 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3b8b6c21270..a421c32c6cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3797,7 +3797,8 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		if (wret < 0)
 			ret = wret;
-		if (trans->transaction->in_commit) {
+		if (trans->transaction->in_commit ||
+		    trans->transaction->delayed_refs.flushing) {
 			ret = -EAGAIN;
 			break;
 		}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d4f948bc22..13a17477c4f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1502,6 +1502,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_path *path;
 	int compressed = 0;
 	int ret;
 
@@ -1509,6 +1510,23 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	if (!ret)
 		return 0;
 
+	/*
+	 * before we join the transaction, try to do some of our IO.
+	 * This will limit the amount of IO that we have to do with
+	 * the transaction running.  We're unlikely to need to do any
+	 * IO if the file extents are new, the disk_i_size checks
+	 * covers the most common case.
+	 */
+	if (start < BTRFS_I(inode)->disk_i_size) {
+		path = btrfs_alloc_path();
+		if (path) {
+			ret = btrfs_lookup_file_extent(NULL, root, path,
+						       inode->i_ino,
+						       start, 0);
+			btrfs_free_path(path);
+		}
+	}
+
 	trans = btrfs_join_transaction(root, 1);
 
 	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 903edab3659..01c9620bb00 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -192,6 +192,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->alloc_exclude_nr = 0;
 	h->alloc_exclude_start = 0;
 	h->delayed_ref_updates = 0;
+
 	root->fs_info->running_transaction->use_count++;
 	mutex_unlock(&root->fs_info->trans_mutex);
 	return h;
@@ -281,7 +282,6 @@ void btrfs_throttle(struct btrfs_root *root)
 	if (!root->fs_info->open_ioctl_trans)
 		wait_current_trans(root);
 	mutex_unlock(&root->fs_info->trans_mutex);
-
 	throttle_on_drops(root);
 }
 
@@ -298,6 +298,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		if (cur &&
 		    trans->transaction->delayed_refs.num_heads_ready > 64) {
 			trans->delayed_ref_updates = 0;
+
+			/*
+			 * do a full flush if the transaction is trying
+			 * to close
+			 */
+			if (trans->transaction->delayed_refs.flushing)
+				cur = 0;
 			btrfs_run_delayed_refs(trans, root, cur);
 		} else {
 			break;
@@ -665,6 +672,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 	return 0;
 }
 
+/*
+ * when dropping snapshots, we generate a ton of delayed refs, and it makes
+ * sense not to join the transaction while it is trying to flush the current
+ * queue of delayed refs out.
+ *
+ * This is used by the drop snapshot code only
+ */
+static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
+{
+	DEFINE_WAIT(wait);
+
+	mutex_lock(&info->trans_mutex);
+	while (info->running_transaction &&
+	       info->running_transaction->delayed_refs.flushing) {
+		prepare_to_wait(&info->transaction_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&info->trans_mutex);
+		schedule();
+		mutex_lock(&info->trans_mutex);
+		finish_wait(&info->transaction_wait, &wait);
+	}
+	mutex_unlock(&info->trans_mutex);
+	return 0;
+}
+
 /*
  * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
  * all of them
@@ -692,7 +724,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		atomic_inc(&root->fs_info->throttles);
 
 		while (1) {
+			/*
+			 * we don't want to jump in and create a bunch of
+			 * delayed refs if the transaction is starting to close
+			 */
+			wait_transaction_pre_flush(tree_root->fs_info);
 			trans = btrfs_start_transaction(tree_root, 1);
+
+			/*
+			 * we've joined a transaction, make sure it isn't
+			 * closing right now
+			 */
+			if (trans->transaction->delayed_refs.flushing) {
+				btrfs_end_transaction(trans, tree_root);
+				continue;
+			}
+
 			mutex_lock(&root->fs_info->drop_mutex);
 			ret = btrfs_drop_snapshot(trans, dirty->root);
 			if (ret != -EAGAIN)
@@ -932,20 +979,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = btrfs_run_delayed_refs(trans, root, 0);
 	BUG_ON(ret);
 
+	cur_trans = trans->transaction;
 	/*
 	 * set the flushing flag so procs in this transaction have to
 	 * start sending their work down.
 	 */
-	trans->transaction->delayed_refs.flushing = 1;
+	cur_trans->delayed_refs.flushing = 1;
 
 	ret = btrfs_run_delayed_refs(trans, root, 0);
 	BUG_ON(ret);
 
-	INIT_LIST_HEAD(&dirty_fs_roots);
 	mutex_lock(&root->fs_info->trans_mutex);
-	if (trans->transaction->in_commit) {
-		cur_trans = trans->transaction;
-		trans->transaction->use_count++;
+	INIT_LIST_HEAD(&dirty_fs_roots);
+	if (cur_trans->in_commit) {
+		cur_trans->use_count++;
 		mutex_unlock(&root->fs_info->trans_mutex);
 		btrfs_end_transaction(trans, root);
 
@@ -968,7 +1015,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	trans->transaction->in_commit = 1;
 	trans->transaction->blocked = 1;
-	cur_trans = trans->transaction;
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
 		prev_trans = list_entry(cur_trans->list.prev,
 					struct btrfs_transaction, list);
@@ -1081,6 +1127,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_copy_pinned(root, pinned_copy);
 
 	trans->transaction->blocked = 0;
+
 	wake_up(&root->fs_info->transaction_throttle);
 	wake_up(&root->fs_info->transaction_wait);
 
@@ -1107,6 +1154,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	mutex_lock(&root->fs_info->trans_mutex);
 
 	cur_trans->commit_done = 1;
+
 	root->fs_info->last_trans_committed = cur_trans->transid;
 	wake_up(&cur_trans->commit_wait);
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 94876709217..94f5bde2b58 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -23,7 +23,12 @@
 
 struct btrfs_transaction {
 	u64 transid;
+	/*
+	 * total writers in this transaction, it must be zero before the
+	 * transaction can end
+	 */
 	unsigned long num_writers;
+
 	unsigned long num_joined;
 	int in_commit;
 	int use_count;
-- 
cgit v1.2.3


From 7f366cfecfc126731f8ac505d72026d691dac79a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Mar 2009 20:12:45 -0400
Subject: Btrfs: reduce stack in cow_file_range

The fs/btrfs/inode.c code to run delayed allocation during writout
needed some stack usage optimization.  This is the first pass, it does
the check for compression earlier on, which allows us to do the common
(no compression) case higher up in the call chain.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 13a17477c4f..c427011dc45 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -204,7 +204,7 @@ fail:
  * does the checks required to make sure the data is small enough
  * to fit as an inline extent.
  */
-static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct inode *inode, u64 start, u64 end,
 				 size_t compressed_size,
@@ -854,11 +854,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	u64 cur_end;
 	int limit = 10 * 1024 * 1042;
 
-	if (!btrfs_test_opt(root, COMPRESS)) {
-		return cow_file_range(inode, locked_page, start, end,
-				      page_started, nr_written, 1);
-	}
-
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
 			 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
 	while (start < end) {
@@ -935,7 +930,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
  * If no cow copies or snapshots exist, we write directly to the existing
  * blocks on disk
  */
-static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+static noinline int run_delalloc_nocow(struct inode *inode,
+				       struct page *locked_page,
 			      u64 start, u64 end, int *page_started, int force,
 			      unsigned long *nr_written)
 {
@@ -1133,6 +1129,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 			      unsigned long *nr_written)
 {
 	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 
 	if (btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1137,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	else if (btrfs_test_flag(inode, PREALLOC))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
+	else if (!btrfs_test_opt(root, COMPRESS))
+		ret = cow_file_range(inode, locked_page, start, end,
+				      page_started, nr_written, 1);
 	else
 		ret = cow_file_range_async(inode, locked_page, start, end,
 					   page_started, nr_written);
-
 	return ret;
 }
 
-- 
cgit v1.2.3


From 66d7e85ea7c3628189d19b265495358f756cb463 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Mar 2009 20:12:45 -0400
Subject: Btrfs: Check for a blocking lock before taking the spin

This reduces contention on the extent buffer spin locks by testing for a
blocking lock before trying to take the spinlock.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/locking.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 47b0a88c12a..6d8db2f5c38 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
 static int btrfs_spin_on_block(struct extent_buffer *eb)
 {
 	int i;
+
 	for (i = 0; i < 512; i++) {
-		cpu_relax();
 		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
 			return 1;
 		if (need_resched())
 			break;
+		cpu_relax();
 	}
 	return 0;
 }
@@ -102,6 +103,7 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
 
 	/* spin for a bit on the BLOCKING flag */
 	for (i = 0; i < 2; i++) {
+		cpu_relax();
 		if (!btrfs_spin_on_block(eb))
 			break;
 
@@ -148,6 +150,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
 	DEFINE_WAIT(wait);
 	wait.func = btrfs_wake_function;
 
+	if (!btrfs_spin_on_block(eb))
+		goto sleep;
+
 	while(1) {
 		spin_nested(eb);
 
@@ -165,9 +170,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
 		 * spin for a bit, and if the blocking flag goes away,
 		 * loop around
 		 */
+		cpu_relax();
 		if (btrfs_spin_on_block(eb))
 			continue;
-
+sleep:
 		prepare_to_wait_exclusive(&eb->lock_wq, &wait,
 					  TASK_UNINTERRUPTIBLE);
 
-- 
cgit v1.2.3


From 89573b9c516b24af8a3b9958dd5afca8fa874e3d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Mar 2009 20:12:45 -0400
Subject: Btrfs: Only let very young transactions grow during commit

Commits are fairly expensive, and so btrfs has code to sit around for a while
during the commit and let new writers come in.

But, while we're sitting there, new delayed refs might be added, and those
can be expensive to process as well.  Unless the transaction is very very
young, it makes sense to go ahead and let the commit finish without hanging
around.

The commit grow loop isn't as important as it used to be, the fsync logging
code handles most performance critical syncs now.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 01c9620bb00..9c8f158dd2d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -972,6 +972,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	struct extent_io_tree *pinned_copy;
 	DEFINE_WAIT(wait);
 	int ret;
+	int should_grow = 0;
+	unsigned long now = get_seconds();
 
 	/* make a pass through all the delayed refs we have so far
 	 * any runnings procs may add more while we are here
@@ -1029,6 +1031,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		}
 	}
 
+	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
+		should_grow = 1;
+
 	do {
 		int snap_pending = 0;
 		joined = cur_trans->num_joined;
@@ -1041,7 +1046,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		if (cur_trans->num_writers > 1)
 			timeout = MAX_SCHEDULE_TIMEOUT;
-		else
+		else if (should_grow)
 			timeout = 1;
 
 		mutex_unlock(&root->fs_info->trans_mutex);
@@ -1051,12 +1056,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 
-		schedule_timeout(timeout);
+		smp_mb();
+		if (cur_trans->num_writers > 1 || should_grow)
+			schedule_timeout(timeout);
 
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
 	} while (cur_trans->num_writers > 1 ||
-		 (cur_trans->num_joined != joined));
+		 (should_grow && cur_trans->num_joined != joined));
 
 	ret = create_pending_snapshots(trans, root->fs_info);
 	BUG_ON(ret);
-- 
cgit v1.2.3


From b9473439d3e84d9fc1a0a83faca69cc1b7566341 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 13 Mar 2009 11:00:37 -0400
Subject: Btrfs: leave btree locks spinning more often

btrfs_mark_buffer dirty would set dirty bits in the extent_io tree
for the buffers it was dirtying.  This may require a kmalloc and it
was not atomic.  So, anyone who called btrfs_mark_buffer_dirty had to
set any btree locks they were holding to blocking first.

This commit changes dirty tracking for extent buffers to just use a flag
in the extent buffer.  Now that we have one and only one extent buffer
per page, this can be safely done without losing dirty bits along the way.

This also introduces a path->leave_spinning flag that callers of
btrfs_search_slot can use to indicate they will properly deal with a
path returned where all the locks are spinning instead of blocking.

Many of the btree search callers now expect spinning paths,
resulting in better btree concurrency overall.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 19 ++++++++------
 fs/btrfs/ctree.h       | 12 ++++++---
 fs/btrfs/dir-item.c    |  3 +++
 fs/btrfs/disk-io.c     | 67 ++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/disk-io.h     |  1 +
 fs/btrfs/extent-tree.c | 69 ++++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/extent_io.c   | 51 +++++++------------------------------
 fs/btrfs/extent_io.h   |  3 +++
 fs/btrfs/file-item.c   |  7 +++--
 fs/btrfs/file.c        |  4 +++
 fs/btrfs/inode-item.c  |  3 +++
 fs/btrfs/inode.c       | 17 ++++++++++---
 fs/btrfs/locking.c     | 11 ++++----
 fs/btrfs/tree-log.c    |  1 -
 14 files changed, 172 insertions(+), 96 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3764248bdc0..8686a3d2ab3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1684,7 +1684,8 @@ done:
 	 * we don't really know what they plan on doing with the path
 	 * from here on, so for now just mark it as blocking
 	 */
-	btrfs_set_path_blocking(p);
+	if (!p->leave_spinning)
+		btrfs_set_path_blocking(p);
 	return ret;
 }
 
@@ -3032,26 +3033,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 		return -EAGAIN;
 	}
 
+	btrfs_set_path_blocking(path);
 	ret = split_leaf(trans, root, &orig_key, path,
 			 sizeof(struct btrfs_item), 1);
 	path->keep_locks = 0;
 	BUG_ON(ret);
 
+	btrfs_unlock_up_safe(path, 1);
+	leaf = path->nodes[0];
+	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+
+split:
 	/*
 	 * make sure any changes to the path from split_leaf leave it
 	 * in a blocking state
 	 */
 	btrfs_set_path_blocking(path);
 
-	leaf = path->nodes[0];
-	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
-
-split:
 	item = btrfs_item_nr(leaf, path->slots[0]);
 	orig_offset = btrfs_item_offset(leaf, item);
 	item_size = btrfs_item_size(leaf, item);
 
-
 	buf = kmalloc(item_size, GFP_NOFS);
 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
 			    path->slots[0]), item_size);
@@ -3545,7 +3547,6 @@ setup_items_for_insert(struct btrfs_trans_handle *trans,
 	}
 
 	btrfs_set_header_nritems(leaf, nritems + nr);
-	btrfs_mark_buffer_dirty(leaf);
 
 	ret = 0;
 	if (slot == 0) {
@@ -3553,6 +3554,8 @@ setup_items_for_insert(struct btrfs_trans_handle *trans,
 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
 	}
+	btrfs_unlock_up_safe(path, 1);
+	btrfs_mark_buffer_dirty(leaf);
 
 	if (btrfs_leaf_free_space(root, leaf) < 0) {
 		btrfs_print_leaf(root, leaf);
@@ -3596,7 +3599,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 			       total_data, total_size, nr);
 
 out:
-	btrfs_unlock_up_safe(path, 1);
 	return ret;
 }
 
@@ -3792,6 +3794,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			slot = path->slots[1];
 			extent_buffer_get(leaf);
 
+			btrfs_set_path_blocking(path);
 			wret = push_leaf_left(trans, root, path, 1, 1);
 			if (wret < 0 && wret != -ENOSPC)
 				ret = wret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 08d9f8d1553..4ddce91cf3f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -401,15 +401,16 @@ struct btrfs_path {
 	int locks[BTRFS_MAX_LEVEL];
 	int reada;
 	/* keep some upper locks as we walk down */
-	int keep_locks;
-	int skip_locking;
 	int lowest_level;
 
 	/*
 	 * set by btrfs_split_item, tells search_slot to keep all locks
 	 * and to force calls to keep space in the nodes
 	 */
-	int search_for_split;
+	unsigned int search_for_split:1;
+	unsigned int keep_locks:1;
+	unsigned int skip_locking:1;
+	unsigned int leave_spinning:1;
 };
 
 /*
@@ -779,6 +780,11 @@ struct btrfs_fs_info {
 	atomic_t throttle_gen;
 
 	u64 total_pinned;
+
+	/* protected by the delalloc lock, used to keep from writing
+	 * metadata until there is a nice batch
+	 */
+	u64 dirty_metadata_bytes;
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7..1d70236ba00 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	key.objectid = dir;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
 	key.offset = btrfs_name_hash(name, name_len);
+
 	path = btrfs_alloc_path();
+	path->leave_spinning = 1;
+
 	data_size = sizeof(*dir_item) + name_len;
 	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
 					name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1f1d89b1881..9244cd7313d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -668,14 +668,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+	struct extent_buffer *eb;
+	int was_dirty;
+
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	if (!(current->flags & PF_MEMALLOC)) {
+		return extent_write_full_page(tree, page,
+					      btree_get_extent, wbc);
+	}
 
-	if (current->flags & PF_MEMALLOC) {
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return 0;
+	redirty_page_for_writepage(wbc, page);
+	eb = btrfs_find_tree_block(root, page_offset(page),
+				      PAGE_CACHE_SIZE);
+	WARN_ON(!eb);
+
+	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+	if (!was_dirty) {
+		spin_lock(&root->fs_info->delalloc_lock);
+		root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
+		spin_unlock(&root->fs_info->delalloc_lock);
 	}
-	return extent_write_full_page(tree, page, btree_get_extent, wbc);
+	free_extent_buffer(eb);
+
+	unlock_page(page);
+	return 0;
 }
 
 static int btree_writepages(struct address_space *mapping,
@@ -684,15 +701,15 @@ static int btree_writepages(struct address_space *mapping,
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(mapping->host)->io_tree;
 	if (wbc->sync_mode == WB_SYNC_NONE) {
+		struct btrfs_root *root = BTRFS_I(mapping->host)->root;
 		u64 num_dirty;
-		u64 start = 0;
 		unsigned long thresh = 32 * 1024 * 1024;
 
 		if (wbc->for_kupdate)
 			return 0;
 
-		num_dirty = count_range_bits(tree, &start, (u64)-1,
-					     thresh, EXTENT_DIRTY);
+		/* this is a bit racy, but that's ok */
+		num_dirty = root->fs_info->dirty_metadata_bytes;
 		if (num_dirty < thresh)
 			return 0;
 	}
@@ -859,9 +876,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	    root->fs_info->running_transaction->transid) {
 		btrfs_assert_tree_locked(buf);
 
-		/* ugh, clear_extent_buffer_dirty can be expensive */
-		btrfs_set_lock_blocking(buf);
+		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+			spin_lock(&root->fs_info->delalloc_lock);
+			if (root->fs_info->dirty_metadata_bytes >= buf->len)
+				root->fs_info->dirty_metadata_bytes -= buf->len;
+			else
+				WARN_ON(1);
+			spin_unlock(&root->fs_info->delalloc_lock);
+		}
 
+		/* ugh, clear_extent_buffer_dirty needs to lock the page */
+		btrfs_set_lock_blocking(buf);
 		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
 	}
@@ -2348,8 +2373,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
-
-	btrfs_set_lock_blocking(buf);
+	int was_dirty;
 
 	btrfs_assert_tree_locked(buf);
 	if (transid != root->fs_info->generation) {
@@ -2360,7 +2384,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 			(unsigned long long)root->fs_info->generation);
 		WARN_ON(1);
 	}
-	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+	was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+					    buf);
+	if (!was_dirty) {
+		spin_lock(&root->fs_info->delalloc_lock);
+		root->fs_info->dirty_metadata_bytes += buf->len;
+		spin_unlock(&root->fs_info->delalloc_lock);
+	}
 }
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2400,6 +2430,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 int btree_lock_page_hook(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_buffer *eb;
 	unsigned long len;
@@ -2415,6 +2446,16 @@ int btree_lock_page_hook(struct page *page)
 
 	btrfs_tree_lock(eb);
 	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+
+	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+		spin_lock(&root->fs_info->delalloc_lock);
+		if (root->fs_info->dirty_metadata_bytes >= eb->len)
+			root->fs_info->dirty_metadata_bytes -= eb->len;
+		else
+			WARN_ON(1);
+		spin_unlock(&root->fs_info->delalloc_lock);
+	}
+
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95029db227b..c958ecbc191 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a421c32c6cf..8933d15a240 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -56,9 +56,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 					 int ref_mod);
 static int update_reserved_extents(struct btrfs_root *root,
 				   u64 bytenr, u64 num, int reserve);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  u64 bytenr, u64 num_bytes, int is_data);
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
@@ -618,6 +615,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 	} else {
 		goto out;
 	}
+	btrfs_unlock_up_safe(path, 1);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
 	btrfs_release_path(root, path);
@@ -760,6 +758,7 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	path->reada = 1;
+	path->leave_spinning = 1;
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.offset = num_bytes;
@@ -767,8 +766,10 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
 	/* first find the extent item and update its reference count */
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
 				path, 0, 1);
-	if (ret < 0)
+	if (ret < 0) {
+		btrfs_set_path_blocking(path);
 		return ret;
+	}
 
 	if (ret > 0) {
 		WARN_ON(1);
@@ -791,11 +792,15 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
 
 	refs = btrfs_extent_refs(l, item);
 	btrfs_set_extent_refs(l, item, refs + refs_to_add);
+	btrfs_unlock_up_safe(path, 1);
+
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
 	btrfs_release_path(root->fs_info->extent_root, path);
 
 	path->reada = 1;
+	path->leave_spinning = 1;
+
 	/* now insert the actual backref */
 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
 				    path, bytenr, parent,
@@ -2050,6 +2055,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 		clear_extent_dirty(&fs_info->pinned_extents,
 				bytenr, bytenr + num - 1, GFP_NOFS);
 	}
+	mutex_unlock(&root->fs_info->pinned_mutex);
+
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
 		BUG_ON(!cache);
@@ -2141,8 +2148,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	u64 end;
 	int ret;
 
-	mutex_lock(&root->fs_info->pinned_mutex);
 	while (1) {
+		mutex_lock(&root->fs_info->pinned_mutex);
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
@@ -2150,14 +2157,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_discard_extent(root, start, end + 1 - start);
 
+		/* unlocks the pinned mutex */
 		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 
-		if (need_resched()) {
-			mutex_unlock(&root->fs_info->pinned_mutex);
-			cond_resched();
-			mutex_lock(&root->fs_info->pinned_mutex);
-		}
+		cond_resched();
 	}
 	mutex_unlock(&root->fs_info->pinned_mutex);
 	return ret;
@@ -2165,7 +2169,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
-			  u64 bytenr, u64 num_bytes, int is_data)
+			  struct btrfs_path *path,
+			  u64 bytenr, u64 num_bytes, int is_data,
+			  struct extent_buffer **must_clean)
 {
 	int err = 0;
 	struct extent_buffer *buf;
@@ -2191,15 +2197,16 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 		    header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
 		    header_transid == trans->transid &&
 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-			clean_tree_block(NULL, root, buf);
-			btrfs_tree_unlock(buf);
-			free_extent_buffer(buf);
+			*must_clean = buf;
 			return 1;
 		}
 		btrfs_tree_unlock(buf);
 	}
 	free_extent_buffer(buf);
 pinit:
+	btrfs_set_path_blocking(path);
+	mutex_lock(&root->fs_info->pinned_mutex);
+	/* unlocks the pinned mutex */
 	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 
 	BUG_ON(err < 0);
@@ -2236,6 +2243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	path->reada = 1;
+	path->leave_spinning = 1;
 	ret = lookup_extent_backref(trans, extent_root, path,
 				    bytenr, parent, root_objectid,
 				    ref_generation, owner_objectid, 1);
@@ -2261,6 +2269,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 						    refs_to_drop);
 			BUG_ON(ret);
 			btrfs_release_path(extent_root, path);
+			path->leave_spinning = 1;
 			ret = btrfs_search_slot(trans, extent_root,
 						&key, path, -1, 1);
 			if (ret) {
@@ -2318,6 +2327,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		/* if refs are 0, we need to setup the path for deletion */
 		if (refs == 0) {
 			btrfs_release_path(extent_root, path);
+			path->leave_spinning = 1;
 			ret = btrfs_search_slot(trans, extent_root, &key, path,
 						-1, 1);
 			BUG_ON(ret);
@@ -2327,16 +2337,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	if (refs == 0) {
 		u64 super_used;
 		u64 root_used;
+		struct extent_buffer *must_clean = NULL;
 
 		if (pin) {
-			mutex_lock(&root->fs_info->pinned_mutex);
-			ret = pin_down_bytes(trans, root, bytenr, num_bytes,
-				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
-			mutex_unlock(&root->fs_info->pinned_mutex);
+			ret = pin_down_bytes(trans, root, path,
+				bytenr, num_bytes,
+				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
+				&must_clean);
 			if (ret > 0)
 				mark_free = 1;
 			BUG_ON(ret < 0);
 		}
+
 		/* block accounting for super block */
 		spin_lock(&info->delalloc_lock);
 		super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2348,11 +2360,27 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		btrfs_set_root_used(&root->root_item,
 					   root_used - num_bytes);
 		spin_unlock(&info->delalloc_lock);
+
+		/*
+		 * it is going to be very rare for someone to be waiting
+		 * on the block we're freeing.  del_items might need to
+		 * schedule, so rather than get fancy, just force it
+		 * to blocking here
+		 */
+		if (must_clean)
+			btrfs_set_lock_blocking(must_clean);
+
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
 		BUG_ON(ret);
 		btrfs_release_path(extent_root, path);
 
+		if (must_clean) {
+			clean_tree_block(NULL, root, must_clean);
+			btrfs_tree_unlock(must_clean);
+			free_extent_buffer(must_clean);
+		}
+
 		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
 			BUG_ON(ret);
@@ -2480,8 +2508,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
 		mutex_lock(&root->fs_info->pinned_mutex);
+
+		/* unlocks the pinned mutex */
 		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
-		mutex_unlock(&root->fs_info->pinned_mutex);
 		update_reserved_extents(root, bytenr, num_bytes, 0);
 		ret = 0;
 	} else {
@@ -2931,6 +2960,7 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
 				       sizes, 2);
 	BUG_ON(ret);
@@ -5435,6 +5465,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
 	if (ret)
 		goto out;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29e606..08085af089e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3124,20 +3124,15 @@ void free_extent_buffer(struct extent_buffer *eb)
 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			      struct extent_buffer *eb)
 {
-	int set;
 	unsigned long i;
 	unsigned long num_pages;
 	struct page *page;
 
-	u64 start = eb->start;
-	u64 end = start + eb->len - 1;
-
-	set = clear_extent_dirty(tree, start, end, GFP_NOFS);
 	num_pages = num_extent_pages(eb->start, eb->len);
 
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-		if (!set && !PageDirty(page))
+		if (!PageDirty(page))
 			continue;
 
 		lock_page(page);
@@ -3146,22 +3141,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 		else
 			set_page_private(page, EXTENT_PAGE_PRIVATE);
 
-		/*
-		 * if we're on the last page or the first page and the
-		 * block isn't aligned on a page boundary, do extra checks
-		 * to make sure we don't clean page that is partially dirty
-		 */
-		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-		    ((i == num_pages - 1) &&
-		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-			start = (u64)page->index << PAGE_CACHE_SHIFT;
-			end  = start + PAGE_CACHE_SIZE - 1;
-			if (test_range_bit(tree, start, end,
-					   EXTENT_DIRTY, 0)) {
-				unlock_page(page);
-				continue;
-			}
-		}
 		clear_page_dirty_for_io(page);
 		spin_lock_irq(&page->mapping->tree_lock);
 		if (!PageDirty(page)) {
@@ -3187,29 +3166,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 {
 	unsigned long i;
 	unsigned long num_pages;
+	int was_dirty = 0;
 
+	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		struct page *page = extent_buffer_page(eb, i);
-		/* writepage may need to do something special for the
-		 * first page, we have to make sure page->private is
-		 * properly set.  releasepage may drop page->private
-		 * on us if the page isn't already dirty.
-		 */
-		lock_page(page);
-		if (i == 0) {
-			set_page_extent_head(page, eb->len);
-		} else if (PagePrivate(page) &&
-			   page->private != EXTENT_PAGE_PRIVATE) {
-			set_page_extent_mapped(page);
-		}
+	for (i = 0; i < num_pages; i++)
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-		set_extent_dirty(tree, page_offset(page),
-				 page_offset(page) + PAGE_CACHE_SIZE - 1,
-				 GFP_NOFS);
-		unlock_page(page);
-	}
-	return 0;
+	return was_dirty;
 }
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3752,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 		ret = 0;
 		goto out;
 	}
+	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+		ret = 0;
+		goto out;
+	}
 	/* at this point we can safely release the extent buffer */
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf..5bc20abf3f3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
 /* these are bit numbers for test/set bit */
 #define EXTENT_BUFFER_UPTODATE 0
 #define EXTENT_BUFFER_BLOCKING 1
+#define EXTENT_BUFFER_DIRTY 2
 
 /*
  * page->private values.  Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			      struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_io_tree *tree,
 			     struct extent_buffer *eb);
+int test_extent_buffer_dirty(struct extent_io_tree *tree,
+			     struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 			       struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd..9b99886562d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	file_key.offset = pos;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      sizeof(*item));
 	if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		key.offset = end_byte - 1;
 		key.type = BTRFS_EXTENT_CSUM_KEY;
 
+		path->leave_spinning = 1;
 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 		if (ret > 0) {
 			if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
 	} else {
 		ins_size = csum_size;
 	}
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      ins_size);
+	path->leave_spinning = 0;
 	if (ret < 0)
 		goto fail_unlock;
 	if (ret != 0) {
@@ -776,7 +780,6 @@ found:
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
-	cond_resched();
 next_sector:
 
 	if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
 		eb_token = NULL;
 	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-	cond_resched();
 	if (total_bytes < sums->len) {
 		btrfs_release_path(root, path);
+		cond_resched();
 		goto again;
 	}
 out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c8007549764..f06c275644b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -606,6 +606,7 @@ next_slot:
 			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
 
 			btrfs_release_path(root, path);
+			path->leave_spinning = 1;
 			ret = btrfs_insert_empty_item(trans, root, path, &ins,
 						      sizeof(*extent));
 			BUG_ON(ret);
@@ -639,7 +640,9 @@ next_slot:
 							ram_bytes);
 			btrfs_set_file_extent_type(leaf, extent, found_type);
 
+			btrfs_unlock_up_safe(path, 1);
 			btrfs_mark_buffer_dirty(path->nodes[0]);
+			btrfs_set_lock_blocking(path->nodes[0]);
 
 			if (disk_bytenr != 0) {
 				ret = btrfs_update_extent_ref(trans, root,
@@ -652,6 +655,7 @@ next_slot:
 
 				BUG_ON(ret);
 			}
+			path->leave_spinning = 0;
 			btrfs_release_path(root, path);
 			if (disk_bytenr != 0)
 				inode_add_bytes(inode, extent_end - end);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a..6b627c61180 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
+
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0) {
 		ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      ins_len);
 	if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c427011dc45..b83a45dc717 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
 	btrfs_set_trans_block_group(trans, inode);
 
 	key.objectid = inode->i_ino;
@@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 			cur_size = min_t(unsigned long, compressed_size,
 				       PAGE_CACHE_SIZE);
 
-			kaddr = kmap(cpage);
+			kaddr = kmap_atomic(cpage, KM_USER0);
 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
-			kunmap(cpage);
+			kunmap_atomic(kaddr, KM_USER0);
 
 			i++;
 			ptr += cur_size;
@@ -1452,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
+	path->leave_spinning = 1;
 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
 				 file_pos + num_bytes, file_pos, &hint);
 	BUG_ON(ret);
@@ -1474,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_compression(leaf, fi, compression);
 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+
+	btrfs_unlock_up_safe(path, 1);
+	btrfs_set_lock_blocking(leaf);
+
 	btrfs_mark_buffer_dirty(leaf);
 
 	inode_add_bytes(inode, num_bytes);
@@ -1486,8 +1492,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 					  root->root_key.objectid,
 					  trans->transid, inode->i_ino, &ins);
 	BUG_ON(ret);
-
 	btrfs_free_path(path);
+
 	return 0;
 }
 
@@ -2118,6 +2124,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
+	path->leave_spinning = 1;
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, 1);
 	if (ret) {
@@ -2164,6 +2171,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		goto err;
 	}
 
+	path->leave_spinning = 1;
 	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
 				    name, name_len, -1);
 	if (IS_ERR(di)) {
@@ -2515,6 +2523,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	key.type = (u8)-1;
 
 search_again:
+	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
 		goto error;
@@ -2661,6 +2670,7 @@ delete:
 			break;
 		}
 		if (found_extent) {
+			btrfs_set_path_blocking(path);
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes,
 						leaf->start, root_owner,
@@ -3466,6 +3476,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	sizes[0] = sizeof(struct btrfs_inode_item);
 	sizes[1] = name_len + sizeof(*ref);
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
 	if (ret != 0)
 		goto fail;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6d8db2f5c38..a5310c0f41e 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -96,11 +96,12 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
 {
 	int i;
 
-	spin_nested(eb);
-	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-		return 1;
-	spin_unlock(&eb->lock);
-
+	if (btrfs_spin_on_block(eb)) {
+		spin_nested(eb);
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		spin_unlock(&eb->lock);
+	}
 	/* spin for a bit on the BLOCKING flag */
 	for (i = 0; i < 2; i++) {
 		cpu_relax();
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60f..a93934fc93b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -203,7 +203,6 @@ static int process_one_buffer(struct btrfs_root *log,
 		mutex_lock(&log->fs_info->pinned_mutex);
 		btrfs_update_pinned_extents(log->fs_info->extent_root,
 					    eb->start, eb->len, 1);
-		mutex_unlock(&log->fs_info->pinned_mutex);
 	}
 
 	if (btrfs_buffer_uptodate(eb, gen)) {
-- 
cgit v1.2.3


From 5d13a98f3bf5afc1113f7db184c627a44659bc29 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 13 Mar 2009 11:41:46 -0400
Subject: Btrfs: readahead checksums during btrfs_finish_ordered_io

This reads in blocks in the checksum btree before starting the
transaction in btrfs_finish_ordered_io.  It makes it much more likely
we'll be able to do operations inside the transaction without
needing any btree reads, which limits transaction latencies overall.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b83a45dc717..9b4faac50c1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1497,6 +1497,30 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * helper function for btrfs_finish_ordered_io, this
+ * just reads in some of the csum leaves to prime them into ram
+ * before we start the transaction.  It limits the amount of btree
+ * reads required while inside the transaction.
+ */
+static noinline void reada_csum(struct btrfs_root *root,
+				struct btrfs_path *path,
+				struct btrfs_ordered_extent *ordered_extent)
+{
+	struct btrfs_ordered_sum *sum;
+	u64 bytenr;
+
+	sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
+			 list);
+	bytenr = sum->sums[0].bytenr;
+
+	/*
+	 * we don't care about the results, the point of this search is
+	 * just to get the btree leaves into ram
+	 */
+	btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
+}
+
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -1505,7 +1529,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_ordered_extent *ordered_extent;
+	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_path *path;
 	int compressed = 0;
@@ -1528,13 +1552,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 			ret = btrfs_lookup_file_extent(NULL, root, path,
 						       inode->i_ino,
 						       start, 0);
+			ordered_extent = btrfs_lookup_ordered_extent(inode,
+								     start);
+			if (!list_empty(&ordered_extent->list)) {
+				btrfs_release_path(root, path);
+				reada_csum(root, path, ordered_extent);
+			}
 			btrfs_free_path(path);
 		}
 	}
 
 	trans = btrfs_join_transaction(root, 1);
 
-	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+	if (!ordered_extent)
+		ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
 		goto nocow;
-- 
cgit v1.2.3


From a4b6e07d1a8a9b907e82b9acbf51a026fbb9301c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 16 Mar 2009 10:59:57 -0400
Subject: Btrfs: limit balancing work while flushing delayed refs

The delayed reference mechanism is responsible for all updates to the
extent allocation trees, including those updates created while processing
the delayed references.

This commit tries to limit the amount of work that gets created during
the final run of delayed refs before a commit.  It avoids cowing new blocks
unless it is required to finish the commit, and so it avoids new allocations
that were not really required.

The goal is to avoid infinite loops where we are always making more work
on the final run of delayed refs.  Over the long term we'll make a
special log for the last delayed ref updates as well.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8686a3d2ab3..dbb72412463 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -949,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
+	if (trans->transaction->delayed_refs.flushing &&
+	    btrfs_header_nritems(mid) > 2)
+		return 0;
+
 	if (btrfs_header_nritems(mid) < 2)
 		err_on_enospc = 1;
 
@@ -2159,7 +2163,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
-	} else {
+	} else if (!trans->transaction->delayed_refs.flushing) {
 		ret = push_nodes_for_insert(trans, root, path, level);
 		c = path->nodes[level];
 		if (!ret && btrfs_header_nritems(c) <
@@ -2848,7 +2852,8 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int num_doubles = 0;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
+	    !trans->transaction->delayed_refs.flushing) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
 		if (wret < 0)
 			return wret;
@@ -3786,7 +3791,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		/* delete the leaf if it is mostly empty */
-		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
+		    !trans->transaction->delayed_refs.flushing) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
-- 
cgit v1.2.3


From a74ac3220774d33db967088906dc3351829e2d3a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 24 Mar 2009 10:24:13 -0400
Subject: Btrfs: Make sure i_nlink doesn't hit zero too soon during log replay

During log replay, inodes are copied from the log to the main filesystem
btrees.  Sometimes they have a zero link count in the log but they actually
gain links during the replay or have some in the main btree.

This patch updates the link count to be at least one after copying the
inode out of the log.  This makes sure the inode is deleted during an
iput while the rest of the replay code is still working on it.

The log replay has fixup code to make sure that link counts are correct
at the end of the replay, so we could use any non-zero number here and
it would work fine.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a93934fc93b..405439ca4c4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1532,6 +1532,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 					root, inode, inode->i_size,
 					BTRFS_EXTENT_DATA_KEY);
 				BUG_ON(ret);
+
+				/* if the nlink count is zero here, the iput
+				 * will free the inode.  We bump it to make
+				 * sure it doesn't get freed until the link
+				 * count fixup is done
+				 */
+				if (inode->i_nlink == 0) {
+					btrfs_inc_nlink(inode);
+					btrfs_update_inode(wc->trans,
+							   root, inode);
+				}
 				iput(inode);
 			}
 			ret = link_to_fixup_dir(wc->trans, root,
-- 
cgit v1.2.3


From 12fcfd22fe5bf4fe74710232098bc101af497995 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 24 Mar 2009 10:24:20 -0400
Subject: Btrfs: tree logging unlink/rename fixes

The tree logging code allows individual files or directories to be logged
without including operations on other files and directories in the FS.
It tries to commit the minimal set of changes to disk in order to
fsync the single file or directory that was sent to fsync or O_SYNC.

The tree logging code was allowing files and directories to be unlinked
if they were part of a rename operation where only one directory
in the rename was in the fsync log.  This patch adds a few new rules
to the tree logging.

1) on rename or unlink, if the inode being unlinked isn't in the fsync
log, we must force a full commit before doing an fsync of the directory
where the unlink was done.  The commit isn't done during the unlink,
but it is forced the next time we try to log the parent directory.

Solution: record transid of last unlink/rename per directory when the
directory wasn't already logged.  For renames this is only done when
renaming to a different directory.

mkdir foo/some_dir
normal commit
rename foo/some_dir foo2/some_dir
mkdir foo/some_dir
fsync foo/some_dir/some_file

The fsync above will unlink the original some_dir without recording
it in its new location (foo2).  After a crash, some_dir will be gone
unless the fsync of some_file forces a full commit

2) we must log any new names for any file or dir that is in the fsync
log.  This way we make sure not to lose files that are unlinked during
the same transaction.

2a) we must log any new names for any file or dir during rename
when the directory they are being removed from was logged.

2a is actually the more important variant.  Without the extra logging
a crash might unlink the old name without recreating the new one

3) after a crash, we must go through any directories with a link count
of zero and redo the rm -rf

mkdir f1/foo
normal commit
rm -rf f1/foo
fsync(f1)

The directory f1 was fully removed from the FS, but fsync was never
called on f1, only its parent dir.  After a crash the rm -rf must
be replayed.  This must be able to recurse down the entire
directory tree.  The inode link count fixup code takes care of the
ugly details.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  13 +-
 fs/btrfs/ctree.h       |   7 +-
 fs/btrfs/extent-tree.c |   2 +-
 fs/btrfs/file.c        |  14 +-
 fs/btrfs/inode.c       |  28 +++-
 fs/btrfs/tree-log.c    | 389 +++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/tree-log.h    |  17 ++-
 7 files changed, 372 insertions(+), 98 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74..3af4cfb5654 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -86,12 +86,6 @@ struct btrfs_inode {
 	 */
 	u64 logged_trans;
 
-	/*
-	 * trans that last made a change that should be fully fsync'd.  This
-	 * gets reset to zero each time the inode is logged
-	 */
-	u64 log_dirty_trans;
-
 	/* total number of bytes pending delalloc, used by stat to calc the
 	 * real block usage of the file
 	 */
@@ -121,6 +115,13 @@ struct btrfs_inode {
 	/* the start of block group preferred for allocations. */
 	u64 block_group;
 
+	/* the fsync log has some corner cases that mean we have to check
+	 * directories to see if any unlinks have been done before
+	 * the directory was logged.  See tree-log.c for all the
+	 * details
+	 */
+	u64 last_unlink_trans;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4ddce91cf3f..2737facbd34 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -695,7 +695,12 @@ struct btrfs_fs_info {
 
 	u64 generation;
 	u64 last_trans_committed;
-	u64 last_trans_new_blockgroup;
+
+	/*
+	 * this is updated to the current trans every time a full commit
+	 * is required instead of the faster short fsync log commits
+	 */
+	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
 	u64 max_extent;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8933d15a240..0c482e0d7c4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5897,7 +5897,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	extent_root = root->fs_info->extent_root;
 
-	root->fs_info->last_trans_new_blockgroup = trans->transid;
+	root->fs_info->last_trans_log_full_commit = trans->transid;
 
 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
 	if (!cache)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f06c275644b..32d10a61761 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1173,8 +1173,11 @@ out_nolock:
 			ret = btrfs_log_dentry_safe(trans, root,
 						    file->f_dentry);
 			if (ret == 0) {
-				btrfs_sync_log(trans, root);
-				btrfs_end_transaction(trans, root);
+				ret = btrfs_sync_log(trans, root);
+				if (ret == 0)
+					btrfs_end_transaction(trans, root);
+				else
+					btrfs_commit_transaction(trans, root);
 			} else {
 				btrfs_commit_transaction(trans, root);
 			}
@@ -1266,8 +1269,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	if (ret > 0) {
 		ret = btrfs_commit_transaction(trans, root);
 	} else {
-		btrfs_sync_log(trans, root);
-		ret = btrfs_end_transaction(trans, root);
+		ret = btrfs_sync_log(trans, root);
+		if (ret == 0)
+			ret = btrfs_end_transaction(trans, root);
+		else
+			ret = btrfs_commit_transaction(trans, root);
 	}
 	mutex_lock(&dentry->d_inode->i_mutex);
 out:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9b4faac50c1..bffd79faffb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2246,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
 					 inode, dir->i_ino);
 	BUG_ON(ret != 0 && ret != -ENOENT);
-	if (ret != -ENOENT)
-		BTRFS_I(dir)->log_dirty_trans = trans->transid;
 
 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
 					   dir, index);
@@ -2280,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
+
+	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+
 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
 				 dentry->d_name.name, dentry->d_name.len);
 
@@ -3042,7 +3043,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->disk_i_size = 0;
 	bi->flags = 0;
 	bi->index_cnt = (u64)-1;
-	bi->log_dirty_trans = 0;
+	bi->last_unlink_trans = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -3786,6 +3787,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		drop_inode = 1;
 
 	nr = trans->blocks_used;
+
+	btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
 	btrfs_end_transaction_throttle(trans, root);
 fail:
 	if (drop_inode) {
@@ -4666,6 +4669,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	trans = btrfs_start_transaction(root, 1);
 
+	/*
+	 * this is an ugly little race, but the rename is required to make
+	 * sure that if we crash, the inode is either at the old name
+	 * or the new one.  pinning the log transaction lets us make sure
+	 * we don't allow a log commit to come in after we unlink the
+	 * name but before we add the new name back in.
+	 */
+	btrfs_pin_log_trans(root);
+
 	btrfs_set_trans_block_group(trans, new_dir);
 
 	btrfs_inc_nlink(old_dentry->d_inode);
@@ -4673,6 +4685,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	new_dir->i_ctime = new_dir->i_mtime = ctime;
 	old_inode->i_ctime = ctime;
 
+	if (old_dentry->d_parent != new_dentry->d_parent)
+		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+
 	ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
 				 old_dentry->d_name.name,
 				 old_dentry->d_name.len);
@@ -4704,7 +4719,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (ret)
 		goto out_fail;
 
+	btrfs_log_new_name(trans, old_inode, old_dir,
+				       new_dentry->d_parent);
 out_fail:
+
+	/* this btrfs_end_log_trans just allows the current
+	 * log-sub transaction to complete
+	 */
+	btrfs_end_log_trans(root);
 	btrfs_end_transaction_throttle(trans, root);
 out_unlock:
 	return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 405439ca4c4..1b7f04a8f16 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -34,6 +34,49 @@
 #define LOG_INODE_ALL 0
 #define LOG_INODE_EXISTS 1
 
+/*
+ * directory trouble cases
+ *
+ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
+ * log, we must force a full commit before doing an fsync of the directory
+ * where the unlink was done.
+ * ---> record transid of last unlink/rename per directory
+ *
+ * mkdir foo/some_dir
+ * normal commit
+ * rename foo/some_dir foo2/some_dir
+ * mkdir foo/some_dir
+ * fsync foo/some_dir/some_file
+ *
+ * The fsync above will unlink the original some_dir without recording
+ * it in its new location (foo2).  After a crash, some_dir will be gone
+ * unless the fsync of some_file forces a full commit
+ *
+ * 2) we must log any new names for any file or dir that is in the fsync
+ * log. ---> check inode while renaming/linking.
+ *
+ * 2a) we must log any new names for any file or dir during rename
+ * when the directory they are being removed from was logged.
+ * ---> check inode and old parent dir during rename
+ *
+ *  2a is actually the more important variant.  With the extra logging
+ *  a crash might unlink the old name without recreating the new one
+ *
+ * 3) after a crash, we must go through any directories with a link count
+ * of zero and redo the rm -rf
+ *
+ * mkdir f1/foo
+ * normal commit
+ * rm -rf f1/foo
+ * fsync(f1)
+ *
+ * The directory f1 was fully removed from the FS, but fsync was never
+ * called on f1, only its parent dir.  After a crash the rm -rf must
+ * be replayed.  This must be able to recurse down the entire
+ * directory tree.  The inode link count fixup code takes care of the
+ * ugly details.
+ */
+
 /*
  * stages for the tree walking.  The first
  * stage (0) is to only pin down the blocks we find
@@ -47,12 +90,17 @@
 #define LOG_WALK_REPLAY_INODES 1
 #define LOG_WALK_REPLAY_ALL 2
 
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, struct inode *inode,
 			     int inode_only);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       u64 dirid, int del_all);
 
 /*
  * tree logging is a special write ahead log used to make sure that
@@ -132,11 +180,26 @@ static int join_running_log_trans(struct btrfs_root *root)
 	return ret;
 }
 
+/*
+ * This either makes the current running log transaction wait
+ * until you call btrfs_end_log_trans() or it makes any future
+ * log transactions wait until you call btrfs_end_log_trans()
+ */
+int btrfs_pin_log_trans(struct btrfs_root *root)
+{
+	int ret = -ENOENT;
+
+	mutex_lock(&root->log_mutex);
+	atomic_inc(&root->log_writers);
+	mutex_unlock(&root->log_mutex);
+	return ret;
+}
+
 /*
  * indicate we're done making changes to the log tree
  * and wake up anyone waiting to do a sync
  */
-static int end_log_trans(struct btrfs_root *root)
+int btrfs_end_log_trans(struct btrfs_root *root)
 {
 	if (atomic_dec_and_test(&root->log_writers)) {
 		smp_mb();
@@ -602,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 
 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
 	BUG_ON(ret);
+
 	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
 	BUG_ON(ret);
 	kfree(name);
@@ -803,6 +867,7 @@ conflict_again:
 					    victim_name_len)) {
 				btrfs_inc_nlink(inode);
 				btrfs_release_path(root, path);
+
 				ret = btrfs_unlink_inode(trans, root, dir,
 							 inode, victim_name,
 							 victim_name_len);
@@ -921,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		key.offset--;
 		btrfs_release_path(root, path);
 	}
-	btrfs_free_path(path);
+	btrfs_release_path(root, path);
 	if (nlink != inode->i_nlink) {
 		inode->i_nlink = nlink;
 		btrfs_update_inode(trans, root, inode);
 	}
 	BTRFS_I(inode)->index_cnt = (u64)-1;
 
+	if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
+		ret = replay_dir_deletes(trans, root, NULL, path,
+					 inode->i_ino, 1);
+		BUG_ON(ret);
+	}
+	btrfs_free_path(path);
+
 	return 0;
 }
 
@@ -970,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
 
 		iput(inode);
 
-		if (key.offset == 0)
-			break;
-		key.offset--;
+		/*
+		 * fixup on a directory may create new entries,
+		 * make sure we always look for the highset possible
+		 * offset
+		 */
+		key.offset = (u64)-1;
 	}
 	btrfs_release_path(root, path);
 	return 0;
@@ -1312,11 +1387,11 @@ again:
 		read_extent_buffer(eb, name, (unsigned long)(di + 1),
 				  name_len);
 		log_di = NULL;
-		if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
 			log_di = btrfs_lookup_dir_item(trans, log, log_path,
 						       dir_key->objectid,
 						       name, name_len, 0);
-		} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
 			log_di = btrfs_lookup_dir_index_item(trans, log,
 						     log_path,
 						     dir_key->objectid,
@@ -1377,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root,
 				       struct btrfs_root *log,
 				       struct btrfs_path *path,
-				       u64 dirid)
+				       u64 dirid, int del_all)
 {
 	u64 range_start;
 	u64 range_end;
@@ -1407,10 +1482,14 @@ again:
 	range_start = 0;
 	range_end = 0;
 	while (1) {
-		ret = find_dir_range(log, path, dirid, key_type,
-				     &range_start, &range_end);
-		if (ret != 0)
-			break;
+		if (del_all)
+			range_end = (u64)-1;
+		else {
+			ret = find_dir_range(log, path, dirid, key_type,
+					     &range_start, &range_end);
+			if (ret != 0)
+				break;
+		}
 
 		dir_key.offset = range_start;
 		while (1) {
@@ -1436,7 +1515,8 @@ again:
 				break;
 
 			ret = check_item_in_log(trans, root, log, path,
-						log_path, dir, &found_key);
+						log_path, dir,
+						&found_key);
 			BUG_ON(ret);
 			if (found_key.offset == (u64)-1)
 				break;
@@ -1513,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			mode = btrfs_inode_mode(eb, inode_item);
 			if (S_ISDIR(mode)) {
 				ret = replay_dir_deletes(wc->trans,
-					 root, log, path, key.objectid);
+					 root, log, path, key.objectid, 0);
 				BUG_ON(ret);
 			}
 			ret = overwrite_item(wc->trans, root, path,
@@ -1850,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
+static int wait_log_commit(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, unsigned long transid)
 {
 	DEFINE_WAIT(wait);
 	int index = transid % 2;
@@ -1864,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
 		prepare_to_wait(&root->log_commit_wait[index],
 				&wait, TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&root->log_mutex);
-		if (root->log_transid < transid + 2 &&
+
+		if (root->fs_info->last_trans_log_full_commit !=
+		    trans->transid && root->log_transid < transid + 2 &&
 		    atomic_read(&root->log_commit[index]))
 			schedule();
+
 		finish_wait(&root->log_commit_wait[index], &wait);
 		mutex_lock(&root->log_mutex);
 	} while (root->log_transid < transid + 2 &&
@@ -1874,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
 	return 0;
 }
 
-static int wait_for_writer(struct btrfs_root *root)
+static int wait_for_writer(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root)
 {
 	DEFINE_WAIT(wait);
 	while (atomic_read(&root->log_writers)) {
 		prepare_to_wait(&root->log_writer_wait,
 				&wait, TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&root->log_mutex);
-		if (atomic_read(&root->log_writers))
+		if (root->fs_info->last_trans_log_full_commit !=
+		    trans->transid && atomic_read(&root->log_writers))
 			schedule();
 		mutex_lock(&root->log_mutex);
 		finish_wait(&root->log_writer_wait, &wait);
@@ -1892,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root)
 /*
  * btrfs_sync_log does sends a given tree log down to the disk and
  * updates the super blocks to record it.  When this call is done,
- * you know that any inodes previously logged are safely on disk
+ * you know that any inodes previously logged are safely on disk only
+ * if it returns 0.
+ *
+ * Any other return value means you need to call btrfs_commit_transaction.
+ * Some of the edge cases for fsyncing directories that have had unlinks
+ * or renames done in the past mean that sometimes the only safe
+ * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
+ * that has happened.
  */
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		   struct btrfs_root *root)
@@ -1906,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	mutex_lock(&root->log_mutex);
 	index1 = root->log_transid % 2;
 	if (atomic_read(&root->log_commit[index1])) {
-		wait_log_commit(root, root->log_transid);
+		wait_log_commit(trans, root, root->log_transid);
 		mutex_unlock(&root->log_mutex);
 		return 0;
 	}
@@ -1914,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	/* wait for previous tree log sync to complete */
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-		wait_log_commit(root, root->log_transid - 1);
+		wait_log_commit(trans, root, root->log_transid - 1);
 
 	while (1) {
 		unsigned long batch = root->log_batch;
 		mutex_unlock(&root->log_mutex);
 		schedule_timeout_uninterruptible(1);
 		mutex_lock(&root->log_mutex);
-		wait_for_writer(root);
+
+		wait_for_writer(trans, root);
 		if (batch == root->log_batch)
 			break;
 	}
 
+	/* bail out if we need to do a full commit */
+	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+		ret = -EAGAIN;
+		mutex_unlock(&root->log_mutex);
+		goto out;
+	}
+
 	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
 	BUG_ON(ret);
 
@@ -1961,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	index2 = log_root_tree->log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
-		wait_log_commit(log_root_tree, log_root_tree->log_transid);
+		wait_log_commit(trans, log_root_tree,
+				log_root_tree->log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
 		goto out;
 	}
 	atomic_set(&log_root_tree->log_commit[index2], 1);
 
-	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
-		wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
+	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
+		wait_log_commit(trans, log_root_tree,
+				log_root_tree->log_transid - 1);
+	}
+
+	wait_for_writer(trans, log_root_tree);
 
-	wait_for_writer(log_root_tree);
+	/*
+	 * now that we've moved on to the tree of log tree roots,
+	 * check the full commit flag again
+	 */
+	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+		mutex_unlock(&log_root_tree->log_mutex);
+		ret = -EAGAIN;
+		goto out_wake_log_root;
+	}
 
 	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
 				&log_root_tree->dirty_log_pages);
@@ -1995,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * in and cause problems either.
 	 */
 	write_ctree_super(trans, root->fs_info->tree_root, 2);
+	ret = 0;
 
+out_wake_log_root:
 	atomic_set(&log_root_tree->log_commit[index2], 0);
 	smp_mb();
 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -2008,7 +2124,8 @@ out:
 	return 0;
 }
 
-/* * free all the extents used by the tree log.  This should be called
+/*
+ * free all the extents used by the tree log.  This should be called
  * at commit time of the full transaction
  */
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2142,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 
 	btrfs_free_path(path);
 	mutex_unlock(&BTRFS_I(dir)->log_mutex);
-	end_log_trans(root);
+	btrfs_end_log_trans(root);
 
 	return 0;
 }
@@ -2169,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
 				  dirid, &index);
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
-	end_log_trans(root);
+	btrfs_end_log_trans(root);
 
 	return ret;
 }
@@ -2569,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
  *
  * This handles both files and directories.
  */
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, struct inode *inode,
 			     int inode_only)
 {
@@ -2595,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 	min_key.offset = 0;
 
 	max_key.objectid = inode->i_ino;
+
+	/* today the code can only do partial logging of directories */
+	if (!S_ISDIR(inode->i_mode))
+	    inode_only = LOG_INODE_ALL;
+
 	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
 		max_key.type = BTRFS_XATTR_ITEM_KEY;
 	else
 		max_key.type = (u8)-1;
 	max_key.offset = (u64)-1;
 
-	/*
-	 * if this inode has already been logged and we're in inode_only
-	 * mode, we don't want to delete the things that have already
-	 * been written to the log.
-	 *
-	 * But, if the inode has been through an inode_only log,
-	 * the logged_trans field is not set.  This allows us to catch
-	 * any new names for this inode in the backrefs by logging it
-	 * again
-	 */
-	if (inode_only == LOG_INODE_EXISTS &&
-	    BTRFS_I(inode)->logged_trans == trans->transid) {
-		btrfs_free_path(path);
-		btrfs_free_path(dst_path);
-		goto out;
-	}
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
 
 	/*
@@ -2703,7 +2809,6 @@ next_slot:
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
 		btrfs_release_path(root, path);
 		btrfs_release_path(log, dst_path);
-		BTRFS_I(inode)->log_dirty_trans = 0;
 		ret = log_directory_changes(trans, root, inode, path, dst_path);
 		BUG_ON(ret);
 	}
@@ -2712,19 +2817,58 @@ next_slot:
 
 	btrfs_free_path(path);
 	btrfs_free_path(dst_path);
-out:
 	return 0;
 }
 
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct inode *inode,
-		    int inode_only)
+/*
+ * follow the dentry parent pointers up the chain and see if any
+ * of the directories in it require a full commit before they can
+ * be logged.  Returns zero if nothing special needs to be done or 1 if
+ * a full commit is required.
+ */
+static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
+					       struct inode *inode,
+					       struct dentry *parent,
+					       struct super_block *sb,
+					       u64 last_committed)
 {
-	int ret;
+	int ret = 0;
+	struct btrfs_root *root;
 
-	start_log_trans(trans, root);
-	ret = __btrfs_log_inode(trans, root, inode, inode_only);
-	end_log_trans(root);
+	if (!S_ISDIR(inode->i_mode)) {
+		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+			goto out;
+		inode = parent->d_inode;
+	}
+
+	while (1) {
+		BTRFS_I(inode)->logged_trans = trans->transid;
+		smp_mb();
+
+		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+			root = BTRFS_I(inode)->root;
+
+			/*
+			 * make sure any commits to the log are forced
+			 * to be full commits
+			 */
+			root->fs_info->last_trans_log_full_commit =
+				trans->transid;
+			ret = 1;
+			break;
+		}
+
+		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+			break;
+
+		if (parent == sb->s_root)
+			break;
+
+		parent = parent->d_parent;
+		inode = parent->d_inode;
+
+	}
+out:
 	return ret;
 }
 
@@ -2734,31 +2878,53 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
  * only logging is done of any parent directories that are older than
  * the last committed transaction
  */
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct dentry *dentry)
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct inode *inode,
+		    struct dentry *parent, int exists_only)
 {
-	int inode_only = LOG_INODE_ALL;
+	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
 	struct super_block *sb;
-	int ret;
+	int ret = 0;
+	u64 last_committed = root->fs_info->last_trans_committed;
+
+	sb = inode->i_sb;
+
+	if (root->fs_info->last_trans_log_full_commit >
+	    root->fs_info->last_trans_committed) {
+		ret = 1;
+		goto end_no_trans;
+	}
+
+	ret = check_parent_dirs_for_sync(trans, inode, parent,
+					 sb, last_committed);
+	if (ret)
+		goto end_no_trans;
 
 	start_log_trans(trans, root);
-	sb = dentry->d_inode->i_sb;
-	while (1) {
-		ret = __btrfs_log_inode(trans, root, dentry->d_inode,
-					inode_only);
-		BUG_ON(ret);
-		inode_only = LOG_INODE_EXISTS;
 
-		dentry = dentry->d_parent;
-		if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
+	ret = btrfs_log_inode(trans, root, inode, inode_only);
+	BUG_ON(ret);
+	inode_only = LOG_INODE_EXISTS;
+
+	while (1) {
+		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
 			break;
 
-		if (BTRFS_I(dentry->d_inode)->generation <=
-		    root->fs_info->last_trans_committed)
+		inode = parent->d_inode;
+		if (BTRFS_I(inode)->generation >
+		    root->fs_info->last_trans_committed) {
+			ret = btrfs_log_inode(trans, root, inode, inode_only);
+			BUG_ON(ret);
+		}
+		if (parent == sb->s_root)
 			break;
+
+		parent = parent->d_parent;
 	}
-	end_log_trans(root);
-	return 0;
+	ret = 0;
+	btrfs_end_log_trans(root);
+end_no_trans:
+	return ret;
 }
 
 /*
@@ -2770,12 +2936,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct dentry *dentry)
 {
-	u64 gen;
-	gen = root->fs_info->last_trans_new_blockgroup;
-	if (gen > root->fs_info->last_trans_committed)
-		return 1;
-	else
-		return btrfs_log_dentry(trans, root, dentry);
+	return btrfs_log_inode_parent(trans, root, dentry->d_inode,
+				      dentry->d_parent, 0);
 }
 
 /*
@@ -2894,3 +3056,74 @@ again:
 	kfree(log_root_tree);
 	return 0;
 }
+
+/*
+ * there are some corner cases where we want to force a full
+ * commit instead of allowing a directory to be logged.
+ *
+ * They revolve around files there were unlinked from the directory, and
+ * this function updates the parent directory so that a full commit is
+ * properly done if it is fsync'd later after the unlinks are done.
+ */
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+			     struct inode *dir, struct inode *inode,
+			     int for_rename)
+{
+	/*
+	 * if this directory was already logged any new
+	 * names for this file/dir will get recorded
+	 */
+	smp_mb();
+	if (BTRFS_I(dir)->logged_trans == trans->transid)
+		return;
+
+	/*
+	 * if the inode we're about to unlink was logged,
+	 * the log will be properly updated for any new names
+	 */
+	if (BTRFS_I(inode)->logged_trans == trans->transid)
+		return;
+
+	/*
+	 * when renaming files across directories, if the directory
+	 * there we're unlinking from gets fsync'd later on, there's
+	 * no way to find the destination directory later and fsync it
+	 * properly.  So, we have to be conservative and force commits
+	 * so the new name gets discovered.
+	 */
+	if (for_rename)
+		goto record;
+
+	/* we can safely do the unlink without any special recording */
+	return;
+
+record:
+	BTRFS_I(dir)->last_unlink_trans = trans->transid;
+}
+
+/*
+ * Call this after adding a new name for a file and it will properly
+ * update the log to reflect the new name.
+ *
+ * It will return zero if all goes well, and it will return 1 if a
+ * full transaction commit is required.
+ */
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+			struct inode *inode, struct inode *old_dir,
+			struct dentry *parent)
+{
+	struct btrfs_root * root = BTRFS_I(inode)->root;
+
+	/*
+	 * if this inode hasn't been logged and directory we're renaming it
+	 * from hasn't been logged, we don't need to log it
+	 */
+	if (BTRFS_I(inode)->logged_trans <=
+	    root->fs_info->last_trans_committed &&
+	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
+		    root->fs_info->last_trans_committed))
+		return 0;
+
+	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+}
+
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed0..d09c7609e16 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		   struct btrfs_root *root);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct dentry *dentry);
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct dentry *dentry);
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct inode *inode,
-		    int inode_only);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       const char *name, int name_len,
 			       struct inode *inode, u64 dirid);
+int btrfs_join_running_log_trans(struct btrfs_root *root);
+int btrfs_end_log_trans(struct btrfs_root *root);
+int btrfs_pin_log_trans(struct btrfs_root *root);
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct inode *inode,
+		    struct dentry *parent, int exists_only);
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+			     struct inode *dir, struct inode *inode,
+			     int for_rename);
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+			struct inode *inode, struct inode *old_dir,
+			struct dentry *parent);
 #endif
-- 
cgit v1.2.3


From af4176b49c5ee15a9c9b10720c92456b28e7aac7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 24 Mar 2009 10:24:31 -0400
Subject: Btrfs: optimize fsyncs on old files

The fsync log has code to make sure all of the parents of a file are in the
log along with the file.  It uses a minimal log of the parent directory
inodes, just enough to get the parent directory on disk.

If the transaction that originally created a file is fully on disk,
and the file hasn't been renamed or linked into other directories, we
can safely skip the parent directory walk.  We know the file is on disk
somewhere and we can go ahead and just log that single file.

This is more important now because unrelated unlinks in the parent directory
might make us force a commit if we try to log the parent.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1b7f04a8f16..fc9b87a7975 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2835,6 +2835,17 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	struct btrfs_root *root;
 
+	/*
+	 * for regular files, if its inode is already on disk, we don't
+	 * have to worry about the parents at all.  This is because
+	 * we can use the last_unlink_trans field to record renames
+	 * and other fun in this file.
+	 */
+	if (S_ISREG(inode->i_mode) &&
+	    BTRFS_I(inode)->generation <= last_committed &&
+	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
+			goto out;
+
 	if (!S_ISDIR(inode->i_mode)) {
 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
 			goto out;
@@ -2904,8 +2915,19 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_log_inode(trans, root, inode, inode_only);
 	BUG_ON(ret);
-	inode_only = LOG_INODE_EXISTS;
 
+	/*
+	 * for regular files, if its inode is already on disk, we don't
+	 * have to worry about the parents at all.  This is because
+	 * we can use the last_unlink_trans field to record renames
+	 * and other fun in this file.
+	 */
+	if (S_ISREG(inode->i_mode) &&
+	    BTRFS_I(inode)->generation <= last_committed &&
+	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
+			goto no_parent;
+
+	inode_only = LOG_INODE_EXISTS;
 	while (1) {
 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
 			break;
@@ -2921,6 +2943,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 
 		parent = parent->d_parent;
 	}
+no_parent:
 	ret = 0;
 	btrfs_end_log_trans(root);
 end_no_trans:
@@ -3069,6 +3092,19 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 			     struct inode *dir, struct inode *inode,
 			     int for_rename)
 {
+	/*
+	 * when we're logging a file, if it hasn't been renamed
+	 * or unlinked, and its inode is fully committed on disk,
+	 * we don't have to worry about walking up the directory chain
+	 * to log its parents.
+	 *
+	 * So, we use the last_unlink_trans field to put this transid
+	 * into the file.  When the file is logged we check it and
+	 * don't log the parents if the file is fully on disk.
+	 */
+	if (S_ISREG(inode->i_mode))
+		BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
 	/*
 	 * if this directory was already logged any new
 	 * names for this file/dir will get recorded
@@ -3114,6 +3150,13 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root * root = BTRFS_I(inode)->root;
 
+	/*
+	 * this will force the logging code to walk the dentry chain
+	 * up for the file
+	 */
+	if (S_ISREG(inode->i_mode))
+		BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
 	/*
 	 * if this inode hasn't been logged and directory we're renaming it
 	 * from hasn't been logged, we don't need to log it
-- 
cgit v1.2.3


From 1a81af4d1d9c60d4313309f937a1fc5567205a87 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Mar 2009 09:55:11 -0400
Subject: Btrfs: make sure btrfs_update_delayed_ref doesn't increase ref_mod

btrfs_update_delayed_ref is optimized to add and remove different
references in one pass through the delayed ref tree.  It is a zero
sum on the total number of refs on a given extent.

But, the code was recording an extra ref in the head node.  This
never made it down to the disk but was used when deciding if it was
safe to free the extent while dropping snapshots.

The fix used here is to make sure the ref_mod count is unchanged
on the head ref when btrfs_update_delayed_ref is called.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/delayed-ref.c | 10 +++++++---
 fs/btrfs/delayed-ref.h |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 759fa247ced..cbf7dc8ae3e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -450,8 +450,12 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 	 * the head node stores the sum of all the mods, so dropping a ref
 	 * should drop the sum in the head node by one.
 	 */
-	if (parent == (u64)-1 && action == BTRFS_DROP_DELAYED_REF)
-		count_mod = -1;
+	if (parent == (u64)-1) {
+		if (action == BTRFS_DROP_DELAYED_REF)
+			count_mod = -1;
+		else if (action == BTRFS_UPDATE_DELAYED_HEAD)
+			count_mod = 0;
+	}
 
 	/*
 	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
@@ -647,7 +651,7 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 	 */
 	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
 				      (u64)-1, 0, 0, 0,
-				      BTRFS_ADD_DELAYED_REF, 0);
+				      BTRFS_UPDATE_DELAYED_HEAD, 0);
 	BUG_ON(ret);
 
 	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 57153fcc347..3bec2ff0b15 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -22,6 +22,7 @@
 #define BTRFS_ADD_DELAYED_REF    1 /* add one backref to the tree */
 #define BTRFS_DROP_DELAYED_REF   2 /* delete one backref from the tree */
 #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
 
 struct btrfs_delayed_ref_node {
 	struct rb_node rb_node;
-- 
cgit v1.2.3


From ceb93a9ff16612314d757874b6415ffbb2091576 Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@hobbes.lan>
Date: Wed, 25 Mar 2009 08:54:29 -0700
Subject: powerpc/PCI: include pci.h in powerpc MSI implementation

This file uses PCI MSI defines and so needs pci.h.

Tested-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/powerpc/kernel/msi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
index 0c16e2a854e..8bbc12d20f5 100644
--- a/arch/powerpc/kernel/msi.c
+++ b/arch/powerpc/kernel/msi.c
@@ -9,6 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/msi.h>
+#include <linux/pci.h>
 
 #include <asm/machdep.h>
 
-- 
cgit v1.2.3


From 7058548cd50e5bda8db086bb2e5c1d82f746d047 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 25 Mar 2009 23:35:46 -0400
Subject: ext4: Use WRITE_SYNC for commits which are caused by fsync()

If a commit is triggered by fsync(), set a flag indicating the journal
blocks associated with the transaction should be flushed out using
WRITE_SYNC.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c      | 5 ++++-
 fs/jbd2/transaction.c | 2 ++
 include/linux/jbd2.h  | 6 ++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 62804e57a44..4ea72377c7a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int tag_bytes = journal_tag_bytes(journal);
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
+	int write_op = WRITE;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	if (commit_transaction->t_synchronous_commit)
+		write_op = WRITE_SYNC;
 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 	stats.u.run.rs_locked = jiffies;
 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -680,7 +683,7 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(WRITE, bh);
+				submit_bh(write_op, bh);
 			}
 			cond_resched();
 			stats.u.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 28ce21d8598..996ffda06bf 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle)
 		}
 	}
 
+	if (handle->h_sync)
+		transaction->t_synchronous_commit = 1;
 	current->journal_info = NULL;
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 4d248b3f132..8815a3456b3 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -648,6 +648,12 @@ struct transaction_s
 	 */
 	int t_handle_count;
 
+	/*
+	 * This transaction is being forced and some process is
+	 * waiting for it to finish.
+	 */
+	int t_synchronous_commit:1;
+
 	/*
 	 * For use by the filesystem to store fs-specific data
 	 * structures associated with the transaction
-- 
cgit v1.2.3


From 563bdd61fe4dbd6b58cf7eb06f8d8f14479ae1dc Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 26 Mar 2009 00:06:19 -0400
Subject: ext4: Check for an valid i_mode when reading the inode from disk

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bed4a0abd0d..c7d9250fbc3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4394,7 +4394,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			inode->i_op = &ext4_symlink_inode_operations;
 			ext4_set_aops(inode);
 		}
-	} else {
+	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+	      S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 		inode->i_op = &ext4_special_inode_operations;
 		if (raw_inode->i_block[0])
 			init_special_inode(inode, inode->i_mode,
@@ -4402,6 +4403,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		else
 			init_special_inode(inode, inode->i_mode,
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+	} else {
+		brelse(bh);
+		ret = -EIO;
+		ext4_error(inode->i_sb, __func__, 
+			   "bogus i_mode (%o) for inode=%lu",
+			   inode->i_mode, inode->i_ino);
+		goto bad_inode;
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
-- 
cgit v1.2.3


From e8c3b42057be44cff9dd225bd9930956c5f34776 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:32 +0900
Subject: ia64/pv_ops/xen: use __initconst instead of __initdata for const data

use __initconst instead of __initdata for const data like
ec8148de85a73a3be397a59b6d8f4f32cf2dd254

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/xen/xen_pv_ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c
index 936cff3c96e..fa3b967e69c 100644
--- a/arch/ia64/xen/xen_pv_ops.c
+++ b/arch/ia64/xen/xen_pv_ops.c
@@ -260,7 +260,7 @@ xen_intrin_local_irq_restore(unsigned long mask)
 		xen_rsm_i();
 }
 
-static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.fc		= xen_fc,
 	.thash		= xen_thash,
 	.get_cpuid	= xen_get_cpuid,
-- 
cgit v1.2.3


From ac93925acbf841d70a95ab576b76b15a34d194eb Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:33 +0900
Subject: ia64/xen: short-circuit tests for dom0

This patch is ia64 counter part of clean up of the xen predicates.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/xen/hypervisor.h | 39 ++++++++++++++++------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/arch/ia64/include/asm/xen/hypervisor.h b/arch/ia64/include/asm/xen/hypervisor.h
index 7a804e80fc6..e425227a418 100644
--- a/arch/ia64/include/asm/xen/hypervisor.h
+++ b/arch/ia64/include/asm/xen/hypervisor.h
@@ -33,9 +33,6 @@
 #ifndef _ASM_IA64_XEN_HYPERVISOR_H
 #define _ASM_IA64_XEN_HYPERVISOR_H
 
-#ifdef CONFIG_XEN
-
-#include <linux/init.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/version.h>	/* to compile feature.c */
 #include <xen/features.h>		/* to comiple xen-netfront.c */
@@ -43,22 +40,32 @@
 
 /* xen_domain_type is set before executing any C code by early_xen_setup */
 enum xen_domain_type {
-	XEN_NATIVE,
-	XEN_PV_DOMAIN,
-	XEN_HVM_DOMAIN,
+	XEN_NATIVE,	/* running on bare hardware */
+	XEN_PV_DOMAIN,	/* running in a PV domain */
+	XEN_HVM_DOMAIN,	/* running in a Xen hvm domain*/
 };
 
+#ifdef CONFIG_XEN
 extern enum xen_domain_type xen_domain_type;
+#else
+#define xen_domain_type		XEN_NATIVE
+#endif
 
 #define xen_domain()		(xen_domain_type != XEN_NATIVE)
-#define xen_pv_domain()		(xen_domain_type == XEN_PV_DOMAIN)
-#define xen_initial_domain()	(xen_pv_domain() && \
+#define xen_pv_domain()		(xen_domain() &&			\
+				 xen_domain_type == XEN_PV_DOMAIN)
+#define xen_hvm_domain()	(xen_domain() &&			\
+				 xen_domain_type == XEN_HVM_DOMAIN)
+
+#ifdef CONFIG_XEN_DOM0
+#define xen_initial_domain()	(xen_pv_domain() &&			\
 				 (xen_start_info->flags & SIF_INITDOMAIN))
-#define xen_hvm_domain()	(xen_domain_type == XEN_HVM_DOMAIN)
+#else
+#define xen_initial_domain()	(0)
+#endif
 
-/* deprecated. remove this */
-#define is_running_on_xen()	(xen_domain_type == XEN_PV_DOMAIN)
 
+#ifdef CONFIG_XEN
 extern struct shared_info *HYPERVISOR_shared_info;
 extern struct start_info *xen_start_info;
 
@@ -74,16 +81,6 @@ void force_evtchn_callback(void);
 
 /* For setup_arch() in arch/ia64/kernel/setup.c */
 void xen_ia64_enable_opt_feature(void);
-
-#else /* CONFIG_XEN */
-
-#define xen_domain()		(0)
-#define xen_pv_domain()		(0)
-#define xen_initial_domain()	(0)
-#define xen_hvm_domain()	(0)
-#define is_running_on_xen()	(0)	/* deprecated. remove this */
 #endif
 
-#define is_initial_xendomain()	(0)	/* deprecated. remove this */
-
 #endif /* _ASM_IA64_XEN_HYPERVISOR_H */
-- 
cgit v1.2.3


From dd97d5cb540939602cba9af6f88e883a6fe451f0 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:34 +0900
Subject: ia64/pv_ops: add hooks to paravirtualize fsyscall implementation.

Add two hooks, paravirt_get_fsyscall_table() and
paravirt_get_fsys_bubble_doen() to paravirtualize fsyscall implementation.
This patch just add the hooks fsyscall and don't paravirtualize it.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/native/inst.h |  3 +++
 arch/ia64/include/asm/paravirt.h    | 15 +++++++++++++++
 arch/ia64/kernel/Makefile           |  4 ++--
 arch/ia64/kernel/fsys.S             | 17 +++++++++--------
 arch/ia64/kernel/patch.c            | 26 +++++++++++++++++++++++---
 arch/ia64/mm/init.c                 |  3 ++-
 6 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/arch/ia64/include/asm/native/inst.h b/arch/ia64/include/asm/native/inst.h
index 0a1026cca4f..5e4e15133f4 100644
--- a/arch/ia64/include/asm/native/inst.h
+++ b/arch/ia64/include/asm/native/inst.h
@@ -30,6 +30,9 @@
 #define __paravirt_work_processed_syscall_target \
 						ia64_work_processed_syscall
 
+#define paravirt_fsyscall_table			ia64_native_fsyscall_table
+#define paravirt_fsys_bubble_down		ia64_native_fsys_bubble_down
+
 #ifdef CONFIG_PARAVIRT_GUEST_ASM_CLOBBER_CHECK
 # define PARAVIRT_POISON	0xdeadbeefbaadf00d
 # define CLOBBER(clob)				\
diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h
index 2bf3636473f..56f69f938cc 100644
--- a/arch/ia64/include/asm/paravirt.h
+++ b/arch/ia64/include/asm/paravirt.h
@@ -22,6 +22,21 @@
 #ifndef __ASM_PARAVIRT_H
 #define __ASM_PARAVIRT_H
 
+#ifndef __ASSEMBLY__
+/******************************************************************************
+ * fsys related addresses
+ */
+struct pv_fsys_data {
+	unsigned long *fsyscall_table;
+	void *fsys_bubble_down;
+};
+
+extern struct pv_fsys_data pv_fsys_data;
+
+unsigned long *paravirt_get_fsyscall_table(void);
+char *paravirt_get_fsys_bubble_down(void);
+#endif
+
 #ifdef CONFIG_PARAVIRT_GUEST
 
 #define PARAVIRT_HYPERVISOR_TYPE_DEFAULT	0
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index c381ea95489..1ab150ec8ce 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -111,9 +111,9 @@ include/asm-ia64/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s
 clean-files += $(objtree)/include/asm-ia64/nr-irqs.h
 
 #
-# native ivt.S and entry.S
+# native ivt.S, entry.S and fsys.S
 #
-ASM_PARAVIRT_OBJS = ivt.o entry.o
+ASM_PARAVIRT_OBJS = ivt.o entry.o fsys.o
 define paravirtualized_native
 AFLAGS_$(1) += -D__IA64_ASM_PARAVIRTUALIZED_NATIVE
 AFLAGS_pvchk-sed-$(1) += -D__IA64_ASM_PARAVIRTUALIZED_PVCHECK
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index c1625c7e177..788319f121a 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -25,6 +25,7 @@
 #include <asm/unistd.h>
 
 #include "entry.h"
+#include "paravirt_inst.h"
 
 /*
  * See Documentation/ia64/fsys.txt for details on fsyscalls.
@@ -602,7 +603,7 @@ ENTRY(fsys_fallback_syscall)
 	mov r26=ar.pfs
 END(fsys_fallback_syscall)
 	/* FALL THROUGH */
-GLOBAL_ENTRY(fsys_bubble_down)
+GLOBAL_ENTRY(paravirt_fsys_bubble_down)
 	.prologue
 	.altrp b6
 	.body
@@ -640,7 +641,7 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	 *
 	 * PSR.BE : already is turned off in __kernel_syscall_via_epc()
 	 * PSR.AC : don't care (kernel normally turns PSR.AC on)
-	 * PSR.I  : already turned off by the time fsys_bubble_down gets
+	 * PSR.I  : already turned off by the time paravirt_fsys_bubble_down gets
 	 *	    invoked
 	 * PSR.DFL: always 0 (kernel never turns it on)
 	 * PSR.DFH: don't care --- kernel never touches f32-f127 on its own
@@ -650,7 +651,7 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	 * PSR.DB : don't care --- kernel never enables kernel-level
 	 *	    breakpoints
 	 * PSR.TB : must be 0 already; if it wasn't zero on entry to
-	 *          __kernel_syscall_via_epc, the branch to fsys_bubble_down
+	 *          __kernel_syscall_via_epc, the branch to paravirt_fsys_bubble_down
 	 *          will trigger a taken branch; the taken-trap-handler then
 	 *          converts the syscall into a break-based system-call.
 	 */
@@ -741,14 +742,14 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	nop.m 0
 (p8)	br.call.sptk.many b6=b6			// B    (ignore return address)
 	br.cond.spnt ia64_trace_syscall		// B
-END(fsys_bubble_down)
+END(paravirt_fsys_bubble_down)
 
 	.rodata
 	.align 8
-	.globl fsyscall_table
+	.globl paravirt_fsyscall_table
 
-	data8 fsys_bubble_down
-fsyscall_table:
+	data8 paravirt_fsys_bubble_down
+paravirt_fsyscall_table:
 	data8 fsys_ni_syscall
 	data8 0				// exit			// 1025
 	data8 0				// read
@@ -1033,4 +1034,4 @@ fsyscall_table:
 
 	// fill in zeros for the remaining entries
 	.zero:
-	.space fsyscall_table + 8*NR_syscalls - .zero, 0
+	.space paravirt_fsyscall_table + 8*NR_syscalls - .zero, 0
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c
index b83b2c51600..02dd977436f 100644
--- a/arch/ia64/kernel/patch.c
+++ b/arch/ia64/kernel/patch.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/string.h>
 
+#include <asm/paravirt.h>
 #include <asm/patch.h>
 #include <asm/processor.h>
 #include <asm/sections.h>
@@ -169,16 +170,35 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
 	ia64_srlz_i();
 }
 
+extern unsigned long ia64_native_fsyscall_table[NR_syscalls];
+extern char ia64_native_fsys_bubble_down[];
+struct pv_fsys_data pv_fsys_data __initdata = {
+	.fsyscall_table = (unsigned long *)ia64_native_fsyscall_table,
+	.fsys_bubble_down = (void *)ia64_native_fsys_bubble_down,
+};
+
+unsigned long * __init
+paravirt_get_fsyscall_table(void)
+{
+	return pv_fsys_data.fsyscall_table;
+}
+
+char * __init
+paravirt_get_fsys_bubble_down(void)
+{
+	return pv_fsys_data.fsys_bubble_down;
+}
+
 static void __init
 patch_fsyscall_table (unsigned long start, unsigned long end)
 {
-	extern unsigned long fsyscall_table[NR_syscalls];
+	u64 fsyscall_table = (u64)paravirt_get_fsyscall_table();
 	s32 *offp = (s32 *) start;
 	u64 ip;
 
 	while (offp < (s32 *) end) {
 		ip = (u64) ia64_imva((char *) offp + *offp);
-		ia64_patch_imm64(ip, (u64) fsyscall_table);
+		ia64_patch_imm64(ip, fsyscall_table);
 		ia64_fc((void *) ip);
 		++offp;
 	}
@@ -189,7 +209,7 @@ patch_fsyscall_table (unsigned long start, unsigned long end)
 static void __init
 patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
 {
-	extern char fsys_bubble_down[];
+	u64 fsys_bubble_down = (u64)paravirt_get_fsys_bubble_down();
 	s32 *offp = (s32 *) start;
 	u64 ip;
 
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 56e12903973..c9bc5b305ff 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -35,6 +35,7 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/mca.h>
+#include <asm/paravirt.h>
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
@@ -667,8 +668,8 @@ mem_init (void)
 	 * code can tell them apart.
 	 */
 	for (i = 0; i < NR_syscalls; ++i) {
-		extern unsigned long fsyscall_table[NR_syscalls];
 		extern unsigned long sys_call_table[NR_syscalls];
+		unsigned long *fsyscall_table = paravirt_get_fsyscall_table();
 
 		if (!fsyscall_table[i] || nolwsys)
 			fsyscall_table[i] = sys_call_table[i] | 1;
-- 
cgit v1.2.3


From 533bd156231eec4b399c36579e7c30b6f52cfd29 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:35 +0900
Subject: ia64/pv_ops/xen: preliminary to paravirtualizing fsys.S for xen.

This is a preliminary patch to paravirtualizing fsys.S.
compile fsys.S twice one for native and one for xen, and switch
them at run tine.
Later fsys.S will be paravirtualized.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/xen/inst.h |  3 +++
 arch/ia64/xen/Makefile           |  2 +-
 arch/ia64/xen/xen_pv_ops.c       | 14 ++++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/arch/ia64/include/asm/xen/inst.h b/arch/ia64/include/asm/xen/inst.h
index 19c2ae1d878..e8e01b28d2a 100644
--- a/arch/ia64/include/asm/xen/inst.h
+++ b/arch/ia64/include/asm/xen/inst.h
@@ -33,6 +33,9 @@
 #define __paravirt_work_processed_syscall_target \
 						xen_work_processed_syscall
 
+#define paravirt_fsyscall_table			xen_fsyscall_table
+#define paravirt_fsys_bubble_down		xen_fsys_bubble_down
+
 #define MOV_FROM_IFA(reg)	\
 	movl reg = XSI_IFA;	\
 	;;			\
diff --git a/arch/ia64/xen/Makefile b/arch/ia64/xen/Makefile
index 0ad0224693d..b4ca2e6c0ea 100644
--- a/arch/ia64/xen/Makefile
+++ b/arch/ia64/xen/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_IA64_GENERIC) += machvec.o
 AFLAGS_xenivt.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN
 
 # xen multi compile
-ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S
+ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S fsys.S
 ASM_PARAVIRT_OBJS = $(addprefix xen-,$(ASM_PARAVIRT_MULTI_COMPILE_SRCS:.S=.o))
 obj-y += $(ASM_PARAVIRT_OBJS)
 define paravirtualized_xen
diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c
index fa3b967e69c..fe72308321b 100644
--- a/arch/ia64/xen/xen_pv_ops.c
+++ b/arch/ia64/xen/xen_pv_ops.c
@@ -24,6 +24,7 @@
 #include <linux/irq.h>
 #include <linux/kernel.h>
 #include <linux/pm.h>
+#include <linux/unistd.h>
 
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/xencomm.h>
@@ -165,6 +166,18 @@ static const struct pv_init_ops xen_init_ops __initconst = {
 	.post_smp_prepare_boot_cpu = xen_post_smp_prepare_boot_cpu,
 };
 
+/***************************************************************************
+ * pv_fsys_data
+ * addresses for fsys
+ */
+
+extern unsigned long xen_fsyscall_table[NR_syscalls];
+extern char xen_fsys_bubble_down[];
+struct pv_fsys_data xen_fsys_data __initdata = {
+	.fsyscall_table = (unsigned long *)xen_fsyscall_table,
+	.fsys_bubble_down = (void *)xen_fsys_bubble_down,
+};
+
 /***************************************************************************
  * pv_cpu_ops
  * intrinsics hooks.
@@ -355,6 +368,7 @@ xen_setup_pv_ops(void)
 	xen_info_init();
 	pv_info = xen_info;
 	pv_init_ops = xen_init_ops;
+	pv_fsys_data = xen_fsys_data;
 	pv_cpu_ops = xen_cpu_ops;
 	pv_iosapic_ops = xen_iosapic_ops;
 	pv_irq_ops = xen_irq_ops;
-- 
cgit v1.2.3


From 84b8857a038c060535dafdc8732a1ed60d0e98fc Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:36 +0900
Subject: ia64/pv_ops: paravirtualize fsys.S.

paravirtualize fsys.S.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/fsys.S | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index 788319f121a..3544d75e7cb 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -419,7 +419,7 @@ EX(.fail_efault, ld8 r14=[r33])			// r14 <- *set
 	mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1))
 	;;
 
-	rsm psr.i				// mask interrupt delivery
+	RSM_PSR_I(p0, r18, r19)			// mask interrupt delivery
 	mov ar.ccv=0
 	andcm r14=r14,r17			// filter out SIGKILL & SIGSTOP
 
@@ -492,7 +492,7 @@ EX(.fail_efault, ld8 r14=[r33])			// r14 <- *set
 #ifdef CONFIG_SMP
 	st4.rel [r31]=r0			// release the lock
 #endif
-	ssm psr.i
+	SSM_PSR_I(p0, p9, r31)
 	;;
 
 	srlz.d					// ensure psr.i is set again
@@ -514,7 +514,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3)
 #ifdef CONFIG_SMP
 	st4.rel [r31]=r0			// release the lock
 #endif
-	ssm psr.i
+	SSM_PSR_I(p0, p9, r17)
 	;;
 	srlz.d
 	br.sptk.many fsys_fallback_syscall	// with signal pending, do the heavy-weight syscall
@@ -522,7 +522,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3)
 #ifdef CONFIG_SMP
 .lock_contention:
 	/* Rather than spinning here, fall back on doing a heavy-weight syscall.  */
-	ssm psr.i
+	SSM_PSR_I(p0, p9, r17)
 	;;
 	srlz.d
 	br.sptk.many fsys_fallback_syscall
@@ -593,11 +593,11 @@ ENTRY(fsys_fallback_syscall)
 	adds r17=-1024,r15
 	movl r14=sys_call_table
 	;;
-	rsm psr.i
+	RSM_PSR_I(p0, r26, r27)
 	shladd r18=r17,3,r14
 	;;
 	ld8 r18=[r18]				// load normal (heavy-weight) syscall entry-point
-	mov r29=psr				// read psr (12 cyc load latency)
+	MOV_FROM_PSR(p0, r29, r26)		// read psr (12 cyc load latency)
 	mov r27=ar.rsc
 	mov r21=ar.fpsr
 	mov r26=ar.pfs
@@ -735,7 +735,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down)
 	mov rp=r14				// I0   set the real return addr
 	and r3=_TIF_SYSCALL_TRACEAUDIT,r3	// A
 	;;
-	ssm psr.i				// M2   we're on kernel stacks now, reenable irqs
+	SSM_PSR_I(p0, p6, r22)			// M2   we're on kernel stacks now, reenable irqs
 	cmp.eq p8,p0=r3,r0			// A
 (p10)	br.cond.spnt.many ia64_ret_from_syscall	// B    return if bad call-frame or r15 is a NaT
 
-- 
cgit v1.2.3


From 9d1964f25c3c43dab5a5f4761477bcfc8402250e Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:37 +0900
Subject: ia64/pv_ops/pvchecker: support mov = ar.itc paravirtualization

add suport for mov = ar.itc to pvchecker.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/native/pvchk_inst.h | 5 +++++
 arch/ia64/scripts/pvcheck.sed             | 1 +
 2 files changed, 6 insertions(+)

diff --git a/arch/ia64/include/asm/native/pvchk_inst.h b/arch/ia64/include/asm/native/pvchk_inst.h
index b8e6eb1090d..13b289ea617 100644
--- a/arch/ia64/include/asm/native/pvchk_inst.h
+++ b/arch/ia64/include/asm/native/pvchk_inst.h
@@ -180,6 +180,11 @@
 	IS_PRED_IN(pred)			\
 	IS_RREG_OUT(reg)			\
 	IS_RREG_CLOB(clob)
+#define MOV_FROM_ITC(pred, pred_clob, reg, clob)	\
+	IS_PRED_IN(pred)				\
+	IS_PRED_CLOB(pred_clob)				\
+	IS_RREG_OUT(reg)				\
+	IS_RREG_CLOB(clob)
 #define MOV_TO_IFA(reg, clob)			\
 	IS_RREG_IN(reg)				\
 	IS_RREG_CLOB(clob)
diff --git a/arch/ia64/scripts/pvcheck.sed b/arch/ia64/scripts/pvcheck.sed
index ba66ac2e4c6..e59809a3fc0 100644
--- a/arch/ia64/scripts/pvcheck.sed
+++ b/arch/ia64/scripts/pvcheck.sed
@@ -17,6 +17,7 @@ s/mov.*=.*cr\.iip/.warning \"cr.iip should not used directly\"/g
 s/mov.*=.*cr\.ivr/.warning \"cr.ivr should not used directly\"/g
 s/mov.*=[^\.]*psr/.warning \"psr should not used directly\"/g	# avoid ar.fpsr
 s/mov.*=.*ar\.eflags/.warning \"ar.eflags should not used directly\"/g
+s/mov.*=.*ar\.itc.*/.warning \"ar.itc should not used directly\"/g
 s/mov.*cr\.ifa.*=.*/.warning \"cr.ifa should not used directly\"/g
 s/mov.*cr\.itir.*=.*/.warning \"cr.itir should not used directly\"/g
 s/mov.*cr\.iha.*=.*/.warning \"cr.iha should not used directly\"/g
-- 
cgit v1.2.3


From 94752a794ddfdef65289a16627faefa7e2e62d58 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:38 +0900
Subject: ia64/pv_ops: paravirtualize mov = ar.itc.

paravirtualize mov reg = ar.itc.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/native/inst.h | 5 +++++
 arch/ia64/kernel/entry.S            | 4 ++--
 arch/ia64/kernel/fsys.S             | 4 ++--
 arch/ia64/kernel/ivt.S              | 2 +-
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/ia64/include/asm/native/inst.h b/arch/ia64/include/asm/native/inst.h
index 5e4e15133f4..ad59fc6a613 100644
--- a/arch/ia64/include/asm/native/inst.h
+++ b/arch/ia64/include/asm/native/inst.h
@@ -77,6 +77,11 @@
 (pred)	mov reg = psr			\
 	CLOBBER(clob)
 
+#define MOV_FROM_ITC(pred, pred_clob, reg, clob)	\
+(pred)	mov reg = ar.itc				\
+	CLOBBER(clob)					\
+	CLOBBER_PRED(pred_clob)
+
 #define MOV_TO_IFA(reg, clob)	\
 	mov cr.ifa = reg	\
 	CLOBBER(clob)
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index e5341e2c117..ccfdeee9d89 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -735,7 +735,7 @@ GLOBAL_ENTRY(__paravirt_leave_syscall)
 __paravirt_work_processed_syscall:
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 	adds r2=PT(LOADRS)+16,r12
-(pUStk)	mov.m r22=ar.itc			// fetch time at leave
+	MOV_FROM_ITC(pUStk, p9, r22, r19)	// fetch time at leave
 	adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
 	;;
 (p6)	ld4 r31=[r18]				// load current_thread_info()->flags
@@ -984,7 +984,7 @@ GLOBAL_ENTRY(__paravirt_leave_kernel)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 	.pred.rel.mutex pUStk,pKStk
 	MOV_FROM_PSR(pKStk, r22, r29)	// M2 read PSR now that interrupts are disabled
-(pUStk)	mov.m r22=ar.itc	// M  fetch time at leave
+	MOV_FROM_ITC(pUStk, p9, r22, r29)	// M  fetch time at leave
 	nop.i 0
 	;;
 #else
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index 3544d75e7cb..3567d54f8ce 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -280,7 +280,7 @@ ENTRY(fsys_gettimeofday)
 (p9)	cmp.eq p13,p0 = 0,r30	// if mmio_ptr, clear p13 jitter control
 	;;
 	.pred.rel.mutex p8,p9
-(p8)	mov r2 = ar.itc		// CPU_TIMER. 36 clocks latency!!!
+	MOV_FROM_ITC(p8, p6, r2, r10)	// CPU_TIMER. 36 clocks latency!!!
 (p9)	ld8 r2 = [r30]		// MMIO_TIMER. Could also have latency issues..
 (p13)	ld8 r25 = [r19]		// get itc_lastcycle value
 	ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET	// tv_sec
@@ -684,7 +684,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down)
 	;;
 	mov ar.rsc=0				// M2   set enforced lazy mode, pl 0, LE, loadrs=0
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-	mov.m r30=ar.itc			// M    get cycle for accounting
+	MOV_FROM_ITC(p0, p6, r30, r23)		// M    get cycle for accounting
 #else
 	nop.m 0
 #endif
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
index f675d8e3385..ec9a5fdfa1b 100644
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -804,7 +804,7 @@ ENTRY(break_fault)
 ///////////////////////////////////////////////////////////////////////
 	st1 [r16]=r0				// M2|3 clear current->thread.on_ustack flag
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-	mov.m r30=ar.itc			// M    get cycle for accounting
+	MOV_FROM_ITC(p0, p14, r30, r18)		// M    get cycle for accounting
 #else
 	mov b6=r30				// I0   setup syscall handler branch reg early
 #endif
-- 
cgit v1.2.3


From 496203b15b7249599712525c2b6aafe231b4628d Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:39 +0900
Subject: ia64/pv_ops/xen: paravirtualize read/write ar.itc and ar.itm

paravirtualize ar.itc and ar.itm in order to support save/restore.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/xen/inst.h      | 21 +++++++++
 arch/ia64/include/asm/xen/interface.h |  9 ++++
 arch/ia64/include/asm/xen/minstate.h  | 11 ++++-
 arch/ia64/include/asm/xen/privop.h    |  2 +
 arch/ia64/kernel/asm-offsets.c        |  2 +
 arch/ia64/xen/xen_pv_ops.c            | 80 ++++++++++++++++++++++++++++++++++-
 6 files changed, 123 insertions(+), 2 deletions(-)

diff --git a/arch/ia64/include/asm/xen/inst.h b/arch/ia64/include/asm/xen/inst.h
index e8e01b28d2a..90537dc15ef 100644
--- a/arch/ia64/include/asm/xen/inst.h
+++ b/arch/ia64/include/asm/xen/inst.h
@@ -113,6 +113,27 @@
 .endm
 #define MOV_FROM_PSR(pred, reg, clob)	__MOV_FROM_PSR pred, reg, clob
 
+/* assuming ar.itc is read with interrupt disabled. */
+#define MOV_FROM_ITC(pred, pred_clob, reg, clob)		\
+(pred)	movl clob = XSI_ITC_OFFSET;				\
+	;;							\
+(pred)	ld8 clob = [clob];					\
+(pred)	mov reg = ar.itc;					\
+	;;							\
+(pred)	add reg = reg, clob;					\
+	;;							\
+(pred)	movl clob = XSI_ITC_LAST;				\
+	;;							\
+(pred)	ld8 clob = [clob];					\
+	;;							\
+(pred)	cmp.geu.unc pred_clob, p0 = clob, reg;			\
+	;;							\
+(pred_clob)	add reg = 1, clob;				\
+	;;							\
+(pred)	movl clob = XSI_ITC_LAST;				\
+	;;							\
+(pred)	st8 [clob] = reg
+
 
 #define MOV_TO_IFA(reg, clob)	\
 	movl clob = XSI_IFA;	\
diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h
index f00fab40854..e951e740bdf 100644
--- a/arch/ia64/include/asm/xen/interface.h
+++ b/arch/ia64/include/asm/xen/interface.h
@@ -209,6 +209,15 @@ struct mapped_regs {
 			unsigned long krs[8];	/* kernel registers */
 			unsigned long tmp[16];	/* temp registers
 						   (e.g. for hyperprivops) */
+
+			/* itc paravirtualization
+			 * vAR.ITC = mAR.ITC + itc_offset
+			 * itc_last is one which was lastly passed to
+			 * the guest OS in order to prevent it from
+			 * going backwords.
+			 */
+			unsigned long itc_offset;
+			unsigned long itc_last;
 		};
 	};
 };
diff --git a/arch/ia64/include/asm/xen/minstate.h b/arch/ia64/include/asm/xen/minstate.h
index 4d92d9bbda7..c57fa910f2c 100644
--- a/arch/ia64/include/asm/xen/minstate.h
+++ b/arch/ia64/include/asm/xen/minstate.h
@@ -1,3 +1,12 @@
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+/* read ar.itc in advance, and use it before leaving bank 0 */
+#define XEN_ACCOUNT_GET_STAMP		\
+	MOV_FROM_ITC(pUStk, p6, r20, r2);
+#else
+#define XEN_ACCOUNT_GET_STAMP
+#endif
+
 /*
  * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
  * the minimum state necessary that allows us to turn psr.ic back
@@ -123,7 +132,7 @@
 	;;											\
 .mem.offset 0,0; st8.spill [r16]=r2,16;								\
 .mem.offset 8,0; st8.spill [r17]=r3,16;								\
-	ACCOUNT_GET_STAMP									\
+	XEN_ACCOUNT_GET_STAMP									\
 	adds r2=IA64_PT_REGS_R16_OFFSET,r1;							\
 	;;											\
 	EXTRA;											\
diff --git a/arch/ia64/include/asm/xen/privop.h b/arch/ia64/include/asm/xen/privop.h
index 71ec7546e10..2261dda578f 100644
--- a/arch/ia64/include/asm/xen/privop.h
+++ b/arch/ia64/include/asm/xen/privop.h
@@ -55,6 +55,8 @@
 #define XSI_BANK1_R16			(XSI_BASE + XSI_BANK1_R16_OFS)
 #define XSI_BANKNUM			(XSI_BASE + XSI_BANKNUM_OFS)
 #define XSI_IHA				(XSI_BASE + XSI_IHA_OFS)
+#define XSI_ITC_OFFSET			(XSI_BASE + XSI_ITC_OFFSET_OFS)
+#define XSI_ITC_LAST			(XSI_BASE + XSI_ITC_LAST_OFS)
 #endif
 
 #ifndef __ASSEMBLY__
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index 742dbb1d5a4..af565016904 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -316,5 +316,7 @@ void foo(void)
 	DEFINE_MAPPED_REG_OFS(XSI_BANK1_R16_OFS, bank1_regs[0]);
 	DEFINE_MAPPED_REG_OFS(XSI_B0NATS_OFS, vbnat);
 	DEFINE_MAPPED_REG_OFS(XSI_B1NATS_OFS, vnat);
+	DEFINE_MAPPED_REG_OFS(XSI_ITC_OFFSET_OFS, itc_offset);
+	DEFINE_MAPPED_REG_OFS(XSI_ITC_LAST_OFS, itc_last);
 #endif /* CONFIG_XEN */
 }
diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c
index fe72308321b..d9133611434 100644
--- a/arch/ia64/xen/xen_pv_ops.c
+++ b/arch/ia64/xen/xen_pv_ops.c
@@ -183,6 +183,75 @@ struct pv_fsys_data xen_fsys_data __initdata = {
  * intrinsics hooks.
  */
 
+static void
+xen_set_itm_with_offset(unsigned long val)
+{
+	/* ia64_cpu_local_tick() calls this with interrupt enabled. */
+	/* WARN_ON(!irqs_disabled()); */
+	xen_set_itm(val - XEN_MAPPEDREGS->itc_offset);
+}
+
+static unsigned long
+xen_get_itm_with_offset(void)
+{
+	/* unused at this moment */
+	printk(KERN_DEBUG "%s is called.\n", __func__);
+
+	WARN_ON(!irqs_disabled());
+	return ia64_native_getreg(_IA64_REG_CR_ITM) +
+		XEN_MAPPEDREGS->itc_offset;
+}
+
+/* ia64_set_itc() is only called by
+ * cpu_init() with ia64_set_itc(0) and ia64_sync_itc().
+ * So XEN_MAPPEDRESG->itc_offset cal be considered as almost constant.
+ */
+static void
+xen_set_itc(unsigned long val)
+{
+	unsigned long mitc;
+
+	WARN_ON(!irqs_disabled());
+	mitc = ia64_native_getreg(_IA64_REG_AR_ITC);
+	XEN_MAPPEDREGS->itc_offset = val - mitc;
+	XEN_MAPPEDREGS->itc_last = val;
+}
+
+static unsigned long
+xen_get_itc(void)
+{
+	unsigned long res;
+	unsigned long itc_offset;
+	unsigned long itc_last;
+	unsigned long ret_itc_last;
+
+	itc_offset = XEN_MAPPEDREGS->itc_offset;
+	do {
+		itc_last = XEN_MAPPEDREGS->itc_last;
+		res = ia64_native_getreg(_IA64_REG_AR_ITC);
+		res += itc_offset;
+		if (itc_last >= res)
+			res = itc_last + 1;
+		ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last,
+				       itc_last, res);
+	} while (unlikely(ret_itc_last != itc_last));
+	return res;
+
+#if 0
+	/* ia64_itc_udelay() calls ia64_get_itc() with interrupt enabled.
+	   Should it be paravirtualized instead? */
+	WARN_ON(!irqs_disabled());
+	itc_offset = XEN_MAPPEDREGS->itc_offset;
+	itc_last = XEN_MAPPEDREGS->itc_last;
+	res = ia64_native_getreg(_IA64_REG_AR_ITC);
+	res += itc_offset;
+	if (itc_last >= res)
+		res = itc_last + 1;
+	XEN_MAPPEDREGS->itc_last = res;
+	return res;
+#endif
+}
+
 static void xen_setreg(int regnum, unsigned long val)
 {
 	switch (regnum) {
@@ -194,11 +263,14 @@ static void xen_setreg(int regnum, unsigned long val)
 		xen_set_eflag(val);
 		break;
 #endif
+	case _IA64_REG_AR_ITC:
+		xen_set_itc(val);
+		break;
 	case _IA64_REG_CR_TPR:
 		xen_set_tpr(val);
 		break;
 	case _IA64_REG_CR_ITM:
-		xen_set_itm(val);
+		xen_set_itm_with_offset(val);
 		break;
 	case _IA64_REG_CR_EOI:
 		xen_eoi(val);
@@ -222,6 +294,12 @@ static unsigned long xen_getreg(int regnum)
 		res = xen_get_eflag();
 		break;
 #endif
+	case _IA64_REG_AR_ITC:
+		res = xen_get_itc();
+		break;
+	case _IA64_REG_CR_ITM:
+		res = xen_get_itm_with_offset();
+		break;
 	case _IA64_REG_CR_IVR:
 		res = xen_get_ivr();
 		break;
-- 
cgit v1.2.3


From f927da178671a824cf6c530f0623544206387e57 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:40 +0900
Subject: ia64/pv_ops/pv_time_ops: add sched_clock hook.

add sched_clock() hook to paravirtualize sched_clock().
ia64 sched_clock() is based on ar.itc which isn't stable
on virtualized environment because vcpu may move around on
pcpus. So it needs paravirtualization.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/paravirt.h |  7 +++++++
 arch/ia64/include/asm/timex.h    |  1 +
 arch/ia64/kernel/head.S          | 10 ++++++++--
 arch/ia64/kernel/paravirt.c      |  1 +
 arch/ia64/kernel/time.c          |  9 +++++++++
 5 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h
index 56f69f938cc..a73e77add7e 100644
--- a/arch/ia64/include/asm/paravirt.h
+++ b/arch/ia64/include/asm/paravirt.h
@@ -225,6 +225,8 @@ struct pv_time_ops {
 	int (*do_steal_accounting)(unsigned long *new_itm);
 
 	void (*clocksource_resume)(void);
+
+	unsigned long long (*sched_clock)(void);
 };
 
 extern struct pv_time_ops pv_time_ops;
@@ -242,6 +244,11 @@ paravirt_do_steal_accounting(unsigned long *new_itm)
 	return pv_time_ops.do_steal_accounting(new_itm);
 }
 
+static inline unsigned long long paravirt_sched_clock(void)
+{
+	return pv_time_ops.sched_clock();
+}
+
 #endif /* !__ASSEMBLY__ */
 
 #else
diff --git a/arch/ia64/include/asm/timex.h b/arch/ia64/include/asm/timex.h
index 4e03cfe74a0..86c7db86118 100644
--- a/arch/ia64/include/asm/timex.h
+++ b/arch/ia64/include/asm/timex.h
@@ -40,5 +40,6 @@ get_cycles (void)
 }
 
 extern void ia64_cpu_local_tick (void);
+extern unsigned long long ia64_native_sched_clock (void);
 
 #endif /* _ASM_IA64_TIMEX_H */
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
index 59301c47280..23f846de62d 100644
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -1050,7 +1050,7 @@ END(ia64_delay_loop)
  * except that the multiplication and the shift are done with 128-bit
  * intermediate precision so that we can produce a full 64-bit result.
  */
-GLOBAL_ENTRY(sched_clock)
+GLOBAL_ENTRY(ia64_native_sched_clock)
 	addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
 	mov.m r9=ar.itc		// fetch cycle-counter				(35 cyc)
 	;;
@@ -1066,7 +1066,13 @@ GLOBAL_ENTRY(sched_clock)
 	;;
 	shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
 	br.ret.sptk.many rp
-END(sched_clock)
+END(ia64_native_sched_clock)
+#ifndef CONFIG_PARAVIRT
+	//unsigned long long
+	//sched_clock(void) __attribute__((alias("ia64_native_sched_clock")));
+	.global sched_clock
+sched_clock = ia64_native_sched_clock
+#endif
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 GLOBAL_ENTRY(cycle_to_cputime)
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c
index 9f14c16f636..6bc33a6db75 100644
--- a/arch/ia64/kernel/paravirt.c
+++ b/arch/ia64/kernel/paravirt.c
@@ -366,4 +366,5 @@ ia64_native_do_steal_accounting(unsigned long *new_itm)
 
 struct pv_time_ops pv_time_ops = {
 	.do_steal_accounting = ia64_native_do_steal_accounting,
+	.sched_clock = ia64_native_sched_clock,
 };
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index f0ebb342409..c323c7b9c77 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -49,6 +49,15 @@ EXPORT_SYMBOL(last_cli_ip);
 
 #endif
 
+#ifdef CONFIG_PARAVIRT
+/* We need to define a real function for sched_clock, to override the
+   weak default version */
+unsigned long long sched_clock(void)
+{
+        return paravirt_sched_clock();
+}
+#endif
+
 #ifdef CONFIG_PARAVIRT
 static void
 paravirt_clocksource_resume(void)
-- 
cgit v1.2.3


From 1aec1c558a797512e922581b21a178a05438bfc9 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:41 +0900
Subject: ia64/pv_ops/xen/pv_time_ops: implement sched_clock.

paravirtualize sched_clock.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/xen/time.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c
index 68d6204c3f1..fb833269017 100644
--- a/arch/ia64/xen/time.c
+++ b/arch/ia64/xen/time.c
@@ -175,10 +175,58 @@ static void xen_itc_jitter_data_reset(void)
 	} while (unlikely(ret != lcycle));
 }
 
+/* based on xen_sched_clock() in arch/x86/xen/time.c. */
+/*
+ * This relies on HAVE_UNSTABLE_SCHED_CLOCK. If it can't be defined,
+ * something similar logic should be implemented here.
+ */
+/*
+ * Xen sched_clock implementation.  Returns the number of unstolen
+ * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
+ * states.
+ */
+static unsigned long long xen_sched_clock(void)
+{
+	struct vcpu_runstate_info runstate;
+
+	unsigned long long now;
+	unsigned long long offset;
+	unsigned long long ret;
+
+	/*
+	 * Ideally sched_clock should be called on a per-cpu basis
+	 * anyway, so preempt should already be disabled, but that's
+	 * not current practice at the moment.
+	 */
+	preempt_disable();
+
+	/*
+	 * both ia64_native_sched_clock() and xen's runstate are
+	 * based on mAR.ITC. So difference of them makes sense.
+	 */
+	now = ia64_native_sched_clock();
+
+	get_runstate_snapshot(&runstate);
+
+	WARN_ON(runstate.state != RUNSTATE_running);
+
+	offset = 0;
+	if (now > runstate.state_entry_time)
+		offset = now - runstate.state_entry_time;
+	ret = runstate.time[RUNSTATE_blocked] +
+		runstate.time[RUNSTATE_running] +
+		offset;
+
+	preempt_enable();
+
+	return ret;
+}
+
 struct pv_time_ops xen_time_ops __initdata = {
 	.init_missing_ticks_accounting	= xen_init_missing_ticks_accounting,
 	.do_steal_accounting		= xen_do_steal_accounting,
 	.clocksource_resume		= xen_itc_jitter_data_reset,
+	.sched_clock			= xen_sched_clock,
 };
 
 /* Called after suspend, to resume time.  */
-- 
cgit v1.2.3


From e4ff5b8f545811008123dd9556a51d814f562fcf Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:42 +0900
Subject: ia64/pv_ops: gate page paravirtualization.

paravirtualize gate page by allowing each pv_ops instances
to define its own gate page.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/native/patchlist.h | 38 +++++++++++++++
 arch/ia64/include/asm/paravirt.h         | 35 ++++++++++++++
 arch/ia64/kernel/Makefile                | 32 ++-----------
 arch/ia64/kernel/Makefile.gate           | 27 +++++++++++
 arch/ia64/kernel/gate.lds.S              | 17 +++----
 arch/ia64/kernel/paravirt_patchlist.c    | 79 ++++++++++++++++++++++++++++++++
 arch/ia64/kernel/paravirt_patchlist.h    | 28 +++++++++++
 arch/ia64/kernel/patch.c                 | 12 ++---
 arch/ia64/mm/init.c                      |  6 ++-
 9 files changed, 231 insertions(+), 43 deletions(-)
 create mode 100644 arch/ia64/include/asm/native/patchlist.h
 create mode 100644 arch/ia64/kernel/Makefile.gate
 create mode 100644 arch/ia64/kernel/paravirt_patchlist.c
 create mode 100644 arch/ia64/kernel/paravirt_patchlist.h

diff --git a/arch/ia64/include/asm/native/patchlist.h b/arch/ia64/include/asm/native/patchlist.h
new file mode 100644
index 00000000000..be16ca9311b
--- /dev/null
+++ b/arch/ia64/include/asm/native/patchlist.h
@@ -0,0 +1,38 @@
+/******************************************************************************
+ * arch/ia64/include/asm/native/inst.h
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#define __paravirt_start_gate_fsyscall_patchlist		\
+	__ia64_native_start_gate_fsyscall_patchlist
+#define __paravirt_end_gate_fsyscall_patchlist			\
+	__ia64_native_end_gate_fsyscall_patchlist
+#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist	\
+	__ia64_native_start_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist	\
+	__ia64_native_end_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_start_gate_vtop_patchlist			\
+	__ia64_native_start_gate_vtop_patchlist
+#define __paravirt_end_gate_vtop_patchlist			\
+	__ia64_native_end_gate_vtop_patchlist
+#define __paravirt_start_gate_mckinley_e9_patchlist		\
+	__ia64_native_start_gate_mckinley_e9_patchlist
+#define __paravirt_end_gate_mckinley_e9_patchlist		\
+	__ia64_native_end_gate_mckinley_e9_patchlist
diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h
index a73e77add7e..fc433f6c327 100644
--- a/arch/ia64/include/asm/paravirt.h
+++ b/arch/ia64/include/asm/paravirt.h
@@ -35,6 +35,41 @@ extern struct pv_fsys_data pv_fsys_data;
 
 unsigned long *paravirt_get_fsyscall_table(void);
 char *paravirt_get_fsys_bubble_down(void);
+
+/******************************************************************************
+ * patchlist addresses for gate page
+ */
+enum pv_gate_patchlist {
+	PV_GATE_START_FSYSCALL,
+	PV_GATE_END_FSYSCALL,
+
+	PV_GATE_START_BRL_FSYS_BUBBLE_DOWN,
+	PV_GATE_END_BRL_FSYS_BUBBLE_DOWN,
+
+	PV_GATE_START_VTOP,
+	PV_GATE_END_VTOP,
+
+	PV_GATE_START_MCKINLEY_E9,
+	PV_GATE_END_MCKINLEY_E9,
+};
+
+struct pv_patchdata {
+	unsigned long start_fsyscall_patchlist;
+	unsigned long end_fsyscall_patchlist;
+	unsigned long start_brl_fsys_bubble_down_patchlist;
+	unsigned long end_brl_fsys_bubble_down_patchlist;
+	unsigned long start_vtop_patchlist;
+	unsigned long end_vtop_patchlist;
+	unsigned long start_mckinley_e9_patchlist;
+	unsigned long end_mckinley_e9_patchlist;
+
+	void *gate_section;
+};
+
+extern struct pv_patchdata pv_patchdata;
+
+unsigned long paravirt_get_gate_patchlist(enum pv_gate_patchlist type);
+void *paravirt_get_gate_section(void);
 #endif
 
 #ifdef CONFIG_PARAVIRT_GUEST
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index 1ab150ec8ce..8dc9df8a87a 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -5,7 +5,7 @@
 extra-y	:= head.o init_task.o vmlinux.lds
 
 obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o	\
-	 irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o		\
+	 irq_lsapic.o ivt.o machvec.o pal.o paravirt_patchlist.o patch.o process.o perfmon.o ptrace.o sal.o		\
 	 salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
 	 unwind.o mca.o mca_asm.o topology.o
 
@@ -47,35 +47,13 @@ ifeq ($(CONFIG_DMAR), y)
 obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb.o
 endif
 
-# The gate DSO image is built using a special linker script.
-targets += gate.so gate-syms.o
-
-extra-y += gate.so gate-syms.o gate.lds gate.o
-
 # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
 CFLAGS_traps.o  += -mfixed-range=f2-f5,f16-f31
 
-CPPFLAGS_gate.lds := -P -C -U$(ARCH)
-
-quiet_cmd_gate = GATE $@
-      cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
-
-GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
-		     $(call ld-option, -Wl$(comma)--hash-style=sysv)
-$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
-	$(call if_changed,gate)
-
-$(obj)/built-in.o: $(obj)/gate-syms.o
-$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o
-
-GATECFLAGS_gate-syms.o = -r
-$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE
-	$(call if_changed,gate)
-
-# gate-data.o contains the gate DSO image as data in section .data.gate.
-# We must build gate.so before we can assemble it.
-# Note: kbuild does not track this dependency due to usage of .incbin
-$(obj)/gate-data.o: $(obj)/gate.so
+# The gate DSO image is built using a special linker script.
+include $(srctree)/arch/ia64/kernel/Makefile.gate
+# tell compiled for native
+CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_NATIVE
 
 # Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config
 define sed-y
diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate
new file mode 100644
index 00000000000..1d87f84069b
--- /dev/null
+++ b/arch/ia64/kernel/Makefile.gate
@@ -0,0 +1,27 @@
+# The gate DSO image is built using a special linker script.
+
+targets += gate.so gate-syms.o
+
+extra-y += gate.so gate-syms.o gate.lds gate.o
+
+CPPFLAGS_gate.lds := -P -C -U$(ARCH)
+
+quiet_cmd_gate = GATE $@
+      cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
+
+GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
+		     $(call ld-option, -Wl$(comma)--hash-style=sysv)
+$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
+	$(call if_changed,gate)
+
+$(obj)/built-in.o: $(obj)/gate-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o
+
+GATECFLAGS_gate-syms.o = -r
+$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE
+	$(call if_changed,gate)
+
+# gate-data.o contains the gate DSO image as data in section .data.gate.
+# We must build gate.so before we can assemble it.
+# Note: kbuild does not track this dependency due to usage of .incbin
+$(obj)/gate-data.o: $(obj)/gate.so
diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S
index 3cb1abc00e2..88c64ed47c3 100644
--- a/arch/ia64/kernel/gate.lds.S
+++ b/arch/ia64/kernel/gate.lds.S
@@ -7,6 +7,7 @@
 
 
 #include <asm/system.h>
+#include "paravirt_patchlist.h"
 
 SECTIONS
 {
@@ -33,21 +34,21 @@ SECTIONS
 	. = GATE_ADDR + 0x600;
 
 	.data.patch		: {
-		__start_gate_mckinley_e9_patchlist = .;
+		__paravirt_start_gate_mckinley_e9_patchlist = .;
 		*(.data.patch.mckinley_e9)
-		__end_gate_mckinley_e9_patchlist = .;
+		__paravirt_end_gate_mckinley_e9_patchlist = .;
 
-		__start_gate_vtop_patchlist = .;
+		__paravirt_start_gate_vtop_patchlist = .;
 		*(.data.patch.vtop)
-		__end_gate_vtop_patchlist = .;
+		__paravirt_end_gate_vtop_patchlist = .;
 
-		__start_gate_fsyscall_patchlist = .;
+		__paravirt_start_gate_fsyscall_patchlist = .;
 		*(.data.patch.fsyscall_table)
-		__end_gate_fsyscall_patchlist = .;
+		__paravirt_end_gate_fsyscall_patchlist = .;
 
-		__start_gate_brl_fsys_bubble_down_patchlist = .;
+		__paravirt_start_gate_brl_fsys_bubble_down_patchlist = .;
 		*(.data.patch.brl_fsys_bubble_down)
-		__end_gate_brl_fsys_bubble_down_patchlist = .;
+		__paravirt_end_gate_brl_fsys_bubble_down_patchlist = .;
 	}						:readable
 
 	.IA_64.unwind_info	: { *(.IA_64.unwind_info*) }
diff --git a/arch/ia64/kernel/paravirt_patchlist.c b/arch/ia64/kernel/paravirt_patchlist.c
new file mode 100644
index 00000000000..b28082a95d4
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patchlist.c
@@ -0,0 +1,79 @@
+/******************************************************************************
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/bug.h>
+#include <asm/paravirt.h>
+
+#define DECLARE(name)						\
+	extern unsigned long					\
+		__ia64_native_start_gate_##name##_patchlist[];	\
+	extern unsigned long					\
+		__ia64_native_end_gate_##name##_patchlist[]
+
+DECLARE(fsyscall);
+DECLARE(brl_fsys_bubble_down);
+DECLARE(vtop);
+DECLARE(mckinley_e9);
+
+extern unsigned long __start_gate_section[];
+
+#define ASSIGN(name)							    \
+	.start_##name##_patchlist =					    \
+		(unsigned long)__ia64_native_start_gate_##name##_patchlist, \
+	.end_##name##_patchlist =					    \
+		(unsigned long)__ia64_native_end_gate_##name##_patchlist
+
+struct pv_patchdata pv_patchdata __initdata = {
+	ASSIGN(fsyscall),
+	ASSIGN(brl_fsys_bubble_down),
+	ASSIGN(vtop),
+	ASSIGN(mckinley_e9),
+
+	.gate_section = (void*)__start_gate_section,
+};
+
+
+unsigned long __init
+paravirt_get_gate_patchlist(enum pv_gate_patchlist type)
+{
+
+#define CASE(NAME, name)					\
+	case PV_GATE_START_##NAME:				\
+		return pv_patchdata.start_##name##_patchlist;	\
+	case PV_GATE_END_##NAME:				\
+		return pv_patchdata.end_##name##_patchlist;	\
+
+	switch (type) {
+		CASE(FSYSCALL, fsyscall);
+		CASE(BRL_FSYS_BUBBLE_DOWN, brl_fsys_bubble_down);
+		CASE(VTOP, vtop);
+		CASE(MCKINLEY_E9, mckinley_e9);
+	default:
+		BUG();
+		break;
+	}
+	return 0;
+}
+
+void * __init
+paravirt_get_gate_section(void)
+{
+	return pv_patchdata.gate_section;
+}
diff --git a/arch/ia64/kernel/paravirt_patchlist.h b/arch/ia64/kernel/paravirt_patchlist.h
new file mode 100644
index 00000000000..0684aa6c650
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patchlist.h
@@ -0,0 +1,28 @@
+/******************************************************************************
+ * linux/arch/ia64/xen/paravirt_patchlist.h
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#if defined(__IA64_GATE_PARAVIRTUALIZED_XEN)
+#include <asm/xen/patchlist.h>
+#else
+#include <asm/native/patchlist.h>
+#endif
+
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c
index 02dd977436f..64c6f95daa3 100644
--- a/arch/ia64/kernel/patch.c
+++ b/arch/ia64/kernel/patch.c
@@ -227,13 +227,13 @@ patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
 void __init
 ia64_patch_gate (void)
 {
-#	define START(name)	((unsigned long) __start_gate_##name##_patchlist)
-#	define END(name)	((unsigned long)__end_gate_##name##_patchlist)
+#	define START(name)	paravirt_get_gate_patchlist(PV_GATE_START_##name)
+#	define END(name)	paravirt_get_gate_patchlist(PV_GATE_END_##name)
 
-	patch_fsyscall_table(START(fsyscall), END(fsyscall));
-	patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
-	ia64_patch_vtop(START(vtop), END(vtop));
-	ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
+	patch_fsyscall_table(START(FSYSCALL), END(FSYSCALL));
+	patch_brl_fsys_bubble_down(START(BRL_FSYS_BUBBLE_DOWN), END(BRL_FSYS_BUBBLE_DOWN));
+	ia64_patch_vtop(START(VTOP), END(VTOP));
+	ia64_patch_mckinley_e9(START(MCKINLEY_E9), END(MCKINLEY_E9));
 }
 
 void ia64_patch_phys_stack_reg(unsigned long val)
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index c9bc5b305ff..8503d534794 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -260,6 +260,7 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
 static void __init
 setup_gate (void)
 {
+	void *gate_section;
 	struct page *page;
 
 	/*
@@ -267,10 +268,11 @@ setup_gate (void)
 	 * headers etc. and once execute-only page to enable
 	 * privilege-promotion via "epc":
 	 */
-	page = virt_to_page(ia64_imva(__start_gate_section));
+	gate_section = paravirt_get_gate_section();
+	page = virt_to_page(ia64_imva(gate_section));
 	put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
 #ifdef HAVE_BUGGY_SEGREL
-	page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE));
+	page = virt_to_page(ia64_imva(gate_section + PAGE_SIZE));
 	put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
 #else
 	put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
-- 
cgit v1.2.3


From b937dd76d07f2347684d6cc1e1ec4e2746417357 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:43 +0900
Subject: ia64/pv_ops/xen: define xen specific gate page.

define xen specific gate page.
At this phase bits in the gate page is same to native.
At the next phase, it will be paravirtualized.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/xen/patchlist.h | 38 +++++++++++++++++++++++++++++++++++
 arch/ia64/kernel/vmlinux.lds.S        |  6 ++++++
 arch/ia64/xen/Makefile                | 16 ++++++++++++++-
 arch/ia64/xen/gate-data.S             |  3 +++
 arch/ia64/xen/xen_pv_ops.c            | 32 +++++++++++++++++++++++++++++
 5 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 arch/ia64/include/asm/xen/patchlist.h
 create mode 100644 arch/ia64/xen/gate-data.S

diff --git a/arch/ia64/include/asm/xen/patchlist.h b/arch/ia64/include/asm/xen/patchlist.h
new file mode 100644
index 00000000000..eae944e8884
--- /dev/null
+++ b/arch/ia64/include/asm/xen/patchlist.h
@@ -0,0 +1,38 @@
+/******************************************************************************
+ * arch/ia64/include/asm/xen/patchlist.h
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#define __paravirt_start_gate_fsyscall_patchlist		\
+	__xen_start_gate_fsyscall_patchlist
+#define __paravirt_end_gate_fsyscall_patchlist			\
+	__xen_end_gate_fsyscall_patchlist
+#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist	\
+	__xen_start_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist	\
+	__xen_end_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_start_gate_vtop_patchlist			\
+	__xen_start_gate_vtop_patchlist
+#define __paravirt_end_gate_vtop_patchlist			\
+	__xen_end_gate_vtop_patchlist
+#define __paravirt_start_gate_mckinley_e9_patchlist		\
+	__xen_start_gate_mckinley_e9_patchlist
+#define __paravirt_end_gate_mckinley_e9_patchlist		\
+	__xen_end_gate_mckinley_e9_patchlist
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 10a7d47e851..92ae7e8f014 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -201,6 +201,12 @@ SECTIONS
 	  __start_gate_section = .;
 	  *(.data.gate)
 	  __stop_gate_section = .;
+#ifdef CONFIG_XEN
+	  . = ALIGN(PAGE_SIZE);
+	  __xen_start_gate_section = .;
+	  *(.data.gate.xen)
+	  __xen_stop_gate_section = .;
+#endif
 	}
   . = ALIGN(PAGE_SIZE);		/* make sure the gate page doesn't expose
   				 * kernel data
diff --git a/arch/ia64/xen/Makefile b/arch/ia64/xen/Makefile
index b4ca2e6c0ea..94f0d8e7d9d 100644
--- a/arch/ia64/xen/Makefile
+++ b/arch/ia64/xen/Makefile
@@ -3,10 +3,24 @@
 #
 
 obj-y := hypercall.o xenivt.o xensetup.o xen_pv_ops.o irq_xen.o \
-	 hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o
+	 hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o \
+	 gate-data.o
 
 obj-$(CONFIG_IA64_GENERIC) += machvec.o
 
+# The gate DSO image is built using a special linker script.
+include $(srctree)/arch/ia64/kernel/Makefile.gate
+
+# tell compiled for xen
+CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_XEN
+
+# use same file of native.
+$(obj)/gate.o: $(src)/../kernel/gate.S FORCE
+	$(call if_changed_dep,as_o_S)
+$(obj)/gate.lds: $(src)/../kernel/gate.lds.S FORCE
+	$(call if_changed_dep,cpp_lds_S)
+
+
 AFLAGS_xenivt.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN
 
 # xen multi compile
diff --git a/arch/ia64/xen/gate-data.S b/arch/ia64/xen/gate-data.S
new file mode 100644
index 00000000000..7d4830afc91
--- /dev/null
+++ b/arch/ia64/xen/gate-data.S
@@ -0,0 +1,3 @@
+	.section .data.gate.xen, "aw"
+
+	.incbin "arch/ia64/xen/gate.so"
diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c
index d9133611434..bdf1acbce81 100644
--- a/arch/ia64/xen/xen_pv_ops.c
+++ b/arch/ia64/xen/xen_pv_ops.c
@@ -178,6 +178,37 @@ struct pv_fsys_data xen_fsys_data __initdata = {
 	.fsys_bubble_down = (void *)xen_fsys_bubble_down,
 };
 
+/***************************************************************************
+ * pv_patchdata
+ * patchdata addresses
+ */
+
+#define DECLARE(name)							\
+	extern unsigned long __xen_start_gate_##name##_patchlist[];	\
+	extern unsigned long __xen_end_gate_##name##_patchlist[]
+
+DECLARE(fsyscall);
+DECLARE(brl_fsys_bubble_down);
+DECLARE(vtop);
+DECLARE(mckinley_e9);
+
+extern unsigned long __xen_start_gate_section[];
+
+#define ASSIGN(name)							\
+	.start_##name##_patchlist =					\
+		(unsigned long)__xen_start_gate_##name##_patchlist,	\
+	.end_##name##_patchlist =					\
+		(unsigned long)__xen_end_gate_##name##_patchlist
+
+static struct pv_patchdata xen_patchdata __initdata = {
+	ASSIGN(fsyscall),
+	ASSIGN(brl_fsys_bubble_down),
+	ASSIGN(vtop),
+	ASSIGN(mckinley_e9),
+
+	.gate_section = (void*)__xen_start_gate_section,
+};
+
 /***************************************************************************
  * pv_cpu_ops
  * intrinsics hooks.
@@ -447,6 +478,7 @@ xen_setup_pv_ops(void)
 	pv_info = xen_info;
 	pv_init_ops = xen_init_ops;
 	pv_fsys_data = xen_fsys_data;
+	pv_patchdata = xen_patchdata;
 	pv_cpu_ops = xen_cpu_ops;
 	pv_iosapic_ops = xen_iosapic_ops;
 	pv_irq_ops = xen_irq_ops;
-- 
cgit v1.2.3


From 53129c5c553f8d0c45f12f15742ac112e8605ab5 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:44 +0900
Subject: ia64/pv_ops: move down __kernel_syscall_via_epc.

Move down __kernel_syscall_via_epc to the end of the page.
We want to paravirtualize only __kernel_syscall_via_epc because
it includes privileged instructions. Its paravirtualization increases
its symbols size.

On the other hand, each paravirtualized gate must have e symbols of
same value and size to native's because the page is mapped to GATE_ADDR
and GATE_ADDR + PERCPU_PAGE_SIZE and vmlinux is linked to those symbols.
Later to have the same symbol size, we pads NOPs at the end of
__kernel_syscall_via_epc. Move it after other functions to keep
symbols of other functions have same values and sizes.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/gate.S | 162 ++++++++++++++++++++++++------------------------
 1 file changed, 81 insertions(+), 81 deletions(-)

diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
index 74b1ccce4e8..c957228e3f1 100644
--- a/arch/ia64/kernel/gate.S
+++ b/arch/ia64/kernel/gate.S
@@ -48,87 +48,6 @@ GLOBAL_ENTRY(__kernel_syscall_via_break)
 }
 END(__kernel_syscall_via_break)
 
-/*
- * On entry:
- *	r11 = saved ar.pfs
- *	r15 = system call #
- *	b0  = saved return address
- *	b6  = return address
- * On exit:
- *	r11 = saved ar.pfs
- *	r15 = system call #
- *	b0  = saved return address
- *	all other "scratch" registers:	undefined
- *	all "preserved" registers:	same as on entry
- */
-
-GLOBAL_ENTRY(__kernel_syscall_via_epc)
-	.prologue
-	.altrp b6
-	.body
-{
-	/*
-	 * Note: the kernel cannot assume that the first two instructions in this
-	 * bundle get executed.  The remaining code must be safe even if
-	 * they do not get executed.
-	 */
-	adds r17=-1024,r15			// A
-	mov r10=0				// A    default to successful syscall execution
-	epc					// B	causes split-issue
-}
-	;;
-	rsm psr.be | psr.i			// M2 (5 cyc to srlz.d)
-	LOAD_FSYSCALL_TABLE(r14)		// X
-	;;
-	mov r16=IA64_KR(CURRENT)		// M2 (12 cyc)
-	shladd r18=r17,3,r14			// A
-	mov r19=NR_syscalls-1			// A
-	;;
-	lfetch [r18]				// M0|1
-	mov r29=psr				// M2 (12 cyc)
-	// If r17 is a NaT, p6 will be zero
-	cmp.geu p6,p7=r19,r17			// A    (sysnr > 0 && sysnr < 1024+NR_syscalls)?
-	;;
-	mov r21=ar.fpsr				// M2 (12 cyc)
-	tnat.nz p10,p9=r15			// I0
-	mov.i r26=ar.pfs			// I0 (would stall anyhow due to srlz.d...)
-	;;
-	srlz.d					// M0 (forces split-issue) ensure PSR.BE==0
-(p6)	ld8 r18=[r18]				// M0|1
-	nop.i 0
-	;;
-	nop.m 0
-(p6)	tbit.z.unc p8,p0=r18,0			// I0 (dual-issues with "mov b7=r18"!)
-	nop.i 0
-	;;
-(p8)	ssm psr.i
-(p6)	mov b7=r18				// I0
-(p8)	br.dptk.many b7				// B
-
-	mov r27=ar.rsc				// M2 (12 cyc)
-/*
- * brl.cond doesn't work as intended because the linker would convert this branch
- * into a branch to a PLT.  Perhaps there will be a way to avoid this with some
- * future version of the linker.  In the meantime, we just use an indirect branch
- * instead.
- */
-#ifdef CONFIG_ITANIUM
-(p6)	add r14=-8,r14				// r14 <- addr of fsys_bubble_down entry
-	;;
-(p6)	ld8 r14=[r14]				// r14 <- fsys_bubble_down
-	;;
-(p6)	mov b7=r14
-(p6)	br.sptk.many b7
-#else
-	BRL_COND_FSYS_BUBBLE_DOWN(p6)
-#endif
-	ssm psr.i
-	mov r10=-1
-(p10)	mov r8=EINVAL
-(p9)	mov r8=ENOSYS
-	FSYS_RETURN
-END(__kernel_syscall_via_epc)
-
 #	define ARG0_OFF		(16 + IA64_SIGFRAME_ARG0_OFFSET)
 #	define ARG1_OFF		(16 + IA64_SIGFRAME_ARG1_OFFSET)
 #	define ARG2_OFF		(16 + IA64_SIGFRAME_ARG2_OFFSET)
@@ -374,3 +293,84 @@ restore_rbs:
 	// invala not necessary as that will happen when returning to user-mode
 	br.cond.sptk back_from_restore_rbs
 END(__kernel_sigtramp)
+
+/*
+ * On entry:
+ *	r11 = saved ar.pfs
+ *	r15 = system call #
+ *	b0  = saved return address
+ *	b6  = return address
+ * On exit:
+ *	r11 = saved ar.pfs
+ *	r15 = system call #
+ *	b0  = saved return address
+ *	all other "scratch" registers:	undefined
+ *	all "preserved" registers:	same as on entry
+ */
+
+GLOBAL_ENTRY(__kernel_syscall_via_epc)
+	.prologue
+	.altrp b6
+	.body
+{
+	/*
+	 * Note: the kernel cannot assume that the first two instructions in this
+	 * bundle get executed.  The remaining code must be safe even if
+	 * they do not get executed.
+	 */
+	adds r17=-1024,r15			// A
+	mov r10=0				// A    default to successful syscall execution
+	epc					// B	causes split-issue
+}
+	;;
+	rsm psr.be | psr.i			// M2 (5 cyc to srlz.d)
+	LOAD_FSYSCALL_TABLE(r14)		// X
+	;;
+	mov r16=IA64_KR(CURRENT)		// M2 (12 cyc)
+	shladd r18=r17,3,r14			// A
+	mov r19=NR_syscalls-1			// A
+	;;
+	lfetch [r18]				// M0|1
+	mov r29=psr				// M2 (12 cyc)
+	// If r17 is a NaT, p6 will be zero
+	cmp.geu p6,p7=r19,r17			// A    (sysnr > 0 && sysnr < 1024+NR_syscalls)?
+	;;
+	mov r21=ar.fpsr				// M2 (12 cyc)
+	tnat.nz p10,p9=r15			// I0
+	mov.i r26=ar.pfs			// I0 (would stall anyhow due to srlz.d...)
+	;;
+	srlz.d					// M0 (forces split-issue) ensure PSR.BE==0
+(p6)	ld8 r18=[r18]				// M0|1
+	nop.i 0
+	;;
+	nop.m 0
+(p6)	tbit.z.unc p8,p0=r18,0			// I0 (dual-issues with "mov b7=r18"!)
+	nop.i 0
+	;;
+(p8)	ssm psr.i
+(p6)	mov b7=r18				// I0
+(p8)	br.dptk.many b7				// B
+
+	mov r27=ar.rsc				// M2 (12 cyc)
+/*
+ * brl.cond doesn't work as intended because the linker would convert this branch
+ * into a branch to a PLT.  Perhaps there will be a way to avoid this with some
+ * future version of the linker.  In the meantime, we just use an indirect branch
+ * instead.
+ */
+#ifdef CONFIG_ITANIUM
+(p6)	add r14=-8,r14				// r14 <- addr of fsys_bubble_down entry
+	;;
+(p6)	ld8 r14=[r14]				// r14 <- fsys_bubble_down
+	;;
+(p6)	mov b7=r14
+(p6)	br.sptk.many b7
+#else
+	BRL_COND_FSYS_BUBBLE_DOWN(p6)
+#endif
+	ssm psr.i
+	mov r10=-1
+(p10)	mov r8=EINVAL
+(p9)	mov r8=ENOSYS
+	FSYS_RETURN
+END(__kernel_syscall_via_epc)
-- 
cgit v1.2.3


From c4312511ba1f3a08f2f64ca8335882ef56ff9bdd Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:45 +0900
Subject: ia64/pv_ops: paravirtualize gate.S.

paravirtualize gate.S.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/native/inst.h       |  5 +++++
 arch/ia64/include/asm/native/pvchk_inst.h |  3 +++
 arch/ia64/kernel/gate.S                   | 17 +++++++++++++----
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/ia64/include/asm/native/inst.h b/arch/ia64/include/asm/native/inst.h
index ad59fc6a613..d2d46efb3e6 100644
--- a/arch/ia64/include/asm/native/inst.h
+++ b/arch/ia64/include/asm/native/inst.h
@@ -166,6 +166,11 @@
 #define RSM_PSR_DT		\
 	rsm psr.dt
 
+#define RSM_PSR_BE_I(clob0, clob1)	\
+	rsm psr.be | psr.i		\
+	CLOBBER(clob0)			\
+	CLOBBER(clob1)
+
 #define SSM_PSR_DT_AND_SRLZ_I	\
 	ssm psr.dt		\
 	;;			\
diff --git a/arch/ia64/include/asm/native/pvchk_inst.h b/arch/ia64/include/asm/native/pvchk_inst.h
index 13b289ea617..8d72962ec83 100644
--- a/arch/ia64/include/asm/native/pvchk_inst.h
+++ b/arch/ia64/include/asm/native/pvchk_inst.h
@@ -251,6 +251,9 @@
 	IS_RREG_CLOB(clob2)
 #define RSM_PSR_DT				\
 	nop 0
+#define RSM_PSR_BE_I(clob0, clob1)		\
+	IS_RREG_CLOB(clob0)			\
+	IS_RREG_CLOB(clob1)
 #define SSM_PSR_DT_AND_SRLZ_I			\
 	nop 0
 #define BSW_0(clob0, clob1, clob2)		\
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
index c957228e3f1..cf5e0a105e1 100644
--- a/arch/ia64/kernel/gate.S
+++ b/arch/ia64/kernel/gate.S
@@ -13,6 +13,7 @@
 #include <asm/sigcontext.h>
 #include <asm/system.h>
 #include <asm/unistd.h>
+#include "paravirt_inst.h"
 
 /*
  * We can't easily refer to symbols inside the kernel.  To avoid full runtime relocation,
@@ -323,7 +324,7 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
 	epc					// B	causes split-issue
 }
 	;;
-	rsm psr.be | psr.i			// M2 (5 cyc to srlz.d)
+	RSM_PSR_BE_I(r20, r22)			// M2 (5 cyc to srlz.d)
 	LOAD_FSYSCALL_TABLE(r14)		// X
 	;;
 	mov r16=IA64_KR(CURRENT)		// M2 (12 cyc)
@@ -331,7 +332,7 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
 	mov r19=NR_syscalls-1			// A
 	;;
 	lfetch [r18]				// M0|1
-	mov r29=psr				// M2 (12 cyc)
+	MOV_FROM_PSR(p0, r29, r8)		// M2 (12 cyc)
 	// If r17 is a NaT, p6 will be zero
 	cmp.geu p6,p7=r19,r17			// A    (sysnr > 0 && sysnr < 1024+NR_syscalls)?
 	;;
@@ -347,7 +348,7 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
 (p6)	tbit.z.unc p8,p0=r18,0			// I0 (dual-issues with "mov b7=r18"!)
 	nop.i 0
 	;;
-(p8)	ssm psr.i
+	SSM_PSR_I(p8, p14, r25)
 (p6)	mov b7=r18				// I0
 (p8)	br.dptk.many b7				// B
 
@@ -368,9 +369,17 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
 #else
 	BRL_COND_FSYS_BUBBLE_DOWN(p6)
 #endif
-	ssm psr.i
+	SSM_PSR_I(p0, p14, r10)
 	mov r10=-1
 (p10)	mov r8=EINVAL
 (p9)	mov r8=ENOSYS
 	FSYS_RETURN
+
+#ifdef CONFIG_PARAVIRT
+	/*
+	 * padd to make the size of this symbol constant
+	 * independent of paravirtualization.
+	 */
+	.align PAGE_SIZE / 8
+#endif
 END(__kernel_syscall_via_epc)
-- 
cgit v1.2.3


From f8de2ec678fa09276cf7ad02838eb80e86c73097 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:05:46 +0900
Subject: ia64/pv_ops/xen/gate.S: xen gate page paravirtualization

xen gate page paravirtualization

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/xen/inst.h | 4 ++++
 arch/ia64/xen/Makefile           | 1 +
 2 files changed, 5 insertions(+)

diff --git a/arch/ia64/include/asm/xen/inst.h b/arch/ia64/include/asm/xen/inst.h
index 90537dc15ef..c53a4761120 100644
--- a/arch/ia64/include/asm/xen/inst.h
+++ b/arch/ia64/include/asm/xen/inst.h
@@ -386,6 +386,10 @@
 #define RSM_PSR_DT		\
 	XEN_HYPER_RSM_PSR_DT
 
+#define RSM_PSR_BE_I(clob0, clob1)	\
+	RSM_PSR_I(p0, clob0, clob1);	\
+	rum psr.be
+
 #define SSM_PSR_DT_AND_SRLZ_I	\
 	XEN_HYPER_SSM_PSR_DT
 
diff --git a/arch/ia64/xen/Makefile b/arch/ia64/xen/Makefile
index 94f0d8e7d9d..e6f4a0a7422 100644
--- a/arch/ia64/xen/Makefile
+++ b/arch/ia64/xen/Makefile
@@ -13,6 +13,7 @@ include $(srctree)/arch/ia64/kernel/Makefile.gate
 
 # tell compiled for xen
 CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_XEN
+AFLAGS_gate.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN -D__IA64_GATE_PARAVIRTUALIZED_XEN
 
 # use same file of native.
 $(obj)/gate.o: $(src)/../kernel/gate.S FORCE
-- 
cgit v1.2.3


From bf7ab02f620c1020c869fc71a2c855918b6a5375 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:06:51 +0900
Subject: ia64/pv_op/binarypatch: add helper functions to support binary
 patching for paravirt_ops.

add helper functions to support binary patching for paravirt_ops.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/paravirt_patch.h | 143 +++++++++
 arch/ia64/kernel/paravirt_patch.c      | 514 +++++++++++++++++++++++++++++++++
 arch/ia64/kernel/paravirtentry.S       |  56 ++++
 arch/ia64/kernel/vmlinux.lds.S         |  24 ++
 4 files changed, 737 insertions(+)
 create mode 100644 arch/ia64/include/asm/paravirt_patch.h
 create mode 100644 arch/ia64/kernel/paravirt_patch.c

diff --git a/arch/ia64/include/asm/paravirt_patch.h b/arch/ia64/include/asm/paravirt_patch.h
new file mode 100644
index 00000000000..128ff5db6e6
--- /dev/null
+++ b/arch/ia64/include/asm/paravirt_patch.h
@@ -0,0 +1,143 @@
+/******************************************************************************
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#ifndef __ASM_PARAVIRT_PATCH_H
+#define __ASM_PARAVIRT_PATCH_H
+
+#ifdef __ASSEMBLY__
+
+	.section .paravirt_branches, "a"
+	.previous
+#define PARAVIRT_PATCH_SITE_BR(type)		\
+	{					\
+	[1:] ;					\
+	br.cond.sptk.many 2f ;			\
+	nop.b 0 ;				\
+	nop.b 0;; ;				\
+	} ;					\
+	2:					\
+	.xdata8 ".paravirt_branches", 1b, type
+
+#else
+
+#include <linux/stringify.h>
+#include <asm/intrinsics.h>
+
+/* for binary patch */
+struct paravirt_patch_site_bundle {
+	void		*sbundle;
+	void		*ebundle;
+	unsigned long	type;
+};
+
+/* label means the beginning of new bundle */
+#define paravirt_alt_bundle(instr, privop)				\
+	"\t998:\n"							\
+	"\t" instr "\n"							\
+	"\t999:\n"							\
+	"\t.pushsection .paravirt_bundles, \"a\"\n"			\
+	"\t.popsection\n"						\
+	"\t.xdata8 \".paravirt_bundles\", 998b, 999b, "			\
+	__stringify(privop) "\n"
+
+
+struct paravirt_patch_bundle_elem {
+	const void	*sbundle;
+	const void	*ebundle;
+	unsigned long	type;
+};
+
+
+struct paravirt_patch_site_inst {
+	unsigned long	stag;
+	unsigned long	etag;
+	unsigned long	type;
+};
+
+#define paravirt_alt_inst(instr, privop)				\
+	"\t[998:]\n"							\
+	"\t" instr "\n"							\
+	"\t[999:]\n"							\
+	"\t.pushsection .paravirt_insts, \"a\"\n"			\
+	"\t.popsection\n"						\
+	"\t.xdata8 \".paravirt_insts\", 998b, 999b, "			\
+	__stringify(privop) "\n"
+
+struct paravirt_patch_site_branch {
+	unsigned long	tag;
+	unsigned long	type;
+};
+
+struct paravirt_patch_branch_target {
+	const void	*entry;
+	unsigned long	type;
+};
+
+void
+__paravirt_patch_apply_branch(
+	unsigned long tag, unsigned long type,
+	const struct paravirt_patch_branch_target *entries,
+	unsigned int nr_entries);
+
+void
+paravirt_patch_reloc_br(unsigned long tag, const void *target);
+
+void
+paravirt_patch_reloc_brl(unsigned long tag, const void *target);
+
+
+#if defined(ASM_SUPPORTED) && defined(CONFIG_PARAVIRT)
+unsigned long
+ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type);
+
+unsigned long
+__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type,
+			      const struct paravirt_patch_bundle_elem *elems,
+			      unsigned long nelems,
+			      const struct paravirt_patch_bundle_elem **found);
+
+void
+paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start,
+			    const struct paravirt_patch_site_bundle *end);
+
+void
+paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start,
+			  const struct paravirt_patch_site_inst *end);
+
+void paravirt_patch_apply(void);
+#else
+#define paravirt_patch_apply_bundle(start, end)	do { } while (0)
+#define paravirt_patch_apply_inst(start, end)	do { } while (0)
+#define paravirt_patch_apply()			do { } while (0)
+#endif
+
+#endif /* !__ASSEMBLEY__ */
+
+#endif /* __ASM_PARAVIRT_PATCH_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "linux"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * indent-tabs-mode: t
+ * End:
+ */
diff --git a/arch/ia64/kernel/paravirt_patch.c b/arch/ia64/kernel/paravirt_patch.c
new file mode 100644
index 00000000000..bfdfef1b1ff
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patch.c
@@ -0,0 +1,514 @@
+/******************************************************************************
+ * linux/arch/ia64/xen/paravirt_patch.c
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/intrinsics.h>
+#include <asm/kprobes.h>
+#include <asm/paravirt.h>
+#include <asm/paravirt_patch.h>
+
+typedef union ia64_inst {
+        struct {
+		unsigned long long qp : 6;
+		unsigned long long : 31;
+		unsigned long long opcode : 4;
+		unsigned long long reserved : 23;
+        } generic;
+        unsigned long long l;
+} ia64_inst_t;
+
+/*
+ * flush_icache_range() can't be used here.
+ * we are here before cpu_init() which initializes
+ * ia64_i_cache_stride_shift. flush_icache_range() uses it.
+ */
+void __init_or_module
+paravirt_flush_i_cache_range(const void *instr, unsigned long size)
+{
+	extern void paravirt_fc_i(const void *addr);
+	unsigned long i;
+
+	for (i = 0; i < size; i += sizeof(bundle_t))
+		paravirt_fc_i(instr + i);
+}
+
+bundle_t* __init_or_module
+paravirt_get_bundle(unsigned long tag)
+{
+	return (bundle_t *)(tag & ~3UL);
+}
+
+unsigned long __init_or_module
+paravirt_get_slot(unsigned long tag)
+{
+	return tag & 3UL;
+}
+
+unsigned long __init_or_module
+paravirt_get_num_inst(unsigned long stag, unsigned long etag)
+{
+	bundle_t *sbundle = paravirt_get_bundle(stag);
+	unsigned long sslot = paravirt_get_slot(stag);
+	bundle_t *ebundle = paravirt_get_bundle(etag);
+	unsigned long eslot = paravirt_get_slot(etag);
+
+	return (ebundle - sbundle) * 3 + eslot - sslot + 1;
+}
+
+unsigned long __init_or_module
+paravirt_get_next_tag(unsigned long tag)
+{
+	unsigned long slot = paravirt_get_slot(tag);
+
+	switch (slot) {
+	case 0:
+	case 1:
+		return tag + 1;
+	case 2: {
+		bundle_t *bundle = paravirt_get_bundle(tag);
+		return (unsigned long)(bundle + 1);
+	}
+	default:
+		BUG();
+	}
+	/* NOTREACHED */
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot0(const bundle_t *bundle)
+{
+	ia64_inst_t inst;
+	inst.l = bundle->quad0.slot0;
+	return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot1(const bundle_t *bundle)
+{
+	ia64_inst_t inst;
+	inst.l = bundle->quad0.slot1_p0 |
+		((unsigned long long)bundle->quad1.slot1_p1 << 18UL);
+	return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot2(const bundle_t *bundle)
+{
+	ia64_inst_t inst;
+	inst.l = bundle->quad1.slot2;
+	return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_inst(unsigned long tag)
+{
+	bundle_t *bundle = paravirt_get_bundle(tag);
+	unsigned long slot = paravirt_get_slot(tag);
+
+	switch (slot) {
+	case 0:
+		return paravirt_read_slot0(bundle);
+	case 1:
+		return paravirt_read_slot1(bundle);
+	case 2:
+		return paravirt_read_slot2(bundle);
+	default:
+		BUG();
+	}
+	/* NOTREACHED */
+}
+
+void __init_or_module
+paravirt_write_slot0(bundle_t *bundle, ia64_inst_t inst)
+{
+	bundle->quad0.slot0 = inst.l;
+}
+
+void __init_or_module
+paravirt_write_slot1(bundle_t *bundle, ia64_inst_t inst)
+{
+	bundle->quad0.slot1_p0 = inst.l;
+	bundle->quad1.slot1_p1 = inst.l >> 18UL;
+}
+
+void __init_or_module
+paravirt_write_slot2(bundle_t *bundle, ia64_inst_t inst)
+{
+	bundle->quad1.slot2 = inst.l;
+}
+
+void __init_or_module
+paravirt_write_inst(unsigned long tag, ia64_inst_t inst)
+{
+	bundle_t *bundle = paravirt_get_bundle(tag);
+	unsigned long slot = paravirt_get_slot(tag);
+
+	switch (slot) {
+	case 0:
+		paravirt_write_slot0(bundle, inst);
+		break;
+	case 1:
+		paravirt_write_slot1(bundle, inst);
+		break;
+	case 2:
+		paravirt_write_slot2(bundle, inst);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	paravirt_flush_i_cache_range(bundle, sizeof(*bundle));
+}
+
+/* for debug */
+void
+paravirt_print_bundle(const bundle_t *bundle)
+{
+	const unsigned long *quad = (const unsigned long *)bundle;
+	ia64_inst_t slot0 = paravirt_read_slot0(bundle);
+	ia64_inst_t slot1 = paravirt_read_slot1(bundle);
+	ia64_inst_t slot2 = paravirt_read_slot2(bundle);
+
+	printk(KERN_DEBUG
+	       "bundle 0x%p 0x%016lx 0x%016lx\n", bundle, quad[0], quad[1]);
+	printk(KERN_DEBUG
+	       "bundle template 0x%x\n",
+	       bundle->quad0.template);
+	printk(KERN_DEBUG
+	       "slot0 0x%lx slot1_p0 0x%lx slot1_p1 0x%lx slot2 0x%lx\n",
+	       (unsigned long)bundle->quad0.slot0,
+	       (unsigned long)bundle->quad0.slot1_p0,
+	       (unsigned long)bundle->quad1.slot1_p1,
+	       (unsigned long)bundle->quad1.slot2);
+	printk(KERN_DEBUG
+	       "slot0 0x%016llx slot1 0x%016llx slot2 0x%016llx\n",
+	       slot0.l, slot1.l, slot2.l);
+}
+
+static int noreplace_paravirt __init_or_module = 0;
+
+static int __init setup_noreplace_paravirt(char *str)
+{
+	noreplace_paravirt = 1;
+	return 1;
+}
+__setup("noreplace-paravirt", setup_noreplace_paravirt);
+
+#ifdef ASM_SUPPORTED
+static void __init_or_module
+fill_nop_bundle(void *sbundle, void *ebundle)
+{
+	extern const char paravirt_nop_bundle[];
+	extern const unsigned long paravirt_nop_bundle_size;
+
+	void *bundle = sbundle;
+
+	BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0);
+	BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0);
+
+	while (bundle < ebundle) {
+		memcpy(bundle, paravirt_nop_bundle, paravirt_nop_bundle_size);
+
+		bundle += paravirt_nop_bundle_size;
+	}
+}
+
+/* helper function */
+unsigned long __init_or_module
+__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type,
+			      const struct paravirt_patch_bundle_elem *elems,
+			      unsigned long nelems,
+			      const struct paravirt_patch_bundle_elem **found)
+{
+	unsigned long used = 0;
+	unsigned long i;
+
+	BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0);
+	BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0);
+
+	found = NULL;
+	for (i = 0; i < nelems; i++) {
+		const struct paravirt_patch_bundle_elem *p = &elems[i];
+		if (p->type == type) {
+			unsigned long need = p->ebundle - p->sbundle;
+			unsigned long room = ebundle - sbundle;
+
+			if (found != NULL)
+				*found = p;
+
+			if (room < need) {
+				/* no room to replace. skip it */
+				printk(KERN_DEBUG
+				       "the space is too small to put "
+				       "bundles. type %ld need %ld room %ld\n",
+				       type, need, room);
+				break;
+			}
+
+			used = need;
+			memcpy(sbundle, p->sbundle, used);
+			break;
+		}
+	}
+
+	return used;
+}
+
+void __init_or_module
+paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start,
+			    const struct paravirt_patch_site_bundle *end)
+{
+	const struct paravirt_patch_site_bundle *p;
+
+	if (noreplace_paravirt)
+		return;
+	if (pv_init_ops.patch_bundle == NULL)
+		return;
+
+	for (p = start; p < end; p++) {
+		unsigned long used;
+
+		used = (*pv_init_ops.patch_bundle)(p->sbundle, p->ebundle,
+						   p->type);
+		if (used == 0)
+			continue;
+
+		fill_nop_bundle(p->sbundle + used, p->ebundle);
+		paravirt_flush_i_cache_range(p->sbundle,
+					     p->ebundle - p->sbundle);
+	}
+	ia64_sync_i();
+	ia64_srlz_i();
+}
+
+/*
+ * nop.i, nop.m, nop.f instruction are same format.
+ * but nop.b has differennt format.
+ * This doesn't support nop.b for now.
+ */
+static void __init_or_module
+fill_nop_inst(unsigned long stag, unsigned long etag)
+{
+	extern const bundle_t paravirt_nop_mfi_inst_bundle[];
+	unsigned long tag;
+	const ia64_inst_t nop_inst =
+		paravirt_read_slot0(paravirt_nop_mfi_inst_bundle);
+
+	for (tag = stag; tag < etag; tag = paravirt_get_next_tag(tag))
+		paravirt_write_inst(tag, nop_inst);
+}
+
+void __init_or_module
+paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start,
+			  const struct paravirt_patch_site_inst *end)
+{
+	const struct paravirt_patch_site_inst *p;
+
+	if (noreplace_paravirt)
+		return;
+	if (pv_init_ops.patch_inst == NULL)
+		return;
+
+	for (p = start; p < end; p++) {
+		unsigned long tag;
+		bundle_t *sbundle;
+		bundle_t *ebundle;
+
+		tag = (*pv_init_ops.patch_inst)(p->stag, p->etag, p->type);
+		if (tag == p->stag)
+			continue;
+
+		fill_nop_inst(tag, p->etag);
+		sbundle = paravirt_get_bundle(p->stag);
+		ebundle = paravirt_get_bundle(p->etag) + 1;
+		paravirt_flush_i_cache_range(sbundle, (ebundle - sbundle) *
+					     sizeof(bundle_t));
+	}
+	ia64_sync_i();
+	ia64_srlz_i();
+}
+#endif /* ASM_SUPPOTED */
+
+/* brl.cond.sptk.many <target64> X3 */
+typedef union inst_x3_op {
+	ia64_inst_t inst;
+	struct {
+		unsigned long qp: 6;
+		unsigned long btyp: 3;
+		unsigned long unused: 3;
+		unsigned long p: 1;
+		unsigned long imm20b: 20;
+		unsigned long wh: 2;
+		unsigned long d: 1;
+		unsigned long i: 1;
+		unsigned long opcode: 4;
+	};
+	unsigned long l;
+} inst_x3_op_t;
+
+typedef union inst_x3_imm {
+	ia64_inst_t inst;
+	struct {
+		unsigned long unused: 2;
+		unsigned long imm39: 39;
+	};
+	unsigned long l;
+} inst_x3_imm_t;
+
+void __init_or_module
+paravirt_patch_reloc_brl(unsigned long tag, const void *target)
+{
+	unsigned long tag_op = paravirt_get_next_tag(tag);
+	unsigned long tag_imm = tag;
+	bundle_t *bundle = paravirt_get_bundle(tag);
+
+	ia64_inst_t inst_op = paravirt_read_inst(tag_op);
+	ia64_inst_t inst_imm = paravirt_read_inst(tag_imm);
+
+	inst_x3_op_t inst_x3_op = { .l = inst_op.l };
+	inst_x3_imm_t inst_x3_imm = { .l = inst_imm.l };
+
+	unsigned long imm60 =
+		((unsigned long)target - (unsigned long)bundle) >> 4;
+
+	BUG_ON(paravirt_get_slot(tag) != 1); /* MLX */
+	BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0);
+
+	/* imm60[59] 1bit */
+	inst_x3_op.i = (imm60 >> 59) & 1;
+	/* imm60[19:0] 20bit */
+	inst_x3_op.imm20b = imm60 & ((1UL << 20) - 1);
+	/* imm60[58:20] 39bit */
+	inst_x3_imm.imm39 = (imm60 >> 20) & ((1UL << 39) - 1);
+
+	inst_op.l = inst_x3_op.l;
+	inst_imm.l = inst_x3_imm.l;
+
+	paravirt_write_inst(tag_op, inst_op);
+	paravirt_write_inst(tag_imm, inst_imm);
+}
+
+/* br.cond.sptk.many <target25>	B1 */
+typedef union inst_b1 {
+	ia64_inst_t inst;
+	struct {
+		unsigned long qp: 6;
+		unsigned long btype: 3;
+		unsigned long unused: 3;
+		unsigned long p: 1;
+		unsigned long imm20b: 20;
+		unsigned long wh: 2;
+		unsigned long d: 1;
+		unsigned long s: 1;
+		unsigned long opcode: 4;
+	};
+	unsigned long l;
+} inst_b1_t;
+
+void __init
+paravirt_patch_reloc_br(unsigned long tag, const void *target)
+{
+	bundle_t *bundle = paravirt_get_bundle(tag);
+	ia64_inst_t inst = paravirt_read_inst(tag);
+	unsigned long target25 = (unsigned long)target - (unsigned long)bundle;
+	inst_b1_t inst_b1;
+
+	BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0);
+
+	inst_b1.l = inst.l;
+	if (target25 & (1UL << 63))
+		inst_b1.s = 1;
+	else
+		inst_b1.s = 0;
+
+	inst_b1.imm20b = target25 >> 4;
+	inst.l = inst_b1.l;
+
+	paravirt_write_inst(tag, inst);
+}
+
+void __init
+__paravirt_patch_apply_branch(
+	unsigned long tag, unsigned long type,
+	const struct paravirt_patch_branch_target *entries,
+	unsigned int nr_entries)
+{
+	unsigned int i;
+	for (i = 0; i < nr_entries; i++) {
+		if (entries[i].type == type) {
+			paravirt_patch_reloc_br(tag, entries[i].entry);
+			break;
+		}
+	}
+}
+
+static void __init
+paravirt_patch_apply_branch(const struct paravirt_patch_site_branch *start,
+			    const struct paravirt_patch_site_branch *end)
+{
+	const struct paravirt_patch_site_branch *p;
+
+	if (noreplace_paravirt)
+		return;
+	if (pv_init_ops.patch_branch == NULL)
+		return;
+
+	for (p = start; p < end; p++)
+		(*pv_init_ops.patch_branch)(p->tag, p->type);
+
+	ia64_sync_i();
+	ia64_srlz_i();
+}
+
+void __init
+paravirt_patch_apply(void)
+{
+	extern const char __start_paravirt_bundles[];
+	extern const char __stop_paravirt_bundles[];
+	extern const char __start_paravirt_insts[];
+	extern const char __stop_paravirt_insts[];
+	extern const char __start_paravirt_branches[];
+	extern const char __stop_paravirt_branches[];
+
+	paravirt_patch_apply_bundle((const struct paravirt_patch_site_bundle *)
+				    __start_paravirt_bundles,
+				    (const struct paravirt_patch_site_bundle *)
+				    __stop_paravirt_bundles);
+	paravirt_patch_apply_inst((const struct paravirt_patch_site_inst *)
+				  __start_paravirt_insts,
+				  (const struct paravirt_patch_site_inst *)
+				  __stop_paravirt_insts);
+	paravirt_patch_apply_branch((const struct paravirt_patch_site_branch *)
+				    __start_paravirt_branches,
+				    (const struct paravirt_patch_site_branch *)
+				    __stop_paravirt_branches);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "linux"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * indent-tabs-mode: t
+ * End:
+ */
diff --git a/arch/ia64/kernel/paravirtentry.S b/arch/ia64/kernel/paravirtentry.S
index 2f42fcb9776..80c0d365cbc 100644
--- a/arch/ia64/kernel/paravirtentry.S
+++ b/arch/ia64/kernel/paravirtentry.S
@@ -58,3 +58,59 @@ BRANCH_PROC(switch_to, r22, b7)
 BRANCH_PROC_UNWINFO(leave_syscall, r22, b7)
 BRANCH_PROC(work_processed_syscall, r2, b7)
 BRANCH_PROC_UNWINFO(leave_kernel, r22, b7)
+
+
+#ifdef CONFIG_MODULES
+#define __INIT_OR_MODULE	.text
+#define __INITDATA_OR_MODULE	.data
+#else
+#define __INIT_OR_MODULE	__INIT
+#define __INITDATA_OR_MODULE	__INITDATA
+#endif /* CONFIG_MODULES */
+
+	__INIT_OR_MODULE
+	GLOBAL_ENTRY(paravirt_fc_i)
+	fc.i r32
+	br.ret.sptk.many rp
+	END(paravirt_fc_i)
+	__FINIT
+
+	__INIT_OR_MODULE
+	.align 32
+	GLOBAL_ENTRY(paravirt_nop_b_inst_bundle)
+	{
+		nop.b 0
+		nop.b 0
+		nop.b 0
+	}
+	END(paravirt_nop_b_inst_bundle)
+	__FINIT
+
+	/* NOTE: nop.[mfi] has same format */
+	__INIT_OR_MODULE
+	GLOBAL_ENTRY(paravirt_nop_mfi_inst_bundle)
+	{
+		nop.m 0
+		nop.f 0
+		nop.i 0
+	}
+	END(paravirt_nop_mfi_inst_bundle)
+	__FINIT
+
+	__INIT_OR_MODULE
+	GLOBAL_ENTRY(paravirt_nop_bundle)
+paravirt_nop_bundle_start:
+	{
+		nop 0
+		nop 0
+		nop 0
+	}
+paravirt_nop_bundle_end:
+	END(paravirt_nop_bundle)
+	__FINIT
+
+	__INITDATA_OR_MODULE
+	.align 8
+	.global paravirt_nop_bundle_size
+paravirt_nop_bundle_size:
+	data8	paravirt_nop_bundle_end - paravirt_nop_bundle_start
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 92ae7e8f014..794d168bc8a 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -169,6 +169,30 @@ SECTIONS
 	  __end___mckinley_e9_bundles = .;
 	}
 
+#if defined(CONFIG_PARAVIRT)
+  . = ALIGN(16);
+  .paravirt_bundles : AT(ADDR(.paravirt_bundles) - LOAD_OFFSET)
+	{
+	  __start_paravirt_bundles = .;
+          *(.paravirt_bundles)
+	  __stop_paravirt_bundles = .;
+	}
+  . = ALIGN(16);
+  .paravirt_insts : AT(ADDR(.paravirt_insts) - LOAD_OFFSET)
+	{
+	  __start_paravirt_insts = .;
+          *(.paravirt_insts)
+	  __stop_paravirt_insts = .;
+	}
+  . = ALIGN(16);
+  .paravirt_branches : AT(ADDR(.paravirt_branches) - LOAD_OFFSET)
+	{
+	  __start_paravirt_branches = .;
+	  *(.paravirt_branches)
+	  __stop_paravirt_branches = .;
+	}
+#endif
+
 #if defined(CONFIG_IA64_GENERIC)
   /* Machine Vector */
   . = ALIGN(16);
-- 
cgit v1.2.3


From 03f511dd02f1431ef652fb97a7f2fe7aef47e025 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:06:52 +0900
Subject: ia64/pv_ops: implement binary patching optimization for native.

implement binary patching optimization for pv_cpu_ops.
With this optimization, indirect call for pv_cpu_ops methods can be
converted into inline execution or direct call.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/intrinsics.h      |   6 +-
 arch/ia64/include/asm/paravirt.h        |   8 +
 arch/ia64/include/asm/paravirt_privop.h | 341 ++++++++++++++++++++-
 arch/ia64/kernel/Makefile               |   3 +-
 arch/ia64/kernel/paravirt.c             | 520 +++++++++++++++++++++++++++++++-
 arch/ia64/kernel/paravirtentry.S        |  43 +--
 arch/ia64/kernel/setup.c                |   2 +
 7 files changed, 898 insertions(+), 25 deletions(-)

diff --git a/arch/ia64/include/asm/intrinsics.h b/arch/ia64/include/asm/intrinsics.h
index a3e44a5ed49..fbe2ad9234d 100644
--- a/arch/ia64/include/asm/intrinsics.h
+++ b/arch/ia64/include/asm/intrinsics.h
@@ -201,7 +201,11 @@ extern long ia64_cmpxchg_called_with_bad_pointer (void);
 
 #ifndef __ASSEMBLY__
 #if defined(CONFIG_PARAVIRT) && defined(__KERNEL__)
-#define IA64_INTRINSIC_API(name)	pv_cpu_ops.name
+#ifdef ASM_SUPPORTED
+# define IA64_INTRINSIC_API(name)	paravirt_ ## name
+#else
+# define IA64_INTRINSIC_API(name)	pv_cpu_ops.name
+#endif
 #define IA64_INTRINSIC_MACRO(name)	paravirt_ ## name
 #else
 #define IA64_INTRINSIC_API(name)	ia64_native_ ## name
diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h
index fc433f6c327..2eb0a981a09 100644
--- a/arch/ia64/include/asm/paravirt.h
+++ b/arch/ia64/include/asm/paravirt.h
@@ -118,6 +118,14 @@ struct pv_init_ops {
 	int (*arch_setup_nomca)(void);
 
 	void (*post_smp_prepare_boot_cpu)(void);
+
+#ifdef ASM_SUPPORTED
+	unsigned long (*patch_bundle)(void *sbundle, void *ebundle,
+				      unsigned long type);
+	unsigned long (*patch_inst)(unsigned long stag, unsigned long etag,
+				    unsigned long type);
+#endif
+	void (*patch_branch)(unsigned long tag, unsigned long type);
 };
 
 extern struct pv_init_ops pv_init_ops;
diff --git a/arch/ia64/include/asm/paravirt_privop.h b/arch/ia64/include/asm/paravirt_privop.h
index 33c8e55f577..76d6a6943e8 100644
--- a/arch/ia64/include/asm/paravirt_privop.h
+++ b/arch/ia64/include/asm/paravirt_privop.h
@@ -60,12 +60,18 @@ extern unsigned long ia64_native_getreg_func(int regnum);
 /* Instructions paravirtualized for performance */
 /************************************************/
 
+#ifndef ASM_SUPPORTED
+#define paravirt_ssm_i()	pv_cpu_ops.ssm_i()
+#define paravirt_rsm_i()	pv_cpu_ops.rsm_i()
+#define __paravirt_getreg()	pv_cpu_ops.getreg()
+#endif
+
 /* mask for ia64_native_ssm/rsm() must be constant.("i" constraing).
  * static inline function doesn't satisfy it. */
 #define paravirt_ssm(mask)			\
 	do {					\
 		if ((mask) == IA64_PSR_I)	\
-			pv_cpu_ops.ssm_i();	\
+			paravirt_ssm_i();	\
 		else				\
 			ia64_native_ssm(mask);	\
 	} while (0)
@@ -73,7 +79,7 @@ extern unsigned long ia64_native_getreg_func(int regnum);
 #define paravirt_rsm(mask)			\
 	do {					\
 		if ((mask) == IA64_PSR_I)	\
-			pv_cpu_ops.rsm_i();	\
+			paravirt_rsm_i();	\
 		else				\
 			ia64_native_rsm(mask);	\
 	} while (0)
@@ -86,7 +92,7 @@ extern unsigned long ia64_native_getreg_func(int regnum);
 		if ((reg) == _IA64_REG_IP)			\
 			res = ia64_native_getreg(_IA64_REG_IP); \
 		else						\
-			res = pv_cpu_ops.getreg(reg);		\
+			res = __paravirt_getreg(reg);		\
 		res;						\
 	})
 
@@ -121,4 +127,333 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch);
 	IA64_PARAVIRT_ASM_FUNC(work_processed_syscall)
 #define ia64_leave_kernel		IA64_PARAVIRT_ASM_FUNC(leave_kernel)
 
+
+#if defined(CONFIG_PARAVIRT)
+/******************************************************************************
+ * binary patching infrastructure
+ */
+#define PARAVIRT_PATCH_TYPE_FC				1
+#define PARAVIRT_PATCH_TYPE_THASH			2
+#define PARAVIRT_PATCH_TYPE_GET_CPUID			3
+#define PARAVIRT_PATCH_TYPE_GET_PMD			4
+#define PARAVIRT_PATCH_TYPE_PTCGA			5
+#define PARAVIRT_PATCH_TYPE_GET_RR			6
+#define PARAVIRT_PATCH_TYPE_SET_RR			7
+#define PARAVIRT_PATCH_TYPE_SET_RR0_TO_RR4		8
+#define PARAVIRT_PATCH_TYPE_SSM_I			9
+#define PARAVIRT_PATCH_TYPE_RSM_I			10
+#define PARAVIRT_PATCH_TYPE_GET_PSR_I			11
+#define PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE	12
+
+/* PARAVIRT_PATY_TYPE_[GS]ETREG + _IA64_REG_xxx */
+#define PARAVIRT_PATCH_TYPE_GETREG			0x10000000
+#define PARAVIRT_PATCH_TYPE_SETREG			0x20000000
+
+/*
+ * struct task_struct* (*ia64_switch_to)(void* next_task);
+ * void *ia64_leave_syscall;
+ * void *ia64_work_processed_syscall
+ * void *ia64_leave_kernel;
+ */
+
+#define PARAVIRT_PATCH_TYPE_BR_START			0x30000000
+#define PARAVIRT_PATCH_TYPE_BR_SWITCH_TO		\
+	(PARAVIRT_PATCH_TYPE_BR_START + 0)
+#define PARAVIRT_PATCH_TYPE_BR_LEAVE_SYSCALL		\
+	(PARAVIRT_PATCH_TYPE_BR_START + 1)
+#define PARAVIRT_PATCH_TYPE_BR_WORK_PROCESSED_SYSCALL	\
+	(PARAVIRT_PATCH_TYPE_BR_START + 2)
+#define PARAVIRT_PATCH_TYPE_BR_LEAVE_KERNEL		\
+	(PARAVIRT_PATCH_TYPE_BR_START + 3)
+
+#ifdef ASM_SUPPORTED
+#include <asm/paravirt_patch.h>
+
+/*
+ * pv_cpu_ops calling stub.
+ * normal function call convension can't be written by gcc
+ * inline assembly.
+ *
+ * from the caller's point of view,
+ * the following registers will be clobbered.
+ * r2, r3
+ * r8-r15
+ * r16, r17
+ * b6, b7
+ * p6-p15
+ * ar.ccv
+ *
+ * from the callee's point of view ,
+ * the following registers can be used.
+ * r2, r3: scratch
+ * r8: scratch, input argument0 and return value
+ * r0-r15: scratch, input argument1-5
+ * b6: return pointer
+ * b7: scratch
+ * p6-p15: scratch
+ * ar.ccv: scratch
+ *
+ * other registers must not be changed. especially
+ * b0: rp: preserved. gcc ignores b0 in clobbered register.
+ * r16: saved gp
+ */
+/* 5 bundles */
+#define __PARAVIRT_BR							\
+	";;\n"								\
+	"{ .mlx\n"							\
+	"nop 0\n"							\
+	"movl r2 = %[op_addr]\n"/* get function pointer address */	\
+	";;\n"								\
+	"}\n"								\
+	"1:\n"								\
+	"{ .mii\n"							\
+	"ld8 r2 = [r2]\n"	/* load function descriptor address */	\
+	"mov r17 = ip\n"	/* get ip to calc return address */	\
+	"mov r16 = gp\n"	/* save gp */				\
+	";;\n"								\
+	"}\n"								\
+	"{ .mii\n"							\
+	"ld8 r3 = [r2], 8\n"	/* load entry address */		\
+	"adds r17 =  1f - 1b, r17\n"	/* calculate return address */	\
+	";;\n"								\
+	"mov b7 = r3\n"		/* set entry address */			\
+	"}\n"								\
+	"{ .mib\n"							\
+	"ld8 gp = [r2]\n"	/* load gp value */			\
+	"mov b6 = r17\n"	/* set return address */		\
+	"br.cond.sptk.few b7\n"	/* intrinsics are very short isns */	\
+	"}\n"								\
+	"1:\n"								\
+	"{ .mii\n"							\
+	"mov gp = r16\n"	/* restore gp value */			\
+	"nop 0\n"							\
+	"nop 0\n"							\
+	";;\n"								\
+	"}\n"
+
+#define PARAVIRT_OP(op)				\
+	[op_addr] "i"(&pv_cpu_ops.op)
+
+#define PARAVIRT_TYPE(type)			\
+	PARAVIRT_PATCH_TYPE_ ## type
+
+#define PARAVIRT_REG_CLOBBERS0					\
+	"r2", "r3", /*"r8",*/ "r9", "r10", "r11", "r14",	\
+		"r15", "r16", "r17"
+
+#define PARAVIRT_REG_CLOBBERS1					\
+	"r2","r3", /*"r8",*/ "r9", "r10", "r11", "r14",	\
+		"r15", "r16", "r17"
+
+#define PARAVIRT_REG_CLOBBERS2					\
+	"r2", "r3", /*"r8", "r9",*/ "r10", "r11", "r14",	\
+		"r15", "r16", "r17"
+
+#define PARAVIRT_REG_CLOBBERS5					\
+	"r2", "r3", /*"r8", "r9", "r10", "r11", "r14",*/	\
+		"r15", "r16", "r17"
+
+#define PARAVIRT_BR_CLOBBERS			\
+	"b6", "b7"
+
+#define PARAVIRT_PR_CLOBBERS						\
+	"p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"
+
+#define PARAVIRT_AR_CLOBBERS			\
+	"ar.ccv"
+
+#define PARAVIRT_CLOBBERS0			\
+		PARAVIRT_REG_CLOBBERS0,		\
+		PARAVIRT_BR_CLOBBERS,		\
+		PARAVIRT_PR_CLOBBERS,		\
+		PARAVIRT_AR_CLOBBERS,		\
+		"memory"
+
+#define PARAVIRT_CLOBBERS1			\
+		PARAVIRT_REG_CLOBBERS1,		\
+		PARAVIRT_BR_CLOBBERS,		\
+		PARAVIRT_PR_CLOBBERS,		\
+		PARAVIRT_AR_CLOBBERS,		\
+		"memory"
+
+#define PARAVIRT_CLOBBERS2			\
+		PARAVIRT_REG_CLOBBERS2,		\
+		PARAVIRT_BR_CLOBBERS,		\
+		PARAVIRT_PR_CLOBBERS,		\
+		PARAVIRT_AR_CLOBBERS,		\
+		"memory"
+
+#define PARAVIRT_CLOBBERS5			\
+		PARAVIRT_REG_CLOBBERS5,		\
+		PARAVIRT_BR_CLOBBERS,		\
+		PARAVIRT_PR_CLOBBERS,		\
+		PARAVIRT_AR_CLOBBERS,		\
+		"memory"
+
+#define PARAVIRT_BR0(op, type)					\
+	register unsigned long ia64_clobber asm ("r8");		\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+					  PARAVIRT_TYPE(type))	\
+		      :	"=r"(ia64_clobber)			\
+		      : PARAVIRT_OP(op)				\
+		      : PARAVIRT_CLOBBERS0)
+
+#define PARAVIRT_BR0_RET(op, type)				\
+	register unsigned long ia64_intri_res asm ("r8");	\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+					  PARAVIRT_TYPE(type))	\
+		      : "=r"(ia64_intri_res)			\
+		      : PARAVIRT_OP(op)				\
+		      : PARAVIRT_CLOBBERS0)
+
+#define PARAVIRT_BR1(op, type, arg1)				\
+	register unsigned long __##arg1 asm ("r8") = arg1;	\
+	register unsigned long ia64_clobber asm ("r8");		\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+					  PARAVIRT_TYPE(type))	\
+		      :	"=r"(ia64_clobber)			\
+		      : PARAVIRT_OP(op), "0"(__##arg1)		\
+		      : PARAVIRT_CLOBBERS1)
+
+#define PARAVIRT_BR1_RET(op, type, arg1)			\
+	register unsigned long ia64_intri_res asm ("r8");	\
+	register unsigned long __##arg1 asm ("r8") = arg1;	\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+					  PARAVIRT_TYPE(type))	\
+		      : "=r"(ia64_intri_res)			\
+		      : PARAVIRT_OP(op), "0"(__##arg1)		\
+		      : PARAVIRT_CLOBBERS1)
+
+#define PARAVIRT_BR2(op, type, arg1, arg2)				\
+	register unsigned long __##arg1 asm ("r8") = arg1;		\
+	register unsigned long __##arg2 asm ("r9") = arg2;		\
+	register unsigned long ia64_clobber1 asm ("r8");		\
+	register unsigned long ia64_clobber2 asm ("r9");		\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,		\
+					  PARAVIRT_TYPE(type))		\
+		      : "=r"(ia64_clobber1), "=r"(ia64_clobber2)	\
+		      : PARAVIRT_OP(op), "0"(__##arg1), "1"(__##arg2)	\
+		      : PARAVIRT_CLOBBERS2)
+
+
+#define PARAVIRT_DEFINE_CPU_OP0(op, type)		\
+	static inline void				\
+	paravirt_ ## op (void)				\
+	{						\
+		PARAVIRT_BR0(op, type);			\
+	}
+
+#define PARAVIRT_DEFINE_CPU_OP0_RET(op, type)		\
+	static inline unsigned long			\
+	paravirt_ ## op (void)				\
+	{						\
+		PARAVIRT_BR0_RET(op, type);		\
+		return ia64_intri_res;			\
+	}
+
+#define PARAVIRT_DEFINE_CPU_OP1(op, type)		\
+	static inline void				\
+	paravirt_ ## op (unsigned long arg1)		\
+	{						\
+		PARAVIRT_BR1(op, type, arg1);		\
+	}
+
+#define PARAVIRT_DEFINE_CPU_OP1_RET(op, type)		\
+	static inline unsigned long			\
+	paravirt_ ## op (unsigned long arg1)		\
+	{						\
+		PARAVIRT_BR1_RET(op, type, arg1);	\
+		return ia64_intri_res;			\
+	}
+
+#define PARAVIRT_DEFINE_CPU_OP2(op, type)		\
+	static inline void				\
+	paravirt_ ## op (unsigned long arg1,		\
+			 unsigned long arg2)		\
+	{						\
+		PARAVIRT_BR2(op, type, arg1, arg2);	\
+	}
+
+
+PARAVIRT_DEFINE_CPU_OP1(fc, FC);
+PARAVIRT_DEFINE_CPU_OP1_RET(thash, THASH)
+PARAVIRT_DEFINE_CPU_OP1_RET(get_cpuid, GET_CPUID)
+PARAVIRT_DEFINE_CPU_OP1_RET(get_pmd, GET_PMD)
+PARAVIRT_DEFINE_CPU_OP2(ptcga, PTCGA)
+PARAVIRT_DEFINE_CPU_OP1_RET(get_rr, GET_RR)
+PARAVIRT_DEFINE_CPU_OP2(set_rr, SET_RR)
+PARAVIRT_DEFINE_CPU_OP0(ssm_i, SSM_I)
+PARAVIRT_DEFINE_CPU_OP0(rsm_i, RSM_I)
+PARAVIRT_DEFINE_CPU_OP0_RET(get_psr_i, GET_PSR_I)
+PARAVIRT_DEFINE_CPU_OP1(intrin_local_irq_restore, INTRIN_LOCAL_IRQ_RESTORE)
+
+static inline void
+paravirt_set_rr0_to_rr4(unsigned long val0, unsigned long val1,
+			unsigned long val2, unsigned long val3,
+			unsigned long val4)
+{
+	register unsigned long __val0 asm ("r8") = val0;
+	register unsigned long __val1 asm ("r9") = val1;
+	register unsigned long __val2 asm ("r10") = val2;
+	register unsigned long __val3 asm ("r11") = val3;
+	register unsigned long __val4 asm ("r14") = val4;
+
+	register unsigned long ia64_clobber0 asm ("r8");
+	register unsigned long ia64_clobber1 asm ("r9");
+	register unsigned long ia64_clobber2 asm ("r10");
+	register unsigned long ia64_clobber3 asm ("r11");
+	register unsigned long ia64_clobber4 asm ("r14");
+
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,
+					  PARAVIRT_TYPE(SET_RR0_TO_RR4))
+		      : "=r"(ia64_clobber0),
+			"=r"(ia64_clobber1),
+			"=r"(ia64_clobber2),
+			"=r"(ia64_clobber3),
+			"=r"(ia64_clobber4)
+		      : PARAVIRT_OP(set_rr0_to_rr4),
+			"0"(__val0), "1"(__val1), "2"(__val2),
+			"3"(__val3), "4"(__val4)
+		      : PARAVIRT_CLOBBERS5);
+}
+
+/* unsigned long paravirt_getreg(int reg) */
+#define __paravirt_getreg(reg)						\
+	({								\
+		register unsigned long ia64_intri_res asm ("r8");	\
+		register unsigned long __reg asm ("r8") = (reg);	\
+									\
+		BUILD_BUG_ON(!__builtin_constant_p(reg));		\
+		asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+						  PARAVIRT_TYPE(GETREG) \
+						  + (reg))		\
+			      : "=r"(ia64_intri_res)			\
+			      : PARAVIRT_OP(getreg), "0"(__reg)		\
+			      : PARAVIRT_CLOBBERS1);			\
+									\
+		ia64_intri_res;						\
+	})
+
+/* void paravirt_setreg(int reg, unsigned long val) */
+#define paravirt_setreg(reg, val)					\
+	do {								\
+		register unsigned long __val asm ("r8") = val;		\
+		register unsigned long __reg asm ("r9") = reg;		\
+		register unsigned long ia64_clobber1 asm ("r8");	\
+		register unsigned long ia64_clobber2 asm ("r9");	\
+									\
+		BUILD_BUG_ON(!__builtin_constant_p(reg));		\
+		asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+						  PARAVIRT_TYPE(SETREG) \
+						  + (reg))		\
+			      : "=r"(ia64_clobber1),			\
+				"=r"(ia64_clobber2)			\
+			      : PARAVIRT_OP(setreg),			\
+				"1"(__reg), "0"(__val)			\
+			      : PARAVIRT_CLOBBERS2);			\
+	} while (0)
+
+#endif /* ASM_SUPPORTED */
+#endif /* CONFIG_PARAVIRT && ASM_SUPPOTED */
+
 #endif /* _ASM_IA64_PARAVIRT_PRIVOP_H */
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index 8dc9df8a87a..dbc19e4d5ef 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -36,7 +36,8 @@ obj-$(CONFIG_PCI_MSI)		+= msi_ia64.o
 mca_recovery-y			+= mca_drv.o mca_drv_asm.o
 obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o
 
-obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirtentry.o
+obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirtentry.o \
+				   paravirt_patch.o
 
 obj-$(CONFIG_IA64_ESI)		+= esi.o
 ifneq ($(CONFIG_IA64_ESI),)
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c
index 6bc33a6db75..158d52414e9 100644
--- a/arch/ia64/kernel/paravirt.c
+++ b/arch/ia64/kernel/paravirt.c
@@ -46,13 +46,23 @@ struct pv_info pv_info = {
  * initialization hooks.
  */
 
-struct pv_init_ops pv_init_ops;
+static void __init
+ia64_native_patch_branch(unsigned long tag, unsigned long type);
+
+struct pv_init_ops pv_init_ops =
+{
+#ifdef ASM_SUPPORTED
+	.patch_bundle = ia64_native_patch_bundle,
+#endif
+	.patch_branch = ia64_native_patch_branch,
+};
 
 /***************************************************************************
  * pv_cpu_ops
  * intrinsics hooks.
  */
 
+#ifndef ASM_SUPPORTED
 /* ia64_native_xxx are macros so that we have to make them real functions */
 
 #define DEFINE_VOID_FUNC1(name)					\
@@ -274,6 +284,261 @@ ia64_native_setreg_func(int regnum, unsigned long val)
 		break;
 	}
 }
+#else
+
+#define __DEFINE_FUNC(name, code)					\
+	extern const char ia64_native_ ## name ## _direct_start[];	\
+	extern const char ia64_native_ ## name ## _direct_end[];	\
+	asm (".align 32\n"						\
+	     ".proc ia64_native_" #name "_func\n"			\
+	     "ia64_native_" #name "_func:\n"				\
+	     "ia64_native_" #name "_direct_start:\n"			\
+	     code							\
+	     "ia64_native_" #name "_direct_end:\n"			\
+	     "br.cond.sptk.many b6\n"					\
+	     ".endp ia64_native_" #name "_func\n")
+
+#define DEFINE_VOID_FUNC0(name, code)				\
+	extern void						\
+	ia64_native_ ## name ## _func(void);			\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC1(name, code)				\
+	extern void						\
+	ia64_native_ ## name ## _func(unsigned long arg);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC2(name, code)				\
+	extern void						\
+	ia64_native_ ## name ## _func(unsigned long arg0,	\
+				      unsigned long arg1);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_FUNC0(name, code)		\
+	extern unsigned long			\
+	ia64_native_ ## name ## _func(void);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_FUNC1(name, type, code)			\
+	extern unsigned long				\
+	ia64_native_ ## name ## _func(type arg);	\
+	__DEFINE_FUNC(name, code)
+
+DEFINE_VOID_FUNC1(fc,
+		  "fc r8\n");
+DEFINE_VOID_FUNC1(intrin_local_irq_restore,
+		  ";;\n"
+		  "     cmp.ne p6, p7 = r8, r0\n"
+		  ";;\n"
+		  "(p6) ssm psr.i\n"
+		  "(p7) rsm psr.i\n"
+		  ";;\n"
+		  "(p6) srlz.d\n");
+
+DEFINE_VOID_FUNC2(ptcga,
+		  "ptc.ga r8, r9\n");
+DEFINE_VOID_FUNC2(set_rr,
+		  "mov rr[r8] = r9\n");
+
+/* ia64_native_getreg(_IA64_REG_PSR) & IA64_PSR_I */
+DEFINE_FUNC0(get_psr_i,
+	     "mov r2 = " __stringify(1 << IA64_PSR_I_BIT) "\n"
+	     "mov r8 = psr\n"
+	     ";;\n"
+	     "and r8 = r2, r8\n");
+
+DEFINE_FUNC1(thash, unsigned long,
+	     "thash r8 = r8\n");
+DEFINE_FUNC1(get_cpuid, int,
+	     "mov r8 = cpuid[r8]\n");
+DEFINE_FUNC1(get_pmd, int,
+	     "mov r8 = pmd[r8]\n");
+DEFINE_FUNC1(get_rr, unsigned long,
+	     "mov r8 = rr[r8]\n");
+
+DEFINE_VOID_FUNC0(ssm_i,
+		  "ssm psr.i\n");
+DEFINE_VOID_FUNC0(rsm_i,
+		  "rsm psr.i\n");
+
+extern void
+ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1,
+				unsigned long val2, unsigned long val3,
+				unsigned long val4);
+__DEFINE_FUNC(set_rr0_to_rr4,
+	      "mov rr[r0] = r8\n"
+	      "movl r2 = 0x2000000000000000\n"
+	      ";;\n"
+	      "mov rr[r2] = r9\n"
+	      "shl r3 = r2, 1\n"	/* movl r3 = 0x4000000000000000 */
+	      ";;\n"
+	      "add r2 = r2, r3\n"	/* movl r2 = 0x6000000000000000 */
+	      "mov rr[r3] = r10\n"
+	      ";;\n"
+	      "mov rr[r2] = r11\n"
+	      "shl r3 = r3, 1\n"	/* movl r3 = 0x8000000000000000 */
+	      ";;\n"
+	      "mov rr[r3] = r14\n");
+
+extern unsigned long ia64_native_getreg_func(int regnum);
+asm(".global ia64_native_getreg_func\n");
+#define __DEFINE_GET_REG(id, reg)			\
+	"mov r2 = " __stringify(_IA64_REG_ ## id) "\n"	\
+	";;\n"						\
+	"cmp.eq p6, p0 = r2, r8\n"			\
+	";;\n"						\
+	"(p6) mov r8 = " #reg "\n"			\
+	"(p6) br.cond.sptk.many b6\n"			\
+	";;\n"
+#define __DEFINE_GET_AR(id, reg)	__DEFINE_GET_REG(AR_ ## id, ar.reg)
+#define __DEFINE_GET_CR(id, reg)	__DEFINE_GET_REG(CR_ ## id, cr.reg)
+
+__DEFINE_FUNC(getreg,
+	      __DEFINE_GET_REG(GP, gp)
+	      /*__DEFINE_GET_REG(IP, ip)*/ /* returned ip value shouldn't be constant */
+	      __DEFINE_GET_REG(PSR, psr)
+	      __DEFINE_GET_REG(TP, tp)
+	      __DEFINE_GET_REG(SP, sp)
+
+	      __DEFINE_GET_REG(AR_KR0, ar0)
+	      __DEFINE_GET_REG(AR_KR1, ar1)
+	      __DEFINE_GET_REG(AR_KR2, ar2)
+	      __DEFINE_GET_REG(AR_KR3, ar3)
+	      __DEFINE_GET_REG(AR_KR4, ar4)
+	      __DEFINE_GET_REG(AR_KR5, ar5)
+	      __DEFINE_GET_REG(AR_KR6, ar6)
+	      __DEFINE_GET_REG(AR_KR7, ar7)
+	      __DEFINE_GET_AR(RSC, rsc)
+	      __DEFINE_GET_AR(BSP, bsp)
+	      __DEFINE_GET_AR(BSPSTORE, bspstore)
+	      __DEFINE_GET_AR(RNAT, rnat)
+	      __DEFINE_GET_AR(FCR, fcr)
+	      __DEFINE_GET_AR(EFLAG, eflag)
+	      __DEFINE_GET_AR(CSD, csd)
+	      __DEFINE_GET_AR(SSD, ssd)
+	      __DEFINE_GET_REG(AR_CFLAG, ar27)
+	      __DEFINE_GET_AR(FSR, fsr)
+	      __DEFINE_GET_AR(FIR, fir)
+	      __DEFINE_GET_AR(FDR, fdr)
+	      __DEFINE_GET_AR(CCV, ccv)
+	      __DEFINE_GET_AR(UNAT, unat)
+	      __DEFINE_GET_AR(FPSR, fpsr)
+	      __DEFINE_GET_AR(ITC, itc)
+	      __DEFINE_GET_AR(PFS, pfs)
+	      __DEFINE_GET_AR(LC, lc)
+	      __DEFINE_GET_AR(EC, ec)
+
+	      __DEFINE_GET_CR(DCR, dcr)
+	      __DEFINE_GET_CR(ITM, itm)
+	      __DEFINE_GET_CR(IVA, iva)
+	      __DEFINE_GET_CR(PTA, pta)
+	      __DEFINE_GET_CR(IPSR, ipsr)
+	      __DEFINE_GET_CR(ISR, isr)
+	      __DEFINE_GET_CR(IIP, iip)
+	      __DEFINE_GET_CR(IFA, ifa)
+	      __DEFINE_GET_CR(ITIR, itir)
+	      __DEFINE_GET_CR(IIPA, iipa)
+	      __DEFINE_GET_CR(IFS, ifs)
+	      __DEFINE_GET_CR(IIM, iim)
+	      __DEFINE_GET_CR(IHA, iha)
+	      __DEFINE_GET_CR(LID, lid)
+	      __DEFINE_GET_CR(IVR, ivr)
+	      __DEFINE_GET_CR(TPR, tpr)
+	      __DEFINE_GET_CR(EOI, eoi)
+	      __DEFINE_GET_CR(IRR0, irr0)
+	      __DEFINE_GET_CR(IRR1, irr1)
+	      __DEFINE_GET_CR(IRR2, irr2)
+	      __DEFINE_GET_CR(IRR3, irr3)
+	      __DEFINE_GET_CR(ITV, itv)
+	      __DEFINE_GET_CR(PMV, pmv)
+	      __DEFINE_GET_CR(CMCV, cmcv)
+	      __DEFINE_GET_CR(LRR0, lrr0)
+	      __DEFINE_GET_CR(LRR1, lrr1)
+
+	      "mov r8 = -1\n"	/* unsupported case */
+	);
+
+extern void ia64_native_setreg_func(int regnum, unsigned long val);
+asm(".global ia64_native_setreg_func\n");
+#define __DEFINE_SET_REG(id, reg)			\
+	"mov r2 = " __stringify(_IA64_REG_ ## id) "\n"	\
+	";;\n"						\
+	"cmp.eq p6, p0 = r2, r9\n"			\
+	";;\n"						\
+	"(p6) mov " #reg " = r8\n"			\
+	"(p6) br.cond.sptk.many b6\n"			\
+	";;\n"
+#define __DEFINE_SET_AR(id, reg)	__DEFINE_SET_REG(AR_ ## id, ar.reg)
+#define __DEFINE_SET_CR(id, reg)	__DEFINE_SET_REG(CR_ ## id, cr.reg)
+__DEFINE_FUNC(setreg,
+	      "mov r2 = " __stringify(_IA64_REG_PSR_L) "\n"
+	      ";;\n"
+	      "cmp.eq p6, p0 = r2, r9\n"
+	      ";;\n"
+	      "(p6) mov psr.l = r8\n"
+#ifdef HAVE_SERIALIZE_DIRECTIVE
+	      ".serialize.data\n"
+#endif
+	      "(p6) br.cond.sptk.many b6\n"
+	      __DEFINE_SET_REG(GP, gp)
+	      __DEFINE_SET_REG(SP, sp)
+
+	      __DEFINE_SET_REG(AR_KR0, ar0)
+	      __DEFINE_SET_REG(AR_KR1, ar1)
+	      __DEFINE_SET_REG(AR_KR2, ar2)
+	      __DEFINE_SET_REG(AR_KR3, ar3)
+	      __DEFINE_SET_REG(AR_KR4, ar4)
+	      __DEFINE_SET_REG(AR_KR5, ar5)
+	      __DEFINE_SET_REG(AR_KR6, ar6)
+	      __DEFINE_SET_REG(AR_KR7, ar7)
+	      __DEFINE_SET_AR(RSC, rsc)
+	      __DEFINE_SET_AR(BSP, bsp)
+	      __DEFINE_SET_AR(BSPSTORE, bspstore)
+	      __DEFINE_SET_AR(RNAT, rnat)
+	      __DEFINE_SET_AR(FCR, fcr)
+	      __DEFINE_SET_AR(EFLAG, eflag)
+	      __DEFINE_SET_AR(CSD, csd)
+	      __DEFINE_SET_AR(SSD, ssd)
+	      __DEFINE_SET_REG(AR_CFLAG, ar27)
+	      __DEFINE_SET_AR(FSR, fsr)
+	      __DEFINE_SET_AR(FIR, fir)
+	      __DEFINE_SET_AR(FDR, fdr)
+	      __DEFINE_SET_AR(CCV, ccv)
+	      __DEFINE_SET_AR(UNAT, unat)
+	      __DEFINE_SET_AR(FPSR, fpsr)
+	      __DEFINE_SET_AR(ITC, itc)
+	      __DEFINE_SET_AR(PFS, pfs)
+	      __DEFINE_SET_AR(LC, lc)
+	      __DEFINE_SET_AR(EC, ec)
+
+	      __DEFINE_SET_CR(DCR, dcr)
+	      __DEFINE_SET_CR(ITM, itm)
+	      __DEFINE_SET_CR(IVA, iva)
+	      __DEFINE_SET_CR(PTA, pta)
+	      __DEFINE_SET_CR(IPSR, ipsr)
+	      __DEFINE_SET_CR(ISR, isr)
+	      __DEFINE_SET_CR(IIP, iip)
+	      __DEFINE_SET_CR(IFA, ifa)
+	      __DEFINE_SET_CR(ITIR, itir)
+	      __DEFINE_SET_CR(IIPA, iipa)
+	      __DEFINE_SET_CR(IFS, ifs)
+	      __DEFINE_SET_CR(IIM, iim)
+	      __DEFINE_SET_CR(IHA, iha)
+	      __DEFINE_SET_CR(LID, lid)
+	      __DEFINE_SET_CR(IVR, ivr)
+	      __DEFINE_SET_CR(TPR, tpr)
+	      __DEFINE_SET_CR(EOI, eoi)
+	      __DEFINE_SET_CR(IRR0, irr0)
+	      __DEFINE_SET_CR(IRR1, irr1)
+	      __DEFINE_SET_CR(IRR2, irr2)
+	      __DEFINE_SET_CR(IRR3, irr3)
+	      __DEFINE_SET_CR(ITV, itv)
+	      __DEFINE_SET_CR(PMV, pmv)
+	      __DEFINE_SET_CR(CMCV, cmcv)
+	      __DEFINE_SET_CR(LRR0, lrr0)
+	      __DEFINE_SET_CR(LRR1, lrr1)
+	);
+#endif
 
 struct pv_cpu_ops pv_cpu_ops = {
 	.fc		= ia64_native_fc_func,
@@ -368,3 +633,256 @@ struct pv_time_ops pv_time_ops = {
 	.do_steal_accounting = ia64_native_do_steal_accounting,
 	.sched_clock = ia64_native_sched_clock,
 };
+
+/***************************************************************************
+ * binary pacthing
+ * pv_init_ops.patch_bundle
+ */
+
+#ifdef ASM_SUPPORTED
+#define IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg)	\
+	__DEFINE_FUNC(get_ ## name,			\
+		      ";;\n"				\
+		      "mov r8 = " #reg "\n"		\
+		      ";;\n")
+
+#define IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg)	\
+	__DEFINE_FUNC(set_ ## name,			\
+		      ";;\n"				\
+		      "mov " #reg " = r8\n"		\
+		      ";;\n")
+
+#define IA64_NATIVE_PATCH_DEFINE_REG(name, reg)		\
+	IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg);	\
+	IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg)	\
+
+#define IA64_NATIVE_PATCH_DEFINE_AR(name, reg)			\
+	IA64_NATIVE_PATCH_DEFINE_REG(ar_ ## name, ar.reg)
+
+#define IA64_NATIVE_PATCH_DEFINE_CR(name, reg)			\
+	IA64_NATIVE_PATCH_DEFINE_REG(cr_ ## name, cr.reg)
+
+
+IA64_NATIVE_PATCH_DEFINE_GET_REG(psr, psr);
+IA64_NATIVE_PATCH_DEFINE_GET_REG(tp, tp);
+
+/* IA64_NATIVE_PATCH_DEFINE_SET_REG(psr_l, psr.l); */
+__DEFINE_FUNC(set_psr_l,
+	      ";;\n"
+	      "mov psr.l = r8\n"
+#ifdef HAVE_SERIALIZE_DIRECTIVE
+	      ".serialize.data\n"
+#endif
+	      ";;\n");
+
+IA64_NATIVE_PATCH_DEFINE_REG(gp, gp);
+IA64_NATIVE_PATCH_DEFINE_REG(sp, sp);
+
+IA64_NATIVE_PATCH_DEFINE_REG(kr0, ar0);
+IA64_NATIVE_PATCH_DEFINE_REG(kr1, ar1);
+IA64_NATIVE_PATCH_DEFINE_REG(kr2, ar2);
+IA64_NATIVE_PATCH_DEFINE_REG(kr3, ar3);
+IA64_NATIVE_PATCH_DEFINE_REG(kr4, ar4);
+IA64_NATIVE_PATCH_DEFINE_REG(kr5, ar5);
+IA64_NATIVE_PATCH_DEFINE_REG(kr6, ar6);
+IA64_NATIVE_PATCH_DEFINE_REG(kr7, ar7);
+
+IA64_NATIVE_PATCH_DEFINE_AR(rsc, rsc);
+IA64_NATIVE_PATCH_DEFINE_AR(bsp, bsp);
+IA64_NATIVE_PATCH_DEFINE_AR(bspstore, bspstore);
+IA64_NATIVE_PATCH_DEFINE_AR(rnat, rnat);
+IA64_NATIVE_PATCH_DEFINE_AR(fcr, fcr);
+IA64_NATIVE_PATCH_DEFINE_AR(eflag, eflag);
+IA64_NATIVE_PATCH_DEFINE_AR(csd, csd);
+IA64_NATIVE_PATCH_DEFINE_AR(ssd, ssd);
+IA64_NATIVE_PATCH_DEFINE_REG(ar27, ar27);
+IA64_NATIVE_PATCH_DEFINE_AR(fsr, fsr);
+IA64_NATIVE_PATCH_DEFINE_AR(fir, fir);
+IA64_NATIVE_PATCH_DEFINE_AR(fdr, fdr);
+IA64_NATIVE_PATCH_DEFINE_AR(ccv, ccv);
+IA64_NATIVE_PATCH_DEFINE_AR(unat, unat);
+IA64_NATIVE_PATCH_DEFINE_AR(fpsr, fpsr);
+IA64_NATIVE_PATCH_DEFINE_AR(itc, itc);
+IA64_NATIVE_PATCH_DEFINE_AR(pfs, pfs);
+IA64_NATIVE_PATCH_DEFINE_AR(lc, lc);
+IA64_NATIVE_PATCH_DEFINE_AR(ec, ec);
+
+IA64_NATIVE_PATCH_DEFINE_CR(dcr, dcr);
+IA64_NATIVE_PATCH_DEFINE_CR(itm, itm);
+IA64_NATIVE_PATCH_DEFINE_CR(iva, iva);
+IA64_NATIVE_PATCH_DEFINE_CR(pta, pta);
+IA64_NATIVE_PATCH_DEFINE_CR(ipsr, ipsr);
+IA64_NATIVE_PATCH_DEFINE_CR(isr, isr);
+IA64_NATIVE_PATCH_DEFINE_CR(iip, iip);
+IA64_NATIVE_PATCH_DEFINE_CR(ifa, ifa);
+IA64_NATIVE_PATCH_DEFINE_CR(itir, itir);
+IA64_NATIVE_PATCH_DEFINE_CR(iipa, iipa);
+IA64_NATIVE_PATCH_DEFINE_CR(ifs, ifs);
+IA64_NATIVE_PATCH_DEFINE_CR(iim, iim);
+IA64_NATIVE_PATCH_DEFINE_CR(iha, iha);
+IA64_NATIVE_PATCH_DEFINE_CR(lid, lid);
+IA64_NATIVE_PATCH_DEFINE_CR(ivr, ivr);
+IA64_NATIVE_PATCH_DEFINE_CR(tpr, tpr);
+IA64_NATIVE_PATCH_DEFINE_CR(eoi, eoi);
+IA64_NATIVE_PATCH_DEFINE_CR(irr0, irr0);
+IA64_NATIVE_PATCH_DEFINE_CR(irr1, irr1);
+IA64_NATIVE_PATCH_DEFINE_CR(irr2, irr2);
+IA64_NATIVE_PATCH_DEFINE_CR(irr3, irr3);
+IA64_NATIVE_PATCH_DEFINE_CR(itv, itv);
+IA64_NATIVE_PATCH_DEFINE_CR(pmv, pmv);
+IA64_NATIVE_PATCH_DEFINE_CR(cmcv, cmcv);
+IA64_NATIVE_PATCH_DEFINE_CR(lrr0, lrr0);
+IA64_NATIVE_PATCH_DEFINE_CR(lrr1, lrr1);
+
+static const struct paravirt_patch_bundle_elem ia64_native_patch_bundle_elems[]
+__initdata_or_module =
+{
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM(name, type)		\
+	{							\
+		(void*)ia64_native_ ## name ## _direct_start,	\
+		(void*)ia64_native_ ## name ## _direct_end,	\
+		PARAVIRT_PATCH_TYPE_ ## type,			\
+	}
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(fc, FC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(thash, THASH),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(ptcga, PTCGA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_rr, GET_RR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr, SET_RR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(ssm_i, SSM_I),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(rsm_i, RSM_I),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(intrin_local_irq_restore,
+				      INTRIN_LOCAL_IRQ_RESTORE),
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg)			\
+	{								\
+		(void*)ia64_native_get_ ## name ## _direct_start,	\
+		(void*)ia64_native_get_ ## name ## _direct_end,		\
+		PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg,		\
+	}
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg)			\
+	{								\
+		(void*)ia64_native_set_ ## name ## _direct_start,	\
+		(void*)ia64_native_set_ ## name ## _direct_end,		\
+		PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg,		\
+	}
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(name, reg)		\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg),	\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg)		\
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(name, reg)		\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar_ ## name, AR_ ## reg)
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(name, reg)		\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(cr_ ## name, CR_ ## reg)
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(psr, PSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(tp, TP),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(psr_l, PSR_L),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(gp, GP),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(sp, SP),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr0, AR_KR0),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr1, AR_KR1),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr2, AR_KR2),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr3, AR_KR3),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr4, AR_KR4),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr5, AR_KR5),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr6, AR_KR6),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr7, AR_KR7),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rsc, RSC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bsp, BSP),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bspstore, BSPSTORE),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rnat, RNAT),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fcr, FCR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(eflag, EFLAG),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(csd, CSD),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ssd, SSD),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar27, AR_CFLAG),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fsr, FSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fir, FIR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fdr, FDR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ccv, CCV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(unat, UNAT),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fpsr, FPSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(itc, ITC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(pfs, PFS),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(lc, LC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ec, EC),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(dcr, DCR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itm, ITM),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iva, IVA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pta, PTA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ipsr, IPSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(isr, ISR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iip, IIP),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifa, IFA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itir, ITIR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iipa, IIPA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifs, IFS),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iim, IIM),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iha, IHA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lid, LID),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ivr, IVR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(tpr, TPR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(eoi, EOI),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr0, IRR0),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr1, IRR1),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr2, IRR2),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr3, IRR3),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itv, ITV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pmv, PMV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(cmcv, CMCV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr0, LRR0),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr1, LRR1),
+};
+
+unsigned long __init_or_module
+ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type)
+{
+	const unsigned long nelems = sizeof(ia64_native_patch_bundle_elems) /
+		sizeof(ia64_native_patch_bundle_elems[0]);
+
+	return __paravirt_patch_apply_bundle(sbundle, ebundle, type,
+					      ia64_native_patch_bundle_elems,
+					      nelems, NULL);
+}
+#endif /* ASM_SUPPOTED */
+
+extern const char ia64_native_switch_to[];
+extern const char ia64_native_leave_syscall[];
+extern const char ia64_native_work_processed_syscall[];
+extern const char ia64_native_leave_kernel[];
+
+const struct paravirt_patch_branch_target ia64_native_branch_target[]
+__initconst = {
+#define PARAVIRT_BR_TARGET(name, type)			\
+	{						\
+		ia64_native_ ## name,			\
+		PARAVIRT_PATCH_TYPE_BR_ ## type,	\
+	}
+	PARAVIRT_BR_TARGET(switch_to, SWITCH_TO),
+	PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL),
+	PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL),
+	PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL),
+};
+
+static void __init
+ia64_native_patch_branch(unsigned long tag, unsigned long type)
+{
+	const unsigned long nelem =
+		sizeof(ia64_native_branch_target) /
+		sizeof(ia64_native_branch_target[0]);
+	__paravirt_patch_apply_branch(tag, type,
+				      ia64_native_branch_target, nelem);
+}
diff --git a/arch/ia64/kernel/paravirtentry.S b/arch/ia64/kernel/paravirtentry.S
index 80c0d365cbc..6158560d7f1 100644
--- a/arch/ia64/kernel/paravirtentry.S
+++ b/arch/ia64/kernel/paravirtentry.S
@@ -20,8 +20,11 @@
  *
  */
 
+#include <linux/init.h>
 #include <asm/asmmacro.h>
 #include <asm/asm-offsets.h>
+#include <asm/paravirt_privop.h>
+#include <asm/paravirt_patch.h>
 #include "entry.h"
 
 #define DATA8(sym, init_value)			\
@@ -32,32 +35,34 @@
 	data8 init_value ;			\
 	.popsection
 
-#define BRANCH(targ, reg, breg)		\
-	movl reg=targ ;			\
-	;;				\
-	ld8 reg=[reg] ;			\
-	;;				\
-	mov breg=reg ;			\
+#define BRANCH(targ, reg, breg, type)					\
+	PARAVIRT_PATCH_SITE_BR(PARAVIRT_PATCH_TYPE_BR_ ## type) ;	\
+	;;								\
+	movl reg=targ ;							\
+	;;								\
+	ld8 reg=[reg] ;							\
+	;;								\
+	mov breg=reg ;							\
 	br.cond.sptk.many breg
 
-#define BRANCH_PROC(sym, reg, breg)				\
-	DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \
-	GLOBAL_ENTRY(paravirt_ ## sym) ;			\
-		BRANCH(paravirt_ ## sym ## _targ, reg, breg) ;	\
+#define BRANCH_PROC(sym, reg, breg, type)				\
+	DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ;		\
+	GLOBAL_ENTRY(paravirt_ ## sym) ;				\
+		BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ;	\
 	END(paravirt_ ## sym)
 
-#define BRANCH_PROC_UNWINFO(sym, reg, breg)			\
-	DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \
-	GLOBAL_ENTRY(paravirt_ ## sym) ;			\
-		PT_REGS_UNWIND_INFO(0) ;			\
-		BRANCH(paravirt_ ## sym ## _targ, reg, breg) ;	\
+#define BRANCH_PROC_UNWINFO(sym, reg, breg, type)			\
+	DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ;		\
+	GLOBAL_ENTRY(paravirt_ ## sym) ;				\
+		PT_REGS_UNWIND_INFO(0) ;				\
+		BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ;	\
 	END(paravirt_ ## sym)
 
 
-BRANCH_PROC(switch_to, r22, b7)
-BRANCH_PROC_UNWINFO(leave_syscall, r22, b7)
-BRANCH_PROC(work_processed_syscall, r2, b7)
-BRANCH_PROC_UNWINFO(leave_kernel, r22, b7)
+BRANCH_PROC(switch_to, r22, b7, SWITCH_TO)
+BRANCH_PROC_UNWINFO(leave_syscall, r22, b7, LEAVE_SYSCALL)
+BRANCH_PROC(work_processed_syscall, r2, b7, WORK_PROCESSED_SYSCALL)
+BRANCH_PROC_UNWINFO(leave_kernel, r22, b7, LEAVE_KERNEL)
 
 
 #ifdef CONFIG_MODULES
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 865af27c773..4ed3e1c117e 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -52,6 +52,7 @@
 #include <asm/meminit.h>
 #include <asm/page.h>
 #include <asm/paravirt.h>
+#include <asm/paravirt_patch.h>
 #include <asm/patch.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -537,6 +538,7 @@ setup_arch (char **cmdline_p)
 	paravirt_arch_setup_early();
 
 	ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
+	paravirt_patch_apply();
 
 	*cmdline_p = __va(ia64_boot_param->command_line);
 	strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE);
-- 
cgit v1.2.3


From ee158fcd095c8233c9b578fbbe8a5897979a52a9 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:06:53 +0900
Subject: ia64/pv_ops/bp/module: support binary patching for kernel module.

support binary patching for kernel module.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/module.h |  6 ++++++
 arch/ia64/kernel/module.c      | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/arch/ia64/include/asm/module.h b/arch/ia64/include/asm/module.h
index d2da61e4c49..908eaef42a0 100644
--- a/arch/ia64/include/asm/module.h
+++ b/arch/ia64/include/asm/module.h
@@ -16,6 +16,12 @@ struct mod_arch_specific {
 	struct elf64_shdr *got;		/* global offset table */
 	struct elf64_shdr *opd;		/* official procedure descriptors */
 	struct elf64_shdr *unwind;	/* unwind-table section */
+#ifdef CONFIG_PARAVIRT
+	struct elf64_shdr *paravirt_bundles;
+					/* paravirt_alt_bundle_patch table */
+	struct elf64_shdr *paravirt_insts;
+					/* paravirt_alt_inst_patch table */
+#endif
 	unsigned long gp;		/* global-pointer for module */
 
 	void *core_unw_table;		/* core unwind-table cookie returned by unwinder */
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
index aaa7d901521..34fe4259a14 100644
--- a/arch/ia64/kernel/module.c
+++ b/arch/ia64/kernel/module.c
@@ -446,6 +446,14 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings,
 			mod->arch.opd = s;
 		else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0)
 			mod->arch.unwind = s;
+#ifdef CONFIG_PARAVIRT
+		else if (strcmp(".paravirt_bundles",
+				secstrings + s->sh_name) == 0)
+			mod->arch.paravirt_bundles = s;
+		else if (strcmp(".paravirt_insts",
+				secstrings + s->sh_name) == 0)
+			mod->arch.paravirt_insts = s;
+#endif
 
 	if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) {
 		printk(KERN_ERR "%s: sections missing\n", mod->name);
@@ -921,6 +929,30 @@ module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mo
 	DEBUGP("%s: init: entry=%p\n", __func__, mod->init);
 	if (mod->arch.unwind)
 		register_unwind_table(mod);
+#ifdef CONFIG_PARAVIRT
+        if (mod->arch.paravirt_bundles) {
+                struct paravirt_patch_site_bundle *start =
+                        (struct paravirt_patch_site_bundle *)
+                        mod->arch.paravirt_bundles->sh_addr;
+                struct paravirt_patch_site_bundle *end =
+                        (struct paravirt_patch_site_bundle *)
+                        (mod->arch.paravirt_bundles->sh_addr +
+                         mod->arch.paravirt_bundles->sh_size);
+
+                paravirt_patch_apply_bundle(start, end);
+        }
+        if (mod->arch.paravirt_insts) {
+                struct paravirt_patch_site_inst *start =
+                        (struct paravirt_patch_site_inst *)
+                        mod->arch.paravirt_insts->sh_addr;
+                struct paravirt_patch_site_inst *end =
+                        (struct paravirt_patch_site_inst *)
+                        (mod->arch.paravirt_insts->sh_addr +
+                         mod->arch.paravirt_insts->sh_size);
+
+                paravirt_patch_apply_inst(start, end);
+        }
+#endif
 	return 0;
 }
 
-- 
cgit v1.2.3


From dae17da60d1797c9049d21d06de0db1873eee153 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:06:54 +0900
Subject: ia64/pv_ops/binary patch: define paravirt_dv_serialize_data() and
 suppress false positive warning.

define paravirt_dv_serialize_data() and insert it to suppress
false positive warnings.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/paravirt_privop.h | 6 ++++++
 arch/ia64/kernel/efi.c                  | 1 +
 arch/ia64/kvm/vtlb.c                    | 2 ++
 3 files changed, 9 insertions(+)

diff --git a/arch/ia64/include/asm/paravirt_privop.h b/arch/ia64/include/asm/paravirt_privop.h
index 76d6a6943e8..4e40e62c4ab 100644
--- a/arch/ia64/include/asm/paravirt_privop.h
+++ b/arch/ia64/include/asm/paravirt_privop.h
@@ -118,6 +118,12 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch);
 
 #endif /* CONFIG_PARAVIRT */
 
+#if defined(CONFIG_PARAVIRT) && defined(ASM_SUPPORTED)
+#define paravirt_dv_serialize_data()	ia64_dv_serialize_data()
+#else
+#define paravirt_dv_serialize_data()	/* nothing */
+#endif
+
 /* these routines utilize privilege-sensitive or performance-sensitive
  * privileged instructions so the code must be replaced with
  * paravirtualized versions */
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index efaff15d8cf..7ef80e8161c 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -456,6 +456,7 @@ efi_map_pal_code (void)
 		 GRANULEROUNDDOWN((unsigned long) pal_vaddr),
 		 pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)),
 		 IA64_GRANULE_SHIFT);
+	paravirt_dv_serialize_data();
 	ia64_set_psr(psr);		/* restore psr */
 }
 
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
index 6b6307a3bd5..1de4dbda37e 100644
--- a/arch/ia64/kvm/vtlb.c
+++ b/arch/ia64/kvm/vtlb.c
@@ -210,6 +210,7 @@ void thash_vhpt_insert(struct kvm_vcpu *v, u64 pte, u64 itir, u64 va, int type)
 		phy_pte  &= ~PAGE_FLAGS_RV_MASK;
 		psr = ia64_clear_ic();
 		ia64_itc(type, va, phy_pte, itir_ps(itir));
+		paravirt_dv_serialize_data();
 		ia64_set_psr(psr);
 	}
 
@@ -464,6 +465,7 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
 		phy_pte  &= ~PAGE_FLAGS_RV_MASK;
 		psr = ia64_clear_ic();
 		ia64_itc(type, ifa, phy_pte, ps);
+		paravirt_dv_serialize_data();
 		ia64_set_psr(psr);
 	}
 	if (!(pte&VTLB_PTE_IO))
-- 
cgit v1.2.3


From 0a7d32440294faea84c9aae4cb99239fe6ddb8ed Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 4 Mar 2009 21:06:55 +0900
Subject: ia64/pv_ops/bp/xen: implemented binary patchable pv_cpu_ops.

implemented xen binary patch for pv_cpu_ops.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/xen/privop.h |   4 +
 arch/ia64/xen/hypercall.S          |   2 +
 arch/ia64/xen/xen_pv_ops.c         | 665 +++++++++++++++++++++++++++++++++++++
 3 files changed, 671 insertions(+)

diff --git a/arch/ia64/include/asm/xen/privop.h b/arch/ia64/include/asm/xen/privop.h
index 2261dda578f..e5fbaeeb161 100644
--- a/arch/ia64/include/asm/xen/privop.h
+++ b/arch/ia64/include/asm/xen/privop.h
@@ -82,8 +82,10 @@ extern unsigned long xen_thash(unsigned long addr);
 extern unsigned long xen_get_cpuid(int index);
 extern unsigned long xen_get_pmd(int index);
 
+#ifndef ASM_SUPPORTED
 extern unsigned long xen_get_eflag(void);	/* see xen_ia64_getreg */
 extern void xen_set_eflag(unsigned long);	/* see xen_ia64_setreg */
+#endif
 
 /************************************************/
 /* Instructions paravirtualized for performance */
@@ -108,6 +110,7 @@ extern void xen_set_eflag(unsigned long);	/* see xen_ia64_setreg */
 #define xen_get_virtual_pend()		\
 	(*(((uint8_t *)XEN_MAPPEDREGS->interrupt_mask_addr) - 1))
 
+#ifndef ASM_SUPPORTED
 /* Although all privileged operations can be left to trap and will
  * be properly handled by Xen, some are frequent enough that we use
  * hyperprivops for performance. */
@@ -125,6 +128,7 @@ extern void xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1,
 			       unsigned long val4);
 extern void xen_set_kr(unsigned long index, unsigned long val);
 extern void xen_ptcga(unsigned long addr, unsigned long size);
+#endif /* !ASM_SUPPORTED */
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/ia64/xen/hypercall.S b/arch/ia64/xen/hypercall.S
index 45e02bb64a9..e32dae444dd 100644
--- a/arch/ia64/xen/hypercall.S
+++ b/arch/ia64/xen/hypercall.S
@@ -9,6 +9,7 @@
 #include <asm/intrinsics.h>
 #include <asm/xen/privop.h>
 
+#ifdef __INTEL_COMPILER
 /*
  * Hypercalls without parameter.
  */
@@ -72,6 +73,7 @@ GLOBAL_ENTRY(xen_set_rr0_to_rr4)
 	br.ret.sptk.many rp
 	;;
 END(xen_set_rr0_to_rr4)
+#endif
 
 GLOBAL_ENTRY(xen_send_ipi)
 	mov r14=r32
diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c
index bdf1acbce81..6c44225e7b8 100644
--- a/arch/ia64/xen/xen_pv_ops.c
+++ b/arch/ia64/xen/xen_pv_ops.c
@@ -154,6 +154,13 @@ xen_post_smp_prepare_boot_cpu(void)
 	xen_setup_vcpu_info_placement();
 }
 
+#ifdef ASM_SUPPORTED
+static unsigned long __init_or_module
+xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type);
+#endif
+static void __init
+xen_patch_branch(unsigned long tag, unsigned long type);
+
 static const struct pv_init_ops xen_init_ops __initconst = {
 	.banner = xen_banner,
 
@@ -164,6 +171,10 @@ static const struct pv_init_ops xen_init_ops __initconst = {
 	.arch_setup_nomca = xen_arch_setup_nomca,
 
 	.post_smp_prepare_boot_cpu = xen_post_smp_prepare_boot_cpu,
+#ifdef ASM_SUPPORTED
+	.patch_bundle = xen_patch_bundle,
+#endif
+	.patch_branch = xen_patch_branch,
 };
 
 /***************************************************************************
@@ -214,6 +225,7 @@ static struct pv_patchdata xen_patchdata __initdata = {
  * intrinsics hooks.
  */
 
+#ifndef ASM_SUPPORTED
 static void
 xen_set_itm_with_offset(unsigned long val)
 {
@@ -381,6 +393,410 @@ xen_intrin_local_irq_restore(unsigned long mask)
 	else
 		xen_rsm_i();
 }
+#else
+#define __DEFINE_FUNC(name, code)					\
+	extern const char xen_ ## name ## _direct_start[];		\
+	extern const char xen_ ## name ## _direct_end[];		\
+	asm (".align 32\n"						\
+	     ".proc xen_" #name "\n"					\
+	     "xen_" #name ":\n"						\
+	     "xen_" #name "_direct_start:\n"				\
+	     code							\
+	     "xen_" #name "_direct_end:\n"				\
+	     "br.cond.sptk.many b6\n"					\
+	     ".endp xen_" #name "\n")
+
+#define DEFINE_VOID_FUNC0(name, code)		\
+	extern void				\
+	xen_ ## name (void);			\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC1(name, code)		\
+	extern void				\
+	xen_ ## name (unsigned long arg);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC2(name, code)		\
+	extern void				\
+	xen_ ## name (unsigned long arg0,	\
+		      unsigned long arg1);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_FUNC0(name, code)		\
+	extern unsigned long			\
+	xen_ ## name (void);			\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_FUNC1(name, type, code)		\
+	extern unsigned long			\
+	xen_ ## name (type arg);		\
+	__DEFINE_FUNC(name, code)
+
+#define XEN_PSR_I_ADDR_ADDR     (XSI_BASE + XSI_PSR_I_ADDR_OFS)
+
+/*
+ * static void xen_set_itm_with_offset(unsigned long val)
+ *        xen_set_itm(val - XEN_MAPPEDREGS->itc_offset);
+ */
+/* 2 bundles */
+DEFINE_VOID_FUNC1(set_itm_with_offset,
+		  "mov r2 = " __stringify(XSI_BASE) " + "
+		  __stringify(XSI_ITC_OFFSET_OFS) "\n"
+		  ";;\n"
+		  "ld8 r3 = [r2]\n"
+		  ";;\n"
+		  "sub r8 = r8, r3\n"
+		  "break " __stringify(HYPERPRIVOP_SET_ITM) "\n");
+
+/*
+ * static unsigned long xen_get_itm_with_offset(void)
+ *    return ia64_native_getreg(_IA64_REG_CR_ITM) + XEN_MAPPEDREGS->itc_offset;
+ */
+/* 2 bundles */
+DEFINE_FUNC0(get_itm_with_offset,
+	     "mov r2 = " __stringify(XSI_BASE) " + "
+	     __stringify(XSI_ITC_OFFSET_OFS) "\n"
+	     ";;\n"
+	     "ld8 r3 = [r2]\n"
+	     "mov r8 = cr.itm\n"
+	     ";;\n"
+	     "add r8 = r8, r2\n");
+
+/*
+ * static void xen_set_itc(unsigned long val)
+ *	unsigned long mitc;
+ *
+ *	WARN_ON(!irqs_disabled());
+ *	mitc = ia64_native_getreg(_IA64_REG_AR_ITC);
+ *	XEN_MAPPEDREGS->itc_offset = val - mitc;
+ *	XEN_MAPPEDREGS->itc_last = val;
+ */
+/* 2 bundles */
+DEFINE_VOID_FUNC1(set_itc,
+		  "mov r2 = " __stringify(XSI_BASE) " + "
+		  __stringify(XSI_ITC_LAST_OFS) "\n"
+		  "mov r3 = ar.itc\n"
+		  ";;\n"
+		  "sub r3 = r8, r3\n"
+		  "st8 [r2] = r8, "
+		  __stringify(XSI_ITC_LAST_OFS) " - "
+		  __stringify(XSI_ITC_OFFSET_OFS) "\n"
+		  ";;\n"
+		  "st8 [r2] = r3\n");
+
+/*
+ * static unsigned long xen_get_itc(void)
+ *	unsigned long res;
+ *	unsigned long itc_offset;
+ *	unsigned long itc_last;
+ *	unsigned long ret_itc_last;
+ *
+ *	itc_offset = XEN_MAPPEDREGS->itc_offset;
+ *	do {
+ *		itc_last = XEN_MAPPEDREGS->itc_last;
+ *		res = ia64_native_getreg(_IA64_REG_AR_ITC);
+ *		res += itc_offset;
+ *		if (itc_last >= res)
+ *			res = itc_last + 1;
+ *		ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last,
+ *				       itc_last, res);
+ *	} while (unlikely(ret_itc_last != itc_last));
+ *	return res;
+ */
+/* 5 bundles */
+DEFINE_FUNC0(get_itc,
+	     "mov r2 = " __stringify(XSI_BASE) " + "
+	     __stringify(XSI_ITC_OFFSET_OFS) "\n"
+	     ";;\n"
+	     "ld8 r9 = [r2], " __stringify(XSI_ITC_LAST_OFS) " - "
+	     __stringify(XSI_ITC_OFFSET_OFS) "\n"
+					/* r9 = itc_offset */
+					/* r2 = XSI_ITC_OFFSET */
+	     "888:\n"
+	     "mov r8 = ar.itc\n"	/* res = ar.itc */
+	     ";;\n"
+	     "ld8 r3 = [r2]\n"		/* r3 = itc_last */
+	     "add r8 = r8, r9\n"	/* res = ar.itc + itc_offset */
+	     ";;\n"
+	     "cmp.gtu p6, p0 = r3, r8\n"
+	     ";;\n"
+	     "(p6) add r8 = 1, r3\n"	/* if (itc_last > res) itc_last + 1 */
+	     ";;\n"
+	     "mov ar.ccv = r8\n"
+	     ";;\n"
+	     "cmpxchg8.acq r10 = [r2], r8, ar.ccv\n"
+	     ";;\n"
+	     "cmp.ne p6, p0 = r10, r3\n"
+	     "(p6) hint @pause\n"
+	     "(p6) br.cond.spnt 888b\n");
+
+DEFINE_VOID_FUNC1(fc,
+		  "break " __stringify(HYPERPRIVOP_FC) "\n");
+
+/*
+ * psr_i_addr_addr = XEN_PSR_I_ADDR_ADDR
+ * masked_addr = *psr_i_addr_addr
+ * pending_intr_addr = masked_addr - 1
+ * if (val & IA64_PSR_I) {
+ *   masked = *masked_addr
+ *   *masked_addr = 0:xen_set_virtual_psr_i(1)
+ *   compiler barrier
+ *   if (masked) {
+ *      uint8_t pending = *pending_intr_addr;
+ *      if (pending)
+ *              XEN_HYPER_SSM_I
+ *   }
+ * } else {
+ *   *masked_addr = 1:xen_set_virtual_psr_i(0)
+ * }
+ */
+/* 6 bundles */
+DEFINE_VOID_FUNC1(intrin_local_irq_restore,
+		  /* r8 = input value: 0 or IA64_PSR_I
+		   * p6 =  (flags & IA64_PSR_I)
+		   *    = if clause
+		   * p7 = !(flags & IA64_PSR_I)
+		   *    = else clause
+		   */
+		  "cmp.ne p6, p7 = r8, r0\n"
+		  "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
+		  ";;\n"
+		  /* r9 = XEN_PSR_I_ADDR */
+		  "ld8 r9 = [r9]\n"
+		  ";;\n"
+
+		  /* r10 = masked previous value */
+		  "(p6)	ld1.acq r10 = [r9]\n"
+		  ";;\n"
+
+		  /* p8 = !masked interrupt masked previously? */
+		  "(p6)	cmp.ne.unc p8, p0 = r10, r0\n"
+
+		  /* p7 = else clause */
+		  "(p7)	mov r11 = 1\n"
+		  ";;\n"
+		  /* masked = 1 */
+		  "(p7)	st1.rel [r9] = r11\n"
+
+		  /* p6 = if clause */
+		  /* masked = 0
+		   * r9 = masked_addr - 1
+		   *    = pending_intr_addr
+		   */
+		  "(p8)	st1.rel [r9] = r0, -1\n"
+		  ";;\n"
+		  /* r8 = pending_intr */
+		  "(p8)	ld1.acq r11 = [r9]\n"
+		  ";;\n"
+		  /* p9 = interrupt pending? */
+		  "(p8)	cmp.ne.unc p9, p10 = r11, r0\n"
+		  ";;\n"
+		  "(p10) mf\n"
+		  /* issue hypercall to trigger interrupt */
+		  "(p9)	break " __stringify(HYPERPRIVOP_SSM_I) "\n");
+
+DEFINE_VOID_FUNC2(ptcga,
+		  "break " __stringify(HYPERPRIVOP_PTC_GA) "\n");
+DEFINE_VOID_FUNC2(set_rr,
+		  "break " __stringify(HYPERPRIVOP_SET_RR) "\n");
+
+/*
+ * tmp = XEN_MAPPEDREGS->interrupt_mask_addr = XEN_PSR_I_ADDR_ADDR;
+ * tmp = *tmp
+ * tmp = *tmp;
+ * psr_i = tmp? 0: IA64_PSR_I;
+ */
+/* 4 bundles */
+DEFINE_FUNC0(get_psr_i,
+	     "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
+	     ";;\n"
+	     "ld8 r9 = [r9]\n"			/* r9 = XEN_PSR_I_ADDR */
+	     "mov r8 = 0\n"			/* psr_i = 0 */
+	     ";;\n"
+	     "ld1.acq r9 = [r9]\n"		/* r9 = XEN_PSR_I */
+	     ";;\n"
+	     "cmp.eq.unc p6, p0 = r9, r0\n"	/* p6 = (XEN_PSR_I != 0) */
+	     ";;\n"
+	     "(p6) mov r8 = " __stringify(1 << IA64_PSR_I_BIT) "\n");
+
+DEFINE_FUNC1(thash, unsigned long,
+	     "break " __stringify(HYPERPRIVOP_THASH) "\n");
+DEFINE_FUNC1(get_cpuid, int,
+	     "break " __stringify(HYPERPRIVOP_GET_CPUID) "\n");
+DEFINE_FUNC1(get_pmd, int,
+	     "break " __stringify(HYPERPRIVOP_GET_PMD) "\n");
+DEFINE_FUNC1(get_rr, unsigned long,
+	     "break " __stringify(HYPERPRIVOP_GET_RR) "\n");
+
+/*
+ * void xen_privop_ssm_i(void)
+ *
+ * int masked = !xen_get_virtual_psr_i();
+ *	// masked = *(*XEN_MAPPEDREGS->interrupt_mask_addr)
+ * xen_set_virtual_psr_i(1)
+ *	// *(*XEN_MAPPEDREGS->interrupt_mask_addr) = 0
+ * // compiler barrier
+ * if (masked) {
+ *	uint8_t* pend_int_addr =
+ *		(uint8_t*)(*XEN_MAPPEDREGS->interrupt_mask_addr) - 1;
+ *	uint8_t pending = *pend_int_addr;
+ *	if (pending)
+ *		XEN_HYPER_SSM_I
+ * }
+ */
+/* 4 bundles */
+DEFINE_VOID_FUNC0(ssm_i,
+		  "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
+		  ";;\n"
+		  "ld8 r8 = [r8]\n"		/* r8 = XEN_PSR_I_ADDR */
+		  ";;\n"
+		  "ld1.acq r9 = [r8]\n"		/* r9 = XEN_PSR_I */
+		  ";;\n"
+		  "st1.rel [r8] = r0, -1\n"	/* psr_i = 0. enable interrupt
+						 * r8 = XEN_PSR_I_ADDR - 1
+						 *    = pend_int_addr
+						 */
+		  "cmp.eq.unc p0, p6 = r9, r0\n"/* p6 = !XEN_PSR_I
+						 * previously interrupt
+						 * masked?
+						 */
+		  ";;\n"
+		  "(p6) ld1.acq r8 = [r8]\n"	/* r8 = xen_pend_int */
+		  ";;\n"
+		  "(p6) cmp.eq.unc p6, p7 = r8, r0\n"	/*interrupt pending?*/
+		  ";;\n"
+		  /* issue hypercall to get interrupt */
+		  "(p7) break " __stringify(HYPERPRIVOP_SSM_I) "\n"
+		  ";;\n");
+
+/*
+ * psr_i_addr_addr = XEN_MAPPEDREGS->interrupt_mask_addr
+ *		   = XEN_PSR_I_ADDR_ADDR;
+ * psr_i_addr = *psr_i_addr_addr;
+ * *psr_i_addr = 1;
+ */
+/* 2 bundles */
+DEFINE_VOID_FUNC0(rsm_i,
+		  "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
+						/* r8 = XEN_PSR_I_ADDR */
+		  "mov r9 = 1\n"
+		  ";;\n"
+		  "ld8 r8 = [r8]\n"		/* r8 = XEN_PSR_I */
+		  ";;\n"
+		  "st1.rel [r8] = r9\n");	/* XEN_PSR_I = 1 */
+
+extern void
+xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1,
+		   unsigned long val2, unsigned long val3,
+		   unsigned long val4);
+__DEFINE_FUNC(set_rr0_to_rr4,
+	      "break " __stringify(HYPERPRIVOP_SET_RR0_TO_RR4) "\n");
+
+
+extern unsigned long xen_getreg(int regnum);
+#define __DEFINE_GET_REG(id, privop)					\
+	"mov r2 = " __stringify(_IA64_REG_ ## id) "\n"			\
+	";;\n"								\
+	"cmp.eq p6, p0 = r2, r8\n"					\
+	";;\n"								\
+	"(p6) break " __stringify(HYPERPRIVOP_GET_ ## privop) "\n"	\
+	"(p6) br.cond.sptk.many b6\n"					\
+	";;\n"
+
+__DEFINE_FUNC(getreg,
+	      __DEFINE_GET_REG(PSR, PSR)
+#ifdef CONFIG_IA32_SUPPORT
+	      __DEFINE_GET_REG(AR_EFLAG, EFLAG)
+#endif
+
+	      /* get_itc */
+	      "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n"
+	      ";;\n"
+	      "cmp.eq p6, p0 = r2, r8\n"
+	      ";;\n"
+	      "(p6) br.cond.spnt xen_get_itc\n"
+	      ";;\n"
+
+	      /* get itm */
+	      "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n"
+	      ";;\n"
+	      "cmp.eq p6, p0 = r2, r8\n"
+	      ";;\n"
+	      "(p6) br.cond.spnt xen_get_itm_with_offset\n"
+	      ";;\n"
+
+	      __DEFINE_GET_REG(CR_IVR, IVR)
+	      __DEFINE_GET_REG(CR_TPR, TPR)
+
+	      /* fall back */
+	      "movl r2 = ia64_native_getreg_func\n"
+	      ";;\n"
+	      "mov b7 = r2\n"
+	      ";;\n"
+	      "br.cond.sptk.many b7\n");
+
+extern void xen_setreg(int regnum, unsigned long val);
+#define __DEFINE_SET_REG(id, privop)					\
+	"mov r2 = " __stringify(_IA64_REG_ ## id) "\n"			\
+	";;\n"								\
+	"cmp.eq p6, p0 = r2, r9\n"					\
+	";;\n"								\
+	"(p6) break " __stringify(HYPERPRIVOP_ ## privop) "\n"		\
+	"(p6) br.cond.sptk.many b6\n"					\
+	";;\n"
+
+__DEFINE_FUNC(setreg,
+	      /* kr0 .. kr 7*/
+	      /*
+	       * if (_IA64_REG_AR_KR0 <= regnum &&
+	       *     regnum <= _IA64_REG_AR_KR7) {
+	       *     register __index asm ("r8") = regnum - _IA64_REG_AR_KR0
+	       *     register __val asm ("r9") = val
+	       *    "break HYPERPRIVOP_SET_KR"
+	       * }
+	       */
+	      "mov r17 = r9\n"
+	      "mov r2 = " __stringify(_IA64_REG_AR_KR0) "\n"
+	      ";;\n"
+	      "cmp.ge p6, p0 = r9, r2\n"
+	      "sub r17 = r17, r2\n"
+	      ";;\n"
+	      "(p6) cmp.ge.unc p7, p0 = "
+	      __stringify(_IA64_REG_AR_KR7) " - " __stringify(_IA64_REG_AR_KR0)
+	      ", r17\n"
+	      ";;\n"
+	      "(p7) mov r9 = r8\n"
+	      ";;\n"
+	      "(p7) mov r8 = r17\n"
+	      "(p7) break " __stringify(HYPERPRIVOP_SET_KR) "\n"
+
+	      /* set itm */
+	      "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n"
+	      ";;\n"
+	      "cmp.eq p6, p0 = r2, r8\n"
+	      ";;\n"
+	      "(p6) br.cond.spnt xen_set_itm_with_offset\n"
+
+	      /* set itc */
+	      "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n"
+	      ";;\n"
+	      "cmp.eq p6, p0 = r2, r8\n"
+	      ";;\n"
+	      "(p6) br.cond.spnt xen_set_itc\n"
+
+#ifdef CONFIG_IA32_SUPPORT
+	      __DEFINE_SET_REG(AR_EFLAG, SET_EFLAG)
+#endif
+	      __DEFINE_SET_REG(CR_TPR, SET_TPR)
+	      __DEFINE_SET_REG(CR_EOI, EOI)
+
+	      /* fall back */
+	      "movl r2 = ia64_native_setreg_func\n"
+	      ";;\n"
+	      "mov b7 = r2\n"
+	      ";;\n"
+	      "br.cond.sptk.many b7\n");
+#endif
 
 static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.fc		= xen_fc,
@@ -486,3 +902,252 @@ xen_setup_pv_ops(void)
 
 	paravirt_cpu_asm_init(&xen_cpu_asm_switch);
 }
+
+#ifdef ASM_SUPPORTED
+/***************************************************************************
+ * binary pacthing
+ * pv_init_ops.patch_bundle
+ */
+
+#define DEFINE_FUNC_GETREG(name, privop)				\
+	DEFINE_FUNC0(get_ ## name,					\
+		     "break "__stringify(HYPERPRIVOP_GET_ ## privop) "\n")
+
+DEFINE_FUNC_GETREG(psr, PSR);
+DEFINE_FUNC_GETREG(eflag, EFLAG);
+DEFINE_FUNC_GETREG(ivr, IVR);
+DEFINE_FUNC_GETREG(tpr, TPR);
+
+#define DEFINE_FUNC_SET_KR(n)						\
+	DEFINE_VOID_FUNC0(set_kr ## n,					\
+			  ";;\n"					\
+			  "mov r9 = r8\n"				\
+			  "mov r8 = " #n "\n"				\
+			  "break " __stringify(HYPERPRIVOP_SET_KR) "\n")
+
+DEFINE_FUNC_SET_KR(0);
+DEFINE_FUNC_SET_KR(1);
+DEFINE_FUNC_SET_KR(2);
+DEFINE_FUNC_SET_KR(3);
+DEFINE_FUNC_SET_KR(4);
+DEFINE_FUNC_SET_KR(5);
+DEFINE_FUNC_SET_KR(6);
+DEFINE_FUNC_SET_KR(7);
+
+#define __DEFINE_FUNC_SETREG(name, privop)				\
+	DEFINE_VOID_FUNC0(name,						\
+			  "break "__stringify(HYPERPRIVOP_ ## privop) "\n")
+
+#define DEFINE_FUNC_SETREG(name, privop)			\
+	__DEFINE_FUNC_SETREG(set_ ## name, SET_ ## privop)
+
+DEFINE_FUNC_SETREG(eflag, EFLAG);
+DEFINE_FUNC_SETREG(tpr, TPR);
+__DEFINE_FUNC_SETREG(eoi, EOI);
+
+extern const char xen_check_events[];
+extern const char __xen_intrin_local_irq_restore_direct_start[];
+extern const char __xen_intrin_local_irq_restore_direct_end[];
+extern const unsigned long __xen_intrin_local_irq_restore_direct_reloc;
+
+asm (
+	".align 32\n"
+	".proc xen_check_events\n"
+	"xen_check_events:\n"
+	/* masked = 0
+	 * r9 = masked_addr - 1
+	 *    = pending_intr_addr
+	 */
+	"st1.rel [r9] = r0, -1\n"
+	";;\n"
+	/* r8 = pending_intr */
+	"ld1.acq r11 = [r9]\n"
+	";;\n"
+	/* p9 = interrupt pending? */
+	"cmp.ne p9, p10 = r11, r0\n"
+	";;\n"
+	"(p10) mf\n"
+	/* issue hypercall to trigger interrupt */
+	"(p9) break " __stringify(HYPERPRIVOP_SSM_I) "\n"
+	"br.cond.sptk.many b6\n"
+	".endp xen_check_events\n"
+	"\n"
+	".align 32\n"
+	".proc __xen_intrin_local_irq_restore_direct\n"
+	"__xen_intrin_local_irq_restore_direct:\n"
+	"__xen_intrin_local_irq_restore_direct_start:\n"
+	"1:\n"
+	"{\n"
+	"cmp.ne p6, p7 = r8, r0\n"
+	"mov r17 = ip\n" /* get ip to calc return address */
+	"mov r9 = "__stringify(XEN_PSR_I_ADDR_ADDR) "\n"
+	";;\n"
+	"}\n"
+	"{\n"
+	/* r9 = XEN_PSR_I_ADDR */
+	"ld8 r9 = [r9]\n"
+	";;\n"
+	/* r10 = masked previous value */
+	"(p6) ld1.acq r10 = [r9]\n"
+	"adds r17 =  1f - 1b, r17\n" /* calculate return address */
+	";;\n"
+	"}\n"
+	"{\n"
+	/* p8 = !masked interrupt masked previously? */
+	"(p6) cmp.ne.unc p8, p0 = r10, r0\n"
+	"\n"
+	/* p7 = else clause */
+	"(p7) mov r11 = 1\n"
+	";;\n"
+	"(p8) mov b6 = r17\n" /* set return address */
+	"}\n"
+	"{\n"
+	/* masked = 1 */
+	"(p7) st1.rel [r9] = r11\n"
+	"\n"
+	"[99:]\n"
+	"(p8) brl.cond.dptk.few xen_check_events\n"
+	"}\n"
+	/* pv calling stub is 5 bundles. fill nop to adjust return address */
+	"{\n"
+	"nop 0\n"
+	"nop 0\n"
+	"nop 0\n"
+	"}\n"
+	"1:\n"
+	"__xen_intrin_local_irq_restore_direct_end:\n"
+	".endp __xen_intrin_local_irq_restore_direct\n"
+	"\n"
+	".align 8\n"
+	"__xen_intrin_local_irq_restore_direct_reloc:\n"
+	"data8 99b\n"
+);
+
+static struct paravirt_patch_bundle_elem xen_patch_bundle_elems[]
+__initdata_or_module =
+{
+#define XEN_PATCH_BUNDLE_ELEM(name, type)		\
+	{						\
+		(void*)xen_ ## name ## _direct_start,	\
+		(void*)xen_ ## name ## _direct_end,	\
+		PARAVIRT_PATCH_TYPE_ ## type,		\
+	}
+
+	XEN_PATCH_BUNDLE_ELEM(fc, FC),
+	XEN_PATCH_BUNDLE_ELEM(thash, THASH),
+	XEN_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID),
+	XEN_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD),
+	XEN_PATCH_BUNDLE_ELEM(ptcga, PTCGA),
+	XEN_PATCH_BUNDLE_ELEM(get_rr, GET_RR),
+	XEN_PATCH_BUNDLE_ELEM(set_rr, SET_RR),
+	XEN_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4),
+	XEN_PATCH_BUNDLE_ELEM(ssm_i, SSM_I),
+	XEN_PATCH_BUNDLE_ELEM(rsm_i, RSM_I),
+	XEN_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I),
+	{
+		(void*)__xen_intrin_local_irq_restore_direct_start,
+		(void*)__xen_intrin_local_irq_restore_direct_end,
+		PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE,
+	},
+
+#define XEN_PATCH_BUNDLE_ELEM_GETREG(name, reg)			\
+	{							\
+		xen_get_ ## name ## _direct_start,		\
+		xen_get_ ## name ## _direct_end,		\
+		PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg, \
+	}
+
+	XEN_PATCH_BUNDLE_ELEM_GETREG(psr, PSR),
+	XEN_PATCH_BUNDLE_ELEM_GETREG(eflag, AR_EFLAG),
+
+	XEN_PATCH_BUNDLE_ELEM_GETREG(ivr, CR_IVR),
+	XEN_PATCH_BUNDLE_ELEM_GETREG(tpr, CR_TPR),
+
+	XEN_PATCH_BUNDLE_ELEM_GETREG(itc, AR_ITC),
+	XEN_PATCH_BUNDLE_ELEM_GETREG(itm_with_offset, CR_ITM),
+
+
+#define __XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg)		\
+	{							\
+		xen_ ## name ## _direct_start,			\
+		xen_ ## name ## _direct_end,			\
+		PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg, \
+	}
+
+#define XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg)			\
+	__XEN_PATCH_BUNDLE_ELEM_SETREG(set_ ## name, reg)
+
+	XEN_PATCH_BUNDLE_ELEM_SETREG(kr0, AR_KR0),
+	XEN_PATCH_BUNDLE_ELEM_SETREG(kr1, AR_KR1),
+	XEN_PATCH_BUNDLE_ELEM_SETREG(kr2, AR_KR2),
+	XEN_PATCH_BUNDLE_ELEM_SETREG(kr3, AR_KR3),
+	XEN_PATCH_BUNDLE_ELEM_SETREG(kr4, AR_KR4),
+	XEN_PATCH_BUNDLE_ELEM_SETREG(kr5, AR_KR5),
+	XEN_PATCH_BUNDLE_ELEM_SETREG(kr6, AR_KR6),
+	XEN_PATCH_BUNDLE_ELEM_SETREG(kr7, AR_KR7),
+
+	XEN_PATCH_BUNDLE_ELEM_SETREG(eflag, AR_EFLAG),
+	XEN_PATCH_BUNDLE_ELEM_SETREG(tpr, CR_TPR),
+	__XEN_PATCH_BUNDLE_ELEM_SETREG(eoi, CR_EOI),
+
+	XEN_PATCH_BUNDLE_ELEM_SETREG(itc, AR_ITC),
+	XEN_PATCH_BUNDLE_ELEM_SETREG(itm_with_offset, CR_ITM),
+};
+
+static unsigned long __init_or_module
+xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type)
+{
+	const unsigned long nelems = sizeof(xen_patch_bundle_elems) /
+		sizeof(xen_patch_bundle_elems[0]);
+	unsigned long used;
+	const struct paravirt_patch_bundle_elem *found;
+
+	used = __paravirt_patch_apply_bundle(sbundle, ebundle, type,
+					     xen_patch_bundle_elems, nelems,
+					     &found);
+
+	if (found == NULL)
+		/* fallback */
+		return ia64_native_patch_bundle(sbundle, ebundle, type);
+	if (used == 0)
+		return used;
+
+	/* relocation */
+	switch (type) {
+	case PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE: {
+		unsigned long reloc =
+			__xen_intrin_local_irq_restore_direct_reloc;
+		unsigned long reloc_offset = reloc - (unsigned long)
+			__xen_intrin_local_irq_restore_direct_start;
+		unsigned long tag = (unsigned long)sbundle + reloc_offset;
+		paravirt_patch_reloc_brl(tag, xen_check_events);
+		break;
+	}
+	default:
+		/* nothing */
+		break;
+	}
+	return used;
+}
+#endif /* ASM_SUPPOTED */
+
+const struct paravirt_patch_branch_target xen_branch_target[]
+__initconst = {
+#define PARAVIRT_BR_TARGET(name, type)			\
+	{						\
+		&xen_ ## name,				\
+		PARAVIRT_PATCH_TYPE_BR_ ## type,	\
+	}
+	PARAVIRT_BR_TARGET(switch_to, SWITCH_TO),
+	PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL),
+	PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL),
+	PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL),
+};
+
+static void __init
+xen_patch_branch(unsigned long tag, unsigned long type)
+{
+	const unsigned long nelem =
+		sizeof(xen_branch_target) / sizeof(xen_branch_target[0]);
+	__paravirt_patch_apply_branch(tag, type, xen_branch_target, nelem);
+}
-- 
cgit v1.2.3


From 853346e4354c948b50a6fb0002f8af2cf5fbf2ae Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Sat, 21 Mar 2009 22:05:11 +0800
Subject: PCI: fix conflict between SR-IOV and config space sizing

New pci_cfg_space_size() needs invalid pdev->class, put it in the
right place in the pci_setup_device().

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/probe.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 56c71e585f3..e2f3dd098cf 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -713,7 +713,6 @@ int pci_setup_device(struct pci_dev *dev)
 	dev->dev.bus = &pci_bus_type;
 	dev->hdr_type = hdr_type & 0x7f;
 	dev->multifunction = !!(hdr_type & 0x80);
-	dev->cfg_size = pci_cfg_space_size(dev);
 	dev->error_state = pci_channel_io_normal;
 	set_pcie_port_type(dev);
 
@@ -738,6 +737,9 @@ int pci_setup_device(struct pci_dev *dev)
 	dev_dbg(&dev->dev, "found [%04x:%04x] class %06x header type %02x\n",
 		 dev->vendor, dev->device, class, dev->hdr_type);
 
+	/* need to have dev->class ready */
+	dev->cfg_size = pci_cfg_space_size(dev);
+
 	/* "Unknown power state" */
 	dev->current_state = PCI_UNKNOWN;
 
@@ -959,9 +961,6 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
 		return NULL;
 	}
 
-	/* need to have dev->class ready */
-	dev->cfg_size = pci_cfg_space_size(dev);
-
 	return dev;
 }
 
-- 
cgit v1.2.3


From 7ae0567fd3f4f51d55c4c638ecc6836347992de2 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Thu, 26 Mar 2009 16:49:52 +0900
Subject: PCI: fix kernel oops on bridge removal

Fix the following kernel oops problem that happens when removing PCI
bridge with pciehp loaded. It should also occur with other hotplug
driver that is implemented as a bridge's driver.

[  459.997257] pciehp 0000:2f:04.0:pcie24: unloading service driver pciehp
[  459.997495] general protection fault: 0000 [#1] SMP
[  459.997737] last sysfs file: /sys/devices/pci0000:00/0000:00:04.0/0000:2e:00.0/0000:2f:04.0/remove
[  459.997964] CPU 4
[  459.998129] Modules linked in: pciehp ipv6 autofs4 hidp rfcomm l2cap bluetooth sunrpc cpufreq_ondemand acpi_cpufreq dm_mirror dm_region_hash dm_log dm_multipath scsi_dh dm_mod sbs sbshc battery ac parport_pc lp parport mptspi mptscsih mptbase scsi_transport_spi e1000e sg sr_mod cdrom button serio_raw i2c_i801 i2c_core shpchp pcspkr ata_piix libata megaraid_sas sd_mod scsi_mod crc_t10dif ext3 jbd uhci_hcd ohci_hcd ehci_hcd [last unloaded: microcode]
[  459.998129] Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1 PRIMERGY
[  459.998129] RIP: 0010:[<ffffffff803bf047>]  [<ffffffff803bf047>] pci_slot_release+0x37/0x100
[  459.998129] RSP: 0018:ffff88083b3bf9e0  EFLAGS: 00010246
[  459.998129] RAX: ffff88083adc5158 RBX: ffff880836c1bc80 RCX: 6b6b6b6b6b6b6b6b
[  459.998129] RDX: 0000000000000000 RSI: ffffffff803a77f0 RDI: ffff880836c1bc48
[  459.998129] RBP: ffff88083b3bfa00 R08: 0000000000000002 R09: 0000000000000000
[  459.998129] R10: 0000000000000000 R11: 0000000000000000 R12: ffff880836c1bc48
[  459.998129] R13: ffff880836c1bc20 R14: ffff880836c1bc48 R15: ffff880836d1ec38
[  459.998129] FS:  0000000000000000(0000) GS:ffff88083ccc3770(0000) knlGS:0000000000000000
[  459.998129] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[  459.998129] CR2: 00007f1562f1d558 CR3: 0000000838090000 CR4: 00000000000006e0
[  459.998129] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  459.998129] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  459.998129] Process events/4 (pid: 56, threadinfo ffff88083b3be000, task ffff88083b3b3e40)
[  459.998129] Stack:
[  459.998129]  ffff880836c1bc80 ffff880836c1bc48 ffffffff80793320 ffff88083b0d0960
[  459.998129]  ffff88083b3bfa30 ffffffff803a788a ffff880836c1bc80 ffffffff803a77f0
[  459.998129]  ffff880836c1bc20 ffff880836d1ec38 ffff88083b3bfa50 ffffffff803a8ce7
[  459.998129] Call Trace:
[  459.998129]  [<ffffffff803a788a>] kobject_release+0x9a/0x290
[  459.998129]  [<ffffffff803a77f0>] ? kobject_release+0x0/0x290
[  459.998129]  [<ffffffff803a8ce7>] kref_put+0x37/0x80
[  459.998129]  [<ffffffff803a76f7>] kobject_put+0x27/0x60
[  459.998129]  [<ffffffff803bebcc>] ? pci_destroy_slot+0x3c/0xc0
[  459.998129]  [<ffffffff803bebd5>] pci_destroy_slot+0x45/0xc0
[  459.998129]  [<ffffffff803c797d>] pci_hp_deregister+0x13d/0x210
[  459.998129]  [<ffffffffa031141d>] cleanup_slots+0x2d/0x80 [pciehp]
[  459.998129]  [<ffffffffa0311735>] pciehp_remove+0x15/0x30 [pciehp]
[  459.998129]  [<ffffffff803c4c99>] pcie_port_remove_service+0x69/0x90
[  459.998129]  [<ffffffff80441da9>] __device_release_driver+0x59/0x90
[  459.998129]  [<ffffffff80441edb>] device_release_driver+0x2b/0x40
[  459.998129]  [<ffffffff804419d6>] bus_remove_device+0xa6/0x120
[  459.998129]  [<ffffffff8043e46b>] device_del+0x12b/0x190
[  459.998129]  [<ffffffff803c4d90>] ? remove_iter+0x0/0x40
[  459.998129]  [<ffffffff8043e4f6>] device_unregister+0x26/0x70
[  459.998129]  [<ffffffff803c4dbf>] remove_iter+0x2f/0x40
[  459.998129]  [<ffffffff8043ddf3>] device_for_each_child+0x33/0x60
[  459.998129]  [<ffffffff8033ee30>] ? sysfs_schedule_callback_work+0x0/0x50
[  459.998129]  [<ffffffff803c4d30>] pcie_port_device_remove+0x30/0x80
[  459.998129]  [<ffffffff803c55a1>] pcie_portdrv_remove+0x11/0x20
[  459.998129]  [<ffffffff803bfeb2>] pci_device_remove+0x32/0x70
[  459.998129]  [<ffffffff80441da9>] __device_release_driver+0x59/0x90
[  459.998129]  [<ffffffff80441edb>] device_release_driver+0x2b/0x40
[  459.998129]  [<ffffffff804419d6>] bus_remove_device+0xa6/0x120
[  459.998129]  [<ffffffff8043e46b>] device_del+0x12b/0x190
[  459.998129]  [<ffffffff8043e4f6>] device_unregister+0x26/0x70
[  459.998129]  [<ffffffff803ba969>] pci_stop_dev+0x49/0x60
[  459.998129]  [<ffffffff803baab0>] pci_remove_bus_device+0x40/0xc0
[  459.998129]  [<ffffffff803c10d9>] remove_callback+0x29/0x40
[  459.998129]  [<ffffffff8033ee4f>] sysfs_schedule_callback_work+0x1f/0x50
[  459.998129]  [<ffffffff8025769a>] run_workqueue+0x15a/0x230
[  459.998129]  [<ffffffff80257648>] ? run_workqueue+0x108/0x230
[  459.998129]  [<ffffffff8025846f>] worker_thread+0x9f/0x100
[  459.998129]  [<ffffffff8025bce0>] ? autoremove_wake_function+0x0/0x40
[  459.998129]  [<ffffffff802583d0>] ? worker_thread+0x0/0x100
[  459.998129]  [<ffffffff8025b89d>] kthread+0x4d/0x80
[  459.998129]  [<ffffffff8020d4ba>] child_rip+0xa/0x20
[  459.998129]  [<ffffffff8020cebc>] ? restore_args+0x0/0x30
[  459.998129]  [<ffffffff8025b850>] ? kthread+0x0/0x80
[  459.998129]  [<ffffffff8020d4b0>] ? child_rip+0x0/0x20
[  459.998129] Code: 56 49 89 fe 41 55 4c 8d 6f d8 41 54 53 74 09 f6 05 b8 05 c7 00 08 75 72 49 8b 45 00 48 8b 48 28 eb 05 66 90 48 89 f1 49 8b 45 00 <48> 8b 31 48 83 c0 28 0f 18 0e 48 39 c1 74 1c 8b 41 38 41 0f b6
[  459.998129] RIP  [<ffffffff803bf047>] pci_slot_release+0x37/0x100
[  459.998129]  RSP <ffff88083b3bf9e0>
[  460.018595] ---[ end trace 5a08d2095374aedc ]---

The pci_remove_bus_device() removes all buses and devices under the
bridge, and then removes the bridge. So the remove() callback of the
hotplug drivers implemented as a bridge's driver is executed after the
struct pci_bus of the bridge's secondary bus is removed. The remove()
callback of those driver unregisters the slot using pci_destroy_slot(),
and slot's release callback refers to the the struct pci_bus that was
already freed. This is the cause of the kernel oops.

This patch solves the problem by stopping bus drivers before removing the
bridge and its child bus and devices.

Acked-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/remove.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index caf8e1eae45..86503c14ce7 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -95,6 +95,7 @@ EXPORT_SYMBOL(pci_remove_bus);
  */
 void pci_remove_bus_device(struct pci_dev *dev)
 {
+	pci_stop_bus_device(dev);
 	if (dev->subordinate) {
 		struct pci_bus *b = dev->subordinate;
 
-- 
cgit v1.2.3


From 7bb2cb3e90dc49be1cd14956c155451499c857a7 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 26 Mar 2009 18:34:33 +1100
Subject: PCI: update fakephp for bus_id removal

Get rid of a new use of bus_id that snuck in.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/fakephp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c
index 2dc7828df48..6151389fd90 100644
--- a/drivers/pci/hotplug/fakephp.c
+++ b/drivers/pci/hotplug/fakephp.c
@@ -18,6 +18,7 @@
 #include <linux/sysfs.h>
 #include <linux/init.h>
 #include <linux/pci.h>
+#include <linux/device.h>
 #include "../pci.h"
 
 struct legacy_slot {
@@ -88,7 +89,7 @@ static int legacy_add_slot(struct pci_dev *pdev)
 
 	if (kobject_init_and_add(&slot->kobj, &legacy_ktype,
 				 &pci_slots_kset->kobj, "%s",
-				 pdev->dev.bus_id)) {
+				 dev_name(&pdev->dev))) {
 		dev_warn(&pdev->dev, "Failed to created legacy fake slot\n");
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From 898585172fa729513d8636257b44bd1cfd279096 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Mon, 16 Feb 2009 02:55:47 +0800
Subject: PCI: save and restore PCIe 2.0 registers

PCIe 2.0 defines several new registers (Device Control 2, Link Control 2,
and Slot Control 2). Save and retore them in pci_save_pcie_state() and
pci_restore_pcie_state().

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.c        | 11 ++++++++++-
 include/linux/pci_regs.h |  2 ++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 676bbcbc272..59569b8cf1d 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -647,6 +647,8 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state)
 
 EXPORT_SYMBOL(pci_choose_state);
 
+#define PCI_EXP_SAVE_REGS	7
+
 static int pci_save_pcie_state(struct pci_dev *dev)
 {
 	int pos, i = 0;
@@ -668,6 +670,9 @@ static int pci_save_pcie_state(struct pci_dev *dev)
 	pci_read_config_word(dev, pos + PCI_EXP_LNKCTL, &cap[i++]);
 	pci_read_config_word(dev, pos + PCI_EXP_SLTCTL, &cap[i++]);
 	pci_read_config_word(dev, pos + PCI_EXP_RTCTL, &cap[i++]);
+	pci_read_config_word(dev, pos + PCI_EXP_DEVCTL2, &cap[i++]);
+	pci_read_config_word(dev, pos + PCI_EXP_LNKCTL2, &cap[i++]);
+	pci_read_config_word(dev, pos + PCI_EXP_SLTCTL2, &cap[i++]);
 
 	return 0;
 }
@@ -688,6 +693,9 @@ static void pci_restore_pcie_state(struct pci_dev *dev)
 	pci_write_config_word(dev, pos + PCI_EXP_LNKCTL, cap[i++]);
 	pci_write_config_word(dev, pos + PCI_EXP_SLTCTL, cap[i++]);
 	pci_write_config_word(dev, pos + PCI_EXP_RTCTL, cap[i++]);
+	pci_write_config_word(dev, pos + PCI_EXP_DEVCTL2, cap[i++]);
+	pci_write_config_word(dev, pos + PCI_EXP_LNKCTL2, cap[i++]);
+	pci_write_config_word(dev, pos + PCI_EXP_SLTCTL2, cap[i++]);
 }
 
 
@@ -1372,7 +1380,8 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev)
 {
 	int error;
 
-	error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_EXP, 4 * sizeof(u16));
+	error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_EXP,
+					PCI_EXP_SAVE_REGS * sizeof(u16));
 	if (error)
 		dev_err(&dev->dev,
 			"unable to preallocate PCI Express save buffer\n");
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index d4e663877f4..e4d08c1b2e0 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -488,6 +488,8 @@
 #define  PCI_EXP_DEVCAP2_ARI	0x20	/* Alternative Routing-ID */
 #define PCI_EXP_DEVCTL2		40	/* Device Control 2 */
 #define  PCI_EXP_DEVCTL2_ARI	0x20	/* Alternative Routing-ID */
+#define PCI_EXP_LNKCTL2		48	/* Link Control 2 */
+#define PCI_EXP_SLTCTL2		56	/* Slot Control 2 */
 
 /* Extended Capabilities (PCI-X 2.0 and Express) */
 #define PCI_EXT_CAP_ID(header)		(header & 0x0000ffff)
-- 
cgit v1.2.3


From e42d1fe804408c57506a5326252b4db29958a7fb Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 25 Mar 2009 13:26:13 -0700
Subject: x86/PCI: make pci=lastbus=255 work when acpi is on

Impact: scan more peer root buses even acpi is used

Move pci_bios_fixup_peer_bridges out of pci_legacy_init and into
pci_subsys_init.  This allows pci_bios_fixup_peer_bridges to be called
even pci_apci_init is driving PCI initialization.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/legacy.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index f1065b129e9..4061bb0f267 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -50,8 +50,6 @@ static int __init pci_legacy_init(void)
 	if (pci_root_bus)
 		pci_bus_add_devices(pci_root_bus);
 
-	pcibios_fixup_peer_bridges();
-
 	return 0;
 }
 
@@ -67,6 +65,7 @@ int __init pci_subsys_init(void)
 	pci_visws_init();
 #endif
 	pci_legacy_init();
+	pcibios_fixup_peer_bridges();
 	pcibios_irq_init();
 	pcibios_init();
 
-- 
cgit v1.2.3


From de7453065d5d5243686467998f1fcc58d20e0a7c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 20 Mar 2009 19:29:41 -0700
Subject: PCI: don't enable too much HT MSI mapping

Impact: fix bug

Prakash reported that his c51-mcp51 system ondie sound card doesn't work
MSI but if he hack out the HT-MSI on mcp51, the MSI will work well with
sound card.

This patch reworks nv_msi_ht_cap_quirk() and will only avoid enabling
ht_msi on devices following that root device.

Reported-by: Prakash Punnoor <prakash@punnoor.de>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/quirks.c | 118 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 82 insertions(+), 36 deletions(-)

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 7ddcfc65e79..faf02dd0001 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -2147,6 +2147,65 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA,
 			PCI_DEVICE_ID_NVIDIA_NVENET_15,
 			nvenet_msi_disable);
 
+static int __devinit ht_check_msi_mapping(struct pci_dev *dev)
+{
+	int pos, ttl = 48;
+	int found = 0;
+
+	/* check if there is HT MSI cap or enabled on this device */
+	pos = pci_find_ht_capability(dev, HT_CAPTYPE_MSI_MAPPING);
+	while (pos && ttl--) {
+		u8 flags;
+
+		if (found < 1)
+			found = 1;
+		if (pci_read_config_byte(dev, pos + HT_MSI_FLAGS,
+					 &flags) == 0) {
+			if (flags & HT_MSI_FLAGS_ENABLE) {
+				if (found < 2) {
+					found = 2;
+					break;
+				}
+			}
+		}
+		pos = pci_find_next_ht_capability(dev, pos,
+						  HT_CAPTYPE_MSI_MAPPING);
+	}
+
+	return found;
+}
+
+static int __devinit host_bridge_with_leaf(struct pci_dev *host_bridge)
+{
+	struct pci_dev *dev;
+	int pos;
+	int i, dev_no;
+	int found = 0;
+
+	dev_no = host_bridge->devfn >> 3;
+	for (i = dev_no + 1; i < 0x20; i++) {
+		dev = pci_get_slot(host_bridge->bus, PCI_DEVFN(i, 0));
+		if (!dev)
+			continue;
+
+		/* found next host bridge ?*/
+		pos = pci_find_ht_capability(dev, HT_CAPTYPE_SLAVE);
+		if (pos != 0) {
+			pci_dev_put(dev);
+			break;
+		}
+
+		if (ht_check_msi_mapping(dev)) {
+			found = 1;
+			pci_dev_put(dev);
+			break;
+		}
+		pci_dev_put(dev);
+	}
+
+	return found;
+}
+
 static void __devinit nv_ht_enable_msi_mapping(struct pci_dev *dev)
 {
 	struct pci_dev *host_bridge;
@@ -2171,6 +2230,10 @@ static void __devinit nv_ht_enable_msi_mapping(struct pci_dev *dev)
 	if (!found)
 		return;
 
+	/* don't enable host_bridge with leaf directly here */
+	if (host_bridge == dev && host_bridge_with_leaf(host_bridge))
+		goto out;
+
 	/* root did that ! */
 	if (msi_ht_cap_enabled(host_bridge))
 		goto out;
@@ -2201,44 +2264,12 @@ static void __devinit ht_disable_msi_mapping(struct pci_dev *dev)
 	}
 }
 
-static int __devinit ht_check_msi_mapping(struct pci_dev *dev)
-{
-	int pos, ttl = 48;
-	int found = 0;
-
-	/* check if there is HT MSI cap or enabled on this device */
-	pos = pci_find_ht_capability(dev, HT_CAPTYPE_MSI_MAPPING);
-	while (pos && ttl--) {
-		u8 flags;
-
-		if (found < 1)
-			found = 1;
-		if (pci_read_config_byte(dev, pos + HT_MSI_FLAGS,
-					 &flags) == 0) {
-			if (flags & HT_MSI_FLAGS_ENABLE) {
-				if (found < 2) {
-					found = 2;
-					break;
-				}
-			}
-		}
-		pos = pci_find_next_ht_capability(dev, pos,
-						  HT_CAPTYPE_MSI_MAPPING);
-	}
-
-	return found;
-}
-
-static void __devinit nv_msi_ht_cap_quirk(struct pci_dev *dev)
+static void __devinit __nv_msi_ht_cap_quirk(struct pci_dev *dev, int all)
 {
 	struct pci_dev *host_bridge;
 	int pos;
 	int found;
 
-	/* Enabling HT MSI mapping on this device breaks MCP51 */
-	if (dev->device == 0x270)
-		return;
-
 	/* check if there is HT MSI cap or enabled on this device */
 	found = ht_check_msi_mapping(dev);
 
@@ -2262,7 +2293,10 @@ static void __devinit nv_msi_ht_cap_quirk(struct pci_dev *dev)
 		/* Host bridge is to HT */
 		if (found == 1) {
 			/* it is not enabled, try to enable it */
-			nv_ht_enable_msi_mapping(dev);
+			if (all)
+				ht_enable_msi_mapping(dev);
+			else
+				nv_ht_enable_msi_mapping(dev);
 		}
 		return;
 	}
@@ -2274,8 +2308,20 @@ static void __devinit nv_msi_ht_cap_quirk(struct pci_dev *dev)
 	/* Host bridge is not to HT, disable HT MSI mapping on this device */
 	ht_disable_msi_mapping(dev);
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, nv_msi_ht_cap_quirk);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL, PCI_ANY_ID, nv_msi_ht_cap_quirk);
+
+static void __devinit nv_msi_ht_cap_quirk_all(struct pci_dev *dev)
+{
+	return __nv_msi_ht_cap_quirk(dev, 1);
+}
+
+static void __devinit nv_msi_ht_cap_quirk_leaf(struct pci_dev *dev)
+{
+	return __nv_msi_ht_cap_quirk(dev, 0);
+}
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, nv_msi_ht_cap_quirk_leaf);
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL, PCI_ANY_ID, nv_msi_ht_cap_quirk_all);
 
 static void __devinit quirk_msi_intx_disable_bug(struct pci_dev *dev)
 {
-- 
cgit v1.2.3


From b46a0b08b8bdf6467cd2b49f520e100c72885302 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Fri, 27 Mar 2009 15:10:33 +0900
Subject: ia64/xen: fix the link error.

This patch fixes the following link error with xen_domu_defconfig.
Depending on compiler version, it doesn't link as follows.
So remove const and use __initdata for xen_iosapic_ops.

> arch/ia64/xen/xen_pv_ops.c:878: error: xen_iosapic_ops causes a section type conflict

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/xen/xen_pv_ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c
index 6c44225e7b8..bf3c74cb150 100644
--- a/arch/ia64/xen/xen_pv_ops.c
+++ b/arch/ia64/xen/xen_pv_ops.c
@@ -875,7 +875,7 @@ xen_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
 	HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
 }
 
-static const struct pv_iosapic_ops xen_iosapic_ops __initconst = {
+static struct pv_iosapic_ops xen_iosapic_ops __initdata = {
 	.pcat_compat_init = xen_pcat_compat_init,
 	.__get_irq_chip = xen_iosapic_get_irq_chip,
 
-- 
cgit v1.2.3


From 7120569c76028a6883697b7643564f0c419cfe07 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Fri, 27 Mar 2009 15:11:57 +0900
Subject: ia64: remove some warnings.

This patch removes the following warnings and related ones.
Plus some cosmetics.

arch/ia64/kernel/patch.c:112: warning: passing argument 1 of 'paravirt_fc' makes integer from pointer without a cast
arch/ia64/kernel/patch.c:135: warning: passing argument 1 of 'paravirt_fc' makes integer from pointer without a cast
arch/ia64/kernel/patch.c:166: warning: passing argument 1 of 'paravirt_fc' makes integer from pointer without a cast
arch/ia64/kernel/patch.c:202: warning: passing argument 1 of 'paravirt_fc' makes integer from pointer without a cast
arch/ia64/kernel/patch.c:220: warning: passing argument 1 of 'paravirt_fc' makes integer from pointer without a cast

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/paravirt_privop.h | 22 +++++++++++++++++++---
 arch/ia64/include/asm/xen/privop.h      |  2 +-
 arch/ia64/kernel/paravirt.c             | 22 +++++++++++++++++-----
 arch/ia64/kernel/patch.c                |  2 +-
 arch/ia64/kvm/kvm-ia64.c                |  2 +-
 arch/ia64/kvm/vcpu.c                    |  2 +-
 arch/ia64/xen/xen_pv_ops.c              |  9 +++++++--
 7 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/arch/ia64/include/asm/paravirt_privop.h b/arch/ia64/include/asm/paravirt_privop.h
index 4e40e62c4ab..3d2951130b5 100644
--- a/arch/ia64/include/asm/paravirt_privop.h
+++ b/arch/ia64/include/asm/paravirt_privop.h
@@ -33,7 +33,7 @@
  */
 
 struct pv_cpu_ops {
-	void (*fc)(unsigned long addr);
+	void (*fc)(void *addr);
 	unsigned long (*thash)(unsigned long addr);
 	unsigned long (*get_cpuid)(int index);
 	unsigned long (*get_pmd)(int index);
@@ -248,7 +248,7 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch);
 		"r15", "r16", "r17"
 
 #define PARAVIRT_REG_CLOBBERS1					\
-	"r2","r3", /*"r8",*/ "r9", "r10", "r11", "r14",	\
+	"r2","r3", /*"r8",*/ "r9", "r10", "r11", "r14",		\
 		"r15", "r16", "r17"
 
 #define PARAVIRT_REG_CLOBBERS2					\
@@ -330,6 +330,15 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch);
 		      : PARAVIRT_OP(op), "0"(__##arg1)		\
 		      : PARAVIRT_CLOBBERS1)
 
+#define PARAVIRT_BR1_VOID(op, type, arg1)			\
+	register void *__##arg1 asm ("r8") = arg1;		\
+	register unsigned long ia64_clobber asm ("r8");		\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+					  PARAVIRT_TYPE(type))	\
+		      :	"=r"(ia64_clobber)			\
+		      : PARAVIRT_OP(op), "0"(__##arg1)		\
+		      : PARAVIRT_CLOBBERS1)
+
 #define PARAVIRT_BR2(op, type, arg1, arg2)				\
 	register unsigned long __##arg1 asm ("r8") = arg1;		\
 	register unsigned long __##arg2 asm ("r9") = arg2;		\
@@ -357,6 +366,13 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch);
 		return ia64_intri_res;			\
 	}
 
+#define PARAVIRT_DEFINE_CPU_OP1_VOID(op, type)		\
+	static inline void				\
+	paravirt_ ## op (void *arg1)			\
+	{						\
+		PARAVIRT_BR1_VOID(op, type, arg1);	\
+	}
+
 #define PARAVIRT_DEFINE_CPU_OP1(op, type)		\
 	static inline void				\
 	paravirt_ ## op (unsigned long arg1)		\
@@ -381,7 +397,7 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch);
 	}
 
 
-PARAVIRT_DEFINE_CPU_OP1(fc, FC);
+PARAVIRT_DEFINE_CPU_OP1_VOID(fc, FC);
 PARAVIRT_DEFINE_CPU_OP1_RET(thash, THASH)
 PARAVIRT_DEFINE_CPU_OP1_RET(get_cpuid, GET_CPUID)
 PARAVIRT_DEFINE_CPU_OP1_RET(get_pmd, GET_PMD)
diff --git a/arch/ia64/include/asm/xen/privop.h b/arch/ia64/include/asm/xen/privop.h
index e5fbaeeb161..fb4ec5e0b06 100644
--- a/arch/ia64/include/asm/xen/privop.h
+++ b/arch/ia64/include/asm/xen/privop.h
@@ -69,7 +69,7 @@
  *  may have different semantics depending on whether they are executed
  *  at PL0 vs PL!=0.  When paravirtualized, these instructions mustn't
  *  be allowed to execute directly, lest incorrect semantics result. */
-extern void xen_fc(unsigned long addr);
+extern void xen_fc(void *addr);
 extern unsigned long xen_thash(unsigned long addr);
 
 /* Note that "ttag" and "cover" are also privilege-sensitive; "ttag"
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c
index 158d52414e9..a21d7bb9c69 100644
--- a/arch/ia64/kernel/paravirt.c
+++ b/arch/ia64/kernel/paravirt.c
@@ -70,7 +70,14 @@ struct pv_init_ops pv_init_ops =
 	ia64_native_ ## name ## _func(unsigned long arg)	\
 	{							\
 		ia64_native_ ## name(arg);			\
-	}							\
+	}
+
+#define DEFINE_VOID_FUNC1_VOID(name)				\
+	static void						\
+	ia64_native_ ## name ## _func(void *arg)		\
+	{							\
+		ia64_native_ ## name(arg);			\
+	}
 
 #define DEFINE_VOID_FUNC2(name)					\
 	static void						\
@@ -78,7 +85,7 @@ struct pv_init_ops pv_init_ops =
 				      unsigned long arg1)	\
 	{							\
 		ia64_native_ ## name(arg0, arg1);		\
-	}							\
+	}
 
 #define DEFINE_FUNC0(name)			\
 	static unsigned long			\
@@ -94,7 +101,7 @@ struct pv_init_ops pv_init_ops =
 		return ia64_native_ ## name(arg);	\
 	}						\
 
-DEFINE_VOID_FUNC1(fc);
+DEFINE_VOID_FUNC1_VOID(fc);
 DEFINE_VOID_FUNC1(intrin_local_irq_restore);
 
 DEFINE_VOID_FUNC2(ptcga);
@@ -308,6 +315,11 @@ ia64_native_setreg_func(int regnum, unsigned long val)
 	ia64_native_ ## name ## _func(unsigned long arg);	\
 	__DEFINE_FUNC(name, code)
 
+#define DEFINE_VOID_FUNC1_VOID(name, code)			\
+	extern void						\
+	ia64_native_ ## name ## _func(void *arg);		\
+	__DEFINE_FUNC(name, code)
+
 #define DEFINE_VOID_FUNC2(name, code)				\
 	extern void						\
 	ia64_native_ ## name ## _func(unsigned long arg0,	\
@@ -324,8 +336,8 @@ ia64_native_setreg_func(int regnum, unsigned long val)
 	ia64_native_ ## name ## _func(type arg);	\
 	__DEFINE_FUNC(name, code)
 
-DEFINE_VOID_FUNC1(fc,
-		  "fc r8\n");
+DEFINE_VOID_FUNC1_VOID(fc,
+		       "fc r8\n");
 DEFINE_VOID_FUNC1(intrin_local_irq_restore,
 		  ";;\n"
 		  "     cmp.ne p6, p7 = r8, r0\n"
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c
index 64c6f95daa3..68a1311db80 100644
--- a/arch/ia64/kernel/patch.c
+++ b/arch/ia64/kernel/patch.c
@@ -249,7 +249,7 @@ void ia64_patch_phys_stack_reg(unsigned long val)
 	while (offp < end) {
 		ip = (u64) offp + *offp;
 		ia64_patch(ip, mask, imm);
-		ia64_fc(ip);
+		ia64_fc((void *)ip);
 		++offp;
 	}
 	ia64_sync_i();
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 28f982045f2..0344c666448 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -70,7 +70,7 @@ static void kvm_flush_icache(unsigned long start, unsigned long len)
 	int l;
 
 	for (l = 0; l < (len + 32); l += 32)
-		ia64_fc(start + l);
+		ia64_fc((void *)(start + l));
 
 	ia64_sync_i();
 	ia64_srlz_i();
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index ecd526b5532..6839a52f7a4 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -390,7 +390,7 @@ void set_rse_reg(struct kvm_pt_regs *regs, unsigned long r1,
 		else
 			*rnat_addr = (*rnat_addr) & (~nat_mask);
 
-		ia64_setreg(_IA64_REG_AR_BSPSTORE, bspstore);
+		ia64_setreg(_IA64_REG_AR_BSPSTORE, (unsigned long)bspstore);
 		ia64_setreg(_IA64_REG_AR_RNAT, rnat);
 	}
 	local_irq_restore(psr);
diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c
index bf3c74cb150..5e2270a999f 100644
--- a/arch/ia64/xen/xen_pv_ops.c
+++ b/arch/ia64/xen/xen_pv_ops.c
@@ -416,6 +416,11 @@ xen_intrin_local_irq_restore(unsigned long mask)
 	xen_ ## name (unsigned long arg);	\
 	__DEFINE_FUNC(name, code)
 
+#define DEFINE_VOID_FUNC1_VOID(name, code)	\
+	extern void				\
+	xen_ ## name (void *arg);		\
+	__DEFINE_FUNC(name, code)
+
 #define DEFINE_VOID_FUNC2(name, code)		\
 	extern void				\
 	xen_ ## name (unsigned long arg0,	\
@@ -530,8 +535,8 @@ DEFINE_FUNC0(get_itc,
 	     "(p6) hint @pause\n"
 	     "(p6) br.cond.spnt 888b\n");
 
-DEFINE_VOID_FUNC1(fc,
-		  "break " __stringify(HYPERPRIVOP_FC) "\n");
+DEFINE_VOID_FUNC1_VOID(fc,
+		       "break " __stringify(HYPERPRIVOP_FC) "\n");
 
 /*
  * psr_i_addr_addr = XEN_PSR_I_ADDR_ADDR
-- 
cgit v1.2.3


From fe2c8191faa29d7a09f4962198f6dfab973ceec4 Mon Sep 17 00:00:00 2001
From: Thiemo Nagel <thiemo.nagel@ph.tum.de>
Date: Tue, 31 Mar 2009 08:36:10 -0400
Subject: ext4: add checks of block references for non-extent inodes

Check block references in the inode and indorect blocks for non-extent
inodes to make sure they are valid, and flag an error if they are
invalid.

Signed-off-by: Thiemo Nagel <thiemo.nagel@ph.tum.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 52 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c7d9250fbc3..b3fd65d4650 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
 	return n;
 }
 
+static int __ext4_check_blockref(const char *function, struct inode *inode,
+				 unsigned int *p, unsigned int max) {
+
+	unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+	unsigned int *bref = p;
+	while (bref < p+max) {
+		if (unlikely(*bref >= maxblocks)) {
+			ext4_error(inode->i_sb, function,
+				   "block reference %u >= max (%u) "
+				   "in inode #%lu, offset=%d",
+				   *bref, maxblocks,
+				   inode->i_ino, (int)(bref-p));
+ 			return -EIO;
+ 		}
+		bref++;
+ 	}
+ 	return 0;
+}
+
+
+#define ext4_check_indirect_blockref(inode, bh)                         \
+        __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
+			      EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_check_inode_blockref(inode)                                \
+        __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
+			      EXT4_NDIR_BLOCKS)
+
 /**
  *	ext4_get_branch - read the chain of indirect blocks leading to data
  *	@inode: inode in question
@@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 	if (!p->key)
 		goto no_block;
 	while (--depth) {
-		bh = sb_bread(sb, le32_to_cpu(p->key));
-		if (!bh)
+		bh = sb_getblk(sb, le32_to_cpu(p->key));
+		if (unlikely(!bh))
 			goto failure;
+                  
+		if (!bh_uptodate_or_lock(bh)) {
+			if (bh_submit_read(bh) < 0) {
+				put_bh(bh);
+				goto failure;
+			}
+			/* validate block references */
+			if (ext4_check_indirect_blockref(inode, bh)) {
+				put_bh(bh);
+				goto failure;
+			}
+		}
+		
 		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
 		/* Reader: end */
 		if (!p->key)
@@ -4371,11 +4412,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	if (ei->i_flags & EXT4_EXTENTS_FL) {
 		/* Validate extent which is part of inode */
 		ret = ext4_ext_check_inode(inode);
-		if (ret) {
-			brelse(bh);
-			goto bad_inode;
-		}
-
+ 	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		   (S_ISLNK(inode->i_mode) &&
+		    !ext4_inode_is_fast_symlink(inode))) {
+	 	/* Validate block references which are part of inode */
+		ret = ext4_check_inode_blockref(inode);
+	}
+	if (ret) {
+ 		brelse(bh);
+ 		goto bad_inode;
 	}
 
 	if (S_ISREG(inode->i_mode)) {
-- 
cgit v1.2.3


From cc0fb9ad7dbc5a149f4957a0dd6d65881d3d385b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 27 Mar 2009 17:16:58 -0400
Subject: ext4: Rename pa_linear to pa_type

Impact: code cleanup

This patch rename pa_linear to pa_type and add MB_INODE_PA
and MB_GROUP_PA to indicate inode and group prealloc space.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 21 ++++++++++++---------
 fs/ext4/mballoc.h |  7 +++++--
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e72c72a0b80..7f6fc41a2dd 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3555,8 +3555,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	spin_unlock(&pa->pa_lock);
 
 	grp_blk = pa->pa_pstart;
-	/* If linear, pa_pstart may be in the next group when pa is used up */
-	if (pa->pa_linear)
+	/* 
+	 * If doing group-based preallocation, pa_pstart may be in the
+	 * next group when pa is used up
+	 */
+	if (pa->pa_type == MB_GROUP_PA)
 		grp_blk--;
 
 	ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
@@ -3651,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 	INIT_LIST_HEAD(&pa->pa_inode_list);
 	INIT_LIST_HEAD(&pa->pa_group_list);
 	pa->pa_deleted = 0;
-	pa->pa_linear = 0;
+	pa->pa_type = MB_INODE_PA;
 
 	mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
 			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3714,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	INIT_LIST_HEAD(&pa->pa_inode_list);
 	INIT_LIST_HEAD(&pa->pa_group_list);
 	pa->pa_deleted = 0;
-	pa->pa_linear = 1;
+	pa->pa_type = MB_GROUP_PA;
 
 	mb_debug("new group pa %p: %llu/%u for %u\n", pa,
 		 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3968,7 +3971,7 @@ repeat:
 		list_del_rcu(&pa->pa_inode_list);
 		spin_unlock(pa->pa_obj_lock);
 
-		if (pa->pa_linear)
+		if (pa->pa_type == MB_GROUP_PA)
 			ext4_mb_release_group_pa(&e4b, pa, ac);
 		else
 			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
@@ -4068,7 +4071,7 @@ repeat:
 	spin_unlock(&ei->i_prealloc_lock);
 
 	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
-		BUG_ON(pa->pa_linear != 0);
+		BUG_ON(pa->pa_type != MB_INODE_PA);
 		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
 
 		err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -4320,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 			continue;
 		}
 		/* only lg prealloc space */
-		BUG_ON(!pa->pa_linear);
+		BUG_ON(pa->pa_type != MB_GROUP_PA);
 
 		/* seems this one can be freed ... */
 		pa->pa_deleted = 1;
@@ -4426,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
 	struct ext4_prealloc_space *pa = ac->ac_pa;
 	if (pa) {
-		if (pa->pa_linear) {
+		if (pa->pa_type == MB_GROUP_PA) {
 			/* see comment in ext4_mb_use_group_pa() */
 			spin_lock(&pa->pa_lock);
 			pa->pa_pstart += ac->ac_b_ex.fe_len;
@@ -4446,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 		 * doesn't grow big.  We need to release
 		 * alloc_semp before calling ext4_mb_add_n_trim()
 		 */
-		if (pa->pa_linear && likely(pa->pa_free)) {
+		if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
 			spin_lock(pa->pa_obj_lock);
 			list_del_rcu(&pa->pa_inode_list);
 			spin_unlock(pa->pa_obj_lock);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 7acf5ef2453..dd9e6cd5f6c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -132,12 +132,15 @@ struct ext4_prealloc_space {
 	ext4_lblk_t		pa_lstart;	/* log. block */
 	unsigned short		pa_len;		/* len of preallocated chunk */
 	unsigned short		pa_free;	/* how many blocks are free */
-	unsigned short		pa_linear;	/* consumed in one direction
-						 * strictly, for grp prealloc */
+	unsigned short		pa_type;	/* pa type. inode or group */
 	spinlock_t		*pa_obj_lock;
 	struct inode		*pa_inode;	/* hack, for history only */
 };
 
+enum {
+	MB_INODE_PA = 0,
+	MB_GROUP_PA = 1
+};
 
 struct ext4_free_extent {
 	ext4_lblk_t fe_logical;
-- 
cgit v1.2.3


From 86db97c87f744364d5889ca8a4134ca2048b8f83 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 27 Mar 2009 17:20:40 -0400
Subject: jbd2: Update locking coments

Update information about locking in JBD2 revoke code. Inconsistency in
comments found by Lin Tan <tammy000@gmail.com>.

CC: Lin Tan <tammy000@gmail.com>.
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/revoke.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 257ff262576..bbe6d592d8b 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -55,6 +55,25 @@
  *			need do nothing.
  * RevokeValid set, Revoked set:
  *			buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table.  Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, also replay code uses the hash tables but at this moment noone else
+ * can touch them (filesystem isn't mounted yet) and hence no locking is
+ * needed.
  */
 
 #ifndef __KERNEL__
@@ -401,8 +420,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
  * the second time we would still have a pending revoke to cancel.  So,
  * do not trust the Revoked bit on buffers unless RevokeValid is also
  * set.
- *
- * The caller must have the journal locked.
  */
 int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 {
@@ -480,10 +497,7 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
 /*
  * Write revoke records to the journal for all entries in the current
  * revoke hash, deleting the entries as we go.
- *
- * Called with the journal lock held.
  */
-
 void jbd2_journal_write_revoke_records(journal_t *journal,
 				  transaction_t *transaction)
 {
-- 
cgit v1.2.3


From a7b19448ddbdc34b2b8fedc048ba154ca798667b Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Fri, 27 Mar 2009 19:42:54 -0400
Subject: ext4: fix typo which causes a memory leak on error path

This was found by smatch (http://repo.or.cz/w/smatch.git/)

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 7f6fc41a2dd..5f3e3a3a38d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2699,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
-		kfree(sbi->s_mb_maxs);
+		kfree(sbi->s_mb_offsets);
 		return -ENOMEM;
 	}
 
-- 
cgit v1.2.3


From e7c9e3e99adf6c49c5d593a51375916acc039d1e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 27 Mar 2009 19:43:21 -0400
Subject: ext4: fix locking typo in mballoc which could cause soft lockup hangs

Smatch (http://repo.or.cz/w/smatch.git/) complains about the locking in
ext4_mb_add_n_trim() from fs/ext4/mballoc.c

  4438          list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
  4439                                                  pa_inode_list) {
  4440                  spin_lock(&tmp_pa->pa_lock);
  4441                  if (tmp_pa->pa_deleted) {
  4442                          spin_unlock(&pa->pa_lock);
  4443                          continue;
  4444                  }

Brown paper bag time...

Reported-by: Dan Carpenter <error27@gmail.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5f3e3a3a38d..f871677a798 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4392,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
 						pa_inode_list) {
 		spin_lock(&tmp_pa->pa_lock);
 		if (tmp_pa->pa_deleted) {
-			spin_unlock(&pa->pa_lock);
+			spin_unlock(&tmp_pa->pa_lock);
 			continue;
 		}
 		if (!added && pa->pa_free < tmp_pa->pa_free) {
-- 
cgit v1.2.3


From 06705bff9114531a997a7d0c2520bea0f2927410 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 28 Mar 2009 10:59:57 -0400
Subject: ext4: Regularize mount options

Add support for using the mount options "barrier" and "nobarrier", and
"auto_da_alloc" and "noauto_da_alloc", which is more consistent than
"barrier=<0|1>" or "auto_da_alloc=<0|1>".  Most other ext3/ext4 mount
options use the foo/nofoo naming convention.  We allow the old forms
of these mount options for backwards compatibility.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 Documentation/filesystems/ext4.txt | 25 +++++++++++++++++++++++--
 fs/ext4/super.c                    | 30 ++++++++++++++++++++++--------
 2 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 5c484aec2ba..97882df0486 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -183,8 +183,8 @@ commit=nrsec	(*)	Ext4 can be told to sync all its data and metadata
 			performance.
 
 barrier=<0|1(*)>	This enables/disables the use of write barriers in
-			the jbd code.  barrier=0 disables, barrier=1 enables.
-			This also requires an IO stack which can support
+barrier(*)		the jbd code.  barrier=0 disables, barrier=1 enables.
+nobarrier		This also requires an IO stack which can support
 			barriers, and if jbd gets an error on a barrier
 			write, it will disable again with a warning.
 			Write barriers enforce proper on-disk ordering
@@ -192,6 +192,9 @@ barrier=<0|1(*)>	This enables/disables the use of write barriers in
 			safe to use, at some performance penalty.  If
 			your disks are battery-backed in one way or another,
 			disabling barriers may safely improve performance.
+			The mount options "barrier" and "nobarrier" can
+			also be used to enable or disable barriers, for
+			consistency with other ext4 mount options.
 
 inode_readahead=n	This tuning parameter controls the maximum
 			number of inode table blocks that ext4's inode
@@ -313,6 +316,24 @@ journal_ioprio=prio	The I/O priority (from 0 to 7, where 0 is the
 			a slightly higher priority than the default I/O
 			priority.
 
+auto_da_alloc(*)	Many broken applications don't use fsync() when 
+noauto_da_alloc		replacing existing files via patterns such as
+			fd = open("foo.new")/write(fd,..)/close(fd)/
+			rename("foo.new", "foo"), or worse yet,
+			fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
+			If auto_da_alloc is enabled, ext4 will detect
+			the replace-via-rename and replace-via-truncate
+			patterns and force that any delayed allocation
+			blocks are allocated such that at the next
+			journal commit, in the default data=ordered
+			mode, the data blocks of the new file are forced
+			to disk before the rename() operation is
+			commited.  This provides roughly the same level
+			of guarantees as ext3, and avoids the
+			"zero-length" problem that can happen when a
+			system crashes before the delayed allocation
+			blocks are forced to disk.
+
 Data Mode
 =========
 There are 3 different data modes:
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 45d07cf9df3..9987bba99db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -867,7 +867,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",data_err=abort");
 
 	if (test_opt(sb, NO_AUTO_DA_ALLOC))
-		seq_puts(seq, ",auto_da_alloc=0");
+		seq_puts(seq, ",noauto_da_alloc");
 
 	ext4_show_quota_options(seq, sb);
 	return 0;
@@ -1018,7 +1018,7 @@ enum {
 	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
 	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
-	Opt_auto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
+	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
 	Opt_journal_update, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
@@ -1026,8 +1026,8 @@ enum {
 	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
-	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota, Opt_i_version,
+	Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
+	Opt_usrquota, Opt_grpquota, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 	Opt_inode_readahead_blks, Opt_journal_ioprio
 };
@@ -1080,6 +1080,8 @@ static const match_table_t tokens = {
 	{Opt_quota, "quota"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
 	{Opt_i_version, "i_version"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_resize, "resize"},
@@ -1088,6 +1090,8 @@ static const match_table_t tokens = {
 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_journal_ioprio, "journal_ioprio=%u"},
 	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
+	{Opt_auto_da_alloc, "auto_da_alloc"},
+	{Opt_noauto_da_alloc, "noauto_da_alloc"},
 	{Opt_err, NULL},
 };
 
@@ -1422,9 +1426,14 @@ set_qf_format:
 		case Opt_abort:
 			set_opt(sbi->s_mount_opt, ABORT);
 			break;
+		case Opt_nobarrier:
+			clear_opt(sbi->s_mount_opt, BARRIER);
+			break;
 		case Opt_barrier:
-			if (match_int(&args[0], &option))
-				return 0;
+			if (match_int(&args[0], &option)) {
+				set_opt(sbi->s_mount_opt, BARRIER);
+				break;
+			}
 			if (option)
 				set_opt(sbi->s_mount_opt, BARRIER);
 			else
@@ -1485,9 +1494,14 @@ set_qf_format:
 			*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
 							    option);
 			break;
+		case Opt_noauto_da_alloc:
+			set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+			break;
 		case Opt_auto_da_alloc:
-			if (match_int(&args[0], &option))
-				return 0;
+			if (match_int(&args[0], &option)) {
+				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+				break;
+			}
 			if (option)
 				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
 			else
-- 
cgit v1.2.3


From 01522df346f846906eaf6ca57148641476209909 Mon Sep 17 00:00:00 2001
From: "Michael K. Johnson" <johnsonm@rpath.com>
Date: Fri, 27 Mar 2009 13:14:41 -0400
Subject: x86, setup: mark %esi as clobbered in E820 BIOS call

Jordan Hargrave diagnosed a BIOS clobbering %esi in the E820 call.
That particular BIOS has been fixed, but there is a possibility that
this is responsible for other occasional reports of early boot
failure, and it does not hurt to add %esi to the clobbers.

-stable candidate patch.

Cc: Justin Forbes <jmforbes@linuxtx.org>
Signed-off-by: Michael K Johnson <johnsonm@rpath.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: stable@kernel.org
---
 arch/x86/boot/memory.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 8c3c25f3557..a99dbbe77a0 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -27,13 +27,14 @@ static int detect_memory_e820(void)
 	do {
 		size = sizeof(struct e820entry);
 
-		/* Important: %edx is clobbered by some BIOSes,
-		   so it must be either used for the error output
+		/* Important: %edx and %esi are clobbered by some BIOSes,
+		   so they must be either used for the error output
 		   or explicitly marked clobbered. */
 		asm("int $0x15; setc %0"
 		    : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
 		      "=m" (*desc)
-		    : "D" (desc), "d" (SMAP), "a" (0xe820));
+		    : "D" (desc), "d" (SMAP), "a" (0xe820)
+		    : "esi");
 
 		/* BIOSes which terminate the chain with CF = 1 as opposed
 		   to %ebx = 0 don't always report the SMAP signature on
-- 
cgit v1.2.3


From 776bd5c7a207de546918f805090bfc823d2660c8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:45:28 -0400
Subject: SUNRPC: Don't flag empty RPCB_GETADDR reply as bogus

In 2007, commit e65fe3976f594603ed7b1b4a99d3e9b867f573ea added
additional sanity checking to rpcb_decode_getaddr() to make sure we
were getting a reply that was long enough to be an actual universal
address.  If the uaddr string isn't long enough, the XDR decoder
returns EIO.

However, an empty string is a valid RPCB_GETADDR response if the
requested service isn't registered.  Moreover, "::.n.m" is also a
valid RPCB_GETADDR response for IPv6 addresses that is shorter
than rpcb_decode_getaddr()'s lower limit of 11.  So this sanity
check introduced a regression for rpcbind requests against IPv6
remotes.

So revert the lower bound check added by commit
e65fe3976f594603ed7b1b4a99d3e9b867f573ea, and add an explicit check
for an empty uaddr string, similar to libtirpc's rpcb_getaddr(3).

Pointed-out-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 03ae007641e..2caa7edeeab 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -703,11 +703,16 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
 	*portp = 0;
 	addr_len = ntohl(*p++);
 
+	if (addr_len == 0) {
+		dprintk("RPC:       rpcb_decode_getaddr: "
+					"service is not registered\n");
+		return 0;
+	}
+
 	/*
-	 * Simple sanity check.  The smallest possible universal
-	 * address is an IPv4 address string containing 11 bytes.
+	 * Simple sanity check.
 	 */
-	if (addr_len < 11 || addr_len > RPCBIND_MAXUADDRLEN)
+	if (addr_len > RPCBIND_MAXUADDRLEN)
 		goto out_err;
 
 	/*
-- 
cgit v1.2.3


From efb3288b423d7e3533a68dccecaa05a56a281a4e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:45:43 -0400
Subject: SUNRPC: Clean up static inline functions in svc_xprt.h

Clean up:  Enable the use of const arguments in higher level svc_ APIs
by adding const to the arguments of the helper functions in svc_xprt.h

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/svc_xprt.h | 46 +++++++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 0127daca435..959b931b605 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -88,29 +88,32 @@ static inline void svc_xprt_get(struct svc_xprt *xprt)
 	kref_get(&xprt->xpt_ref);
 }
 static inline void svc_xprt_set_local(struct svc_xprt *xprt,
-				      struct sockaddr *sa, int salen)
+				      const struct sockaddr *sa,
+				      const size_t salen)
 {
 	memcpy(&xprt->xpt_local, sa, salen);
 	xprt->xpt_locallen = salen;
 }
 static inline void svc_xprt_set_remote(struct svc_xprt *xprt,
-				       struct sockaddr *sa, int salen)
+				       const struct sockaddr *sa,
+				       const size_t salen)
 {
 	memcpy(&xprt->xpt_remote, sa, salen);
 	xprt->xpt_remotelen = salen;
 }
-static inline unsigned short svc_addr_port(struct sockaddr *sa)
+static inline unsigned short svc_addr_port(const struct sockaddr *sa)
 {
-	unsigned short ret = 0;
+	const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
+	const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa;
+
 	switch (sa->sa_family) {
 	case AF_INET:
-		ret = ntohs(((struct sockaddr_in *)sa)->sin_port);
-		break;
+		return ntohs(sin->sin_port);
 	case AF_INET6:
-		ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port);
-		break;
+		return ntohs(sin6->sin6_port);
 	}
-	return ret;
+
+	return 0;
 }
 
 static inline size_t svc_addr_len(struct sockaddr *sa)
@@ -124,36 +127,39 @@ static inline size_t svc_addr_len(struct sockaddr *sa)
 	return -EAFNOSUPPORT;
 }
 
-static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt)
+static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt)
 {
-	return svc_addr_port((struct sockaddr *)&xprt->xpt_local);
+	return svc_addr_port((const struct sockaddr *)&xprt->xpt_local);
 }
 
-static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt)
+static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt)
 {
-	return svc_addr_port((struct sockaddr *)&xprt->xpt_remote);
+	return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote);
 }
 
-static inline char *__svc_print_addr(struct sockaddr *addr,
-				     char *buf, size_t len)
+static inline char *__svc_print_addr(const struct sockaddr *addr,
+				     char *buf, const size_t len)
 {
+	const struct sockaddr_in *sin = (const struct sockaddr_in *)addr;
+	const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr;
+
 	switch (addr->sa_family) {
 	case AF_INET:
-		snprintf(buf, len, "%pI4, port=%u",
-			&((struct sockaddr_in *)addr)->sin_addr,
-			ntohs(((struct sockaddr_in *) addr)->sin_port));
+		snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr,
+			ntohs(sin->sin_port));
 		break;
 
 	case AF_INET6:
 		snprintf(buf, len, "%pI6, port=%u",
-			 &((struct sockaddr_in6 *)addr)->sin6_addr,
-			ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
+			 &sin6->sin6_addr,
+			ntohs(sin6->sin6_port));
 		break;
 
 	default:
 		snprintf(buf, len, "unknown address type: %d", addr->sa_family);
 		break;
 	}
+
 	return buf;
 }
 #endif /* SUNRPC_SVC_XPRT_H */
-- 
cgit v1.2.3


From adbbe929569e6eec8ff9feca23f1f2b40b42853d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:45:51 -0400
Subject: NFSD: If port value written to /proc/fs/nfsd/portlist is invalid,
 return EINVAL

Make sure port value read from user space by write_ports is valid before
passing it to svc_find_xprt().  If it wasn't, the writer would get ENOENT
instead of EINVAL.

Noticed-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/nfsctl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3d93b2064ce..5a936c14f6f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -938,6 +938,8 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 		char transport[16];
 		int port;
 		if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
+			if (port < 1 || port > 65535)
+				return -EINVAL;
 			err = nfsd_create_serv();
 			if (!err) {
 				err = svc_create_xprt(nfsd_serv,
@@ -960,7 +962,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 		char transport[16];
 		int port;
 		if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
-			if (port == 0)
+			if (port < 1 || port > 65535)
 				return -EINVAL;
 			if (nfsd_serv) {
 				xprt = svc_find_xprt(nfsd_serv, transport,
-- 
cgit v1.2.3


From 156e62094a74cf43f02f56ef96b6cda567501357 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:45:58 -0400
Subject: SUNRPC: Clean up svc_find_xprt() calling sequence

Clean up: add documentating comment and use appropriate data types for
svc_find_xprt()'s arguments.

This also eliminates a mixed sign comparison: @port was an int, while
the return value of svc_xprt_local_port() is an unsigned short.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/svc_xprt.h |  3 ++-
 net/sunrpc/svc_xprt.c           | 16 +++++++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 959b931b605..55b68582c5d 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -80,7 +80,8 @@ void	svc_close_xprt(struct svc_xprt *xprt);
 void	svc_delete_xprt(struct svc_xprt *xprt);
 int	svc_port_is_privileged(struct sockaddr *sin);
 int	svc_print_xprts(char *buf, int maxlen);
-struct	svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int);
+struct	svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
+			const sa_family_t af, const unsigned short port);
 int	svc_xprt_names(struct svc_serv *serv, char *buf, int buflen);
 
 static inline void svc_xprt_get(struct svc_xprt *xprt)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e588df5d6b3..c947c93dbc2 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -1033,7 +1033,13 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
 	return dr;
 }
 
-/*
+/**
+ * svc_find_xprt - find an RPC transport instance
+ * @serv: pointer to svc_serv to search
+ * @xcl_name: C string containing transport's class name
+ * @af: Address family of transport's local address
+ * @port: transport's IP port number
+ *
  * Return the transport instance pointer for the endpoint accepting
  * connections/peer traffic from the specified transport class,
  * address family and port.
@@ -1042,14 +1048,14 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
  * wild-card, and will result in matching the first transport in the
  * service's list that has a matching class name.
  */
-struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name,
-			       int af, int port)
+struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
+			       const sa_family_t af, const unsigned short port)
 {
 	struct svc_xprt *xprt;
 	struct svc_xprt *found = NULL;
 
 	/* Sanity check the args */
-	if (!serv || !xcl_name)
+	if (serv == NULL || xcl_name == NULL)
 		return found;
 
 	spin_lock_bh(&serv->sv_lock);
@@ -1058,7 +1064,7 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name,
 			continue;
 		if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family)
 			continue;
-		if (port && port != svc_xprt_local_port(xprt))
+		if (port != 0 && port != svc_xprt_local_port(xprt))
 			continue;
 		found = xprt;
 		svc_xprt_get(xprt);
-- 
cgit v1.2.3


From 4b62e58cccff9c5e7ffc7023f7ec24c75fbd549b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:46:06 -0400
Subject: SUNRPC: Pass a family argument to svc_register()

The sv_family field is going away.  Instead of using sv_family, have
the svc_register() function take a protocol family argument.

Since this argument represents a protocol family, and not an address
family, this argument takes an int, as this is what is passed to
sock_create_kern().  Also make sure svc_register's helpers are
checking for PF_FOO instead of AF_FOO.  The value of [AP]F_FOO are
equivalent; this is simply a symbolic change to reflect the semantics
of the value stored in that variable.

sock_create_kern() should return EPFNOSUPPORT if the passed-in
protocol family isn't supported, but it uses EAFNOSUPPORT for this
case.  We will stick with that tradition here, as svc_register()
is called by the RPC server in the same path as sock_create_kern().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/svc.h |  4 ++--
 net/sunrpc/svc.c           | 21 +++++++++++----------
 net/sunrpc/svcsock.c       |  2 +-
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 3435d24bfe5..1f18fc728cb 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -396,8 +396,8 @@ struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 void		   svc_destroy(struct svc_serv *);
 int		   svc_process(struct svc_rqst *);
-int		   svc_register(const struct svc_serv *, const unsigned short,
-				const unsigned short);
+int		   svc_register(const struct svc_serv *, const int,
+				const unsigned short, const unsigned short);
 
 void		   svc_wake_up(struct svc_serv *);
 void		   svc_reserve(struct svc_rqst *rqstp, int space);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c51fed4d1af..41bc36ea222 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -800,17 +800,17 @@ static int __svc_rpcb_register6(const u32 program, const u32 version,
  * if any error occurs.
  */
 static int __svc_register(const u32 program, const u32 version,
-			  const sa_family_t family,
+			  const int family,
 			  const unsigned short protocol,
 			  const unsigned short port)
 {
 	int error;
 
 	switch (family) {
-	case AF_INET:
+	case PF_INET:
 		return __svc_rpcb_register4(program, version,
 						protocol, port);
-	case AF_INET6:
+	case PF_INET6:
 		error = __svc_rpcb_register6(program, version,
 						protocol, port);
 		if (error < 0)
@@ -840,11 +840,11 @@ static int __svc_register(const u32 program, const u32 version,
  * if any error occurs.
  */
 static int __svc_register(const u32 program, const u32 version,
-			  sa_family_t family,
+			  const int family,
 			  const unsigned short protocol,
 			  const unsigned short port)
 {
-	if (family != AF_INET)
+	if (family != PF_INET)
 		return -EAFNOSUPPORT;
 
 	return rpcb_register(program, version, protocol, port);
@@ -855,13 +855,14 @@ static int __svc_register(const u32 program, const u32 version,
 /**
  * svc_register - register an RPC service with the local portmapper
  * @serv: svc_serv struct for the service to register
+ * @family: protocol family of service's listener socket
  * @proto: transport protocol number to advertise
  * @port: port to advertise
  *
- * Service is registered for any address in serv's address family
+ * Service is registered for any address in the passed-in protocol family
  */
-int svc_register(const struct svc_serv *serv, const unsigned short proto,
-		 const unsigned short port)
+int svc_register(const struct svc_serv *serv, const int family,
+		 const unsigned short proto, const unsigned short port)
 {
 	struct svc_program	*progp;
 	unsigned int		i;
@@ -879,7 +880,7 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto,
 					i,
 					proto == IPPROTO_UDP?  "udp" : "tcp",
 					port,
-					serv->sv_family,
+					family,
 					progp->pg_vers[i]->vs_hidden?
 						" (but not telling portmap)" : "");
 
@@ -887,7 +888,7 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto,
 				continue;
 
 			error = __svc_register(progp->pg_prog, i,
-						serv->sv_family, proto, port);
+						family, proto, port);
 			if (error < 0)
 				break;
 		}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 5763e6460fe..d00583c1cd0 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1122,7 +1122,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 
 	/* Register socket with portmapper */
 	if (*errp >= 0 && pmap_register)
-		*errp = svc_register(serv, inet->sk_protocol,
+		*errp = svc_register(serv, serv->sv_family, inet->sk_protocol,
 				     ntohs(inet_sk(inet)->sport));
 
 	if (*errp < 0) {
-- 
cgit v1.2.3


From baf01caf09e87579c2d157e5ee29975db8551522 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:46:13 -0400
Subject: SUNRPC: svc_setup_socket() gets protocol family from socket

Since the sv_family field is going away, modify svc_setup_socket() to
extract the protocol family from the passed-in socket instead of from
the passed-in svc_serv struct.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/svcsock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index d00583c1cd0..d00bc330774 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1122,7 +1122,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 
 	/* Register socket with portmapper */
 	if (*errp >= 0 && pmap_register)
-		*errp = svc_register(serv, serv->sv_family, inet->sk_protocol,
+		*errp = svc_register(serv, inet->sk_family, inet->sk_protocol,
 				     ntohs(inet_sk(inet)->sport));
 
 	if (*errp < 0) {
@@ -1145,13 +1145,13 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 
 	/*
 	 * We start one listener per sv_serv.  We want AF_INET
-	 * requests to be automatically shunted to our AF_INET6
+	 * requests to be automatically shunted to our PF_INET6
 	 * listener using a mapped IPv4 address.  Make sure
 	 * no-one starts an equivalent IPv4 listener, which
 	 * would steal our incoming connections.
 	 */
 	val = 0;
-	if (serv->sv_family == AF_INET6)
+	if (inet->sk_family == PF_INET6)
 		kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
 					(char *)&val, sizeof(val));
 
-- 
cgit v1.2.3


From 9652ada3fb5914a67d8422114e8a76388330fa79 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:46:21 -0400
Subject: SUNRPC: Change svc_create_xprt() to take a @family argument

The sv_family field is going away.  Pass a protocol family argument to
svc_create_xprt() instead of extracting the family from the passed-in
svc_serv struct.

Again, as this is a listener socket and not an address, we make this
new argument an "int" protocol family, instead of an "sa_family_t."

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svc.c                  |  3 ++-
 fs/nfs/callback.c               |  4 ++--
 fs/nfsd/nfsctl.c                |  2 +-
 fs/nfsd/nfssvc.c                |  4 ++--
 include/linux/sunrpc/svc_xprt.h |  3 ++-
 net/sunrpc/svc_xprt.c           | 15 +++++++++------
 6 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 64f1c31b585..390c5593655 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -211,7 +211,8 @@ static int create_lockd_listener(struct svc_serv *serv, char *name,
 
 	xprt = svc_find_xprt(serv, name, 0, 0);
 	if (xprt == NULL)
-		return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
+		return svc_create_xprt(serv, name, nlmsvc_family,
+					port, SVC_SOCK_DEFAULTS);
 
 	svc_xprt_put(xprt);
 	return 0;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 3e634f2a108..fb35cab63c8 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -122,8 +122,8 @@ int nfs_callback_up(void)
 	if (!serv)
 		goto out_err;
 
-	ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport,
-			      SVC_SOCK_ANONYMOUS);
+	ret = svc_create_xprt(serv, "tcp", nfs_callback_family,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
 	if (ret <= 0)
 		goto out_err;
 	nfs_callback_tcpport = ret;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5a936c14f6f..a4ed8644d69 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -943,7 +943,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 			err = nfsd_create_serv();
 			if (!err) {
 				err = svc_create_xprt(nfsd_serv,
-						      transport, port,
+						      transport, PF_INET, port,
 						      SVC_SOCK_ANONYMOUS);
 				if (err == -ENOENT)
 					/* Give a reasonable perror msg for
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 07e4f5d7baa..ab7f249055b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -244,7 +244,7 @@ static int nfsd_init_socks(int port)
 	if (!list_empty(&nfsd_serv->sv_permsocks))
 		return 0;
 
-	error = svc_create_xprt(nfsd_serv, "udp", port,
+	error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
 					SVC_SOCK_DEFAULTS);
 	if (error < 0)
 		return error;
@@ -253,7 +253,7 @@ static int nfsd_init_socks(int port)
 	if (error < 0)
 		return error;
 
-	error = svc_create_xprt(nfsd_serv, "tcp", port,
+	error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
 					SVC_SOCK_DEFAULTS);
 	if (error < 0)
 		return error;
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 55b68582c5d..0d9cb6ef28b 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -71,7 +71,8 @@ int	svc_reg_xprt_class(struct svc_xprt_class *);
 void	svc_unreg_xprt_class(struct svc_xprt_class *);
 void	svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *,
 		      struct svc_serv *);
-int	svc_create_xprt(struct svc_serv *, char *, unsigned short, int);
+int	svc_create_xprt(struct svc_serv *, const char *, const int,
+			const unsigned short, int);
 void	svc_xprt_enqueue(struct svc_xprt *xprt);
 void	svc_xprt_received(struct svc_xprt *);
 void	svc_xprt_put(struct svc_xprt *xprt);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c947c93dbc2..2819ee093f3 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -161,7 +161,9 @@ EXPORT_SYMBOL_GPL(svc_xprt_init);
 
 static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
 					 struct svc_serv *serv,
-					 unsigned short port, int flags)
+					 const int family,
+					 const unsigned short port,
+					 int flags)
 {
 	struct sockaddr_in sin = {
 		.sin_family		= AF_INET,
@@ -176,12 +178,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
 	struct sockaddr *sap;
 	size_t len;
 
-	switch (serv->sv_family) {
-	case AF_INET:
+	switch (family) {
+	case PF_INET:
 		sap = (struct sockaddr *)&sin;
 		len = sizeof(sin);
 		break;
-	case AF_INET6:
+	case PF_INET6:
 		sap = (struct sockaddr *)&sin6;
 		len = sizeof(sin6);
 		break;
@@ -192,7 +194,8 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
 	return xcl->xcl_ops->xpo_create(serv, sap, len, flags);
 }
 
-int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
+int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+		    const int family, const unsigned short port,
 		    int flags)
 {
 	struct svc_xprt_class *xcl;
@@ -209,7 +212,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
 			goto err;
 
 		spin_unlock(&svc_xprt_class_lock);
-		newxprt = __svc_xpo_create(xcl, serv, port, flags);
+		newxprt = __svc_xpo_create(xcl, serv, family, port, flags);
 		if (IS_ERR(newxprt)) {
 			module_put(xcl->xcl_owner);
 			return PTR_ERR(newxprt);
-- 
cgit v1.2.3


From 49a9072f29a1039f142ec98b44a72d7173651c02 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:46:29 -0400
Subject: SUNRPC: Remove @family argument from svc_create() and
 svc_create_pooled()

Since an RPC service listener's protocol family is specified now via
svc_create_xprt(), it no longer needs to be passed to svc_create() or
svc_create_pooled().  Remove that argument from the synopsis of those
functions, and remove the sv_family field from the svc_serv struct.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svc.c             |  2 +-
 fs/nfs/callback.c          |  3 +--
 fs/nfsd/nfssvc.c           |  1 -
 include/linux/sunrpc/svc.h |  5 ++---
 net/sunrpc/svc.c           | 11 +++++------
 5 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 390c5593655..d30920038cb 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -275,7 +275,7 @@ int lockd_up(void)
 			"lockd_up: no pid, %d users??\n", nlmsvc_users);
 
 	error = -ENOMEM;
-	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
+	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
 	if (!serv) {
 		printk(KERN_WARNING "lockd_up: create service failed\n");
 		goto out;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index fb35cab63c8..ddf4b4ae696 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -116,8 +116,7 @@ int nfs_callback_up(void)
 	mutex_lock(&nfs_callback_mutex);
 	if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
 		goto out;
-	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
-				nfs_callback_family, NULL);
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
 	ret = -ENOMEM;
 	if (!serv)
 		goto out_err;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ab7f249055b..bc3567bab8c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,7 +229,6 @@ int nfsd_create_serv(void)
 
 	atomic_set(&nfsd_busy, 0);
 	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
-				      AF_INET,
 				      nfsd_last_thread, nfsd, THIS_MODULE);
 	if (nfsd_serv == NULL)
 		err = -ENOMEM;
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 1f18fc728cb..d3a4c023193 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -69,7 +69,6 @@ struct svc_serv {
 	struct list_head	sv_tempsocks;	/* all temporary sockets */
 	int			sv_tmpcnt;	/* count of temporary sockets */
 	struct timer_list	sv_temptimer;	/* timer for aging temporary sockets */
-	sa_family_t		sv_family;	/* listener's address family */
 
 	char *			sv_name;	/* service name */
 
@@ -385,13 +384,13 @@ struct svc_procedure {
 /*
  * Function prototypes.
  */
-struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t,
+struct svc_serv *svc_create(struct svc_program *, unsigned int,
 			    void (*shutdown)(struct svc_serv *));
 struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
 					struct svc_pool *pool);
 void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
-			sa_family_t, void (*shutdown)(struct svc_serv *),
+			void (*shutdown)(struct svc_serv *),
 			svc_thread_fn, struct module *);
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 void		   svc_destroy(struct svc_serv *);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 41bc36ea222..d72ff44826d 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -359,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu)
  */
 static struct svc_serv *
 __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
-	   sa_family_t family, void (*shutdown)(struct svc_serv *serv))
+	     void (*shutdown)(struct svc_serv *serv))
 {
 	struct svc_serv	*serv;
 	unsigned int vers;
@@ -368,7 +368,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 
 	if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
 		return NULL;
-	serv->sv_family    = family;
 	serv->sv_name      = prog->pg_name;
 	serv->sv_program   = prog;
 	serv->sv_nrthreads = 1;
@@ -427,21 +426,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 
 struct svc_serv *
 svc_create(struct svc_program *prog, unsigned int bufsize,
-		sa_family_t family, void (*shutdown)(struct svc_serv *serv))
+	   void (*shutdown)(struct svc_serv *serv))
 {
-	return __svc_create(prog, bufsize, /*npools*/1, family, shutdown);
+	return __svc_create(prog, bufsize, /*npools*/1, shutdown);
 }
 EXPORT_SYMBOL_GPL(svc_create);
 
 struct svc_serv *
 svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
-		  sa_family_t family, void (*shutdown)(struct svc_serv *serv),
+		  void (*shutdown)(struct svc_serv *serv),
 		  svc_thread_fn func, struct module *mod)
 {
 	struct svc_serv *serv;
 	unsigned int npools = svc_pool_map_get();
 
-	serv = __svc_create(prog, bufsize, npools, family, shutdown);
+	serv = __svc_create(prog, bufsize, npools, shutdown);
 
 	if (serv != NULL) {
 		serv->sv_function = func;
-- 
cgit v1.2.3


From 26298caacac3e4754194b13aef377706d5de6cf6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:46:36 -0400
Subject: NFS: Revert creation of IPv6 listeners for lockd and NFSv4 callbacks

We're about to convert over to using separate PF_INET and PF_INET6
listeners, instead of a single PF_INET6 listener that also receives
AF_INET requests and maps them to AF_INET6.

Clear the way by removing the logic in lockd and the NFSv4 callback
server that creates an AF_INET6 service listener.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svc.c    | 13 +------------
 fs/nfs/callback.c | 14 ++------------
 2 files changed, 3 insertions(+), 24 deletions(-)

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index d30920038cb..566932b98fd 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -52,17 +52,6 @@ static struct task_struct	*nlmsvc_task;
 static struct svc_rqst		*nlmsvc_rqst;
 unsigned long			nlmsvc_timeout;
 
-/*
- * If the kernel has IPv6 support available, always listen for
- * both AF_INET and AF_INET6 requests.
- */
-#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
-	defined(CONFIG_SUNRPC_REGISTER_V4)
-static const sa_family_t	nlmsvc_family = AF_INET6;
-#else	/* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
-static const sa_family_t	nlmsvc_family = AF_INET;
-#endif	/* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
-
 /*
  * These can be set at insmod time (useful for NFS as root filesystem),
  * and also changed through the sysctl interface.  -- Jamie Lokier, Aug 2003
@@ -211,7 +200,7 @@ static int create_lockd_listener(struct svc_serv *serv, char *name,
 
 	xprt = svc_find_xprt(serv, name, 0, 0);
 	if (xprt == NULL)
-		return svc_create_xprt(serv, name, nlmsvc_family,
+		return svc_create_xprt(serv, name, PF_INET,
 					port, SVC_SOCK_DEFAULTS);
 
 	svc_xprt_put(xprt);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index ddf4b4ae696..0ef47dff89b 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -41,16 +41,6 @@ unsigned short nfs_callback_tcpport;
 static const int nfs_set_port_min = 0;
 static const int nfs_set_port_max = 65535;
 
-/*
- * If the kernel has IPv6 support available, always listen for
- * both AF_INET and AF_INET6 requests.
- */
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const sa_family_t	nfs_callback_family = AF_INET6;
-#else
-static const sa_family_t	nfs_callback_family = AF_INET;
-#endif
-
 static int param_set_port(const char *val, struct kernel_param *kp)
 {
 	char *endp;
@@ -121,13 +111,13 @@ int nfs_callback_up(void)
 	if (!serv)
 		goto out_err;
 
-	ret = svc_create_xprt(serv, "tcp", nfs_callback_family,
+	ret = svc_create_xprt(serv, "tcp", PF_INET,
 				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
 	if (ret <= 0)
 		goto out_err;
 	nfs_callback_tcpport = ret;
 	dprintk("NFS: Callback listener port = %u (af %u)\n",
-			nfs_callback_tcpport, nfs_callback_family);
+			nfs_callback_tcpport, PF_INET);
 
 	nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
 	if (IS_ERR(nfs_callback_info.rqst)) {
-- 
cgit v1.2.3


From 7d21c0f9845f0ce4e81baac3519fbb2c6c2cc908 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:46:44 -0400
Subject: SUNRPC: Set IPV6ONLY flag on PF_INET6 RPC listener sockets

We are about to convert to using separate RPC listener sockets for
PF_INET and PF_INET6.  This echoes the way IPv6 is handled in user
space by TI-RPC, and eliminates the need for ULPs to worry about
mapped IPv4 AF_INET6 addresses when doing address comparisons.

Start by setting the IPV6ONLY flag on PF_INET6 RPC listener sockets.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/svcsock.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index d00bc330774..ac6cd65220c 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1144,13 +1144,11 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 		svc_tcp_init(svsk, serv);
 
 	/*
-	 * We start one listener per sv_serv.  We want AF_INET
-	 * requests to be automatically shunted to our PF_INET6
-	 * listener using a mapped IPv4 address.  Make sure
-	 * no-one starts an equivalent IPv4 listener, which
-	 * would steal our incoming connections.
+	 * If this is a PF_INET6 listener, we want to avoid
+	 * getting requests from IPv4 remotes.  Those should
+	 * be shunted to a PF_INET listener via rpcbind.
 	 */
-	val = 0;
+	val = 1;
 	if (inet->sk_family == PF_INET6)
 		kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
 					(char *)&val, sizeof(val));
-- 
cgit v1.2.3


From fc28decdc93633a65d54e42498e9e819d466329c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:46:51 -0400
Subject: SUNRPC: Use IPv4 loopback for registering AF_INET6 kernel RPC
 services

The kernel uses an IPv6 loopback address when registering its AF_INET6
RPC services so that it can tell whether the local portmapper is
actually IPv6-enabled.

Since the legacy portmapper doesn't listen on IPv6, however, this
causes a long timeout on older systems if the kernel happens to try
creating and registering an AF_INET6 RPC service.  Originally I wanted
to use a connected transport (either TCP or connected UDP) so that the
upcall would fail immediately if the portmapper wasn't listening on
IPv6, but we never agreed on what transport to use.

In the end, it's of little consequence to the kernel whether the local
portmapper is listening on IPv6.  It's only important whether the
portmapper supports rpcbind v4.  And the kernel can't tell that at all
if it is sending requests via IPv6 -- the portmapper will just ignore
them.

So, send both rpcbind v2 and v4 SET/UNSET requests via IPv4 loopback
to maintain better backwards compatibility between new kernels and
legacy user space, and prevent multi-second hangs in some cases when
the kernel attempts to register RPC services.

This patch is part of a series that addresses

   http://bugzilla.kernel.org/show_bug.cgi?id=12256

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 2caa7edeeab..ebce7a5976c 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -124,12 +124,6 @@ static const struct sockaddr_in rpcb_inaddr_loopback = {
 	.sin_port		= htons(RPCBIND_PORT),
 };
 
-static const struct sockaddr_in6 rpcb_in6addr_loopback = {
-	.sin6_family		= AF_INET6,
-	.sin6_addr		= IN6ADDR_LOOPBACK_INIT,
-	.sin6_port		= htons(RPCBIND_PORT),
-};
-
 static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr,
 					  size_t addrlen, u32 version)
 {
@@ -176,9 +170,10 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
 	return rpc_create(&args);
 }
 
-static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
-			      u32 version, struct rpc_message *msg)
+static int rpcb_register_call(u32 version, struct rpc_message *msg)
 {
+	struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback;
+	size_t addrlen = sizeof(rpcb_inaddr_loopback);
 	struct rpc_clnt *rpcb_clnt;
 	int result, error = 0;
 
@@ -254,9 +249,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
 	if (port)
 		msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET];
 
-	return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
-					sizeof(rpcb_inaddr_loopback),
-					RPCBVERS_2, &msg);
+	return rpcb_register_call(RPCBVERS_2, &msg);
 }
 
 /*
@@ -284,9 +277,7 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register,
 	if (port)
 		msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
 
-	return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
-					sizeof(rpcb_inaddr_loopback),
-					RPCBVERS_4, msg);
+	return rpcb_register_call(RPCBVERS_4, msg);
 }
 
 /*
@@ -318,9 +309,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
 	if (port)
 		msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
 
-	return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback,
-					sizeof(rpcb_in6addr_loopback),
-					RPCBVERS_4, msg);
+	return rpcb_register_call(RPCBVERS_4, msg);
 }
 
 /**
-- 
cgit v1.2.3


From ba5c35e0c7e30b095636cd58b0854fdbd3c32947 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:46:59 -0400
Subject: SUNRPC: Don't return EPROTONOSUPPORT in svc_register()'s helpers

The RPC client returns -EPROTONOSUPPORT if there is a protocol version
mismatch (ie the remote RPC server doesn't support the RPC protocol
version sent by the client).

Helpers for the svc_register() function return -EPROTONOSUPPORT if they
don't recognize the passed-in IPPROTO_ value.

These are two entirely different failure modes.

Have the helpers return -ENOPROTOOPT instead of -EPROTONOSUPPORT.  This
will allow callers to determine more precisely what the underlying
problem is, and decide to report or recover appropriately.

This patch is part of a series that addresses
   http://bugzilla.kernel.org/show_bug.cgi?id=12256

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/svc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index d72ff44826d..17e0d7265df 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -749,7 +749,7 @@ static int __svc_rpcb_register4(const u32 program, const u32 version,
 		netid = RPCBIND_NETID_TCP;
 		break;
 	default:
-		return -EPROTONOSUPPORT;
+		return -ENOPROTOOPT;
 	}
 
 	return rpcb_v4_register(program, version,
@@ -785,7 +785,7 @@ static int __svc_rpcb_register6(const u32 program, const u32 version,
 		netid = RPCBIND_NETID_TCP6;
 		break;
 	default:
-		return -EPROTONOSUPPORT;
+		return -ENOPROTOOPT;
 	}
 
 	return rpcb_v4_register(program, version,
-- 
cgit v1.2.3


From 3aba45536fe8f92aa07bcdfd2fb1cf17eec7d786 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:47:06 -0400
Subject: SUNRPC: Clean up address type casts in rpcb_v4_register()

Clean up: Simplify rpcb_v4_register() and its helpers by moving the
details of sockaddr type casting to rpcb_v4_register()'s helper
functions.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index ebce7a5976c..44d0732ba87 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -170,7 +170,7 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
 	return rpc_create(&args);
 }
 
-static int rpcb_register_call(u32 version, struct rpc_message *msg)
+static int rpcb_register_call(const u32 version, struct rpc_message *msg)
 {
 	struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback;
 	size_t addrlen = sizeof(rpcb_inaddr_loopback);
@@ -255,17 +255,17 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
 /*
  * Fill in AF_INET family-specific arguments to register
  */
-static int rpcb_register_netid4(struct sockaddr_in *address_to_register,
+static int rpcb_register_netid4(const struct sockaddr *sap,
 				struct rpc_message *msg)
 {
+	const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
 	struct rpcbind_args *map = msg->rpc_argp;
-	unsigned short port = ntohs(address_to_register->sin_port);
+	unsigned short port = ntohs(sin->sin_port);
 	char buf[32];
 
 	/* Construct AF_INET universal address */
 	snprintf(buf, sizeof(buf), "%pI4.%u.%u",
-		 &address_to_register->sin_addr.s_addr,
-		 port >> 8, port & 0xff);
+		 &sin->sin_addr.s_addr, port >> 8, port & 0xff);
 	map->r_addr = buf;
 
 	dprintk("RPC:       %sregistering [%u, %u, %s, '%s'] with "
@@ -283,21 +283,21 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register,
 /*
  * Fill in AF_INET6 family-specific arguments to register
  */
-static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
+static int rpcb_register_netid6(const struct sockaddr *sap,
 				struct rpc_message *msg)
 {
+	const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap;
 	struct rpcbind_args *map = msg->rpc_argp;
-	unsigned short port = ntohs(address_to_register->sin6_port);
+	unsigned short port = ntohs(sin6->sin6_port);
 	char buf[64];
 
 	/* Construct AF_INET6 universal address */
-	if (ipv6_addr_any(&address_to_register->sin6_addr))
+	if (ipv6_addr_any(&sin6->sin6_addr))
 		snprintf(buf, sizeof(buf), "::.%u.%u",
 				port >> 8, port & 0xff);
 	else
 		snprintf(buf, sizeof(buf), "%pI6.%u.%u",
-			 &address_to_register->sin6_addr,
-			 port >> 8, port & 0xff);
+			 &sin6->sin6_addr, port >> 8, port & 0xff);
 	map->r_addr = buf;
 
 	dprintk("RPC:       %sregistering [%u, %u, %s, '%s'] with "
@@ -369,11 +369,9 @@ int rpcb_v4_register(const u32 program, const u32 version,
 
 	switch (address->sa_family) {
 	case AF_INET:
-		return rpcb_register_netid4((struct sockaddr_in *)address,
-					    &msg);
+		return rpcb_register_netid4(address, &msg);
 	case AF_INET6:
-		return rpcb_register_netid6((struct sockaddr_in6 *)address,
-					    &msg);
+		return rpcb_register_netid6(address, &msg);
 	}
 
 	return -EAFNOSUPPORT;
-- 
cgit v1.2.3


From 126e4bc3b3b446482696377f67a634c76eaf2e9c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:47:14 -0400
Subject: SUNRPC: rpcbind actually interprets r_owner string

RFC 1833 has little to say about the contents of r_owner; it only
specifies that it is a string, and states that it is used to control
who can UNSET an entry.

Our port of rpcbind (from Sun) assumes this string contains a numeric
UID value, not alphabetical or symbolic characters, but checks this
value only for AF_LOCAL RPCB_SET or RPCB_UNSET requests.  In all other
cases, rpcbind ignores the contents of the r_owner string.

The reference user space implementation of rpcb_set(3) uses a numeric
UID for all SET/UNSET requests (even via the network) and an empty
string for all other requests.  We emulate that behavior here to
maintain bug-for-bug compatibility.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 44d0732ba87..d550d0b967d 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -63,9 +63,16 @@ enum {
  * r_owner
  *
  * The "owner" is allowed to unset a service in the rpcbind database.
- * We always use the following (arbitrary) fixed string.
+ *
+ * For AF_LOCAL SET/UNSET requests, rpcbind treats this string as a
+ * UID which it maps to a local user name via a password lookup.
+ * In all other cases it is ignored.
+ *
+ * For SET/UNSET requests, user space provides a value, even for
+ * network requests, and GETADDR uses an empty string.  We follow
+ * those precedents here.
  */
-#define RPCB_OWNER_STRING	"rpcb"
+#define RPCB_OWNER_STRING	"0"
 #define RPCB_MAXOWNERLEN	sizeof(RPCB_OWNER_STRING)
 
 static void			rpcb_getport_done(struct rpc_task *, void *);
@@ -566,7 +573,7 @@ void rpcb_getport_async(struct rpc_task *task)
 	map->r_xprt = xprt_get(xprt);
 	map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID);
 	map->r_addr = rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR);
-	map->r_owner = RPCB_OWNER_STRING;	/* ignored for GETADDR */
+	map->r_owner = "";
 	map->r_status = -EIO;
 
 	child = rpcb_call_async(rpcb_clnt, map, proc);
-- 
cgit v1.2.3


From 1673d0de40ab46cac3b456ad50e1c8d6a31bfd66 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:47:21 -0400
Subject: SUNRPC: Allow callers to pass rpcb_v4_register a NULL address

The user space TI-RPC library uses an empty string for the universal
address when unregistering all target addresses for [program, version].
The kernel's rpcb client should behave the same way.

Here, we are switching between several registration methods based on
the protocol family of the incoming address.  Rename the other rpcbind
v4 registration functions to make it clear that they, as well, are
switched on protocol family.  In /etc/netconfig, this is either "inet"
or "inet6".

NB: The loopback protocol families are not supported in the kernel.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index d550d0b967d..8ea8907d4b8 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -262,8 +262,8 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
 /*
  * Fill in AF_INET family-specific arguments to register
  */
-static int rpcb_register_netid4(const struct sockaddr *sap,
-				struct rpc_message *msg)
+static int rpcb_register_inet4(const struct sockaddr *sap,
+			       struct rpc_message *msg)
 {
 	const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
 	struct rpcbind_args *map = msg->rpc_argp;
@@ -290,8 +290,8 @@ static int rpcb_register_netid4(const struct sockaddr *sap,
 /*
  * Fill in AF_INET6 family-specific arguments to register
  */
-static int rpcb_register_netid6(const struct sockaddr *sap,
-				struct rpc_message *msg)
+static int rpcb_register_inet6(const struct sockaddr *sap,
+			       struct rpc_message *msg)
 {
 	const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap;
 	struct rpcbind_args *map = msg->rpc_argp;
@@ -319,6 +319,20 @@ static int rpcb_register_netid6(const struct sockaddr *sap,
 	return rpcb_register_call(RPCBVERS_4, msg);
 }
 
+static int rpcb_unregister_all_protofamilies(struct rpc_message *msg)
+{
+	struct rpcbind_args *map = msg->rpc_argp;
+
+	dprintk("RPC:       unregistering [%u, %u, '%s'] with "
+		"local rpcbind\n",
+			map->r_prog, map->r_vers, map->r_netid);
+
+	map->r_addr = "";
+	msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
+
+	return rpcb_register_call(RPCBVERS_4, msg);
+}
+
 /**
  * rpcb_v4_register - set or unset a port registration with the local rpcbind
  * @program: RPC program number of service to (un)register
@@ -336,10 +350,11 @@ static int rpcb_register_netid6(const struct sockaddr *sap,
  * invoke this function once for each [program, version, address,
  * netid] tuple they wish to advertise.
  *
- * Callers may also unregister RPC services that are no longer
- * available by setting the port number in the passed-in address
- * to zero.  Callers pass a netid of "" to unregister all
- * transport netids associated with [program, version, address].
+ * Callers may also unregister RPC services that are registered at a
+ * specific address by setting the port number in @address to zero.
+ * They may unregister all registered protocol families at once for
+ * a service by passing a NULL @address argument.  If @netid is ""
+ * then all netids for [program, version, address] are unregistered.
  *
  * This function uses rpcbind protocol version 4 to contact the
  * local rpcbind daemon.  The local rpcbind daemon must support
@@ -374,11 +389,14 @@ int rpcb_v4_register(const u32 program, const u32 version,
 		.rpc_argp	= &map,
 	};
 
+	if (address == NULL)
+		return rpcb_unregister_all_protofamilies(&msg);
+
 	switch (address->sa_family) {
 	case AF_INET:
-		return rpcb_register_netid4(address, &msg);
+		return rpcb_register_inet4(address, &msg);
 	case AF_INET6:
-		return rpcb_register_netid6(address, &msg);
+		return rpcb_register_inet6(address, &msg);
 	}
 
 	return -EAFNOSUPPORT;
-- 
cgit v1.2.3


From d5a8620f7c8a5bcade730e2fa1224191f289fb00 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:47:29 -0400
Subject: SUNRPC: Simplify svc_unregister()

Our initial implementation of svc_unregister() assumed that PMAP_UNSET
cleared all rpcbind registrations for a [program, version] tuple.
However, we now have evidence that PMAP_UNSET clears only "inet"
entries, and not "inet6" entries, in the rpcbind database.

For backwards compatibility with the legacy portmapper, the
svc_unregister() function also must work if user space doesn't support
rpcbind version 4 at all.

Thus we'll send an rpcbind v4 UNSET, and if that fails, we'll send a
PMAP_UNSET.

This simplifies the code in svc_unregister() and provides better
backwards compatibility with legacy user space that does not support
rpcbind version 4.  We can get rid of the conditional compilation in
here as well.

This patch is part of a series that addresses
   http://bugzilla.kernel.org/show_bug.cgi?id=12256

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/svc.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 17e0d7265df..bd0ee312dac 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -896,38 +896,31 @@ int svc_register(const struct svc_serv *serv, const int family,
 	return error;
 }
 
-#ifdef CONFIG_SUNRPC_REGISTER_V4
-
+/*
+ * If user space is running rpcbind, it should take the v4 UNSET
+ * and clear everything for this [program, version].  If user space
+ * is running portmap, it will reject the v4 UNSET, but won't have
+ * any "inet6" entries anyway.  So a PMAP_UNSET should be sufficient
+ * in this case to clear all existing entries for [program, version].
+ */
 static void __svc_unregister(const u32 program, const u32 version,
 			     const char *progname)
 {
-	struct sockaddr_in6 sin6 = {
-		.sin6_family		= AF_INET6,
-		.sin6_addr		= IN6ADDR_ANY_INIT,
-		.sin6_port		= 0,
-	};
 	int error;
 
-	error = rpcb_v4_register(program, version,
-				(struct sockaddr *)&sin6, "");
-	dprintk("svc: %s(%sv%u), error %d\n",
-			__func__, progname, version, error);
-}
-
-#else	/* CONFIG_SUNRPC_REGISTER_V4 */
+	error = rpcb_v4_register(program, version, NULL, "");
 
-static void __svc_unregister(const u32 program, const u32 version,
-			     const char *progname)
-{
-	int error;
+	/*
+	 * User space didn't support rpcbind v4, so retry this
+	 * request with the legacy rpcbind v2 protocol.
+	 */
+	if (error == -EPROTONOSUPPORT)
+		error = rpcb_register(program, version, 0, 0);
 
-	error = rpcb_register(program, version, 0, 0);
 	dprintk("svc: %s(%sv%u), error %d\n",
 			__func__, progname, version, error);
 }
 
-#endif	/* CONFIG_SUNRPC_REGISTER_V4 */
-
 /*
  * All netids, bind addresses and ports registered for [program, version]
  * are removed from the local rpcbind database (if the service is not
-- 
cgit v1.2.3


From cadc0fa534e51e20fdffe1623913c163a18d71b1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:47:36 -0400
Subject: SUNRPC: Simplify kernel RPC service registration

The kernel registers RPC services with the local portmapper with an
rpcbind SET upcall to the local portmapper.  Traditionally, this used
rpcbind v2 (PMAP), but registering RPC services that support IPv6
requires rpcbind v3 or v4.

Since we now want separate PF_INET and PF_INET6 listeners for each
kernel RPC service, svc_register() will do only one of those
registrations at a time.

For PF_INET, it tries an rpcb v4 SET upcall first; if that fails, it
does a legacy portmap SET.  This makes it entirely backwards
compatible with legacy user space, but allows a proper v4 SET to be
used if rpcbind is available.

For PF_INET6, it does an rpcb v4 SET upcall.  If that fails, it fails
the registration, and thus the transport creation.  This let's the
kernel detect if user space is able to support IPv6 RPC services, and
thus whether it should maintain a PF_INET6 listener for each service
at all.

This provides complete backwards compatibilty with legacy user space
that only supports rpcbind v2.  The only down-side is that registering
a new kernel RPC service may take an extra exchange with the local
portmapper on legacy systems, but this is an infrequent operation and
is done over UDP (no lingering sockets in TIMEWAIT), so it shouldn't
be consequential.

This patch is part of a series that addresses
   http://bugzilla.kernel.org/show_bug.cgi?id=12256

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/svc.c | 79 ++++++++++++++++++++++++--------------------------------
 1 file changed, 34 insertions(+), 45 deletions(-)

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index bd0ee312dac..142f64745fb 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -718,8 +718,6 @@ svc_exit_thread(struct svc_rqst *rqstp)
 }
 EXPORT_SYMBOL_GPL(svc_exit_thread);
 
-#ifdef CONFIG_SUNRPC_REGISTER_V4
-
 /*
  * Register an "inet" protocol family netid with the local
  * rpcbind daemon via an rpcbind v4 SET request.
@@ -734,12 +732,13 @@ static int __svc_rpcb_register4(const u32 program, const u32 version,
 				const unsigned short protocol,
 				const unsigned short port)
 {
-	struct sockaddr_in sin = {
+	const struct sockaddr_in sin = {
 		.sin_family		= AF_INET,
 		.sin_addr.s_addr	= htonl(INADDR_ANY),
 		.sin_port		= htons(port),
 	};
-	char *netid;
+	const char *netid;
+	int error;
 
 	switch (protocol) {
 	case IPPROTO_UDP:
@@ -752,10 +751,20 @@ static int __svc_rpcb_register4(const u32 program, const u32 version,
 		return -ENOPROTOOPT;
 	}
 
-	return rpcb_v4_register(program, version,
-				(struct sockaddr *)&sin, netid);
+	error = rpcb_v4_register(program, version,
+					(const struct sockaddr *)&sin, netid);
+
+	/*
+	 * User space didn't support rpcbind v4, so retry this
+	 * registration request with the legacy rpcbind v2 protocol.
+	 */
+	if (error == -EPROTONOSUPPORT)
+		error = rpcb_register(program, version, protocol, port);
+
+	return error;
 }
 
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 /*
  * Register an "inet6" protocol family netid with the local
  * rpcbind daemon via an rpcbind v4 SET request.
@@ -770,12 +779,13 @@ static int __svc_rpcb_register6(const u32 program, const u32 version,
 				const unsigned short protocol,
 				const unsigned short port)
 {
-	struct sockaddr_in6 sin6 = {
+	const struct sockaddr_in6 sin6 = {
 		.sin6_family		= AF_INET6,
 		.sin6_addr		= IN6ADDR_ANY_INIT,
 		.sin6_port		= htons(port),
 	};
-	char *netid;
+	const char *netid;
+	int error;
 
 	switch (protocol) {
 	case IPPROTO_UDP:
@@ -788,9 +798,19 @@ static int __svc_rpcb_register6(const u32 program, const u32 version,
 		return -ENOPROTOOPT;
 	}
 
-	return rpcb_v4_register(program, version,
-				(struct sockaddr *)&sin6, netid);
+	error = rpcb_v4_register(program, version,
+					(const struct sockaddr *)&sin6, netid);
+
+	/*
+	 * User space didn't support rpcbind version 4, so we won't
+	 * use a PF_INET6 listener.
+	 */
+	if (error == -EPROTONOSUPPORT)
+		error = -EAFNOSUPPORT;
+
+	return error;
 }
+#endif	/* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
 
 /*
  * Register a kernel RPC service via rpcbind version 4.
@@ -809,48 +829,17 @@ static int __svc_register(const u32 program, const u32 version,
 	case PF_INET:
 		return __svc_rpcb_register4(program, version,
 						protocol, port);
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	case PF_INET6:
-		error = __svc_rpcb_register6(program, version,
+		return__svc_rpcb_register6(program, version,
 						protocol, port);
-		if (error < 0)
-			return error;
-
-		/*
-		 * Work around bug in some versions of Linux rpcbind
-		 * which don't allow registration of both inet and
-		 * inet6 netids.
-		 *
-		 * Error return ignored for now.
-		 */
-		__svc_rpcb_register4(program, version,
-						protocol, port);
-		return 0;
+#endif	/* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
 	}
 
 	return -EAFNOSUPPORT;
 }
 
-#else	/* CONFIG_SUNRPC_REGISTER_V4 */
-
-/*
- * Register a kernel RPC service via rpcbind version 2.
- *
- * Returns zero on success; a negative errno value is returned
- * if any error occurs.
- */
-static int __svc_register(const u32 program, const u32 version,
-			  const int family,
-			  const unsigned short protocol,
-			  const unsigned short port)
-{
-	if (family != PF_INET)
-		return -EAFNOSUPPORT;
-
-	return rpcb_register(program, version, protocol, port);
-}
-
-#endif /* CONFIG_SUNRPC_REGISTER_V4 */
-
 /**
  * svc_register - register an RPC service with the local portmapper
  * @serv: svc_serv struct for the service to register
-- 
cgit v1.2.3


From 363f724cdd3d2ae554e261be995abdeb15f7bdd9 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:47:44 -0400
Subject: SUNRPC: rpcb_register() should handle errors silently

Move error reporting for RPC registration to rpcb_register's caller.

This way the caller can choose to recover silently from certain
errors, but report errors it does not recognize.  Error reporting
for kernel RPC service registration is now handled in one place.

This patch is part of a series that addresses
   http://bugzilla.kernel.org/show_bug.cgi?id=12256

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c |  2 +-
 net/sunrpc/svc.c       | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 8ea8907d4b8..beee6da3303 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -194,7 +194,7 @@ static int rpcb_register_call(const u32 version, struct rpc_message *msg)
 		error = PTR_ERR(rpcb_clnt);
 
 	if (error < 0) {
-		printk(KERN_WARNING "RPC: failed to contact local rpcbind "
+		dprintk("RPC:       failed to contact local rpcbind "
 				"server (errno %d).\n", -error);
 		return error;
 	}
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 142f64745fb..8ba654bdd60 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -818,26 +818,30 @@ static int __svc_rpcb_register6(const u32 program, const u32 version,
  * Returns zero on success; a negative errno value is returned
  * if any error occurs.
  */
-static int __svc_register(const u32 program, const u32 version,
+static int __svc_register(const char *progname,
+			  const u32 program, const u32 version,
 			  const int family,
 			  const unsigned short protocol,
 			  const unsigned short port)
 {
-	int error;
+	int error = -EAFNOSUPPORT;
 
 	switch (family) {
 	case PF_INET:
-		return __svc_rpcb_register4(program, version,
+		error = __svc_rpcb_register4(program, version,
 						protocol, port);
 		break;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	case PF_INET6:
-		return__svc_rpcb_register6(program, version,
+		error = __svc_rpcb_register6(program, version,
 						protocol, port);
 #endif	/* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
 	}
 
-	return -EAFNOSUPPORT;
+	if (error < 0)
+		printk(KERN_WARNING "svc: failed to register %sv%u RPC "
+			"service (errno %d).\n", progname, version, -error);
+	return error;
 }
 
 /**
@@ -875,8 +879,8 @@ int svc_register(const struct svc_serv *serv, const int family,
 			if (progp->pg_vers[i]->vs_hidden)
 				continue;
 
-			error = __svc_register(progp->pg_prog, i,
-						family, proto, port);
+			error = __svc_register(progp->pg_name, progp->pg_prog,
+						i, family, proto, port);
 			if (error < 0)
 				break;
 		}
-- 
cgit v1.2.3


From 9355982830ad67dca35e0f3d43319f3d438f82b4 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:47:51 -0400
Subject: SUNRPC: Remove CONFIG_SUNRPC_REGISTER_V4

We just augmented the kernel's RPC service registration code so that
it automatically adjusts to what is supported in user space.  Thus we
no longer need the kernel configuration option to enable registering
RPC services with v4 -- it's all done automatically.

This patch is part of a series that addresses
   http://bugzilla.kernel.org/show_bug.cgi?id=12256

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/Kconfig | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 5592883e1e4..afd91c78ce8 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -17,28 +17,6 @@ config SUNRPC_XPRT_RDMA
 
 	  If unsure, say N.
 
-config SUNRPC_REGISTER_V4
-	bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
-	depends on SUNRPC && EXPERIMENTAL
-	default n
-	help
-	  Sun added support for registering RPC services at an IPv6
-	  address by creating two new versions of the rpcbind protocol
-	  (RFC 1833).
-
-	  This option enables support in the kernel RPC server for
-	  registering kernel RPC services via version 4 of the rpcbind
-	  protocol.  If you enable this option, you must run a portmapper
-	  daemon that supports rpcbind protocol version 4.
-
-	  Serving NFS over IPv6 from knfsd (the kernel's NFS server)
-	  requires that you enable this option and use a portmapper that
-	  supports rpcbind version 4.
-
-	  If unsure, say N to get traditional behavior (register kernel
-	  RPC services using only rpcbind version 2).  Distributions
-	  using the legacy Linux portmapper daemon must say N here.
-
 config RPCSEC_GSS_KRB5
 	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
-- 
cgit v1.2.3


From eb16e907781a9da7f272a3e8284c26bc4e4aeb9d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:47:59 -0400
Subject: lockd: Start PF_INET6 listener only if IPv6 support is available

Apparently a lot of people need to disable IPv6 completely on their
distributor-built systems, which have CONFIG_IPV6_MODULE enabled at
build time.

They do this by blacklisting the ipv6.ko module.  This causes the
creation of the lockd service listener to fail if CONFIG_IPV6_MODULE
is set, but the module cannot be loaded.

Now that the kernel's PF_INET6 RPC listeners are completely separate
from PF_INET listeners, we can always start PF_INET.  Then lockd can
try to start PF_INET6, but it isn't required to be available.

Note this has the added benefit that NLM callbacks from AF_INET6
servers will never come from AF_INET remotes.  We no longer have to
worry about matching mapped IPv4 addresses to AF_INET when comparing
addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntlock.c | 51 +--------------------------------------------------
 fs/lockd/svc.c      | 30 +++++++++++++++++++++---------
 2 files changed, 22 insertions(+), 59 deletions(-)

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index aedc47a264c..1f3b0fc0d35 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,55 +139,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 	return 0;
 }
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
-						 struct in6_addr *addr_mapped)
-{
-	const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
-
-	switch (sap->sa_family) {
-	case AF_INET6:
-		return &((const struct sockaddr_in6 *)sap)->sin6_addr;
-	case AF_INET:
-		ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
-		return addr_mapped;
-	}
-
-	return NULL;
-}
-
-/*
- * If lockd is using a PF_INET6 listener, all incoming requests appear
- * to come from AF_INET6 remotes.  The address of AF_INET remotes are
- * mapped to AF_INET6 automatically by the network layer.  In case the
- * user passed an AF_INET server address at mount time, ensure both
- * addresses are AF_INET6 before comparing them.
- */
-static int nlmclnt_cmp_addr(const struct nlm_host *host,
-			    const struct sockaddr *sap)
-{
-	const struct in6_addr *addr1;
-	const struct in6_addr *addr2;
-	struct in6_addr addr1_mapped;
-	struct in6_addr addr2_mapped;
-
-	addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
-	if (likely(addr1 != NULL)) {
-		addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
-		if (likely(addr2 != NULL))
-			return ipv6_addr_equal(addr1, addr2);
-	}
-
-	return 0;
-}
-#else	/* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
-static int nlmclnt_cmp_addr(const struct nlm_host *host,
-			    const struct sockaddr *sap)
-{
-	return nlm_cmp_addr(nlm_addr(host), sap);
-}
-#endif	/* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
-
 /*
  * The server lockd has called us back to tell us the lock was granted
  */
@@ -215,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 		 */
 		if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
 			continue;
-		if (!nlmclnt_cmp_addr(block->b_host, addr))
+		if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
 			continue;
 		if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
 			continue;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 566932b98fd..abf83881f68 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -193,20 +193,30 @@ lockd(void *vrqstp)
 	return 0;
 }
 
-static int create_lockd_listener(struct svc_serv *serv, char *name,
-				 unsigned short port)
+static int create_lockd_listener(struct svc_serv *serv, const char *name,
+				 const int family, const unsigned short port)
 {
 	struct svc_xprt *xprt;
 
-	xprt = svc_find_xprt(serv, name, 0, 0);
+	xprt = svc_find_xprt(serv, name, family, 0);
 	if (xprt == NULL)
-		return svc_create_xprt(serv, name, PF_INET,
-					port, SVC_SOCK_DEFAULTS);
-
+		return svc_create_xprt(serv, name, family, port,
+						SVC_SOCK_DEFAULTS);
 	svc_xprt_put(xprt);
 	return 0;
 }
 
+static int create_lockd_family(struct svc_serv *serv, const int family)
+{
+	int err;
+
+	err = create_lockd_listener(serv, "udp", family, nlm_udpport);
+	if (err < 0)
+		return err;
+
+	return create_lockd_listener(serv, "tcp", family, nlm_tcpport);
+}
+
 /*
  * Ensure there are active UDP and TCP listeners for lockd.
  *
@@ -222,13 +232,15 @@ static int make_socks(struct svc_serv *serv)
 	static int warned;
 	int err;
 
-	err = create_lockd_listener(serv, "udp", nlm_udpport);
+	err = create_lockd_family(serv, PF_INET);
 	if (err < 0)
 		goto out_err;
 
-	err = create_lockd_listener(serv, "tcp", nlm_tcpport);
-	if (err < 0)
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	err = create_lockd_family(serv, PF_INET6);
+	if (err < 0 && err != -EAFNOSUPPORT)
 		goto out_err;
+#endif	/* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
 
 	warned = 0;
 	return 0;
-- 
cgit v1.2.3


From f738f5170367b367e38b2d75a413e7b3c52d46a5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:48:06 -0400
Subject: NFS: Start PF_INET6 callback listener only if IPv6 support is
 available

Apparently a lot of people need to disable IPv6 completely on their
distributor-built systems, which have CONFIG_IPV6_MODULE enabled at
build time.

They do this by blacklisting the ipv6.ko module.  This causes the
creation of the NFSv4 callback service listener to fail if
CONFIG_IPV6_MODULE is set, but the module cannot be loaded.

Now that the kernel's PF_INET6 RPC listeners are completely separate
from PF_INET listeners, we can always start PF_INET.  Then the NFS
client can try to start a PF_INET6 listener, but it isn't required
to be available.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.c  | 12 ++++++++++++
 fs/nfs/callback.h  |  1 +
 fs/nfs/nfs4state.c | 10 ++++++++--
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0ef47dff89b..a886e692ddd 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -38,6 +38,7 @@ static struct svc_program nfs4_callback_program;
 
 unsigned int nfs_callback_set_tcpport;
 unsigned short nfs_callback_tcpport;
+unsigned short nfs_callback_tcpport6;
 static const int nfs_set_port_min = 0;
 static const int nfs_set_port_max = 65535;
 
@@ -119,6 +120,17 @@ int nfs_callback_up(void)
 	dprintk("NFS: Callback listener port = %u (af %u)\n",
 			nfs_callback_tcpport, PF_INET);
 
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	ret = svc_create_xprt(serv, "tcp", PF_INET6,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+	if (ret > 0) {
+		nfs_callback_tcpport6 = ret;
+		dprintk("NFS: Callback listener port = %u (af %u)\n",
+				nfs_callback_tcpport6, PF_INET6);
+	} else if (ret != -EAFNOSUPPORT)
+		goto out_err;
+#endif	/* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
+
 	nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
 	if (IS_ERR(nfs_callback_info.rqst)) {
 		ret = PTR_ERR(nfs_callback_info.rqst);
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index bb25d2135ff..e110e286a26 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -72,5 +72,6 @@ extern void nfs_callback_down(void);
 
 extern unsigned int nfs_callback_set_tcpport;
 extern unsigned short nfs_callback_tcpport;
+extern unsigned short nfs_callback_tcpport6;
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2022fe47966..0298e909559 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list);
 
 static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
 {
-	int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK,
-			nfs_callback_tcpport, cred);
+	unsigned short port;
+	int status;
+
+	port = nfs_callback_tcpport;
+	if (clp->cl_addr.ss_family == AF_INET6)
+		port = nfs_callback_tcpport6;
+
+	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
 	if (status == 0)
 		status = nfs4_proc_setclientid_confirm(clp, cred);
 	if (status == 0)
-- 
cgit v1.2.3


From 3c8c45dfab78a1919f6f8a3ea46998c487eb7e12 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Mar 2009 20:48:14 -0400
Subject: NFS: Simplify logic to compare socket addresses in client.c

Callback requests from IPv4 servers are now always guaranteed to be
AF_INET, and never mapped IPv4 AF_INET6 addresses.  Both
nfs_match_client() and nfs_find_client() can now share the same
address comparison logic, so fold them together.

We can also dispense with of most of the conditional compilation
in here.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 116 +++++++++++++++++++++++++-------------------------------
 1 file changed, 52 insertions(+), 64 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 574158ae239..855daac0f24 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -224,38 +224,6 @@ void nfs_put_client(struct nfs_client *clp)
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
-{
-	switch (sa->sa_family) {
-		default:
-			return NULL;
-		case AF_INET6:
-			return &((const struct sockaddr_in6 *)sa)->sin6_addr;
-			break;
-		case AF_INET:
-			ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
-					addr_mapped);
-			return addr_mapped;
-	}
-}
-
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-		const struct sockaddr *sa2)
-{
-	const struct in6_addr *addr1;
-	const struct in6_addr *addr2;
-	struct in6_addr addr1_mapped;
-	struct in6_addr addr2_mapped;
-
-	addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
-	if (likely(addr1 != NULL)) {
-		addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
-		if (likely(addr2 != NULL))
-			return ipv6_addr_equal(addr1, addr2);
-	}
-	return 0;
-}
-
 /*
  * Test if two ip6 socket addresses refer to the same socket by
  * comparing relevant fields. The padding bytes specifically, are not
@@ -267,38 +235,21 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
  *
  * The caller should ensure both socket addresses are AF_INET6.
  */
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
-				const struct sockaddr *sa2)
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
 {
-	const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
 
-	if (!ipv6_addr_equal(&saddr1->sin6_addr,
-			     &saddr1->sin6_addr))
+	if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
+	    sin1->sin6_scope_id != sin2->sin6_scope_id)
 		return 0;
-	if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
-	    saddr1->sin6_scope_id != saddr2->sin6_scope_id)
-		return 0;
-	return saddr1->sin6_port == saddr2->sin6_port;
-}
-#else
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
-				 const struct sockaddr_in *sa2)
-{
-	return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
-}
 
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-				 const struct sockaddr *sa2)
-{
-	if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
-		return 0;
-	return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
-			(const struct sockaddr_in *)sa2);
+	return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr);
 }
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
-				const struct sockaddr * sa2)
+#else	/* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
 {
 	return 0;
 }
@@ -311,20 +262,57 @@ static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
  *
  * The caller should ensure both socket addresses are AF_INET.
  */
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
+{
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+
+	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
+}
+
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
+{
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+
+	return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
+		(sin1->sin6_port == sin2->sin6_port);
+}
+
 static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
 				const struct sockaddr *sa2)
 {
-	const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
 
-	if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr)
+	return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
+		(sin1->sin_port == sin2->sin_port);
+}
+
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields, excluding the port number.
+ */
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+				     const struct sockaddr *sa2)
+{
+	if (sa1->sa_family != sa2->sa_family)
 		return 0;
-	return saddr1->sin_port == saddr2->sin_port;
+
+	switch (sa1->sa_family) {
+	case AF_INET:
+		return nfs_sockaddr_match_ipaddr4(sa1, sa2);
+	case AF_INET6:
+		return nfs_sockaddr_match_ipaddr6(sa1, sa2);
+	}
+	return 0;
 }
 
 /*
  * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields.
+ * by comparing (only) relevant fields, including the port number.
  */
 static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
 			    const struct sockaddr *sa2)
-- 
cgit v1.2.3


From 32ec7fd08b597586774b92ac1cd2678021ccac1b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sat, 28 Mar 2009 13:53:26 -0700
Subject: x86, setup: preemptively save/restore edi and ebp around INT 15 E820

Impact: BIOS bugproofing

Since there are BIOSes known to clobber %ebx and %esi for INT 15 E820,
assume there is something out there clobbering %edi and/or %ebp too,
and don't wait for it to fail.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/memory.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index a99dbbe77a0..fcdb10add9c 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -20,7 +20,7 @@ static int detect_memory_e820(void)
 {
 	int count = 0;
 	u32 next = 0;
-	u32 size, id;
+	u32 size, id, edi;
 	u8 err;
 	struct e820entry *desc = boot_params.e820_map;
 
@@ -29,10 +29,11 @@ static int detect_memory_e820(void)
 
 		/* Important: %edx and %esi are clobbered by some BIOSes,
 		   so they must be either used for the error output
-		   or explicitly marked clobbered. */
-		asm("int $0x15; setc %0"
+		   or explicitly marked clobbered.  Given that, assume there
+		   is something out there clobbering %ebp and %edi, too. */
+		asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
 		    : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
-		      "=m" (*desc)
+		      "=D" (edi), "=m" (*desc)
 		    : "D" (desc), "d" (SMAP), "a" (0xe820)
 		    : "esi");
 
-- 
cgit v1.2.3


From c549e71d073a6e9a4847497344db28a784061455 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sat, 28 Mar 2009 13:53:26 -0700
Subject: x86, setup: ACPI 3, BIOS workaround for E820-probing code

Impact: ACPI 3 spec compliance, BIOS bug workaround

The ACPI 3 spec added another field to the E820 buffer -- which is
backwards incompatible, since it contains a validity bit.
Furthermore, there has been at least one report of a BIOS which
assumes that the buffer it is pointed at is the same buffer as for the
previous E820 call.  Therefore, read the data into a temporary buffer
and copy the standard part of it if and only if the valid bit is set.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/memory.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index fcdb10add9c..d5d2360763d 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -16,6 +17,11 @@
 
 #define SMAP	0x534d4150	/* ASCII "SMAP" */
 
+struct e820_ext_entry {
+	struct e820entry std;
+	u32 ext_flags;
+} __attribute__((packed));
+
 static int detect_memory_e820(void)
 {
 	int count = 0;
@@ -23,9 +29,10 @@ static int detect_memory_e820(void)
 	u32 size, id, edi;
 	u8 err;
 	struct e820entry *desc = boot_params.e820_map;
+	static struct e820_ext_entry buf; /* static so it is zeroed */
 
 	do {
-		size = sizeof(struct e820entry);
+		size = sizeof buf;
 
 		/* Important: %edx and %esi are clobbered by some BIOSes,
 		   so they must be either used for the error output
@@ -33,8 +40,8 @@ static int detect_memory_e820(void)
 		   is something out there clobbering %ebp and %edi, too. */
 		asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
 		    : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
-		      "=D" (edi), "=m" (*desc)
-		    : "D" (desc), "d" (SMAP), "a" (0xe820)
+		      "=D" (edi), "+m" (buf)
+		    : "D" (&buf), "d" (SMAP), "a" (0xe820)
 		    : "esi");
 
 		/* BIOSes which terminate the chain with CF = 1 as opposed
@@ -53,8 +60,14 @@ static int detect_memory_e820(void)
 			break;
 		}
 
+		/* ACPI 3.0 added the extended flags support.  If bit 0
+		   in the extended flags is zero, we're supposed to simply
+		   ignore the entry -- a backwards incompatible change! */
+		if (size > 20 && !(buf.ext_flags & 1))
+			continue;
+
+		*desc++ = buf.std;
 		count++;
-		desc++;
 	} while (next && count < ARRAY_SIZE(boot_params.e820_map));
 
 	return boot_params.e820_entries = count;
-- 
cgit v1.2.3


From 79f1bc06dbb05f222756d6df4a9ff95588c9cc06 Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Sat, 28 Mar 2009 23:37:27 -0700
Subject: ni5010: convert to net_device_ops

Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ni5010.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ni5010.c b/drivers/net/ni5010.c
index 539e18ab485..2a8da476ab3 100644
--- a/drivers/net/ni5010.c
+++ b/drivers/net/ni5010.c
@@ -189,6 +189,17 @@ static void __init trigger_irq(int ioaddr)
 		outb(MM_EN_XMT|MM_MUX, IE_MMODE); /* Start transmission */
 }
 
+static const struct net_device_ops ni5010_netdev_ops = {
+	.ndo_open		= ni5010_open,
+	.ndo_stop		= ni5010_close,
+	.ndo_start_xmit		= ni5010_send_packet,
+	.ndo_set_multicast_list	= ni5010_set_multicast_list,
+	.ndo_tx_timeout		= ni5010_timeout,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_change_mtu		= eth_change_mtu,
+};
+
 /*
  *      This is the real probe routine.  Linux has a history of friendly device
  *      probes on the ISA bus.  A good device probes avoids doing writes, and
@@ -328,13 +339,8 @@ static int __init ni5010_probe1(struct net_device *dev, int ioaddr)
         	outb(0, IE_RBUF);	/* set buffer byte 0 to 0 again */
 	}
         printk("-> bufsize rcv/xmt=%d/%d\n", bufsize_rcv, NI5010_BUFSIZE);
-	memset(netdev_priv(dev), 0, sizeof(struct ni5010_local));
 
-	dev->open		= ni5010_open;
-	dev->stop		= ni5010_close;
-	dev->hard_start_xmit	= ni5010_send_packet;
-	dev->set_multicast_list = ni5010_set_multicast_list;
-	dev->tx_timeout		= ni5010_timeout;
+	dev->netdev_ops		= &ni5010_netdev_ops;
 	dev->watchdog_timeo	= HZ/20;
 
 	dev->flags &= ~IFF_MULTICAST;	/* Multicast doesn't work */
-- 
cgit v1.2.3


From 4b21cd4eedff2123712c2132c8c6264d40332465 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sat, 28 Mar 2009 23:38:40 -0700
Subject: skbuff.h: fix missing kernel-doc

Add missing struct field to fix kernel-doc warning:

Warning(include/linux/skbuff.h:182): No description found for parameter 'flags'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bb1981fd60f..eb2e837afaf 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -168,6 +168,7 @@ struct skb_shared_hwtstamps {
  * @software:		generate software time stamp
  * @in_progress:	device driver is going to provide
  *			hardware time stamp
+ * @flags:		all shared_tx flags
  *
  * These flags are attached to packets as part of the
  * &skb_shared_info. Use skb_tx() to get a pointer.
-- 
cgit v1.2.3


From 2f181855a0b3c2b39314944add7b41c15647cf86 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 28 Mar 2009 23:39:18 -0700
Subject: gso: Fix support for linear packets

When GRO/frag_list support was added to GSO, I made an error
which broke the support for segmenting linear GSO packets (GSO
packets are normally non-linear in the payload).

These days most of these packets are constructed by the tun
driver, which prefers to allocate linear memory if possible.
This is fixed in the latest kernel, but for 2.6.29 and earlier
it is still the norm.

Therefore this bug causes failures with GSO when used with tun
in 2.6.29.

Reported-by: James Huang <jamesclhuang@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6acbf9e79eb..ce6356cd9f7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2579,7 +2579,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
 					  skb_network_header_len(skb));
 		skb_copy_from_linear_data(skb, nskb->data, doffset);
 
-		if (pos >= offset + len)
+		if (fskb != skb_shinfo(skb)->frag_list)
 			continue;
 
 		if (!sg) {
-- 
cgit v1.2.3


From 3e8af307bfe3b6318a1aaaf8ce18d0af7ddf2ea2 Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Sat, 28 Mar 2009 23:40:05 -0700
Subject: dmascc: fix incomplete conversion to network_device_ops

drivers/net/hamradio/dmascc.c:587: error: 'struct net_device' has no
member named 'set_mac_address'

Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hamradio/dmascc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c
index 881bf818bb4..7459b3ac77a 100644
--- a/drivers/net/hamradio/dmascc.c
+++ b/drivers/net/hamradio/dmascc.c
@@ -445,6 +445,7 @@ static const struct net_device_ops scc_netdev_ops = {
 	.ndo_stop = scc_close,
 	.ndo_start_xmit = scc_send_packet,
 	.ndo_do_ioctl = scc_ioctl,
+	.ndo_set_mac_address = scc_set_mac_address,
 };
 
 static int __init setup_adapter(int card_base, int type, int n)
@@ -584,7 +585,6 @@ static int __init setup_adapter(int card_base, int type, int n)
 		dev->irq = irq;
 		dev->netdev_ops = &scc_netdev_ops;
 		dev->header_ops = &ax25_header_ops;
-		dev->set_mac_address = scc_set_mac_address;
 	}
 	if (register_netdev(info->dev[0])) {
 		printk(KERN_ERR "dmascc: could not register %s\n",
-- 
cgit v1.2.3


From f940964901aa69e28ce729d7614061d014184472 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sat, 28 Mar 2009 15:38:30 +0000
Subject: netfilter: fix endian bug in conntrack printks

dcc_ip is treated as a host-endian value in the first printk,
but the second printk uses %pI4 which expects a be32.  This
will cause a mismatch between the debug statement and the
warning statement.

Treat as a be32 throughout and avoid some byteswapping during
some comparisions, and allow another user of HIPQUAD to bite the
dust.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_irc.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 409c8be58e7..8bd98c84f77 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -66,7 +66,7 @@ static const char *const dccprotos[] = {
  *	ad_beg_p	returns pointer to first byte of addr data
  *	ad_end_p	returns pointer to last byte of addr data
  */
-static int parse_dcc(char *data, const char *data_end, u_int32_t *ip,
+static int parse_dcc(char *data, const char *data_end, __be32 *ip,
 		     u_int16_t *port, char **ad_beg_p, char **ad_end_p)
 {
 	char *tmp;
@@ -85,7 +85,7 @@ static int parse_dcc(char *data, const char *data_end, u_int32_t *ip,
 		return -1;
 
 	*ad_beg_p = data;
-	*ip = simple_strtoul(data, &data, 10);
+	*ip = cpu_to_be32(simple_strtoul(data, &data, 10));
 
 	/* skip blanks between ip and port */
 	while (*data == ' ') {
@@ -112,7 +112,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
 	int dir = CTINFO2DIR(ctinfo);
 	struct nf_conntrack_expect *exp;
 	struct nf_conntrack_tuple *tuple;
-	u_int32_t dcc_ip;
+	__be32 dcc_ip;
 	u_int16_t dcc_port;
 	__be16 port;
 	int i, ret = NF_ACCEPT;
@@ -177,13 +177,14 @@ static int help(struct sk_buff *skb, unsigned int protoff,
 				pr_debug("unable to parse dcc command\n");
 				continue;
 			}
-			pr_debug("DCC bound ip/port: %u.%u.%u.%u:%u\n",
-				 HIPQUAD(dcc_ip), dcc_port);
+
+			pr_debug("DCC bound ip/port: %pI4:%u\n",
+				 &dcc_ip, dcc_port);
 
 			/* dcc_ip can be the internal OR external (NAT'ed) IP */
 			tuple = &ct->tuplehash[dir].tuple;
-			if (tuple->src.u3.ip != htonl(dcc_ip) &&
-			    tuple->dst.u3.ip != htonl(dcc_ip)) {
+			if (tuple->src.u3.ip != dcc_ip &&
+			    tuple->dst.u3.ip != dcc_ip) {
 				if (net_ratelimit())
 					printk(KERN_WARNING
 						"Forged DCC command from %pI4: %pI4:%u\n",
-- 
cgit v1.2.3


From e7557af56a576762a655f1aaaded253ad14c5958 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sat, 28 Mar 2009 15:38:31 +0000
Subject: netpoll: store local and remote ip in net-endian

Allows for the removal of byteswapping in some places and
the removal of HIPQUAD (replaced by %pI4).

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/netconsole.c | 10 ++++------
 include/linux/netpoll.h  |  2 +-
 net/core/netpoll.c       | 31 +++++++++++++++----------------
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index d304d38cd5d..eceadf787a6 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -294,14 +294,12 @@ static ssize_t show_remote_port(struct netconsole_target *nt, char *buf)
 
 static ssize_t show_local_ip(struct netconsole_target *nt, char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d.%d.%d.%d\n",
-			HIPQUAD(nt->np.local_ip));
+	return snprintf(buf, PAGE_SIZE, "%pI4\n", &nt->np.local_ip);
 }
 
 static ssize_t show_remote_ip(struct netconsole_target *nt, char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d.%d.%d.%d\n",
-			HIPQUAD(nt->np.remote_ip));
+	return snprintf(buf, PAGE_SIZE, "%pI4\n", &nt->np.remote_ip);
 }
 
 static ssize_t show_local_mac(struct netconsole_target *nt, char *buf)
@@ -438,7 +436,7 @@ static ssize_t store_local_ip(struct netconsole_target *nt,
 		return -EINVAL;
 	}
 
-	nt->np.local_ip = ntohl(in_aton(buf));
+	nt->np.local_ip = in_aton(buf);
 
 	return strnlen(buf, count);
 }
@@ -454,7 +452,7 @@ static ssize_t store_remote_ip(struct netconsole_target *nt,
 		return -EINVAL;
 	}
 
-	nt->np.remote_ip = ntohl(in_aton(buf));
+	nt->np.remote_ip = in_aton(buf);
 
 	return strnlen(buf, count);
 }
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index de99025f2c5..2524267210d 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -18,7 +18,7 @@ struct netpoll {
 	const char *name;
 	void (*rx_hook)(struct netpoll *, int, char *, int);
 
-	u32 local_ip, remote_ip;
+	__be32 local_ip, remote_ip;
 	u16 local_port, remote_port;
 	u8 remote_mac[ETH_ALEN];
 };
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 755414cd49d..b5873bdff61 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -345,8 +345,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 	udph->dest = htons(np->remote_port);
 	udph->len = htons(udp_len);
 	udph->check = 0;
-	udph->check = csum_tcpudp_magic(htonl(np->local_ip),
-					htonl(np->remote_ip),
+	udph->check = csum_tcpudp_magic(np->local_ip,
+					np->remote_ip,
 					udp_len, IPPROTO_UDP,
 					csum_partial(udph, udp_len, 0));
 	if (udph->check == 0)
@@ -365,8 +365,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 	iph->ttl      = 64;
 	iph->protocol = IPPROTO_UDP;
 	iph->check    = 0;
-	put_unaligned(htonl(np->local_ip), &(iph->saddr));
-	put_unaligned(htonl(np->remote_ip), &(iph->daddr));
+	put_unaligned(np->local_ip, &(iph->saddr));
+	put_unaligned(np->remote_ip, &(iph->daddr));
 	iph->check    = ip_fast_csum((unsigned char *)iph, iph->ihl);
 
 	eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
@@ -424,7 +424,7 @@ static void arp_reply(struct sk_buff *skb)
 	memcpy(&tip, arp_ptr, 4);
 
 	/* Should we ignore arp? */
-	if (tip != htonl(np->local_ip) ||
+	if (tip != np->local_ip ||
 	    ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
 		return;
 
@@ -533,9 +533,9 @@ int __netpoll_rx(struct sk_buff *skb)
 		goto out;
 	if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
 		goto out;
-	if (np->local_ip && np->local_ip != ntohl(iph->daddr))
+	if (np->local_ip && np->local_ip != iph->daddr)
 		goto out;
-	if (np->remote_ip && np->remote_ip != ntohl(iph->saddr))
+	if (np->remote_ip && np->remote_ip != iph->saddr)
 		goto out;
 	if (np->local_port && np->local_port != ntohs(uh->dest))
 		goto out;
@@ -560,14 +560,14 @@ void netpoll_print_options(struct netpoll *np)
 {
 	printk(KERN_INFO "%s: local port %d\n",
 			 np->name, np->local_port);
-	printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
-			 np->name, HIPQUAD(np->local_ip));
+	printk(KERN_INFO "%s: local IP %pI4\n",
+			 np->name, &np->local_ip);
 	printk(KERN_INFO "%s: interface %s\n",
 			 np->name, np->dev_name);
 	printk(KERN_INFO "%s: remote port %d\n",
 			 np->name, np->remote_port);
-	printk(KERN_INFO "%s: remote IP %d.%d.%d.%d\n",
-			 np->name, HIPQUAD(np->remote_ip));
+	printk(KERN_INFO "%s: remote IP %pI4\n",
+			 np->name, &np->remote_ip);
 	printk(KERN_INFO "%s: remote ethernet address %pM\n",
 	                 np->name, np->remote_mac);
 }
@@ -589,7 +589,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
 		if ((delim = strchr(cur, '/')) == NULL)
 			goto parse_failed;
 		*delim = 0;
-		np->local_ip = ntohl(in_aton(cur));
+		np->local_ip = in_aton(cur);
 		cur = delim;
 	}
 	cur++;
@@ -618,7 +618,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
 	if ((delim = strchr(cur, '/')) == NULL)
 		goto parse_failed;
 	*delim = 0;
-	np->remote_ip = ntohl(in_aton(cur));
+	np->remote_ip = in_aton(cur);
 	cur = delim + 1;
 
 	if (*cur != 0) {
@@ -759,10 +759,9 @@ int netpoll_setup(struct netpoll *np)
 			goto release;
 		}
 
-		np->local_ip = ntohl(in_dev->ifa_list->ifa_local);
+		np->local_ip = in_dev->ifa_list->ifa_local;
 		rcu_read_unlock();
-		printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
-		       np->name, HIPQUAD(np->local_ip));
+		printk(KERN_INFO "%s: local IP %pI4\n", np->name, &np->local_ip);
 	}
 
 	if (np->rx_hook) {
-- 
cgit v1.2.3


From 2c60b6885afc56a17b9d55b04c4328123063fc9d Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sat, 28 Mar 2009 15:38:31 +0000
Subject: kernel: remove HIPQUAD()

All users have been removed.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/kernel.h | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 914918abfdd..f81d80f47dc 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -379,18 +379,6 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
 	((unsigned char *)&addr)[3]
 #define NIPQUAD_FMT "%u.%u.%u.%u"
 
-#if defined(__LITTLE_ENDIAN)
-#define HIPQUAD(addr) \
-	((unsigned char *)&addr)[3], \
-	((unsigned char *)&addr)[2], \
-	((unsigned char *)&addr)[1], \
-	((unsigned char *)&addr)[0]
-#elif defined(__BIG_ENDIAN)
-#define HIPQUAD	NIPQUAD
-#else
-#error "Please fix asm/byteorder.h"
-#endif /* __LITTLE_ENDIAN */
-
 /*
  * min()/max()/clamp() macros that also do
  * strict type-checking.. See the
-- 
cgit v1.2.3


From ee76db5e9e9896312f001790855a798472440328 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 29 Mar 2009 01:19:37 -0700
Subject: gianfar: Fix use-after-of_node_put() in gfar_of_init().

We can't put 'mdio' until after we've used it in the
fsl_pq_mdio_bus_name() call.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/gianfar.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 6a38800be3f..65f55877be9 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -289,9 +289,9 @@ static int gfar_of_init(struct net_device *dev)
 		id = of_get_property(phy, "reg", NULL);
 
 		of_node_put(phy);
-		of_node_put(mdio);
 
 		fsl_pq_mdio_bus_name(bus_name, mdio);
+		of_node_put(mdio);
 		snprintf(priv->phy_bus_id, sizeof(priv->phy_bus_id), "%s:%02x",
 				bus_name, *id);
 	}
-- 
cgit v1.2.3


From 129dd9677b30a07bb832247dfe8d6089f1ac61a0 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 29 Mar 2009 01:20:18 -0700
Subject: ucc_geth: Fix use-after-of_node_put() in ucc_geth_probe().

We can't put 'mdio' until after we've used it in the
fsl_pq_mdio_bus_name() call.

Also fix error return values.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ucc_geth.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c
index 86a479f61c0..933fcfbf35e 100644
--- a/drivers/net/ucc_geth.c
+++ b/drivers/net/ucc_geth.c
@@ -3648,15 +3648,16 @@ static int ucc_geth_probe(struct of_device* ofdev, const struct of_device_id *ma
 		mdio = of_get_parent(phy);
 
 		if (mdio == NULL)
-			return -1;
+			return -ENODEV;
 
 		err = of_address_to_resource(mdio, 0, &res);
-		of_node_put(mdio);
-
-		if (err)
-			return -1;
 
+		if (err) {
+			of_node_put(mdio);
+			return err;
+		}
 		fsl_pq_mdio_bus_name(bus_name, mdio);
+		of_node_put(mdio);
 		snprintf(ug_info->phy_bus_id, sizeof(ug_info->phy_bus_id),
 			"%s:%02x", bus_name, *prop);
 	}
-- 
cgit v1.2.3


From 4099e01224e2afcaeea439cd92db3e7cf6e0f84f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 29 Mar 2009 01:39:41 -0700
Subject: niu: Add GRO support.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/niu.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index 50c11126a3d..02c37e2f08a 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -3441,7 +3441,8 @@ static int niu_rx_pkt_ignore(struct niu *np, struct rx_ring_info *rp)
 	return num_rcr;
 }
 
-static int niu_process_rx_pkt(struct niu *np, struct rx_ring_info *rp)
+static int niu_process_rx_pkt(struct napi_struct *napi, struct niu *np,
+			      struct rx_ring_info *rp)
 {
 	unsigned int index = rp->rcr_index;
 	struct sk_buff *skb;
@@ -3518,7 +3519,7 @@ static int niu_process_rx_pkt(struct niu *np, struct rx_ring_info *rp)
 
 	skb->protocol = eth_type_trans(skb, np->dev);
 	skb_record_rx_queue(skb, rp->rx_channel);
-	netif_receive_skb(skb);
+	napi_gro_receive(napi, skb);
 
 	return num_rcr;
 }
@@ -3706,7 +3707,8 @@ static inline void niu_sync_rx_discard_stats(struct niu *np,
 	}
 }
 
-static int niu_rx_work(struct niu *np, struct rx_ring_info *rp, int budget)
+static int niu_rx_work(struct napi_struct *napi, struct niu *np,
+		       struct rx_ring_info *rp, int budget)
 {
 	int qlen, rcr_done = 0, work_done = 0;
 	struct rxdma_mailbox *mbox = rp->mbox;
@@ -3728,7 +3730,7 @@ static int niu_rx_work(struct niu *np, struct rx_ring_info *rp, int budget)
 	rcr_done = work_done = 0;
 	qlen = min(qlen, budget);
 	while (work_done < qlen) {
-		rcr_done += niu_process_rx_pkt(np, rp);
+		rcr_done += niu_process_rx_pkt(napi, np, rp);
 		work_done++;
 	}
 
@@ -3776,7 +3778,7 @@ static int niu_poll_core(struct niu *np, struct niu_ldg *lp, int budget)
 		if (rx_vec & (1 << rp->rx_channel)) {
 			int this_work_done;
 
-			this_work_done = niu_rx_work(np, rp,
+			this_work_done = niu_rx_work(&lp->napi, np, rp,
 						     budget);
 
 			budget -= this_work_done;
-- 
cgit v1.2.3


From 1383bdb98c01bbd28d72336d1bf614ce79114d29 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 29 Mar 2009 01:39:49 -0700
Subject: tg3: Add GRO support.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tg3.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index f7efcecc410..1205c2a2265 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -4392,7 +4392,7 @@ static void tg3_recycle_rx(struct tg3 *tp, u32 opaque_key,
 #if TG3_VLAN_TAG_USED
 static int tg3_vlan_rx(struct tg3 *tp, struct sk_buff *skb, u16 vlan_tag)
 {
-	return vlan_hwaccel_receive_skb(skb, tp->vlgrp, vlan_tag);
+	return vlan_gro_receive(&tp->napi, tp->vlgrp, vlan_tag, skb);
 }
 #endif
 
@@ -4539,7 +4539,7 @@ static int tg3_rx(struct tg3 *tp, int budget)
 				    desc->err_vlan & RXD_VLAN_MASK);
 		} else
 #endif
-			netif_receive_skb(skb);
+			napi_gro_receive(&tp->napi, skb);
 
 		received++;
 		budget--;
-- 
cgit v1.2.3


From 424b86a6bc9459a830e1e94e0e908f3ac1716b7e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 29 Mar 2009 13:46:01 -0700
Subject: netfilter: xtables: fix IPv6 dependency in the cluster match

This patch fixes a dependency with IPv6:

ERROR: "__ipv6_addr_type" [net/netfilter/xt_cluster.ko] undefined!

This patch adds a function that checks if the higher bits of the
address is 0xFF to identify a multicast address, instead of adding a
dependency due to __ipv6_addr_type(). I came up with this idea after
Patrick McHardy pointed possible problems with runtime module
dependencies.

Reported-by: Steven Noonan <steven@uplinklabs.net>
Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Reported-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/xt_cluster.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index ad5bd890e4e..6c4847662b8 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -57,6 +57,13 @@ xt_cluster_hash(const struct nf_conn *ct,
 	return (((u64)hash * info->total_nodes) >> 32);
 }
 
+static inline bool
+xt_cluster_ipv6_is_multicast(const struct in6_addr *addr)
+{
+	__be32 st = addr->s6_addr32[0];
+	return ((st & htonl(0xFF000000)) == htonl(0xFF000000));
+}
+
 static inline bool
 xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family)
 {
@@ -67,8 +74,8 @@ xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family)
 		is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr);
 		break;
 	case NFPROTO_IPV6:
-		is_multicast = ipv6_addr_type(&ipv6_hdr(skb)->daddr) &
-						IPV6_ADDR_MULTICAST;
+		is_multicast =
+			xt_cluster_ipv6_is_multicast(&ipv6_hdr(skb)->daddr);
 		break;
 	default:
 		WARN_ON(1);
-- 
cgit v1.2.3


From 321dee6e8b235c496f0a068a72d8df9a4e13ceb9 Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Sun, 29 Mar 2009 13:52:21 -0700
Subject: wireless: remove duplicated .ndo_set_mac_address

Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wireless/airo.c               | 2 --
 drivers/net/wireless/ipw2x00/ipw2200.c    | 1 -
 drivers/net/wireless/prism54/islpci_dev.c | 1 -
 drivers/net/wireless/zd1201.c             | 1 -
 4 files changed, 5 deletions(-)

diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index 7e80aba8a14..93302c0a36b 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -2752,7 +2752,6 @@ static const struct net_device_ops airo_netdev_ops = {
 	.ndo_set_mac_address	= airo_set_mac_address,
 	.ndo_do_ioctl		= airo_ioctl,
 	.ndo_change_mtu		= airo_change_mtu,
-	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
@@ -2765,7 +2764,6 @@ static const struct net_device_ops mpi_netdev_ops = {
 	.ndo_set_mac_address	= airo_set_mac_address,
 	.ndo_do_ioctl		= airo_ioctl,
 	.ndo_change_mtu		= airo_change_mtu,
-	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c
index b3449948a25..4a92af1d787 100644
--- a/drivers/net/wireless/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/ipw2x00/ipw2200.c
@@ -11593,7 +11593,6 @@ static const struct net_device_ops ipw_netdev_ops = {
 	.ndo_set_mac_address	= ipw_net_set_mac_address,
 	.ndo_start_xmit		= ieee80211_xmit,
 	.ndo_change_mtu		= ieee80211_change_mtu,
-	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
diff --git a/drivers/net/wireless/prism54/islpci_dev.c b/drivers/net/wireless/prism54/islpci_dev.c
index 166ed958460..e26d7b3ceab 100644
--- a/drivers/net/wireless/prism54/islpci_dev.c
+++ b/drivers/net/wireless/prism54/islpci_dev.c
@@ -803,7 +803,6 @@ static const struct net_device_ops islpci_netdev_ops = {
 	.ndo_tx_timeout		= islpci_eth_tx_timeout,
 	.ndo_set_mac_address 	= prism54_set_mac_address,
 	.ndo_change_mtu		= eth_change_mtu,
-	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
diff --git a/drivers/net/wireless/zd1201.c b/drivers/net/wireless/zd1201.c
index 9b244c96b22..5fabd9c0f07 100644
--- a/drivers/net/wireless/zd1201.c
+++ b/drivers/net/wireless/zd1201.c
@@ -1725,7 +1725,6 @@ static const struct net_device_ops zd1201_netdev_ops = {
 	.ndo_set_multicast_list = zd1201_set_multicast,
 	.ndo_set_mac_address	= zd1201_set_mac_address,
 	.ndo_change_mtu		= eth_change_mtu,
-	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
-- 
cgit v1.2.3


From ffaba674090f287afe0c44fd8d978c64c03581a8 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 29 Mar 2009 15:40:33 -0700
Subject: sparc64: Fix reset hangs on Niagara systems.

Hypervisor versions older than version 1.6.1 cannot handle
leaving the profile counter overflow interrupt chirping
when the system does a soft reset.

So use a reboot notifier to shut off the NMI watchdog.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/nmi.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index f3577223c86..2c0cc72d295 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/kernel_stat.h>
+#include <linux/reboot.h>
 #include <linux/slab.h>
 #include <linux/kdebug.h>
 #include <linux/delay.h>
@@ -206,13 +207,33 @@ void nmi_adjust_hz(unsigned int new_hz)
 }
 EXPORT_SYMBOL_GPL(nmi_adjust_hz);
 
+static int nmi_shutdown(struct notifier_block *nb, unsigned long cmd, void *p)
+{
+	on_each_cpu(stop_watchdog, NULL, 1);
+	return 0;
+}
+
+static struct notifier_block nmi_reboot_notifier = {
+	.notifier_call = nmi_shutdown,
+};
+
 int __init nmi_init(void)
 {
+	int err;
+
 	nmi_usable = 1;
 
 	on_each_cpu(start_watchdog, NULL, 1);
 
-	return check_nmi_watchdog();
+	err = check_nmi_watchdog();
+	if (!err) {
+		err = register_reboot_notifier(&nmi_reboot_notifier);
+		if (err) {
+			nmi_usable = 0;
+			on_each_cpu(stop_watchdog, NULL, 1);
+		}
+	}
+	return err;
 }
 
 static int __init setup_nmi_watchdog(char *str)
-- 
cgit v1.2.3


From 3a35ce7dcefe9e80a00603a195269fbaf6e7d901 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 22 Jan 2009 16:42:57 +0100
Subject: virtio: fix BAD_RING, START_US and END_USE macros

Impact: cleanup

fix BAD_RING, START_US and END_USE macros

When these macros aren't called with a variable named vq as first
argument, this would result in a build failure.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_ring.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 5777196bf6c..12273bf9b3b 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -23,15 +23,15 @@
 
 #ifdef DEBUG
 /* For development, we want to crash whenever the ring is screwed. */
-#define BAD_RING(vq, fmt...)			\
-	do { dev_err(&vq->vq.vdev->dev, fmt); BUG(); } while(0)
-#define START_USE(vq) \
-	do { if ((vq)->in_use) panic("in_use = %i\n", (vq)->in_use); (vq)->in_use = __LINE__; mb(); } while(0)
-#define END_USE(vq) \
-	do { BUG_ON(!(vq)->in_use); (vq)->in_use = 0; mb(); } while(0)
+#define BAD_RING(_vq, fmt...)			\
+	do { dev_err(&_vq->vq.vdev->dev, fmt); BUG(); } while(0)
+#define START_USE(_vq) \
+	do { if ((_vq)->in_use) panic("in_use = %i\n", (_vq)->in_use); (_vq)->in_use = __LINE__; mb(); } while(0)
+#define END_USE(_vq) \
+	do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; mb(); } while(0)
 #else
-#define BAD_RING(vq, fmt...)			\
-	do { dev_err(&vq->vq.vdev->dev, fmt); (vq)->broken = true; } while(0)
+#define BAD_RING(_vq, fmt...)			\
+	do { dev_err(&_vq->vq.vdev->dev, fmt); (_vq)->broken = true; } while(0)
 #define START_USE(vq)
 #define END_USE(vq)
 #endif
-- 
cgit v1.2.3


From c5f841f1780dad7efb7eca092f60742d47f47d25 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 21:55:22 -0600
Subject: virtio: more neatening of virtio_ring macros.

Impact: cleanup

Roel Kluin drew attention to these macros with his patch: here I
neaten them a little further:
1) Add a comment on what START_USE and END_USE are checking,
2) Brackets around _vq in BAD_RING,
3) Neaten formatting for START_USE so it's less than 80 cols.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_ring.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 12273bf9b3b..5c52369ab9b 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -24,9 +24,15 @@
 #ifdef DEBUG
 /* For development, we want to crash whenever the ring is screwed. */
 #define BAD_RING(_vq, fmt...)			\
-	do { dev_err(&_vq->vq.vdev->dev, fmt); BUG(); } while(0)
-#define START_USE(_vq) \
-	do { if ((_vq)->in_use) panic("in_use = %i\n", (_vq)->in_use); (_vq)->in_use = __LINE__; mb(); } while(0)
+	do { dev_err(&(_vq)->vq.vdev->dev, fmt); BUG(); } while(0)
+/* Caller is supposed to guarantee no reentry. */
+#define START_USE(_vq)						\
+	do {							\
+		if ((_vq)->in_use)				\
+			panic("in_use = %i\n", (_vq)->in_use);	\
+		(_vq)->in_use = __LINE__;			\
+		mb();						\
+	} while(0)
 #define END_USE(_vq) \
 	do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; mb(); } while(0)
 #else
-- 
cgit v1.2.3


From 6afbdd059c27330eccbd85943354f94c2b83a7fe Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 21:55:23 -0600
Subject: lguest: fix spurious BUG_ON() on invalid guest stack.

Impact: fix crash on misbehaving guest

gpte_addr() contains a BUG_ON(), insisting that the present flag is
set.  We need to return before we call it if that isn't the case.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: stable@kernel.org
---
 drivers/lguest/page_tables.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 576a8318221..82ff484bd8c 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -373,8 +373,10 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
-	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+	if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) {
 		kill_guest(cpu, "Bad address %#lx", vaddr);
+		return -1UL;
+	}
 
 	gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t);
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
-- 
cgit v1.2.3


From b7ff99ea53cd16de8f6166c0e98f19a7c6ca67ee Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 21:55:23 -0600
Subject: lguest: wire up pte_update/pte_update_defer

Impact: intermittent guest segv/crash fix

I've been seeing random guest bad address crashes and segmentation faults:
bisect led to 4f98a2fee8 (vmscan: split LRU lists into anon & file sets),
but that's a red herring.

It turns out that lguest never hooked up the pte_update/pte_update_defer
calls, so our ptes were not always in sync.  After the vmscan commit, the
bug became reproducible; now a fsck in a 64MB guest causes reproducible
pagetable corruption.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: jeremy@xensource.com
Cc: virtualization@lists.osdl.org
Cc: stable@kernel.org
---
 arch/x86/lguest/boot.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9fe4ddaa8f6..7822d29b02c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -490,11 +490,17 @@ static void lguest_write_cr4(unsigned long val)
  * into a process' address space.  We set the entry then tell the Host the
  * toplevel and address this corresponds to.  The Guest uses one pagetable per
  * process, so we need to tell the Host which one we're changing (mm->pgd). */
+static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
+			       pte_t *ptep)
+{
+	lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+}
+
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pteval)
 {
 	*ptep = pteval;
-	lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
+	lguest_pte_update(mm, addr, ptep);
 }
 
 /* The Guest calls this to set a top-level entry.  Again, we set the entry then
@@ -1040,6 +1046,8 @@ __init void lguest_init(void)
 	pv_mmu_ops.read_cr3 = lguest_read_cr3;
 	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
 	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+	pv_mmu_ops.pte_update = lguest_pte_update;
+	pv_mmu_ops.pte_update_defer = lguest_pte_update;
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* apic read/write intercepts */
-- 
cgit v1.2.3


From 4cd8b5e2a159f18a1507f1187b44a1acbfa6341b Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Sat, 14 Mar 2009 13:37:52 -0200
Subject: lguest: use KVM hypercalls

Impact: cleanup

This patch allow us to use KVM hypercalls

Signed-off-by: Matias Zabaljauregui <zabaljauregui at gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/include/asm/lguest_hcall.h   | 24 ++---------
 arch/x86/lguest/boot.c                | 78 ++++++++++++++++++++++-------------
 arch/x86/lguest/i386_head.S           |  4 +-
 drivers/lguest/interrupts_and_traps.c |  7 ++--
 drivers/lguest/lguest_device.c        |  4 +-
 drivers/lguest/x86/core.c             | 62 +++++++++++++++++++++++++++-
 6 files changed, 122 insertions(+), 57 deletions(-)

diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 43894428c3c..0f4ee7148af 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -26,36 +26,20 @@
 
 #ifndef __ASSEMBLY__
 #include <asm/hw_irq.h>
+#include <asm/kvm_para.h>
 
 /*G:031 But first, how does our Guest contact the Host to ask for privileged
  * operations?  There are two ways: the direct way is to make a "hypercall",
  * to make requests of the Host Itself.
  *
- * Our hypercall mechanism uses the highest unused trap code (traps 32 and
- * above are used by real hardware interrupts).  Fifteen hypercalls are
+ * We use the KVM hypercall mechanism. Eighteen hypercalls are
  * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %edx, %ebx and %ecx.  If a return
+ * arguments (when required) are placed in %ebx, %ecx and %edx.  If a return
  * value makes sense, it's returned in %eax.
  *
  * Grossly invalid calls result in Sudden Death at the hands of the vengeful
  * Host, rather than returning failure.  This reflects Winston Churchill's
  * definition of a gentleman: "someone who is only rude intentionally". */
-static inline unsigned long
-hcall(unsigned long call,
-      unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
-	/* "int" is the Intel instruction to trigger a trap. */
-	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
-		     /* The call in %eax (aka "a") might be overwritten */
-		     : "=a"(call)
-		       /* The arguments are in %eax, %edx, %ebx & %ecx */
-		     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
-		       /* "memory" means this might write somewhere in memory.
-			* This isn't true for all calls, but it's safe to tell
-			* gcc that it might happen so it doesn't get clever. */
-		     : "memory");
-	return call;
-}
 /*:*/
 
 /* Can't use our min() macro here: needs to be a constant */
@@ -64,7 +48,7 @@ hcall(unsigned long call,
 #define LHCALL_RING_SIZE 64
 struct hcall_args {
 	/* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
-	unsigned long arg0, arg2, arg3, arg1;
+	unsigned long arg0, arg1, arg2, arg3;
 };
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7822d29b02c..5be9293961b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -107,7 +107,7 @@ static void async_hcall(unsigned long call, unsigned long arg1,
 	local_irq_save(flags);
 	if (lguest_data.hcall_status[next_call] != 0xFF) {
 		/* Table full, so do normal hcall which will flush table. */
-		hcall(call, arg1, arg2, arg3);
+		kvm_hypercall3(call, arg1, arg2, arg3);
 	} else {
 		lguest_data.hcalls[next_call].arg0 = call;
 		lguest_data.hcalls[next_call].arg1 = arg1;
@@ -134,13 +134,32 @@ static void async_hcall(unsigned long call, unsigned long arg1,
  *
  * So, when we're in lazy mode, we call async_hcall() to store the call for
  * future processing: */
-static void lazy_hcall(unsigned long call,
+static void lazy_hcall1(unsigned long call,
+		       unsigned long arg1)
+{
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+		kvm_hypercall1(call, arg1);
+	else
+		async_hcall(call, arg1, 0, 0);
+}
+
+static void lazy_hcall2(unsigned long call,
+		       unsigned long arg1,
+		       unsigned long arg2)
+{
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+		kvm_hypercall2(call, arg1, arg2);
+	else
+		async_hcall(call, arg1, arg2, 0);
+}
+
+static void lazy_hcall3(unsigned long call,
 		       unsigned long arg1,
 		       unsigned long arg2,
 		       unsigned long arg3)
 {
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, arg3);
+		kvm_hypercall3(call, arg1, arg2, arg3);
 	else
 		async_hcall(call, arg1, arg2, arg3);
 }
@@ -150,7 +169,7 @@ static void lazy_hcall(unsigned long call,
 static void lguest_leave_lazy_mode(void)
 {
 	paravirt_leave_lazy(paravirt_get_lazy_mode());
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+	kvm_hypercall0(LHCALL_FLUSH_ASYNC);
 }
 
 /*G:033
@@ -229,7 +248,7 @@ static void lguest_write_idt_entry(gate_desc *dt,
 	/* Keep the local copy up to date. */
 	native_write_idt_entry(dt, entrynum, g);
 	/* Tell Host about this new entry. */
-	hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
+	kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
 }
 
 /* Changing to a different IDT is very rare: we keep the IDT up-to-date every
@@ -241,7 +260,7 @@ static void lguest_load_idt(const struct desc_ptr *desc)
 	struct desc_struct *idt = (void *)desc->address;
 
 	for (i = 0; i < (desc->size+1)/8; i++)
-		hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
+		kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
 }
 
 /*
@@ -261,8 +280,8 @@ static void lguest_load_idt(const struct desc_ptr *desc)
  */
 static void lguest_load_gdt(const struct desc_ptr *desc)
 {
-	BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
-	hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
+	BUG_ON((desc->size + 1) / 8 != GDT_ENTRIES);
+	kvm_hypercall2(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES);
 }
 
 /* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
@@ -272,7 +291,7 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
 				   const void *desc, int type)
 {
 	native_write_gdt_entry(dt, entrynum, desc, type);
-	hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+	kvm_hypercall2(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES);
 }
 
 /* OK, I lied.  There are three "thread local storage" GDT entries which change
@@ -284,7 +303,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
 	 * can't handle us removing entries we're currently using.  So we clear
 	 * the GS register here: if it's needed it'll be reloaded anyway. */
 	lazy_load_gs(0);
-	lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
+	lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
 }
 
 /*G:038 That's enough excitement for now, back to ploughing through each of
@@ -382,7 +401,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 static unsigned long current_cr0;
 static void lguest_write_cr0(unsigned long val)
 {
-	lazy_hcall(LHCALL_TS, val & X86_CR0_TS, 0, 0);
+	lazy_hcall1(LHCALL_TS, val & X86_CR0_TS);
 	current_cr0 = val;
 }
 
@@ -396,7 +415,7 @@ static unsigned long lguest_read_cr0(void)
  * the vowels have been optimized out. */
 static void lguest_clts(void)
 {
-	lazy_hcall(LHCALL_TS, 0, 0, 0);
+	lazy_hcall1(LHCALL_TS, 0);
 	current_cr0 &= ~X86_CR0_TS;
 }
 
@@ -418,7 +437,7 @@ static bool cr3_changed = false;
 static void lguest_write_cr3(unsigned long cr3)
 {
 	lguest_data.pgdir = cr3;
-	lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
 	cr3_changed = true;
 }
 
@@ -493,7 +512,7 @@ static void lguest_write_cr4(unsigned long val)
 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
 			       pte_t *ptep)
 {
-	lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+	lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
 }
 
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -509,8 +528,8 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
 	*pmdp = pmdval;
-	lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
-		   (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
+	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
+		   (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
 }
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
@@ -526,7 +545,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 {
 	*ptep = pteval;
 	if (cr3_changed)
-		lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
 /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
@@ -542,7 +561,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 static void lguest_flush_tlb_single(unsigned long addr)
 {
 	/* Simply set it to zero: if it was not, it will fault back in. */
-	lazy_hcall(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+	lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
 }
 
 /* This is what happens after the Guest has removed a large number of entries.
@@ -550,7 +569,7 @@ static void lguest_flush_tlb_single(unsigned long addr)
  * have changed, ie. virtual addresses below PAGE_OFFSET. */
 static void lguest_flush_tlb_user(void)
 {
-	lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
+	lazy_hcall1(LHCALL_FLUSH_TLB, 0);
 }
 
 /* This is called when the kernel page tables have changed.  That's not very
@@ -558,7 +577,7 @@ static void lguest_flush_tlb_user(void)
  * slow), so it's worth separating this from the user flushing above. */
 static void lguest_flush_tlb_kernel(void)
 {
-	lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+	lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
 /*
@@ -695,7 +714,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta,
 	}
 
 	/* Please wake us this far in the future. */
-	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
+	kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta);
 	return 0;
 }
 
@@ -706,7 +725,7 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode,
 	case CLOCK_EVT_MODE_UNUSED:
 	case CLOCK_EVT_MODE_SHUTDOWN:
 		/* A 0 argument shuts the clock down. */
-		hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0);
+		kvm_hypercall0(LHCALL_SET_CLOCKEVENT);
 		break;
 	case CLOCK_EVT_MODE_ONESHOT:
 		/* This is what we expect. */
@@ -781,8 +800,8 @@ static void lguest_time_init(void)
 static void lguest_load_sp0(struct tss_struct *tss,
 			    struct thread_struct *thread)
 {
-	lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0,
-		   THREAD_SIZE/PAGE_SIZE);
+	lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
+		   THREAD_SIZE / PAGE_SIZE);
 }
 
 /* Let's just say, I wouldn't do debugging under a Guest. */
@@ -855,7 +874,7 @@ static void set_lguest_basic_apic_ops(void)
 /* STOP!  Until an interrupt comes in. */
 static void lguest_safe_halt(void)
 {
-	hcall(LHCALL_HALT, 0, 0, 0);
+	kvm_hypercall0(LHCALL_HALT);
 }
 
 /* The SHUTDOWN hypercall takes a string to describe what's happening, and
@@ -865,7 +884,8 @@ static void lguest_safe_halt(void)
  * rather than virtual addresses, so we use __pa() here. */
 static void lguest_power_off(void)
 {
-	hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0);
+	kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
+					LGUEST_SHUTDOWN_POWEROFF);
 }
 
 /*
@@ -875,7 +895,7 @@ static void lguest_power_off(void)
  */
 static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
 {
-	hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0);
+	kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF);
 	/* The hcall won't return, but to keep gcc happy, we're "done". */
 	return NOTIFY_DONE;
 }
@@ -916,7 +936,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
 		len = sizeof(scratch) - 1;
 	scratch[len] = '\0';
 	memcpy(scratch, buf, len);
-	hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0);
+	kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch));
 
 	/* This routine returns the number of bytes actually written. */
 	return len;
@@ -926,7 +946,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
  * Launcher to reboot us. */
 static void lguest_restart(char *reason)
 {
-	hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
+	kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
 }
 
 /*G:050
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 10b9bd35a8f..f7954198947 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -27,8 +27,8 @@ ENTRY(lguest_entry)
 	/* We make the "initialization" hypercall now to tell the Host about
 	 * us, and also find out where it put our page tables. */
 	movl $LHCALL_LGUEST_INIT, %eax
-	movl $lguest_data - __PAGE_OFFSET, %edx
-	int $LGUEST_TRAP_ENTRY
+	movl $lguest_data - __PAGE_OFFSET, %ebx
+	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
 
 	/* Set up the initial stack so we can run C code. */
 	movl $(init_thread_union+THREAD_SIZE),%esp
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 415fab0125a..504091da173 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -288,9 +288,10 @@ static int direct_trap(unsigned int num)
 
 	/* The Host needs to see page faults (for shadow paging and to save the
 	 * fault address), general protection faults (in/out emulation) and
-	 * device not available (TS handling), and of course, the hypercall
-	 * trap. */
-	return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
+	 * device not available (TS handling), invalid opcode fault (kvm hcall),
+	 * and of course, the hypercall trap. */
+	return num != 14 && num != 13 && num != 7 &&
+			num != 6 && num != LGUEST_TRAP_ENTRY;
 }
 /*:*/
 
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 8132533d71f..df44d962626 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -161,7 +161,7 @@ static void set_status(struct virtio_device *vdev, u8 status)
 
 	/* We set the status. */
 	to_lgdev(vdev)->desc->status = status;
-	hcall(LHCALL_NOTIFY, (max_pfn<<PAGE_SHIFT) + offset, 0, 0);
+	kvm_hypercall1(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset);
 }
 
 static void lg_set_status(struct virtio_device *vdev, u8 status)
@@ -209,7 +209,7 @@ static void lg_notify(struct virtqueue *vq)
 	 * virtqueue structure. */
 	struct lguest_vq_info *lvq = vq->priv;
 
-	hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0);
+	kvm_hypercall1(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT);
 }
 
 /* An extern declaration inside a C file is bad form.  Don't do it. */
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index bf7942327bd..a6b717644be 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -290,6 +290,57 @@ static int emulate_insn(struct lg_cpu *cpu)
 	return 1;
 }
 
+/* Our hypercalls mechanism used to be based on direct software interrupts.
+ * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to
+ * change over to using kvm hypercalls.
+ *
+ * KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid
+ * opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be
+ * an *emulation approach*: if the fault was really produced by an hypercall
+ * (is_hypercall() does exactly this check), we can just call the corresponding
+ * hypercall host implementation function.
+ *
+ * But these invalid opcode faults are notably slower than software interrupts.
+ * So we implemented the *patching (or rewriting) approach*: every time we hit
+ * the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f"
+ * opcode, so next time the Guest calls this hypercall it will use the
+ * faster trap mechanism.
+ *
+ * Matias even benchmarked it to convince you: this shows the average cycle
+ * cost of a hypercall.  For each alternative solution mentioned above we've
+ * made 5 runs of the benchmark:
+ *
+ * 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898
+ * 2) emulation technique: 3410, 3681, 3466, 3392, 3780
+ * 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884
+ *
+ * One two-line function is worth a 20% hypercall speed boost!
+ */
+static void rewrite_hypercall(struct lg_cpu *cpu)
+{
+	/* This are the opcodes we use to patch the Guest.  The opcode for "int
+	 * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we
+	 * complete the sequence with a NOP (0x90). */
+	u8 insn[3] = {0xcd, 0x1f, 0x90};
+
+	__lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn));
+}
+
+static bool is_hypercall(struct lg_cpu *cpu)
+{
+	u8 insn[3];
+
+	/* This must be the Guest kernel trying to do something.
+	 * The bottom two bits of the CS segment register are the privilege
+	 * level. */
+	if ((cpu->regs->cs & 3) != GUEST_PL)
+		return false;
+
+	/* Is it a vmcall? */
+	__lgread(cpu, insn, guest_pa(cpu, cpu->regs->eip), sizeof(insn));
+	return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xc1;
+}
+
 /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
 void lguest_arch_handle_trap(struct lg_cpu *cpu)
 {
@@ -337,7 +388,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
 		break;
 	case 32 ... 255:
 		/* These values mean a real interrupt occurred, in which case
-		 * the Host handler has already been run.  We just do a
+		 * the Host handler has already been run. We just do a
 		 * friendly check if another process should now be run, then
 		 * return to run the Guest again */
 		cond_resched();
@@ -347,6 +398,15 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
 		 * up the pointer now to indicate a hypercall is pending. */
 		cpu->hcall = (struct hcall_args *)cpu->regs;
 		return;
+	case 6:
+		/* kvm hypercalls trigger an invalid opcode fault (6).
+		 * We need to check if ring == GUEST_PL and
+		 * faulting instruction == vmcall. */
+		if (is_hypercall(cpu)) {
+			rewrite_hypercall(cpu);
+			return;
+		}
+		break;
 	}
 
 	/* We didn't handle the trap, so it needs to go to the Guest. */
-- 
cgit v1.2.3


From df1693abc42e34bbc4351e179dbe66c28a94efb8 Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Wed, 18 Mar 2009 13:38:35 -0300
Subject: lguest: use bool instead of int

Impact: clean up

Rusty told me, some time ago, that he had become a fan of "bool".
So, here are some replacements.

Signed-off-by: Matias Zabaljauregui <zabaljauregui at gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c                 |  4 ++--
 drivers/lguest/interrupts_and_traps.c | 21 +++++++++++----------
 drivers/lguest/lg.h                   |  8 ++++----
 drivers/lguest/page_tables.c          | 18 +++++++++---------
 drivers/lguest/segments.c             |  2 +-
 5 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 60156dfdc60..4845fb3cf74 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -152,8 +152,8 @@ static void unmap_switcher(void)
  * code.  We have to check that the range is below the pfn_limit the Launcher
  * gave us.  We have to make sure that addr + len doesn't give us a false
  * positive by overflowing, too. */
-int lguest_address_ok(const struct lguest *lg,
-		      unsigned long addr, unsigned long len)
+bool lguest_address_ok(const struct lguest *lg,
+		       unsigned long addr, unsigned long len)
 {
 	return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
 }
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 504091da173..6e99adbe194 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -34,7 +34,7 @@ static int idt_type(u32 lo, u32 hi)
 }
 
 /* An IDT entry can't be used unless the "present" bit is set. */
-static int idt_present(u32 lo, u32 hi)
+static bool idt_present(u32 lo, u32 hi)
 {
 	return (hi & 0x8000);
 }
@@ -60,7 +60,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
  * We set up the stack just like the CPU does for a real interrupt, so it's
  * identical for the Guest (and the standard "iret" instruction will undo
  * it). */
-static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err)
+static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
+				bool has_err)
 {
 	unsigned long gstack, origstack;
 	u32 eflags, ss, irq_enable;
@@ -184,7 +185,7 @@ void maybe_do_interrupt(struct lg_cpu *cpu)
 		/* set_guest_interrupt() takes the interrupt descriptor and a
 		 * flag to say whether this interrupt pushes an error code onto
 		 * the stack as well: virtual interrupts never do. */
-		set_guest_interrupt(cpu, idt->a, idt->b, 0);
+		set_guest_interrupt(cpu, idt->a, idt->b, false);
 	}
 
 	/* Every time we deliver an interrupt, we update the timestamp in the
@@ -244,26 +245,26 @@ void free_interrupts(void)
 /*H:220 Now we've got the routines to deliver interrupts, delivering traps like
  * page fault is easy.  The only trick is that Intel decided that some traps
  * should have error codes: */
-static int has_err(unsigned int trap)
+static bool has_err(unsigned int trap)
 {
 	return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
 }
 
 /* deliver_trap() returns true if it could deliver the trap. */
-int deliver_trap(struct lg_cpu *cpu, unsigned int num)
+bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
 {
 	/* Trap numbers are always 8 bit, but we set an impossible trap number
 	 * for traps inside the Switcher, so check that here. */
 	if (num >= ARRAY_SIZE(cpu->arch.idt))
-		return 0;
+		return false;
 
 	/* Early on the Guest hasn't set the IDT entries (or maybe it put a
 	 * bogus one in): if we fail here, the Guest will be killed. */
 	if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
-		return 0;
+		return false;
 	set_guest_interrupt(cpu, cpu->arch.idt[num].a,
 			    cpu->arch.idt[num].b, has_err(num));
-	return 1;
+	return true;
 }
 
 /*H:250 Here's the hard part: returning to the Host every time a trap happens
@@ -279,12 +280,12 @@ int deliver_trap(struct lg_cpu *cpu, unsigned int num)
  *
  * This routine indicates if a particular trap number could be delivered
  * directly. */
-static int direct_trap(unsigned int num)
+static bool direct_trap(unsigned int num)
 {
 	/* Hardware interrupts don't go to the Guest at all (except system
 	 * call). */
 	if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
-		return 0;
+		return false;
 
 	/* The Host needs to see page faults (for shadow paging and to save the
 	 * fault address), general protection faults (in/out emulation) and
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index f2c641e0bdd..ac8a4a3741b 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -109,8 +109,8 @@ struct lguest
 extern struct mutex lguest_lock;
 
 /* core.c: */
-int lguest_address_ok(const struct lguest *lg,
-		      unsigned long addr, unsigned long len);
+bool lguest_address_ok(const struct lguest *lg,
+		       unsigned long addr, unsigned long len);
 void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
 void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
 
@@ -140,7 +140,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
 
 /* interrupts_and_traps.c: */
 void maybe_do_interrupt(struct lg_cpu *cpu);
-int deliver_trap(struct lg_cpu *cpu, unsigned int num);
+bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
 			  u32 low, u32 hi);
 void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
@@ -173,7 +173,7 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
 		   unsigned long vaddr, pte_t val);
 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
-int demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
+bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
 void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
 unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
 void page_table_guest_data_init(struct lg_cpu *cpu);
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 82ff484bd8c..a059cf9980f 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -199,7 +199,7 @@ static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
  *
  * If we fixed up the fault (ie. we mapped the address), this routine returns
  * true.  Otherwise, it was a real fault and we need to tell the Guest. */
-int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 {
 	pgd_t gpgd;
 	pgd_t *spgd;
@@ -211,7 +211,7 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
 	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-		return 0;
+		return false;
 
 	/* Now look at the matching shadow entry. */
 	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
@@ -222,7 +222,7 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		 * simple for this corner case. */
 		if (!ptepage) {
 			kill_guest(cpu, "out of memory allocating pte page");
-			return 0;
+			return false;
 		}
 		/* We check that the Guest pgd is OK. */
 		check_gpgd(cpu, gpgd);
@@ -238,16 +238,16 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
-		return 0;
+		return false;
 
 	/* Check they're not trying to write to a page the Guest wants
 	 * read-only (bit 2 of errcode == write). */
 	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
-		return 0;
+		return false;
 
 	/* User access to a kernel-only page? (bit 3 == user access) */
 	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
-		return 0;
+		return false;
 
 	/* Check that the Guest PTE flags are OK, and the page number is below
 	 * the pfn_limit (ie. not mapping the Launcher binary). */
@@ -283,7 +283,7 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	 * manipulated, the result returned and the code complete.  A small
 	 * delay and a trace of alliteration are the only indications the Guest
 	 * has that a page fault occurred at all. */
-	return 1;
+	return true;
 }
 
 /*H:360
@@ -296,7 +296,7 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
  *
  * This is a quick version which answers the question: is this virtual address
  * mapped by the shadow page tables, and is it writable? */
-static int page_writable(struct lg_cpu *cpu, unsigned long vaddr)
+static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
 {
 	pgd_t *spgd;
 	unsigned long flags;
@@ -304,7 +304,7 @@ static int page_writable(struct lg_cpu *cpu, unsigned long vaddr)
 	/* Look at the current top level entry: is it present? */
 	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
 	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
-		return 0;
+		return false;
 
 	/* Check the flags on the pte entry itself: it must be present and
 	 * writable. */
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index ec6aa3f1c36..4f15439b7f1 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -45,7 +45,7 @@
  * "Task State Segment" which controls all kinds of delicate things.  The
  * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
  * the Guest can't be trusted to deal with double faults. */
-static int ignored_gdt(unsigned int num)
+static bool ignored_gdt(unsigned int num)
 {
 	return (num == GDT_ENTRY_TSS
 		|| num == GDT_ENTRY_LGUEST_CS
-- 
cgit v1.2.3


From d1881d3192a3d3e8dc4f255b03187f4c36cb0617 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 21:55:25 -0600
Subject: lguest: barrier me harder

Impact: barrier correctness in example launcher

I doubt either lguest user will complain about performance.

Reported-by: Christoph Hellwig <hch@infradead.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index f2dbbf3bdea..d36fcc0f271 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1630,6 +1630,13 @@ static bool service_io(struct device *dev)
 		}
 	}
 
+	/* OK, so we noted that it was pretty poor to use an fdatasync as a
+	 * barrier.  But Christoph Hellwig points out that we need a sync
+	 * *afterwards* as well: "Barriers specify no reordering to the front
+	 * or the back."  And Jens Axboe confirmed it, so here we are: */
+	if (out->type & VIRTIO_BLK_T_BARRIER)
+		fdatasync(vblk->fd);
+
 	/* We can't trigger an IRQ, because we're not the Launcher.  It does
 	 * that when we tell it we're done. */
 	add_used(dev->vq, head, wlen);
-- 
cgit v1.2.3


From 1a2142afa5646ad5af44bbe1febaa5e0b7e71156 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:10 -0600
Subject: cpumask: remove dangerous CPU_MASK_ALL_PTR, &CPU_MASK_ALL

Impact: cleanup

(Thanks to Al Viro for reminding me of this, via Ingo)

CPU_MASK_ALL is the (deprecated) "all bits set" cpumask, defined as so:

	#define CPU_MASK_ALL (cpumask_t) { { ... } }

Taking the address of such a temporary is questionable at best,
unfortunately 321a8e9d (cpumask: add CPU_MASK_ALL_PTR macro) added
CPU_MASK_ALL_PTR:

	#define CPU_MASK_ALL_PTR (&CPU_MASK_ALL)

Which formalizes this practice.  One day gcc could bite us over this
usage (though we seem to have gotten away with it so far).

So replace everywhere which used &CPU_MASK_ALL or CPU_MASK_ALL_PTR
with the modern "cpu_all_mask" (a real const struct cpumask *).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Mike Travis <travis@sgi.com>
---
 init/main.c      | 2 +-
 kernel/kmod.c    | 2 +-
 kernel/kthread.c | 4 ++--
 mm/pdflush.c     | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/init/main.c b/init/main.c
index 6bf83afd654..1ac7ec78e60 100644
--- a/init/main.c
+++ b/init/main.c
@@ -842,7 +842,7 @@ static int __init kernel_init(void * unused)
 	/*
 	 * init can run on any cpu.
 	 */
-	set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
+	set_cpus_allowed_ptr(current, cpu_all_mask);
 	/*
 	 * Tell the world that we're going to be the grim
 	 * reaper of innocent orphaned children.
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a27a5f64443..f0c8f545180 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -167,7 +167,7 @@ static int ____call_usermodehelper(void *data)
 	}
 
 	/* We can run anywhere, unlike our parent keventd(). */
-	set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
+	set_cpus_allowed_ptr(current, cpu_all_mask);
 
 	/*
 	 * Our parent is keventd, which runs with elevated scheduling priority.
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fbc456f393..84bbadd4d02 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -110,7 +110,7 @@ static void create_kthread(struct kthread_create_info *create)
 		 */
 		sched_setscheduler(create->result, SCHED_NORMAL, &param);
 		set_user_nice(create->result, KTHREAD_NICE_LEVEL);
-		set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR);
+		set_cpus_allowed_ptr(create->result, cpu_all_mask);
 	}
 	complete(&create->done);
 }
@@ -240,7 +240,7 @@ int kthreadd(void *unused)
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
 	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
-	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
+	set_cpus_allowed_ptr(tsk, cpu_all_mask);
 
 	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
 
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 15de509b68f..118905e3d78 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -191,7 +191,7 @@ static int pdflush(void *dummy)
 
 	/*
 	 * Some configs put our parent kthread in a limited cpuset,
-	 * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL.
+	 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
 	 * Our needs are more modest - cut back to our cpusets cpus_allowed.
 	 * This is needed as pdflush's are dynamically created and destroyed.
 	 * The boottime pdflush's are easily placed w/o these 2 lines.
-- 
cgit v1.2.3


From af76aba00fdcfb21535c9f9872245d14097a4561 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:11 -0600
Subject: cpumask: fix seq_bitmap_*() functions.

1) seq_bitmap_list() should take a const.
2) All the seq_bitmap should use cpumask_bits().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 fs/seq_file.c            | 2 +-
 include/linux/seq_file.h | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/seq_file.c b/fs/seq_file.c
index a1a4cfe1921..7f40f30c55c 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -513,7 +513,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits,
 }
 EXPORT_SYMBOL(seq_bitmap);
 
-int seq_bitmap_list(struct seq_file *m, unsigned long *bits,
+int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
 		unsigned int nr_bits)
 {
 	if (m->count < m->size) {
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index f616f31576d..004f3b3342c 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -55,7 +55,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits,
 				   unsigned int nr_bits);
 static inline int seq_cpumask(struct seq_file *m, const struct cpumask *mask)
 {
-	return seq_bitmap(m, mask->bits, nr_cpu_ids);
+	return seq_bitmap(m, cpumask_bits(mask), nr_cpu_ids);
 }
 
 static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask)
@@ -63,12 +63,13 @@ static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask)
 	return seq_bitmap(m, mask->bits, MAX_NUMNODES);
 }
 
-int seq_bitmap_list(struct seq_file *m, unsigned long *bits,
+int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
 		unsigned int nr_bits);
 
-static inline int seq_cpumask_list(struct seq_file *m, cpumask_t *mask)
+static inline int seq_cpumask_list(struct seq_file *m,
+				   const struct cpumask *mask)
 {
-	return seq_bitmap_list(m, mask->bits, NR_CPUS);
+	return seq_bitmap_list(m, cpumask_bits(mask), nr_cpu_ids);
 }
 
 static inline int seq_nodemask_list(struct seq_file *m, nodemask_t *mask)
-- 
cgit v1.2.3


From 0451fb2ebc4f65c265bb51d71a2fc986ebf20218 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:11 -0600
Subject: cpumask: remove node_to_first_cpu

Everyone defines it, and only one person uses it
(arch/mips/sgi-ip27/ip27-nmi.c).  So just open code it there.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: linux-mips@linux-mips.org
---
 arch/ia64/include/asm/topology.h           |  5 -----
 arch/mips/include/asm/mach-ip27/topology.h |  1 -
 arch/mips/sgi-ip27/ip27-nmi.c              |  2 +-
 arch/powerpc/include/asm/topology.h        |  5 -----
 arch/sh/include/asm/topology.h             |  1 -
 arch/sparc/include/asm/topology_64.h       |  5 -----
 arch/x86/include/asm/topology.h            | 12 ------------
 include/asm-generic/topology.h             |  3 ---
 8 files changed, 1 insertion(+), 33 deletions(-)

diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 3193f4417e1..f260dcf2151 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -43,11 +43,6 @@
  */
 #define parent_node(nid) (nid)
 
-/*
- * Returns the number of the first CPU on Node 'node'.
- */
-#define node_to_first_cpu(node) (cpumask_first(cpumask_of_node(node)))
-
 /*
  * Determines the node for a given pci bus
  */
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 55d481569a1..07547231e07 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -26,7 +26,6 @@ extern struct cpuinfo_ip27 sn_cpu_info[NR_CPUS];
 #define parent_node(node)	(node)
 #define node_to_cpumask(node)	(hub_data(node)->h_cpus)
 #define cpumask_of_node(node)	(&hub_data(node)->h_cpus)
-#define node_to_first_cpu(node)	(cpumask_first(cpumask_of_node(node)))
 struct pci_bus;
 extern int pcibus_to_node(struct pci_bus *);
 
diff --git a/arch/mips/sgi-ip27/ip27-nmi.c b/arch/mips/sgi-ip27/ip27-nmi.c
index 64459e7d891..b174a51a162 100644
--- a/arch/mips/sgi-ip27/ip27-nmi.c
+++ b/arch/mips/sgi-ip27/ip27-nmi.c
@@ -219,7 +219,7 @@ cont_nmi_dump(void)
 		if (i == 1000) {
 			for_each_online_node(node)
 				if (NODEPDA(node)->dump_count == 0) {
-					cpu = node_to_first_cpu(node);
+					cpu = cpumask_first(cpumask_of_node(node));
 					for (n=0; n < CNODE_NUM_CPUS(node); cpu++, n++) {
 						CPUMASK_SETB(nmied_cpus, cpu);
 						/*
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 375258559ae..054a16d6808 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -24,11 +24,6 @@ static inline cpumask_t node_to_cpumask(int node)
 
 #define cpumask_of_node(node) (&numa_cpumask_lookup_table[node])
 
-static inline int node_to_first_cpu(int node)
-{
-	return cpumask_first(cpumask_of_node(node));
-}
-
 int of_node_to_nid(struct device_node *device);
 
 struct pci_bus;
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index 066f0fba590..a3f23954589 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -33,7 +33,6 @@
 
 #define node_to_cpumask(node)	((void)node, cpu_online_map)
 #define cpumask_of_node(node)	((void)node, cpu_online_mask)
-#define node_to_first_cpu(node)	((void)(node),0)
 
 #define pcibus_to_node(bus)	((void)(bus), -1)
 #define pcibus_to_cpumask(bus)	(pcibus_to_node(bus) == -1 ? \
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 5bc0b8fd637..2770f64fdc3 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -28,11 +28,6 @@ static inline cpumask_t node_to_cpumask(int node)
 #define node_to_cpumask_ptr_next(v, node)	\
 			   v = &(numa_cpumask_lookup_table[node])
 
-static inline int node_to_first_cpu(int node)
-{
-	return cpumask_first(cpumask_of_node(node));
-}
-
 struct pci_bus;
 #ifdef CONFIG_PCI
 extern int pcibus_to_node(struct pci_bus *pbus);
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 77cfb2cfb38..744299c0b77 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -217,10 +217,6 @@ static inline cpumask_t node_to_cpumask(int node)
 {
 	return cpu_online_map;
 }
-static inline int node_to_first_cpu(int node)
-{
-	return first_cpu(cpu_online_map);
-}
 
 static inline void setup_node_to_cpumask_map(void) { }
 
@@ -237,14 +233,6 @@ static inline void setup_node_to_cpumask_map(void) { }
 
 #include <asm-generic/topology.h>
 
-#ifdef CONFIG_NUMA
-/* Returns the number of the first CPU on Node 'node'. */
-static inline int node_to_first_cpu(int node)
-{
-	return cpumask_first(cpumask_of_node(node));
-}
-#endif
-
 extern cpumask_t cpu_coregroup_map(int cpu);
 extern const struct cpumask *cpu_coregroup_mask(int cpu);
 
diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h
index 0e9e2bc0ee9..47766b30061 100644
--- a/include/asm-generic/topology.h
+++ b/include/asm-generic/topology.h
@@ -43,9 +43,6 @@
 #ifndef cpumask_of_node
 #define cpumask_of_node(node)	((void)node, cpu_online_mask)
 #endif
-#ifndef node_to_first_cpu
-#define node_to_first_cpu(node)	((void)(node),0)
-#endif
 #ifndef pcibus_to_node
 #define pcibus_to_node(bus)	((void)(bus), -1)
 #endif
-- 
cgit v1.2.3


From 2b17fa506c418e9fb02bbbc7f426d2bbb5b247a6 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:12 -0600
Subject: cpumask: use set_cpu_active in init/main.c

cpu_active_map is deprecated in favor of cpu_active_mask, which is
const for safety: we use accessors now (set_cpu_active) is we really
want to make a change.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 init/main.c  | 3 +--
 kernel/cpu.c | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/init/main.c b/init/main.c
index 1ac7ec78e60..d6b388fbffa 100644
--- a/init/main.c
+++ b/init/main.c
@@ -407,8 +407,7 @@ static void __init smp_init(void)
 	 * Set up the current CPU as possible to migrate to.
 	 * The other ones will be done by cpu_up/cpu_down()
 	 */
-	cpu = smp_processor_id();
-	cpu_set(cpu, cpu_active_map);
+	set_cpu_active(smp_processor_id(), true);
 
 	/* FIXME: This should be done in userspace --RR */
 	for_each_present_cpu(cpu) {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 79e40f00dcb..395b6974dc8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -281,7 +281,7 @@ int __ref cpu_down(unsigned int cpu)
 		goto out;
 	}
 
-	cpu_clear(cpu, cpu_active_map);
+	set_cpu_active(cpu, false);
 
 	/*
 	 * Make sure the all cpus did the reschedule and are not
@@ -296,7 +296,7 @@ int __ref cpu_down(unsigned int cpu)
 	err = _cpu_down(cpu, 0);
 
 	if (cpu_online(cpu))
-		cpu_set(cpu, cpu_active_map);
+		set_cpu_active(cpu, true);
 
 out:
 	cpu_maps_update_done();
@@ -333,7 +333,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
-	cpu_set(cpu, cpu_active_map);
+	set_cpu_active(cpu, true);
 
 	/* Now call notifier in preparation. */
 	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
-- 
cgit v1.2.3


From 9489424454c93f4d225d7af47978f8c7e84bf4d4 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:12 -0600
Subject: cpumask: use mm_cpumask() wrapper: kernel/fork.c

Impact: futureproof

Makes code futureproof against the impending change to mm->cpu_vm_mask.

It's also a chance to use the new cpumask_ ops which take a pointer.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 6715ebc3761..47c15840a38 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -284,7 +284,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
-	cpus_clear(mm->cpu_vm_mask);
+	cpumask_clear(mm_cpumask(mm));
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
 	rb_parent = NULL;
-- 
cgit v1.2.3


From 1a8a51004a18b627ea81444201f7867875212f46 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:13 -0600
Subject: cpumask: remove references to struct irqaction's mask field.

Impact: cleanup

It's unused, since about 1995.  So remove all initialization of it in
preparation for actually removing the field.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/cris/arch-v10/kernel/time.c          | 1 -
 arch/cris/arch-v32/kernel/smp.c           | 1 -
 arch/cris/arch-v32/kernel/time.c          | 1 -
 arch/frv/kernel/irq-mb93091.c             | 4 ----
 arch/frv/kernel/irq-mb93093.c             | 1 -
 arch/frv/kernel/irq-mb93493.c             | 2 --
 arch/frv/kernel/time.c                    | 1 -
 arch/h8300/kernel/timer/itu.c             | 1 -
 arch/h8300/kernel/timer/timer16.c         | 1 -
 arch/h8300/kernel/timer/timer8.c          | 1 -
 arch/h8300/kernel/timer/tpu.c             | 1 -
 arch/m32r/kernel/time.c                   | 1 -
 arch/mips/cobalt/irq.c                    | 1 -
 arch/mips/emma/markeins/irq.c             | 1 -
 arch/mips/jazz/irq.c                      | 1 -
 arch/mips/kernel/cevt-bcm1480.c           | 1 -
 arch/mips/kernel/cevt-sb1250.c            | 1 -
 arch/mips/kernel/i8253.c                  | 2 --
 arch/mips/kernel/i8259.c                  | 1 -
 arch/mips/lasat/interrupt.c               | 1 -
 arch/mips/lemote/lm2e/irq.c               | 1 -
 arch/mips/sgi-ip32/ip32-irq.c             | 2 --
 arch/mips/sni/rm200.c                     | 3 ++-
 arch/mips/vr41xx/common/irq.c             | 1 -
 arch/mn10300/kernel/time.c                | 1 -
 arch/powerpc/platforms/85xx/mpc85xx_cds.c | 1 -
 arch/powerpc/platforms/8xx/m8xx_setup.c   | 1 -
 arch/powerpc/platforms/chrp/setup.c       | 1 -
 arch/powerpc/platforms/powermac/pic.c     | 2 --
 arch/powerpc/platforms/powermac/smp.c     | 1 -
 arch/powerpc/sysdev/cpm1.c                | 1 -
 arch/sh/kernel/time_64.c                  | 1 -
 arch/sh/kernel/timers/timer-cmt.c         | 1 -
 arch/sh/kernel/timers/timer-mtu2.c        | 1 -
 arch/sh/kernel/timers/timer-tmu.c         | 1 -
 arch/sparc/kernel/irq_32.c                | 2 --
 arch/sparc/kernel/sun4d_irq.c             | 1 -
 arch/x86/kernel/irqinit_32.c              | 2 --
 arch/x86/kernel/irqinit_64.c              | 1 -
 arch/x86/kernel/mfgpt_32.c                | 1 -
 arch/x86/kernel/setup.c                   | 1 -
 arch/x86/kernel/time_64.c                 | 2 --
 arch/x86/kernel/vmiclock_32.c             | 1 -
 43 files changed, 2 insertions(+), 53 deletions(-)

diff --git a/arch/cris/arch-v10/kernel/time.c b/arch/cris/arch-v10/kernel/time.c
index c685ba4c338..2b73c7a5b64 100644
--- a/arch/cris/arch-v10/kernel/time.c
+++ b/arch/cris/arch-v10/kernel/time.c
@@ -261,7 +261,6 @@ timer_interrupt(int irq, void *dev_id)
 static struct irqaction irq2  = {
 	.handler = timer_interrupt,
 	.flags = IRQF_SHARED | IRQF_DISABLED,
-	.mask = CPU_MASK_NONE,
 	.name = "timer",
 };
 
diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 9dac1733464..f59a973c97e 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -65,7 +65,6 @@ static int send_ipi(int vector, int wait, cpumask_t cpu_mask);
 static struct irqaction irq_ipi  = {
 	.handler = crisv32_ipi_interrupt,
 	.flags = IRQF_DISABLED,
-	.mask = CPU_MASK_NONE,
 	.name = "ipi",
 };
 
diff --git a/arch/cris/arch-v32/kernel/time.c b/arch/cris/arch-v32/kernel/time.c
index 3a13dd6e0a9..65633d0dab8 100644
--- a/arch/cris/arch-v32/kernel/time.c
+++ b/arch/cris/arch-v32/kernel/time.c
@@ -267,7 +267,6 @@ timer_interrupt(int irq, void *dev_id)
 static struct irqaction irq_timer = {
 	.handler = timer_interrupt,
 	.flags = IRQF_SHARED | IRQF_DISABLED,
-	.mask = CPU_MASK_NONE,
 	.name = "timer"
 };
 
diff --git a/arch/frv/kernel/irq-mb93091.c b/arch/frv/kernel/irq-mb93091.c
index 9e38f99bbab..4dd9adaf115 100644
--- a/arch/frv/kernel/irq-mb93091.c
+++ b/arch/frv/kernel/irq-mb93091.c
@@ -109,28 +109,24 @@ static struct irqaction fpga_irq[4]  = {
 	[0] = {
 		.handler	= fpga_interrupt,
 		.flags		= IRQF_DISABLED | IRQF_SHARED,
-		.mask		= CPU_MASK_NONE,
 		.name		= "fpga.0",
 		.dev_id		= (void *) 0x0028UL,
 	},
 	[1] = {
 		.handler	= fpga_interrupt,
 		.flags		= IRQF_DISABLED | IRQF_SHARED,
-		.mask		= CPU_MASK_NONE,
 		.name		= "fpga.1",
 		.dev_id		= (void *) 0x0050UL,
 	},
 	[2] = {
 		.handler	= fpga_interrupt,
 		.flags		= IRQF_DISABLED | IRQF_SHARED,
-		.mask		= CPU_MASK_NONE,
 		.name		= "fpga.2",
 		.dev_id		= (void *) 0x1c00UL,
 	},
 	[3] = {
 		.handler	= fpga_interrupt,
 		.flags		= IRQF_DISABLED | IRQF_SHARED,
-		.mask		= CPU_MASK_NONE,
 		.name		= "fpga.3",
 		.dev_id		= (void *) 0x6386UL,
 	}
diff --git a/arch/frv/kernel/irq-mb93093.c b/arch/frv/kernel/irq-mb93093.c
index 3c2752ca977..e4520903187 100644
--- a/arch/frv/kernel/irq-mb93093.c
+++ b/arch/frv/kernel/irq-mb93093.c
@@ -108,7 +108,6 @@ static struct irqaction fpga_irq[1]  = {
 	[0] = {
 		.handler	= fpga_interrupt,
 		.flags		= IRQF_DISABLED,
-		.mask		= CPU_MASK_NONE,
 		.name		= "fpga.0",
 		.dev_id		= (void *) 0x0700UL,
 	}
diff --git a/arch/frv/kernel/irq-mb93493.c b/arch/frv/kernel/irq-mb93493.c
index 7754c7338e4..ba55ecdfb24 100644
--- a/arch/frv/kernel/irq-mb93493.c
+++ b/arch/frv/kernel/irq-mb93493.c
@@ -120,14 +120,12 @@ static struct irqaction mb93493_irq[2]  = {
 	[0] = {
 		.handler	= mb93493_interrupt,
 		.flags		= IRQF_DISABLED | IRQF_SHARED,
-		.mask		= CPU_MASK_NONE,
 		.name		= "mb93493.0",
 		.dev_id		= (void *) __addr_MB93493_IQSR(0),
 	},
 	[1] = {
 		.handler	= mb93493_interrupt,
 		.flags		= IRQF_DISABLED | IRQF_SHARED,
-		.mask		= CPU_MASK_NONE,
 		.name		= "mb93493.1",
 		.dev_id		= (void *) __addr_MB93493_IQSR(1),
 	}
diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c
index 69f6a4ef5d6..fb0ce757722 100644
--- a/arch/frv/kernel/time.c
+++ b/arch/frv/kernel/time.c
@@ -45,7 +45,6 @@ static irqreturn_t timer_interrupt(int irq, void *dummy);
 static struct irqaction timer_irq  = {
 	.handler = timer_interrupt,
 	.flags = IRQF_DISABLED,
-	.mask = CPU_MASK_NONE,
 	.name = "timer",
 };
 
diff --git a/arch/h8300/kernel/timer/itu.c b/arch/h8300/kernel/timer/itu.c
index d1c926596b0..4883ba7103a 100644
--- a/arch/h8300/kernel/timer/itu.c
+++ b/arch/h8300/kernel/timer/itu.c
@@ -60,7 +60,6 @@ static struct irqaction itu_irq = {
 	.name		= "itu",
 	.handler	= timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_TIMER,
-	.mask		= CPU_MASK_NONE,
 };
 
 static const int __initdata divide_rate[] = {1, 2, 4, 8};
diff --git a/arch/h8300/kernel/timer/timer16.c b/arch/h8300/kernel/timer/timer16.c
index e14271b7211..042dbb53f3f 100644
--- a/arch/h8300/kernel/timer/timer16.c
+++ b/arch/h8300/kernel/timer/timer16.c
@@ -55,7 +55,6 @@ static struct irqaction timer16_irq = {
 	.name		= "timer-16",
 	.handler	= timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_TIMER,
-	.mask		= CPU_MASK_NONE,
 };
 
 static const int __initdata divide_rate[] = {1, 2, 4, 8};
diff --git a/arch/h8300/kernel/timer/timer8.c b/arch/h8300/kernel/timer/timer8.c
index 0556d7c7bea..38be0cabef0 100644
--- a/arch/h8300/kernel/timer/timer8.c
+++ b/arch/h8300/kernel/timer/timer8.c
@@ -75,7 +75,6 @@ static struct irqaction timer8_irq = {
 	.name		= "timer-8",
 	.handler	= timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_TIMER,
-	.mask		= CPU_MASK_NONE,
 };
 
 static const int __initdata divide_rate[] = {8, 64, 8192};
diff --git a/arch/h8300/kernel/timer/tpu.c b/arch/h8300/kernel/timer/tpu.c
index df7f453a967..ad383caae19 100644
--- a/arch/h8300/kernel/timer/tpu.c
+++ b/arch/h8300/kernel/timer/tpu.c
@@ -65,7 +65,6 @@ static struct irqaction tpu_irq = {
 	.name		= "tpu",
 	.handler	= timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_TIMER,
-	.mask		= CPU_MASK_NONE,
 };
 
 const static int __initdata divide_rate[] = {
diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c
index 6ea017727cc..cada3ba4b99 100644
--- a/arch/m32r/kernel/time.c
+++ b/arch/m32r/kernel/time.c
@@ -230,7 +230,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
 static struct irqaction irq0 = {
 	.handler = timer_interrupt,
 	.flags = IRQF_DISABLED,
-	.mask = CPU_MASK_NONE,
 	.name = "MFT2",
 };
 
diff --git a/arch/mips/cobalt/irq.c b/arch/mips/cobalt/irq.c
index ac4fb912649..cb9bf820fe5 100644
--- a/arch/mips/cobalt/irq.c
+++ b/arch/mips/cobalt/irq.c
@@ -47,7 +47,6 @@ asmlinkage void plat_irq_dispatch(void)
 
 static struct irqaction cascade = {
 	.handler	= no_action,
-	.mask		= CPU_MASK_NONE,
 	.name		= "cascade",
 };
 
diff --git a/arch/mips/emma/markeins/irq.c b/arch/mips/emma/markeins/irq.c
index c2583ecc93c..ff4e529fa69 100644
--- a/arch/mips/emma/markeins/irq.c
+++ b/arch/mips/emma/markeins/irq.c
@@ -194,7 +194,6 @@ void emma2rh_gpio_irq_init(void)
 static struct irqaction irq_cascade = {
 	   .handler = no_action,
 	   .flags = 0,
-	   .mask = CPU_MASK_NONE,
 	   .name = "cascade",
 	   .dev_id = NULL,
 	   .next = NULL,
diff --git a/arch/mips/jazz/irq.c b/arch/mips/jazz/irq.c
index 03965cb1b25..d9b6a5b5399 100644
--- a/arch/mips/jazz/irq.c
+++ b/arch/mips/jazz/irq.c
@@ -134,7 +134,6 @@ static irqreturn_t r4030_timer_interrupt(int irq, void *dev_id)
 static struct irqaction r4030_timer_irqaction = {
 	.handler	= r4030_timer_interrupt,
 	.flags		= IRQF_DISABLED,
-	.mask		= CPU_MASK_CPU0,
 	.name		= "R4030 timer",
 };
 
diff --git a/arch/mips/kernel/cevt-bcm1480.c b/arch/mips/kernel/cevt-bcm1480.c
index b820661678b..a5182a20769 100644
--- a/arch/mips/kernel/cevt-bcm1480.c
+++ b/arch/mips/kernel/cevt-bcm1480.c
@@ -144,7 +144,6 @@ void __cpuinit sb1480_clockevent_init(void)
 
 	action->handler	= sibyte_counter_handler;
 	action->flags	= IRQF_DISABLED | IRQF_PERCPU;
-	action->mask	= cpumask_of_cpu(cpu);
 	action->name	= name;
 	action->dev_id	= cd;
 
diff --git a/arch/mips/kernel/cevt-sb1250.c b/arch/mips/kernel/cevt-sb1250.c
index a2eebaafda5..340f53e5c6b 100644
--- a/arch/mips/kernel/cevt-sb1250.c
+++ b/arch/mips/kernel/cevt-sb1250.c
@@ -143,7 +143,6 @@ void __cpuinit sb1250_clockevent_init(void)
 
 	action->handler	= sibyte_counter_handler;
 	action->flags	= IRQF_DISABLED | IRQF_PERCPU;
-	action->mask	= cpumask_of_cpu(cpu);
 	action->name	= name;
 	action->dev_id	= cd;
 
diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c
index f4d187825f9..689719e34f0 100644
--- a/arch/mips/kernel/i8253.c
+++ b/arch/mips/kernel/i8253.c
@@ -98,7 +98,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
 static struct irqaction irq0  = {
 	.handler = timer_interrupt,
 	.flags = IRQF_DISABLED | IRQF_NOBALANCING,
-	.mask = CPU_MASK_NONE,
 	.name = "timer"
 };
 
@@ -121,7 +120,6 @@ void __init setup_pit_timer(void)
 	cd->min_delta_ns = clockevent_delta2ns(0xF, cd);
 	clockevents_register_device(cd);
 
-	irq0.mask = cpumask_of_cpu(cpu);
 	setup_irq(0, &irq0);
 }
 
diff --git a/arch/mips/kernel/i8259.c b/arch/mips/kernel/i8259.c
index 413bd1d37f5..01c0885a806 100644
--- a/arch/mips/kernel/i8259.c
+++ b/arch/mips/kernel/i8259.c
@@ -306,7 +306,6 @@ static void init_8259A(int auto_eoi)
  */
 static struct irqaction irq2 = {
 	.handler = no_action,
-	.mask = CPU_MASK_NONE,
 	.name = "cascade",
 };
 
diff --git a/arch/mips/lasat/interrupt.c b/arch/mips/lasat/interrupt.c
index d1ac7a25c85..1353fb135ed 100644
--- a/arch/mips/lasat/interrupt.c
+++ b/arch/mips/lasat/interrupt.c
@@ -104,7 +104,6 @@ asmlinkage void plat_irq_dispatch(void)
 
 static struct irqaction cascade = {
 	.handler	= no_action,
-	.mask		= CPU_MASK_NONE,
 	.name		= "cascade",
 };
 
diff --git a/arch/mips/lemote/lm2e/irq.c b/arch/mips/lemote/lm2e/irq.c
index 3e0b7beb100..1d0a09f3b83 100644
--- a/arch/mips/lemote/lm2e/irq.c
+++ b/arch/mips/lemote/lm2e/irq.c
@@ -92,7 +92,6 @@ asmlinkage void plat_irq_dispatch(void)
 
 static struct irqaction cascade_irqaction = {
 	.handler = no_action,
-	.mask = CPU_MASK_NONE,
 	.name = "cascade",
 };
 
diff --git a/arch/mips/sgi-ip32/ip32-irq.c b/arch/mips/sgi-ip32/ip32-irq.c
index 0d6b6663d5f..9cb28cd20ad 100644
--- a/arch/mips/sgi-ip32/ip32-irq.c
+++ b/arch/mips/sgi-ip32/ip32-irq.c
@@ -115,14 +115,12 @@ extern irqreturn_t crime_cpuerr_intr(int irq, void *dev_id);
 struct irqaction memerr_irq = {
 	.handler = crime_memerr_intr,
 	.flags = IRQF_DISABLED,
-	.mask = CPU_MASK_NONE,
 	.name = "CRIME memory error",
 };
 
 struct irqaction cpuerr_irq = {
 	.handler = crime_cpuerr_intr,
 	.flags = IRQF_DISABLED,
-	.mask = CPU_MASK_NONE,
 	.name = "CRIME CPU error",
 };
 
diff --git a/arch/mips/sni/rm200.c b/arch/mips/sni/rm200.c
index 5310aa75afa..a695a08c93f 100644
--- a/arch/mips/sni/rm200.c
+++ b/arch/mips/sni/rm200.c
@@ -359,7 +359,8 @@ void sni_rm200_init_8259A(void)
  * IRQ2 is cascade interrupt to second interrupt controller
  */
 static struct irqaction sni_rm200_irq2 = {
-	no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL
+	.handler = no_action,
+	.name = "cascade",
 };
 
 static struct resource sni_rm200_pic1_resource = {
diff --git a/arch/mips/vr41xx/common/irq.c b/arch/mips/vr41xx/common/irq.c
index 92dd1a0ca35..9cc389109b1 100644
--- a/arch/mips/vr41xx/common/irq.c
+++ b/arch/mips/vr41xx/common/irq.c
@@ -32,7 +32,6 @@ static irq_cascade_t irq_cascade[NR_IRQS] __cacheline_aligned;
 
 static struct irqaction cascade_irqaction = {
 	.handler	= no_action,
-	.mask		= CPU_MASK_NONE,
 	.name		= "cascade",
 };
 
diff --git a/arch/mn10300/kernel/time.c b/arch/mn10300/kernel/time.c
index e4606586f94..395caf01b90 100644
--- a/arch/mn10300/kernel/time.c
+++ b/arch/mn10300/kernel/time.c
@@ -37,7 +37,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id);
 static struct irqaction timer_irq = {
 	.handler	= timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_SHARED | IRQF_TIMER,
-	.mask		= CPU_MASK_NONE,
 	.name		= "timer",
 };
 
diff --git a/arch/powerpc/platforms/85xx/mpc85xx_cds.c b/arch/powerpc/platforms/85xx/mpc85xx_cds.c
index aeb6a5bc552..02fe215122f 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_cds.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_cds.c
@@ -179,7 +179,6 @@ static irqreturn_t mpc85xx_8259_cascade_action(int irq, void *dev_id)
 static struct irqaction mpc85xxcds_8259_irqaction = {
 	.handler = mpc85xx_8259_cascade_action,
 	.flags = IRQF_SHARED,
-	.mask = CPU_MASK_NONE,
 	.name = "8259 cascade",
 };
 #endif /* PPC_I8259 */
diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c
index 0d9f75c74f8..385acfc4839 100644
--- a/arch/powerpc/platforms/8xx/m8xx_setup.c
+++ b/arch/powerpc/platforms/8xx/m8xx_setup.c
@@ -44,7 +44,6 @@ static irqreturn_t timebase_interrupt(int irq, void *dev)
 
 static struct irqaction tbint_irqaction = {
 	.handler = timebase_interrupt,
-	.mask = CPU_MASK_NONE,
 	.name = "tbint",
 };
 
diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c
index 272d79a8d28..cd4ad9aea76 100644
--- a/arch/powerpc/platforms/chrp/setup.c
+++ b/arch/powerpc/platforms/chrp/setup.c
@@ -472,7 +472,6 @@ static void __init chrp_find_openpic(void)
 #if defined(CONFIG_VT) && defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_XMON)
 static struct irqaction xmon_irqaction = {
 	.handler = xmon_irq,
-	.mask = CPU_MASK_NONE,
 	.name = "XMON break",
 };
 #endif
diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index 6d149ae8ffa..7039d8f1d3b 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -266,7 +266,6 @@ static unsigned int pmac_pic_get_irq(void)
 static struct irqaction xmon_action = {
 	.handler	= xmon_irq,
 	.flags		= 0,
-	.mask		= CPU_MASK_NONE,
 	.name		= "NMI - XMON"
 };
 #endif
@@ -274,7 +273,6 @@ static struct irqaction xmon_action = {
 static struct irqaction gatwick_cascade_action = {
 	.handler	= gatwick_action,
 	.flags		= IRQF_DISABLED,
-	.mask		= CPU_MASK_NONE,
 	.name		= "cascade",
 };
 
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index bd8817b00fa..cf1dbe75889 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -385,7 +385,6 @@ static void __init psurge_dual_sync_tb(int cpu_nr)
 static struct irqaction psurge_irqaction = {
 	.handler = psurge_primary_intr,
 	.flags = IRQF_DISABLED,
-	.mask = CPU_MASK_NONE,
 	.name = "primary IPI",
 };
 
diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c
index 490473ce810..82424cd7e12 100644
--- a/arch/powerpc/sysdev/cpm1.c
+++ b/arch/powerpc/sysdev/cpm1.c
@@ -119,7 +119,6 @@ static irqreturn_t cpm_error_interrupt(int irq, void *dev)
 
 static struct irqaction cpm_error_irqaction = {
 	.handler = cpm_error_interrupt,
-	.mask = CPU_MASK_NONE,
 	.name = "error",
 };
 
diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c
index 59d2a03e8b3..988c77c3723 100644
--- a/arch/sh/kernel/time_64.c
+++ b/arch/sh/kernel/time_64.c
@@ -284,7 +284,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
 static struct irqaction irq0  = {
 	.handler = timer_interrupt,
 	.flags = IRQF_DISABLED,
-	.mask = CPU_MASK_NONE,
 	.name = "timer",
 };
 
diff --git a/arch/sh/kernel/timers/timer-cmt.c b/arch/sh/kernel/timers/timer-cmt.c
index c127293271e..9aa348658ae 100644
--- a/arch/sh/kernel/timers/timer-cmt.c
+++ b/arch/sh/kernel/timers/timer-cmt.c
@@ -109,7 +109,6 @@ static struct irqaction cmt_irq = {
 	.name		= "timer",
 	.handler	= cmt_timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
-	.mask		= CPU_MASK_NONE,
 };
 
 static void cmt_clk_init(struct clk *clk)
diff --git a/arch/sh/kernel/timers/timer-mtu2.c b/arch/sh/kernel/timers/timer-mtu2.c
index 9a77ae86b40..9b0ef012647 100644
--- a/arch/sh/kernel/timers/timer-mtu2.c
+++ b/arch/sh/kernel/timers/timer-mtu2.c
@@ -115,7 +115,6 @@ static struct irqaction mtu2_irq = {
 	.name		= "timer",
 	.handler	= mtu2_timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
-	.mask		= CPU_MASK_NONE,
 };
 
 static unsigned int divisors[] = { 1, 4, 16, 64, 1, 1, 256 };
diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c
index 10b5a6f17cc..c5d3396f596 100644
--- a/arch/sh/kernel/timers/timer-tmu.c
+++ b/arch/sh/kernel/timers/timer-tmu.c
@@ -162,7 +162,6 @@ static struct irqaction tmu0_irq = {
 	.name		= "periodic/oneshot timer",
 	.handler	= tmu_timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
-	.mask		= CPU_MASK_NONE,
 };
 
 static void __init tmu_clk_init(struct clk *clk)
diff --git a/arch/sparc/kernel/irq_32.c b/arch/sparc/kernel/irq_32.c
index 44dd5ee6433..ad800b80c71 100644
--- a/arch/sparc/kernel/irq_32.c
+++ b/arch/sparc/kernel/irq_32.c
@@ -439,7 +439,6 @@ static int request_fast_irq(unsigned int irq,
 	flush_cache_all();
 
 	action->flags = irqflags;
-	cpus_clear(action->mask);
 	action->name = devname;
 	action->dev_id = NULL;
 	action->next = NULL;
@@ -574,7 +573,6 @@ int request_irq(unsigned int irq,
 
 	action->handler = handler;
 	action->flags = irqflags;
-	cpus_clear(action->mask);
 	action->name = devname;
 	action->next = NULL;
 	action->dev_id = dev_id;
diff --git a/arch/sparc/kernel/sun4d_irq.c b/arch/sparc/kernel/sun4d_irq.c
index 3369fef5b4b..ab036a72de5 100644
--- a/arch/sparc/kernel/sun4d_irq.c
+++ b/arch/sparc/kernel/sun4d_irq.c
@@ -326,7 +326,6 @@ int sun4d_request_irq(unsigned int irq,
 
 	action->handler = handler;
 	action->flags = irqflags;
-	cpus_clear(action->mask);
 	action->name = devname;
 	action->next = NULL;
 	action->dev_id = dev_id;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 50b8c3a3006..458c554c542 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -50,7 +50,6 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
  */
 static struct irqaction fpu_irq = {
 	.handler = math_error_irq,
-	.mask = CPU_MASK_NONE,
 	.name = "fpu",
 };
 
@@ -83,7 +82,6 @@ void __init init_ISA_irqs(void)
  */
 static struct irqaction irq2 = {
 	.handler = no_action,
-	.mask = CPU_MASK_NONE,
 	.name = "cascade",
 };
 
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f3..76abe43aa73 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -45,7 +45,6 @@
 
 static struct irqaction irq2 = {
 	.handler = no_action,
-	.mask = CPU_MASK_NONE,
 	.name = "cascade",
 };
 DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 8815f3c7fec..846510b78a0 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -348,7 +348,6 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id)
 static struct irqaction mfgptirq  = {
 	.handler = mfgpt_tick,
 	.flags = IRQF_DISABLED | IRQF_NOBALANCING,
-	.mask = CPU_MASK_NONE,
 	.name = "mfgpt-timer"
 };
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b746deb9ebc..900dad7fe38 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1027,7 +1027,6 @@ void __init x86_quirk_trap_init(void)
 static struct irqaction irq0  = {
 	.handler = timer_interrupt,
 	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
-	.mask = CPU_MASK_NONE,
 	.name = "timer"
 };
 
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index 241ec3923f6..5ba343e6184 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -116,7 +116,6 @@ unsigned long __init calibrate_cpu(void)
 static struct irqaction irq0 = {
 	.handler	= timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER,
-	.mask		= CPU_MASK_NONE,
 	.name		= "timer"
 };
 
@@ -125,7 +124,6 @@ void __init hpet_time_init(void)
 	if (!hpet_enable())
 		setup_pit_timer();
 
-	irq0.mask = cpumask_of_cpu(0);
 	setup_irq(0, &irq0);
 }
 
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 33a788d5879..d303369a7ba 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -202,7 +202,6 @@ static struct irqaction vmi_clock_action  = {
 	.name 		= "vmi-timer",
 	.handler 	= vmi_timer_interrupt,
 	.flags 		= IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
-	.mask 		= CPU_MASK_ALL,
 };
 
 static void __devinit vmi_time_init_clockevent(void)
-- 
cgit v1.2.3


From aa85ea5b89c36c51200d795dd788139bd9b8cf50 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:15 -0600
Subject: cpumask: use new cpumask_ functions in core code.

Impact: cleanup

Time to clean up remaining laggards using the old cpu_ functions.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Trond.Myklebust@netapp.com
---
 drivers/base/cpu.c     | 2 +-
 include/linux/cpuset.h | 4 ++--
 kernel/workqueue.c     | 6 +++---
 mm/allocpercpu.c       | 2 +-
 mm/vmstat.c            | 2 +-
 net/sunrpc/svc.c       | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 5b257a57bc5..e62a4ccea54 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -119,7 +119,7 @@ static ssize_t print_cpus_map(char *buf, const struct cpumask *map)
 #define	print_cpus_func(type) \
 static ssize_t print_cpus_##type(struct sysdev_class *class, char *buf)	\
 {									\
-	return print_cpus_map(buf, &cpu_##type##_map);			\
+	return print_cpus_map(buf, cpu_##type##_mask);			\
 }									\
 static struct sysdev_class_attribute attr_##type##_map = 		\
 	_SYSDEV_CLASS_ATTR(type, 0444, print_cpus_##type, NULL)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 90c6074a36c..2e0d79678de 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -90,12 +90,12 @@ static inline void cpuset_init_smp(void) {}
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
-	*mask = cpu_possible_map;
+	cpumask_copy(mask, cpu_possible_mask);
 }
 static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
 					      struct cpumask *mask)
 {
-	*mask = cpu_possible_map;
+	cpumask_copy(mask, cpu_possible_mask);
 }
 
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1f0c509b40d..9aedd9fd825 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -416,7 +416,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 	might_sleep();
 	lock_map_acquire(&wq->lockdep_map);
 	lock_map_release(&wq->lockdep_map);
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -547,7 +547,7 @@ static void wait_on_work(struct work_struct *work)
 	wq = cwq->wq;
 	cpu_map = wq_cpu_map(wq);
 
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
@@ -911,7 +911,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
 
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
  	cpu_maps_update_done();
 
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 1882923bc70..139d5b7b662 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -143,7 +143,7 @@ void free_percpu(void *__pdata)
 {
 	if (unlikely(!__pdata))
 		return;
-	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
+	__percpu_depopulate_mask(__pdata, cpu_possible_mask);
 	kfree(__percpu_disguise(__pdata));
 }
 EXPORT_SYMBOL_GPL(free_percpu);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 91149746bb8..8cd81ea1ddc 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -27,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
 
 	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
 
-	for_each_cpu_mask_nr(cpu, *cpumask) {
+	for_each_cpu(cpu, cpumask) {
 		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
 
 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c51fed4d1af..bb507e2bb94 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -312,7 +312,7 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
 	switch (m->mode) {
 	case SVC_POOL_PERCPU:
 	{
-		set_cpus_allowed_ptr(task, &cpumask_of_cpu(node));
+		set_cpus_allowed_ptr(task, cpumask_of(node));
 		break;
 	}
 	case SVC_POOL_PERNODE:
-- 
cgit v1.2.3


From 73d0a4b107d58908305f272bfae9bd17f74a2c81 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:16 -0600
Subject: cpumask: convert rcutorture.c

We're getting rid of cpumasks on the stack.

Simply change tmp_mask to a global, and allocate it in
rcu_torture_init().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@freedesktop.org>
---
 kernel/rcutorture.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7c4142a79f0..9b4a975a4b4 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -126,6 +126,7 @@ static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
 static long n_rcu_torture_timers = 0;
 static struct list_head rcu_torture_removed;
+static cpumask_var_t shuffle_tmp_mask;
 
 static int stutter_pause_test = 0;
 
@@ -889,10 +890,9 @@ static int rcu_idle_cpu;	/* Force all torture tasks off this CPU */
  */
 static void rcu_torture_shuffle_tasks(void)
 {
-	cpumask_t tmp_mask;
 	int i;
 
-	cpus_setall(tmp_mask);
+	cpumask_setall(shuffle_tmp_mask);
 	get_online_cpus();
 
 	/* No point in shuffling if there is only one online CPU (ex: UP) */
@@ -902,29 +902,29 @@ static void rcu_torture_shuffle_tasks(void)
 	}
 
 	if (rcu_idle_cpu != -1)
-		cpu_clear(rcu_idle_cpu, tmp_mask);
+		cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
 
-	set_cpus_allowed_ptr(current, &tmp_mask);
+	set_cpus_allowed_ptr(current, shuffle_tmp_mask);
 
 	if (reader_tasks) {
 		for (i = 0; i < nrealreaders; i++)
 			if (reader_tasks[i])
 				set_cpus_allowed_ptr(reader_tasks[i],
-						     &tmp_mask);
+						     shuffle_tmp_mask);
 	}
 
 	if (fakewriter_tasks) {
 		for (i = 0; i < nfakewriters; i++)
 			if (fakewriter_tasks[i])
 				set_cpus_allowed_ptr(fakewriter_tasks[i],
-						     &tmp_mask);
+						     shuffle_tmp_mask);
 	}
 
 	if (writer_task)
-		set_cpus_allowed_ptr(writer_task, &tmp_mask);
+		set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
 
 	if (stats_task)
-		set_cpus_allowed_ptr(stats_task, &tmp_mask);
+		set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
 
 	if (rcu_idle_cpu == -1)
 		rcu_idle_cpu = num_online_cpus() - 1;
@@ -1012,6 +1012,7 @@ rcu_torture_cleanup(void)
 	if (shuffler_task) {
 		VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
 		kthread_stop(shuffler_task);
+		free_cpumask_var(shuffle_tmp_mask);
 	}
 	shuffler_task = NULL;
 
@@ -1190,10 +1191,18 @@ rcu_torture_init(void)
 	}
 	if (test_no_idle_hz) {
 		rcu_idle_cpu = num_online_cpus() - 1;
+
+		if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
+			firsterr = -ENOMEM;
+			VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
+			goto unwind;
+		}
+
 		/* Create the shuffler thread */
 		shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
 					  "rcu_torture_shuffle");
 		if (IS_ERR(shuffler_task)) {
+			free_cpumask_var(shuffle_tmp_mask);
 			firsterr = PTR_ERR(shuffler_task);
 			VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
 			shuffler_task = NULL;
-- 
cgit v1.2.3


From 612a726faf8486fa48b34fa37115ce1e7421d383 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:16 -0600
Subject: cpumask: remove cpumask_t from core

Impact: cleanup

struct cpumask is nicer, and we use it to make where we've made code
safe for CONFIG_CPUMASK_OFFSTACK=y.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/sched_cpupri.h | 2 +-
 kernel/stop_machine.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 642a94ef8a0..9a7e859b8fb 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -25,7 +25,7 @@ struct cpupri {
 
 #ifdef CONFIG_SMP
 int  cpupri_find(struct cpupri *cp,
-		 struct task_struct *p, cpumask_t *lowest_mask);
+		 struct task_struct *p, struct cpumask *lowest_mask);
 void cpupri_set(struct cpupri *cp, int cpu, int pri);
 int cpupri_init(struct cpupri *cp, bool bootmem);
 void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 74541ca4953..912823e2a11 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -44,7 +44,7 @@ static DEFINE_MUTEX(setup_lock);
 static int refcount;
 static struct workqueue_struct *stop_machine_wq;
 static struct stop_machine_data active, idle;
-static const cpumask_t *active_cpus;
+static const struct cpumask *active_cpus;
 static void *stop_machine_work;
 
 static void set_state(enum stopmachine_state newstate)
-- 
cgit v1.2.3


From 97c12f85ac5e4ac2faee6cada014ac6205105b19 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:17 -0600
Subject: cpumask: remove the now-obsoleted pcibus_to_cpumask(): generic

Impact: reduce stack usage for large NR_CPUS

cpumask_of_pcibus() is the new version.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/asm-generic/topology.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h
index 47766b30061..88bada2ebc4 100644
--- a/include/asm-generic/topology.h
+++ b/include/asm-generic/topology.h
@@ -47,13 +47,6 @@
 #define pcibus_to_node(bus)	((void)(bus), -1)
 #endif
 
-#ifndef pcibus_to_cpumask
-#define pcibus_to_cpumask(bus)	(pcibus_to_node(bus) == -1 ? \
-					CPU_MASK_ALL : \
-					node_to_cpumask(pcibus_to_node(bus)) \
-				)
-#endif
-
 #ifndef cpumask_of_pcibus
 #define cpumask_of_pcibus(bus)	(pcibus_to_node(bus) == -1 ?		\
 				 cpu_all_mask :				\
-- 
cgit v1.2.3


From bb75efddeaca89f8a67fd82cdcbaaf436cf17ca9 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@arm.linux.org.uk>
Date: Sun, 29 Mar 2009 17:12:22 +0100
Subject: oprofile: Thou shalt not call __exit functions from __init functions

Impact: fix ref to discarded function

`buffer_sync_cleanup' referenced in section `.init.text' of arch/arm/oprofile/built-in.o: defined in discarded section `.exit.text' of arch/arm/oprofile/built-in.o

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/oprofile/buffer_sync.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index c3ea5fa7d05..2c9aa49e43c 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -574,7 +574,7 @@ int __init buffer_sync_init(void)
 		return 0;
 }
 
-void __exit buffer_sync_cleanup(void)
+void buffer_sync_cleanup(void)
 {
 	free_cpumask_var(marked_cpus);
 }
-- 
cgit v1.2.3


From eeafda70bf2807544e96fa4e52b2433cd470ff46 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 29 Mar 2009 12:30:05 -0700
Subject: PCI: fix HT MSI mapping fix

Impact: fix bug

This patch reworks the nv_msi_ht_cap_quirk() and will only try to avoid
to enable ht_msi on device following that root dev, and don't touch that
root dev, but only do that trick with end_device on the chain.

Reported-by: Prakash Punnoor <prakash@punnoor.de>
Tested-by: Prakash Punnoor <prakash@punnoor.de>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/quirks.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index faf02dd0001..9b2f0d96900 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -2206,6 +2206,33 @@ static int __devinit host_bridge_with_leaf(struct pci_dev *host_bridge)
 	return found;
 }
 
+#define PCI_HT_CAP_SLAVE_CTRL0     4    /* link control */
+#define PCI_HT_CAP_SLAVE_CTRL1     8    /* link control to */
+
+static int __devinit is_end_of_ht_chain(struct pci_dev *dev)
+{
+	int pos, ctrl_off;
+	int end = 0;
+	u16 flags, ctrl;
+
+	pos = pci_find_ht_capability(dev, HT_CAPTYPE_SLAVE);
+
+	if (!pos)
+		goto out;
+
+	pci_read_config_word(dev, pos + PCI_CAP_FLAGS, &flags);
+
+	ctrl_off = ((flags >> 10) & 1) ?
+			PCI_HT_CAP_SLAVE_CTRL0 : PCI_HT_CAP_SLAVE_CTRL1;
+	pci_read_config_word(dev, pos + ctrl_off, &ctrl);
+
+	if (ctrl & (1 << 6))
+		end = 1;
+
+out:
+	return end;
+}
+
 static void __devinit nv_ht_enable_msi_mapping(struct pci_dev *dev)
 {
 	struct pci_dev *host_bridge;
@@ -2230,8 +2257,9 @@ static void __devinit nv_ht_enable_msi_mapping(struct pci_dev *dev)
 	if (!found)
 		return;
 
-	/* don't enable host_bridge with leaf directly here */
-	if (host_bridge == dev && host_bridge_with_leaf(host_bridge))
+	/* don't enable end_device/host_bridge with leaf directly here */
+	if (host_bridge == dev && is_end_of_ht_chain(host_bridge) &&
+	    host_bridge_with_leaf(host_bridge))
 		goto out;
 
 	/* root did that ! */
-- 
cgit v1.2.3


From 9202add67454c6a6f9508f41e733edce705dc56e Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Mon, 30 Mar 2009 21:46:40 +0200
Subject: hwmon: (ds1621) Reorder code statements

Reorder the ds1621 driver code so that we can get rid of forward
function declarations.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Aurelien Jarno <aurelien@aurel32.net>
---
 drivers/hwmon/ds1621.c | 123 +++++++++++++++++++++++--------------------------
 1 file changed, 57 insertions(+), 66 deletions(-)

diff --git a/drivers/hwmon/ds1621.c b/drivers/hwmon/ds1621.c
index 7415381601c..e688798e115 100644
--- a/drivers/hwmon/ds1621.c
+++ b/drivers/hwmon/ds1621.c
@@ -81,34 +81,6 @@ struct ds1621_data {
 	u8 conf;			/* Register encoding, combined */
 };
 
-static int ds1621_probe(struct i2c_client *client,
-			const struct i2c_device_id *id);
-static int ds1621_detect(struct i2c_client *client, int kind,
-			 struct i2c_board_info *info);
-static void ds1621_init_client(struct i2c_client *client);
-static int ds1621_remove(struct i2c_client *client);
-static struct ds1621_data *ds1621_update_client(struct device *dev);
-
-static const struct i2c_device_id ds1621_id[] = {
-	{ "ds1621", ds1621 },
-	{ "ds1625", ds1621 },
-	{ }
-};
-MODULE_DEVICE_TABLE(i2c, ds1621_id);
-
-/* This is the driver that will be inserted */
-static struct i2c_driver ds1621_driver = {
-	.class		= I2C_CLASS_HWMON,
-	.driver = {
-		.name	= "ds1621",
-	},
-	.probe		= ds1621_probe,
-	.remove		= ds1621_remove,
-	.id_table	= ds1621_id,
-	.detect		= ds1621_detect,
-	.address_data	= &addr_data,
-};
-
 /* All registers are word-sized, except for the configuration register.
    DS1621 uses a high-byte first convention, which is exactly opposite to
    the SMBus standard. */
@@ -146,6 +118,45 @@ static void ds1621_init_client(struct i2c_client *client)
 	i2c_smbus_write_byte(client, DS1621_COM_START);
 }
 
+static struct ds1621_data *ds1621_update_client(struct device *dev)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct ds1621_data *data = i2c_get_clientdata(client);
+	u8 new_conf;
+
+	mutex_lock(&data->update_lock);
+
+	if (time_after(jiffies, data->last_updated + HZ + HZ / 2)
+	    || !data->valid) {
+		int i;
+
+		dev_dbg(&client->dev, "Starting ds1621 update\n");
+
+		data->conf = ds1621_read_value(client, DS1621_REG_CONF);
+
+		for (i = 0; i < ARRAY_SIZE(data->temp); i++)
+			data->temp[i] = ds1621_read_value(client,
+							  DS1621_REG_TEMP[i]);
+
+		/* reset alarms if necessary */
+		new_conf = data->conf;
+		if (data->temp[0] > data->temp[1])	/* input > min */
+			new_conf &= ~DS1621_ALARM_TEMP_LOW;
+		if (data->temp[0] < data->temp[2])	/* input < max */
+			new_conf &= ~DS1621_ALARM_TEMP_HIGH;
+		if (data->conf != new_conf)
+			ds1621_write_value(client, DS1621_REG_CONF,
+					   new_conf);
+
+		data->last_updated = jiffies;
+		data->valid = 1;
+	}
+
+	mutex_unlock(&data->update_lock);
+
+	return data;
+}
+
 static ssize_t show_temp(struct device *dev, struct device_attribute *da,
 			 char *buf)
 {
@@ -294,45 +305,25 @@ static int ds1621_remove(struct i2c_client *client)
 	return 0;
 }
 
+static const struct i2c_device_id ds1621_id[] = {
+	{ "ds1621", ds1621 },
+	{ "ds1625", ds1621 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, ds1621_id);
 
-static struct ds1621_data *ds1621_update_client(struct device *dev)
-{
-	struct i2c_client *client = to_i2c_client(dev);
-	struct ds1621_data *data = i2c_get_clientdata(client);
-	u8 new_conf;
-
-	mutex_lock(&data->update_lock);
-
-	if (time_after(jiffies, data->last_updated + HZ + HZ / 2)
-	    || !data->valid) {
-		int i;
-
-		dev_dbg(&client->dev, "Starting ds1621 update\n");
-
-		data->conf = ds1621_read_value(client, DS1621_REG_CONF);
-
-		for (i = 0; i < ARRAY_SIZE(data->temp); i++)
-			data->temp[i] = ds1621_read_value(client,
-							  DS1621_REG_TEMP[i]);
-
-		/* reset alarms if necessary */
-		new_conf = data->conf;
-		if (data->temp[0] > data->temp[1])	/* input > min */
-			new_conf &= ~DS1621_ALARM_TEMP_LOW;
-		if (data->temp[0] < data->temp[2])	/* input < max */
-			new_conf &= ~DS1621_ALARM_TEMP_HIGH;
-		if (data->conf != new_conf)
-			ds1621_write_value(client, DS1621_REG_CONF,
-					   new_conf);
-
-		data->last_updated = jiffies;
-		data->valid = 1;
-	}
-
-	mutex_unlock(&data->update_lock);
-
-	return data;
-}
+/* This is the driver that will be inserted */
+static struct i2c_driver ds1621_driver = {
+	.class		= I2C_CLASS_HWMON,
+	.driver = {
+		.name	= "ds1621",
+	},
+	.probe		= ds1621_probe,
+	.remove		= ds1621_remove,
+	.id_table	= ds1621_id,
+	.detect		= ds1621_detect,
+	.address_data	= &addr_data,
+};
 
 static int __init ds1621_init(void)
 {
-- 
cgit v1.2.3


From 594592dc6f68356a3b7278eb4d260a50a66f0a06 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Mon, 30 Mar 2009 21:46:40 +0200
Subject: hwmon: (ds1621) Clean up register access

Fix a few oddities in how the ds1621 driver accesses the registers:
* We don't need a wrapper to access the configuration register.
* Check for error before calling swab16. Error checking isn't
  complete yet, but that's a start.
* Device-specific read functions should never be called during
  detection, as by definition we don't know what device we are talking
  to at that point.
* Likewise, don't assume that register reads succeed during detection.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Aurelien Jarno <aurelien@aurel32.net>
---
 drivers/hwmon/ds1621.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/drivers/hwmon/ds1621.c b/drivers/hwmon/ds1621.c
index e688798e115..fe160c54b95 100644
--- a/drivers/hwmon/ds1621.c
+++ b/drivers/hwmon/ds1621.c
@@ -81,28 +81,27 @@ struct ds1621_data {
 	u8 conf;			/* Register encoding, combined */
 };
 
-/* All registers are word-sized, except for the configuration register.
+/* Temperature registers are word-sized.
    DS1621 uses a high-byte first convention, which is exactly opposite to
    the SMBus standard. */
-static int ds1621_read_value(struct i2c_client *client, u8 reg)
+static int ds1621_read_temp(struct i2c_client *client, u8 reg)
 {
-	if (reg == DS1621_REG_CONF)
-		return i2c_smbus_read_byte_data(client, reg);
-	else
-		return swab16(i2c_smbus_read_word_data(client, reg));
+	int ret;
+
+	ret = i2c_smbus_read_word_data(client, reg);
+	if (ret < 0)
+		return ret;
+	return swab16(ret);
 }
 
-static int ds1621_write_value(struct i2c_client *client, u8 reg, u16 value)
+static int ds1621_write_temp(struct i2c_client *client, u8 reg, u16 value)
 {
-	if (reg == DS1621_REG_CONF)
-		return i2c_smbus_write_byte_data(client, reg, value);
-	else
-		return i2c_smbus_write_word_data(client, reg, swab16(value));
+	return i2c_smbus_write_word_data(client, reg, swab16(value));
 }
 
 static void ds1621_init_client(struct i2c_client *client)
 {
-	int reg = ds1621_read_value(client, DS1621_REG_CONF);
+	int reg = i2c_smbus_read_byte_data(client, DS1621_REG_CONF);
 	/* switch to continuous conversion mode */
 	reg &= ~ DS1621_REG_CONFIG_1SHOT;
 
@@ -112,7 +111,7 @@ static void ds1621_init_client(struct i2c_client *client)
 	else if (polarity == 1)
 		reg |= DS1621_REG_CONFIG_POLARITY;
 	
-	ds1621_write_value(client, DS1621_REG_CONF, reg);
+	i2c_smbus_write_byte_data(client, DS1621_REG_CONF, reg);
 	
 	/* start conversion */
 	i2c_smbus_write_byte(client, DS1621_COM_START);
@@ -132,11 +131,11 @@ static struct ds1621_data *ds1621_update_client(struct device *dev)
 
 		dev_dbg(&client->dev, "Starting ds1621 update\n");
 
-		data->conf = ds1621_read_value(client, DS1621_REG_CONF);
+		data->conf = i2c_smbus_read_byte_data(client, DS1621_REG_CONF);
 
 		for (i = 0; i < ARRAY_SIZE(data->temp); i++)
-			data->temp[i] = ds1621_read_value(client,
-							  DS1621_REG_TEMP[i]);
+			data->temp[i] = ds1621_read_temp(client,
+							 DS1621_REG_TEMP[i]);
 
 		/* reset alarms if necessary */
 		new_conf = data->conf;
@@ -145,8 +144,8 @@ static struct ds1621_data *ds1621_update_client(struct device *dev)
 		if (data->temp[0] < data->temp[2])	/* input < max */
 			new_conf &= ~DS1621_ALARM_TEMP_HIGH;
 		if (data->conf != new_conf)
-			ds1621_write_value(client, DS1621_REG_CONF,
-					   new_conf);
+			i2c_smbus_write_byte_data(client, DS1621_REG_CONF,
+						  new_conf);
 
 		data->last_updated = jiffies;
 		data->valid = 1;
@@ -176,8 +175,8 @@ static ssize_t set_temp(struct device *dev, struct device_attribute *da,
 
 	mutex_lock(&data->update_lock);
 	data->temp[attr->index] = val;
-	ds1621_write_value(client, DS1621_REG_TEMP[attr->index],
-			   data->temp[attr->index]);
+	ds1621_write_temp(client, DS1621_REG_TEMP[attr->index],
+			  data->temp[attr->index]);
 	mutex_unlock(&data->update_lock);
 	return count;
 }
@@ -239,13 +238,14 @@ static int ds1621_detect(struct i2c_client *client, int kind,
 		/* The NVB bit should be low if no EEPROM write has been 
 		   requested during the latest 10ms, which is highly 
 		   improbable in our case. */
-		conf = ds1621_read_value(client, DS1621_REG_CONF);
-		if (conf & DS1621_REG_CONFIG_NVB)
+		conf = i2c_smbus_read_byte_data(client, DS1621_REG_CONF);
+		if (conf < 0 || conf & DS1621_REG_CONFIG_NVB)
 			return -ENODEV;
 		/* The 7 lowest bits of a temperature should always be 0. */
 		for (i = 0; i < ARRAY_SIZE(DS1621_REG_TEMP); i++) {
-			temp = ds1621_read_value(client, DS1621_REG_TEMP[i]);
-			if (temp & 0x007f)
+			temp = i2c_smbus_read_word_data(client,
+							DS1621_REG_TEMP[i]);
+			if (temp < 0 || (temp & 0x7f00))
 				return -ENODEV;
 		}
 	}
-- 
cgit v1.2.3


From e4879e28abd67b894fb9d2db0afd08f1945670ba Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Mon, 30 Mar 2009 21:46:40 +0200
Subject: hwmon: (ds1621) Avoid unneeded register access

Register access over SMBus isn't cheap, so avoid register access where
possible:
* Only write back the configuration register if it changed.
* Don't refresh the register cache when we don't have to.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Aurelien Jarno <aurelien@aurel32.net>
---
 drivers/hwmon/ds1621.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/hwmon/ds1621.c b/drivers/hwmon/ds1621.c
index fe160c54b95..53f88f51181 100644
--- a/drivers/hwmon/ds1621.c
+++ b/drivers/hwmon/ds1621.c
@@ -101,17 +101,20 @@ static int ds1621_write_temp(struct i2c_client *client, u8 reg, u16 value)
 
 static void ds1621_init_client(struct i2c_client *client)
 {
-	int reg = i2c_smbus_read_byte_data(client, DS1621_REG_CONF);
+	u8 conf, new_conf;
+
+	new_conf = conf = i2c_smbus_read_byte_data(client, DS1621_REG_CONF);
 	/* switch to continuous conversion mode */
-	reg &= ~ DS1621_REG_CONFIG_1SHOT;
+	new_conf &= ~DS1621_REG_CONFIG_1SHOT;
 
 	/* setup output polarity */
 	if (polarity == 0)
-		reg &= ~DS1621_REG_CONFIG_POLARITY;
+		new_conf &= ~DS1621_REG_CONFIG_POLARITY;
 	else if (polarity == 1)
-		reg |= DS1621_REG_CONFIG_POLARITY;
+		new_conf |= DS1621_REG_CONFIG_POLARITY;
 	
-	i2c_smbus_write_byte_data(client, DS1621_REG_CONF, reg);
+	if (conf != new_conf)
+		i2c_smbus_write_byte_data(client, DS1621_REG_CONF, new_conf);
 	
 	/* start conversion */
 	i2c_smbus_write_byte(client, DS1621_COM_START);
@@ -170,7 +173,7 @@ static ssize_t set_temp(struct device *dev, struct device_attribute *da,
 {
 	struct sensor_device_attribute *attr = to_sensor_dev_attr(da);
 	struct i2c_client *client = to_i2c_client(dev);
-	struct ds1621_data *data = ds1621_update_client(dev);
+	struct ds1621_data *data = i2c_get_clientdata(client);
 	u16 val = LM75_TEMP_TO_REG(simple_strtol(buf, NULL, 10));
 
 	mutex_lock(&data->update_lock);
-- 
cgit v1.2.3


From 25f3311acc3405dd0dace3571a41f450e6cc6a65 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Mon, 30 Mar 2009 21:46:41 +0200
Subject: hwmon: (ds1621) Clean up documentation

* The alarms sysfs file is deprecated, and individual alarm files are
  self-explanatory.
* The driver doesn't implement high-reslution temperature readings so
  don't document that.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Aurelien Jarno <aurelien@aurel32.net>
---
 Documentation/hwmon/ds1621 | 51 +++-------------------------------------------
 1 file changed, 3 insertions(+), 48 deletions(-)

diff --git a/Documentation/hwmon/ds1621 b/Documentation/hwmon/ds1621
index 1fee6f1e6bc..5e97f333c4d 100644
--- a/Documentation/hwmon/ds1621
+++ b/Documentation/hwmon/ds1621
@@ -49,12 +49,9 @@ of up to +/- 0.5 degrees even when compared against precise temperature
 readings. Be sure to have a high vs. low temperature limit gap of al least
 1.0 degree Celsius to avoid Tout "bouncing", though!
 
-As for alarms, you can read the alarm status of the DS1621 via the 'alarms'
-/sys file interface. The result consists mainly of bit 6 and 5 of the
-configuration register of the chip; bit 6 (0x40 or 64) is the high alarm
-bit and bit 5 (0x20 or 32) the low one. These bits are set when the high or
-low limits are met or exceeded and are reset by the module as soon as the
-respective temperature ranges are left.
+The alarm bits are set when the high or low limits are met or exceeded and
+are reset by the module as soon as the respective temperature ranges are
+left.
 
 The alarm registers are in no way suitable to find out about the actual
 status of Tout. They will only tell you about its history, whether or not
@@ -64,45 +61,3 @@ with neither of the alarms set.
 
 Temperature conversion of the DS1621 takes up to 1000ms; internal access to
 non-volatile registers may last for 10ms or below.
-
-High Accuracy Temperature Reading
----------------------------------
-
-As said before, the temperature issued via the 9-bit i2c-bus data is
-somewhat arbitrary. Internally, the temperature conversion is of a
-different kind that is explained (not so...) well in the DS1621 data sheet.
-To cut the long story short: Inside the DS1621 there are two oscillators,
-both of them biassed by a temperature coefficient.
-
-Higher resolution of the temperature reading can be achieved using the
-internal projection, which means taking account of REG_COUNT and REG_SLOPE
-(the driver manages them):
-
-Taken from Dallas Semiconductors App Note 068: 'Increasing Temperature
-Resolution on the DS1620' and App Note 105: 'High Resolution Temperature
-Measurement with Dallas Direct-to-Digital Temperature Sensors'
-
-- Read the 9-bit temperature and strip the LSB (Truncate the .5 degs)
-- The resulting value is TEMP_READ.
-- Then, read REG_COUNT.
-- And then, REG_SLOPE.
-
-      TEMP = TEMP_READ - 0.25 + ((REG_SLOPE - REG_COUNT) / REG_SLOPE)
-
-Note that this is what the DONE bit in the DS1621 configuration register is
-good for: Internally, one temperature conversion takes up to 1000ms. Before
-that conversion is complete you will not be able to read valid things out
-of REG_COUNT and REG_SLOPE. The DONE bit, as you may have guessed by now,
-tells you whether the conversion is complete ("done", in plain English) and
-thus, whether the values you read are good or not.
-
-The DS1621 has two modes of operation: "Continuous" conversion, which can
-be understood as the default stand-alone mode where the chip gets the
-temperature and controls external devices via its Tout pin or tells other
-i2c's about it if they care. The other mode is called "1SHOT", that means
-that it only figures out about the temperature when it is explicitly told
-to do so; this can be seen as power saving mode.
-
-Now if you want to read REG_COUNT and REG_SLOPE, you have to either stop
-the continuous conversions until the contents of these registers are valid,
-or, in 1SHOT mode, you have to have one conversion made.
-- 
cgit v1.2.3


From 2b8cf3e8c0638687a7a28a7517e673f855623e3b Mon Sep 17 00:00:00 2001
From: Frank Seidel <frank@f-seidel.de>
Date: Mon, 30 Mar 2009 21:46:41 +0200
Subject: hwmon: (hdaps) Allow inversion of separate axis

Fix for kernel.org bug #7154: hdaps inversion of each axis. This
version is based on the work from Michael Ruoss <miruoss@student.ethz.ch>.

Signed-off-by: Frank Seidel <frank@f-seidel.de>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/hwmon/hdaps.c | 64 ++++++++++++++++++++++++++-------------------------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/drivers/hwmon/hdaps.c b/drivers/hwmon/hdaps.c
index a4d92d246d5..dd058e06e34 100644
--- a/drivers/hwmon/hdaps.c
+++ b/drivers/hwmon/hdaps.c
@@ -65,6 +65,10 @@
 #define HDAPS_INPUT_FUZZ	4	/* input event threshold */
 #define HDAPS_INPUT_FLAT	4
 
+#define HDAPS_X_AXIS		(1 << 0)
+#define HDAPS_Y_AXIS		(1 << 1)
+#define HDAPS_BOTH_AXES		(HDAPS_X_AXIS | HDAPS_Y_AXIS)
+
 static struct platform_device *pdev;
 static struct input_polled_dev *hdaps_idev;
 static unsigned int hdaps_invert;
@@ -182,11 +186,11 @@ static int __hdaps_read_pair(unsigned int port1, unsigned int port2,
 	km_activity = inb(HDAPS_PORT_KMACT);
 	__device_complete();
 
-	/* if hdaps_invert is set, negate the two values */
-	if (hdaps_invert) {
+	/* hdaps_invert is a bitvector to negate the axes */
+	if (hdaps_invert & HDAPS_X_AXIS)
 		*x = -*x;
+	if (hdaps_invert & HDAPS_Y_AXIS)
 		*y = -*y;
-	}
 
 	return 0;
 }
@@ -436,7 +440,8 @@ static ssize_t hdaps_invert_store(struct device *dev,
 {
 	int invert;
 
-	if (sscanf(buf, "%d", &invert) != 1 || (invert != 1 && invert != 0))
+	if (sscanf(buf, "%d", &invert) != 1 ||
+	    invert < 0 || invert > HDAPS_BOTH_AXES)
 		return -EINVAL;
 
 	hdaps_invert = invert;
@@ -483,56 +488,52 @@ static int __init hdaps_dmi_match(const struct dmi_system_id *id)
 /* hdaps_dmi_match_invert - found an inverted match. */
 static int __init hdaps_dmi_match_invert(const struct dmi_system_id *id)
 {
-	hdaps_invert = 1;
-	printk(KERN_INFO "hdaps: inverting axis readings.\n");
+	hdaps_invert = (unsigned long)id->driver_data;
+	printk(KERN_INFO "hdaps: inverting axis (%u) readings.\n",
+	       hdaps_invert);
 	return hdaps_dmi_match(id);
 }
 
-#define HDAPS_DMI_MATCH_NORMAL(vendor, model) {		\
-	.ident = vendor " " model,			\
-	.callback = hdaps_dmi_match,			\
-	.matches = {					\
-		DMI_MATCH(DMI_BOARD_VENDOR, vendor),	\
-		DMI_MATCH(DMI_PRODUCT_VERSION, model)	\
-	}						\
-}
-
-#define HDAPS_DMI_MATCH_INVERT(vendor, model) {		\
+#define HDAPS_DMI_MATCH_INVERT(vendor, model, axes) {	\
 	.ident = vendor " " model,			\
 	.callback = hdaps_dmi_match_invert,		\
+	.driver_data = (void *)axes,			\
 	.matches = {					\
 		DMI_MATCH(DMI_BOARD_VENDOR, vendor),	\
 		DMI_MATCH(DMI_PRODUCT_VERSION, model)	\
 	}						\
 }
 
+#define HDAPS_DMI_MATCH_NORMAL(vendor, model)		\
+	HDAPS_DMI_MATCH_INVERT(vendor, model, 0)
+
 /* Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match
    "ThinkPad T42p", so the order of the entries matters.
    If your ThinkPad is not recognized, please update to latest
    BIOS. This is especially the case for some R52 ThinkPads. */
 static struct dmi_system_id __initdata hdaps_whitelist[] = {
-	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p"),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_BOTH_AXES),
 	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R50"),
 	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R51"),
 	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R52"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61i"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61"),
-	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p"),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61i", HDAPS_BOTH_AXES),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_BOTH_AXES),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_BOTH_AXES),
 	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T41"),
-	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p"),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_BOTH_AXES),
 	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T42"),
 	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T43"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61p"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61"),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_BOTH_AXES),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61p", HDAPS_BOTH_AXES),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_BOTH_AXES),
 	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad X40"),
 	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad X41"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61s"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61"),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_BOTH_AXES),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61s", HDAPS_BOTH_AXES),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_BOTH_AXES),
 	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad Z60m"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61m"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61p"),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61m", HDAPS_BOTH_AXES),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61p", HDAPS_BOTH_AXES),
 	{ .ident = NULL }
 };
 
@@ -627,8 +628,9 @@ static void __exit hdaps_exit(void)
 module_init(hdaps_init);
 module_exit(hdaps_exit);
 
-module_param_named(invert, hdaps_invert, bool, 0);
-MODULE_PARM_DESC(invert, "invert data along each axis");
+module_param_named(invert, hdaps_invert, int, 0);
+MODULE_PARM_DESC(invert, "invert data along each axis. 1 invert x-axis, "
+		 "2 invert y-axis, 3 invert both axes.");
 
 MODULE_AUTHOR("Robert Love");
 MODULE_DESCRIPTION("IBM Hard Drive Active Protection System (HDAPS) driver");
-- 
cgit v1.2.3


From b6a33fe2cc1b44851174967943fe5989f7e0550f Mon Sep 17 00:00:00 2001
From: Frank Seidel <frank@f-seidel.de>
Date: Mon, 30 Mar 2009 21:46:42 +0200
Subject: hwmon: (hdaps) Fix Thinkpad X41 axis inversion

Fix for kernel.org bug #7154: hdaps inversion of actual Thinkpad
X41's Y-axis.

Signed-off-by: Frank Seidel <frank@f-seidel.de>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/hwmon/hdaps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hwmon/hdaps.c b/drivers/hwmon/hdaps.c
index dd058e06e34..d3612a1f198 100644
--- a/drivers/hwmon/hdaps.c
+++ b/drivers/hwmon/hdaps.c
@@ -527,7 +527,7 @@ static struct dmi_system_id __initdata hdaps_whitelist[] = {
 	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61p", HDAPS_BOTH_AXES),
 	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_BOTH_AXES),
 	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad X40"),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad X41"),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_Y_AXIS),
 	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_BOTH_AXES),
 	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61s", HDAPS_BOTH_AXES),
 	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_BOTH_AXES),
-- 
cgit v1.2.3


From 1704b26ee3fd89c76724cbea238e951dc019faca Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Mon, 30 Mar 2009 21:46:42 +0200
Subject: hwmon: (w83627ehf) Invert fan pin variables logic

Use positive logic for fan pin variables (variable is set if pin is
used for fan), instead of negative logic which is error prone.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Gong Jun <JGong@nuvoton.com>
---
 drivers/hwmon/w83627ehf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/hwmon/w83627ehf.c b/drivers/hwmon/w83627ehf.c
index feae743ba99..18432e34dc7 100644
--- a/drivers/hwmon/w83627ehf.c
+++ b/drivers/hwmon/w83627ehf.c
@@ -1317,8 +1317,8 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev)
 
 	/* fan4 and fan5 share some pins with the GPIO and serial flash */
 
-	fan5pin = superio_inb(sio_data->sioreg, 0x24) & 0x2;
-	fan4pin = superio_inb(sio_data->sioreg, 0x29) & 0x6;
+	fan5pin = !(superio_inb(sio_data->sioreg, 0x24) & 0x2);
+	fan4pin = !(superio_inb(sio_data->sioreg, 0x29) & 0x6);
 	superio_exit(sio_data->sioreg);
 
 	/* It looks like fan4 and fan5 pins can be alternatively used
@@ -1329,9 +1329,9 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev)
 
 	data->has_fan = 0x07; /* fan1, fan2 and fan3 */
 	i = w83627ehf_read_value(data, W83627EHF_REG_FANDIV1);
-	if ((i & (1 << 2)) && (!fan4pin))
+	if ((i & (1 << 2)) && fan4pin)
 		data->has_fan |= (1 << 3);
-	if (!(i & (1 << 1)) && (!fan5pin))
+	if (!(i & (1 << 1)) && fan5pin)
 		data->has_fan |= (1 << 4);
 
 	/* Read fan clock dividers immediately */
-- 
cgit v1.2.3


From 237c8d2f54ff12bd4fea1a9d18a94ae5810271d3 Mon Sep 17 00:00:00 2001
From: Gong Jun <JGong@nuvoton.com>
Date: Mon, 30 Mar 2009 21:46:42 +0200
Subject: hwmon: (w83627ehf) Add support for W83667HG

Add initial support for the Nuvoton W83667HG chip to the w83627ehf
driver. It has been tested on ASUS P5QL PRO by Gong Jun.

At the moment there is still a usability issue which is that only in6
or temp3 can be present on the W83667HG, so the driver shouldn't
expose both. This will be addressed later.

Signed-off-by: Gong Jun <JGong@nuvoton.com>
Acked-by: David Hubbard <david.c.hubbard@gmail.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 Documentation/hwmon/w83627ehf |  29 ++++++++----
 drivers/hwmon/Kconfig         |   4 +-
 drivers/hwmon/w83627ehf.c     | 106 +++++++++++++++++++++++++++---------------
 3 files changed, 92 insertions(+), 47 deletions(-)

diff --git a/Documentation/hwmon/w83627ehf b/Documentation/hwmon/w83627ehf
index d6e1ae30fa6..b6eb59384bb 100644
--- a/Documentation/hwmon/w83627ehf
+++ b/Documentation/hwmon/w83627ehf
@@ -2,30 +2,40 @@ Kernel driver w83627ehf
 =======================
 
 Supported chips:
-  * Winbond W83627EHF/EHG/DHG (ISA access ONLY)
+  * Winbond W83627EHF/EHG (ISA access ONLY)
     Prefix: 'w83627ehf'
     Addresses scanned: ISA address retrieved from Super I/O registers
     Datasheet:
-        http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/W83627EHF_%20W83627EHGb.pdf
-        DHG datasheet confidential.
+        http://www.nuvoton.com.tw/NR/rdonlyres/A6A258F0-F0C9-4F97-81C0-C4D29E7E943E/0/W83627EHF.pdf
+  * Winbond W83627DHG
+    Prefix: 'w83627dhg'
+    Addresses scanned: ISA address retrieved from Super I/O registers
+    Datasheet:
+        http://www.nuvoton.com.tw/NR/rdonlyres/7885623D-A487-4CF9-A47F-30C5F73D6FE6/0/W83627DHG.pdf
+  * Winbond W83667HG
+    Prefix: 'w83667hg'
+    Addresses scanned: ISA address retrieved from Super I/O registers
+    Datasheet: not available
 
 Authors:
         Jean Delvare <khali@linux-fr.org>
         Yuan Mu (Winbond)
         Rudolf Marek <r.marek@assembler.cz>
         David Hubbard <david.c.hubbard@gmail.com>
+        Gong Jun <JGong@nuvoton.com>
 
 Description
 -----------
 
-This driver implements support for the Winbond W83627EHF, W83627EHG, and
-W83627DHG super I/O chips. We will refer to them collectively as Winbond chips.
+This driver implements support for the Winbond W83627EHF, W83627EHG,
+W83627DHG and W83667HG super I/O chips. We will refer to them collectively
+as Winbond chips.
 
 The chips implement three temperature sensors, five fan rotation
 speed sensors, ten analog voltage sensors (only nine for the 627DHG), one
-VID (6 pins for the 627EHF/EHG, 8 pins for the 627DHG), alarms with beep
-warnings (control unimplemented), and some automatic fan regulation
-strategies (plus manual fan control mode).
+VID (6 pins for the 627EHF/EHG, 8 pins for the 627DHG and 667HG), alarms
+with beep warnings (control unimplemented), and some automatic fan
+regulation strategies (plus manual fan control mode).
 
 Temperatures are measured in degrees Celsius and measurement resolution is 1
 degC for temp1 and 0.5 degC for temp2 and temp3. An alarm is triggered when
@@ -54,7 +64,8 @@ follows:
 temp1 -> pwm1
 temp2 -> pwm2
 temp3 -> pwm3
-prog  -> pwm4 (the programmable setting is not supported by the driver)
+prog  -> pwm4 (not on 667HG; the programmable setting is not supported by
+	       the driver)
 
 /sys files
 ----------
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index b4eea0292c1..0eb4170cc4a 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -827,7 +827,7 @@ config SENSORS_W83627HF
 	  will be called w83627hf.
 
 config SENSORS_W83627EHF
-	tristate "Winbond W83627EHF/DHG"
+	tristate "Winbond W83627EHF/EHG/DHG, W83667HG"
 	select HWMON_VID
 	help
 	  If you say yes here you get support for the hardware
@@ -838,6 +838,8 @@ config SENSORS_W83627EHF
 	  chip suited for specific Intel processors that use PECI such as
 	  the Core 2 Duo.
 
+	  This driver also supports the W83667HG chip.
+
 	  This driver can also be built as a module.  If so, the module
 	  will be called w83627ehf.
 
diff --git a/drivers/hwmon/w83627ehf.c b/drivers/hwmon/w83627ehf.c
index 18432e34dc7..20a9332959b 100644
--- a/drivers/hwmon/w83627ehf.c
+++ b/drivers/hwmon/w83627ehf.c
@@ -36,6 +36,7 @@
     w83627ehf   10      5       4       3      0x8850 0x88    0x5ca3
                                                0x8860 0xa1
     w83627dhg    9      5       4       3      0xa020 0xc1    0x5ca3
+    w83667hg     9      5       3       3      0xa510 0xc1    0x5ca3
 */
 
 #include <linux/module.h>
@@ -52,12 +53,13 @@
 #include <asm/io.h>
 #include "lm75.h"
 
-enum kinds { w83627ehf, w83627dhg };
+enum kinds { w83627ehf, w83627dhg, w83667hg };
 
 /* used to set data->name = w83627ehf_device_names[data->sio_kind] */
 static const char * w83627ehf_device_names[] = {
 	"w83627ehf",
 	"w83627dhg",
+	"w83667hg",
 };
 
 static unsigned short force_id;
@@ -71,6 +73,7 @@ MODULE_PARM_DESC(force_id, "Override the detected device ID");
  */
 
 #define W83627EHF_LD_HWM	0x0b
+#define W83667HG_LD_VID 	0x0d
 
 #define SIO_REG_LDSEL		0x07	/* Logical device select */
 #define SIO_REG_DEVID		0x20	/* Device ID (2 bytes) */
@@ -83,6 +86,7 @@ MODULE_PARM_DESC(force_id, "Override the detected device ID");
 #define SIO_W83627EHF_ID	0x8850
 #define SIO_W83627EHG_ID	0x8860
 #define SIO_W83627DHG_ID	0xa020
+#define SIO_W83667HG_ID 	0xa510
 #define SIO_ID_MASK		0xFFF0
 
 static inline void
@@ -289,6 +293,7 @@ struct w83627ehf_data {
 	u8 pwm_mode[4]; /* 0->DC variable voltage, 1->PWM variable duty cycle */
 	u8 pwm_enable[4]; /* 1->manual
 			     2->thermal cruise (also called SmartFan I) */
+	u8 pwm_num;		/* number of pwm */
 	u8 pwm[4];
 	u8 target_temp[4];
 	u8 tolerance[4];
@@ -1192,7 +1197,7 @@ static void w83627ehf_device_remove_files(struct device *dev)
 		device_remove_file(dev, &sda_fan_div[i].dev_attr);
 		device_remove_file(dev, &sda_fan_min[i].dev_attr);
 	}
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < data->pwm_num; i++) {
 		device_remove_file(dev, &sda_pwm[i].dev_attr);
 		device_remove_file(dev, &sda_pwm_mode[i].dev_attr);
 		device_remove_file(dev, &sda_pwm_enable[i].dev_attr);
@@ -1272,8 +1277,10 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev)
 	data->name = w83627ehf_device_names[sio_data->kind];
 	platform_set_drvdata(pdev, data);
 
-	/* 627EHG and 627EHF have 10 voltage inputs; DHG has 9 */
-	data->in_num = (sio_data->kind == w83627dhg) ? 9 : 10;
+	/* 627EHG and 627EHF have 10 voltage inputs; 627DHG and 667HG have 9 */
+	data->in_num = (sio_data->kind == w83627ehf) ? 10 : 9;
+	/* 667HG has 3 pwms */
+	data->pwm_num = (sio_data->kind == w83667hg) ? 3 : 4;
 
 	/* Initialize the chip */
 	w83627ehf_init_device(data);
@@ -1281,44 +1288,64 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev)
 	data->vrm = vid_which_vrm();
 	superio_enter(sio_data->sioreg);
 	/* Read VID value */
-	superio_select(sio_data->sioreg, W83627EHF_LD_HWM);
-	if (superio_inb(sio_data->sioreg, SIO_REG_VID_CTRL) & 0x80) {
-		/* Set VID input sensibility if needed. In theory the BIOS
-		   should have set it, but in practice it's not always the
-		   case. We only do it for the W83627EHF/EHG because the
-		   W83627DHG is more complex in this respect. */
-		if (sio_data->kind == w83627ehf) {
-			en_vrm10 = superio_inb(sio_data->sioreg,
-					       SIO_REG_EN_VRM10);
-			if ((en_vrm10 & 0x08) && data->vrm == 90) {
-				dev_warn(dev, "Setting VID input voltage to "
-					 "TTL\n");
-				superio_outb(sio_data->sioreg, SIO_REG_EN_VRM10,
-					     en_vrm10 & ~0x08);
-			} else if (!(en_vrm10 & 0x08) && data->vrm == 100) {
-				dev_warn(dev, "Setting VID input voltage to "
-					 "VRM10\n");
-				superio_outb(sio_data->sioreg, SIO_REG_EN_VRM10,
-					     en_vrm10 | 0x08);
-			}
-		}
-
-		data->vid = superio_inb(sio_data->sioreg, SIO_REG_VID_DATA);
-		if (sio_data->kind == w83627ehf) /* 6 VID pins only */
-			data->vid &= 0x3f;
-
+	if (sio_data->kind == w83667hg) {
+		/* W83667HG has different pins for VID input and output, so
+		we can get the VID input values directly at logical device D
+		0xe3. */
+		superio_select(sio_data->sioreg, W83667HG_LD_VID);
+		data->vid = superio_inb(sio_data->sioreg, 0xe3);
 		err = device_create_file(dev, &dev_attr_cpu0_vid);
 		if (err)
 			goto exit_release;
 	} else {
-		dev_info(dev, "VID pins in output mode, CPU VID not "
-			 "available\n");
+		superio_select(sio_data->sioreg, W83627EHF_LD_HWM);
+		if (superio_inb(sio_data->sioreg, SIO_REG_VID_CTRL) & 0x80) {
+			/* Set VID input sensibility if needed. In theory the
+			   BIOS should have set it, but in practice it's not
+			   always the case. We only do it for the W83627EHF/EHG
+			   because the W83627DHG is more complex in this
+			   respect. */
+			if (sio_data->kind == w83627ehf) {
+				en_vrm10 = superio_inb(sio_data->sioreg,
+						       SIO_REG_EN_VRM10);
+				if ((en_vrm10 & 0x08) && data->vrm == 90) {
+					dev_warn(dev, "Setting VID input "
+						 "voltage to TTL\n");
+					superio_outb(sio_data->sioreg,
+						     SIO_REG_EN_VRM10,
+						     en_vrm10 & ~0x08);
+				} else if (!(en_vrm10 & 0x08)
+					   && data->vrm == 100) {
+					dev_warn(dev, "Setting VID input "
+						 "voltage to VRM10\n");
+					superio_outb(sio_data->sioreg,
+						     SIO_REG_EN_VRM10,
+						     en_vrm10 | 0x08);
+				}
+			}
+
+			data->vid = superio_inb(sio_data->sioreg,
+						SIO_REG_VID_DATA);
+			if (sio_data->kind == w83627ehf) /* 6 VID pins only */
+				data->vid &= 0x3f;
+
+			err = device_create_file(dev, &dev_attr_cpu0_vid);
+			if (err)
+				goto exit_release;
+		} else {
+			dev_info(dev, "VID pins in output mode, CPU VID not "
+				 "available\n");
+		}
 	}
 
 	/* fan4 and fan5 share some pins with the GPIO and serial flash */
-
-	fan5pin = !(superio_inb(sio_data->sioreg, 0x24) & 0x2);
-	fan4pin = !(superio_inb(sio_data->sioreg, 0x29) & 0x6);
+	if (sio_data->kind == w83667hg) {
+		fan5pin = superio_inb(sio_data->sioreg, 0x27) & 0x20;
+		fan4pin = superio_inb(sio_data->sioreg, 0x27) & 0x40;
+	} else {
+		fan5pin = !(superio_inb(sio_data->sioreg, 0x24) & 0x02);
+		fan4pin = !(superio_inb(sio_data->sioreg, 0x29) & 0x06);
+	}
 	superio_exit(sio_data->sioreg);
 
 	/* It looks like fan4 and fan5 pins can be alternatively used
@@ -1344,7 +1371,7 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev)
 			goto exit_remove;
 
 	/* if fan4 is enabled create the sf3 files for it */
-	if (data->has_fan & (1 << 3))
+	if ((data->has_fan & (1 << 3)) && data->pwm_num >= 4)
 		for (i = 0; i < ARRAY_SIZE(sda_sf3_arrays_fan4); i++) {
 			if ((err = device_create_file(dev,
 				&sda_sf3_arrays_fan4[i].dev_attr)))
@@ -1372,7 +1399,7 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev)
 				|| (err = device_create_file(dev,
 					&sda_fan_min[i].dev_attr)))
 				goto exit_remove;
-			if (i < 4 && /* w83627ehf only has 4 pwm */
+			if (i < data->pwm_num &&
 				((err = device_create_file(dev,
 					&sda_pwm[i].dev_attr))
 				|| (err = device_create_file(dev,
@@ -1442,6 +1469,7 @@ static int __init w83627ehf_find(int sioaddr, unsigned short *addr,
 	static const char __initdata sio_name_W83627EHF[] = "W83627EHF";
 	static const char __initdata sio_name_W83627EHG[] = "W83627EHG";
 	static const char __initdata sio_name_W83627DHG[] = "W83627DHG";
+	static const char __initdata sio_name_W83667HG[] = "W83667HG";
 
 	u16 val;
 	const char *sio_name;
@@ -1466,6 +1494,10 @@ static int __init w83627ehf_find(int sioaddr, unsigned short *addr,
 		sio_data->kind = w83627dhg;
 		sio_name = sio_name_W83627DHG;
 		break;
+	case SIO_W83667HG_ID:
+		sio_data->kind = w83667hg;
+		sio_name = sio_name_W83667HG;
+		break;
 	default:
 		if (val != 0xffff)
 			pr_debug(DRVNAME ": unsupported chip ID: 0x%04x\n",
-- 
cgit v1.2.3


From a157d06d4d70318a0818552095071d7430dd5d34 Mon Sep 17 00:00:00 2001
From: Gong Jun <JGong@nuvoton.com>
Date: Mon, 30 Mar 2009 21:46:43 +0200
Subject: hwmon: (w83627ehf) Only expose in6 or temp3 on the W83667HG

The pin for in6 and temp3 is shared on the W83667HG, so only one of
these features can be supported on any given system. Let the driver
select which one depending on the temp3 disabled bit.

Signed-off-by: Gong Jun <JGong@nuvoton.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/hwmon/w83627ehf.c | 60 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/drivers/hwmon/w83627ehf.c b/drivers/hwmon/w83627ehf.c
index 20a9332959b..e64b42058b2 100644
--- a/drivers/hwmon/w83627ehf.c
+++ b/drivers/hwmon/w83627ehf.c
@@ -303,6 +303,9 @@ struct w83627ehf_data {
 
 	u8 vid;
 	u8 vrm;
+
+	u8 temp3_disable;
+	u8 in6_skip;
 };
 
 struct w83627ehf_sio_data {
@@ -871,25 +874,37 @@ show_temp_type(struct device *dev, struct device_attribute *attr, char *buf)
 	return sprintf(buf, "%d\n", (int)data->temp_type[nr]);
 }
 
-static struct sensor_device_attribute sda_temp[] = {
+static struct sensor_device_attribute sda_temp_input[] = {
 	SENSOR_ATTR(temp1_input, S_IRUGO, show_temp1, NULL, 0),
 	SENSOR_ATTR(temp2_input, S_IRUGO, show_temp, NULL, 0),
 	SENSOR_ATTR(temp3_input, S_IRUGO, show_temp, NULL, 1),
+};
+
+static struct sensor_device_attribute sda_temp_max[] = {
 	SENSOR_ATTR(temp1_max, S_IRUGO | S_IWUSR, show_temp1_max,
 		    store_temp1_max, 0),
 	SENSOR_ATTR(temp2_max, S_IRUGO | S_IWUSR, show_temp_max,
 		    store_temp_max, 0),
 	SENSOR_ATTR(temp3_max, S_IRUGO | S_IWUSR, show_temp_max,
 		    store_temp_max, 1),
+};
+
+static struct sensor_device_attribute sda_temp_max_hyst[] = {
 	SENSOR_ATTR(temp1_max_hyst, S_IRUGO | S_IWUSR, show_temp1_max_hyst,
 		    store_temp1_max_hyst, 0),
 	SENSOR_ATTR(temp2_max_hyst, S_IRUGO | S_IWUSR, show_temp_max_hyst,
 		    store_temp_max_hyst, 0),
 	SENSOR_ATTR(temp3_max_hyst, S_IRUGO | S_IWUSR, show_temp_max_hyst,
 		    store_temp_max_hyst, 1),
+};
+
+static struct sensor_device_attribute sda_temp_alarm[] = {
 	SENSOR_ATTR(temp1_alarm, S_IRUGO, show_alarm, NULL, 4),
 	SENSOR_ATTR(temp2_alarm, S_IRUGO, show_alarm, NULL, 5),
 	SENSOR_ATTR(temp3_alarm, S_IRUGO, show_alarm, NULL, 13),
+};
+
+static struct sensor_device_attribute sda_temp_type[] = {
 	SENSOR_ATTR(temp1_type, S_IRUGO, show_temp_type, NULL, 0),
 	SENSOR_ATTR(temp2_type, S_IRUGO, show_temp_type, NULL, 1),
 	SENSOR_ATTR(temp3_type, S_IRUGO, show_temp_type, NULL, 2),
@@ -1186,6 +1201,8 @@ static void w83627ehf_device_remove_files(struct device *dev)
 	for (i = 0; i < ARRAY_SIZE(sda_sf3_arrays_fan4); i++)
 		device_remove_file(dev, &sda_sf3_arrays_fan4[i].dev_attr);
 	for (i = 0; i < data->in_num; i++) {
+		if ((i == 6) && data->in6_skip)
+			continue;
 		device_remove_file(dev, &sda_in_input[i].dev_attr);
 		device_remove_file(dev, &sda_in_alarm[i].dev_attr);
 		device_remove_file(dev, &sda_in_min[i].dev_attr);
@@ -1204,8 +1221,15 @@ static void w83627ehf_device_remove_files(struct device *dev)
 		device_remove_file(dev, &sda_target_temp[i].dev_attr);
 		device_remove_file(dev, &sda_tolerance[i].dev_attr);
 	}
-	for (i = 0; i < ARRAY_SIZE(sda_temp); i++)
-		device_remove_file(dev, &sda_temp[i].dev_attr);
+	for (i = 0; i < 3; i++) {
+		if ((i == 2) && data->temp3_disable)
+			continue;
+		device_remove_file(dev, &sda_temp_input[i].dev_attr);
+		device_remove_file(dev, &sda_temp_max[i].dev_attr);
+		device_remove_file(dev, &sda_temp_max_hyst[i].dev_attr);
+		device_remove_file(dev, &sda_temp_alarm[i].dev_attr);
+		device_remove_file(dev, &sda_temp_type[i].dev_attr);
+	}
 
 	device_remove_file(dev, &dev_attr_name);
 	device_remove_file(dev, &dev_attr_cpu0_vid);
@@ -1227,6 +1251,8 @@ static inline void __devinit w83627ehf_init_device(struct w83627ehf_data *data)
 	for (i = 0; i < 2; i++) {
 		tmp = w83627ehf_read_value(data,
 					   W83627EHF_REG_TEMP_CONFIG[i]);
+		if ((i == 1) && data->temp3_disable)
+			continue;
 		if (tmp & 0x01)
 			w83627ehf_write_value(data,
 					      W83627EHF_REG_TEMP_CONFIG[i],
@@ -1282,6 +1308,13 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev)
 	/* 667HG has 3 pwms */
 	data->pwm_num = (sio_data->kind == w83667hg) ? 3 : 4;
 
+	/* Check temp3 configuration bit for 667HG */
+	if (sio_data->kind == w83667hg) {
+		data->temp3_disable = w83627ehf_read_value(data,
+					W83627EHF_REG_TEMP_CONFIG[1]) & 0x01;
+		data->in6_skip = !data->temp3_disable;
+	}
+
 	/* Initialize the chip */
 	w83627ehf_init_device(data);
 
@@ -1378,7 +1411,9 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev)
 				goto exit_remove;
 		}
 
-	for (i = 0; i < data->in_num; i++)
+	for (i = 0; i < data->in_num; i++) {
+		if ((i == 6) && data->in6_skip)
+			continue;
 		if ((err = device_create_file(dev, &sda_in_input[i].dev_attr))
 			|| (err = device_create_file(dev,
 				&sda_in_alarm[i].dev_attr))
@@ -1387,6 +1422,7 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev)
 			|| (err = device_create_file(dev,
 				&sda_in_max[i].dev_attr)))
 			goto exit_remove;
+	}
 
 	for (i = 0; i < 5; i++) {
 		if (data->has_fan & (1 << i)) {
@@ -1414,9 +1450,21 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev)
 		}
 	}
 
-	for (i = 0; i < ARRAY_SIZE(sda_temp); i++)
-		if ((err = device_create_file(dev, &sda_temp[i].dev_attr)))
+	for (i = 0; i < 3; i++) {
+		if ((i == 2) && data->temp3_disable)
+			continue;
+		if ((err = device_create_file(dev,
+				&sda_temp_input[i].dev_attr))
+			|| (err = device_create_file(dev,
+				&sda_temp_max[i].dev_attr))
+			|| (err = device_create_file(dev,
+				&sda_temp_max_hyst[i].dev_attr))
+			|| (err = device_create_file(dev,
+				&sda_temp_alarm[i].dev_attr))
+			|| (err = device_create_file(dev,
+				&sda_temp_type[i].dev_attr)))
 			goto exit_remove;
+	}
 
 	err = device_create_file(dev, &dev_attr_name);
 	if (err)
-- 
cgit v1.2.3


From fb4504fe84b09cbf49fda19e6630a1003d79656a Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Mon, 30 Mar 2009 21:46:43 +0200
Subject: Move the pcf8591 driver to hwmon

Directory drivers/i2c/chips is going away, so drivers there must find
new homes. For the pcf8591 driver, the best choice seems to be the
hwmon subsystem. While the Philips PCF8591 device isn't a typical
hardware monitoring chip, its DAC interface is compatible with the
hwmon one, so it fits somewhat.

If a better subsystem is ever created for ADC/DAC chips, the driver
could be moved there.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Aurelien Jarno <aurelien@aurel32.net>
---
 Documentation/hwmon/pcf8591     |  90 +++++++++++
 Documentation/i2c/chips/pcf8591 |  90 -----------
 drivers/hwmon/Kconfig           |  14 ++
 drivers/hwmon/Makefile          |   1 +
 drivers/hwmon/pcf8591.c         | 325 ++++++++++++++++++++++++++++++++++++++++
 drivers/i2c/chips/Kconfig       |  13 --
 drivers/i2c/chips/Makefile      |   1 -
 drivers/i2c/chips/pcf8591.c     | 325 ----------------------------------------
 8 files changed, 430 insertions(+), 429 deletions(-)
 create mode 100644 Documentation/hwmon/pcf8591
 delete mode 100644 Documentation/i2c/chips/pcf8591
 create mode 100644 drivers/hwmon/pcf8591.c
 delete mode 100644 drivers/i2c/chips/pcf8591.c

diff --git a/Documentation/hwmon/pcf8591 b/Documentation/hwmon/pcf8591
new file mode 100644
index 00000000000..5628fcf4207
--- /dev/null
+++ b/Documentation/hwmon/pcf8591
@@ -0,0 +1,90 @@
+Kernel driver pcf8591
+=====================
+
+Supported chips:
+  * Philips PCF8591
+    Prefix: 'pcf8591'
+    Addresses scanned: I2C 0x48 - 0x4f
+    Datasheet: Publicly available at the Philips Semiconductor website
+               http://www.semiconductors.philips.com/pip/PCF8591P.html
+
+Authors:
+        Aurelien Jarno <aurelien@aurel32.net>
+        valuable contributions by Jan M. Sendler <sendler@sendler.de>,
+        Jean Delvare <khali@linux-fr.org>
+
+
+Description
+-----------
+The PCF8591 is an 8-bit A/D and D/A converter (4 analog inputs and one
+analog output) for the I2C bus produced by Philips Semiconductors. It
+is designed to provide a byte I2C interface to up to 4 separate devices.
+
+The PCF8591 has 4 analog inputs programmable as single-ended or
+differential inputs :
+- mode 0 : four single ended inputs
+        Pins AIN0 to AIN3 are single ended inputs for channels 0 to 3
+
+- mode 1 : three differential inputs
+        Pins AIN3 is the common negative differential input
+        Pins AIN0 to AIN2 are positive differential inputs for channels 0 to 2
+
+- mode 2 : single ended and differential mixed
+        Pins AIN0 and AIN1 are single ended inputs for channels 0 and 1
+        Pins AIN2 is the positive differential input for channel 3
+        Pins AIN3 is the negative differential input for channel 3
+
+- mode 3 : two differential inputs
+        Pins AIN0 is the positive differential input for channel 0
+        Pins AIN1 is the negative differential input for channel 0
+        Pins AIN2 is the positive differential input for channel 1
+        Pins AIN3 is the negative differential input for channel 1
+
+See the datasheet for details.
+
+Module parameters
+-----------------
+
+* input_mode int
+
+    Analog input mode:
+         0 = four single ended inputs
+         1 = three differential inputs
+         2 = single ended and differential mixed
+         3 = two differential inputs
+
+
+Accessing PCF8591 via /sys interface
+-------------------------------------
+
+! Be careful !
+The PCF8591 is plainly impossible to detect ! Stupid chip.
+So every chip with address in the interval [48..4f] is
+detected as PCF8591. If you have other chips in this address
+range, the workaround is to load this module after the one
+for your others chips.
+
+On detection (i.e. insmod, modprobe et al.), directories are being
+created for each detected PCF8591:
+
+/sys/bus/devices/<0>-<1>/
+where <0> is the bus the chip was detected on (e. g. i2c-0)
+and <1> the chip address ([48..4f])
+
+Inside these directories, there are such files:
+in0, in1, in2, in3, out0_enable, out0_output, name
+
+Name contains chip name.
+
+The in0, in1, in2 and in3 files are RO. Reading gives the value of the
+corresponding channel. Depending on the current analog inputs configuration,
+files in2 and/or in3 do not exist. Values range are from 0 to 255 for single
+ended inputs and -128 to +127 for differential inputs (8-bit ADC).
+
+The out0_enable file is RW. Reading gives "1" for analog output enabled and
+"0" for analog output disabled. Writing accepts "0" and "1" accordingly.
+
+The out0_output file is RW. Writing a number between 0 and 255 (8-bit DAC), send
+the value to the digital-to-analog converter. Note that a voltage will
+only appears on AOUT pin if aout0_enable equals 1. Reading returns the last
+value written.
diff --git a/Documentation/i2c/chips/pcf8591 b/Documentation/i2c/chips/pcf8591
deleted file mode 100644
index 5628fcf4207..00000000000
--- a/Documentation/i2c/chips/pcf8591
+++ /dev/null
@@ -1,90 +0,0 @@
-Kernel driver pcf8591
-=====================
-
-Supported chips:
-  * Philips PCF8591
-    Prefix: 'pcf8591'
-    Addresses scanned: I2C 0x48 - 0x4f
-    Datasheet: Publicly available at the Philips Semiconductor website
-               http://www.semiconductors.philips.com/pip/PCF8591P.html
-
-Authors:
-        Aurelien Jarno <aurelien@aurel32.net>
-        valuable contributions by Jan M. Sendler <sendler@sendler.de>,
-        Jean Delvare <khali@linux-fr.org>
-
-
-Description
------------
-The PCF8591 is an 8-bit A/D and D/A converter (4 analog inputs and one
-analog output) for the I2C bus produced by Philips Semiconductors. It
-is designed to provide a byte I2C interface to up to 4 separate devices.
-
-The PCF8591 has 4 analog inputs programmable as single-ended or
-differential inputs :
-- mode 0 : four single ended inputs
-        Pins AIN0 to AIN3 are single ended inputs for channels 0 to 3
-
-- mode 1 : three differential inputs
-        Pins AIN3 is the common negative differential input
-        Pins AIN0 to AIN2 are positive differential inputs for channels 0 to 2
-
-- mode 2 : single ended and differential mixed
-        Pins AIN0 and AIN1 are single ended inputs for channels 0 and 1
-        Pins AIN2 is the positive differential input for channel 3
-        Pins AIN3 is the negative differential input for channel 3
-
-- mode 3 : two differential inputs
-        Pins AIN0 is the positive differential input for channel 0
-        Pins AIN1 is the negative differential input for channel 0
-        Pins AIN2 is the positive differential input for channel 1
-        Pins AIN3 is the negative differential input for channel 1
-
-See the datasheet for details.
-
-Module parameters
------------------
-
-* input_mode int
-
-    Analog input mode:
-         0 = four single ended inputs
-         1 = three differential inputs
-         2 = single ended and differential mixed
-         3 = two differential inputs
-
-
-Accessing PCF8591 via /sys interface
--------------------------------------
-
-! Be careful !
-The PCF8591 is plainly impossible to detect ! Stupid chip.
-So every chip with address in the interval [48..4f] is
-detected as PCF8591. If you have other chips in this address
-range, the workaround is to load this module after the one
-for your others chips.
-
-On detection (i.e. insmod, modprobe et al.), directories are being
-created for each detected PCF8591:
-
-/sys/bus/devices/<0>-<1>/
-where <0> is the bus the chip was detected on (e. g. i2c-0)
-and <1> the chip address ([48..4f])
-
-Inside these directories, there are such files:
-in0, in1, in2, in3, out0_enable, out0_output, name
-
-Name contains chip name.
-
-The in0, in1, in2 and in3 files are RO. Reading gives the value of the
-corresponding channel. Depending on the current analog inputs configuration,
-files in2 and/or in3 do not exist. Values range are from 0 to 255 for single
-ended inputs and -128 to +127 for differential inputs (8-bit ADC).
-
-The out0_enable file is RW. Reading gives "1" for analog output enabled and
-"0" for analog output disabled. Writing accepts "0" and "1" accordingly.
-
-The out0_output file is RW. Writing a number between 0 and 255 (8-bit DAC), send
-the value to the digital-to-analog converter. Note that a voltage will
-only appears on AOUT pin if aout0_enable equals 1. Reading returns the last
-value written.
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 0eb4170cc4a..9a22816b37d 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -635,6 +635,20 @@ config SENSORS_PC87427
 	  This driver can also be built as a module.  If so, the module
 	  will be called pc87427.
 
+config SENSORS_PCF8591
+	tristate "Philips PCF8591 ADC/DAC"
+	depends on I2C
+	default n
+	help
+	  If you say yes here you get support for Philips PCF8591 4-channel
+	  ADC, 1-channel DAC chips.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called pcf8591.
+
+	  These devices are hard to detect and rarely found on mainstream
+	  hardware.  If unsure, say N.
+
 config SENSORS_SIS5595
 	tristate "Silicon Integrated Systems Corp. SiS5595"
 	depends on PCI
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index 2e80f37f39e..e332d626792 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -70,6 +70,7 @@ obj-$(CONFIG_SENSORS_MAX1619)	+= max1619.o
 obj-$(CONFIG_SENSORS_MAX6650)	+= max6650.o
 obj-$(CONFIG_SENSORS_PC87360)	+= pc87360.o
 obj-$(CONFIG_SENSORS_PC87427)	+= pc87427.o
+obj-$(CONFIG_SENSORS_PCF8591)	+= pcf8591.o
 obj-$(CONFIG_SENSORS_SIS5595)	+= sis5595.o
 obj-$(CONFIG_SENSORS_SMSC47B397)+= smsc47b397.o
 obj-$(CONFIG_SENSORS_SMSC47M1)	+= smsc47m1.o
diff --git a/drivers/hwmon/pcf8591.c b/drivers/hwmon/pcf8591.c
new file mode 100644
index 00000000000..1d7ffebd679
--- /dev/null
+++ b/drivers/hwmon/pcf8591.c
@@ -0,0 +1,325 @@
+/*
+    Copyright (C) 2001-2004 Aurelien Jarno <aurelien@aurel32.net>
+    Ported to Linux 2.6 by Aurelien Jarno <aurelien@aurel32.net> with
+    the help of Jean Delvare <khali@linux-fr.org>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/mutex.h>
+
+/* Addresses to scan */
+static const unsigned short normal_i2c[] = { 0x48, 0x49, 0x4a, 0x4b, 0x4c,
+					0x4d, 0x4e, 0x4f, I2C_CLIENT_END };
+
+/* Insmod parameters */
+I2C_CLIENT_INSMOD_1(pcf8591);
+
+static int input_mode;
+module_param(input_mode, int, 0);
+MODULE_PARM_DESC(input_mode,
+	"Analog input mode:\n"
+	" 0 = four single ended inputs\n"
+	" 1 = three differential inputs\n"
+	" 2 = single ended and differential mixed\n"
+	" 3 = two differential inputs\n");
+
+/* The PCF8591 control byte
+      7    6    5    4    3    2    1    0
+   |  0 |AOEF|   AIP   |  0 |AINC|  AICH   | */
+
+/* Analog Output Enable Flag (analog output active if 1) */
+#define PCF8591_CONTROL_AOEF		0x40
+
+/* Analog Input Programming
+   0x00 = four single ended inputs
+   0x10 = three differential inputs
+   0x20 = single ended and differential mixed
+   0x30 = two differential inputs */
+#define PCF8591_CONTROL_AIP_MASK	0x30
+
+/* Autoincrement Flag (switch on if 1) */
+#define PCF8591_CONTROL_AINC		0x04
+
+/* Channel selection
+   0x00 = channel 0
+   0x01 = channel 1
+   0x02 = channel 2
+   0x03 = channel 3 */
+#define PCF8591_CONTROL_AICH_MASK	0x03
+
+/* Initial values */
+#define PCF8591_INIT_CONTROL	((input_mode << 4) | PCF8591_CONTROL_AOEF)
+#define PCF8591_INIT_AOUT	0	/* DAC out = 0 */
+
+/* Conversions */
+#define REG_TO_SIGNED(reg)	(((reg) & 0x80)?((reg) - 256):(reg))
+
+struct pcf8591_data {
+	struct mutex update_lock;
+
+	u8 control;
+	u8 aout;
+};
+
+static void pcf8591_init_client(struct i2c_client *client);
+static int pcf8591_read_channel(struct device *dev, int channel);
+
+/* following are the sysfs callback functions */
+#define show_in_channel(channel)					\
+static ssize_t show_in##channel##_input(struct device *dev, struct device_attribute *attr, char *buf)	\
+{									\
+	return sprintf(buf, "%d\n", pcf8591_read_channel(dev, channel));\
+}									\
+static DEVICE_ATTR(in##channel##_input, S_IRUGO,			\
+		   show_in##channel##_input, NULL);
+
+show_in_channel(0);
+show_in_channel(1);
+show_in_channel(2);
+show_in_channel(3);
+
+static ssize_t show_out0_ouput(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct pcf8591_data *data = i2c_get_clientdata(to_i2c_client(dev));
+	return sprintf(buf, "%d\n", data->aout * 10);
+}
+
+static ssize_t set_out0_output(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+	unsigned int value;
+	struct i2c_client *client = to_i2c_client(dev);
+	struct pcf8591_data *data = i2c_get_clientdata(client);
+	if ((value = (simple_strtoul(buf, NULL, 10) + 5) / 10) <= 255) {
+		data->aout = value;
+		i2c_smbus_write_byte_data(client, data->control, data->aout);
+		return count;
+	}
+	return -EINVAL;
+}
+
+static DEVICE_ATTR(out0_output, S_IWUSR | S_IRUGO,
+		   show_out0_ouput, set_out0_output);
+
+static ssize_t show_out0_enable(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct pcf8591_data *data = i2c_get_clientdata(to_i2c_client(dev));
+	return sprintf(buf, "%u\n", !(!(data->control & PCF8591_CONTROL_AOEF)));
+}
+
+static ssize_t set_out0_enable(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct pcf8591_data *data = i2c_get_clientdata(client);
+	unsigned long val = simple_strtoul(buf, NULL, 10);
+
+	mutex_lock(&data->update_lock);
+	if (val)
+		data->control |= PCF8591_CONTROL_AOEF;
+	else
+		data->control &= ~PCF8591_CONTROL_AOEF;
+	i2c_smbus_write_byte(client, data->control);
+	mutex_unlock(&data->update_lock);
+	return count;
+}
+
+static DEVICE_ATTR(out0_enable, S_IWUSR | S_IRUGO,
+		   show_out0_enable, set_out0_enable);
+
+static struct attribute *pcf8591_attributes[] = {
+	&dev_attr_out0_enable.attr,
+	&dev_attr_out0_output.attr,
+	&dev_attr_in0_input.attr,
+	&dev_attr_in1_input.attr,
+	NULL
+};
+
+static const struct attribute_group pcf8591_attr_group = {
+	.attrs = pcf8591_attributes,
+};
+
+static struct attribute *pcf8591_attributes_opt[] = {
+	&dev_attr_in2_input.attr,
+	&dev_attr_in3_input.attr,
+	NULL
+};
+
+static const struct attribute_group pcf8591_attr_group_opt = {
+	.attrs = pcf8591_attributes_opt,
+};
+
+/*
+ * Real code
+ */
+
+/* Return 0 if detection is successful, -ENODEV otherwise */
+static int pcf8591_detect(struct i2c_client *client, int kind,
+			  struct i2c_board_info *info)
+{
+	struct i2c_adapter *adapter = client->adapter;
+
+	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE
+				     | I2C_FUNC_SMBUS_WRITE_BYTE_DATA))
+		return -ENODEV;
+
+	/* Now, we would do the remaining detection. But the PCF8591 is plainly
+	   impossible to detect! Stupid chip. */
+
+	strlcpy(info->type, "pcf8591", I2C_NAME_SIZE);
+
+	return 0;
+}
+
+static int pcf8591_probe(struct i2c_client *client,
+			 const struct i2c_device_id *id)
+{
+	struct pcf8591_data *data;
+	int err;
+
+	if (!(data = kzalloc(sizeof(struct pcf8591_data), GFP_KERNEL))) {
+		err = -ENOMEM;
+		goto exit;
+	}
+
+	i2c_set_clientdata(client, data);
+	mutex_init(&data->update_lock);
+
+	/* Initialize the PCF8591 chip */
+	pcf8591_init_client(client);
+
+	/* Register sysfs hooks */
+	err = sysfs_create_group(&client->dev.kobj, &pcf8591_attr_group);
+	if (err)
+		goto exit_kfree;
+
+	/* Register input2 if not in "two differential inputs" mode */
+	if (input_mode != 3) {
+		if ((err = device_create_file(&client->dev,
+					      &dev_attr_in2_input)))
+			goto exit_sysfs_remove;
+	}
+
+	/* Register input3 only in "four single ended inputs" mode */
+	if (input_mode == 0) {
+		if ((err = device_create_file(&client->dev,
+					      &dev_attr_in3_input)))
+			goto exit_sysfs_remove;
+	}
+
+	return 0;
+
+exit_sysfs_remove:
+	sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group_opt);
+	sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group);
+exit_kfree:
+	kfree(data);
+exit:
+	return err;
+}
+
+static int pcf8591_remove(struct i2c_client *client)
+{
+	sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group_opt);
+	sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group);
+	kfree(i2c_get_clientdata(client));
+	return 0;
+}
+
+/* Called when we have found a new PCF8591. */
+static void pcf8591_init_client(struct i2c_client *client)
+{
+	struct pcf8591_data *data = i2c_get_clientdata(client);
+	data->control = PCF8591_INIT_CONTROL;
+	data->aout = PCF8591_INIT_AOUT;
+
+	i2c_smbus_write_byte_data(client, data->control, data->aout);
+
+	/* The first byte transmitted contains the conversion code of the
+	   previous read cycle. FLUSH IT! */
+	i2c_smbus_read_byte(client);
+}
+
+static int pcf8591_read_channel(struct device *dev, int channel)
+{
+	u8 value;
+	struct i2c_client *client = to_i2c_client(dev);
+	struct pcf8591_data *data = i2c_get_clientdata(client);
+
+	mutex_lock(&data->update_lock);
+
+	if ((data->control & PCF8591_CONTROL_AICH_MASK) != channel) {
+		data->control = (data->control & ~PCF8591_CONTROL_AICH_MASK)
+			      | channel;
+		i2c_smbus_write_byte(client, data->control);
+
+		/* The first byte transmitted contains the conversion code of
+		   the previous read cycle. FLUSH IT! */
+		i2c_smbus_read_byte(client);
+	}
+	value = i2c_smbus_read_byte(client);
+
+	mutex_unlock(&data->update_lock);
+
+	if ((channel == 2 && input_mode == 2) ||
+	    (channel != 3 && (input_mode == 1 || input_mode == 3)))
+		return (10 * REG_TO_SIGNED(value));
+	else
+		return (10 * value);
+}
+
+static const struct i2c_device_id pcf8591_id[] = {
+	{ "pcf8591", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, pcf8591_id);
+
+static struct i2c_driver pcf8591_driver = {
+	.driver = {
+		.name	= "pcf8591",
+	},
+	.probe		= pcf8591_probe,
+	.remove		= pcf8591_remove,
+	.id_table	= pcf8591_id,
+
+	.class		= I2C_CLASS_HWMON,	/* Nearest choice */
+	.detect		= pcf8591_detect,
+	.address_data	= &addr_data,
+};
+
+static int __init pcf8591_init(void)
+{
+	if (input_mode < 0 || input_mode > 3) {
+		printk(KERN_WARNING "pcf8591: invalid input_mode (%d)\n",
+		       input_mode);
+		input_mode = 0;
+	}
+	return i2c_add_driver(&pcf8591_driver);
+}
+
+static void __exit pcf8591_exit(void)
+{
+	i2c_del_driver(&pcf8591_driver);
+}
+
+MODULE_AUTHOR("Aurelien Jarno <aurelien@aurel32.net>");
+MODULE_DESCRIPTION("PCF8591 driver");
+MODULE_LICENSE("GPL");
+
+module_init(pcf8591_init);
+module_exit(pcf8591_exit);
diff --git a/drivers/i2c/chips/Kconfig b/drivers/i2c/chips/Kconfig
index c80312c1f38..8f8c81eb0ae 100644
--- a/drivers/i2c/chips/Kconfig
+++ b/drivers/i2c/chips/Kconfig
@@ -64,19 +64,6 @@ config SENSORS_PCA9539
 	  This driver is deprecated and will be dropped soon. Use
 	  drivers/gpio/pca953x.c instead.
 
-config SENSORS_PCF8591
-	tristate "Philips PCF8591"
-	depends on EXPERIMENTAL
-	default n
-	help
-	  If you say yes here you get support for Philips PCF8591 chips.
-
-	  This driver can also be built as a module.  If so, the module
-	  will be called pcf8591.
-
-	  These devices are hard to detect and rarely found on mainstream
-	  hardware.  If unsure, say N.
-
 config SENSORS_MAX6875
 	tristate "Maxim MAX6875 Power supply supervisor"
 	depends on EXPERIMENTAL
diff --git a/drivers/i2c/chips/Makefile b/drivers/i2c/chips/Makefile
index d142f238a2d..55a37603718 100644
--- a/drivers/i2c/chips/Makefile
+++ b/drivers/i2c/chips/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_SENSORS_MAX6875)	+= max6875.o
 obj-$(CONFIG_SENSORS_PCA9539)	+= pca9539.o
 obj-$(CONFIG_SENSORS_PCF8574)	+= pcf8574.o
 obj-$(CONFIG_PCF8575)		+= pcf8575.o
-obj-$(CONFIG_SENSORS_PCF8591)	+= pcf8591.o
 obj-$(CONFIG_SENSORS_TSL2550)	+= tsl2550.o
 
 ifeq ($(CONFIG_I2C_DEBUG_CHIP),y)
diff --git a/drivers/i2c/chips/pcf8591.c b/drivers/i2c/chips/pcf8591.c
deleted file mode 100644
index 16ce3e19377..00000000000
--- a/drivers/i2c/chips/pcf8591.c
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
-    Copyright (C) 2001-2004 Aurelien Jarno <aurelien@aurel32.net>
-    Ported to Linux 2.6 by Aurelien Jarno <aurelien@aurel32.net> with 
-    the help of Jean Delvare <khali@linux-fr.org>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/i2c.h>
-#include <linux/mutex.h>
-
-/* Addresses to scan */
-static const unsigned short normal_i2c[] = { 0x48, 0x49, 0x4a, 0x4b, 0x4c,
-					0x4d, 0x4e, 0x4f, I2C_CLIENT_END };
-
-/* Insmod parameters */
-I2C_CLIENT_INSMOD_1(pcf8591);
-
-static int input_mode;
-module_param(input_mode, int, 0);
-MODULE_PARM_DESC(input_mode,
-	"Analog input mode:\n"
-	" 0 = four single ended inputs\n"
-	" 1 = three differential inputs\n"
-	" 2 = single ended and differential mixed\n"
-	" 3 = two differential inputs\n");
-
-/* The PCF8591 control byte
-      7    6    5    4    3    2    1    0  
-   |  0 |AOEF|   AIP   |  0 |AINC|  AICH   | */
-
-/* Analog Output Enable Flag (analog output active if 1) */
-#define PCF8591_CONTROL_AOEF		0x40
-					
-/* Analog Input Programming 
-   0x00 = four single ended inputs
-   0x10 = three differential inputs
-   0x20 = single ended and differential mixed
-   0x30 = two differential inputs */
-#define PCF8591_CONTROL_AIP_MASK	0x30
-
-/* Autoincrement Flag (switch on if 1) */
-#define PCF8591_CONTROL_AINC		0x04
-
-/* Channel selection
-   0x00 = channel 0 
-   0x01 = channel 1
-   0x02 = channel 2
-   0x03 = channel 3 */
-#define PCF8591_CONTROL_AICH_MASK	0x03
-
-/* Initial values */
-#define PCF8591_INIT_CONTROL	((input_mode << 4) | PCF8591_CONTROL_AOEF)
-#define PCF8591_INIT_AOUT	0	/* DAC out = 0 */
-
-/* Conversions */
-#define REG_TO_SIGNED(reg)	(((reg) & 0x80)?((reg) - 256):(reg))
-
-struct pcf8591_data {
-	struct mutex update_lock;
-
-	u8 control;
-	u8 aout;
-};
-
-static void pcf8591_init_client(struct i2c_client *client);
-static int pcf8591_read_channel(struct device *dev, int channel);
-
-/* following are the sysfs callback functions */
-#define show_in_channel(channel)					\
-static ssize_t show_in##channel##_input(struct device *dev, struct device_attribute *attr, char *buf)	\
-{									\
-	return sprintf(buf, "%d\n", pcf8591_read_channel(dev, channel));\
-}									\
-static DEVICE_ATTR(in##channel##_input, S_IRUGO,			\
-		   show_in##channel##_input, NULL);
-
-show_in_channel(0);
-show_in_channel(1);
-show_in_channel(2);
-show_in_channel(3);
-
-static ssize_t show_out0_ouput(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct pcf8591_data *data = i2c_get_clientdata(to_i2c_client(dev));
-	return sprintf(buf, "%d\n", data->aout * 10);
-}
-
-static ssize_t set_out0_output(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
-{
-	unsigned int value;
-	struct i2c_client *client = to_i2c_client(dev);
-	struct pcf8591_data *data = i2c_get_clientdata(client);
-	if ((value = (simple_strtoul(buf, NULL, 10) + 5) / 10) <= 255) {
-		data->aout = value;
-		i2c_smbus_write_byte_data(client, data->control, data->aout);
-		return count;
-	}
-	return -EINVAL;
-}
-
-static DEVICE_ATTR(out0_output, S_IWUSR | S_IRUGO, 
-		   show_out0_ouput, set_out0_output);
-
-static ssize_t show_out0_enable(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct pcf8591_data *data = i2c_get_clientdata(to_i2c_client(dev));
-	return sprintf(buf, "%u\n", !(!(data->control & PCF8591_CONTROL_AOEF)));
-}
-
-static ssize_t set_out0_enable(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
-{
-	struct i2c_client *client = to_i2c_client(dev);
-	struct pcf8591_data *data = i2c_get_clientdata(client);
-	unsigned long val = simple_strtoul(buf, NULL, 10);
-
-	mutex_lock(&data->update_lock);
-	if (val)
-		data->control |= PCF8591_CONTROL_AOEF;
-	else
-		data->control &= ~PCF8591_CONTROL_AOEF;
-	i2c_smbus_write_byte(client, data->control);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static DEVICE_ATTR(out0_enable, S_IWUSR | S_IRUGO, 
-		   show_out0_enable, set_out0_enable);
-
-static struct attribute *pcf8591_attributes[] = {
-	&dev_attr_out0_enable.attr,
-	&dev_attr_out0_output.attr,
-	&dev_attr_in0_input.attr,
-	&dev_attr_in1_input.attr,
-	NULL
-};
-
-static const struct attribute_group pcf8591_attr_group = {
-	.attrs = pcf8591_attributes,
-};
-
-static struct attribute *pcf8591_attributes_opt[] = {
-	&dev_attr_in2_input.attr,
-	&dev_attr_in3_input.attr,
-	NULL
-};
-
-static const struct attribute_group pcf8591_attr_group_opt = {
-	.attrs = pcf8591_attributes_opt,
-};
-
-/*
- * Real code
- */
-
-/* Return 0 if detection is successful, -ENODEV otherwise */
-static int pcf8591_detect(struct i2c_client *client, int kind,
-			  struct i2c_board_info *info)
-{
-	struct i2c_adapter *adapter = client->adapter;
-
-	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE
-				     | I2C_FUNC_SMBUS_WRITE_BYTE_DATA))
-		return -ENODEV;
-
-	/* Now, we would do the remaining detection. But the PCF8591 is plainly
-	   impossible to detect! Stupid chip. */
-
-	strlcpy(info->type, "pcf8591", I2C_NAME_SIZE);
-
-	return 0;
-}
-
-static int pcf8591_probe(struct i2c_client *client,
-			 const struct i2c_device_id *id)
-{
-	struct pcf8591_data *data;
-	int err;
-
-	if (!(data = kzalloc(sizeof(struct pcf8591_data), GFP_KERNEL))) {
-		err = -ENOMEM;
-		goto exit;
-	}
-	
-	i2c_set_clientdata(client, data);
-	mutex_init(&data->update_lock);
-
-	/* Initialize the PCF8591 chip */
-	pcf8591_init_client(client);
-
-	/* Register sysfs hooks */
-	err = sysfs_create_group(&client->dev.kobj, &pcf8591_attr_group);
-	if (err)
-		goto exit_kfree;
-
-	/* Register input2 if not in "two differential inputs" mode */
-	if (input_mode != 3) {
-		if ((err = device_create_file(&client->dev,
-					      &dev_attr_in2_input)))
-			goto exit_sysfs_remove;
-	}
-
-	/* Register input3 only in "four single ended inputs" mode */
-	if (input_mode == 0) {
-		if ((err = device_create_file(&client->dev,
-					      &dev_attr_in3_input)))
-			goto exit_sysfs_remove;
-	}
-
-	return 0;
-
-exit_sysfs_remove:
-	sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group_opt);
-	sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group);
-exit_kfree:
-	kfree(data);
-exit:
-	return err;
-}
-
-static int pcf8591_remove(struct i2c_client *client)
-{
-	sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group_opt);
-	sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group);
-	kfree(i2c_get_clientdata(client));
-	return 0;
-}
-
-/* Called when we have found a new PCF8591. */
-static void pcf8591_init_client(struct i2c_client *client)
-{
-	struct pcf8591_data *data = i2c_get_clientdata(client);
-	data->control = PCF8591_INIT_CONTROL;
-	data->aout = PCF8591_INIT_AOUT;
-
-	i2c_smbus_write_byte_data(client, data->control, data->aout);
-	
-	/* The first byte transmitted contains the conversion code of the 
-	   previous read cycle. FLUSH IT! */
-	i2c_smbus_read_byte(client);
-}
-
-static int pcf8591_read_channel(struct device *dev, int channel)
-{
-	u8 value;
-	struct i2c_client *client = to_i2c_client(dev);
-	struct pcf8591_data *data = i2c_get_clientdata(client);
-
-	mutex_lock(&data->update_lock);
-
-	if ((data->control & PCF8591_CONTROL_AICH_MASK) != channel) {
-		data->control = (data->control & ~PCF8591_CONTROL_AICH_MASK)
-			      | channel;
-		i2c_smbus_write_byte(client, data->control);
-	
-		/* The first byte transmitted contains the conversion code of 
-		   the previous read cycle. FLUSH IT! */
-		i2c_smbus_read_byte(client);
-	}
-	value = i2c_smbus_read_byte(client);
-
-	mutex_unlock(&data->update_lock);
-
-	if ((channel == 2 && input_mode == 2) ||
-	    (channel != 3 && (input_mode == 1 || input_mode == 3)))
-		return (10 * REG_TO_SIGNED(value));
-	else
-		return (10 * value);
-}
-
-static const struct i2c_device_id pcf8591_id[] = {
-	{ "pcf8591", 0 },
-	{ }
-};
-MODULE_DEVICE_TABLE(i2c, pcf8591_id);
-
-static struct i2c_driver pcf8591_driver = {
-	.driver = {
-		.name	= "pcf8591",
-	},
-	.probe		= pcf8591_probe,
-	.remove		= pcf8591_remove,
-	.id_table	= pcf8591_id,
-
-	.class		= I2C_CLASS_HWMON,	/* Nearest choice */
-	.detect		= pcf8591_detect,
-	.address_data	= &addr_data,
-};
-
-static int __init pcf8591_init(void)
-{
-	if (input_mode < 0 || input_mode > 3) {
-		printk(KERN_WARNING "pcf8591: invalid input_mode (%d)\n",
-		       input_mode);
-		input_mode = 0;
-	}
-	return i2c_add_driver(&pcf8591_driver);
-}
-
-static void __exit pcf8591_exit(void)
-{
-	i2c_del_driver(&pcf8591_driver);
-}
-
-MODULE_AUTHOR("Aurelien Jarno <aurelien@aurel32.net>");
-MODULE_DESCRIPTION("PCF8591 driver");
-MODULE_LICENSE("GPL");
-
-module_init(pcf8591_init);
-module_exit(pcf8591_exit);
-- 
cgit v1.2.3


From ec19920944246b4686c7772a58507a20c361dc9d Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Mon, 30 Mar 2009 21:46:44 +0200
Subject: hwmon: Define a standard interface for chassis intrusion detection

Define a standard interface for the chassis intrusion detection feature
some hardware monitoring chips have. Some drivers have custom sysfs
entries for it, but a standard interface would allow integration with
user-space (namely libsensors.)

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Acked-by: Hans de Goede <j.w.r.degoede@hhs.nl>
Acked-by: Matt Roberds <mattroberds@cox.net>
---
 Documentation/hwmon/sysfs-interface | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface
index 6dbfd5efd99..2f10ce6a879 100644
--- a/Documentation/hwmon/sysfs-interface
+++ b/Documentation/hwmon/sysfs-interface
@@ -365,6 +365,7 @@ energy[1-*]_input		Cumulative energy use
 				Unit: microJoule
 				RO
 
+
 **********
 * Alarms *
 **********
@@ -453,6 +454,27 @@ beep_mask	Bitmask for beep.
 		RW
 
 
+***********************
+* Intrusion detection *
+***********************
+
+intrusion[0-*]_alarm
+		Chassis intrusion detection
+		0: OK
+		1: intrusion detected
+		RW
+		Contrary to regular alarm flags which clear themselves
+		automatically when read, this one sticks until cleared by
+		the user. This is done by writing 0 to the file. Writing
+		other values is unsupported.
+
+intrusion[0-*]_beep
+		Chassis intrusion beep
+		0: disable
+		1: enable
+		RW
+
+
 sysfs attribute writes interpretation
 -------------------------------------
 
-- 
cgit v1.2.3


From e7a19c5624c66afa8118b10cd59f87ee407646bc Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Mon, 30 Mar 2009 21:46:44 +0200
Subject: dmi: Let dmi_walk() users pass private data

At the moment, dmi_walk() lacks flexibility, users can't pass data to
the callback function. Add a pointer for private data to make this
function more flexible.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Matthew Garrett <mjg@redhat.com>
Cc: Roland Dreier <rolandd@cisco.com>
---
 drivers/firmware/dmi_scan.c        | 18 +++++++++++-------
 drivers/hwmon/fschmd.c             |  4 ++--
 drivers/platform/x86/dell-laptop.c |  4 ++--
 drivers/watchdog/hpwdt.c           |  4 ++--
 include/linux/dmi.h                |  7 ++++---
 5 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 8f0f7c44930..5f1b5400d96 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -68,7 +68,8 @@ static char * __init dmi_string(const struct dmi_header *dm, u8 s)
  *	pointing to completely the wrong place for example
  */
 static void dmi_table(u8 *buf, int len, int num,
-		      void (*decode)(const struct dmi_header *))
+		      void (*decode)(const struct dmi_header *, void *),
+		      void *private_data)
 {
 	u8 *data = buf;
 	int i = 0;
@@ -89,7 +90,7 @@ static void dmi_table(u8 *buf, int len, int num,
 		while ((data - buf < len - 1) && (data[0] || data[1]))
 			data++;
 		if (data - buf < len - 1)
-			decode(dm);
+			decode(dm, private_data);
 		data += 2;
 		i++;
 	}
@@ -99,7 +100,8 @@ static u32 dmi_base;
 static u16 dmi_len;
 static u16 dmi_num;
 
-static int __init dmi_walk_early(void (*decode)(const struct dmi_header *))
+static int __init dmi_walk_early(void (*decode)(const struct dmi_header *,
+		void *))
 {
 	u8 *buf;
 
@@ -107,7 +109,7 @@ static int __init dmi_walk_early(void (*decode)(const struct dmi_header *))
 	if (buf == NULL)
 		return -1;
 
-	dmi_table(buf, dmi_len, dmi_num, decode);
+	dmi_table(buf, dmi_len, dmi_num, decode, NULL);
 
 	dmi_iounmap(buf, dmi_len);
 	return 0;
@@ -295,7 +297,7 @@ static void __init dmi_save_extended_devices(const struct dmi_header *dm)
  *	and machine entries. For 2.5 we should pull the smbus controller info
  *	out of here.
  */
-static void __init dmi_decode(const struct dmi_header *dm)
+static void __init dmi_decode(const struct dmi_header *dm, void *dummy)
 {
 	switch(dm->type) {
 	case 0:		/* BIOS Information */
@@ -598,10 +600,12 @@ int dmi_get_year(int field)
 /**
  *	dmi_walk - Walk the DMI table and get called back for every record
  *	@decode: Callback function
+ *	@private_data: Private data to be passed to the callback function
  *
  *	Returns -1 when the DMI table can't be reached, 0 on success.
  */
-int dmi_walk(void (*decode)(const struct dmi_header *))
+int dmi_walk(void (*decode)(const struct dmi_header *, void *),
+	     void *private_data)
 {
 	u8 *buf;
 
@@ -612,7 +616,7 @@ int dmi_walk(void (*decode)(const struct dmi_header *))
 	if (buf == NULL)
 		return -1;
 
-	dmi_table(buf, dmi_len, dmi_num, decode);
+	dmi_table(buf, dmi_len, dmi_num, decode, private_data);
 
 	iounmap(buf);
 	return 0;
diff --git a/drivers/hwmon/fschmd.c b/drivers/hwmon/fschmd.c
index d07f4ef7509..b557f2ebd9a 100644
--- a/drivers/hwmon/fschmd.c
+++ b/drivers/hwmon/fschmd.c
@@ -856,7 +856,7 @@ static struct file_operations watchdog_fops = {
 
 /* DMI decode routine to read voltage scaling factors from special DMI tables,
    which are available on FSC machines with an fscher or later chip. */
-static void fschmd_dmi_decode(const struct dmi_header *header)
+static void fschmd_dmi_decode(const struct dmi_header *header, void *dummy)
 {
 	int i, mult[3] = { 0 }, offset[3] = { 0 }, vref = 0, found = 0;
 
@@ -991,7 +991,7 @@ static int fschmd_probe(struct i2c_client *client,
 
 	/* Read the special DMI table for fscher and newer chips */
 	if ((kind == fscher || kind >= fschrc) && dmi_vref == -1) {
-		dmi_walk(fschmd_dmi_decode);
+		dmi_walk(fschmd_dmi_decode, NULL);
 		if (dmi_vref == -1) {
 			dev_warn(&client->dev,
 				"Couldn't get voltage scaling factors from "
diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c
index 16e11c2ee19..af9f4302117 100644
--- a/drivers/platform/x86/dell-laptop.c
+++ b/drivers/platform/x86/dell-laptop.c
@@ -103,7 +103,7 @@ static void parse_da_table(const struct dmi_header *dm)
 	da_num_tokens += tokens;
 }
 
-static void find_tokens(const struct dmi_header *dm)
+static void find_tokens(const struct dmi_header *dm, void *dummy)
 {
 	switch (dm->type) {
 	case 0xd4: /* Indexed IO */
@@ -356,7 +356,7 @@ static int __init dell_init(void)
 	if (!dmi_check_system(dell_device_table))
 		return -ENODEV;
 
-	dmi_walk(find_tokens);
+	dmi_walk(find_tokens, NULL);
 
 	if (!da_tokens)  {
 		printk(KERN_INFO "dell-laptop: Unable to find dmi tokens\n");
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index 6cf155d6b35..3137361ccbf 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -380,7 +380,7 @@ asm(".text                      \n\t"
  *	This function checks whether or not a SMBIOS/DMI record is
  *	the 64bit CRU info or not
  */
-static void __devinit dmi_find_cru(const struct dmi_header *dm)
+static void __devinit dmi_find_cru(const struct dmi_header *dm, void *dummy)
 {
 	struct smbios_cru64_info *smbios_cru64_ptr;
 	unsigned long cru_physical_address;
@@ -403,7 +403,7 @@ static int __devinit detect_cru_service(void)
 {
 	cru_rom_addr = NULL;
 
-	dmi_walk(dmi_find_cru);
+	dmi_walk(dmi_find_cru, NULL);
 
 	/* if cru_rom_addr has been set then we found a CRU service */
 	return ((cru_rom_addr != NULL) ? 0 : -ENODEV);
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index d741b9ceb0e..bb5489c82c9 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -47,7 +47,8 @@ extern int dmi_get_year(int field);
 extern int dmi_name_in_vendors(const char *str);
 extern int dmi_name_in_serial(const char *str);
 extern int dmi_available;
-extern int dmi_walk(void (*decode)(const struct dmi_header *));
+extern int dmi_walk(void (*decode)(const struct dmi_header *, void *),
+	void *private_data);
 extern bool dmi_match(enum dmi_field f, const char *str);
 
 #else
@@ -61,8 +62,8 @@ static inline int dmi_get_year(int year) { return 0; }
 static inline int dmi_name_in_vendors(const char *s) { return 0; }
 static inline int dmi_name_in_serial(const char *s) { return 0; }
 #define dmi_available 0
-static inline int dmi_walk(void (*decode)(const struct dmi_header *))
-	{ return -1; }
+static inline int dmi_walk(void (*decode)(const struct dmi_header *, void *),
+	void *private_data) { return -1; }
 static inline bool dmi_match(enum dmi_field f, const char *str)
 	{ return false; }
 static inline const struct dmi_system_id *
-- 
cgit v1.2.3


From fa5bfab7128e58c31448fca83a288a86e7d476cc Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Mon, 30 Mar 2009 21:46:44 +0200
Subject: i2c-i801: Instantiate FSC hardware montioring chips

Detect various FSC hwmon IC's based on DMI tables and then let
the i2c-i801 driver instantiate the i2c client devices. Note that
some of the info in the added table is indentical for all rows, still
this is kept in the table to keep the code general and thus (hopefully)
easily extensible in the future.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/i2c/busses/i2c-i801.c | 77 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c
index 230238df56c..10411848fd7 100644
--- a/drivers/i2c/busses/i2c-i801.c
+++ b/drivers/i2c/busses/i2c-i801.c
@@ -65,6 +65,7 @@
 #include <linux/i2c.h>
 #include <linux/acpi.h>
 #include <linux/io.h>
+#include <linux/dmi.h>
 
 /* I801 SMBus address offsets */
 #define SMBHSTSTS	(0 + i801_smba)
@@ -616,10 +617,81 @@ static void __init input_apanel_init(void)
 static void __init input_apanel_init(void) {}
 #endif
 
+#if defined CONFIG_SENSORS_FSCHMD || defined CONFIG_SENSORS_FSCHMD_MODULE
+struct dmi_onboard_device_info {
+	const char *name;
+	u8 type;
+	unsigned short i2c_addr;
+	const char *i2c_type;
+};
+
+static struct dmi_onboard_device_info __devinitdata dmi_devices[] = {
+	{ "Syleus", DMI_DEV_TYPE_OTHER, 0x73, "fscsyl" },
+	{ "Hermes", DMI_DEV_TYPE_OTHER, 0x73, "fscher" },
+	{ "Hades",  DMI_DEV_TYPE_OTHER, 0x73, "fschds" },
+};
+
+static void __devinit dmi_check_onboard_device(u8 type, const char *name,
+					       struct i2c_adapter *adap)
+{
+	int i;
+	struct i2c_board_info info;
+
+	for (i = 0; i < ARRAY_SIZE(dmi_devices); i++) {
+		/* & ~0x80, ignore enabled/disabled bit */
+		if ((type & ~0x80) != dmi_devices[i].type)
+			continue;
+		if (strcmp(name, dmi_devices[i].name))
+			continue;
+
+		memset(&info, 0, sizeof(struct i2c_board_info));
+		info.addr = dmi_devices[i].i2c_addr;
+		strlcpy(info.type, dmi_devices[i].i2c_type, I2C_NAME_SIZE);
+		i2c_new_device(adap, &info);
+		break;
+	}
+}
+
+/* We use our own function to check for onboard devices instead of
+   dmi_find_device() as some buggy BIOS's have the devices we are interested
+   in marked as disabled */
+static void __devinit dmi_check_onboard_devices(const struct dmi_header *dm,
+						void *adap)
+{
+	int i, count;
+
+	if (dm->type != 10)
+		return;
+
+	count = (dm->length - sizeof(struct dmi_header)) / 2;
+	for (i = 0; i < count; i++) {
+		const u8 *d = (char *)(dm + 1) + (i * 2);
+		const char *name = ((char *) dm) + dm->length;
+		u8 type = d[0];
+		u8 s = d[1];
+
+		if (!s)
+			continue;
+		s--;
+		while (s > 0 && name[0]) {
+			name += strlen(name) + 1;
+			s--;
+		}
+		if (name[0] == 0) /* Bogus string reference */
+			continue;
+
+		dmi_check_onboard_device(type, name, adap);
+	}
+}
+#endif
+
 static int __devinit i801_probe(struct pci_dev *dev, const struct pci_device_id *id)
 {
 	unsigned char temp;
 	int err;
+#if defined CONFIG_SENSORS_FSCHMD || defined CONFIG_SENSORS_FSCHMD_MODULE
+	const char *vendor;
+#endif
 
 	I801_dev = dev;
 	i801_features = 0;
@@ -712,6 +784,11 @@ static int __devinit i801_probe(struct pci_dev *dev, const struct pci_device_id
 		i2c_new_device(&i801_adapter, &info);
 	}
 #endif
+#if defined CONFIG_SENSORS_FSCHMD || defined CONFIG_SENSORS_FSCHMD_MODULE
+	vendor = dmi_get_system_info(DMI_BOARD_VENDOR);
+	if (vendor && !strcmp(vendor, "FUJITSU SIEMENS"))
+		dmi_walk(dmi_check_onboard_devices, &i801_adapter);
+#endif
 
 	return 0;
 
-- 
cgit v1.2.3


From c69ab2b78efbe388bb0fc5d885b718ff4668cceb Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Mon, 30 Mar 2009 21:46:45 +0200
Subject: hwmon: (fschmd) Add support for the FSC Syleus IC

Many thanks to Fujitsu Siemens Computers for providing docs and a
machine to test the driver on.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/hwmon/Kconfig  |   7 +-
 drivers/hwmon/fschmd.c | 214 ++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 161 insertions(+), 60 deletions(-)

diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 9a22816b37d..41e3f861c47 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -343,11 +343,12 @@ config SENSORS_FSCPOS
 	  will be called fscpos.
 
 config SENSORS_FSCHMD
-	tristate "FSC Poseidon, Scylla, Hermes, Heimdall and Heracles"
+	tristate "Fujitsu Siemens Computers sensor chips"
 	depends on X86 && I2C
 	help
-	  If you say yes here you get support for various Fujitsu Siemens
-	  Computers sensor chips, including support for the integrated
+	  If you say yes here you get support for the following Fujitsu
+	  Siemens Computers (FSC) sensor chips: Poseidon, Scylla, Hermes,
+	  Heimdall, Heracles and Syleus including support for the integrated
 	  watchdog.
 
 	  This is a merged driver for FSC sensor chips replacing the fscpos,
diff --git a/drivers/hwmon/fschmd.c b/drivers/hwmon/fschmd.c
index b557f2ebd9a..ac515bd6b1e 100644
--- a/drivers/hwmon/fschmd.c
+++ b/drivers/hwmon/fschmd.c
@@ -1,6 +1,6 @@
 /* fschmd.c
  *
- * Copyright (C) 2007,2008 Hans de Goede <hdegoede@redhat.com>
+ * Copyright (C) 2007 - 2009 Hans de Goede <hdegoede@redhat.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@
 
 /*
  *  Merged Fujitsu Siemens hwmon driver, supporting the Poseidon, Hermes,
- *  Scylla, Heracles and Heimdall chips
+ *  Scylla, Heracles, Heimdall and Syleus chips
  *
  *  Based on the original 2.4 fscscy, 2.6 fscpos, 2.6 fscher and 2.6
  *  (candidate) fschmd drivers:
@@ -56,7 +56,7 @@ static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
 	__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
-I2C_CLIENT_INSMOD_5(fscpos, fscher, fscscy, fschrc, fschmd);
+I2C_CLIENT_INSMOD_6(fscpos, fscher, fscscy, fschrc, fschmd, fscsyl);
 
 /*
  * The FSCHMD registers and other defines
@@ -75,9 +75,12 @@ I2C_CLIENT_INSMOD_5(fscpos, fscher, fscscy, fschrc, fschmd);
 #define FSCHMD_CONTROL_ALERT_LED	0x01
 
 /* watchdog */
-#define FSCHMD_REG_WDOG_PRESET		0x28
-#define FSCHMD_REG_WDOG_STATE		0x23
-#define FSCHMD_REG_WDOG_CONTROL		0x21
+static const u8 FSCHMD_REG_WDOG_CONTROL[6] =
+	{ 0x21, 0x21, 0x21, 0x21, 0x21, 0x28 };
+static const u8 FSCHMD_REG_WDOG_STATE[6] =
+	{ 0x23, 0x23, 0x23, 0x23, 0x23, 0x29 };
+static const u8 FSCHMD_REG_WDOG_PRESET[6] =
+	{ 0x28, 0x28, 0x28, 0x28, 0x28, 0x2a };
 
 #define FSCHMD_WDOG_CONTROL_TRIGGER	0x10
 #define FSCHMD_WDOG_CONTROL_STARTED	0x10 /* the same as trigger */
@@ -87,70 +90,88 @@ I2C_CLIENT_INSMOD_5(fscpos, fscher, fscscy, fschrc, fschmd);
 #define FSCHMD_WDOG_STATE_CARDRESET	0x02
 
 /* voltages, weird order is to keep the same order as the old drivers */
-static const u8 FSCHMD_REG_VOLT[3] = { 0x45, 0x42, 0x48 };
+static const u8 FSCHMD_REG_VOLT[6][6] = {
+	{ 0x45, 0x42, 0x48 },				/* pos */
+	{ 0x45, 0x42, 0x48 },				/* her */
+	{ 0x45, 0x42, 0x48 },				/* scy */
+	{ 0x45, 0x42, 0x48 },				/* hrc */
+	{ 0x45, 0x42, 0x48 },				/* hmd */
+	{ 0x21, 0x20, 0x22, 0x23, 0x24, 0x25 },		/* syl */
+};
+
+static const int FSCHMD_NO_VOLT_SENSORS[6] = { 3, 3, 3, 3, 3, 6 };
 
 /* minimum pwm at which the fan is driven (pwm can by increased depending on
    the temp. Notice that for the scy some fans share there minimum speed.
    Also notice that with the scy the sensor order is different than with the
    other chips, this order was in the 2.4 driver and kept for consistency. */
-static const u8 FSCHMD_REG_FAN_MIN[5][6] = {
+static const u8 FSCHMD_REG_FAN_MIN[6][7] = {
 	{ 0x55, 0x65 },					/* pos */
 	{ 0x55, 0x65, 0xb5 },				/* her */
 	{ 0x65, 0x65, 0x55, 0xa5, 0x55, 0xa5 },		/* scy */
 	{ 0x55, 0x65, 0xa5, 0xb5 },			/* hrc */
 	{ 0x55, 0x65, 0xa5, 0xb5, 0xc5 },		/* hmd */
+	{ 0x54, 0x64, 0x74, 0x84, 0x94, 0xa4, 0xb4 },	/* syl */
 };
 
 /* actual fan speed */
-static const u8 FSCHMD_REG_FAN_ACT[5][6] = {
+static const u8 FSCHMD_REG_FAN_ACT[6][7] = {
 	{ 0x0e, 0x6b, 0xab },				/* pos */
 	{ 0x0e, 0x6b, 0xbb },				/* her */
 	{ 0x6b, 0x6c, 0x0e, 0xab, 0x5c, 0xbb },		/* scy */
 	{ 0x0e, 0x6b, 0xab, 0xbb },			/* hrc */
 	{ 0x5b, 0x6b, 0xab, 0xbb, 0xcb },		/* hmd */
+	{ 0x57, 0x67, 0x77, 0x87, 0x97, 0xa7, 0xb7 },	/* syl */
 };
 
 /* fan status registers */
-static const u8 FSCHMD_REG_FAN_STATE[5][6] = {
+static const u8 FSCHMD_REG_FAN_STATE[6][7] = {
 	{ 0x0d, 0x62, 0xa2 },				/* pos */
 	{ 0x0d, 0x62, 0xb2 },				/* her */
 	{ 0x62, 0x61, 0x0d, 0xa2, 0x52, 0xb2 },		/* scy */
 	{ 0x0d, 0x62, 0xa2, 0xb2 },			/* hrc */
 	{ 0x52, 0x62, 0xa2, 0xb2, 0xc2 },		/* hmd */
+	{ 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0 },	/* syl */
 };
 
 /* fan ripple / divider registers */
-static const u8 FSCHMD_REG_FAN_RIPPLE[5][6] = {
+static const u8 FSCHMD_REG_FAN_RIPPLE[6][7] = {
 	{ 0x0f, 0x6f, 0xaf },				/* pos */
 	{ 0x0f, 0x6f, 0xbf },				/* her */
 	{ 0x6f, 0x6f, 0x0f, 0xaf, 0x0f, 0xbf },		/* scy */
 	{ 0x0f, 0x6f, 0xaf, 0xbf },			/* hrc */
 	{ 0x5f, 0x6f, 0xaf, 0xbf, 0xcf },		/* hmd */
+	{ 0x56, 0x66, 0x76, 0x86, 0x96, 0xa6, 0xb6 },	/* syl */
 };
 
-static const int FSCHMD_NO_FAN_SENSORS[5] = { 3, 3, 6, 4, 5 };
+static const int FSCHMD_NO_FAN_SENSORS[6] = { 3, 3, 6, 4, 5, 7 };
 
 /* Fan status register bitmasks */
 #define FSCHMD_FAN_ALARM	0x04 /* called fault by FSC! */
-#define FSCHMD_FAN_NOT_PRESENT	0x08 /* not documented */
+#define FSCHMD_FAN_NOT_PRESENT	0x08
+#define FSCHMD_FAN_DISABLED	0x80
 
 
 /* actual temperature registers */
-static const u8 FSCHMD_REG_TEMP_ACT[5][5] = {
+static const u8 FSCHMD_REG_TEMP_ACT[6][11] = {
 	{ 0x64, 0x32, 0x35 },				/* pos */
 	{ 0x64, 0x32, 0x35 },				/* her */
 	{ 0x64, 0xD0, 0x32, 0x35 },			/* scy */
 	{ 0x64, 0x32, 0x35 },				/* hrc */
 	{ 0x70, 0x80, 0x90, 0xd0, 0xe0 },		/* hmd */
+	{ 0x58, 0x68, 0x78, 0x88, 0x98, 0xa8,		/* syl */
+	  0xb8, 0xc8, 0xd8, 0xe8, 0xf8 },
 };
 
 /* temperature state registers */
-static const u8 FSCHMD_REG_TEMP_STATE[5][5] = {
+static const u8 FSCHMD_REG_TEMP_STATE[6][11] = {
 	{ 0x71, 0x81, 0x91 },				/* pos */
 	{ 0x71, 0x81, 0x91 },				/* her */
 	{ 0x71, 0xd1, 0x81, 0x91 },			/* scy */
 	{ 0x71, 0x81, 0x91 },				/* hrc */
 	{ 0x71, 0x81, 0x91, 0xd1, 0xe1 },		/* hmd */
+	{ 0x59, 0x69, 0x79, 0x89, 0x99, 0xa9,		/* syl */
+	  0xb9, 0xc9, 0xd9, 0xe9, 0xf9 },
 };
 
 /* temperature high limit registers, FSC does not document these. Proven to be
@@ -158,24 +179,30 @@ static const u8 FSCHMD_REG_TEMP_STATE[5][5] = {
    in the fscscy 2.4 driver. FSC has confirmed that the fschmd has registers
    at these addresses, but doesn't want to confirm they are the same as with
    the fscher?? */
-static const u8 FSCHMD_REG_TEMP_LIMIT[5][5] = {
+static const u8 FSCHMD_REG_TEMP_LIMIT[6][11] = {
 	{ 0, 0, 0 },					/* pos */
 	{ 0x76, 0x86, 0x96 },				/* her */
 	{ 0x76, 0xd6, 0x86, 0x96 },			/* scy */
 	{ 0x76, 0x86, 0x96 },				/* hrc */
 	{ 0x76, 0x86, 0x96, 0xd6, 0xe6 },		/* hmd */
+	{ 0x5a, 0x6a, 0x7a, 0x8a, 0x9a, 0xaa,		/* syl */
+	  0xba, 0xca, 0xda, 0xea, 0xfa },
 };
 
 /* These were found through experimenting with an fscher, currently they are
    not used, but we keep them around for future reference.
+   On the fscsyl AUTOP1 lives at 0x#c (so 0x5c for fan1, 0x6c for fan2, etc),
+   AUTOP2 lives at 0x#e, and 0x#1 is a bitmask defining which temps influence
+   the fan speed.
 static const u8 FSCHER_REG_TEMP_AUTOP1[] =	{ 0x73, 0x83, 0x93 };
 static const u8 FSCHER_REG_TEMP_AUTOP2[] =	{ 0x75, 0x85, 0x95 }; */
 
-static const int FSCHMD_NO_TEMP_SENSORS[5] = { 3, 3, 4, 3, 5 };
+static const int FSCHMD_NO_TEMP_SENSORS[6] = { 3, 3, 4, 3, 5, 11 };
 
 /* temp status register bitmasks */
 #define FSCHMD_TEMP_WORKING	0x01
 #define FSCHMD_TEMP_ALERT	0x02
+#define FSCHMD_TEMP_DISABLED	0x80
 /* there only really is an alarm if the sensor is working and alert == 1 */
 #define FSCHMD_TEMP_ALARM_MASK \
 	(FSCHMD_TEMP_WORKING | FSCHMD_TEMP_ALERT)
@@ -201,6 +228,7 @@ static const struct i2c_device_id fschmd_id[] = {
 	{ "fscscy", fscscy },
 	{ "fschrc", fschrc },
 	{ "fschmd", fschmd },
+	{ "fscsyl", fscsyl },
 	{ }
 };
 MODULE_DEVICE_TABLE(i2c, fschmd_id);
@@ -242,14 +270,14 @@ struct fschmd_data {
 	u8 watchdog_control;    /* watchdog control register */
 	u8 watchdog_state;      /* watchdog status register */
 	u8 watchdog_preset;     /* watchdog counter preset on trigger val */
-	u8 volt[3];		/* 12, 5, battery voltage */
-	u8 temp_act[5];		/* temperature */
-	u8 temp_status[5];	/* status of sensor */
-	u8 temp_max[5];		/* high temp limit, notice: undocumented! */
-	u8 fan_act[6];		/* fans revolutions per second */
-	u8 fan_status[6];	/* fan status */
-	u8 fan_min[6];		/* fan min value for rps */
-	u8 fan_ripple[6];	/* divider for rps */
+	u8 volt[6];		/* voltage */
+	u8 temp_act[11];	/* temperature */
+	u8 temp_status[11];	/* status of sensor */
+	u8 temp_max[11];	/* high temp limit, notice: undocumented! */
+	u8 fan_act[7];		/* fans revolutions per second */
+	u8 fan_status[7];	/* fan status */
+	u8 fan_min[7];		/* fan min value for rps */
+	u8 fan_ripple[7];	/* divider for rps */
 };
 
 /* Global variables to hold information read from special DMI tables, which are
@@ -257,8 +285,8 @@ struct fschmd_data {
    protect these with a lock as they are only modified from our attach function
    which always gets called with the i2c-core lock held and never accessed
    before the attach function is done with them. */
-static int dmi_mult[3] = { 490, 200, 100 };
-static int dmi_offset[3] = { 0, 0, 0 };
+static int dmi_mult[6] = { 490, 200, 100, 100, 200, 100 };
+static int dmi_offset[6] = { 0, 0, 0, 0, 0, 0 };
 static int dmi_vref = -1;
 
 /* Somewhat ugly :( global data pointer list with all fschmd devices, so that
@@ -450,10 +478,11 @@ static ssize_t show_pwm_auto_point1_pwm(struct device *dev,
 	struct device_attribute *devattr, char *buf)
 {
 	int index = to_sensor_dev_attr(devattr)->index;
-	int val = fschmd_update_device(dev)->fan_min[index];
+	struct fschmd_data *data = fschmd_update_device(dev);
+	int val = data->fan_min[index];
 
-	/* 0 = allow turning off, 1-255 = 50-100% */
-	if (val)
+	/* 0 = allow turning off (except on the syl), 1-255 = 50-100% */
+	if (val || data->kind == fscsyl - 1)
 		val = val / 2 + 128;
 
 	return sprintf(buf, "%d\n", val);
@@ -466,8 +495,8 @@ static ssize_t store_pwm_auto_point1_pwm(struct device *dev,
 	struct fschmd_data *data = dev_get_drvdata(dev);
 	unsigned long v = simple_strtoul(buf, NULL, 10);
 
-	/* register: 0 = allow turning off, 1-255 = 50-100% */
-	if (v) {
+	/* reg: 0 = allow turning off (except on the syl), 1-255 = 50-100% */
+	if (v || data->kind == fscsyl - 1) {
 		v = SENSORS_LIMIT(v, 128, 255);
 		v = (v - 128) * 2 + 1;
 	}
@@ -522,11 +551,15 @@ static ssize_t store_alert_led(struct device *dev,
 	return count;
 }
 
+static DEVICE_ATTR(alert_led, 0644, show_alert_led, store_alert_led);
+
 static struct sensor_device_attribute fschmd_attr[] = {
 	SENSOR_ATTR(in0_input, 0444, show_in_value, NULL, 0),
 	SENSOR_ATTR(in1_input, 0444, show_in_value, NULL, 1),
 	SENSOR_ATTR(in2_input, 0444, show_in_value, NULL, 2),
-	SENSOR_ATTR(alert_led, 0644, show_alert_led, store_alert_led, 0),
+	SENSOR_ATTR(in3_input, 0444, show_in_value, NULL, 3),
+	SENSOR_ATTR(in4_input, 0444, show_in_value, NULL, 4),
+	SENSOR_ATTR(in5_input, 0444, show_in_value, NULL, 5),
 };
 
 static struct sensor_device_attribute fschmd_temp_attr[] = {
@@ -550,6 +583,30 @@ static struct sensor_device_attribute fschmd_temp_attr[] = {
 	SENSOR_ATTR(temp5_max,   0644, show_temp_max, store_temp_max, 4),
 	SENSOR_ATTR(temp5_fault, 0444, show_temp_fault, NULL, 4),
 	SENSOR_ATTR(temp5_alarm, 0444, show_temp_alarm, NULL, 4),
+	SENSOR_ATTR(temp6_input, 0444, show_temp_value, NULL, 5),
+	SENSOR_ATTR(temp6_max,   0644, show_temp_max, store_temp_max, 5),
+	SENSOR_ATTR(temp6_fault, 0444, show_temp_fault, NULL, 5),
+	SENSOR_ATTR(temp6_alarm, 0444, show_temp_alarm, NULL, 5),
+	SENSOR_ATTR(temp7_input, 0444, show_temp_value, NULL, 6),
+	SENSOR_ATTR(temp7_max,   0644, show_temp_max, store_temp_max, 6),
+	SENSOR_ATTR(temp7_fault, 0444, show_temp_fault, NULL, 6),
+	SENSOR_ATTR(temp7_alarm, 0444, show_temp_alarm, NULL, 6),
+	SENSOR_ATTR(temp8_input, 0444, show_temp_value, NULL, 7),
+	SENSOR_ATTR(temp8_max,   0644, show_temp_max, store_temp_max, 7),
+	SENSOR_ATTR(temp8_fault, 0444, show_temp_fault, NULL, 7),
+	SENSOR_ATTR(temp8_alarm, 0444, show_temp_alarm, NULL, 7),
+	SENSOR_ATTR(temp9_input, 0444, show_temp_value, NULL, 8),
+	SENSOR_ATTR(temp9_max,   0644, show_temp_max, store_temp_max, 8),
+	SENSOR_ATTR(temp9_fault, 0444, show_temp_fault, NULL, 8),
+	SENSOR_ATTR(temp9_alarm, 0444, show_temp_alarm, NULL, 8),
+	SENSOR_ATTR(temp10_input, 0444, show_temp_value, NULL, 9),
+	SENSOR_ATTR(temp10_max,   0644, show_temp_max, store_temp_max, 9),
+	SENSOR_ATTR(temp10_fault, 0444, show_temp_fault, NULL, 9),
+	SENSOR_ATTR(temp10_alarm, 0444, show_temp_alarm, NULL, 9),
+	SENSOR_ATTR(temp11_input, 0444, show_temp_value, NULL, 10),
+	SENSOR_ATTR(temp11_max,   0644, show_temp_max, store_temp_max, 10),
+	SENSOR_ATTR(temp11_fault, 0444, show_temp_fault, NULL, 10),
+	SENSOR_ATTR(temp11_alarm, 0444, show_temp_alarm, NULL, 10),
 };
 
 static struct sensor_device_attribute fschmd_fan_attr[] = {
@@ -589,6 +646,12 @@ static struct sensor_device_attribute fschmd_fan_attr[] = {
 	SENSOR_ATTR(fan6_fault, 0444, show_fan_fault, NULL, 5),
 	SENSOR_ATTR(pwm6_auto_point1_pwm, 0644, show_pwm_auto_point1_pwm,
 		store_pwm_auto_point1_pwm, 5),
+	SENSOR_ATTR(fan7_input, 0444, show_fan_value, NULL, 6),
+	SENSOR_ATTR(fan7_div,   0644, show_fan_div, store_fan_div, 6),
+	SENSOR_ATTR(fan7_alarm, 0444, show_fan_alarm, NULL, 6),
+	SENSOR_ATTR(fan7_fault, 0444, show_fan_fault, NULL, 6),
+	SENSOR_ATTR(pwm7_auto_point1_pwm, 0644, show_pwm_auto_point1_pwm,
+		store_pwm_auto_point1_pwm, 6),
 };
 
 
@@ -624,10 +687,11 @@ static int watchdog_set_timeout(struct fschmd_data *data, int timeout)
 	data->watchdog_preset = DIV_ROUND_UP(timeout, resolution);
 
 	/* Write new timeout value */
-	i2c_smbus_write_byte_data(data->client, FSCHMD_REG_WDOG_PRESET,
-		data->watchdog_preset);
+	i2c_smbus_write_byte_data(data->client,
+		FSCHMD_REG_WDOG_PRESET[data->kind], data->watchdog_preset);
 	/* Write new control register, do not trigger! */
-	i2c_smbus_write_byte_data(data->client, FSCHMD_REG_WDOG_CONTROL,
+	i2c_smbus_write_byte_data(data->client,
+		FSCHMD_REG_WDOG_CONTROL[data->kind],
 		data->watchdog_control & ~FSCHMD_WDOG_CONTROL_TRIGGER);
 
 	ret = data->watchdog_preset * resolution;
@@ -662,8 +726,9 @@ static int watchdog_trigger(struct fschmd_data *data)
 	}
 
 	data->watchdog_control |= FSCHMD_WDOG_CONTROL_TRIGGER;
-	i2c_smbus_write_byte_data(data->client, FSCHMD_REG_WDOG_CONTROL,
-					data->watchdog_control);
+	i2c_smbus_write_byte_data(data->client,
+				  FSCHMD_REG_WDOG_CONTROL[data->kind],
+				  data->watchdog_control);
 leave:
 	mutex_unlock(&data->watchdog_lock);
 	return ret;
@@ -682,7 +747,8 @@ static int watchdog_stop(struct fschmd_data *data)
 	data->watchdog_control &= ~FSCHMD_WDOG_CONTROL_STARTED;
 	/* Don't store the stop flag in our watchdog control register copy, as
 	   its a write only bit (read always returns 0) */
-	i2c_smbus_write_byte_data(data->client, FSCHMD_REG_WDOG_CONTROL,
+	i2c_smbus_write_byte_data(data->client,
+		FSCHMD_REG_WDOG_CONTROL[data->kind],
 		data->watchdog_control | FSCHMD_WDOG_CONTROL_STOP);
 leave:
 	mutex_unlock(&data->watchdog_lock);
@@ -912,6 +978,15 @@ static void fschmd_dmi_decode(const struct dmi_header *header, void *dummy)
 			dmi_mult[i] = mult[i] * 10;
 			dmi_offset[i] = offset[i] * 10;
 		}
+		/* According to the docs there should be separate dmi entries
+		   for the mult's and offsets of in3-5 of the syl, but on
+		   my test machine these are not present */
+		dmi_mult[3] = dmi_mult[2];
+		dmi_mult[4] = dmi_mult[1];
+		dmi_mult[5] = dmi_mult[2];
+		dmi_offset[3] = dmi_offset[2];
+		dmi_offset[4] = dmi_offset[1];
+		dmi_offset[5] = dmi_offset[2];
 		dmi_vref = vref;
 	}
 }
@@ -920,8 +995,6 @@ static int fschmd_detect(struct i2c_client *client, int kind,
 			 struct i2c_board_info *info)
 {
 	struct i2c_adapter *adapter = client->adapter;
-	const char * const client_names[5] = { "fscpos", "fscher", "fscscy",
-						"fschrc", "fschmd" };
 
 	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA))
 		return -ENODEV;
@@ -948,11 +1021,13 @@ static int fschmd_detect(struct i2c_client *client, int kind,
 			kind = fschrc;
 		else if (!strcmp(id, "HMD"))
 			kind = fschmd;
+		else if (!strcmp(id, "SYL"))
+			kind = fscsyl;
 		else
 			return -ENODEV;
 	}
 
-	strlcpy(info->type, client_names[kind - 1], I2C_NAME_SIZE);
+	strlcpy(info->type, fschmd_id[kind - 1].name, I2C_NAME_SIZE);
 
 	return 0;
 }
@@ -961,8 +1036,8 @@ static int fschmd_probe(struct i2c_client *client,
 			const struct i2c_device_id *id)
 {
 	struct fschmd_data *data;
-	const char * const names[5] = { "Poseidon", "Hermes", "Scylla",
-					"Heracles", "Heimdall" };
+	const char * const names[6] = { "Poseidon", "Hermes", "Scylla",
+					"Heracles", "Heimdall", "Syleus" };
 	const int watchdog_minors[] = { WATCHDOG_MINOR, 212, 213, 214, 215 };
 	int i, err;
 	enum chips kind = id->driver_data;
@@ -1000,21 +1075,25 @@ static int fschmd_probe(struct i2c_client *client,
 		}
 	}
 
+	/* i2c kind goes from 1-6, we want from 0-5 to address arrays */
+	data->kind = kind - 1;
+
 	/* Read in some never changing registers */
 	data->revision = i2c_smbus_read_byte_data(client, FSCHMD_REG_REVISION);
 	data->global_control = i2c_smbus_read_byte_data(client,
 					FSCHMD_REG_CONTROL);
 	data->watchdog_control = i2c_smbus_read_byte_data(client,
-					FSCHMD_REG_WDOG_CONTROL);
+					FSCHMD_REG_WDOG_CONTROL[data->kind]);
 	data->watchdog_state = i2c_smbus_read_byte_data(client,
-					FSCHMD_REG_WDOG_STATE);
+					FSCHMD_REG_WDOG_STATE[data->kind]);
 	data->watchdog_preset = i2c_smbus_read_byte_data(client,
-					FSCHMD_REG_WDOG_PRESET);
+					FSCHMD_REG_WDOG_PRESET[data->kind]);
 
-	/* i2c kind goes from 1-5, we want from 0-4 to address arrays */
-	data->kind = kind - 1;
+	err = device_create_file(&client->dev, &dev_attr_alert_led);
+	if (err)
+		goto exit_detach;
 
-	for (i = 0; i < ARRAY_SIZE(fschmd_attr); i++) {
+	for (i = 0; i < FSCHMD_NO_VOLT_SENSORS[data->kind]; i++) {
 		err = device_create_file(&client->dev,
 					&fschmd_attr[i].dev_attr);
 		if (err)
@@ -1027,6 +1106,16 @@ static int fschmd_probe(struct i2c_client *client,
 				show_temp_max)
 			continue;
 
+		if (kind == fscsyl) {
+			if (i % 4 == 0)
+				data->temp_status[i / 4] =
+					i2c_smbus_read_byte_data(client,
+						FSCHMD_REG_TEMP_STATE
+						[data->kind][i / 4]);
+			if (data->temp_status[i / 4] & FSCHMD_TEMP_DISABLED)
+				continue;
+		}
+
 		err = device_create_file(&client->dev,
 					&fschmd_temp_attr[i].dev_attr);
 		if (err)
@@ -1040,6 +1129,16 @@ static int fschmd_probe(struct i2c_client *client,
 					"pwm3_auto_point1_pwm"))
 			continue;
 
+		if (kind == fscsyl) {
+			if (i % 5 == 0)
+				data->fan_status[i / 5] =
+					i2c_smbus_read_byte_data(client,
+						FSCHMD_REG_FAN_STATE
+						[data->kind][i / 5]);
+			if (data->fan_status[i / 5] & FSCHMD_FAN_DISABLED)
+				continue;
+		}
+
 		err = device_create_file(&client->dev,
 					&fschmd_fan_attr[i].dev_attr);
 		if (err)
@@ -1126,7 +1225,8 @@ static int fschmd_remove(struct i2c_client *client)
 	if (data->hwmon_dev)
 		hwmon_device_unregister(data->hwmon_dev);
 
-	for (i = 0; i < ARRAY_SIZE(fschmd_attr); i++)
+	device_remove_file(&client->dev, &dev_attr_alert_led);
+	for (i = 0; i < (FSCHMD_NO_VOLT_SENSORS[data->kind]); i++)
 		device_remove_file(&client->dev, &fschmd_attr[i].dev_attr);
 	for (i = 0; i < (FSCHMD_NO_TEMP_SENSORS[data->kind] * 4); i++)
 		device_remove_file(&client->dev,
@@ -1171,7 +1271,7 @@ static struct fschmd_data *fschmd_update_device(struct device *dev)
 					data->temp_act[i] < data->temp_max[i])
 				i2c_smbus_write_byte_data(client,
 					FSCHMD_REG_TEMP_STATE[data->kind][i],
-					FSCHMD_TEMP_ALERT);
+					data->temp_status[i]);
 		}
 
 		for (i = 0; i < FSCHMD_NO_FAN_SENSORS[data->kind]; i++) {
@@ -1193,12 +1293,12 @@ static struct fschmd_data *fschmd_update_device(struct device *dev)
 					data->fan_act[i])
 				i2c_smbus_write_byte_data(client,
 					FSCHMD_REG_FAN_STATE[data->kind][i],
-					FSCHMD_FAN_ALARM);
+					data->fan_status[i]);
 		}
 
-		for (i = 0; i < 3; i++)
+		for (i = 0; i < FSCHMD_NO_VOLT_SENSORS[data->kind]; i++)
 			data->volt[i] = i2c_smbus_read_byte_data(client,
-						FSCHMD_REG_VOLT[i]);
+					       FSCHMD_REG_VOLT[data->kind][i]);
 
 		data->last_updated = jiffies;
 		data->valid = 1;
@@ -1220,8 +1320,8 @@ static void __exit fschmd_exit(void)
 }
 
 MODULE_AUTHOR("Hans de Goede <hdegoede@redhat.com>");
-MODULE_DESCRIPTION("FSC Poseidon, Hermes, Scylla, Heracles and "
-			"Heimdall driver");
+MODULE_DESCRIPTION("FSC Poseidon, Hermes, Scylla, Heracles, Heimdall and "
+			"Syleus driver");
 MODULE_LICENSE("GPL");
 
 module_init(fschmd_init);
-- 
cgit v1.2.3


From de15f093e666ccd542f6f7a0e3e917166a07ab44 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Mon, 30 Mar 2009 21:46:45 +0200
Subject: hwmon: (fschmd) Add support for the FSC Hades IC

Add support for the Hades to the FSC hwmon driver.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/hwmon/Kconfig  |  4 ++--
 drivers/hwmon/fschmd.c | 57 ++++++++++++++++++++++++++++++--------------------
 2 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 41e3f861c47..51ff9b3d7ea 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -348,8 +348,8 @@ config SENSORS_FSCHMD
 	help
 	  If you say yes here you get support for the following Fujitsu
 	  Siemens Computers (FSC) sensor chips: Poseidon, Scylla, Hermes,
-	  Heimdall, Heracles and Syleus including support for the integrated
-	  watchdog.
+	  Heimdall, Heracles, Hades and Syleus including support for the
+	  integrated watchdog.
 
 	  This is a merged driver for FSC sensor chips replacing the fscpos,
 	  fscscy and fscher drivers and adding support for several other FSC
diff --git a/drivers/hwmon/fschmd.c b/drivers/hwmon/fschmd.c
index ac515bd6b1e..ea955edde87 100644
--- a/drivers/hwmon/fschmd.c
+++ b/drivers/hwmon/fschmd.c
@@ -19,7 +19,7 @@
 
 /*
  *  Merged Fujitsu Siemens hwmon driver, supporting the Poseidon, Hermes,
- *  Scylla, Heracles, Heimdall and Syleus chips
+ *  Scylla, Heracles, Heimdall, Hades and Syleus chips
  *
  *  Based on the original 2.4 fscscy, 2.6 fscpos, 2.6 fscher and 2.6
  *  (candidate) fschmd drivers:
@@ -56,7 +56,7 @@ static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
 	__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
-I2C_CLIENT_INSMOD_6(fscpos, fscher, fscscy, fschrc, fschmd, fscsyl);
+I2C_CLIENT_INSMOD_7(fscpos, fscher, fscscy, fschrc, fschmd, fschds, fscsyl);
 
 /*
  * The FSCHMD registers and other defines
@@ -75,12 +75,12 @@ I2C_CLIENT_INSMOD_6(fscpos, fscher, fscscy, fschrc, fschmd, fscsyl);
 #define FSCHMD_CONTROL_ALERT_LED	0x01
 
 /* watchdog */
-static const u8 FSCHMD_REG_WDOG_CONTROL[6] =
-	{ 0x21, 0x21, 0x21, 0x21, 0x21, 0x28 };
-static const u8 FSCHMD_REG_WDOG_STATE[6] =
-	{ 0x23, 0x23, 0x23, 0x23, 0x23, 0x29 };
-static const u8 FSCHMD_REG_WDOG_PRESET[6] =
-	{ 0x28, 0x28, 0x28, 0x28, 0x28, 0x2a };
+static const u8 FSCHMD_REG_WDOG_CONTROL[7] =
+	{ 0x21, 0x21, 0x21, 0x21, 0x21, 0x28, 0x28 };
+static const u8 FSCHMD_REG_WDOG_STATE[7] =
+	{ 0x23, 0x23, 0x23, 0x23, 0x23, 0x29, 0x29 };
+static const u8 FSCHMD_REG_WDOG_PRESET[7] =
+	{ 0x28, 0x28, 0x28, 0x28, 0x28, 0x2a, 0x2a };
 
 #define FSCHMD_WDOG_CONTROL_TRIGGER	0x10
 #define FSCHMD_WDOG_CONTROL_STARTED	0x10 /* the same as trigger */
@@ -90,61 +90,66 @@ static const u8 FSCHMD_REG_WDOG_PRESET[6] =
 #define FSCHMD_WDOG_STATE_CARDRESET	0x02
 
 /* voltages, weird order is to keep the same order as the old drivers */
-static const u8 FSCHMD_REG_VOLT[6][6] = {
+static const u8 FSCHMD_REG_VOLT[7][6] = {
 	{ 0x45, 0x42, 0x48 },				/* pos */
 	{ 0x45, 0x42, 0x48 },				/* her */
 	{ 0x45, 0x42, 0x48 },				/* scy */
 	{ 0x45, 0x42, 0x48 },				/* hrc */
 	{ 0x45, 0x42, 0x48 },				/* hmd */
+	{ 0x21, 0x20, 0x22 },				/* hds */
 	{ 0x21, 0x20, 0x22, 0x23, 0x24, 0x25 },		/* syl */
 };
 
-static const int FSCHMD_NO_VOLT_SENSORS[6] = { 3, 3, 3, 3, 3, 6 };
+static const int FSCHMD_NO_VOLT_SENSORS[7] = { 3, 3, 3, 3, 3, 3, 6 };
 
 /* minimum pwm at which the fan is driven (pwm can by increased depending on
    the temp. Notice that for the scy some fans share there minimum speed.
    Also notice that with the scy the sensor order is different than with the
    other chips, this order was in the 2.4 driver and kept for consistency. */
-static const u8 FSCHMD_REG_FAN_MIN[6][7] = {
+static const u8 FSCHMD_REG_FAN_MIN[7][7] = {
 	{ 0x55, 0x65 },					/* pos */
 	{ 0x55, 0x65, 0xb5 },				/* her */
 	{ 0x65, 0x65, 0x55, 0xa5, 0x55, 0xa5 },		/* scy */
 	{ 0x55, 0x65, 0xa5, 0xb5 },			/* hrc */
 	{ 0x55, 0x65, 0xa5, 0xb5, 0xc5 },		/* hmd */
+	{ 0x55, 0x65, 0xa5, 0xb5, 0xc5 },		/* hds */
 	{ 0x54, 0x64, 0x74, 0x84, 0x94, 0xa4, 0xb4 },	/* syl */
 };
 
 /* actual fan speed */
-static const u8 FSCHMD_REG_FAN_ACT[6][7] = {
+static const u8 FSCHMD_REG_FAN_ACT[7][7] = {
 	{ 0x0e, 0x6b, 0xab },				/* pos */
 	{ 0x0e, 0x6b, 0xbb },				/* her */
 	{ 0x6b, 0x6c, 0x0e, 0xab, 0x5c, 0xbb },		/* scy */
 	{ 0x0e, 0x6b, 0xab, 0xbb },			/* hrc */
 	{ 0x5b, 0x6b, 0xab, 0xbb, 0xcb },		/* hmd */
+	{ 0x5b, 0x6b, 0xab, 0xbb, 0xcb },		/* hds */
 	{ 0x57, 0x67, 0x77, 0x87, 0x97, 0xa7, 0xb7 },	/* syl */
 };
 
 /* fan status registers */
-static const u8 FSCHMD_REG_FAN_STATE[6][7] = {
+static const u8 FSCHMD_REG_FAN_STATE[7][7] = {
 	{ 0x0d, 0x62, 0xa2 },				/* pos */
 	{ 0x0d, 0x62, 0xb2 },				/* her */
 	{ 0x62, 0x61, 0x0d, 0xa2, 0x52, 0xb2 },		/* scy */
 	{ 0x0d, 0x62, 0xa2, 0xb2 },			/* hrc */
 	{ 0x52, 0x62, 0xa2, 0xb2, 0xc2 },		/* hmd */
+	{ 0x52, 0x62, 0xa2, 0xb2, 0xc2 },		/* hds */
 	{ 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0 },	/* syl */
 };
 
 /* fan ripple / divider registers */
-static const u8 FSCHMD_REG_FAN_RIPPLE[6][7] = {
+static const u8 FSCHMD_REG_FAN_RIPPLE[7][7] = {
 	{ 0x0f, 0x6f, 0xaf },				/* pos */
 	{ 0x0f, 0x6f, 0xbf },				/* her */
 	{ 0x6f, 0x6f, 0x0f, 0xaf, 0x0f, 0xbf },		/* scy */
 	{ 0x0f, 0x6f, 0xaf, 0xbf },			/* hrc */
 	{ 0x5f, 0x6f, 0xaf, 0xbf, 0xcf },		/* hmd */
+	{ 0x5f, 0x6f, 0xaf, 0xbf, 0xcf },		/* hds */
 	{ 0x56, 0x66, 0x76, 0x86, 0x96, 0xa6, 0xb6 },	/* syl */
 };
 
-static const int FSCHMD_NO_FAN_SENSORS[6] = { 3, 3, 6, 4, 5, 7 };
+static const int FSCHMD_NO_FAN_SENSORS[7] = { 3, 3, 6, 4, 5, 5, 7 };
 
 /* Fan status register bitmasks */
 #define FSCHMD_FAN_ALARM	0x04 /* called fault by FSC! */
@@ -153,23 +158,25 @@ static const int FSCHMD_NO_FAN_SENSORS[6] = { 3, 3, 6, 4, 5, 7 };
 
 
 /* actual temperature registers */
-static const u8 FSCHMD_REG_TEMP_ACT[6][11] = {
+static const u8 FSCHMD_REG_TEMP_ACT[7][11] = {
 	{ 0x64, 0x32, 0x35 },				/* pos */
 	{ 0x64, 0x32, 0x35 },				/* her */
 	{ 0x64, 0xD0, 0x32, 0x35 },			/* scy */
 	{ 0x64, 0x32, 0x35 },				/* hrc */
 	{ 0x70, 0x80, 0x90, 0xd0, 0xe0 },		/* hmd */
+	{ 0x70, 0x80, 0x90, 0xd0, 0xe0 },		/* hds */
 	{ 0x58, 0x68, 0x78, 0x88, 0x98, 0xa8,		/* syl */
 	  0xb8, 0xc8, 0xd8, 0xe8, 0xf8 },
 };
 
 /* temperature state registers */
-static const u8 FSCHMD_REG_TEMP_STATE[6][11] = {
+static const u8 FSCHMD_REG_TEMP_STATE[7][11] = {
 	{ 0x71, 0x81, 0x91 },				/* pos */
 	{ 0x71, 0x81, 0x91 },				/* her */
 	{ 0x71, 0xd1, 0x81, 0x91 },			/* scy */
 	{ 0x71, 0x81, 0x91 },				/* hrc */
 	{ 0x71, 0x81, 0x91, 0xd1, 0xe1 },		/* hmd */
+	{ 0x71, 0x81, 0x91, 0xd1, 0xe1 },		/* hds */
 	{ 0x59, 0x69, 0x79, 0x89, 0x99, 0xa9,		/* syl */
 	  0xb9, 0xc9, 0xd9, 0xe9, 0xf9 },
 };
@@ -179,12 +186,13 @@ static const u8 FSCHMD_REG_TEMP_STATE[6][11] = {
    in the fscscy 2.4 driver. FSC has confirmed that the fschmd has registers
    at these addresses, but doesn't want to confirm they are the same as with
    the fscher?? */
-static const u8 FSCHMD_REG_TEMP_LIMIT[6][11] = {
+static const u8 FSCHMD_REG_TEMP_LIMIT[7][11] = {
 	{ 0, 0, 0 },					/* pos */
 	{ 0x76, 0x86, 0x96 },				/* her */
 	{ 0x76, 0xd6, 0x86, 0x96 },			/* scy */
 	{ 0x76, 0x86, 0x96 },				/* hrc */
 	{ 0x76, 0x86, 0x96, 0xd6, 0xe6 },		/* hmd */
+	{ 0x76, 0x86, 0x96, 0xd6, 0xe6 },		/* hds */
 	{ 0x5a, 0x6a, 0x7a, 0x8a, 0x9a, 0xaa,		/* syl */
 	  0xba, 0xca, 0xda, 0xea, 0xfa },
 };
@@ -197,7 +205,7 @@ static const u8 FSCHMD_REG_TEMP_LIMIT[6][11] = {
 static const u8 FSCHER_REG_TEMP_AUTOP1[] =	{ 0x73, 0x83, 0x93 };
 static const u8 FSCHER_REG_TEMP_AUTOP2[] =	{ 0x75, 0x85, 0x95 }; */
 
-static const int FSCHMD_NO_TEMP_SENSORS[6] = { 3, 3, 4, 3, 5, 11 };
+static const int FSCHMD_NO_TEMP_SENSORS[7] = { 3, 3, 4, 3, 5, 5, 11 };
 
 /* temp status register bitmasks */
 #define FSCHMD_TEMP_WORKING	0x01
@@ -228,6 +236,7 @@ static const struct i2c_device_id fschmd_id[] = {
 	{ "fscscy", fscscy },
 	{ "fschrc", fschrc },
 	{ "fschmd", fschmd },
+	{ "fschds", fschds },
 	{ "fscsyl", fscsyl },
 	{ }
 };
@@ -1021,6 +1030,8 @@ static int fschmd_detect(struct i2c_client *client, int kind,
 			kind = fschrc;
 		else if (!strcmp(id, "HMD"))
 			kind = fschmd;
+		else if (!strcmp(id, "HDS"))
+			kind = fschds;
 		else if (!strcmp(id, "SYL"))
 			kind = fscsyl;
 		else
@@ -1036,8 +1047,8 @@ static int fschmd_probe(struct i2c_client *client,
 			const struct i2c_device_id *id)
 {
 	struct fschmd_data *data;
-	const char * const names[6] = { "Poseidon", "Hermes", "Scylla",
-					"Heracles", "Heimdall", "Syleus" };
+	const char * const names[7] = { "Poseidon", "Hermes", "Scylla",
+				"Heracles", "Heimdall", "Hades", "Syleus" };
 	const int watchdog_minors[] = { WATCHDOG_MINOR, 212, 213, 214, 215 };
 	int i, err;
 	enum chips kind = id->driver_data;
@@ -1320,8 +1331,8 @@ static void __exit fschmd_exit(void)
 }
 
 MODULE_AUTHOR("Hans de Goede <hdegoede@redhat.com>");
-MODULE_DESCRIPTION("FSC Poseidon, Hermes, Scylla, Heracles, Heimdall and "
-			"Syleus driver");
+MODULE_DESCRIPTION("FSC Poseidon, Hermes, Scylla, Heracles, Heimdall, Hades "
+			"and Syleus driver");
 MODULE_LICENSE("GPL");
 
 module_init(fschmd_init);
-- 
cgit v1.2.3


From 19cefdffbfe0f7e280f21e80875937e8700e99e2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 15 Mar 2009 06:03:11 +0100
Subject: lockdep: annotate reclaim context (__GFP_NOFS), fix SLOB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: build fix

fix typo in mm/slob.c:

 mm/slob.c:469: error: ‘flags’ undeclared (first use in this function)
 mm/slob.c:469: error: (Each undeclared identifier is reported only once
 mm/slob.c:469: error: for each function it appears in.)

Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090128135457.350751756@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 mm/slob.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/slob.c b/mm/slob.c
index 1264799df5d..4b1c0c1d63c 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -464,7 +464,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 	unsigned int *m;
 	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 
-	lockdep_trace_alloc(flags);
+	lockdep_trace_alloc(gfp);
 
 	if (size < PAGE_SIZE - align) {
 		if (!size)
-- 
cgit v1.2.3


From 1681bc30f272dd2fe347b90468791b05c7044f03 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 13 Jan 2009 13:53:48 +0300
Subject: proc: move fs/proc/inode-alloc.txt comment into a source file

so that people will realize that it exists and can update it as needed.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/generic.c       | 15 +++++++++++++++
 fs/proc/inode-alloc.txt | 14 --------------
 2 files changed, 15 insertions(+), 14 deletions(-)
 delete mode 100644 fs/proc/inode-alloc.txt

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 5d2989e9dcc..8c68bbe2b61 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -307,6 +307,21 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
 /*
  * Return an inode number between PROC_DYNAMIC_FIRST and
  * 0xffffffff, or zero on failure.
+ *
+ * Current inode allocations in the proc-fs (hex-numbers):
+ *
+ * 00000000		reserved
+ * 00000001-00000fff	static entries	(goners)
+ *      001		root-ino
+ *
+ * 00001000-00001fff	unused
+ * 0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
+ * 80000000-efffffff	unused
+ * f0000000-ffffffff	dynamic entries
+ *
+ * Goal:
+ *	Once we split the thing into several virtual filesystems,
+ *	we will get rid of magical ranges (and this comment, BTW).
  */
 static unsigned int get_inode_number(void)
 {
diff --git a/fs/proc/inode-alloc.txt b/fs/proc/inode-alloc.txt
deleted file mode 100644
index 77212f938c2..00000000000
--- a/fs/proc/inode-alloc.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Current inode allocations in the proc-fs (hex-numbers):
-
-  00000000		reserved
-  00000001-00000fff	static entries	(goners)
-       001		root-ino
-
-  00001000-00001fff	unused
-  0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
-  80000000-efffffff	unused
-  f0000000-ffffffff	dynamic entries
-
-Goal:
-	a) once we'll split the thing into several virtual filesystems we
-	will get rid of magical ranges (and this file, BTW).
-- 
cgit v1.2.3


From 09729a9919fdaf137995b0f19cbd401e22229cac Mon Sep 17 00:00:00 2001
From: Milind Arun Choudhary <milindchoudhary@gmail.com>
Date: Fri, 20 Feb 2009 16:56:45 +0300
Subject: proc: fix sparse warnings in pagemap_read()

fs/proc/task_mmu.c:696:12: warning: cast removes address space of expression
fs/proc/task_mmu.c:696:9: warning: incorrect type in assignment (different address spaces)
fs/proc/task_mmu.c:696:9:    expected unsigned long long [noderef] [usertype] <asn:1>*out
fs/proc/task_mmu.c:696:9:    got unsigned long long [usertype] *<noident>
fs/proc/task_mmu.c:697:12: warning: cast removes address space of expression
fs/proc/task_mmu.c:697:9: warning: incorrect type in assignment (different address spaces)
fs/proc/task_mmu.c:697:9:    expected unsigned long long [noderef] [usertype] <asn:1>*end
fs/proc/task_mmu.c:697:9:    got unsigned long long [usertype] *<noident>
fs/proc/task_mmu.c:723:12: warning: cast removes address space of expression
fs/proc/task_mmu.c:723:26: error: subtraction of different types can't work (different address spaces)
fs/proc/task_mmu.c:725:24: error: subtraction of different types can't work (different address spaces)

Signed-off-by: Milind Arun Choudhary <milindchoudhary@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/task_mmu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 94063840832..b0ae0be4801 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -693,8 +693,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 		goto out_pages;
 	}
 
-	pm.out = (u64 *)buf;
-	pm.end = (u64 *)(buf + count);
+	pm.out = (u64 __user *)buf;
+	pm.end = (u64 __user *)(buf + count);
 
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
@@ -720,9 +720,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (ret == PM_END_OF_BUFFER)
 		ret = 0;
 	/* don't need mmap_sem for these, but this looks cleaner */
-	*ppos += (char *)pm.out - buf;
+	*ppos += (char __user *)pm.out - buf;
 	if (!ret)
-		ret = (char *)pm.out - buf;
+		ret = (char __user *)pm.out - buf;
 
 out_pages:
 	for (; pagecount; pagecount--) {
-- 
cgit v1.2.3


From 3dec7f59c370c7b58184d63293c3dc984d475840 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 20 Feb 2009 17:04:33 +0300
Subject: proc 1/2: do PDE usecounting even for ->read_proc, ->write_proc

struct proc_dir_entry::owner is going to be removed. Now it's only necessary
to protect PDEs which are using ->read_proc, ->write_proc hooks.

However, ->owner assignments are racy and make it very easy for someone to switch
->owner on live PDE (as some subsystems do) without fixing refcounts and so on.

http://bugzilla.kernel.org/show_bug.cgi?id=12454

So, ->owner is on death row.

Proxy file operations exist already (proc_file_operations), just bump usecount
when necessary.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/generic.c  | 48 ++++++++++++++++++++++++++++++++++++++----------
 fs/proc/inode.c    |  2 +-
 fs/proc/internal.h |  1 +
 3 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8c68bbe2b61..fa678abc9db 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -37,7 +37,7 @@ static int proc_match(int len, const char *name, struct proc_dir_entry *de)
 #define PROC_BLOCK_SIZE	(PAGE_SIZE - 1024)
 
 static ssize_t
-proc_file_read(struct file *file, char __user *buf, size_t nbytes,
+__proc_file_read(struct file *file, char __user *buf, size_t nbytes,
 	       loff_t *ppos)
 {
 	struct inode * inode = file->f_path.dentry->d_inode;
@@ -182,20 +182,48 @@ proc_file_read(struct file *file, char __user *buf, size_t nbytes,
 	return retval;
 }
 
+static ssize_t
+proc_file_read(struct file *file, char __user *buf, size_t nbytes,
+	       loff_t *ppos)
+{
+	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	ssize_t rv = -EIO;
+
+	spin_lock(&pde->pde_unload_lock);
+	if (!pde->proc_fops) {
+		spin_unlock(&pde->pde_unload_lock);
+		return rv;
+	}
+	pde->pde_users++;
+	spin_unlock(&pde->pde_unload_lock);
+
+	rv = __proc_file_read(file, buf, nbytes, ppos);
+
+	pde_users_dec(pde);
+	return rv;
+}
+
 static ssize_t
 proc_file_write(struct file *file, const char __user *buffer,
 		size_t count, loff_t *ppos)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct proc_dir_entry * dp;
-	
-	dp = PDE(inode);
-
-	if (!dp->write_proc)
-		return -EIO;
+	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	ssize_t rv = -EIO;
+
+	if (pde->write_proc) {
+		spin_lock(&pde->pde_unload_lock);
+		if (!pde->proc_fops) {
+			spin_unlock(&pde->pde_unload_lock);
+			return rv;
+		}
+		pde->pde_users++;
+		spin_unlock(&pde->pde_unload_lock);
 
-	/* FIXME: does this routine need ppos?  probably... */
-	return dp->write_proc(file, buffer, count, dp->data);
+		/* FIXME: does this routine need ppos?  probably... */
+		rv = pde->write_proc(file, buffer, count, pde->data);
+		pde_users_dec(pde);
+	}
+	return rv;
 }
 
 
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d8bb5c671f4..e11dc22c651 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -127,7 +127,7 @@ static void __pde_users_dec(struct proc_dir_entry *pde)
 		complete(pde->pde_unload_completion);
 }
 
-static void pde_users_dec(struct proc_dir_entry *pde)
+void pde_users_dec(struct proc_dir_entry *pde)
 {
 	spin_lock(&pde->pde_unload_lock);
 	__pde_users_dec(pde);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cd53ff83849..f6db9618a88 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -91,3 +91,4 @@ struct pde_opener {
 	int (*release)(struct inode *, struct file *);
 	struct list_head lh;
 };
+void pde_users_dec(struct proc_dir_entry *pde);
-- 
cgit v1.2.3


From 99b76233803beab302123d243eea9e41149804f3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 25 Mar 2009 22:48:06 +0300
Subject: proc 2/2: remove struct proc_dir_entry::owner

Setting ->owner as done currently (pde->owner = THIS_MODULE) is racy
as correctly noted at bug #12454. Someone can lookup entry with NULL
->owner, thus not pinning enything, and release it later resulting
in module refcount underflow.

We can keep ->owner and supply it at registration time like ->proc_fops
and ->data.

But this leaves ->owner as easy-manipulative field (just one C assignment)
and somebody will forget to unpin previous/pin current module when
switching ->owner. ->proc_fops is declared as "const" which should give
some thoughts.

->read_proc/->write_proc were just fixed to not require ->owner for
protection.

rmmod'ed directories will be empty and return "." and ".." -- no harm.
And directories with tricky enough readdir and lookup shouldn't be modular.
We definitely don't want such modular code.

Removing ->owner will also make PDE smaller.

So, let's nuke it.

Kudos to Jeff Layton for reminding about this, let's say, oversight.

http://bugzilla.kernel.org/show_bug.cgi?id=12454

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 Documentation/DocBook/procfs_example.c  |  9 ---------
 arch/alpha/kernel/srm_env.c             |  5 -----
 arch/blackfin/mm/sram-alloc.c           |  1 -
 arch/ia64/kernel/palinfo.c              |  2 --
 arch/ia64/sn/kernel/sn2/prominfo_proc.c |  9 ++-------
 arch/powerpc/kernel/rtas_flash.c        |  1 -
 arch/sparc/kernel/led.c                 |  1 -
 arch/x86/kernel/cpu/mtrr/if.c           | 10 +---------
 drivers/acpi/ac.c                       |  1 -
 drivers/acpi/battery.c                  |  1 -
 drivers/acpi/button.c                   |  3 ---
 drivers/acpi/fan.c                      |  2 --
 drivers/acpi/processor_core.c           |  2 --
 drivers/acpi/sbs.c                      |  1 -
 drivers/acpi/thermal.c                  |  2 --
 drivers/acpi/video.c                    |  5 -----
 drivers/block/ps3vram.c                 |  2 --
 drivers/char/ipmi/ipmi_msghandler.c     | 12 ++++-------
 drivers/char/ipmi/ipmi_si_intf.c        |  6 +++---
 drivers/input/input.c                   |  2 --
 drivers/isdn/hardware/eicon/divasi.c    |  1 -
 drivers/media/video/cpia.c              |  4 +---
 drivers/message/i2o/i2o_proc.c          |  2 --
 drivers/net/bonding/bond_main.c         | 35 ++-------------------------------
 drivers/net/irda/vlsi_ir.c              |  7 -------
 drivers/net/wireless/airo.c             |  1 -
 drivers/platform/x86/asus_acpi.c        |  3 ---
 drivers/platform/x86/thinkpad_acpi.c    |  2 --
 drivers/platform/x86/toshiba_acpi.c     |  3 ---
 drivers/rtc/rtc-proc.c                  | 10 ++--------
 drivers/s390/block/dasd_proc.c          |  2 --
 drivers/scsi/scsi_devinfo.c             |  2 --
 drivers/scsi/scsi_proc.c                |  3 ---
 drivers/video/via/viafbdev.c            |  5 -----
 fs/afs/proc.c                           |  1 -
 fs/cifs/cifs_debug.c                    |  1 -
 fs/jfs/jfs_debug.c                      |  1 -
 fs/nfs/client.c                         |  2 --
 fs/proc/inode.c                         | 19 +++---------------
 fs/proc/proc_tty.c                      |  1 -
 fs/reiserfs/procfs.c                    |  5 +----
 include/linux/ipmi_smi.h                |  2 +-
 include/linux/proc_fs.h                 |  4 ----
 net/appletalk/atalk_proc.c              |  1 -
 net/atm/mpoa_proc.c                     |  1 -
 net/atm/proc.c                          |  1 -
 net/can/bcm.c                           |  4 ----
 net/can/proc.c                          |  2 --
 net/core/pktgen.c                       |  1 -
 net/irda/irproc.c                       |  1 -
 net/llc/llc_proc.c                      |  1 -
 net/sctp/protocol.c                     |  8 ++------
 net/sunrpc/cache.c                      |  4 ----
 net/sunrpc/stats.c                      | 10 ++--------
 sound/core/info.c                       | 31 ++---------------------------
 55 files changed, 26 insertions(+), 232 deletions(-)

diff --git a/Documentation/DocBook/procfs_example.c b/Documentation/DocBook/procfs_example.c
index 8c6396e4bf3..a5b11793b1e 100644
--- a/Documentation/DocBook/procfs_example.c
+++ b/Documentation/DocBook/procfs_example.c
@@ -117,9 +117,6 @@ static int __init init_procfs_example(void)
 		rv = -ENOMEM;
 		goto out;
 	}
-	
-	example_dir->owner = THIS_MODULE;
-	
 	/* create jiffies using convenience function */
 	jiffies_file = create_proc_read_entry("jiffies", 
 					      0444, example_dir, 
@@ -130,8 +127,6 @@ static int __init init_procfs_example(void)
 		goto no_jiffies;
 	}
 
-	jiffies_file->owner = THIS_MODULE;
-
 	/* create foo and bar files using same callback
 	 * functions 
 	 */
@@ -146,7 +141,6 @@ static int __init init_procfs_example(void)
 	foo_file->data = &foo_data;
 	foo_file->read_proc = proc_read_foobar;
 	foo_file->write_proc = proc_write_foobar;
-	foo_file->owner = THIS_MODULE;
 		
 	bar_file = create_proc_entry("bar", 0644, example_dir);
 	if(bar_file == NULL) {
@@ -159,7 +153,6 @@ static int __init init_procfs_example(void)
 	bar_file->data = &bar_data;
 	bar_file->read_proc = proc_read_foobar;
 	bar_file->write_proc = proc_write_foobar;
-	bar_file->owner = THIS_MODULE;
 		
 	/* create symlink */
 	symlink = proc_symlink("jiffies_too", example_dir, 
@@ -169,8 +162,6 @@ static int __init init_procfs_example(void)
 		goto no_symlink;
 	}
 
-	symlink->owner = THIS_MODULE;
-
 	/* everything OK */
 	printk(KERN_INFO "%s %s initialised\n",
 	       MODULE_NAME, MODULE_VERS);
diff --git a/arch/alpha/kernel/srm_env.c b/arch/alpha/kernel/srm_env.c
index 78ad7cd1bbd..d12af472e1c 100644
--- a/arch/alpha/kernel/srm_env.c
+++ b/arch/alpha/kernel/srm_env.c
@@ -218,7 +218,6 @@ srm_env_init(void)
 				BASE_DIR);
 		goto cleanup;
 	}
-	base_dir->owner = THIS_MODULE;
 
 	/*
 	 * Create per-name subdirectory
@@ -229,7 +228,6 @@ srm_env_init(void)
 				BASE_DIR, NAMED_DIR);
 		goto cleanup;
 	}
-	named_dir->owner = THIS_MODULE;
 
 	/*
 	 * Create per-number subdirectory
@@ -241,7 +239,6 @@ srm_env_init(void)
 		goto cleanup;
 
 	}
-	numbered_dir->owner = THIS_MODULE;
 
 	/*
 	 * Create all named nodes
@@ -254,7 +251,6 @@ srm_env_init(void)
 			goto cleanup;
 
 		entry->proc_entry->data		= (void *) entry;
-		entry->proc_entry->owner	= THIS_MODULE;
 		entry->proc_entry->read_proc	= srm_env_read;
 		entry->proc_entry->write_proc	= srm_env_write;
 
@@ -275,7 +271,6 @@ srm_env_init(void)
 
 		entry->id			= var_num;
 		entry->proc_entry->data		= (void *) entry;
-		entry->proc_entry->owner	= THIS_MODULE;
 		entry->proc_entry->read_proc	= srm_env_read;
 		entry->proc_entry->write_proc	= srm_env_write;
 	}
diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c
index 834cab7438a..530d1393a23 100644
--- a/arch/blackfin/mm/sram-alloc.c
+++ b/arch/blackfin/mm/sram-alloc.c
@@ -854,7 +854,6 @@ static int __init sram_proc_init(void)
 		printk(KERN_WARNING "unable to create /proc/sram\n");
 		return -1;
 	}
-	ptr->owner = THIS_MODULE;
 	ptr->read_proc = sram_proc_read;
 	return 0;
 }
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
index e5c57f413ca..a4f19c70aad 100644
--- a/arch/ia64/kernel/palinfo.c
+++ b/arch/ia64/kernel/palinfo.c
@@ -1002,8 +1002,6 @@ create_palinfo_proc_entries(unsigned int cpu)
 		*pdir = create_proc_read_entry(
 				palinfo_entries[j].name, 0, cpu_dir,
 				palinfo_read_entry, (void *)f.value);
-		if (*pdir)
-			(*pdir)->owner = THIS_MODULE;
 		pdir++;
 	}
 }
diff --git a/arch/ia64/sn/kernel/sn2/prominfo_proc.c b/arch/ia64/sn/kernel/sn2/prominfo_proc.c
index 4dcce3d0e04..e6332881864 100644
--- a/arch/ia64/sn/kernel/sn2/prominfo_proc.c
+++ b/arch/ia64/sn/kernel/sn2/prominfo_proc.c
@@ -225,7 +225,6 @@ static struct proc_dir_entry *sgi_prominfo_entry;
 int __init prominfo_init(void)
 {
 	struct proc_dir_entry **entp;
-	struct proc_dir_entry *p;
 	cnodeid_t cnodeid;
 	unsigned long nasid;
 	int size;
@@ -246,14 +245,10 @@ int __init prominfo_init(void)
 		sprintf(name, "node%d", cnodeid);
 		*entp = proc_mkdir(name, sgi_prominfo_entry);
 		nasid = cnodeid_to_nasid(cnodeid);
-		p = create_proc_read_entry("fit", 0, *entp, read_fit_entry,
+		create_proc_read_entry("fit", 0, *entp, read_fit_entry,
 					   (void *)nasid);
-		if (p)
-			p->owner = THIS_MODULE;
-		p = create_proc_read_entry("version", 0, *entp,
+		create_proc_read_entry("version", 0, *entp,
 					   read_version_entry, (void *)nasid);
-		if (p)
-			p->owner = THIS_MODULE;
 		entp++;
 	}
 
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index 149cb112cd1..13011a96a97 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -669,7 +669,6 @@ static void remove_flash_pde(struct proc_dir_entry *dp)
 {
 	if (dp) {
 		kfree(dp->data);
-		dp->owner = NULL;
 		remove_proc_entry(dp->name, dp->parent);
 	}
 }
diff --git a/arch/sparc/kernel/led.c b/arch/sparc/kernel/led.c
index adaaed4ea2f..00d034ea216 100644
--- a/arch/sparc/kernel/led.c
+++ b/arch/sparc/kernel/led.c
@@ -126,7 +126,6 @@ static int __init led_init(void)
 	led = proc_create("led", 0, NULL, &led_proc_fops);
 	if (!led)
 		return -ENOMEM;
-	led->owner = THIS_MODULE;
 
 	printk(KERN_INFO
 	       "led: version %s, Lars Kotthoff <metalhead@metalhead.ws>\n",
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 4c4214690dd..fb73a52913a 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -377,10 +377,6 @@ static const struct file_operations mtrr_fops = {
 	.release = mtrr_close,
 };
 
-
-static struct proc_dir_entry *proc_root_mtrr;
-
-
 static int mtrr_seq_show(struct seq_file *seq, void *offset)
 {
 	char factor;
@@ -423,11 +419,7 @@ static int __init mtrr_if_init(void)
 	    (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
 		return -ENODEV;
 
-	proc_root_mtrr =
-		proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
-
-	if (proc_root_mtrr)
-		proc_root_mtrr->owner = THIS_MODULE;
+	proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
 	return 0;
 }
 
diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c
index 9b917dac773..88e42abf5d8 100644
--- a/drivers/acpi/ac.c
+++ b/drivers/acpi/ac.c
@@ -191,7 +191,6 @@ static int acpi_ac_add_fs(struct acpi_device *device)
 						     acpi_ac_dir);
 		if (!acpi_device_dir(device))
 			return -ENODEV;
-		acpi_device_dir(device)->owner = THIS_MODULE;
 	}
 
 	/* 'state' [R] */
diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index 69cbc57c2d1..3bcb5bfc45d 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -760,7 +760,6 @@ static int acpi_battery_add_fs(struct acpi_device *device)
 						     acpi_battery_dir);
 		if (!acpi_device_dir(device))
 			return -ENODEV;
-		acpi_device_dir(device)->owner = THIS_MODULE;
 	}
 
 	for (i = 0; i < ACPI_BATTERY_NUMFILES; ++i) {
diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c
index 171fd914f43..c2f06069dcd 100644
--- a/drivers/acpi/button.c
+++ b/drivers/acpi/button.c
@@ -200,12 +200,10 @@ static int acpi_button_add_fs(struct acpi_device *device)
 
 	if (!entry)
 		return -ENODEV;
-	entry->owner = THIS_MODULE;
 
 	acpi_device_dir(device) = proc_mkdir(acpi_device_bid(device), entry);
 	if (!acpi_device_dir(device))
 		return -ENODEV;
-	acpi_device_dir(device)->owner = THIS_MODULE;
 
 	/* 'info' [R] */
 	entry = proc_create_data(ACPI_BUTTON_FILE_INFO,
@@ -522,7 +520,6 @@ static int __init acpi_button_init(void)
 	acpi_button_dir = proc_mkdir(ACPI_BUTTON_CLASS, acpi_root_dir);
 	if (!acpi_button_dir)
 		return -ENODEV;
-	acpi_button_dir->owner = THIS_MODULE;
 	result = acpi_bus_register_driver(&acpi_button_driver);
 	if (result < 0) {
 		remove_proc_entry(ACPI_BUTTON_CLASS, acpi_root_dir);
diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c
index eaaee1660bd..8a02944bf92 100644
--- a/drivers/acpi/fan.c
+++ b/drivers/acpi/fan.c
@@ -193,7 +193,6 @@ static int acpi_fan_add_fs(struct acpi_device *device)
 						     acpi_fan_dir);
 		if (!acpi_device_dir(device))
 			return -ENODEV;
-		acpi_device_dir(device)->owner = THIS_MODULE;
 	}
 
 	/* 'status' [R/W] */
@@ -347,7 +346,6 @@ static int __init acpi_fan_init(void)
 	acpi_fan_dir = proc_mkdir(ACPI_FAN_CLASS, acpi_root_dir);
 	if (!acpi_fan_dir)
 		return -ENODEV;
-	acpi_fan_dir->owner = THIS_MODULE;
 #endif
 
 	result = acpi_bus_register_driver(&acpi_fan_driver);
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 0cc2fd31e37..fa2f7422d23 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -359,7 +359,6 @@ static int acpi_processor_add_fs(struct acpi_device *device)
 		if (!acpi_device_dir(device))
 			return -ENODEV;
 	}
-	acpi_device_dir(device)->owner = THIS_MODULE;
 
 	/* 'info' [R] */
 	entry = proc_create_data(ACPI_PROCESSOR_FILE_INFO,
@@ -1137,7 +1136,6 @@ static int __init acpi_processor_init(void)
 	acpi_processor_dir = proc_mkdir(ACPI_PROCESSOR_CLASS, acpi_root_dir);
 	if (!acpi_processor_dir)
 		return -ENOMEM;
-	acpi_processor_dir->owner = THIS_MODULE;
 
 	/*
 	 * Check whether the system is DMI table. If yes, OSPM
diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c
index 6050ce48187..59afd52ccc1 100644
--- a/drivers/acpi/sbs.c
+++ b/drivers/acpi/sbs.c
@@ -488,7 +488,6 @@ acpi_sbs_add_fs(struct proc_dir_entry **dir,
 		if (!*dir) {
 			return -ENODEV;
 		}
-		(*dir)->owner = THIS_MODULE;
 	}
 
 	/* 'info' [R] */
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index 99e6f1f8ea4..c11f9aeca70 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -1506,7 +1506,6 @@ static int acpi_thermal_add_fs(struct acpi_device *device)
 						     acpi_thermal_dir);
 		if (!acpi_device_dir(device))
 			return -ENODEV;
-		acpi_device_dir(device)->owner = THIS_MODULE;
 	}
 
 	/* 'state' [R] */
@@ -1875,7 +1874,6 @@ static int __init acpi_thermal_init(void)
 	acpi_thermal_dir = proc_mkdir(ACPI_THERMAL_CLASS, acpi_root_dir);
 	if (!acpi_thermal_dir)
 		return -ENODEV;
-	acpi_thermal_dir->owner = THIS_MODULE;
 
 	result = acpi_bus_register_driver(&acpi_thermal_driver);
 	if (result < 0) {
diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index bb5ed059114..67cc36dc9b8 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -1125,8 +1125,6 @@ static int acpi_video_device_add_fs(struct acpi_device *device)
 	if (!device_dir)
 		return -ENOMEM;
 
-	device_dir->owner = THIS_MODULE;
-
 	/* 'info' [R] */
 	entry = proc_create_data("info", S_IRUGO, device_dir,
 			&acpi_video_device_info_fops, acpi_driver_data(device));
@@ -1403,8 +1401,6 @@ static int acpi_video_bus_add_fs(struct acpi_device *device)
 	if (!device_dir)
 		return -ENOMEM;
 
-	device_dir->owner = THIS_MODULE;
-
 	/* 'info' [R] */
 	entry = proc_create_data("info", S_IRUGO, device_dir,
 				 &acpi_video_bus_info_fops,
@@ -2131,7 +2127,6 @@ static int __init acpi_video_init(void)
 	acpi_video_dir = proc_mkdir(ACPI_VIDEO_CLASS, acpi_root_dir);
 	if (!acpi_video_dir)
 		return -ENODEV;
-	acpi_video_dir->owner = THIS_MODULE;
 
 	result = acpi_bus_register_driver(&acpi_video_bus);
 	if (result < 0) {
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 393ed6760d7..8eddef373a9 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -551,8 +551,6 @@ static void __devinit ps3vram_proc_init(struct ps3_system_bus_device *dev)
 		dev_warn(&dev->core, "failed to create /proc entry\n");
 		return;
 	}
-
-	pde->owner = THIS_MODULE;
 	pde->data = priv;
 }
 
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 7a88dfd4427..e93fc8d22fb 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -1944,7 +1944,7 @@ static int stat_file_read_proc(char *page, char **start, off_t off,
 
 int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name,
 			    read_proc_t *read_proc,
-			    void *data, struct module *owner)
+			    void *data)
 {
 	int                    rv = 0;
 #ifdef CONFIG_PROC_FS
@@ -1970,7 +1970,6 @@ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name,
 	} else {
 		file->data = data;
 		file->read_proc = read_proc;
-		file->owner = owner;
 
 		mutex_lock(&smi->proc_entry_lock);
 		/* Stick it on the list. */
@@ -1993,23 +1992,21 @@ static int add_proc_entries(ipmi_smi_t smi, int num)
 	smi->proc_dir = proc_mkdir(smi->proc_dir_name, proc_ipmi_root);
 	if (!smi->proc_dir)
 		rv = -ENOMEM;
-	else
-		smi->proc_dir->owner = THIS_MODULE;
 
 	if (rv == 0)
 		rv = ipmi_smi_add_proc_entry(smi, "stats",
 					     stat_file_read_proc,
-					     smi, THIS_MODULE);
+					     smi);
 
 	if (rv == 0)
 		rv = ipmi_smi_add_proc_entry(smi, "ipmb",
 					     ipmb_file_read_proc,
-					     smi, THIS_MODULE);
+					     smi);
 
 	if (rv == 0)
 		rv = ipmi_smi_add_proc_entry(smi, "version",
 					     version_file_read_proc,
-					     smi, THIS_MODULE);
+					     smi);
 #endif /* CONFIG_PROC_FS */
 
 	return rv;
@@ -4265,7 +4262,6 @@ static int ipmi_init_msghandler(void)
 	    return -ENOMEM;
 	}
 
-	proc_ipmi_root->owner = THIS_MODULE;
 #endif /* CONFIG_PROC_FS */
 
 	setup_timer(&ipmi_timer, ipmi_timeout, 0);
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 3000135f2ea..e58ea4cd55c 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -2899,7 +2899,7 @@ static int try_smi_init(struct smi_info *new_smi)
 
 	rv = ipmi_smi_add_proc_entry(new_smi->intf, "type",
 				     type_file_read_proc,
-				     new_smi, THIS_MODULE);
+				     new_smi);
 	if (rv) {
 		printk(KERN_ERR
 		       "ipmi_si: Unable to create proc entry: %d\n",
@@ -2909,7 +2909,7 @@ static int try_smi_init(struct smi_info *new_smi)
 
 	rv = ipmi_smi_add_proc_entry(new_smi->intf, "si_stats",
 				     stat_file_read_proc,
-				     new_smi, THIS_MODULE);
+				     new_smi);
 	if (rv) {
 		printk(KERN_ERR
 		       "ipmi_si: Unable to create proc entry: %d\n",
@@ -2919,7 +2919,7 @@ static int try_smi_init(struct smi_info *new_smi)
 
 	rv = ipmi_smi_add_proc_entry(new_smi->intf, "params",
 				     param_read_proc,
-				     new_smi, THIS_MODULE);
+				     new_smi);
 	if (rv) {
 		printk(KERN_ERR
 		       "ipmi_si: Unable to create proc entry: %d\n",
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 1730d7331a5..ec3db3ade11 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -903,8 +903,6 @@ static int __init input_proc_init(void)
 	if (!proc_bus_input_dir)
 		return -ENOMEM;
 
-	proc_bus_input_dir->owner = THIS_MODULE;
-
 	entry = proc_create("devices", 0, proc_bus_input_dir,
 			    &input_devices_fileops);
 	if (!entry)
diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c
index f4969fe0a05..69e71ebe784 100644
--- a/drivers/isdn/hardware/eicon/divasi.c
+++ b/drivers/isdn/hardware/eicon/divasi.c
@@ -118,7 +118,6 @@ static int DIVA_INIT_FUNCTION create_um_idi_proc(void)
 		return (0);
 
 	um_idi_proc_entry->read_proc = um_idi_proc_read;
-	um_idi_proc_entry->owner = THIS_MODULE;
 
 	return (1);
 }
diff --git a/drivers/media/video/cpia.c b/drivers/media/video/cpia.c
index c3b0c8c63c7..43ab0adf3b6 100644
--- a/drivers/media/video/cpia.c
+++ b/drivers/media/video/cpia.c
@@ -1381,9 +1381,7 @@ static void proc_cpia_create(void)
 {
 	cpia_proc_root = proc_mkdir("cpia", NULL);
 
-	if (cpia_proc_root)
-		cpia_proc_root->owner = THIS_MODULE;
-	else
+	if (!cpia_proc_root)
 		LOG("Unable to initialise /proc/cpia\n");
 }
 
diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c
index 9a36b5a7de5..7045c45da9b 100644
--- a/drivers/message/i2o/i2o_proc.c
+++ b/drivers/message/i2o/i2o_proc.c
@@ -2037,8 +2037,6 @@ static int __init i2o_proc_fs_create(void)
 	if (!i2o_proc_dir_root)
 		return -1;
 
-	i2o_proc_dir_root->owner = THIS_MODULE;
-
 	list_for_each_entry(c, &i2o_controllers, list)
 	    i2o_proc_iop_add(i2o_proc_dir_root, c);
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9c326a50a3e..99610f358c4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3444,25 +3444,12 @@ static void bond_remove_proc_entry(struct bonding *bond)
  */
 static void bond_create_proc_dir(void)
 {
-	int len = strlen(DRV_NAME);
-
-	for (bond_proc_dir = init_net.proc_net->subdir; bond_proc_dir;
-	     bond_proc_dir = bond_proc_dir->next) {
-		if ((bond_proc_dir->namelen == len) &&
-		    !memcmp(bond_proc_dir->name, DRV_NAME, len)) {
-			break;
-		}
-	}
-
 	if (!bond_proc_dir) {
 		bond_proc_dir = proc_mkdir(DRV_NAME, init_net.proc_net);
-		if (bond_proc_dir) {
-			bond_proc_dir->owner = THIS_MODULE;
-		} else {
+		if (!bond_proc_dir)
 			printk(KERN_WARNING DRV_NAME
 				": Warning: cannot create /proc/net/%s\n",
 				DRV_NAME);
-		}
 	}
 }
 
@@ -3471,25 +3458,7 @@ static void bond_create_proc_dir(void)
  */
 static void bond_destroy_proc_dir(void)
 {
-	struct proc_dir_entry *de;
-
-	if (!bond_proc_dir) {
-		return;
-	}
-
-	/* verify that the /proc dir is empty */
-	for (de = bond_proc_dir->subdir; de; de = de->next) {
-		/* ignore . and .. */
-		if (*(de->name) != '.') {
-			break;
-		}
-	}
-
-	if (de) {
-		if (bond_proc_dir->owner == THIS_MODULE) {
-			bond_proc_dir->owner = NULL;
-		}
-	} else {
+	if (bond_proc_dir) {
 		remove_proc_entry(DRV_NAME, init_net.proc_net);
 		bond_proc_dir = NULL;
 	}
diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c
index 1243bc8e003..ac0e4b6b6b6 100644
--- a/drivers/net/irda/vlsi_ir.c
+++ b/drivers/net/irda/vlsi_ir.c
@@ -1871,13 +1871,6 @@ static int __init vlsi_mod_init(void)
 	 * without procfs - it's not required for the driver to work.
 	 */
 	vlsi_proc_root = proc_mkdir(PROC_DIR, NULL);
-	if (vlsi_proc_root) {
-		/* protect registered procdir against module removal.
-		 * Because we are in the module init path there's no race
-		 * window after create_proc_entry (and no barrier needed).
-		 */
-		vlsi_proc_root->owner = THIS_MODULE;
-	}
 
 	ret = pci_register_driver(&vlsi_irda_driver);
 
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index 7e80aba8a14..31b1cc2b778 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -4494,7 +4494,6 @@ static int setup_proc_entry( struct net_device *dev,
 		goto fail;
 	apriv->proc_entry->uid = proc_uid;
 	apriv->proc_entry->gid = proc_gid;
-	apriv->proc_entry->owner = THIS_MODULE;
 
 	/* Setup the StatsDelta */
 	entry = proc_create_data("StatsDelta",
diff --git a/drivers/platform/x86/asus_acpi.c b/drivers/platform/x86/asus_acpi.c
index d63f26e666a..ba1f7497e4b 100644
--- a/drivers/platform/x86/asus_acpi.c
+++ b/drivers/platform/x86/asus_acpi.c
@@ -987,7 +987,6 @@ asus_proc_add(char *name, proc_writefunc *writefunc,
 	proc->write_proc = writefunc;
 	proc->read_proc = readfunc;
 	proc->data = acpi_driver_data(device);
-	proc->owner = THIS_MODULE;
 	proc->uid = asus_uid;
 	proc->gid = asus_gid;
 	return 0;
@@ -1020,7 +1019,6 @@ static int asus_hotk_add_fs(struct acpi_device *device)
 	if (proc) {
 		proc->read_proc = proc_read_info;
 		proc->data = acpi_driver_data(device);
-		proc->owner = THIS_MODULE;
 		proc->uid = asus_uid;
 		proc->gid = asus_gid;
 	} else {
@@ -1436,7 +1434,6 @@ static int __init asus_acpi_init(void)
 		printk(KERN_ERR "Asus ACPI: Unable to create /proc entry\n");
 		return -ENODEV;
 	}
-	asus_proc_dir->owner = THIS_MODULE;
 
 	result = acpi_bus_register_driver(&asus_hotk_driver);
 	if (result < 0) {
diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index d2433204a40..3dad27a385d 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -6992,7 +6992,6 @@ static int __init ibm_init(struct ibm_init_struct *iibm)
 			ret = -ENODEV;
 			goto err_out;
 		}
-		entry->owner = THIS_MODULE;
 		entry->data = ibm;
 		entry->read_proc = &dispatch_procfs_read;
 		if (ibm->write)
@@ -7405,7 +7404,6 @@ static int __init thinkpad_acpi_module_init(void)
 		thinkpad_acpi_module_exit();
 		return -ENODEV;
 	}
-	proc_dir->owner = THIS_MODULE;
 
 	ret = platform_driver_register(&tpacpi_pdriver);
 	if (ret) {
diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c
index 40e60fc2e59..9f187265db8 100644
--- a/drivers/platform/x86/toshiba_acpi.c
+++ b/drivers/platform/x86/toshiba_acpi.c
@@ -679,8 +679,6 @@ static acpi_status __init add_device(void)
 					      toshiba_proc_dir,
 					      (read_proc_t *) dispatch_read,
 					      item);
-		if (proc)
-			proc->owner = THIS_MODULE;
 		if (proc && item->write_func)
 			proc->write_proc = (write_proc_t *) dispatch_write;
 	}
@@ -772,7 +770,6 @@ static int __init toshiba_acpi_init(void)
 		toshiba_acpi_exit();
 		return -ENODEV;
 	} else {
-		toshiba_proc_dir->owner = THIS_MODULE;
 		status = add_device();
 		if (ACPI_FAILURE(status)) {
 			toshiba_acpi_exit();
diff --git a/drivers/rtc/rtc-proc.c b/drivers/rtc/rtc-proc.c
index 0c6257a034f..c086fc30a84 100644
--- a/drivers/rtc/rtc-proc.c
+++ b/drivers/rtc/rtc-proc.c
@@ -105,14 +105,8 @@ static const struct file_operations rtc_proc_fops = {
 
 void rtc_proc_add_device(struct rtc_device *rtc)
 {
-	if (rtc->id == 0) {
-		struct proc_dir_entry *ent;
-
-		ent = proc_create_data("driver/rtc", 0, NULL,
-				       &rtc_proc_fops, rtc);
-		if (ent)
-			ent->owner = rtc->owner;
-	}
+	if (rtc->id == 0)
+		proc_create_data("driver/rtc", 0, NULL, &rtc_proc_fops, rtc);
 }
 
 void rtc_proc_del_device(struct rtc_device *rtc)
diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c
index 2080ba6a69b..654daa3cdfd 100644
--- a/drivers/s390/block/dasd_proc.c
+++ b/drivers/s390/block/dasd_proc.c
@@ -320,7 +320,6 @@ dasd_proc_init(void)
 	dasd_proc_root_entry = proc_mkdir("dasd", NULL);
 	if (!dasd_proc_root_entry)
 		goto out_nodasd;
-	dasd_proc_root_entry->owner = THIS_MODULE;
 	dasd_devices_entry = proc_create("devices",
 					 S_IFREG | S_IRUGO | S_IWUSR,
 					 dasd_proc_root_entry,
@@ -334,7 +333,6 @@ dasd_proc_init(void)
 		goto out_nostatistics;
 	dasd_statistics_entry->read_proc = dasd_statistics_read;
 	dasd_statistics_entry->write_proc = dasd_statistics_write;
-	dasd_statistics_entry->owner = THIS_MODULE;
 	return 0;
 
  out_nostatistics:
diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c
index 099b5455bbc..b1348136964 100644
--- a/drivers/scsi/scsi_devinfo.c
+++ b/drivers/scsi/scsi_devinfo.c
@@ -596,8 +596,6 @@ int __init scsi_init_devinfo(void)
 		error = -ENOMEM;
 		goto out;
 	}
-
-	p->owner = THIS_MODULE;
 #endif /* CONFIG_SCSI_PROC_FS */
 
  out:
diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c
index 82f7b2dd08a..77fbddb507f 100644
--- a/drivers/scsi/scsi_proc.c
+++ b/drivers/scsi/scsi_proc.c
@@ -115,8 +115,6 @@ void scsi_proc_hostdir_add(struct scsi_host_template *sht)
         	if (!sht->proc_dir)
 			printk(KERN_ERR "%s: proc_mkdir failed for %s\n",
 			       __func__, sht->proc_name);
-		else
-			sht->proc_dir->owner = sht->module;
 	}
 	mutex_unlock(&global_host_template_mutex);
 }
@@ -163,7 +161,6 @@ void scsi_proc_host_add(struct Scsi_Host *shost)
 	} 
 
 	p->write_proc = proc_scsi_write_proc;
-	p->owner = sht->module;
 }
 
 /**
diff --git a/drivers/video/via/viafbdev.c b/drivers/video/via/viafbdev.c
index 37b433a08ce..e327b84820d 100644
--- a/drivers/video/via/viafbdev.c
+++ b/drivers/video/via/viafbdev.c
@@ -2059,25 +2059,21 @@ static void viafb_init_proc(struct proc_dir_entry **viafb_entry)
 	if (viafb_entry) {
 		entry = create_proc_entry("dvp0", 0, *viafb_entry);
 		if (entry) {
-			entry->owner = THIS_MODULE;
 			entry->read_proc = viafb_dvp0_proc_read;
 			entry->write_proc = viafb_dvp0_proc_write;
 		}
 		entry = create_proc_entry("dvp1", 0, *viafb_entry);
 		if (entry) {
-			entry->owner = THIS_MODULE;
 			entry->read_proc = viafb_dvp1_proc_read;
 			entry->write_proc = viafb_dvp1_proc_write;
 		}
 		entry = create_proc_entry("dfph", 0, *viafb_entry);
 		if (entry) {
-			entry->owner = THIS_MODULE;
 			entry->read_proc = viafb_dfph_proc_read;
 			entry->write_proc = viafb_dfph_proc_write;
 		}
 		entry = create_proc_entry("dfpl", 0, *viafb_entry);
 		if (entry) {
-			entry->owner = THIS_MODULE;
 			entry->read_proc = viafb_dfpl_proc_read;
 			entry->write_proc = viafb_dfpl_proc_write;
 		}
@@ -2086,7 +2082,6 @@ static void viafb_init_proc(struct proc_dir_entry **viafb_entry)
 		    viaparinfo->chip_info->lvds_chip_info2.lvds_chip_name) {
 			entry = create_proc_entry("vt1636", 0, *viafb_entry);
 			if (entry) {
-				entry->owner = THIS_MODULE;
 				entry->read_proc = viafb_vt1636_proc_read;
 				entry->write_proc = viafb_vt1636_proc_write;
 			}
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 7578c1ab9e0..8630615e57f 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -146,7 +146,6 @@ int afs_proc_init(void)
 	proc_afs = proc_mkdir("fs/afs", NULL);
 	if (!proc_afs)
 		goto error_dir;
-	proc_afs->owner = THIS_MODULE;
 
 	p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops);
 	if (!p)
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 877e4d9a115..7f19fefd3d4 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -404,7 +404,6 @@ cifs_proc_init(void)
 	if (proc_fs_cifs == NULL)
 		return;
 
-	proc_fs_cifs->owner = THIS_MODULE;
 	proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops);
 
 #ifdef CONFIG_CIFS_STATS
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index 6a73de84bce..dd824d9b0b1 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -90,7 +90,6 @@ void jfs_proc_init(void)
 
 	if (!(base = proc_mkdir("fs/jfs", NULL)))
 		return;
-	base->owner = THIS_MODULE;
 
 	for (i = 0; i < NPROCENT; i++)
 		proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 574158ae239..2277421656e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1606,8 +1606,6 @@ int __init nfs_fs_proc_init(void)
 	if (!proc_fs_nfs)
 		goto error_0;
 
-	proc_fs_nfs->owner = THIS_MODULE;
-
 	/* a file of servers with which we're dealing */
 	p = proc_create("servers", S_IFREG|S_IRUGO,
 			proc_fs_nfs, &nfs_server_list_fops);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index e11dc22c651..d78ade30554 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,11 +58,8 @@ static void proc_delete_inode(struct inode *inode)
 
 	/* Let go of any associated proc directory entry */
 	de = PROC_I(inode)->pde;
-	if (de) {
-		if (de->owner)
-			module_put(de->owner);
+	if (de)
 		de_put(de);
-	}
 	if (PROC_I(inode)->sysctl)
 		sysctl_head_put(PROC_I(inode)->sysctl);
 	clear_inode(inode);
@@ -449,12 +446,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 {
 	struct inode * inode;
 
-	if (!try_module_get(de->owner))
-		goto out_mod;
-
 	inode = iget_locked(sb, ino);
 	if (!inode)
-		goto out_ino;
+		return NULL;
 	if (inode->i_state & I_NEW) {
 		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 		PROC_I(inode)->fd = 0;
@@ -485,16 +479,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 			}
 		}
 		unlock_new_inode(inode);
-	} else {
-	       module_put(de->owner);
+	} else
 	       de_put(de);
-	}
 	return inode;
-
-out_ino:
-	module_put(de->owner);
-out_mod:
-	return NULL;
 }			
 
 int proc_fill_super(struct super_block *s)
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index d153946d6d1..4a9e0f65ae6 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -152,7 +152,6 @@ void proc_tty_register_driver(struct tty_driver *driver)
 	if (!ent)
 		return;
 	ent->read_proc = driver->ops->read_proc;
-	ent->owner = driver->owner;
 	ent->data = driver;
 
 	driver->proc_entry = ent;
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index d5066400638..9229e5514a4 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -492,7 +492,6 @@ int reiserfs_proc_info_init(struct super_block *sb)
 	spin_lock_init(&__PINFO(sb).lock);
 	REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root);
 	if (REISERFS_SB(sb)->procdir) {
-		REISERFS_SB(sb)->procdir->owner = THIS_MODULE;
 		REISERFS_SB(sb)->procdir->data = sb;
 		add_file(sb, "version", show_version);
 		add_file(sb, "super", show_super);
@@ -556,9 +555,7 @@ int reiserfs_proc_info_global_init(void)
 {
 	if (proc_info_root == NULL) {
 		proc_info_root = proc_mkdir(proc_info_root_name, NULL);
-		if (proc_info_root) {
-			proc_info_root->owner = THIS_MODULE;
-		} else {
+		if (!proc_info_root) {
 			reiserfs_warning(NULL, "cannot create /proc/%s",
 					 proc_info_root_name);
 			return 1;
diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h
index 62b73668b60..f7c9c75a277 100644
--- a/include/linux/ipmi_smi.h
+++ b/include/linux/ipmi_smi.h
@@ -230,6 +230,6 @@ static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg)
    automatically be dstroyed when the interface is destroyed. */
 int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name,
 			    read_proc_t *read_proc,
-			    void *data, struct module *owner);
+			    void *data);
 
 #endif /* __LINUX_IPMI_SMI_H */
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index b8bdb96eff7..fbfa3d44d33 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -41,9 +41,6 @@ enum {
  * while parent/subdir create the directory structure (every
  * /proc file has a parent, but "subdir" is NULL for all
  * non-directory entries).
- *
- * "owner" is used to protect module
- * from unloading while proc_dir_entry is in use
  */
 
 typedef	int (read_proc_t)(char *page, char **start, off_t off,
@@ -70,7 +67,6 @@ struct proc_dir_entry {
 	 * somewhere.
 	 */
 	const struct file_operations *proc_fops;
-	struct module *owner;
 	struct proc_dir_entry *next, *parent, *subdir;
 	void *data;
 	read_proc_t *read_proc;
diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c
index 162199a2d74..fd8e0847b25 100644
--- a/net/appletalk/atalk_proc.c
+++ b/net/appletalk/atalk_proc.c
@@ -281,7 +281,6 @@ int __init atalk_proc_init(void)
 	atalk_proc_dir = proc_mkdir("atalk", init_net.proc_net);
 	if (!atalk_proc_dir)
 		goto out;
-	atalk_proc_dir->owner = THIS_MODULE;
 
 	p = proc_create("interface", S_IRUGO, atalk_proc_dir,
 			&atalk_seq_interface_fops);
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index 4990541ef5d..1a0f5ccea9c 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -281,7 +281,6 @@ int mpc_proc_init(void)
 		printk(KERN_ERR "Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME);
 		return -ENOMEM;
 	}
-	p->owner = THIS_MODULE;
 	return 0;
 }
 
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 49487b313f2..e7b3b273907 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -476,7 +476,6 @@ int __init atm_proc_init(void)
 				     atm_proc_root, e->proc_fops);
 		if (!dirent)
 			goto err_out_remove;
-		dirent->owner = THIS_MODULE;
 		e->dirent = dirent;
 	}
 	ret = 0;
diff --git a/net/can/bcm.c b/net/can/bcm.c
index b7c7d465113..95d7f32643a 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1604,10 +1604,6 @@ static int __init bcm_module_init(void)
 
 	/* create /proc/net/can-bcm directory */
 	proc_dir = proc_mkdir("can-bcm", init_net.proc_net);
-
-	if (proc_dir)
-		proc_dir->owner = THIS_MODULE;
-
 	return 0;
 }
 
diff --git a/net/can/proc.c b/net/can/proc.c
index 520fef5e539..1463653dbe3 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -473,8 +473,6 @@ void can_init_proc(void)
 		return;
 	}
 
-	can_dir->owner = THIS_MODULE;
-
 	/* own procfs entries from the AF_CAN core */
 	pde_version     = can_create_proc_readentry(CAN_PROC_VERSION, 0644,
 					can_proc_read_version, NULL);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 32d419f5ac9..3779c1438c1 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3806,7 +3806,6 @@ static int __init pg_init(void)
 	pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net);
 	if (!pg_proc_dir)
 		return -ENODEV;
-	pg_proc_dir->owner = THIS_MODULE;
 
 	pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops);
 	if (pe == NULL) {
diff --git a/net/irda/irproc.c b/net/irda/irproc.c
index 88e80a31273..8ff1861649e 100644
--- a/net/irda/irproc.c
+++ b/net/irda/irproc.c
@@ -70,7 +70,6 @@ void __init irda_proc_register(void)
 	proc_irda = proc_mkdir("irda", init_net.proc_net);
 	if (proc_irda == NULL)
 		return;
-	proc_irda->owner = THIS_MODULE;
 
 	for (i = 0; i < ARRAY_SIZE(irda_dirs); i++)
 		d = proc_create(irda_dirs[i].name, 0, proc_irda,
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index b58bd7c6cdf..d208b3396d9 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -236,7 +236,6 @@ int __init llc_proc_init(void)
 	llc_proc_dir = proc_mkdir("llc", init_net.proc_net);
 	if (!llc_proc_dir)
 		goto out;
-	llc_proc_dir->owner = THIS_MODULE;
 
 	p = proc_create("socket", S_IRUGO, llc_proc_dir, &llc_seq_socket_fops);
 	if (!p)
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index cb198af8887..8eb3e61cb70 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -106,12 +106,8 @@ static __init int sctp_proc_init(void)
 		goto out_nomem;
 #ifdef CONFIG_PROC_FS
 	if (!proc_net_sctp) {
-		struct proc_dir_entry *ent;
-		ent = proc_mkdir("sctp", init_net.proc_net);
-		if (ent) {
-			ent->owner = THIS_MODULE;
-			proc_net_sctp = ent;
-		} else
+		proc_net_sctp = proc_mkdir("sctp", init_net.proc_net);
+		if (!proc_net_sctp)
 			goto out_free_percpu;
 	}
 
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 4735caad26e..20029a79a5d 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -313,7 +313,6 @@ static int create_cache_proc_entries(struct cache_detail *cd)
 	cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc);
 	if (cd->proc_ent == NULL)
 		goto out_nomem;
-	cd->proc_ent->owner = cd->owner;
 	cd->channel_ent = cd->content_ent = NULL;
 
 	p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
@@ -321,7 +320,6 @@ static int create_cache_proc_entries(struct cache_detail *cd)
 	cd->flush_ent = p;
 	if (p == NULL)
 		goto out_nomem;
-	p->owner = cd->owner;
 
 	if (cd->cache_request || cd->cache_parse) {
 		p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
@@ -329,7 +327,6 @@ static int create_cache_proc_entries(struct cache_detail *cd)
 		cd->channel_ent = p;
 		if (p == NULL)
 			goto out_nomem;
-		p->owner = cd->owner;
 	}
 	if (cd->cache_show) {
 		p = proc_create_data("content", S_IFREG|S_IRUSR|S_IWUSR,
@@ -337,7 +334,6 @@ static int create_cache_proc_entries(struct cache_detail *cd)
 		cd->content_ent = p;
 		if (p == NULL)
 			goto out_nomem;
-		p->owner = cd->owner;
 	}
 	return 0;
 out_nomem:
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 085372ef4fe..1ef6e46d9da 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -262,14 +262,8 @@ void
 rpc_proc_init(void)
 {
 	dprintk("RPC:       registering /proc/net/rpc\n");
-	if (!proc_net_rpc) {
-		struct proc_dir_entry *ent;
-		ent = proc_mkdir("rpc", init_net.proc_net);
-		if (ent) {
-			ent->owner = THIS_MODULE;
-			proc_net_rpc = ent;
-		}
-	}
+	if (!proc_net_rpc)
+		proc_net_rpc = proc_mkdir("rpc", init_net.proc_net);
 }
 
 void
diff --git a/sound/core/info.c b/sound/core/info.c
index 70fa87189f3..35df614f6c5 100644
--- a/sound/core/info.c
+++ b/sound/core/info.c
@@ -154,11 +154,6 @@ EXPORT_SYMBOL(snd_seq_root);
 struct snd_info_entry *snd_oss_root;
 #endif
 
-static inline void snd_info_entry_prepare(struct proc_dir_entry *de)
-{
-	de->owner = THIS_MODULE;
-}
-
 static void snd_remove_proc_entry(struct proc_dir_entry *parent,
 				  struct proc_dir_entry *de)
 {
@@ -522,32 +517,11 @@ static const struct file_operations snd_info_entry_operations =
 	.release =		snd_info_entry_release,
 };
 
-/**
- * snd_create_proc_entry - create a procfs entry
- * @name: the name of the proc file
- * @mode: the file permission bits, S_Ixxx
- * @parent: the parent proc-directory entry
- *
- * Creates a new proc file entry with the given name and permission
- * on the given directory.
- *
- * Returns the pointer of new instance or NULL on failure.
- */
-static struct proc_dir_entry *snd_create_proc_entry(const char *name, mode_t mode,
-						    struct proc_dir_entry *parent)
-{
-	struct proc_dir_entry *p;
-	p = create_proc_entry(name, mode, parent);
-	if (p)
-		snd_info_entry_prepare(p);
-	return p;
-}
-
 int __init snd_info_init(void)
 {
 	struct proc_dir_entry *p;
 
-	p = snd_create_proc_entry("asound", S_IFDIR | S_IRUGO | S_IXUGO, NULL);
+	p = create_proc_entry("asound", S_IFDIR | S_IRUGO | S_IXUGO, NULL);
 	if (p == NULL)
 		return -ENOMEM;
 	snd_proc_root = p;
@@ -974,12 +948,11 @@ int snd_info_register(struct snd_info_entry * entry)
 		return -ENXIO;
 	root = entry->parent == NULL ? snd_proc_root : entry->parent->p;
 	mutex_lock(&info_mutex);
-	p = snd_create_proc_entry(entry->name, entry->mode, root);
+	p = create_proc_entry(entry->name, entry->mode, root);
 	if (!p) {
 		mutex_unlock(&info_mutex);
 		return -ENOMEM;
 	}
-	p->owner = entry->module;
 	if (!S_ISDIR(entry->mode))
 		p->proc_fops = &snd_info_entry_operations;
 	p->size = entry->size;
-- 
cgit v1.2.3


From a9caa3de249a6c43bc9c6aec87881f09276677e3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 20 Feb 2009 17:07:22 +0300
Subject: Revert "proc: revert /proc/uptime to ->read_proc hook"

This reverts commit 6c87df37dcb9c6c33923707fa5191e0a65874d60.

proc files implemented through seq_file do pread(2) now.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/uptime.c | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index df26aa88fa4..0c10a0b3f14 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -1,45 +1,43 @@
+#include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/sched.h>
+#include <linux/seq_file.h>
 #include <linux/time.h>
 #include <asm/cputime.h>
 
-static int proc_calc_metrics(char *page, char **start, off_t off,
-				 int count, int *eof, int len)
-{
-	if (len <= off + count)
-		*eof = 1;
-	*start = page + off;
-	len -= off;
-	if (len > count)
-		len = count;
-	if (len < 0)
-		len = 0;
-	return len;
-}
-
-static int uptime_read_proc(char *page, char **start, off_t off, int count,
-			    int *eof, void *data)
+static int uptime_proc_show(struct seq_file *m, void *v)
 {
 	struct timespec uptime;
 	struct timespec idle;
-	int len;
 	cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	monotonic_to_bootbased(&uptime);
 	cputime_to_timespec(idletime, &idle);
-	len = sprintf(page, "%lu.%02lu %lu.%02lu\n",
+	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
 			(unsigned long) uptime.tv_sec,
 			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
 			(unsigned long) idle.tv_sec,
 			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
-	return proc_calc_metrics(page, start, off, count, eof, len);
+	return 0;
 }
 
+static int uptime_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, uptime_proc_show, NULL);
+}
+
+static const struct file_operations uptime_proc_fops = {
+	.open		= uptime_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static int __init proc_uptime_init(void)
 {
-	create_proc_read_entry("uptime", 0, NULL, uptime_read_proc, NULL);
+	proc_create("uptime", 0, NULL, &uptime_proc_fops);
 	return 0;
 }
 module_init(proc_uptime_init);
-- 
cgit v1.2.3


From 2f8501815256af8498904e68bd0984b1afffd6f8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 20 Mar 2009 11:13:20 +0100
Subject: lockdep: fix deadlock in lockdep_trace_alloc

Heiko reported that we grab the graph lock with irqs enabled.

Fix this by providng the same wrapper as all other lockdep entry
functions have.

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <npiggin@suse.de>
LKML-Reference: <1237544000.24626.52.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 022d2ed7fd8..3673a3f44d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2260,7 +2260,7 @@ void trace_softirqs_off(unsigned long ip)
 		debug_atomic_inc(&redundant_softirqs_off);
 }
 
-void lockdep_trace_alloc(gfp_t gfp_mask)
+static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
 {
 	struct task_struct *curr = current;
 
@@ -2279,12 +2279,29 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
 	if (!(gfp_mask & __GFP_FS))
 		return;
 
-	if (DEBUG_LOCKS_WARN_ON(irqs_disabled()))
+	if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
 		return;
 
 	mark_held_locks(curr, RECLAIM_FS);
 }
 
+static void check_flags(unsigned long flags);
+
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+	current->lockdep_recursion = 1;
+	__lockdep_trace_alloc(gfp_mask, flags);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+
 static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 {
 	/*
-- 
cgit v1.2.3


From 5886cea45d1b230f86fd24de708b0d9f14ff88db Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 31 Mar 2009 19:13:13 +0200
Subject: [PATCH] sysrq: include interrupt.h instead of irq.h

With "cpumask: update irq_desc to use cpumask_var_t"
we get this build failure on s390:

  CC      drivers/char/sysrq.o
In file included from drivers/char/sysrq.c:38:
include/linux/irq.h: In function 'init_alloc_desc_masks':
include/linux/irq.h:442: error: dereferencing pointer to incomplete type

drivers/char/sysrq.c should include interrupt.h instead of irq.h.

Cc: Mike Travis <travis@sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/char/sysrq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 33a9351c896..3df694e5454 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -35,7 +35,7 @@
 #include <linux/vt_kern.h>
 #include <linux/workqueue.h>
 #include <linux/kexec.h>
-#include <linux/irq.h>
+#include <linux/interrupt.h>
 #include <linux/hrtimer.h>
 #include <linux/oom.h>
 
-- 
cgit v1.2.3


From 33b26d7951619e4debf3a82f30a03423a36a1412 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 31 Mar 2009 19:16:02 +0200
Subject: [S390] fix hypfs build failure

Fix build breakage below which probably was introduced with
("rcu: don't include unnecessary headers, allow kmemtrace w/ tracepoints").

  CC      arch/s390/hypfs/hypfs_diag.o
arch/s390/hypfs/hypfs_diag.c: In function 'diag204_free_buffer':
arch/s390/hypfs/hypfs_diag.c:364: error: implicit declaration of function 'free_pages'
arch/s390/hypfs/hypfs_diag.c: In function 'diag204_alloc_rbuf':
arch/s390/hypfs/hypfs_diag.c:384: error: implicit declaration of function '__get_free_pages'
arch/s390/hypfs/hypfs_diag.c:384: error: 'GFP_KERNEL' undeclared (first use in this function)
arch/s390/hypfs/hypfs_diag.c:384: error: (Each undeclared identifier is reported only once
arch/s390/hypfs/hypfs_diag.c:384: error: for each function it appears in.)

Reported-by: Sachin Sant <sachinp@in.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/hypfs/hypfs_diag.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index b1e892a4381..704dd396257 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -12,6 +12,8 @@
 
 #include <linux/types.h>
 #include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/vmalloc.h>
 #include <asm/ebcdic.h>
-- 
cgit v1.2.3


From 156013ffd1225ef862853a4340b46f76845f8db1 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Tue, 31 Mar 2009 19:16:03 +0200
Subject: [S390] cio: wake up on failed recognition

Wake up even on failed device recognition, since this may be triggered
from a user trying to force a device online. With this patch a write
to the online sysfs attribute will not block for ever but return with
-EAGAIN in this case.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/device.c     | 10 ++++++++--
 drivers/s390/cio/device_fsm.c |  4 ++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index c4d2f667a2f..9f016fef610 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -482,17 +482,21 @@ static int online_store_recog_and_online(struct ccw_device *cdev)
 		}
 		wait_event(cdev->private->wait_q,
 			   cdev->private->flags.recog_done);
+		if (cdev->private->state != DEV_STATE_OFFLINE)
+			/* recognition failed */
+			return -EAGAIN;
 	}
 	if (cdev->drv && cdev->drv->set_online)
 		ccw_device_set_online(cdev);
 	return 0;
 }
+
 static int online_store_handle_online(struct ccw_device *cdev, int force)
 {
 	int ret;
 
 	ret = online_store_recog_and_online(cdev);
-	if (ret)
+	if (ret && !force)
 		return ret;
 	if (force && cdev->private->state == DEV_STATE_BOXED) {
 		ret = ccw_device_stlck(cdev);
@@ -500,7 +504,9 @@ static int online_store_handle_online(struct ccw_device *cdev, int force)
 			return ret;
 		if (cdev->id.cu_type == 0)
 			cdev->private->state = DEV_STATE_NOT_OPER;
-		online_store_recog_and_online(cdev);
+		ret = online_store_recog_and_online(cdev);
+		if (ret)
+			return ret;
 	}
 	return 0;
 }
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index 87b4bfca080..ccd72f95765 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -260,6 +260,7 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
 		if (state == DEV_STATE_NOT_OPER) {
 			cdev->private->flags.recog_done = 1;
 			cdev->private->state = DEV_STATE_DISCONNECTED;
+			wake_up(&cdev->private->wait_q);
 			return;
 		}
 		/* Boxed devices don't need extra treatment. */
@@ -311,8 +312,7 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
 	}
 	cdev->private->state = state;
 	io_subchannel_recog_done(cdev);
-	if (state != DEV_STATE_NOT_OPER)
-		wake_up(&cdev->private->wait_q);
+	wake_up(&cdev->private->wait_q);
 }
 
 /*
-- 
cgit v1.2.3


From c4621a62649a56f155a96dfc5de479be226f0768 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Tue, 31 Mar 2009 19:16:04 +0200
Subject: [S390] cio: introduce ccw_device_schedule_sch_unregister

Introduce ccw_device_schedule_sch_unregister as a wrapper for queuing
ccw_device_call_sch_unregister on the slow_path_wq. This wrapper
will be used in the next patch.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/device.c | 21 +++++++++++----------
 drivers/s390/cio/device.h |  1 +
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 9f016fef610..cdbf664ed44 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -310,8 +310,6 @@ static void ccw_device_remove_orphan_cb(struct work_struct *work)
 	put_device(&cdev->dev);
 }
 
-static void ccw_device_call_sch_unregister(struct work_struct *work);
-
 static void
 ccw_device_remove_disconnected(struct ccw_device *cdev)
 {
@@ -335,11 +333,10 @@ ccw_device_remove_disconnected(struct ccw_device *cdev)
 		spin_unlock_irqrestore(cdev->ccwlock, flags);
 		PREPARE_WORK(&cdev->private->kick_work,
 				ccw_device_remove_orphan_cb);
+		queue_work(slow_path_wq, &cdev->private->kick_work);
 	} else
 		/* Deregister subchannel, which will kill the ccw device. */
-		PREPARE_WORK(&cdev->private->kick_work,
-				ccw_device_call_sch_unregister);
-	queue_work(slow_path_wq, &cdev->private->kick_work);
+		ccw_device_schedule_sch_unregister(cdev);
 }
 
 /**
@@ -1020,6 +1017,13 @@ static void ccw_device_call_sch_unregister(struct work_struct *work)
 	put_device(&sch->dev);
 }
 
+void ccw_device_schedule_sch_unregister(struct ccw_device *cdev)
+{
+	PREPARE_WORK(&cdev->private->kick_work,
+		     ccw_device_call_sch_unregister);
+	queue_work(slow_path_wq, &cdev->private->kick_work);
+}
+
 /*
  * subchannel recognition done. Called from the state machine.
  */
@@ -1036,9 +1040,7 @@ io_subchannel_recog_done(struct ccw_device *cdev)
 		/* Remove device found not operational. */
 		if (!get_device(&cdev->dev))
 			break;
-		PREPARE_WORK(&cdev->private->kick_work,
-			     ccw_device_call_sch_unregister);
-		queue_work(slow_path_wq, &cdev->private->kick_work);
+		ccw_device_schedule_sch_unregister(cdev);
 		if (atomic_dec_and_test(&ccw_device_init_count))
 			wake_up(&ccw_device_init_wq);
 		break;
@@ -1557,8 +1559,7 @@ static int purge_fn(struct device *dev, void *data)
 		goto out;
 	CIO_MSG_EVENT(3, "ccw: purging 0.%x.%04x\n", priv->dev_id.ssid,
 		      priv->dev_id.devno);
-	PREPARE_WORK(&cdev->private->kick_work, ccw_device_call_sch_unregister);
-	queue_work(slow_path_wq, &cdev->private->kick_work);
+	ccw_device_schedule_sch_unregister(cdev);
 
 out:
 	/* Abort loop in case of pending signal. */
diff --git a/drivers/s390/cio/device.h b/drivers/s390/cio/device.h
index 85e01846ca6..f1cbbd94ad4 100644
--- a/drivers/s390/cio/device.h
+++ b/drivers/s390/cio/device.h
@@ -87,6 +87,7 @@ int ccw_device_is_orphan(struct ccw_device *);
 int ccw_device_recognition(struct ccw_device *);
 int ccw_device_online(struct ccw_device *);
 int ccw_device_offline(struct ccw_device *);
+void ccw_device_schedule_sch_unregister(struct ccw_device *);
 int ccw_purge_blacklisted(void);
 
 /* Function prototypes for device status and basic sense stuff. */
-- 
cgit v1.2.3


From 47593bfa1056d306fde067b28dd8617009be4121 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Tue, 31 Mar 2009 19:16:05 +0200
Subject: [S390] cio: introduce notifier for boxed state

If a ccw device did not respond in time during internal io, we set it
into boxed state. With this patch we have the following behaviour:
 * the ccw driver will get a notification if the device was online and
   goes into the boxed state
 * if the device was disconnected and got boxed nothing special is to be
   done (it will be handled in reprobing later)
 * if the device got boxed while initial sensing it will be unregistered

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/cio.h   |  2 ++
 drivers/s390/block/dasd.c     |  1 +
 drivers/s390/cio/device.c     |  4 ++--
 drivers/s390/cio/device_fsm.c | 29 ++++++++++++++++++-----------
 drivers/s390/scsi/zfcp_ccw.c  |  5 +++++
 5 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
index 6dccb071aec..619bf94b11f 100644
--- a/arch/s390/include/asm/cio.h
+++ b/arch/s390/include/asm/cio.h
@@ -456,6 +456,8 @@ struct ciw {
 #define CIO_OPER       0x0004
 /* Sick revalidation of device. */
 #define CIO_REVALIDATE 0x0008
+/* Device did not respond in time. */
+#define CIO_BOXED      0x0010
 
 /**
  * struct ccw_dev_id - unique identifier for ccw devices
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 2fd64e5a9ab..0570794ccf1 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2363,6 +2363,7 @@ int dasd_generic_notify(struct ccw_device *cdev, int event)
 	ret = 0;
 	switch (event) {
 	case CIO_GONE:
+	case CIO_BOXED:
 	case CIO_NO_PATH:
 		/* First of all call extended error reporting. */
 		dasd_eer_write(device, NULL, DASD_EER_NOPATH);
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index cdbf664ed44..868f8c6b053 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -1035,6 +1035,8 @@ io_subchannel_recog_done(struct ccw_device *cdev)
 		return;
 	}
 	switch (cdev->private->state) {
+	case DEV_STATE_BOXED:
+		/* Device did not respond in time. */
 	case DEV_STATE_NOT_OPER:
 		cdev->private->flags.recog_done = 1;
 		/* Remove device found not operational. */
@@ -1044,8 +1046,6 @@ io_subchannel_recog_done(struct ccw_device *cdev)
 		if (atomic_dec_and_test(&ccw_device_init_count))
 			wake_up(&ccw_device_init_wq);
 		break;
-	case DEV_STATE_BOXED:
-		/* Device did not respond in time. */
 	case DEV_STATE_OFFLINE:
 		/* 
 		 * We can't register the device in interrupt context so
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index ccd72f95765..e4604926156 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -256,14 +256,12 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
 		old_lpm = 0;
 	if (sch->lpm != old_lpm)
 		__recover_lost_chpids(sch, old_lpm);
-	if (cdev->private->state == DEV_STATE_DISCONNECTED_SENSE_ID) {
-		if (state == DEV_STATE_NOT_OPER) {
-			cdev->private->flags.recog_done = 1;
-			cdev->private->state = DEV_STATE_DISCONNECTED;
-			wake_up(&cdev->private->wait_q);
-			return;
-		}
-		/* Boxed devices don't need extra treatment. */
+	if (cdev->private->state == DEV_STATE_DISCONNECTED_SENSE_ID &&
+	    (state == DEV_STATE_NOT_OPER || state == DEV_STATE_BOXED)) {
+		cdev->private->flags.recog_done = 1;
+		cdev->private->state = DEV_STATE_DISCONNECTED;
+		wake_up(&cdev->private->wait_q);
+		return;
 	}
 	notify = 0;
 	same_dev = 0; /* Keep the compiler quiet... */
@@ -275,7 +273,7 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
 			      sch->schid.ssid, sch->schid.sch_no);
 		break;
 	case DEV_STATE_OFFLINE:
-		if (cdev->private->state == DEV_STATE_DISCONNECTED_SENSE_ID) {
+		if (cdev->online) {
 			same_dev = ccw_device_handle_oper(cdev);
 			notify = 1;
 		}
@@ -308,6 +306,12 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
 			      " subchannel 0.%x.%04x\n",
 			      cdev->private->dev_id.devno,
 			      sch->schid.ssid, sch->schid.sch_no);
+		if (cdev->id.cu_type != 0) { /* device was recognized before */
+			cdev->private->flags.recog_done = 1;
+			cdev->private->state = DEV_STATE_BOXED;
+			wake_up(&cdev->private->wait_q);
+			return;
+		}
 		break;
 	}
 	cdev->private->state = state;
@@ -390,10 +394,13 @@ ccw_device_done(struct ccw_device *cdev, int state)
 
 	cdev->private->state = state;
 
-
-	if (state == DEV_STATE_BOXED)
+	if (state == DEV_STATE_BOXED) {
 		CIO_MSG_EVENT(0, "Boxed device %04x on subchannel %04x\n",
 			      cdev->private->dev_id.devno, sch->schid.sch_no);
+		if (cdev->online && !ccw_device_notify(cdev, CIO_BOXED))
+			ccw_device_schedule_sch_unregister(cdev);
+		cdev->private->flags.donotify = 0;
+	}
 
 	if (cdev->private->flags.donotify) {
 		cdev->private->flags.donotify = 0;
diff --git a/drivers/s390/scsi/zfcp_ccw.c b/drivers/s390/scsi/zfcp_ccw.c
index 1fe1e2eda51..cfb0dcb6e3f 100644
--- a/drivers/s390/scsi/zfcp_ccw.c
+++ b/drivers/s390/scsi/zfcp_ccw.c
@@ -176,6 +176,11 @@ static int zfcp_ccw_notify(struct ccw_device *ccw_device, int event)
 		zfcp_erp_adapter_reopen(adapter, ZFCP_STATUS_COMMON_ERP_FAILED,
 					"ccnoti4", NULL);
 		break;
+	case CIO_BOXED:
+		dev_warn(&adapter->ccw_device->dev,
+			 "The ccw device did not respond in time.\n");
+		zfcp_erp_adapter_shutdown(adapter, 0, "ccnoti5", NULL);
+		break;
 	}
 	return 1;
 }
-- 
cgit v1.2.3


From b5cd99e6b002776c0e946f38292adbb0258b7983 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Tue, 31 Mar 2009 19:16:06 +0200
Subject: [S390] cio: disallow online setting of device in transient state

Return -EAGAIN on writes to sysfs online attribute if the corresponding
ccw device is in transient state.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/device.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 868f8c6b053..e47aa3f0476 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -515,7 +515,11 @@ static ssize_t online_store (struct device *dev, struct device_attribute *attr,
 	int force, ret;
 	unsigned long i;
 
-	if (atomic_cmpxchg(&cdev->private->onoff, 0, 1) != 0)
+	if ((cdev->private->state != DEV_STATE_OFFLINE &&
+	     cdev->private->state != DEV_STATE_ONLINE &&
+	     cdev->private->state != DEV_STATE_BOXED &&
+	     cdev->private->state != DEV_STATE_DISCONNECTED) ||
+	    atomic_cmpxchg(&cdev->private->onoff, 0, 1) != 0)
 		return -EAGAIN;
 
 	if (cdev->drv && !try_module_get(cdev->drv->owner)) {
-- 
cgit v1.2.3


From 99f6a570eedc885675b6aa36b7acdbdcc3a7f55b Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Tue, 31 Mar 2009 19:16:07 +0200
Subject: [S390] cio: online_store - trigger recognition for boxed devices

Start a new device recognition if someone writes to sysfs online attribute
of a boxed ccw device. The current test will fail, since cu_type != 0
for devices which were recognized before.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index e47aa3f0476..35441fa16be 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -468,7 +468,7 @@ static int online_store_recog_and_online(struct ccw_device *cdev)
 	int ret;
 
 	/* Do device recognition, if needed. */
-	if (cdev->id.cu_type == 0) {
+	if (cdev->private->state == DEV_STATE_BOXED) {
 		ret = ccw_device_recognition(cdev);
 		if (ret) {
 			CIO_MSG_EVENT(0, "Couldn't start recognition "
-- 
cgit v1.2.3


From 9010941c5483a7a5bb1f7d97ee62491fb078bb51 Mon Sep 17 00:00:00 2001
From: Elias Oltmanns <eo@nebensachen.de>
Date: Tue, 31 Mar 2009 20:14:56 +0200
Subject: ide: Fix code dealing with sleeping devices in do_ide_request()

Unfortunately, I missed a catch when reviewing the patch committed as
201bffa4. Here is the fix to the currently broken handling of sleeping
devices. In particular, this is required to get the disk shock
protection code working again.

Reported-by: Christian Thaeter <ct@pipapo.org>
Cc: stable@kernel.org
Signed-off-by: Elias Oltmanns <eo@nebensachen.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-io.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 1adc5e2e7fb..3c52317d852 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -481,11 +481,10 @@ repeat:
 		prev_port = hwif->host->cur_port;
 		hwif->rq = NULL;
 
-		if (drive->dev_flags & IDE_DFLAG_SLEEPING) {
-			if (time_before(drive->sleep, jiffies)) {
-				ide_unlock_port(hwif);
-				goto plug_device;
-			}
+		if (drive->dev_flags & IDE_DFLAG_SLEEPING &&
+		    time_after(drive->sleep, jiffies)) {
+			ide_unlock_port(hwif);
+			goto plug_device;
 		}
 
 		if ((hwif->host->host_flags & IDE_HFLAG_SERIALIZE) &&
-- 
cgit v1.2.3


From 479edf065576aeed7ac99d10838bb3b4f870b5f9 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Tue, 31 Mar 2009 20:14:57 +0200
Subject: ide: drivers/ide/ide-atapi.c needs <linux/scatterlist.h>

On m68k:
| drivers/ide/ide-atapi.c: In function 'ide_io_buffers':
| drivers/ide/ide-atapi.c:87: error: implicit declaration of function 'sg_page'
| drivers/ide/ide-atapi.c:87: warning: passing argument 1 of 'PageHighMem' makes pointer from integer without a cast
| drivers/ide/ide-atapi.c:91: warning: passing argument 1 of 'kmap_atomic' makes pointer from integer without a cast
| drivers/ide/ide-atapi.c:96: error: implicit declaration of function 'sg_virt'
| drivers/ide/ide-atapi.c:96: warning: assignment makes pointer from integer without a cast
| drivers/ide/ide-atapi.c:107: error: implicit declaration of function 'sg_next'
| drivers/ide/ide-atapi.c:107: warning: assignment makes pointer from integer without a cast

[bart: Dmitri Vorobiev submitted similar patch fixing MIPS]

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 2fb5d28a9be..d937e45a677 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -6,6 +6,8 @@
 #include <linux/cdrom.h>
 #include <linux/delay.h>
 #include <linux/ide.h>
+#include <linux/scatterlist.h>
+
 #include <scsi/scsi.h>
 
 #ifdef DEBUG
-- 
cgit v1.2.3


From da19620d99377a52b953245089f831a9c3f049c2 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:14:57 +0200
Subject: at91_ide: fix ->ftf_flags handling

Fix some incorrect IDE_FTFLAG_* changes which slipped in commit
"ide: add "flagged" taskfile flags to struct ide_taskfile (v2)"
(commit 19710d25d50ae0be05eebe4231ed8918b1092d82) few days ago.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/at91_ide.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c
index 27547121daf..6bb45a2b24c 100644
--- a/drivers/ide/at91_ide.c
+++ b/drivers/ide/at91_ide.c
@@ -192,10 +192,10 @@ static void at91_ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_taskfile *tf = &cmd->tf;
 	u8 HIHI = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF;
 
-	if (cmd->tf_flags & IDE_FTFLAG_FLAGGED)
+	if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED)
 		HIHI = 0xFF;
 
-	if (cmd->tf_flags & IDE_FTFLAG_OUT_DATA) {
+	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
 		u16 data = (tf->hob_data << 8) | tf->data;
 
 		at91_ide_output_data(drive, NULL, &data, 2);
@@ -233,7 +233,7 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_io_ports *io_ports = &hwif->io_ports;
 	struct ide_taskfile *tf = &cmd->tf;
 
-	if (cmd->tf_flags & IDE_FTFLAG_IN_DATA) {
+	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
 		u16 data;
 
 		at91_ide_input_data(drive, NULL, &data, 2);
-- 
cgit v1.2.3


From 2eba08270990b99fb5429b76ee97184ddd272f7f Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@gmail.com>
Date: Tue, 31 Mar 2009 20:14:58 +0200
Subject: ide-atapi: start DMA after issuing a packet command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apparently¹, some ATAPI devices want to see the packet command first
before enabling DMA otherwise they simply hang indefinitely. Reorder the
two steps and start DMA only after having issued the command first.

[1] http://marc.info/?l=linux-kernel&m=123835520317235&w=2

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Reported-by: Michael Roth <mroth@nessie.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index d937e45a677..f591166d2c9 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -613,6 +613,10 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive)
 					     : ide_pc_intr),
 			timeout);
 
+	/* Send the actual packet */
+	if ((drive->atapi_flags & IDE_AFLAG_ZIP_DRIVE) == 0)
+		hwif->tp_ops->output_data(drive, NULL, rq->cmd, cmd_len);
+
 	/* Begin DMA, if necessary */
 	if (dev_is_idecd(drive)) {
 		if (drive->dma)
@@ -624,10 +628,6 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive)
 		}
 	}
 
-	/* Send the actual packet */
-	if ((drive->atapi_flags & IDE_AFLAG_ZIP_DRIVE) == 0)
-		hwif->tp_ops->output_data(drive, NULL, rq->cmd, cmd_len);
-
 	return ide_started;
 }
 
-- 
cgit v1.2.3


From 7a00798b1a7502ff31736152b23189138db0b978 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:14:59 +0200
Subject: ide: add support for arbitrary transfer lengths to ide_pio_bytes()

Add support for arbitrary transfer lengths to ide_pio_bytes()
and then inline ide_pio_multi() into ide_pio_datablock().

There should be no functional changes caused by this patch.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-taskfile.c | 73 +++++++++++++++++++++++-----------------------
 1 file changed, 37 insertions(+), 36 deletions(-)

diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 84532be97c0..8d7e87d04b3 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -189,7 +189,7 @@ static u8 wait_drive_not_busy(ide_drive_t *drive)
 }
 
 static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
-			  unsigned int write, unsigned int nr_bytes)
+			  unsigned int write, unsigned int len)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	struct scatterlist *sg = hwif->sg_table;
@@ -202,56 +202,55 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
 	u8 *buf;
 
 	cursg = cmd->cursg;
-	if (!cursg) {
-		cursg = sg;
-		cmd->cursg = sg;
-	}
+	if (cursg == NULL)
+		cursg = cmd->cursg = sg;
+
+	while (len) {
+		unsigned nr_bytes = min(len, cursg->length - cmd->cursg_ofs);
 
-	page = sg_page(cursg);
-	offset = cursg->offset + cmd->cursg_ofs;
+		if (nr_bytes > PAGE_SIZE)
+			nr_bytes = PAGE_SIZE;
 
-	/* get the current page and offset */
-	page = nth_page(page, (offset >> PAGE_SHIFT));
-	offset %= PAGE_SIZE;
+		page = sg_page(cursg);
+		offset = cursg->offset + cmd->cursg_ofs;
+
+		/* get the current page and offset */
+		page = nth_page(page, (offset >> PAGE_SHIFT));
+		offset %= PAGE_SIZE;
 
 #ifdef CONFIG_HIGHMEM
-	local_irq_save(flags);
+		local_irq_save(flags);
 #endif
-	buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset;
+		buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset;
 
-	cmd->nleft -= nr_bytes;
-	cmd->cursg_ofs += nr_bytes;
+		cmd->nleft -= nr_bytes;
+		cmd->cursg_ofs += nr_bytes;
 
-	if (cmd->cursg_ofs == cursg->length) {
-		cmd->cursg = sg_next(cmd->cursg);
-		cmd->cursg_ofs = 0;
-	}
+		if (cmd->cursg_ofs == cursg->length) {
+			cursg = cmd->cursg = sg_next(cmd->cursg);
+			cmd->cursg_ofs = 0;
+		}
 
-	/* do the actual data transfer */
-	if (write)
-		hwif->tp_ops->output_data(drive, cmd, buf, nr_bytes);
-	else
-		hwif->tp_ops->input_data(drive, cmd, buf, nr_bytes);
+		/* do the actual data transfer */
+		if (write)
+			hwif->tp_ops->output_data(drive, cmd, buf, nr_bytes);
+		else
+			hwif->tp_ops->input_data(drive, cmd, buf, nr_bytes);
 
-	kunmap_atomic(buf, KM_BIO_SRC_IRQ);
+		kunmap_atomic(buf, KM_BIO_SRC_IRQ);
 #ifdef CONFIG_HIGHMEM
-	local_irq_restore(flags);
+		local_irq_restore(flags);
 #endif
-}
-
-static void ide_pio_multi(ide_drive_t *drive, struct ide_cmd *cmd,
-			  unsigned int write)
-{
-	unsigned int nsect;
 
-	nsect = min_t(unsigned int, cmd->nleft >> 9, drive->mult_count);
-	while (nsect--)
-		ide_pio_bytes(drive, cmd, write, SECTOR_SIZE);
+		len -= nr_bytes;
+	}
 }
 
 static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd,
 			      unsigned int write)
 {
+	unsigned int nr_bytes;
+
 	u8 saved_io_32bit = drive->io_32bit;
 
 	if (cmd->tf_flags & IDE_TFLAG_FS)
@@ -263,9 +262,11 @@ static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd,
 	touch_softlockup_watchdog();
 
 	if (cmd->tf_flags & IDE_TFLAG_MULTI_PIO)
-		ide_pio_multi(drive, cmd, write);
+		nr_bytes = min_t(unsigned, cmd->nleft, drive->mult_count << 9);
 	else
-		ide_pio_bytes(drive, cmd, write, SECTOR_SIZE);
+		nr_bytes = SECTOR_SIZE;
+
+	ide_pio_bytes(drive, cmd, write, nr_bytes);
 
 	drive->io_32bit = saved_io_32bit;
 }
-- 
cgit v1.2.3


From f2bc316736e69e5623443a010f9581a01429c075 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:14:59 +0200
Subject: ide: use PageHighMem() instead of ifdefs in ide_pio_bytes()

Use PageHighMem() instead of ifdefs in ide_pio_bytes()
(=> local IRQs won't be disabled when not necessary).

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-taskfile.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 8d7e87d04b3..0e333ecf2ad 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -195,9 +195,7 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
 	struct scatterlist *sg = hwif->sg_table;
 	struct scatterlist *cursg = cmd->cursg;
 	struct page *page;
-#ifdef CONFIG_HIGHMEM
 	unsigned long flags;
-#endif
 	unsigned int offset;
 	u8 *buf;
 
@@ -218,9 +216,9 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
 		page = nth_page(page, (offset >> PAGE_SHIFT));
 		offset %= PAGE_SIZE;
 
-#ifdef CONFIG_HIGHMEM
-		local_irq_save(flags);
-#endif
+		if (PageHighMem(page))
+			local_irq_save(flags);
+
 		buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset;
 
 		cmd->nleft -= nr_bytes;
@@ -238,9 +236,9 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
 			hwif->tp_ops->input_data(drive, cmd, buf, nr_bytes);
 
 		kunmap_atomic(buf, KM_BIO_SRC_IRQ);
-#ifdef CONFIG_HIGHMEM
-		local_irq_restore(flags);
-#endif
+
+		if (PageHighMem(page))
+			local_irq_restore(flags);
 
 		len -= nr_bytes;
 	}
-- 
cgit v1.2.3


From 116e690f4e69ce0458a9be7010c80b59eb7a99d8 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:14:59 +0200
Subject: ide-cd: remove dead URLs

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 3f630e4080d..a71ca2a9ab4 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -12,12 +12,9 @@
  * See Documentation/cdrom/ide-cd for usage information.
  *
  * Suggestions are welcome. Patches that work are more welcome though. ;-)
- * For those wishing to work on this driver, please be sure you download
- * and comply with the latest Mt. Fuji (SFF8090 version 4) and ATAPI
- * (SFF-8020i rev 2.6) standards. These documents can be obtained by
- * anonymous ftp from:
- * ftp://fission.dt.wdc.com/pub/standards/SFF_atapi/spec/SFF8020-r2.6/PS/8020r26.ps
- * ftp://ftp.avc-pioneer.com/Mtfuji4/Spec/Fuji4r10.pdf
+ *
+ * Documentation:
+ *	Mt. Fuji (SFF8090 version 4) and ATAPI (SFF-8020i rev 2.6) standards.
  *
  * For historical changelog please see:
  *	Documentation/ide/ChangeLog.ide-cd.1994-2004
-- 
cgit v1.2.3


From bf12a9c1c95e1b0204fc2fc9fe625a056e284f5a Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:00 +0200
Subject: ide-cd: use ide_end_rq() also for failed non-fs requests

Use ide_end_rq() also for failed non-fs requests on completion
of REQUEST SENSE requests + use blk_rq_bytes() while at it.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index a71ca2a9ab4..6f64fb2f63d 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -265,18 +265,10 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate)
 				failed->sense_len = rq->sense_len;
 			}
 			cdrom_analyze_sense_data(drive, failed, sense);
-			/*
-			 * now end the failed request
-			 */
-			if (blk_fs_request(failed)) {
-				if (ide_end_rq(drive, failed, -EIO,
-						failed->hard_nr_sectors << 9))
-					BUG();
-			} else {
-				if (blk_end_request(failed, -EIO,
-						    failed->data_len))
-					BUG();
-			}
+
+			if (ide_end_rq(drive, failed, -EIO,
+				       blk_rq_bytes(failed)))
+				BUG();
 		} else
 			cdrom_analyze_sense_data(drive, NULL, sense);
 	}
-- 
cgit v1.2.3


From 13eae6a48fc57495eb9e733430b8fc20df7bf415 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:00 +0200
Subject: ide-cd: remove dead code from cdrom_decode_status()

There should be no functional changes caused by this patch.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 6f64fb2f63d..4a289711c55 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -321,12 +321,6 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret)
 	err = ide_read_error(drive);
 	sense_key = err >> 4;
 
-	if (rq == NULL) {
-		printk(KERN_ERR PFX "%s: missing rq in %s\n",
-				drive->name, __func__);
-		return 1;
-	}
-
 	ide_debug_log(IDE_DBG_RQ, "stat: 0x%x, good_stat: 0x%x, cmd[0]: 0x%x, "
 				  "rq->cmd_type: 0x%x, err: 0x%x",
 				  stat, good_stat, rq->cmd[0], rq->cmd_type,
-- 
cgit v1.2.3


From 1ab6d7451684078bfc4fbabc432f0ef8a809e975 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:00 +0200
Subject: ide-cd: remove needless ide_dump_status_no_sense() wrapper

It makes no sense to check for BSY bit being set as earlier OK_STAT()
check in cdrom_end_request() makes sure that BSY bit is cleared.

There should be no functional changes caused by this patch.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 4a289711c55..19ccadead5e 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -290,13 +290,6 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate)
 	ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9);
 }
 
-static void ide_dump_status_no_sense(ide_drive_t *drive, const char *msg, u8 st)
-{
-	if (st & 0x80)
-		return;
-	ide_dump_status(drive, msg, st);
-}
-
 /*
  * Returns:
  * 0: if the request should be continued.
@@ -439,21 +432,19 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret)
 			 * No point in retrying after an illegal request or data
 			 * protect error.
 			 */
-			ide_dump_status_no_sense(drive, "command error", stat);
+			ide_dump_status(drive, "command error", stat);
 			do_end_request = 1;
 		} else if (sense_key == MEDIUM_ERROR) {
 			/*
 			 * No point in re-trying a zillion times on a bad
 			 * sector. If we got here the error is not correctable.
 			 */
-			ide_dump_status_no_sense(drive,
-						 "media error (bad sector)",
-						 stat);
+			ide_dump_status(drive, "media error (bad sector)",
+					stat);
 			do_end_request = 1;
 		} else if (sense_key == BLANK_CHECK) {
 			/* disk appears blank ?? */
-			ide_dump_status_no_sense(drive, "media error (blank)",
-						 stat);
+			ide_dump_status(drive, "media error (blank)", stat);
 			do_end_request = 1;
 		} else if ((err & ~ATA_ABORTED) != 0) {
 			/* go to the default handler for other errors */
-- 
cgit v1.2.3


From 6041e8fba8b9a9a64bd7402be700b0f1247a9c55 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:01 +0200
Subject: ide-cd: remove no longer needed 'ignore' module parameter

ide-scsi is gone...

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 19ccadead5e..9d3150f549f 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1942,9 +1942,6 @@ static struct block_device_operations idecd_ops = {
 };
 
 /* module options */
-static char *ignore;
-module_param(ignore, charp, 0400);
-
 static unsigned long debug_mask;
 module_param(debug_mask, ulong, 0644);
 
@@ -1965,15 +1962,6 @@ static int ide_cd_probe(ide_drive_t *drive)
 	if (drive->media != ide_cdrom && drive->media != ide_optical)
 		goto failed;
 
-	/* skip drives that we were told to ignore */
-	if (ignore != NULL) {
-		if (strstr(ignore, drive->name)) {
-			printk(KERN_INFO PFX "ignoring drive %s\n",
-					 drive->name);
-			goto failed;
-		}
-	}
-
 	drive->debug_mask = debug_mask;
 	drive->irq_handler = cdrom_newpc_intr;
 
-- 
cgit v1.2.3


From 299c4852fc6995e0665d246927d25cefd4dad754 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:01 +0200
Subject: ide-cd: factor out failed request completion from cdrom_end_request()

There should be no functional changes caused by this patch.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 47 +++++++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 9d3150f549f..eb0c2fe2b87 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -242,6 +242,29 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 	elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
 }
 
+static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
+{
+	/*
+	 * For REQ_TYPE_SENSE, "rq->buffer" points to the original
+	 * failed request
+	 */
+	struct request *failed = (struct request *)rq->buffer;
+	struct cdrom_info *info = drive->driver_data;
+	void *sense = &info->sense_data;
+
+	if (failed) {
+		if (failed->sense) {
+			sense = failed->sense;
+			failed->sense_len = rq->sense_len;
+		}
+		cdrom_analyze_sense_data(drive, failed, sense);
+
+		if (ide_end_rq(drive, failed, -EIO, blk_rq_bytes(failed)))
+			BUG();
+	} else
+		cdrom_analyze_sense_data(drive, NULL, sense);
+}
+
 static void cdrom_end_request(ide_drive_t *drive, int uptodate)
 {
 	struct request *rq = drive->hwif->rq;
@@ -250,28 +273,8 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate)
 	ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, uptodate: 0x%x, nsectors: %d",
 				    rq->cmd[0], uptodate, nsectors);
 
-	if (blk_sense_request(rq) && uptodate) {
-		/*
-		 * For REQ_TYPE_SENSE, "rq->buffer" points to the original
-		 * failed request
-		 */
-		struct request *failed = (struct request *) rq->buffer;
-		struct cdrom_info *info = drive->driver_data;
-		void *sense = &info->sense_data;
-
-		if (failed) {
-			if (failed->sense) {
-				sense = failed->sense;
-				failed->sense_len = rq->sense_len;
-			}
-			cdrom_analyze_sense_data(drive, failed, sense);
-
-			if (ide_end_rq(drive, failed, -EIO,
-				       blk_rq_bytes(failed)))
-				BUG();
-		} else
-			cdrom_analyze_sense_data(drive, NULL, sense);
-	}
+	if (blk_sense_request(rq) && uptodate)
+		ide_cd_complete_failed_rq(drive, rq);
 
 	if (!rq->current_nr_sectors && blk_fs_request(rq))
 		uptodate = 1;
-- 
cgit v1.2.3


From e0458ccb0668edbecbc1ae1c17ed58a6b1a4ff3e Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:02 +0200
Subject: ide-cd: unify ide_cd_do_request() exit paths

* Move cdrom_end_request() calls from cdrom_start_rw()
  and ide_cd_prepare_rw_request() to ide_cd_do_request().

* Unify ide_cd_do_request() exit paths.

There should be no functional changes caused by this patch.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index eb0c2fe2b87..3e3058cff84 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -595,7 +595,6 @@ static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive,
 				printk(KERN_ERR PFX "%s: %s: buffer botch (%u)\n",
 						drive->name, __func__,
 						rq->current_nr_sectors);
-				cdrom_end_request(drive, 0);
 				return ide_stopped;
 			}
 			rq->current_nr_sectors += nskip;
@@ -972,10 +971,8 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 
 	if (write) {
 		/* disk has become write protected */
-		if (get_disk_ro(cd->disk)) {
-			cdrom_end_request(drive, 0);
+		if (get_disk_ro(cd->disk))
 			return ide_stopped;
-		}
 	} else {
 		/*
 		 * We may be retrying this request after an error.  Fix up any
@@ -987,10 +984,8 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 	/* use DMA, if possible / writes *must* be hardware frame aligned */
 	if ((rq->nr_sectors & (sectors_per_frame - 1)) ||
 	    (rq->sector & (sectors_per_frame - 1))) {
-		if (write) {
-			cdrom_end_request(drive, 0);
+		if (write)
 			return ide_stopped;
-		}
 		drive->dma = 0;
 	} else
 		drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
@@ -1045,6 +1040,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 					sector_t block)
 {
 	struct ide_cmd cmd;
+	int uptodate = 0;
 
 	ide_debug_log(IDE_DBG_RQ, "cmd: 0x%x, block: %llu",
 				  rq->cmd[0], (unsigned long long)block);
@@ -1053,11 +1049,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		blk_dump_rq_flags(rq, "ide_cd_do_request");
 
 	if (blk_fs_request(rq)) {
-		if (cdrom_start_rw(drive, rq) == ide_stopped)
-			return ide_stopped;
-
-		if (ide_cd_prepare_rw_request(drive, rq) == ide_stopped)
-			return ide_stopped;
+		if (cdrom_start_rw(drive, rq) == ide_stopped ||
+		    ide_cd_prepare_rw_request(drive, rq) == ide_stopped)
+			goto out_end;
 	} else if (blk_sense_request(rq) || blk_pc_request(rq) ||
 		   rq->cmd_type == REQ_TYPE_ATA_PC) {
 		if (!rq->timeout)
@@ -1066,12 +1060,11 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		cdrom_do_block_pc(drive, rq);
 	} else if (blk_special_request(rq)) {
 		/* right now this can only be a reset... */
-		cdrom_end_request(drive, 1);
-		return ide_stopped;
+		uptodate = 1;
+		goto out_end;
 	} else {
 		blk_dump_rq_flags(rq, DRV_NAME " bad flags");
-		cdrom_end_request(drive, 0);
-		return ide_stopped;
+		goto out_end;
 	}
 
 	memset(&cmd, 0, sizeof(cmd));
@@ -1082,6 +1075,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 	cmd.rq = rq;
 
 	return ide_issue_pc(drive, &cmd);
+out_end:
+	cdrom_end_request(drive, uptodate);
+	return ide_stopped;
 }
 
 /*
-- 
cgit v1.2.3


From 984c5e5974227d2d4dba58cdf19af641f89be83f Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:03 +0200
Subject: ide-cd: move setting REQ_FAILED flag out from 'end_request' exit path

Move setting REQ_FAILED flag out from 'end_request' exit path in
cdrom_newpc_intr() and also rename 'end_request' to 'out_end'.

There should be no functional changes caused by this patch.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 3e3058cff84..b66da3f1678 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -776,7 +776,9 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 			ide_complete_rq(drive, 0, 512);
 			return ide_stopped;
 		}
-		goto end_request;
+		if (blk_pc_request(rq) == 0 && uptodate == 0)
+			rq->cmd_flags |= REQ_FAILED;
+		goto out_end;
 	}
 
 	ide_read_bcount_and_ireason(drive, &len, &ireason);
@@ -811,8 +813,10 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 			ide_cd_request_sense_fixup(drive, rq);
 			/* complain if we still have data left to transfer */
 			uptodate = rq->data_len ? 0 : 1;
+			if (uptodate == 0)
+				rq->cmd_flags |= REQ_FAILED;
 		}
-		goto end_request;
+		goto out_end;
 	}
 
 	/* check which way to transfer data */
@@ -939,7 +943,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	ide_set_handler(drive, cdrom_newpc_intr, timeout);
 	return ide_started;
 
-end_request:
+out_end:
 	if (blk_pc_request(rq)) {
 		unsigned int dlen = rq->data_len;
 
@@ -951,8 +955,6 @@ end_request:
 
 		hwif->rq = NULL;
 	} else {
-		if (!uptodate)
-			rq->cmd_flags |= REQ_FAILED;
 		cdrom_end_request(drive, uptodate);
 	}
 	return ide_stopped;
-- 
cgit v1.2.3


From 8a116974852a727bdfe6b1b897102903a17228a5 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:03 +0200
Subject: ide-cd: unify cdrom_newpc_intr() exit paths

* Move cdrom_end_request() calls from cdrom_decode_status()
  and ide_cd_check_ireason() to cdrom_newpc_intr().

* Unify cdrom_newpc_intr() exit paths.

There should be no functional changes caused by this patch.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 48 +++++++++++++++++++++++-------------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index b66da3f1678..4c32e8db55c 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -296,7 +296,8 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate)
 /*
  * Returns:
  * 0: if the request should be continued.
- * 1: if the request was ended.
+ * 1: if the request will be going through error recovery.
+ * 2: if the request should be ended.
  */
 static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret)
 {
@@ -329,10 +330,7 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret)
 		 * Just give up.
 		 */
 		rq->cmd_flags |= REQ_FAILED;
-		cdrom_end_request(drive, 0);
-		ide_error(drive, "request sense failure", stat);
-		return 1;
-
+		return 2;
 	} else if (blk_pc_request(rq) || rq->cmd_type == REQ_TYPE_ATA_PC) {
 		/* All other functions, except for READ. */
 
@@ -472,14 +470,12 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret)
 		 */
 		if (stat & ATA_ERR)
 			cdrom_queue_request_sense(drive, NULL, NULL);
+		return 1;
 	} else {
 		blk_dump_rq_flags(rq, PFX "bad rq");
-		cdrom_end_request(drive, 0);
+		return 2;
 	}
 
-	/* retry, or handle the next request */
-	return 1;
-
 end_request:
 	if (stat & ATA_ERR) {
 		struct request_queue *q = drive->queue;
@@ -492,10 +488,9 @@ end_request:
 		hwif->rq = NULL;
 
 		cdrom_queue_request_sense(drive, rq->sense, rq);
+		return 1;
 	} else
-		cdrom_end_request(drive, 0);
-
-	return 1;
+		return 2;
 }
 
 /*
@@ -539,7 +534,6 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq,
 	if (rq->cmd_type == REQ_TYPE_ATA_PC)
 		rq->cmd_flags |= REQ_FAILED;
 
-	cdrom_end_request(drive, 0);
 	return -1;
 }
 
@@ -741,7 +735,8 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	xfer_func_t *xferfunc;
 	ide_expiry_t *expiry = NULL;
 	int dma_error = 0, dma, stat, thislen, uptodate = 0;
-	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0;
+	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc;
+	int sense = blk_sense_request(rq);
 	unsigned int timeout;
 	u16 len;
 	u8 ireason;
@@ -761,8 +756,12 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 		}
 	}
 
-	if (cdrom_decode_status(drive, 0, &stat))
+	rc = cdrom_decode_status(drive, 0, &stat);
+	if (rc) {
+		if (rc == 2)
+			goto out_end;
 		return ide_stopped;
+	}
 
 	/* using dma, transfer is complete now */
 	if (dma) {
@@ -807,8 +806,6 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 					rq->cmd_flags |= REQ_FAILED;
 				uptodate = 0;
 			}
-			cdrom_end_request(drive, uptodate);
-			return ide_stopped;
 		} else if (!blk_pc_request(rq)) {
 			ide_cd_request_sense_fixup(drive, rq);
 			/* complain if we still have data left to transfer */
@@ -820,17 +817,16 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	}
 
 	/* check which way to transfer data */
-	if (ide_cd_check_ireason(drive, rq, len, ireason, write))
-		return ide_stopped;
+	rc = ide_cd_check_ireason(drive, rq, len, ireason, write);
+	if (rc)
+		goto out_end;
 
 	if (blk_fs_request(rq)) {
 		if (write == 0) {
 			int nskip;
 
-			if (ide_cd_check_transfer_size(drive, len)) {
-				cdrom_end_request(drive, 0);
-				return ide_stopped;
-			}
+			if (ide_cd_check_transfer_size(drive, len))
+				goto out_end;
 
 			/*
 			 * First, figure out if we need to bit-bucket
@@ -923,7 +919,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 			else
 				rq->data += blen;
 		}
-		if (!write && blk_sense_request(rq))
+		if (sense && write == 0)
 			rq->sense_len += blen;
 	}
 
@@ -944,7 +940,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	return ide_started;
 
 out_end:
-	if (blk_pc_request(rq)) {
+	if (blk_pc_request(rq) && rc == 0) {
 		unsigned int dlen = rq->data_len;
 
 		if (dma)
@@ -956,6 +952,8 @@ out_end:
 		hwif->rq = NULL;
 	} else {
 		cdrom_end_request(drive, uptodate);
+		if (sense && rc == 2)
+			ide_error(drive, "request sense failure", stat);
 	}
 	return ide_stopped;
 }
-- 
cgit v1.2.3


From f63174e7a7ba3afa7f53e61c59b3f1ca5d88f3fb Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:04 +0200
Subject: ide-cd: remove cdrom_end_request()

Inline cdrom_end_request() into cdrom_newpc_intr()
and ide_cd_do_request().

There should be no functional changes caused by this patch.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 81 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 47 insertions(+), 34 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 4c32e8db55c..c859eafe759 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -265,34 +265,6 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 		cdrom_analyze_sense_data(drive, NULL, sense);
 }
 
-static void cdrom_end_request(ide_drive_t *drive, int uptodate)
-{
-	struct request *rq = drive->hwif->rq;
-	int nsectors = rq->hard_cur_sectors;
-
-	ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, uptodate: 0x%x, nsectors: %d",
-				    rq->cmd[0], uptodate, nsectors);
-
-	if (blk_sense_request(rq) && uptodate)
-		ide_cd_complete_failed_rq(drive, rq);
-
-	if (!rq->current_nr_sectors && blk_fs_request(rq))
-		uptodate = 1;
-	/* make sure it's fully ended */
-	if (blk_pc_request(rq))
-		nsectors = (rq->data_len + 511) >> 9;
-	if (!nsectors)
-		nsectors = 1;
-
-	ide_debug_log(IDE_DBG_FUNC, "uptodate: 0x%x, nsectors: %d",
-				    uptodate, nsectors);
-
-	if (blk_fs_request(rq) == 0 && uptodate <= 0 && rq->errors == 0)
-		rq->errors = -EIO;
-
-	ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9);
-}
-
 /*
  * Returns:
  * 0: if the request should be continued.
@@ -735,7 +707,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	xfer_func_t *xferfunc;
 	ide_expiry_t *expiry = NULL;
 	int dma_error = 0, dma, stat, thislen, uptodate = 0;
-	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc;
+	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc, nsectors;
 	int sense = blk_sense_request(rq);
 	unsigned int timeout;
 	u16 len;
@@ -902,8 +874,14 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 			rq->current_nr_sectors -= (blen >> 9);
 			rq->sector += (blen >> 9);
 
-			if (rq->current_nr_sectors == 0 && rq->nr_sectors)
-				cdrom_end_request(drive, 1);
+			if (rq->current_nr_sectors == 0 && rq->nr_sectors) {
+				nsectors = rq->hard_cur_sectors;
+
+				if (nsectors == 0)
+					nsectors = 1;
+
+				ide_complete_rq(drive, 0, nsectors << 9);
+			}
 		} else {
 			rq->data_len -= blen;
 
@@ -951,7 +929,28 @@ out_end:
 
 		hwif->rq = NULL;
 	} else {
-		cdrom_end_request(drive, uptodate);
+		if (sense && uptodate)
+			ide_cd_complete_failed_rq(drive, rq);
+
+		if (blk_fs_request(rq)) {
+			if (rq->current_nr_sectors == 0)
+				uptodate = 1;
+		} else {
+			if (uptodate <= 0 && rq->errors == 0)
+				rq->errors = -EIO;
+		}
+
+		/* make sure it's fully ended */
+		if (blk_pc_request(rq))
+			nsectors = (rq->data_len + 511) >> 9;
+		else
+			nsectors = rq->hard_cur_sectors;
+
+		if (nsectors == 0)
+			nsectors = 1;
+
+		ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9);
+
 		if (sense && rc == 2)
 			ide_error(drive, "request sense failure", stat);
 	}
@@ -1040,7 +1039,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 					sector_t block)
 {
 	struct ide_cmd cmd;
-	int uptodate = 0;
+	int uptodate = 0, nsectors;
 
 	ide_debug_log(IDE_DBG_RQ, "cmd: 0x%x, block: %llu",
 				  rq->cmd[0], (unsigned long long)block);
@@ -1076,7 +1075,21 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 	return ide_issue_pc(drive, &cmd);
 out_end:
-	cdrom_end_request(drive, uptodate);
+	if (blk_fs_request(rq)) {
+		if (rq->current_nr_sectors == 0)
+			uptodate = 1;
+	} else {
+		if (uptodate <= 0 && rq->errors == 0)
+			rq->errors = -EIO;
+	}
+
+	nsectors = rq->hard_cur_sectors;
+
+	if (nsectors == 0)
+		nsectors = 1;
+
+	ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9);
+
 	return ide_stopped;
 }
 
-- 
cgit v1.2.3


From c4c69e21b51005e24e2fc4efc8a73460a5ab7799 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:04 +0200
Subject: ide-cd: kill whole failed request in ide_cd_do_request()

Untangling cdrom_end_request() uncovered an error in completing
failed requests in ide_cd_do_request().  Fix it.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index c859eafe759..978e1c0c172 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1083,7 +1083,7 @@ out_end:
 			rq->errors = -EIO;
 	}
 
-	nsectors = rq->hard_cur_sectors;
+	nsectors = rq->hard_nr_sectors;
 
 	if (nsectors == 0)
 		nsectors = 1;
-- 
cgit v1.2.3


From 5ed57ad705d6b58386ac43d2ca1c8fc66aee1101 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:05 +0200
Subject: ide-cd: cleanup ide_cd_do_request()

There should be no functional changes caused by this patch.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 978e1c0c172..30113e69c8b 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1049,8 +1049,11 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 	if (blk_fs_request(rq)) {
 		if (cdrom_start_rw(drive, rq) == ide_stopped ||
-		    ide_cd_prepare_rw_request(drive, rq) == ide_stopped)
+		    ide_cd_prepare_rw_request(drive, rq) == ide_stopped) {
+			if (rq->current_nr_sectors == 0)
+				uptodate = 1;
 			goto out_end;
+		}
 	} else if (blk_sense_request(rq) || blk_pc_request(rq) ||
 		   rq->cmd_type == REQ_TYPE_ATA_PC) {
 		if (!rq->timeout)
@@ -1063,6 +1066,8 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		goto out_end;
 	} else {
 		blk_dump_rq_flags(rq, DRV_NAME " bad flags");
+		if (rq->errors == 0)
+			rq->errors = -EIO;
 		goto out_end;
 	}
 
@@ -1075,14 +1080,6 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 	return ide_issue_pc(drive, &cmd);
 out_end:
-	if (blk_fs_request(rq)) {
-		if (rq->current_nr_sectors == 0)
-			uptodate = 1;
-	} else {
-		if (uptodate <= 0 && rq->errors == 0)
-			rq->errors = -EIO;
-	}
-
 	nsectors = rq->hard_nr_sectors;
 
 	if (nsectors == 0)
-- 
cgit v1.2.3


From a08915ba594da66145f33a972db578a58b9135f1 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:13 +0200
Subject: ide-cd: use scatterlists for PIO transfers (fs requests)

* Export ide_pio_bytes().

* Add ->last_xfer_len field to struct ide_cmd.

* Add ide_cd_error_cmd() helper to ide-cd.

* Convert ide-cd to use scatterlists also for PIO transfers (fs requests
  only for now) and get rid of partial completions (except when the error
  happens -- which is still subject to change later because looking at
  ATAPI spec it seems that the device is free to error the whole transfer
  with setting the Error bit only on the last transfer chunk).

* Update ide_cd_{prepare_rw,restore_request,do_request}() accordingly.

* Inline ide_cd_restore_request() into cdrom_start_rw().

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c       | 146 +++++++++++++--------------------------------
 drivers/ide/ide-taskfile.c |   5 +-
 include/linux/ide.h        |   4 ++
 3 files changed, 50 insertions(+), 105 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 30113e69c8b..5f15859c2c7 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -539,64 +539,12 @@ static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive,
 {
 	ide_debug_log(IDE_DBG_RQ, "rq->cmd_flags: 0x%x", rq->cmd_flags);
 
-	if (rq_data_dir(rq) == READ) {
-		unsigned short sectors_per_frame =
-			queue_hardsect_size(drive->queue) >> SECTOR_BITS;
-		int nskip = rq->sector & (sectors_per_frame - 1);
-
-		/*
-		 * If the requested sector doesn't start on a frame boundary,
-		 * we must adjust the start of the transfer so that it does,
-		 * and remember to skip the first few sectors.
-		 *
-		 * If the rq->current_nr_sectors field is larger than the size
-		 * of the buffer, it will mean that we're to skip a number of
-		 * sectors equal to the amount by which rq->current_nr_sectors
-		 * is larger than the buffer size.
-		 */
-		if (nskip > 0) {
-			/* sanity check... */
-			if (rq->current_nr_sectors !=
-			    bio_cur_sectors(rq->bio)) {
-				printk(KERN_ERR PFX "%s: %s: buffer botch (%u)\n",
-						drive->name, __func__,
-						rq->current_nr_sectors);
-				return ide_stopped;
-			}
-			rq->current_nr_sectors += nskip;
-		}
-	}
-
 	/* set up the command */
 	rq->timeout = ATAPI_WAIT_PC;
 
 	return ide_started;
 }
 
-/*
- * Fix up a possibly partially-processed request so that we can start it over
- * entirely, or even put it back on the request queue.
- */
-static void ide_cd_restore_request(ide_drive_t *drive, struct request *rq)
-{
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	if (rq->buffer != bio_data(rq->bio)) {
-		sector_t n =
-			(rq->buffer - (char *)bio_data(rq->bio)) / SECTOR_SIZE;
-
-		rq->buffer = bio_data(rq->bio);
-		rq->nr_sectors += n;
-		rq->sector -= n;
-	}
-	rq->current_nr_sectors = bio_cur_sectors(rq->bio);
-	rq->hard_cur_sectors = rq->current_nr_sectors;
-	rq->hard_nr_sectors = rq->nr_sectors;
-	rq->hard_sector = rq->sector;
-	rq->q->prep_rq_fn(rq->q, rq);
-}
-
 static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq)
 {
 	ide_debug_log(IDE_DBG_FUNC, "rq->cmd[0]: 0x%x", rq->cmd[0]);
@@ -690,6 +638,17 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 	return (flags & REQ_FAILED) ? -EIO : 0;
 }
 
+static void ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd)
+{
+	unsigned int nr_bytes = cmd->nbytes - cmd->nleft;
+
+	if (cmd->tf_flags & IDE_TFLAG_WRITE)
+		nr_bytes -= cmd->last_xfer_len;
+
+	if (nr_bytes > 0)
+		ide_complete_rq(drive, 0, nr_bytes);
+}
+
 /*
  * Called from blk_end_request_callback() after the data of the request is
  * completed and before the request itself is completed. By returning value '1',
@@ -703,6 +662,7 @@ static int cdrom_newpc_intr_dummy_cb(struct request *rq)
 static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
+	struct ide_cmd *cmd = &hwif->cmd;
 	struct request *rq = hwif->rq;
 	xfer_func_t *xferfunc;
 	ide_expiry_t *expiry = NULL;
@@ -769,11 +729,10 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 			 * Otherwise, complete the command normally.
 			 */
 			uptodate = 1;
-			if (rq->current_nr_sectors > 0) {
+			if (cmd->nleft > 0) {
 				printk(KERN_ERR PFX "%s: %s: data underrun "
-						"(%d blocks)\n",
-						drive->name, __func__,
-						rq->current_nr_sectors);
+					"(%u bytes)\n", drive->name, __func__,
+					cmd->nleft);
 				if (!write)
 					rq->cmd_flags |= REQ_FAILED;
 				uptodate = 0;
@@ -795,24 +754,10 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 	if (blk_fs_request(rq)) {
 		if (write == 0) {
-			int nskip;
-
 			if (ide_cd_check_transfer_size(drive, len))
 				goto out_end;
-
-			/*
-			 * First, figure out if we need to bit-bucket
-			 * any of the leading sectors.
-			 */
-			nskip = min_t(int, rq->current_nr_sectors
-					   - bio_cur_sectors(rq->bio),
-					   thislen >> 9);
-			if (nskip > 0) {
-				ide_pad_transfer(drive, write, nskip << 9);
-				rq->current_nr_sectors -= nskip;
-				thislen -= (nskip << 9);
-			}
 		}
+		cmd->last_xfer_len = 0;
 	}
 
 	if (ireason == 0) {
@@ -835,15 +780,15 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 		/* bio backed? */
 		if (rq->bio) {
 			if (blk_fs_request(rq)) {
-				ptr = rq->buffer;
-				blen = rq->current_nr_sectors << 9;
+				blen = min_t(int, thislen, cmd->nleft);
 			} else {
 				ptr = bio_data(rq->bio);
 				blen = bio_iovec(rq->bio)->bv_len;
 			}
 		}
 
-		if (!ptr) {
+		if ((blk_fs_request(rq) && cmd->nleft == 0) ||
+		    (blk_fs_request(rq) == 0 && ptr == NULL)) {
 			if (blk_fs_request(rq) && !write)
 				/*
 				 * If the buffers are full, pipe the rest into
@@ -863,26 +808,16 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 		if (blen > thislen)
 			blen = thislen;
 
-		xferfunc(drive, NULL, ptr, blen);
+		if (blk_fs_request(rq)) {
+			ide_pio_bytes(drive, cmd, write, blen);
+			cmd->last_xfer_len += blen;
+		} else
+			xferfunc(drive, NULL, ptr, blen);
 
 		thislen -= blen;
 		len -= blen;
 
-		if (blk_fs_request(rq)) {
-			rq->buffer += blen;
-			rq->nr_sectors -= (blen >> 9);
-			rq->current_nr_sectors -= (blen >> 9);
-			rq->sector += (blen >> 9);
-
-			if (rq->current_nr_sectors == 0 && rq->nr_sectors) {
-				nsectors = rq->hard_cur_sectors;
-
-				if (nsectors == 0)
-					nsectors = 1;
-
-				ide_complete_rq(drive, 0, nsectors << 9);
-			}
-		} else {
+		if (blk_fs_request(rq) == 0) {
 			rq->data_len -= blen;
 
 			/*
@@ -933,8 +868,10 @@ out_end:
 			ide_cd_complete_failed_rq(drive, rq);
 
 		if (blk_fs_request(rq)) {
-			if (rq->current_nr_sectors == 0)
+			if (cmd->nleft == 0)
 				uptodate = 1;
+			if (uptodate == 0)
+				ide_cd_error_cmd(drive, cmd);
 		} else {
 			if (uptodate <= 0 && rq->errors == 0)
 				rq->errors = -EIO;
@@ -944,7 +881,7 @@ out_end:
 		if (blk_pc_request(rq))
 			nsectors = (rq->data_len + 511) >> 9;
 		else
-			nsectors = rq->hard_cur_sectors;
+			nsectors = rq->hard_nr_sectors;
 
 		if (nsectors == 0)
 			nsectors = 1;
@@ -960,9 +897,10 @@ out_end:
 static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 {
 	struct cdrom_info *cd = drive->driver_data;
+	struct request_queue *q = drive->queue;
 	int write = rq_data_dir(rq) == WRITE;
 	unsigned short sectors_per_frame =
-		queue_hardsect_size(drive->queue) >> SECTOR_BITS;
+		queue_hardsect_size(q) >> SECTOR_BITS;
 
 	ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, write: 0x%x, "
 				  "secs_per_frame: %u",
@@ -977,17 +915,16 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 		 * We may be retrying this request after an error.  Fix up any
 		 * weirdness which might be present in the request packet.
 		 */
-		ide_cd_restore_request(drive, rq);
+		q->prep_rq_fn(q, rq);
 	}
 
-	/* use DMA, if possible / writes *must* be hardware frame aligned */
+	/* fs requests *must* be hardware frame aligned */
 	if ((rq->nr_sectors & (sectors_per_frame - 1)) ||
-	    (rq->sector & (sectors_per_frame - 1))) {
-		if (write)
-			return ide_stopped;
-		drive->dma = 0;
-	} else
-		drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
+	    (rq->sector & (sectors_per_frame - 1)))
+		return ide_stopped;
+
+	/* use DMA, if possible */
+	drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
 
 	if (write)
 		cd->devinfo.media_written = 1;
@@ -1050,8 +987,6 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 	if (blk_fs_request(rq)) {
 		if (cdrom_start_rw(drive, rq) == ide_stopped ||
 		    ide_cd_prepare_rw_request(drive, rq) == ide_stopped) {
-			if (rq->current_nr_sectors == 0)
-				uptodate = 1;
 			goto out_end;
 		}
 	} else if (blk_sense_request(rq) || blk_pc_request(rq) ||
@@ -1078,6 +1013,11 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 	cmd.rq = rq;
 
+	if (blk_fs_request(rq)) {
+		ide_init_sg_cmd(&cmd, rq->nr_sectors << 9);
+		ide_map_sg(drive, &cmd);
+	}
+
 	return ide_issue_pc(drive, &cmd);
 out_end:
 	nsectors = rq->hard_nr_sectors;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 0e333ecf2ad..a3b7a50562b 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -188,8 +188,8 @@ static u8 wait_drive_not_busy(ide_drive_t *drive)
 	return stat;
 }
 
-static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
-			  unsigned int write, unsigned int len)
+void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
+		   unsigned int write, unsigned int len)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	struct scatterlist *sg = hwif->sg_table;
@@ -243,6 +243,7 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
 		len -= nr_bytes;
 	}
 }
+EXPORT_SYMBOL_GPL(ide_pio_bytes);
 
 static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd,
 			      unsigned int write)
diff --git a/include/linux/ide.h b/include/linux/ide.h
index d5d832271f4..c2841c0c36c 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -352,6 +352,8 @@ struct ide_cmd {
 
 	unsigned int		nbytes;
 	unsigned int		nleft;
+	unsigned int		last_xfer_len;
+
 	struct scatterlist	*cursg;
 	unsigned int		cursg_ofs;
 
@@ -1226,6 +1228,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *, struct ide_cmd *);
 
 ide_startstop_t do_rw_taskfile(ide_drive_t *, struct ide_cmd *);
 
+void ide_pio_bytes(ide_drive_t *, struct ide_cmd *, unsigned int, unsigned int);
+
 void ide_finish_cmd(ide_drive_t *, struct ide_cmd *, u8);
 
 int ide_raw_taskfile(ide_drive_t *, struct ide_cmd *, u8 *, u16);
-- 
cgit v1.2.3


From 06a449e30135aabb6686c95bf0c42b46d169a3b3 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:13 +0200
Subject: ide-cd: fix non-SECTOR_SIZE-multiples PIO transfers for fs requests

We now support arbitrary number of bytes per-IRQ also for fs requests
so remove ide_cd_check_transfer_size() and IDE_AFLAG_LIMIT_NFRAMES.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 36 +-----------------------------------
 include/linux/ide.h  |  5 -----
 2 files changed, 1 insertion(+), 40 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 5f15859c2c7..c0cefe5becf 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -509,31 +509,6 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq,
 	return -1;
 }
 
-/*
- * Assume that the drive will always provide data in multiples of at least
- * SECTOR_SIZE, as it gets hairy to keep track of the transfers otherwise.
- */
-static int ide_cd_check_transfer_size(ide_drive_t *drive, int len)
-{
-	ide_debug_log(IDE_DBG_FUNC, "len: %d", len);
-
-	if ((len % SECTOR_SIZE) == 0)
-		return 0;
-
-	printk(KERN_ERR PFX "%s: %s: Bad transfer size %d\n", drive->name,
-			__func__, len);
-
-	if (drive->atapi_flags & IDE_AFLAG_LIMIT_NFRAMES)
-		printk(KERN_ERR PFX "This drive is not supported by this "
-				"version of the driver\n");
-	else {
-		printk(KERN_ERR PFX "Trying to limit transfer sizes\n");
-		drive->atapi_flags |= IDE_AFLAG_LIMIT_NFRAMES;
-	}
-
-	return 1;
-}
-
 static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive,
 						 struct request *rq)
 {
@@ -752,13 +727,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	if (rc)
 		goto out_end;
 
-	if (blk_fs_request(rq)) {
-		if (write == 0) {
-			if (ide_cd_check_transfer_size(drive, len))
-				goto out_end;
-		}
-		cmd->last_xfer_len = 0;
-	}
+	cmd->last_xfer_len = 0;
 
 	if (ireason == 0) {
 		write = 1;
@@ -1619,9 +1588,6 @@ static const struct ide_proc_devset *ide_cd_proc_devsets(ide_drive_t *drive)
 #endif
 
 static const struct cd_list_entry ide_cd_quirks_list[] = {
-	/* Limit transfer size per interrupt. */
-	{ "SAMSUNG CD-ROM SCR-2430", NULL,   IDE_AFLAG_LIMIT_NFRAMES	     },
-	{ "SAMSUNG CD-ROM SCR-2432", NULL,   IDE_AFLAG_LIMIT_NFRAMES	     },
 	/* SCR-3231 doesn't support the SET_CD_SPEED command. */
 	{ "SAMSUNG CD-ROM SCR-3231", NULL,   IDE_AFLAG_NO_SPEED_SELECT	     },
 	/* Old NEC260 (not R) was released before ATAPI 1.2 spec. */
diff --git a/include/linux/ide.h b/include/linux/ide.h
index c2841c0c36c..cb501bf78f7 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -458,11 +458,6 @@ enum {
 	IDE_AFLAG_TOCADDR_AS_BCD	= (1 << 3),
 	/* TOC track numbers are in BCD. */
 	IDE_AFLAG_TOCTRACKS_AS_BCD	= (1 << 4),
-	/*
-	 * Drive does not provide data in multiples of SECTOR_SIZE
-	 * when more than one interrupt is needed.
-	 */
-	IDE_AFLAG_LIMIT_NFRAMES		= (1 << 5),
 	/* Saved TOC information is current. */
 	IDE_AFLAG_TOC_VALID		= (1 << 6),
 	/* We think that the drive door is locked. */
-- 
cgit v1.2.3


From 8652b31ab211b6fe2a4994cc47b61d7038c3489c Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:14 +0200
Subject: ide-cd: merge ide_cd_prepare_rw_request() into cdrom_start_rw()

There should be no functional changes caused by this patch.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index c0cefe5becf..cf0707fe87e 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -509,17 +509,6 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq,
 	return -1;
 }
 
-static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive,
-						 struct request *rq)
-{
-	ide_debug_log(IDE_DBG_RQ, "rq->cmd_flags: 0x%x", rq->cmd_flags);
-
-	/* set up the command */
-	rq->timeout = ATAPI_WAIT_PC;
-
-	return ide_started;
-}
-
 static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq)
 {
 	ide_debug_log(IDE_DBG_FUNC, "rq->cmd[0]: 0x%x", rq->cmd[0]);
@@ -871,9 +860,9 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 	unsigned short sectors_per_frame =
 		queue_hardsect_size(q) >> SECTOR_BITS;
 
-	ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, write: 0x%x, "
+	ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, rq->cmd_flags: 0x%x, "
 				  "secs_per_frame: %u",
-				  rq->cmd[0], write, sectors_per_frame);
+				  rq->cmd[0], rq->cmd_flags, sectors_per_frame);
 
 	if (write) {
 		/* disk has become write protected */
@@ -898,6 +887,8 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 	if (write)
 		cd->devinfo.media_written = 1;
 
+	rq->timeout = ATAPI_WAIT_PC;
+
 	return ide_started;
 }
 
@@ -954,10 +945,8 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		blk_dump_rq_flags(rq, "ide_cd_do_request");
 
 	if (blk_fs_request(rq)) {
-		if (cdrom_start_rw(drive, rq) == ide_stopped ||
-		    ide_cd_prepare_rw_request(drive, rq) == ide_stopped) {
+		if (cdrom_start_rw(drive, rq) == ide_stopped)
 			goto out_end;
-		}
 	} else if (blk_sense_request(rq) || blk_pc_request(rq) ||
 		   rq->cmd_type == REQ_TYPE_ATA_PC) {
 		if (!rq->timeout)
-- 
cgit v1.2.3


From c7ec89994fec4353d5b4251213bdfa7b1a68c26b Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:15 +0200
Subject: ide-cd: use scatterlists for PIO transfers (non-fs requests) (v2)

Convert ide-cd to use scatterlists for PIO transfers and get rid of
partial completions (except on error) also for non-fs requests.

v2:
Do not map dataless commands to an sg since it oopses on the virt_to_page()
translation check when DEBUG_VIRTUAL is enabled.  (from Borislav Petkov,
reported/bisected-by Tetsuo Handa).

Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 100 ++++++++++++++++-----------------------------------
 1 file changed, 30 insertions(+), 70 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index cf0707fe87e..7fdfb55011a 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -509,8 +509,10 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq,
 	return -1;
 }
 
-static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq)
+static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
 {
+	struct request *rq = cmd->rq;
+
 	ide_debug_log(IDE_DBG_FUNC, "rq->cmd[0]: 0x%x", rq->cmd[0]);
 
 	/*
@@ -518,11 +520,14 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq)
 	 * and some drives don't send them.  Sigh.
 	 */
 	if (rq->cmd[0] == GPCMD_REQUEST_SENSE &&
-	    rq->data_len > 0 && rq->data_len <= 5)
-		while (rq->data_len > 0) {
-			*(u8 *)rq->data++ = 0;
-			--rq->data_len;
+	    cmd->nleft > 0 && cmd->nleft <= 5) {
+		unsigned int ofs = cmd->nbytes - cmd->nleft;
+
+		while (cmd->nleft > 0) {
+			*((u8 *)rq->data + ofs++) = 0;
+			cmd->nleft--;
 		}
+	}
 }
 
 int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
@@ -613,22 +618,11 @@ static void ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd)
 		ide_complete_rq(drive, 0, nr_bytes);
 }
 
-/*
- * Called from blk_end_request_callback() after the data of the request is
- * completed and before the request itself is completed. By returning value '1',
- * blk_end_request_callback() returns immediately without completing it.
- */
-static int cdrom_newpc_intr_dummy_cb(struct request *rq)
-{
-	return 1;
-}
-
 static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	struct ide_cmd *cmd = &hwif->cmd;
 	struct request *rq = hwif->rq;
-	xfer_func_t *xferfunc;
 	ide_expiry_t *expiry = NULL;
 	int dma_error = 0, dma, stat, thislen, uptodate = 0;
 	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc, nsectors;
@@ -678,7 +672,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 	ide_read_bcount_and_ireason(drive, &len, &ireason);
 
-	thislen = blk_fs_request(rq) ? len : rq->data_len;
+	thislen = blk_fs_request(rq) ? len : cmd->nleft;
 	if (thislen > len)
 		thislen = len;
 
@@ -702,9 +696,9 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 				uptodate = 0;
 			}
 		} else if (!blk_pc_request(rq)) {
-			ide_cd_request_sense_fixup(drive, rq);
+			ide_cd_request_sense_fixup(drive, cmd);
 			/* complain if we still have data left to transfer */
-			uptodate = rq->data_len ? 0 : 1;
+			uptodate = cmd->nleft ? 0 : 1;
 			if (uptodate == 0)
 				rq->cmd_flags |= REQ_FAILED;
 		}
@@ -718,35 +712,15 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 	cmd->last_xfer_len = 0;
 
-	if (ireason == 0) {
-		write = 1;
-		xferfunc = hwif->tp_ops->output_data;
-	} else {
-		write = 0;
-		xferfunc = hwif->tp_ops->input_data;
-	}
-
 	ide_debug_log(IDE_DBG_PC, "data transfer, rq->cmd_type: 0x%x, "
 				  "ireason: 0x%x",
 				  rq->cmd_type, ireason);
 
 	/* transfer data */
 	while (thislen > 0) {
-		u8 *ptr = blk_fs_request(rq) ? NULL : rq->data;
-		int blen = rq->data_len;
-
-		/* bio backed? */
-		if (rq->bio) {
-			if (blk_fs_request(rq)) {
-				blen = min_t(int, thislen, cmd->nleft);
-			} else {
-				ptr = bio_data(rq->bio);
-				blen = bio_iovec(rq->bio)->bv_len;
-			}
-		}
+		int blen = min_t(int, thislen, cmd->nleft);
 
-		if ((blk_fs_request(rq) && cmd->nleft == 0) ||
-		    (blk_fs_request(rq) == 0 && ptr == NULL)) {
+		if (cmd->nleft == 0) {
 			if (blk_fs_request(rq) && !write)
 				/*
 				 * If the buffers are full, pipe the rest into
@@ -763,33 +737,12 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 			break;
 		}
 
-		if (blen > thislen)
-			blen = thislen;
-
-		if (blk_fs_request(rq)) {
-			ide_pio_bytes(drive, cmd, write, blen);
-			cmd->last_xfer_len += blen;
-		} else
-			xferfunc(drive, NULL, ptr, blen);
+		ide_pio_bytes(drive, cmd, write, blen);
+		cmd->last_xfer_len += blen;
 
 		thislen -= blen;
 		len -= blen;
 
-		if (blk_fs_request(rq) == 0) {
-			rq->data_len -= blen;
-
-			/*
-			 * The request can't be completed until DRQ is cleared.
-			 * So complete the data, but don't complete the request
-			 * using the dummy function for the callback feature
-			 * of blk_end_request_callback().
-			 */
-			if (rq->bio)
-				blk_end_request_callback(rq, 0, blen,
-						 cdrom_newpc_intr_dummy_cb);
-			else
-				rq->data += blen;
-		}
 		if (sense && write == 0)
 			rq->sense_len += blen;
 	}
@@ -814,8 +767,7 @@ out_end:
 	if (blk_pc_request(rq) && rc == 0) {
 		unsigned int dlen = rq->data_len;
 
-		if (dma)
-			rq->data_len = 0;
+		rq->data_len = 0;
 
 		if (blk_end_request(rq, 0, dlen))
 			BUG();
@@ -828,13 +780,14 @@ out_end:
 		if (blk_fs_request(rq)) {
 			if (cmd->nleft == 0)
 				uptodate = 1;
-			if (uptodate == 0)
-				ide_cd_error_cmd(drive, cmd);
 		} else {
 			if (uptodate <= 0 && rq->errors == 0)
 				rq->errors = -EIO;
 		}
 
+		if (uptodate == 0)
+			ide_cd_error_cmd(drive, cmd);
+
 		/* make sure it's fully ended */
 		if (blk_pc_request(rq))
 			nsectors = (rq->data_len + 511) >> 9;
@@ -844,6 +797,12 @@ out_end:
 		if (nsectors == 0)
 			nsectors = 1;
 
+		if (blk_fs_request(rq) == 0) {
+			rq->data_len -= (cmd->nbytes - cmd->nleft);
+			if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE))
+				rq->data_len += cmd->last_xfer_len;
+		}
+
 		ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9);
 
 		if (sense && rc == 2)
@@ -971,8 +930,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 	cmd.rq = rq;
 
-	if (blk_fs_request(rq)) {
-		ide_init_sg_cmd(&cmd, rq->nr_sectors << 9);
+	if (blk_fs_request(rq) || rq->data_len) {
+		ide_init_sg_cmd(&cmd, blk_fs_request(rq) ? (rq->nr_sectors << 9)
+							 : rq->data_len);
 		ide_map_sg(drive, &cmd);
 	}
 
-- 
cgit v1.2.3


From 4a3d8cf48c7baf3439aed06c847cd4562adfc468 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:15 +0200
Subject: ide-cd: use common completion path for DMA requests in
 cdrom_newpc_intr()

Use the following facts:

- rq->nr_sectors should now be always equal to (non-zero)
  rq->hard_nr_sectors for fs requests

- REQ_TYPE_ATA_PC requests have never bio attached to them

- rq->hard_nr_sectors == 0 for REQ_TYPE_ATA_PC requests

- DMA is used only for fs, pc and REQ_TYPE_ATA_PC requests

- 'uptodate' is ignored for pc requests ('rc == 0' case)

and use the common completion path also for DMA requests.

There should be no functional changes caused by this patch

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 7fdfb55011a..11d8d5b870b 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -657,16 +657,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	if (dma) {
 		if (dma_error)
 			return ide_error(drive, "dma error", stat);
-		if (blk_fs_request(rq)) {
-			ide_complete_rq(drive, 0, rq->nr_sectors
-				? (rq->nr_sectors << 9) : ide_rq_bytes(rq));
-			return ide_stopped;
-		} else if (rq->cmd_type == REQ_TYPE_ATA_PC && !rq->bio) {
-			ide_complete_rq(drive, 0, 512);
-			return ide_stopped;
-		}
-		if (blk_pc_request(rq) == 0 && uptodate == 0)
-			rq->cmd_flags |= REQ_FAILED;
+		uptodate = 1;
 		goto out_end;
 	}
 
-- 
cgit v1.2.3


From 14fa91ccbafa02a71cfb53f9c830b8c0c65119d0 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:16 +0200
Subject: ide-cd: unify transfer padding in cdrom_newpc_intr()

* 'thislen' is always <= cmd->nleft for non-fs requests so the transfer
  padding inside the 'while (thislen > 0)' loop can happen only for fs
  requests -- then move it out of the loop and unify with the transfer
  padding for non-fs requests ('thislen' == 'len' for fs requests).

* blk_dump_rq_flags() dumps all request flags so it is enough to pass
  only the function name to it.

* Update my Copyrights while at it.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 11d8d5b870b..0201201c6e4 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -4,7 +4,7 @@
  * Copyright (C) 1994-1996   Scott Snyder <snyder@fnald0.fnal.gov>
  * Copyright (C) 1996-1998   Erik Andersen <andersee@debian.org>
  * Copyright (C) 1998-2000   Jens Axboe <axboe@suse.de>
- * Copyright (C) 2005, 2007  Bartlomiej Zolnierkiewicz
+ * Copyright (C) 2005, 2007-2009  Bartlomiej Zolnierkiewicz
  *
  * May be copied or modified under the terms of the GNU General Public
  * License.  See linux/COPYING for more information.
@@ -711,22 +711,8 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	while (thislen > 0) {
 		int blen = min_t(int, thislen, cmd->nleft);
 
-		if (cmd->nleft == 0) {
-			if (blk_fs_request(rq) && !write)
-				/*
-				 * If the buffers are full, pipe the rest into
-				 * oblivion.
-				 */
-				ide_pad_transfer(drive, 0, thislen);
-			else {
-				printk(KERN_ERR PFX "%s: confused, missing data\n",
-						drive->name);
-				blk_dump_rq_flags(rq, rq_data_dir(rq)
-						  ? "cdrom_newpc_intr, write"
-						  : "cdrom_newpc_intr, read");
-			}
+		if (cmd->nleft == 0)
 			break;
-		}
 
 		ide_pio_bytes(drive, cmd, write, blen);
 		cmd->last_xfer_len += blen;
@@ -739,8 +725,15 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	}
 
 	/* pad, if necessary */
-	if (!blk_fs_request(rq) && len > 0)
-		ide_pad_transfer(drive, write, len);
+	if (len > 0) {
+		if (blk_fs_request(rq) == 0 || write == 0)
+			ide_pad_transfer(drive, write, len);
+		else {
+			printk(KERN_ERR PFX "%s: confused, missing data\n",
+				drive->name);
+			blk_dump_rq_flags(rq, "cdrom_newpc_intr");
+		}
+	}
 
 	if (blk_pc_request(rq)) {
 		timeout = rq->timeout;
-- 
cgit v1.2.3


From e698ea83a8531a6740dc657329dcf0728392d6ac Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:16 +0200
Subject: ide-cd: minor ide_cdrom_setup() cleanup

Cache drive->queue in local variable and use max().

There should be no functional changes caused by this patch.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 0201201c6e4..5319e7a7370 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1581,18 +1581,18 @@ static int ide_cdrom_setup(ide_drive_t *drive)
 {
 	struct cdrom_info *cd = drive->driver_data;
 	struct cdrom_device_info *cdi = &cd->devinfo;
+	struct request_queue *q = drive->queue;
 	u16 *id = drive->id;
 	char *fw_rev = (char *)&id[ATA_ID_FW_REV];
 	int nslots;
 
 	ide_debug_log(IDE_DBG_PROBE, "enter");
 
-	blk_queue_prep_rq(drive->queue, ide_cdrom_prep_fn);
-	blk_queue_dma_alignment(drive->queue, 31);
-	blk_queue_update_dma_pad(drive->queue, 15);
-	drive->queue->unplug_delay = (1 * HZ) / 1000;
-	if (!drive->queue->unplug_delay)
-		drive->queue->unplug_delay = 1;
+	blk_queue_prep_rq(q, ide_cdrom_prep_fn);
+	blk_queue_dma_alignment(q, 31);
+	blk_queue_update_dma_pad(q, 15);
+
+	q->unplug_delay = max((1 * HZ) / 1000, 1);
 
 	drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
 	drive->atapi_flags = IDE_AFLAG_NO_EJECT | ide_cd_flags(id);
@@ -1610,8 +1610,7 @@ static int ide_cdrom_setup(ide_drive_t *drive)
 
 	nslots = ide_cdrom_probe_capabilities(drive);
 
-	/* set correct block size */
-	blk_queue_hardsect_size(drive->queue, CD_FRAMESIZE);
+	blk_queue_hardsect_size(q, CD_FRAMESIZE);
 
 	if (ide_cdrom_register(drive, nslots)) {
 		printk(KERN_ERR PFX "%s: %s failed to register device with the"
-- 
cgit v1.2.3


From 35c9b4daf4c94b30e5cede597d98016ebf31b5ad Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:19 +0200
Subject: ide: add ->dma_clear method and remove ->dma_timeout one

All custom ->dma_timeout implementations call the generic one thus it is
possible to have only an optional method for resetting DMA engine instead:

* Add ->dma_clear method and convert hpt366, pdc202xx_old and sl82c105
  host drivers to use it.

* Always use ide_dma_timeout() in ide_dma_timeout_retry() and remove
 ->dma_timeout method.

* Make ide_dma_timeout() static.

There should be no functional changes caused by this patch.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/alim15x3.c     |  1 -
 drivers/ide/au1xxx-ide.c   |  1 -
 drivers/ide/cmd64x.c       |  3 ---
 drivers/ide/cs5536.c       |  1 -
 drivers/ide/hpt366.c       | 10 +---------
 drivers/ide/icside.c       |  1 -
 drivers/ide/ide-dma-sff.c  |  3 +--
 drivers/ide/ide-dma.c      | 10 ++++++----
 drivers/ide/it821x.c       |  3 +--
 drivers/ide/ns87415.c      |  1 -
 drivers/ide/pdc202xx_old.c | 10 ++--------
 drivers/ide/pmac.c         |  1 -
 drivers/ide/sc1200.c       |  1 -
 drivers/ide/scc_pata.c     |  1 -
 drivers/ide/sgiioc4.c      |  1 -
 drivers/ide/siimage.c      |  1 -
 drivers/ide/sl82c105.c     |  7 +++----
 drivers/ide/tc86c001.c     |  1 -
 drivers/ide/trm290.c       |  1 -
 drivers/ide/tx4939ide.c    |  1 -
 include/linux/ide.h        |  4 ++--
 21 files changed, 16 insertions(+), 47 deletions(-)

diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
index d516168464f..d3faf0b97f4 100644
--- a/drivers/ide/alim15x3.c
+++ b/drivers/ide/alim15x3.c
@@ -509,7 +509,6 @@ static const struct ide_dma_ops ali_dma_ops = {
 	.dma_test_irq		= ide_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c
index d3a9d6c1532..0c08c5e01f2 100644
--- a/drivers/ide/au1xxx-ide.c
+++ b/drivers/ide/au1xxx-ide.c
@@ -353,7 +353,6 @@ static const struct ide_dma_ops au1xxx_dma_ops = {
 	.dma_end		= auide_dma_end,
 	.dma_test_irq		= auide_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timeout		= ide_dma_timeout,
 };
 
 static int auide_ddma_init(ide_hwif_t *hwif, const struct ide_port_info *d)
diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c
index bf0e3f47082..f0a49d2ff71 100644
--- a/drivers/ide/cmd64x.c
+++ b/drivers/ide/cmd64x.c
@@ -384,7 +384,6 @@ static const struct ide_dma_ops cmd64x_dma_ops = {
 	.dma_test_irq		= cmd64x_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
@@ -396,7 +395,6 @@ static const struct ide_dma_ops cmd646_rev1_dma_ops = {
 	.dma_test_irq		= ide_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
@@ -408,7 +406,6 @@ static const struct ide_dma_ops cmd648_dma_ops = {
 	.dma_test_irq		= cmd648_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
diff --git a/drivers/ide/cs5536.c b/drivers/ide/cs5536.c
index d5dcf489960..353a35bbba6 100644
--- a/drivers/ide/cs5536.c
+++ b/drivers/ide/cs5536.c
@@ -236,7 +236,6 @@ static const struct ide_dma_ops cs5536_dma_ops = {
 	.dma_test_irq		= ide_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 };
 
 static const struct ide_port_info cs5536_info = {
diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
index dbaf184ed9c..a0eb87f5913 100644
--- a/drivers/ide/hpt366.c
+++ b/drivers/ide/hpt366.c
@@ -835,12 +835,6 @@ static int hpt370_dma_end(ide_drive_t *drive)
 	return ide_dma_end(drive);
 }
 
-static void hpt370_dma_timeout(ide_drive_t *drive)
-{
-	hpt370_irq_timeout(drive);
-	ide_dma_timeout(drive);
-}
-
 /* returns 1 if DMA IRQ issued, 0 otherwise */
 static int hpt374_dma_test_irq(ide_drive_t *drive)
 {
@@ -1423,7 +1417,6 @@ static const struct ide_dma_ops hpt37x_dma_ops = {
 	.dma_test_irq		= hpt374_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
@@ -1435,7 +1428,7 @@ static const struct ide_dma_ops hpt370_dma_ops = {
 	.dma_test_irq		= ide_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= hpt370_dma_timeout,
+	.dma_clear		= hpt370_irq_timeout,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
@@ -1447,7 +1440,6 @@ static const struct ide_dma_ops hpt36x_dma_ops = {
 	.dma_test_irq		= ide_dma_test_irq,
 	.dma_lost_irq		= hpt366_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c
index 51ce404fe53..f069f122ee6 100644
--- a/drivers/ide/icside.c
+++ b/drivers/ide/icside.c
@@ -377,7 +377,6 @@ static const struct ide_dma_ops icside_v6_dma_ops = {
 	.dma_start		= icside_dma_start,
 	.dma_end		= icside_dma_end,
 	.dma_test_irq		= icside_dma_test_irq,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_lost_irq		= ide_dma_lost_irq,
 };
 #else
diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c
index 75a9ea2e4c8..7836d7e03ff 100644
--- a/drivers/ide/ide-dma-sff.c
+++ b/drivers/ide/ide-dma-sff.c
@@ -338,9 +338,8 @@ const struct ide_dma_ops sff_dma_ops = {
 	.dma_start		= ide_dma_start,
 	.dma_end		= ide_dma_end,
 	.dma_test_irq		= ide_dma_test_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_lost_irq		= ide_dma_lost_irq,
+	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 EXPORT_SYMBOL_GPL(sff_dma_ops);
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 3dbf80c1549..dc5d9bc4ced 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -460,7 +460,7 @@ void ide_dma_lost_irq(ide_drive_t *drive)
 }
 EXPORT_SYMBOL_GPL(ide_dma_lost_irq);
 
-void ide_dma_timeout(ide_drive_t *drive)
+static void ide_dma_timeout(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
 
@@ -473,7 +473,6 @@ void ide_dma_timeout(ide_drive_t *drive)
 
 	hwif->dma_ops->dma_end(drive);
 }
-EXPORT_SYMBOL_GPL(ide_dma_timeout);
 
 /*
  * un-busy the port etc, and clear any pending DMA status. we want to
@@ -483,6 +482,7 @@ EXPORT_SYMBOL_GPL(ide_dma_timeout);
 ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 {
 	ide_hwif_t *hwif = drive->hwif;
+	const struct ide_dma_ops *dma_ops = hwif->dma_ops;
 	struct request *rq;
 	ide_startstop_t ret = ide_stopped;
 
@@ -492,12 +492,14 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 
 	if (error < 0) {
 		printk(KERN_WARNING "%s: DMA timeout error\n", drive->name);
-		(void)hwif->dma_ops->dma_end(drive);
+		(void)dma_ops->dma_end(drive);
 		ret = ide_error(drive, "dma timeout error",
 				hwif->tp_ops->read_status(hwif));
 	} else {
 		printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name);
-		hwif->dma_ops->dma_timeout(drive);
+		if (dma_ops->dma_clear)
+			dma_ops->dma_clear(drive);
+		ide_dma_timeout(drive);
 	}
 
 	/*
diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c
index 0d4ac65cf94..51aa745246d 100644
--- a/drivers/ide/it821x.c
+++ b/drivers/ide/it821x.c
@@ -511,9 +511,8 @@ static struct ide_dma_ops it821x_pass_through_dma_ops = {
 	.dma_start		= it821x_dma_start,
 	.dma_end		= it821x_dma_end,
 	.dma_test_irq		= ide_dma_test_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_lost_irq		= ide_dma_lost_irq,
+	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
index 7b65fe5bf44..9039a373020 100644
--- a/drivers/ide/ns87415.c
+++ b/drivers/ide/ns87415.c
@@ -306,7 +306,6 @@ static const struct ide_dma_ops ns87415_dma_ops = {
 	.dma_test_irq		= ide_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_sff_read_status	= superio_dma_sff_read_status,
 };
 
diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c
index f7536d1943f..248a54bd238 100644
--- a/drivers/ide/pdc202xx_old.c
+++ b/drivers/ide/pdc202xx_old.c
@@ -258,12 +258,6 @@ static void pdc202xx_dma_lost_irq(ide_drive_t *drive)
 	ide_dma_lost_irq(drive);
 }
 
-static void pdc202xx_dma_timeout(ide_drive_t *drive)
-{
-	pdc202xx_reset(drive);
-	ide_dma_timeout(drive);
-}
-
 static int init_chipset_pdc202xx(struct pci_dev *dev)
 {
 	unsigned long dmabase = pci_resource_start(dev, 4);
@@ -336,7 +330,7 @@ static const struct ide_dma_ops pdc20246_dma_ops = {
 	.dma_test_irq		= pdc202xx_dma_test_irq,
 	.dma_lost_irq		= pdc202xx_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= pdc202xx_dma_timeout,
+	.dma_clear		= pdc202xx_reset,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
@@ -348,7 +342,7 @@ static const struct ide_dma_ops pdc2026x_dma_ops = {
 	.dma_test_irq		= pdc202xx_dma_test_irq,
 	.dma_lost_irq		= pdc202xx_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= pdc202xx_dma_timeout,
+	.dma_clear		= pdc202xx_reset,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c
index 2bfcfedaa07..d15cc46a66e 100644
--- a/drivers/ide/pmac.c
+++ b/drivers/ide/pmac.c
@@ -1650,7 +1650,6 @@ static const struct ide_dma_ops pmac_dma_ops = {
 	.dma_start		= pmac_ide_dma_start,
 	.dma_end		= pmac_ide_dma_end,
 	.dma_test_irq		= pmac_ide_dma_test_irq,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_lost_irq		= pmac_ide_dma_lost_irq,
 };
 
diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c
index 1c3a8291499..371549d18a0 100644
--- a/drivers/ide/sc1200.c
+++ b/drivers/ide/sc1200.c
@@ -291,7 +291,6 @@ static const struct ide_dma_ops sc1200_dma_ops = {
 	.dma_test_irq		= ide_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c
index 0cc137cfe76..64534d150b0 100644
--- a/drivers/ide/scc_pata.c
+++ b/drivers/ide/scc_pata.c
@@ -872,7 +872,6 @@ static const struct ide_dma_ops scc_dma_ops = {
 	.dma_end		= scc_dma_end,
 	.dma_test_irq		= scc_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
 	.dma_sff_read_status	= scc_dma_sff_read_status,
 };
diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c
index b12de8346c7..44df0c750ba 100644
--- a/drivers/ide/sgiioc4.c
+++ b/drivers/ide/sgiioc4.c
@@ -533,7 +533,6 @@ static const struct ide_dma_ops sgiioc4_dma_ops = {
 	.dma_end		= sgiioc4_dma_end,
 	.dma_test_irq		= sgiioc4_dma_test_irq,
 	.dma_lost_irq		= sgiioc4_dma_lost_irq,
-	.dma_timeout		= ide_dma_timeout,
 };
 
 static const struct ide_port_info sgiioc4_port_info __devinitconst = {
diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c
index 075cb1243b2..e4973cd1fba 100644
--- a/drivers/ide/siimage.c
+++ b/drivers/ide/siimage.c
@@ -715,7 +715,6 @@ static const struct ide_dma_ops sil_dma_ops = {
 	.dma_end		= ide_dma_end,
 	.dma_test_irq		= siimage_dma_test_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c
index d25137b04e7..d6f8977191c 100644
--- a/drivers/ide/sl82c105.c
+++ b/drivers/ide/sl82c105.c
@@ -189,14 +189,13 @@ static void sl82c105_dma_start(ide_drive_t *drive)
 	ide_dma_start(drive);
 }
 
-static void sl82c105_dma_timeout(ide_drive_t *drive)
+static void sl82c105_dma_clear(ide_drive_t *drive)
 {
 	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
 
-	DBG(("sl82c105_dma_timeout(drive:%s)\n", drive->name));
+	DBG(("sl82c105_dma_clear(drive:%s)\n", drive->name));
 
 	sl82c105_reset_host(dev);
-	ide_dma_timeout(drive);
 }
 
 static int sl82c105_dma_end(ide_drive_t *drive)
@@ -298,7 +297,7 @@ static const struct ide_dma_ops sl82c105_dma_ops = {
 	.dma_test_irq		= ide_dma_test_irq,
 	.dma_lost_irq		= sl82c105_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= sl82c105_dma_timeout,
+	.dma_clear		= sl82c105_dma_clear,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c
index 427d4b3c2c6..b4cf42dc8a6 100644
--- a/drivers/ide/tc86c001.c
+++ b/drivers/ide/tc86c001.c
@@ -187,7 +187,6 @@ static const struct ide_dma_ops tc86c001_dma_ops = {
 	.dma_test_irq		= ide_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c
index ed1496845a9..d6a950828e9 100644
--- a/drivers/ide/trm290.c
+++ b/drivers/ide/trm290.c
@@ -314,7 +314,6 @@ static struct ide_dma_ops trm290_dma_ops = {
 	.dma_end		= trm290_dma_end,
 	.dma_test_irq		= trm290_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timeout		= ide_dma_timeout,
 };
 
 static const struct ide_port_info trm290_chipset __devinitdata = {
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index e0e0a803dde..53f99853b06 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -632,7 +632,6 @@ static const struct ide_dma_ops tx4939ide_dma_ops = {
 	.dma_test_irq		= tx4939ide_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_timeout		= ide_dma_timeout,
 	.dma_sff_read_status	= tx4939ide_dma_sff_read_status,
 };
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index cb501bf78f7..d3035f2f125 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -716,8 +716,9 @@ struct ide_dma_ops {
 	int	(*dma_end)(struct ide_drive_s *);
 	int	(*dma_test_irq)(struct ide_drive_s *);
 	void	(*dma_lost_irq)(struct ide_drive_s *);
+	/* below ones are optional */
 	int	(*dma_timer_expiry)(struct ide_drive_s *);
-	void	(*dma_timeout)(struct ide_drive_s *);
+	void	(*dma_clear)(struct ide_drive_s *);
 	/*
 	 * The following method is optional and only required to be
 	 * implemented for the SFF-8038i compatible controllers.
@@ -1461,7 +1462,6 @@ static inline int config_drive_for_dma(ide_drive_t *drive) { return 0; }
 #endif /* CONFIG_BLK_DEV_IDEDMA_SFF */
 
 void ide_dma_lost_irq(ide_drive_t *);
-void ide_dma_timeout(ide_drive_t *);
 ide_startstop_t ide_dma_timeout_retry(ide_drive_t *, int);
 
 #else
-- 
cgit v1.2.3


From 1cee52de28aa687760ad262ad0834d1bf6c6d2ac Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:19 +0200
Subject: ide: inline ide_dma_timeout() into ide_dma_timeout_retry()

Since ide_dma_timeout() is only used by ide_dma_timeout_retry()
inline it there.

There should be no functional changes caused by this patch.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-dma.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index dc5d9bc4ced..4e200507111 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -460,20 +460,6 @@ void ide_dma_lost_irq(ide_drive_t *drive)
 }
 EXPORT_SYMBOL_GPL(ide_dma_lost_irq);
 
-static void ide_dma_timeout(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-
-	printk(KERN_ERR "%s: timeout waiting for DMA\n", drive->name);
-
-	if (hwif->dma_ops->dma_test_irq(drive))
-		return;
-
-	ide_dump_status(drive, "DMA timeout", hwif->tp_ops->read_status(hwif));
-
-	hwif->dma_ops->dma_end(drive);
-}
-
 /*
  * un-busy the port etc, and clear any pending DMA status. we want to
  * retry the current request in pio mode instead of risking tossing it
@@ -499,7 +485,12 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 		printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name);
 		if (dma_ops->dma_clear)
 			dma_ops->dma_clear(drive);
-		ide_dma_timeout(drive);
+		printk(KERN_ERR "%s: timeout waiting for DMA\n", drive->name);
+		if (dma_ops->dma_test_irq(drive) == 0) {
+			ide_dump_status(drive, "DMA timeout",
+					hwif->tp_ops->read_status(hwif));
+			(void)dma_ops->dma_end(drive);
+		}
 	}
 
 	/*
-- 
cgit v1.2.3


From 4453011f959a5f5c6c7a33aea54fe17f5e43a867 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:20 +0200
Subject: ide: destroy DMA mappings after ending DMA (v2)

Move ide_destroy_dmatable() call out from ->dma_end method to
{ide_pc,cdrom_newpc,ide_dma}_intr(), ide_dma_timeout_retry()
and sgiioc4_resetproc().

This causes minor/safe behavior changes w.r.t.:
* cmd64x.c::cmd64{8,x}_dma_end()
* cs5536.c::cs5536_dma_end()
* icside.c::icside_dma_end()
* it821x.c::it821x_dma_end()
* scc_pata.c::__scc_dma_end()
* sl82c105.c::sl82c105_dma_end()
* tx4939ide.c::tx4939ide_dma_end()

v2:
* Fix build for CONFIG_BLK_DEV_IDEDMA=n (reported by Randy Dunlap).

Cc: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/au1xxx-ide.c  | 2 --
 drivers/ide/cmd64x.c      | 2 --
 drivers/ide/icside.c      | 3 ---
 drivers/ide/ide-atapi.c   | 7 +++++--
 drivers/ide/ide-cd.c      | 1 +
 drivers/ide/ide-dma-sff.c | 2 --
 drivers/ide/ide-dma.c     | 3 +++
 drivers/ide/ns87415.c     | 2 --
 drivers/ide/pmac.c        | 2 --
 drivers/ide/sc1200.c      | 1 -
 drivers/ide/scc_pata.c    | 2 --
 drivers/ide/sgiioc4.c     | 2 +-
 drivers/ide/trm290.c      | 3 +--
 drivers/ide/tx4939ide.c   | 4 +---
 include/linux/ide.h       | 1 +
 15 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c
index 0c08c5e01f2..ba2a211758a 100644
--- a/drivers/ide/au1xxx-ide.c
+++ b/drivers/ide/au1xxx-ide.c
@@ -280,8 +280,6 @@ static int auide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd)
 
 static int auide_dma_end(ide_drive_t *drive)
 {
-	ide_destroy_dmatable(drive);
-
 	return 0;
 }
 
diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c
index f0a49d2ff71..f2edf280ef8 100644
--- a/drivers/ide/cmd64x.c
+++ b/drivers/ide/cmd64x.c
@@ -327,8 +327,6 @@ static int cmd646_1_dma_end(ide_drive_t *drive)
 	outb(dma_cmd & ~1, hwif->dma_base + ATA_DMA_CMD);
 	/* clear the INTR & ERROR bits */
 	outb(dma_stat | 6, hwif->dma_base + ATA_DMA_STATUS);
-	/* and free any DMA resources */
-	ide_destroy_dmatable(drive);
 	/* verify good DMA status */
 	return (dma_stat & 7) != 4;
 }
diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c
index f069f122ee6..9bf57d7c8e5 100644
--- a/drivers/ide/icside.c
+++ b/drivers/ide/icside.c
@@ -291,9 +291,6 @@ static int icside_dma_end(ide_drive_t *drive)
 
 	disable_dma(ec->dma);
 
-	/* Teardown mappings after DMA has completed. */
-	ide_destroy_dmatable(drive);
-
 	return get_dma_residue(ec->dma) != 0;
 }
 
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index f591166d2c9..1481f71f817 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -342,8 +342,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 	stat = tp_ops->read_status(hwif);
 
 	if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) {
-		if (hwif->dma_ops->dma_end(drive) ||
-		    (drive->media == ide_tape && (stat & ATA_ERR))) {
+		int rc = hwif->dma_ops->dma_end(drive);
+
+		ide_destroy_dmatable(drive);
+
+		if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) {
 			if (drive->media == ide_floppy)
 				printk(KERN_ERR "%s: DMA %s error\n",
 					drive->name, rq_data_dir(pc->rq)
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 5319e7a7370..4a0d66ee954 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -639,6 +639,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	if (dma) {
 		drive->dma = 0;
 		dma_error = hwif->dma_ops->dma_end(drive);
+		ide_destroy_dmatable(drive);
 		if (dma_error) {
 			printk(KERN_ERR PFX "%s: DMA %s error\n", drive->name,
 					write ? "write" : "read");
diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c
index 7836d7e03ff..f8adbb5eb33 100644
--- a/drivers/ide/ide-dma-sff.c
+++ b/drivers/ide/ide-dma-sff.c
@@ -310,8 +310,6 @@ int ide_dma_end(ide_drive_t *drive)
 	/* clear INTR & ERROR bits */
 	ide_dma_sff_write_status(hwif, dma_stat | ATA_DMA_ERR | ATA_DMA_INTR);
 
-	/* purge DMA mappings */
-	ide_destroy_dmatable(drive);
 	wmb();
 
 	/* verify good DMA status */
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 4e200507111..b430898bbcd 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -92,6 +92,7 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive)
 	u8 stat = 0, dma_stat = 0;
 
 	dma_stat = hwif->dma_ops->dma_end(drive);
+	ide_destroy_dmatable(drive);
 	stat = hwif->tp_ops->read_status(hwif);
 
 	if (OK_STAT(stat, DRIVE_READY, drive->bad_wstat | ATA_DRQ)) {
@@ -479,6 +480,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 	if (error < 0) {
 		printk(KERN_WARNING "%s: DMA timeout error\n", drive->name);
 		(void)dma_ops->dma_end(drive);
+		ide_destroy_dmatable(drive);
 		ret = ide_error(drive, "dma timeout error",
 				hwif->tp_ops->read_status(hwif));
 	} else {
@@ -490,6 +492,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 			ide_dump_status(drive, "DMA timeout",
 					hwif->tp_ops->read_status(hwif));
 			(void)dma_ops->dma_end(drive);
+			ide_destroy_dmatable(drive);
 		}
 	}
 
diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
index 9039a373020..9ad71a74f93 100644
--- a/drivers/ide/ns87415.c
+++ b/drivers/ide/ns87415.c
@@ -210,8 +210,6 @@ static int ns87415_dma_end(ide_drive_t *drive)
 	/* from ERRATA: clear the INTR & ERROR bits */
 	dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD);
 	outb(dma_cmd | 6, hwif->dma_base + ATA_DMA_CMD);
-	/* and free any DMA resources */
-	ide_destroy_dmatable(drive);
 	/* verify good DMA status */
 	return (dma_stat & 7) != 4;
 }
diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c
index d15cc46a66e..5643a8b957b 100644
--- a/drivers/ide/pmac.c
+++ b/drivers/ide/pmac.c
@@ -1562,8 +1562,6 @@ pmac_ide_dma_end (ide_drive_t *drive)
 	dstat = readl(&dma->status);
 	writel(((RUN|WAKE|DEAD) << 16), &dma->control);
 
-	ide_destroy_dmatable(drive);
-
 	/* verify good dma status. we don't check for ACTIVE beeing 0. We should...
 	 * in theory, but with ATAPI decices doing buffer underruns, that would
 	 * cause us to disable DMA, which isn't what we want
diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c
index 371549d18a0..d9c47034bed 100644
--- a/drivers/ide/sc1200.c
+++ b/drivers/ide/sc1200.c
@@ -184,7 +184,6 @@ static int sc1200_dma_end(ide_drive_t *drive)
 	outb(inb(dma_base)&~1, dma_base);	/* !! DO THIS HERE !! stop DMA */
 
 	drive->waiting_for_dma = 0;
-	ide_destroy_dmatable(drive);		/* purge DMA mappings */
 
 	return (dma_stat & 7) != 4;		/* verify good DMA status */
 }
diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c
index 64534d150b0..693536ebe33 100644
--- a/drivers/ide/scc_pata.c
+++ b/drivers/ide/scc_pata.c
@@ -365,8 +365,6 @@ static int __scc_dma_end(ide_drive_t *drive)
 	dma_stat = scc_dma_sff_read_status(hwif);
 	/* clear the INTR & ERROR bits */
 	scc_ide_outb(dma_stat | 6, hwif->dma_base + 4);
-	/* purge DMA mappings */
-	ide_destroy_dmatable(drive);
 	/* verify good DMA status */
 	wmb();
 	return (dma_stat & 7) != 4 ? (0x10 | dma_stat) : 0;
diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c
index 44df0c750ba..457a762a1f2 100644
--- a/drivers/ide/sgiioc4.c
+++ b/drivers/ide/sgiioc4.c
@@ -259,7 +259,6 @@ static int sgiioc4_dma_end(ide_drive_t *drive)
 	}
 
 	drive->waiting_for_dma = 0;
-	ide_destroy_dmatable(drive);
 
 	return dma_stat;
 }
@@ -284,6 +283,7 @@ static void
 sgiioc4_resetproc(ide_drive_t * drive)
 {
 	sgiioc4_dma_end(drive);
+	ide_destroy_dmatable(drive);
 	sgiioc4_clearirq(drive);
 }
 
diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c
index d6a950828e9..8dd3d822687 100644
--- a/drivers/ide/trm290.c
+++ b/drivers/ide/trm290.c
@@ -216,8 +216,7 @@ static int trm290_dma_end(ide_drive_t *drive)
 	u16 status;
 
 	drive->waiting_for_dma = 0;
-	/* purge DMA mappings */
-	ide_destroy_dmatable(drive);
+
 	status = inw(drive->hwif->dma_base + 2);
 
 	return status != 0x00ff;
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index 53f99853b06..f62ced855cf 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -335,11 +335,9 @@ static int tx4939ide_dma_end(ide_drive_t *drive)
 	/* read and clear the INTR & ERROR bits */
 	dma_stat = tx4939ide_clear_dma_status(base);
 
-	/* purge DMA mappings */
-	ide_destroy_dmatable(drive);
-	/* verify good DMA status */
 	wmb();
 
+	/* verify good DMA status */
 	if ((dma_stat & (ATA_DMA_INTR | ATA_DMA_ERR | ATA_DMA_ACTIVE)) == 0 &&
 	    (ctl & (TX4939IDE_INT_XFEREND | TX4939IDE_INT_HOST)) ==
 	    (TX4939IDE_INT_XFEREND | TX4939IDE_INT_HOST))
diff --git a/include/linux/ide.h b/include/linux/ide.h
index d3035f2f125..b6c4942fde1 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1479,6 +1479,7 @@ static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int erro
 static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; }
 static inline int ide_build_sglist(ide_drive_t *drive,
 				   struct ide_cmd *cmd) { return 0; }
+static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; }
 #endif /* CONFIG_BLK_DEV_IDEDMA */
 
 #ifdef CONFIG_BLK_DEV_IDEACPI
-- 
cgit v1.2.3


From 5ae5412d9a23b05ab08461b202bad21ad8f6b66d Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:20 +0200
Subject: ide: add ide_dma_prepare() helper

* Add ide_dma_prepare() helper.

* Convert ide_issue_pc() and do_rw_taskfile() to use it.

* Make ide_build_sglist() static.

There should be no functional changes caused by this patch.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c    | 18 ++++--------------
 drivers/ide/ide-dma.c      | 11 ++++++++++-
 drivers/ide/ide-taskfile.c |  4 +---
 include/linux/ide.h        |  7 ++++---
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 1481f71f817..89d2339bdef 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -638,7 +638,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd)
 {
 	struct ide_atapi_pc *pc;
 	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_dma_ops *dma_ops = hwif->dma_ops;
 	ide_expiry_t *expiry = NULL;
 	struct request *rq = hwif->rq;
 	unsigned int timeout;
@@ -652,12 +651,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd)
 		expiry = ide_cd_expiry;
 		timeout = ATAPI_WAIT_PC;
 
-		if (drive->dma) {
-			if (ide_build_sglist(drive, cmd))
-				drive->dma = !dma_ops->dma_setup(drive, cmd);
-			else
-				drive->dma = 0;
-		}
+		if (drive->dma)
+			drive->dma = !ide_dma_prepare(drive, cmd);
 	} else {
 		pc = drive->pc;
 
@@ -675,13 +670,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd)
 			ide_dma_off(drive);
 		}
 
-		if ((pc->flags & PC_FLAG_DMA_OK) &&
-		     (drive->dev_flags & IDE_DFLAG_USING_DMA)) {
-			if (ide_build_sglist(drive, cmd))
-				drive->dma = !dma_ops->dma_setup(drive, cmd);
-			else
-				drive->dma = 0;
-		}
+		if (pc->flags & PC_FLAG_DMA_OK)
+			drive->dma = !ide_dma_prepare(drive, cmd);
 
 		if (!drive->dma)
 			pc->flags &= ~PC_FLAG_DMA_OK;
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index b430898bbcd..cf5897f5533 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -128,7 +128,7 @@ int ide_dma_good_drive(ide_drive_t *drive)
  *	operate in a portable fashion.
  */
 
-int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd)
+static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	struct scatterlist *sg = hwif->sg_table;
@@ -563,3 +563,12 @@ int ide_allocate_dma_engine(ide_hwif_t *hwif)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ide_allocate_dma_engine);
+
+int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd)
+{
+	if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 ||
+	    ide_build_sglist(drive, cmd) == 0 ||
+	    drive->hwif->dma_ops->dma_setup(drive, cmd))
+		return 1;
+	return 0;
+}
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index a3b7a50562b..dba68db629b 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -100,9 +100,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd)
 		ide_execute_command(drive, cmd, handler, WAIT_WORSTCASE);
 		return ide_started;
 	case ATA_PROT_DMA:
-		if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 ||
-		    ide_build_sglist(drive, cmd) == 0 ||
-		    dma_ops->dma_setup(drive, cmd))
+		if (ide_dma_prepare(drive, cmd))
 			return ide_stopped;
 		hwif->expiry = dma_ops->dma_timer_expiry;
 		ide_execute_command(drive, cmd, ide_dma_intr, 2 * WAIT_CMD);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index b6c4942fde1..78892e2a432 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1443,7 +1443,8 @@ ide_startstop_t ide_dma_intr(ide_drive_t *);
 int ide_allocate_dma_engine(ide_hwif_t *);
 void ide_release_dma_engine(ide_hwif_t *);
 
-int ide_build_sglist(ide_drive_t *, struct ide_cmd *);
+int ide_dma_prepare(ide_drive_t *, struct ide_cmd *);
+
 void ide_destroy_dmatable(ide_drive_t *);
 
 #ifdef CONFIG_BLK_DEV_IDEDMA_SFF
@@ -1477,8 +1478,8 @@ static inline void ide_check_dma_crc(ide_drive_t *drive) { ; }
 static inline ide_startstop_t ide_dma_intr(ide_drive_t *drive) { return ide_stopped; }
 static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { return ide_stopped; }
 static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; }
-static inline int ide_build_sglist(ide_drive_t *drive,
-				   struct ide_cmd *cmd) { return 0; }
+static inline int ide_dma_prepare(ide_drive_t *drive,
+				  struct ide_cmd *cmd) { return 1; }
 static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; }
 #endif /* CONFIG_BLK_DEV_IDEDMA */
 
-- 
cgit v1.2.3


From a6d67ffa7dfe9515d8f2051a76b14c82b748475a Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:20 +0200
Subject: ns87415: use custom ->dma_{start,end} to handle
 ns87415_prepare_drive()

Use custom ->dma_{start,end} methods to handle ns87415_prepare_drive()
there instead of in ->dma_setup method.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ns87415.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
index 9ad71a74f93..6e0f372d2f0 100644
--- a/drivers/ide/ns87415.c
+++ b/drivers/ide/ns87415.c
@@ -196,6 +196,12 @@ static void ns87415_selectproc (ide_drive_t *drive)
 			      !!(drive->dev_flags & IDE_DFLAG_USING_DMA));
 }
 
+static void ns87415_dma_start(ide_drive_t *drive)
+{
+	ns87415_prepare_drive(drive, 1);
+	ide_dma_start(drive);
+}
+
 static int ns87415_dma_end(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
@@ -210,19 +216,11 @@ static int ns87415_dma_end(ide_drive_t *drive)
 	/* from ERRATA: clear the INTR & ERROR bits */
 	dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD);
 	outb(dma_cmd | 6, hwif->dma_base + ATA_DMA_CMD);
-	/* verify good DMA status */
-	return (dma_stat & 7) != 4;
-}
 
-static int ns87415_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	/* select DMA xfer */
-	ns87415_prepare_drive(drive, 1);
-	if (ide_dma_setup(drive, cmd) == 0)
-		return 0;
-	/* DMA failed: select PIO xfer */
 	ns87415_prepare_drive(drive, 0);
-	return 1;
+
+	/* verify good DMA status */
+	return (dma_stat & 7) != 4;
 }
 
 static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif)
@@ -298,8 +296,8 @@ static const struct ide_port_ops ns87415_port_ops = {
 
 static const struct ide_dma_ops ns87415_dma_ops = {
 	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ns87415_dma_setup,
-	.dma_start		= ide_dma_start,
+	.dma_setup		= ide_dma_setup,
+	.dma_start		= ns87415_dma_start,
 	.dma_end		= ns87415_dma_end,
 	.dma_test_irq		= ide_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
-- 
cgit v1.2.3


From 7526efaafdc835b8d6b22aa1a302e14651373908 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:21 +0200
Subject: trm290: use custom ->dma_{start,end} to handle trm290_prepare_drive()

Use custom ->dma_{start,end} methods to handle trm290_prepare_drive()
there instead of in ->dma_setup method.

There should be no functional changes caused by this patch
(DMA support is disabled currently in trm290.c).

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/trm290.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c
index 8dd3d822687..b91bb709af4 100644
--- a/drivers/ide/trm290.c
+++ b/drivers/ide/trm290.c
@@ -184,7 +184,6 @@ static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->tf_flags & IDE_TFLAG_WRITE) {
 #ifdef TRM290_NO_DMA_WRITES
 		/* always use PIO for writes */
-		trm290_prepare_drive(drive, 0);	/* select PIO xfer */
 		return 1;
 #endif
 		rw = 1;
@@ -195,11 +194,8 @@ static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (count == 0) {
 		ide_map_sg(drive, cmd);
 		/* try PIO instead of DMA */
-		trm290_prepare_drive(drive, 0); /* select PIO xfer */
 		return 1;
 	}
-	/* select DMA xfer */
-	trm290_prepare_drive(drive, 1);
 	outl(hwif->dmatable_dma | rw, hwif->dma_base);
 	drive->waiting_for_dma = 1;
 	/* start DMA */
@@ -209,6 +205,7 @@ static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 
 static void trm290_dma_start(ide_drive_t *drive)
 {
+	trm290_prepare_drive(drive, 1);
 }
 
 static int trm290_dma_end(ide_drive_t *drive)
@@ -219,6 +216,8 @@ static int trm290_dma_end(ide_drive_t *drive)
 
 	status = inw(drive->hwif->dma_base + 2);
 
+	trm290_prepare_drive(drive, 0);
+
 	return status != 0x00ff;
 }
 
-- 
cgit v1.2.3


From 8a4a5738ba499083cf4c5668895efe220b1946d3 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:21 +0200
Subject: ide: add ->dma_check method

* Add (an optional) ->dma_check method for checking if DMA can be
  used for a given command and fail DMA setup in ide_dma_prepare()
  if necessary.

* Convert alim15x3 and trm290 host drivers to use ->dma_check.

* Rename ali15x3_dma_setup() to ali_dma_check() while at it.

There should be no functional changes caused by this patch.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/alim15x3.c |  9 +++++----
 drivers/ide/ide-dma.c  |  5 ++++-
 drivers/ide/trm290.c   | 17 ++++++++++-------
 include/linux/ide.h    |  1 +
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
index d3faf0b97f4..537da1cde16 100644
--- a/drivers/ide/alim15x3.c
+++ b/drivers/ide/alim15x3.c
@@ -189,20 +189,20 @@ static void ali_set_dma_mode(ide_drive_t *drive, const u8 speed)
 }
 
 /**
- *	ali15x3_dma_setup	-	begin a DMA phase
+ *	ali_dma_check	-	DMA check
  *	@drive:	target device
  *	@cmd: command
  *
  *	Returns 1 if the DMA cannot be performed, zero on success.
  */
 
-static int ali15x3_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
+static int ali_dma_check(ide_drive_t *drive, struct ide_cmd *cmd)
 {
 	if (m5229_revision < 0xC2 && drive->media != ide_disk) {
 		if (cmd->tf_flags & IDE_TFLAG_WRITE)
 			return 1;	/* try PIO instead of DMA */
 	}
-	return ide_dma_setup(drive, cmd);
+	return 0;
 }
 
 /**
@@ -503,11 +503,12 @@ static const struct ide_port_ops ali_port_ops = {
 
 static const struct ide_dma_ops ali_dma_ops = {
 	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ali15x3_dma_setup,
+	.dma_setup		= ide_dma_setup,
 	.dma_start		= ide_dma_start,
 	.dma_end		= ide_dma_end,
 	.dma_test_irq		= ide_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
+	.dma_check		= ali_dma_check,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index cf5897f5533..c0505e2dfc2 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -566,9 +566,12 @@ EXPORT_SYMBOL_GPL(ide_allocate_dma_engine);
 
 int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd)
 {
+	const struct ide_dma_ops *dma_ops = drive->hwif->dma_ops;
+
 	if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 ||
+	    (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)) ||
 	    ide_build_sglist(drive, cmd) == 0 ||
-	    drive->hwif->dma_ops->dma_setup(drive, cmd))
+	    dma_ops->dma_setup(drive, cmd))
 		return 1;
 	return 0;
 }
diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c
index b91bb709af4..1076efd050d 100644
--- a/drivers/ide/trm290.c
+++ b/drivers/ide/trm290.c
@@ -176,19 +176,21 @@ static void trm290_selectproc (ide_drive_t *drive)
 	trm290_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA));
 }
 
-static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
+static int trm290_dma_check(ide_drive_t *drive, struct ide_cmd *cmd)
 {
-	ide_hwif_t *hwif = drive->hwif;
-	unsigned int count, rw;
-
 	if (cmd->tf_flags & IDE_TFLAG_WRITE) {
 #ifdef TRM290_NO_DMA_WRITES
 		/* always use PIO for writes */
 		return 1;
 #endif
-		rw = 1;
-	} else
-		rw = 2;
+	}
+	return 0;
+}
+
+static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	unsigned int count, rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 1 : 2;
 
 	count = ide_build_dmatable(drive, cmd);
 	if (count == 0) {
@@ -312,6 +314,7 @@ static struct ide_dma_ops trm290_dma_ops = {
 	.dma_end		= trm290_dma_end,
 	.dma_test_irq		= trm290_dma_test_irq,
 	.dma_lost_irq		= ide_dma_lost_irq,
+	.dma_check		= trm290_dma_check,
 };
 
 static const struct ide_port_info trm290_chipset __devinitdata = {
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 78892e2a432..b350667b83a 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -717,6 +717,7 @@ struct ide_dma_ops {
 	int	(*dma_test_irq)(struct ide_drive_s *);
 	void	(*dma_lost_irq)(struct ide_drive_s *);
 	/* below ones are optional */
+	int	(*dma_check)(struct ide_drive_s *, struct ide_cmd *);
 	int	(*dma_timer_expiry)(struct ide_drive_s *);
 	void	(*dma_clear)(struct ide_drive_s *);
 	/*
-- 
cgit v1.2.3


From 11998b316173f814698f9037ce9179d634d1f423 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:21 +0200
Subject: ide: move ide_map_sg() call out of ->dma_setup method (take 2)

Move ide_map_sg() call from ->dma_setup implementations and
ide_destroy_dmatable() one from *_build_dmatable() to ide_dma_prepare().

There should be no functional changes caused by this patch.

Sergei:
Removed 'use_pio_instead' labels and replaced 'goto' with 'return 0' --
that required no changes to the follow-up patches...

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/au1xxx-ide.c  |  9 ++-------
 drivers/ide/ide-dma-sff.c |  2 --
 drivers/ide/ide-dma.c     |  8 ++++++--
 drivers/ide/pmac.c        | 11 +++--------
 drivers/ide/scc_pata.c    |  4 +---
 drivers/ide/sgiioc4.c     |  9 ++-------
 drivers/ide/trm290.c      |  5 ++---
 drivers/ide/tx4939ide.c   |  6 +-----
 8 files changed, 17 insertions(+), 37 deletions(-)

diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c
index ba2a211758a..239643806b9 100644
--- a/drivers/ide/au1xxx-ide.c
+++ b/drivers/ide/au1xxx-ide.c
@@ -236,7 +236,7 @@ static int auide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd)
 			if (++count >= PRD_ENTRIES) {
 				printk(KERN_WARNING "%s: DMA table too small\n",
 				       drive->name);
-				goto use_pio_instead;
+				return 0;
 			}
 
 			/* Lets enable intr for the last descriptor only */
@@ -272,9 +272,6 @@ static int auide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (count)
 		return 1;
 
- use_pio_instead:
-	ide_destroy_dmatable(drive);
-
 	return 0; /* revert to PIO for this request */
 }
 
@@ -290,10 +287,8 @@ static void auide_dma_start(ide_drive_t *drive )
 
 static int auide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 {
-	if (auide_build_dmatable(drive, cmd) == 0) {
-		ide_map_sg(drive, cmd);
+	if (auide_build_dmatable(drive, cmd) == 0)
 		return 1;
-	}
 
 	drive->waiting_for_dma = 1;
 	return 0;
diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c
index f8adbb5eb33..e10054b827a 100644
--- a/drivers/ide/ide-dma-sff.c
+++ b/drivers/ide/ide-dma-sff.c
@@ -166,8 +166,6 @@ use_pio_instead:
 	printk(KERN_ERR "%s: %s\n", drive->name,
 		count ? "DMA table too small" : "empty DMA table?");
 
-	ide_destroy_dmatable(drive);
-
 	return 0; /* revert to PIO for this request */
 }
 EXPORT_SYMBOL_GPL(ide_build_dmatable);
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index c0505e2dfc2..d61f9a8cc18 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -570,8 +570,12 @@ int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd)
 
 	if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 ||
 	    (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)) ||
-	    ide_build_sglist(drive, cmd) == 0 ||
-	    dma_ops->dma_setup(drive, cmd))
+	    ide_build_sglist(drive, cmd) == 0)
 		return 1;
+	if (dma_ops->dma_setup(drive, cmd)) {
+		ide_destroy_dmatable(drive);
+		ide_map_sg(drive, cmd);
+		return 1;
+	}
 	return 0;
 }
diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c
index 5643a8b957b..1df7b763645 100644
--- a/drivers/ide/pmac.c
+++ b/drivers/ide/pmac.c
@@ -1455,7 +1455,7 @@ static int pmac_ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd)
 				       "switching to PIO on Ohare chipset\n", drive->name);
 				pmif->broken_dma_warn = 1;
 			}
-			goto use_pio_instead;
+			return 0;
 		}
 		while (cur_len) {
 			unsigned int tc = (cur_len < 0xfe00)? cur_len: 0xfe00;
@@ -1463,7 +1463,7 @@ static int pmac_ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd)
 			if (count++ >= MAX_DCMDS) {
 				printk(KERN_WARNING "%s: DMA table too small\n",
 				       drive->name);
-				goto use_pio_instead;
+				return 0;
 			}
 			st_le16(&table->command, wr? OUTPUT_MORE: INPUT_MORE);
 			st_le16(&table->req_count, tc);
@@ -1492,9 +1492,6 @@ static int pmac_ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd)
 
 	printk(KERN_DEBUG "%s: empty DMA table?\n", drive->name);
 
-use_pio_instead:
-	ide_destroy_dmatable(drive);
-
 	return 0; /* revert to PIO for this request */
 }
 
@@ -1510,10 +1507,8 @@ static int pmac_ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	u8 unit = drive->dn & 1, ata4 = (pmif->kind == controller_kl_ata4);
 	u8 write = !!(cmd->tf_flags & IDE_TFLAG_WRITE);
 
-	if (pmac_ide_build_dmatable(drive, cmd) == 0) {
-		ide_map_sg(drive, cmd);
+	if (pmac_ide_build_dmatable(drive, cmd) == 0)
 		return 1;
-	}
 
 	/* Apple adds 60ns to wrDataSetup on reads */
 	if (ata4 && (pmif->timings[unit] & TR_66_UDMA_EN)) {
diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c
index 693536ebe33..2cdf51d8917 100644
--- a/drivers/ide/scc_pata.c
+++ b/drivers/ide/scc_pata.c
@@ -321,10 +321,8 @@ static int scc_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	u8 dma_stat;
 
 	/* fall back to pio! */
-	if (ide_build_dmatable(drive, cmd) == 0) {
-		ide_map_sg(drive, cmd);
+	if (ide_build_dmatable(drive, cmd) == 0)
 		return 1;
-	}
 
 	/* PRD table */
 	out_be32((void __iomem *)(hwif->dma_base + 8), hwif->dmatable_dma);
diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c
index 457a762a1f2..d90e8c5af0a 100644
--- a/drivers/ide/sgiioc4.c
+++ b/drivers/ide/sgiioc4.c
@@ -442,7 +442,7 @@ static int sgiioc4_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd)
 				printk(KERN_WARNING
 				       "%s: DMA table too small\n",
 				       drive->name);
-				goto use_pio_instead;
+				return 0;
 			} else {
 				u32 bcount =
 				    0x10000 - (cur_addr & 0xffff);
@@ -477,9 +477,6 @@ static int sgiioc4_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd)
 		return count;
 	}
 
-use_pio_instead:
-	ide_destroy_dmatable(drive);
-
 	return 0;		/* revert to PIO for this request */
 }
 
@@ -488,11 +485,9 @@ static int sgiioc4_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	int ddir;
 	u8 write = !!(cmd->tf_flags & IDE_TFLAG_WRITE);
 
-	if (sgiioc4_build_dmatable(drive, cmd) == 0) {
+	if (sgiioc4_build_dmatable(drive, cmd) == 0)
 		/* try PIO instead of DMA */
-		ide_map_sg(drive, cmd);
 		return 1;
-	}
 
 	if (write)
 		/* Writes TO the IOC4 FROM Main Memory */
diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c
index 1076efd050d..0be27cae27d 100644
--- a/drivers/ide/trm290.c
+++ b/drivers/ide/trm290.c
@@ -193,11 +193,10 @@ static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	unsigned int count, rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 1 : 2;
 
 	count = ide_build_dmatable(drive, cmd);
-	if (count == 0) {
-		ide_map_sg(drive, cmd);
+	if (count == 0)
 		/* try PIO instead of DMA */
 		return 1;
-	}
+
 	outl(hwif->dmatable_dma | rw, hwif->dma_base);
 	drive->waiting_for_dma = 1;
 	/* start DMA */
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index f62ced855cf..7267c46c07c 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -279,8 +279,6 @@ use_pio_instead:
 	printk(KERN_ERR "%s: %s\n", drive->name,
 		count ? "DMA table too small" : "empty DMA table?");
 
-	ide_destroy_dmatable(drive);
-
 	return 0; /* revert to PIO for this request */
 }
 #else
@@ -294,10 +292,8 @@ static int tx4939ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	u8 rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 0 : ATA_DMA_WR;
 
 	/* fall back to PIO! */
-	if (tx4939ide_build_dmatable(drive, cmd) == 0) {
-		ide_map_sg(drive, cmd);
+	if (tx4939ide_build_dmatable(drive, cmd) == 0)
 		return 1;
-	}
 
 	/* PRD table */
 	tx4939ide_writel(hwif->dmatable_dma, base, TX4939IDE_PRD_Ptr);
-- 
cgit v1.2.3


From 88b4132e101e60e8fa67996ae3072ab6b71e8500 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:22 +0200
Subject: ide: set/clear drive->waiting_for_dma flag in the core code

Set/clear drive->waiting_for_dma flag in the core code
instead of in ->dma_setup and ->dma_end methods.

There should be no functional changes caused by this patch.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/au1xxx-ide.c  | 6 ------
 drivers/ide/cmd64x.c      | 1 -
 drivers/ide/icside.c      | 4 ----
 drivers/ide/ide-atapi.c   | 4 +++-
 drivers/ide/ide-cd.c      | 1 +
 drivers/ide/ide-dma-sff.c | 3 ---
 drivers/ide/ide-dma.c     | 4 ++++
 drivers/ide/ns87415.c     | 1 -
 drivers/ide/pmac.c        | 3 ---
 drivers/ide/sc1200.c      | 2 --
 drivers/ide/scc_pata.c    | 3 +--
 drivers/ide/sgiioc4.c     | 3 ---
 drivers/ide/trm290.c      | 8 ++------
 drivers/ide/tx4939ide.c   | 4 ----
 14 files changed, 11 insertions(+), 36 deletions(-)

diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c
index 239643806b9..549bbb2755a 100644
--- a/drivers/ide/au1xxx-ide.c
+++ b/drivers/ide/au1xxx-ide.c
@@ -290,7 +290,6 @@ static int auide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (auide_build_dmatable(drive, cmd) == 0)
 		return 1;
 
-	drive->waiting_for_dma = 1;
 	return 0;
 }
 
@@ -315,16 +314,11 @@ static void auide_dma_host_set(ide_drive_t *drive, int on)
 
 static void auide_ddma_tx_callback(int irq, void *param)
 {
-	_auide_hwif *ahwif = (_auide_hwif*)param;
-	ahwif->drive->waiting_for_dma = 0;
 }
 
 static void auide_ddma_rx_callback(int irq, void *param)
 {
-	_auide_hwif *ahwif = (_auide_hwif*)param;
-	ahwif->drive->waiting_for_dma = 0;
 }
-
 #endif /* end CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA */
 
 static void auide_init_dbdma_dev(dbdev_tab_t *dev, u32 dev_id, u32 tsize, u32 devwidth, u32 flags)
diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c
index f2edf280ef8..80b777e4247 100644
--- a/drivers/ide/cmd64x.c
+++ b/drivers/ide/cmd64x.c
@@ -318,7 +318,6 @@ static int cmd646_1_dma_end(ide_drive_t *drive)
 	ide_hwif_t *hwif = drive->hwif;
 	u8 dma_stat = 0, dma_cmd = 0;
 
-	drive->waiting_for_dma = 0;
 	/* get DMA status */
 	dma_stat = inb(hwif->dma_base + ATA_DMA_STATUS);
 	/* read DMA command state */
diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c
index 9bf57d7c8e5..4e16ce68b06 100644
--- a/drivers/ide/icside.c
+++ b/drivers/ide/icside.c
@@ -287,8 +287,6 @@ static int icside_dma_end(ide_drive_t *drive)
 	ide_hwif_t *hwif = drive->hwif;
 	struct expansion_card *ec = ECARD_DEV(hwif->dev);
 
-	drive->waiting_for_dma = 0;
-
 	disable_dma(ec->dma);
 
 	return get_dma_residue(ec->dma) != 0;
@@ -343,8 +341,6 @@ static int icside_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	set_dma_sg(ec->dma, hwif->sg_table, cmd->sg_nents);
 	set_dma_mode(ec->dma, dma_mode);
 
-	drive->waiting_for_dma = 1;
-
 	return 0;
 }
 
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 89d2339bdef..db6e617790b 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -342,8 +342,10 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 	stat = tp_ops->read_status(hwif);
 
 	if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) {
-		int rc = hwif->dma_ops->dma_end(drive);
+		int rc;
 
+		drive->waiting_for_dma = 0;
+		rc = hwif->dma_ops->dma_end(drive);
 		ide_destroy_dmatable(drive);
 
 		if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) {
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 4a0d66ee954..f5c7bb739f4 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -638,6 +638,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	dma = drive->dma;
 	if (dma) {
 		drive->dma = 0;
+		drive->waiting_for_dma = 0;
 		dma_error = hwif->dma_ops->dma_end(drive);
 		ide_destroy_dmatable(drive);
 		if (dma_error) {
diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c
index e10054b827a..f8c7d0cd2ff 100644
--- a/drivers/ide/ide-dma-sff.c
+++ b/drivers/ide/ide-dma-sff.c
@@ -216,7 +216,6 @@ int ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	/* clear INTR & ERROR flags */
 	ide_dma_sff_write_status(hwif, dma_stat | ATA_DMA_ERR | ATA_DMA_INTR);
 
-	drive->waiting_for_dma = 1;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ide_dma_setup);
@@ -290,8 +289,6 @@ int ide_dma_end(ide_drive_t *drive)
 	ide_hwif_t *hwif = drive->hwif;
 	u8 dma_stat = 0, dma_cmd = 0, mask;
 
-	drive->waiting_for_dma = 0;
-
 	/* stop DMA */
 	if (hwif->host_flags & IDE_HFLAG_MMIO) {
 		dma_cmd = readb((void __iomem *)(hwif->dma_base + ATA_DMA_CMD));
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index d61f9a8cc18..4d3102887d9 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -91,6 +91,7 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive)
 	ide_hwif_t *hwif = drive->hwif;
 	u8 stat = 0, dma_stat = 0;
 
+	drive->waiting_for_dma = 0;
 	dma_stat = hwif->dma_ops->dma_end(drive);
 	ide_destroy_dmatable(drive);
 	stat = hwif->tp_ops->read_status(hwif);
@@ -479,6 +480,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 
 	if (error < 0) {
 		printk(KERN_WARNING "%s: DMA timeout error\n", drive->name);
+		drive->waiting_for_dma = 0;
 		(void)dma_ops->dma_end(drive);
 		ide_destroy_dmatable(drive);
 		ret = ide_error(drive, "dma timeout error",
@@ -491,6 +493,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 		if (dma_ops->dma_test_irq(drive) == 0) {
 			ide_dump_status(drive, "DMA timeout",
 					hwif->tp_ops->read_status(hwif));
+			drive->waiting_for_dma = 0;
 			(void)dma_ops->dma_end(drive);
 			ide_destroy_dmatable(drive);
 		}
@@ -577,5 +580,6 @@ int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd)
 		ide_map_sg(drive, cmd);
 		return 1;
 	}
+	drive->waiting_for_dma = 1;
 	return 0;
 }
diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
index 6e0f372d2f0..cbc1d111038 100644
--- a/drivers/ide/ns87415.c
+++ b/drivers/ide/ns87415.c
@@ -207,7 +207,6 @@ static int ns87415_dma_end(ide_drive_t *drive)
 	ide_hwif_t *hwif = drive->hwif;
 	u8 dma_stat = 0, dma_cmd = 0;
 
-	drive->waiting_for_dma = 0;
 	dma_stat = hwif->dma_ops->dma_sff_read_status(hwif);
 	/* get DMA command mode */
 	dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD);
diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c
index 1df7b763645..879c3d8d9f3 100644
--- a/drivers/ide/pmac.c
+++ b/drivers/ide/pmac.c
@@ -1517,8 +1517,6 @@ static int pmac_ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 		(void)readl(PMAC_IDE_REG(IDE_TIMING_CONFIG));
 	}
 
-	drive->waiting_for_dma = 1;
-
 	return 0;
 }
 
@@ -1553,7 +1551,6 @@ pmac_ide_dma_end (ide_drive_t *drive)
 	volatile struct dbdma_regs __iomem *dma = pmif->dma_regs;
 	u32 dstat;
 
-	drive->waiting_for_dma = 0;
 	dstat = readl(&dma->status);
 	writel(((RUN|WAKE|DEAD) << 16), &dma->control);
 
diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c
index d9c47034bed..13e3988f00f 100644
--- a/drivers/ide/sc1200.c
+++ b/drivers/ide/sc1200.c
@@ -183,8 +183,6 @@ static int sc1200_dma_end(ide_drive_t *drive)
 	outb(dma_stat|0x1b, dma_base+2);	/* clear the INTR & ERROR bits */
 	outb(inb(dma_base)&~1, dma_base);	/* !! DO THIS HERE !! stop DMA */
 
-	drive->waiting_for_dma = 0;
-
 	return (dma_stat & 7) != 4;		/* verify good DMA status */
 }
 
diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c
index 2cdf51d8917..542419ffa9b 100644
--- a/drivers/ide/scc_pata.c
+++ b/drivers/ide/scc_pata.c
@@ -335,7 +335,7 @@ static int scc_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 
 	/* clear INTR & ERROR flags */
 	out_be32((void __iomem *)(hwif->dma_base + 4), dma_stat | 6);
-	drive->waiting_for_dma = 1;
+
 	return 0;
 }
 
@@ -354,7 +354,6 @@ static int __scc_dma_end(ide_drive_t *drive)
 	ide_hwif_t *hwif = drive->hwif;
 	u8 dma_stat, dma_cmd;
 
-	drive->waiting_for_dma = 0;
 	/* get DMA command mode */
 	dma_cmd = scc_ide_inb(hwif->dma_base);
 	/* stop DMA */
diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c
index d90e8c5af0a..cb2657c4c97 100644
--- a/drivers/ide/sgiioc4.c
+++ b/drivers/ide/sgiioc4.c
@@ -258,8 +258,6 @@ static int sgiioc4_dma_end(ide_drive_t *drive)
 		}
 	}
 
-	drive->waiting_for_dma = 0;
-
 	return dma_stat;
 }
 
@@ -412,7 +410,6 @@ sgiioc4_configure_for_dma(int dma_direction, ide_drive_t * drive)
 	writel(ending_dma_addr, (void __iomem *)(dma_base + IOC4_DMA_END_ADDR * 4));
 
 	writel(dma_direction, (void __iomem *)ioc4_dma_addr);
-	drive->waiting_for_dma = 1;
 }
 
 /* IOC4 Scatter Gather list Format 					 */
diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c
index 0be27cae27d..c0528f27fca 100644
--- a/drivers/ide/trm290.c
+++ b/drivers/ide/trm290.c
@@ -198,9 +198,9 @@ static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 		return 1;
 
 	outl(hwif->dmatable_dma | rw, hwif->dma_base);
-	drive->waiting_for_dma = 1;
 	/* start DMA */
 	outw(count * 2 - 1, hwif->dma_base + 2);
+
 	return 0;
 }
 
@@ -211,11 +211,7 @@ static void trm290_dma_start(ide_drive_t *drive)
 
 static int trm290_dma_end(ide_drive_t *drive)
 {
-	u16 status;
-
-	drive->waiting_for_dma = 0;
-
-	status = inw(drive->hwif->dma_base + 2);
+	u16 status = inw(drive->hwif->dma_base + 2);
 
 	trm290_prepare_drive(drive, 0);
 
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index 7267c46c07c..faf0d97b9ce 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -304,8 +304,6 @@ static int tx4939ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	/* clear INTR & ERROR flags */
 	tx4939ide_clear_dma_status(base);
 
-	drive->waiting_for_dma = 1;
-
 	tx4939ide_writew(SECTOR_SIZE / 2, base, drive->dn ?
 			 TX4939IDE_Xfer_Cnt_2 : TX4939IDE_Xfer_Cnt_1);
 
@@ -321,8 +319,6 @@ static int tx4939ide_dma_end(ide_drive_t *drive)
 	void __iomem *base = TX4939IDE_BASE(hwif);
 	u16 ctl = tx4939ide_readw(base, TX4939IDE_Int_Ctl);
 
-	drive->waiting_for_dma = 0;
-
 	/* get DMA command mode */
 	dma_cmd = tx4939ide_readb(base, TX4939IDE_DMA_Cmd);
 	/* stop DMA */
-- 
cgit v1.2.3


From f094d4d83bccee9277ddb6aadccf35747889426b Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:24 +0200
Subject: ide: sanitize ide_build_sglist() and ide_destroy_dmatable()

* Move ide_map_sg() calls out from ide_build_sglist()
  to ide_dma_prepare().

* Pass command to ide_destroy_dmatable().

* Rename ide_build_sglist() to ide_dma_map_sg()
  and ide_destroy_dmatable() to ide_dma_unmap_sg().

There should be no functional changes caused by this patch.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c |  3 ++-
 drivers/ide/ide-cd.c    |  2 +-
 drivers/ide/ide-dma.c   | 50 ++++++++++++++++++++++++-------------------------
 drivers/ide/sgiioc4.c   |  7 ++++---
 include/linux/ide.h     |  6 +++---
 5 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index db6e617790b..3f3fc7c7b2f 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -326,6 +326,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 {
 	struct ide_atapi_pc *pc = drive->pc;
 	ide_hwif_t *hwif = drive->hwif;
+	struct ide_cmd *cmd = &hwif->cmd;
 	struct request *rq = hwif->rq;
 	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
 	xfer_func_t *xferfunc;
@@ -346,7 +347,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 
 		drive->waiting_for_dma = 0;
 		rc = hwif->dma_ops->dma_end(drive);
-		ide_destroy_dmatable(drive);
+		ide_dma_unmap_sg(drive, cmd);
 
 		if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) {
 			if (drive->media == ide_floppy)
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index f5c7bb739f4..35729a47f79 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -640,7 +640,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 		drive->dma = 0;
 		drive->waiting_for_dma = 0;
 		dma_error = hwif->dma_ops->dma_end(drive);
-		ide_destroy_dmatable(drive);
+		ide_dma_unmap_sg(drive, cmd);
 		if (dma_error) {
 			printk(KERN_ERR PFX "%s: DMA %s error\n", drive->name,
 					write ? "write" : "read");
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 4d3102887d9..a5612eadc30 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -89,17 +89,16 @@ static const struct drive_list_entry drive_blacklist[] = {
 ide_startstop_t ide_dma_intr(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
+	struct ide_cmd *cmd = &hwif->cmd;
 	u8 stat = 0, dma_stat = 0;
 
 	drive->waiting_for_dma = 0;
 	dma_stat = hwif->dma_ops->dma_end(drive);
-	ide_destroy_dmatable(drive);
+	ide_dma_unmap_sg(drive, cmd);
 	stat = hwif->tp_ops->read_status(hwif);
 
 	if (OK_STAT(stat, DRIVE_READY, drive->bad_wstat | ATA_DRQ)) {
 		if (!dma_stat) {
-			struct ide_cmd *cmd = &hwif->cmd;
-
 			if ((cmd->tf_flags & IDE_TFLAG_FS) == 0)
 				ide_finish_cmd(drive, cmd, stat);
 			else
@@ -119,8 +118,8 @@ int ide_dma_good_drive(ide_drive_t *drive)
 }
 
 /**
- *	ide_build_sglist	-	map IDE scatter gather for DMA I/O
- *	@drive: the drive to build the DMA table for
+ *	ide_dma_map_sg	-	map IDE scatter gather for DMA I/O
+ *	@drive: the drive to map the DMA table for
  *	@cmd: command
  *
  *	Perform the DMA mapping magic necessary to access the source or
@@ -129,23 +128,19 @@ int ide_dma_good_drive(ide_drive_t *drive)
  *	operate in a portable fashion.
  */
 
-static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd)
+static int ide_dma_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	struct scatterlist *sg = hwif->sg_table;
 	int i;
 
-	ide_map_sg(drive, cmd);
-
 	if (cmd->tf_flags & IDE_TFLAG_WRITE)
 		cmd->sg_dma_direction = DMA_TO_DEVICE;
 	else
 		cmd->sg_dma_direction = DMA_FROM_DEVICE;
 
 	i = dma_map_sg(hwif->dev, sg, cmd->sg_nents, cmd->sg_dma_direction);
-	if (i == 0)
-		ide_map_sg(drive, cmd);
-	else {
+	if (i) {
 		cmd->orig_sg_nents = cmd->sg_nents;
 		cmd->sg_nents = i;
 	}
@@ -154,7 +149,7 @@ static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd)
 }
 
 /**
- *	ide_destroy_dmatable	-	clean up DMA mapping
+ *	ide_dma_unmap_sg	-	clean up DMA mapping
  *	@drive: The drive to unmap
  *
  *	Teardown mappings after DMA has completed. This must be called
@@ -164,15 +159,14 @@ static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd)
  *	time.
  */
 
-void ide_destroy_dmatable(ide_drive_t *drive)
+void ide_dma_unmap_sg(ide_drive_t *drive, struct ide_cmd *cmd)
 {
 	ide_hwif_t *hwif = drive->hwif;
-	struct ide_cmd *cmd = &hwif->cmd;
 
 	dma_unmap_sg(hwif->dev, hwif->sg_table, cmd->orig_sg_nents,
 		     cmd->sg_dma_direction);
 }
-EXPORT_SYMBOL_GPL(ide_destroy_dmatable);
+EXPORT_SYMBOL_GPL(ide_dma_unmap_sg);
 
 /**
  *	ide_dma_off_quietly	-	Generic DMA kill
@@ -471,6 +465,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	const struct ide_dma_ops *dma_ops = hwif->dma_ops;
+	struct ide_cmd *cmd = &hwif->cmd;
 	struct request *rq;
 	ide_startstop_t ret = ide_stopped;
 
@@ -482,7 +477,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 		printk(KERN_WARNING "%s: DMA timeout error\n", drive->name);
 		drive->waiting_for_dma = 0;
 		(void)dma_ops->dma_end(drive);
-		ide_destroy_dmatable(drive);
+		ide_dma_unmap_sg(drive, cmd);
 		ret = ide_error(drive, "dma timeout error",
 				hwif->tp_ops->read_status(hwif));
 	} else {
@@ -495,7 +490,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 					hwif->tp_ops->read_status(hwif));
 			drive->waiting_for_dma = 0;
 			(void)dma_ops->dma_end(drive);
-			ide_destroy_dmatable(drive);
+			ide_dma_unmap_sg(drive, cmd);
 		}
 	}
 
@@ -572,14 +567,19 @@ int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd)
 	const struct ide_dma_ops *dma_ops = drive->hwif->dma_ops;
 
 	if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 ||
-	    (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)) ||
-	    ide_build_sglist(drive, cmd) == 0)
-		return 1;
-	if (dma_ops->dma_setup(drive, cmd)) {
-		ide_destroy_dmatable(drive);
-		ide_map_sg(drive, cmd);
-		return 1;
-	}
+	    (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)))
+		goto out;
+	ide_map_sg(drive, cmd);
+	if (ide_dma_map_sg(drive, cmd) == 0)
+		goto out_map;
+	if (dma_ops->dma_setup(drive, cmd))
+		goto out_dma_unmap;
 	drive->waiting_for_dma = 1;
 	return 0;
+out_dma_unmap:
+	ide_dma_unmap_sg(drive, cmd);
+out_map:
+	ide_map_sg(drive, cmd);
+out:
+	return 1;
 }
diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c
index cb2657c4c97..6ef5a567d37 100644
--- a/drivers/ide/sgiioc4.c
+++ b/drivers/ide/sgiioc4.c
@@ -277,11 +277,12 @@ static void sgiioc4_dma_host_set(ide_drive_t *drive, int on)
 		sgiioc4_clearirq(drive);
 }
 
-static void
-sgiioc4_resetproc(ide_drive_t * drive)
+static void sgiioc4_resetproc(ide_drive_t *drive)
 {
+	struct ide_cmd *cmd = &drive->hwif->cmd;
+
 	sgiioc4_dma_end(drive);
-	ide_destroy_dmatable(drive);
+	ide_dma_unmap_sg(drive, cmd);
 	sgiioc4_clearirq(drive);
 }
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index b350667b83a..03c520917b7 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1445,8 +1445,7 @@ int ide_allocate_dma_engine(ide_hwif_t *);
 void ide_release_dma_engine(ide_hwif_t *);
 
 int ide_dma_prepare(ide_drive_t *, struct ide_cmd *);
-
-void ide_destroy_dmatable(ide_drive_t *);
+void ide_dma_unmap_sg(ide_drive_t *, struct ide_cmd *);
 
 #ifdef CONFIG_BLK_DEV_IDEDMA_SFF
 int config_drive_for_dma(ide_drive_t *);
@@ -1481,7 +1480,8 @@ static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int erro
 static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; }
 static inline int ide_dma_prepare(ide_drive_t *drive,
 				  struct ide_cmd *cmd) { return 1; }
-static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; }
+static inline void ide_dma_unmap_sg(ide_drive_t *drive,
+				    struct ide_cmd *cmd) { ; }
 #endif /* CONFIG_BLK_DEV_IDEDMA */
 
 #ifdef CONFIG_BLK_DEV_IDEACPI
-- 
cgit v1.2.3


From 52913ab2c6f760c2af9f9396765ce8fa1a2baf17 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:24 +0200
Subject: ide-generic: remove no longer needed sysfs interface

Nowadays we have "ide_generic.probe_mask=" module parameter
and ide_platform host driver so sysfs interface for adding
IDE interfaces is no longer needed.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-generic.c | 68 +----------------------------------------------
 1 file changed, 1 insertion(+), 67 deletions(-)

diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c
index 9d03e821153..3a93e4c41bf 100644
--- a/drivers/ide/ide-generic.c
+++ b/drivers/ide/ide-generic.c
@@ -1,20 +1,12 @@
 /*
  * generic/default IDE host driver
  *
- * Copyright (C) 2004, 2008 Bartlomiej Zolnierkiewicz
+ * Copyright (C) 2004, 2008-2009 Bartlomiej Zolnierkiewicz
  * This code was split off from ide.c.  See it for original copyrights.
  *
  * May be copied or modified under the terms of the GNU General Public License.
  */
 
-/*
- * For special cases new interfaces may be added using sysfs, i.e.
- *
- *	echo -n "0x168:0x36e:10" > /sys/class/ide_generic/add
- *
- * will add an interface using I/O ports 0x168-0x16f/0x36e and IRQ 10.
- */
-
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -36,60 +28,6 @@ static const struct ide_port_info ide_generic_port_info = {
 	.host_flags		= IDE_HFLAG_NO_DMA,
 };
 
-static ssize_t store_add(struct class *cls, const char *buf, size_t n)
-{
-	unsigned int base, ctl;
-	int irq, rc;
-	hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
-
-	if (sscanf(buf, "%x:%x:%d", &base, &ctl, &irq) != 3)
-		return -EINVAL;
-
-	memset(&hw, 0, sizeof(hw));
-	ide_std_init_ports(&hw, base, ctl);
-	hw.irq = irq;
-	hw.chipset = ide_generic;
-
-	rc = ide_host_add(&ide_generic_port_info, hws, NULL);
-	if (rc)
-		return rc;
-
-	return n;
-};
-
-static struct class_attribute ide_generic_class_attrs[] = {
-	__ATTR(add, S_IWUSR, NULL, store_add),
-	__ATTR_NULL
-};
-
-static void ide_generic_class_release(struct class *cls)
-{
-	kfree(cls);
-}
-
-static int __init ide_generic_sysfs_init(void)
-{
-	struct class *cls;
-	int rc;
-
-	cls = kzalloc(sizeof(*cls), GFP_KERNEL);
-	if (!cls)
-		return -ENOMEM;
-
-	cls->name = DRV_NAME;
-	cls->owner = THIS_MODULE;
-	cls->class_release = ide_generic_class_release;
-	cls->class_attrs = ide_generic_class_attrs;
-
-	rc = class_register(cls);
-	if (rc) {
-		kfree(cls);
-		return rc;
-	}
-
-	return 0;
-}
-
 #if defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_MAPPI2) \
 	|| defined(CONFIG_PLAT_OPSPUT)
 static const u16 legacy_bases[] = { 0x1f0 };
@@ -196,10 +134,6 @@ static int __init ide_generic_init(void)
 		}
 	}
 
-	if (ide_generic_sysfs_init())
-		printk(KERN_ERR DRV_NAME ": failed to create ide_generic "
-					 "class\n");
-
 	return rc;
 }
 
-- 
cgit v1.2.3


From 4465461ece2b9249d6c0cf57bc0002100823e361 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:24 +0200
Subject: ide: merge ide_arm and ide_generic host drivers

There is no need for a separate ide_arm host driver nowadays
so merge it into ide_generic one.

While at it:
- return -EBUSY from ide_generic_init() if I/O resources are busy
- scale down ide_generic_check_pci_legacy_iobases() for CONFIG_PCI=n

Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Alexander Schulz <alex@shark-linux.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/Kconfig       |  8 ++-----
 drivers/ide/Makefile      |  2 --
 drivers/ide/ide-generic.c | 18 +++++++++++-----
 drivers/ide/ide_arm.c     | 53 -----------------------------------------------
 4 files changed, 15 insertions(+), 66 deletions(-)
 delete mode 100644 drivers/ide/ide_arm.c

diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index 640c9920724..896424445f6 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -222,7 +222,8 @@ comment "IDE chipset support/bugfixes"
 
 config IDE_GENERIC
 	tristate "generic/default IDE chipset support"
-	depends on ALPHA || X86 || IA64 || M32R || MIPS
+	depends on ALPHA || X86 || IA64 || M32R || MIPS || ARCH_RPC || ARCH_SHARK
+	default ARM && (ARCH_RPC || ARCH_SHARK)
 	help
 	  This is the generic IDE driver.  This driver attaches to the
 	  fixed legacy ports (e.g. on PCs 0x1f0/0x170, 0x1e8/0x168 and
@@ -731,11 +732,6 @@ config BLK_DEV_IDE_AT91
 	depends on ARM && ARCH_AT91 && !ARCH_AT91RM9200 && !ARCH_AT91X40
 	select IDE_TIMINGS
 
-config IDE_ARM
-	tristate "ARM IDE support"
-	depends on ARM && (ARCH_RPC || ARCH_SHARK)
-	default y
-
 config BLK_DEV_IDE_ICSIDE
 	tristate "ICS IDE interface support"
 	depends on ARM && ARCH_ACORN
diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile
index 9b4bbe1cdc1..81df925f0e8 100644
--- a/drivers/ide/Makefile
+++ b/drivers/ide/Makefile
@@ -21,8 +21,6 @@ ide-core-$(CONFIG_IDE_LEGACY)		+= ide-legacy.o
 
 obj-$(CONFIG_IDE)			+= ide-core.o
 
-obj-$(CONFIG_IDE_ARM)			+= ide_arm.o
-
 obj-$(CONFIG_BLK_DEV_ALI14XX)		+= ali14xx.o
 obj-$(CONFIG_BLK_DEV_UMC8672)		+= umc8672.o
 obj-$(CONFIG_BLK_DEV_DTC2278)		+= dtc2278.o
diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c
index 3a93e4c41bf..7812ca0be13 100644
--- a/drivers/ide/ide-generic.c
+++ b/drivers/ide/ide-generic.c
@@ -13,7 +13,10 @@
 #include <linux/ide.h>
 #include <linux/pci_ids.h>
 
-/* FIXME: convert m32r to use ide_platform host driver */
+/* FIXME: convert arm and m32r to use ide_platform host driver */
+#ifdef CONFIG_ARM
+#include <asm/irq.h>
+#endif
 #ifdef CONFIG_M32R
 #include <asm/m32r.h>
 #endif
@@ -28,8 +31,11 @@ static const struct ide_port_info ide_generic_port_info = {
 	.host_flags		= IDE_HFLAG_NO_DMA,
 };
 
-#if defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_MAPPI2) \
-	|| defined(CONFIG_PLAT_OPSPUT)
+#ifdef CONFIG_ARM
+static const u16 legacy_bases[] = { 0x1f0 };
+static const int legacy_irqs[]  = { IRQ_HARDDISK };
+#elif defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_MAPPI2) || \
+      defined(CONFIG_PLAT_OPSPUT)
 static const u16 legacy_bases[] = { 0x1f0 };
 static const int legacy_irqs[]  = { PLD_IRQ_CFIREQ };
 #elif defined(CONFIG_PLAT_MAPPI3)
@@ -45,11 +51,11 @@ static const int legacy_irqs[]  = { 14, 15, 11, 10, 8, 12 };
 
 static void ide_generic_check_pci_legacy_iobases(int *primary, int *secondary)
 {
+#ifdef CONFIG_PCI
 	struct pci_dev *p = NULL;
 	u16 val;
 
 	for_each_pci_dev(p) {
-
 		if (pci_resource_start(p, 0) == 0x1f0)
 			*primary = 1;
 		if (pci_resource_start(p, 2) == 0x170)
@@ -64,7 +70,6 @@ static void ide_generic_check_pci_legacy_iobases(int *primary, int *secondary)
 		/* Intel MPIIX - PIO ATA on non PCI side of bridge */
 		if (p->vendor == PCI_VENDOR_ID_INTEL &&
 		    p->device == PCI_DEVICE_ID_INTEL_82371MX) {
-
 			pci_read_config_word(p, 0x6C, &val);
 			if (val & 0x8000) {
 				/* ATA port enabled */
@@ -75,6 +80,7 @@ static void ide_generic_check_pci_legacy_iobases(int *primary, int *secondary)
 			}
 		}
 	}
+#endif
 }
 
 static int __init ide_generic_init(void)
@@ -106,6 +112,7 @@ static int __init ide_generic_init(void)
 				printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX "
 						"not free.\n",
 						DRV_NAME, io_addr, io_addr + 7);
+				rc = -EBUSY;
 				continue;
 			}
 
@@ -114,6 +121,7 @@ static int __init ide_generic_init(void)
 						"not free.\n",
 						DRV_NAME, io_addr + 0x206);
 				release_region(io_addr, 8);
+				rc = -EBUSY;
 				continue;
 			}
 
diff --git a/drivers/ide/ide_arm.c b/drivers/ide/ide_arm.c
deleted file mode 100644
index cf6385446ec..00000000000
--- a/drivers/ide/ide_arm.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * ARM default IDE host driver
- *
- * Copyright (C) 2004 Bartlomiej Zolnierkiewicz
- * Based on code by: Russell King, Ian Molton and Alexander Schulz.
- *
- * May be copied or modified under the terms of the GNU General Public License.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/ide.h>
-
-#include <asm/irq.h>
-
-#define DRV_NAME "ide_arm"
-
-#define IDE_ARM_IO	0x1f0
-#define IDE_ARM_IRQ	IRQ_HARDDISK
-
-static const struct ide_port_info ide_arm_port_info = {
-	.host_flags		= IDE_HFLAG_NO_DMA,
-};
-
-static int __init ide_arm_init(void)
-{
-	unsigned long base = IDE_ARM_IO, ctl = IDE_ARM_IO + 0x206;
-	hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL };
-
-	if (!request_region(base, 8, DRV_NAME)) {
-		printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX not free.\n",
-				DRV_NAME, base, base + 7);
-		return -EBUSY;
-	}
-
-	if (!request_region(ctl, 1, DRV_NAME)) {
-		printk(KERN_ERR "%s: I/O resource 0x%lX not free.\n",
-				DRV_NAME, ctl);
-		release_region(base, 8);
-		return -EBUSY;
-	}
-
-	memset(&hw, 0, sizeof(hw));
-	ide_std_init_ports(&hw, base, ctl);
-	hw.irq = IDE_ARM_IRQ;
-	hw.chipset = ide_generic;
-
-	return ide_host_add(&ide_arm_port_info, hws, NULL);
-}
-
-module_init(ide_arm_init);
-
-MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From b5479167f4206e0d821a51ae149d921cd7a58e54 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:25 +0200
Subject: ide: fix locking in drive_release_dev()

* Request queue cleanup should happen before freeing drive->id
  and marking device as non-present.  Fix it.

* Remove superfluous hwif->lock acquiring/releasing.

Cc: Stanislaw Gruszka <stf_xl@wp.pl>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-probe.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 548864510ba..7c1f1bf8183 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -942,20 +942,16 @@ EXPORT_SYMBOL_GPL(ide_init_disk);
 static void drive_release_dev (struct device *dev)
 {
 	ide_drive_t *drive = container_of(dev, ide_drive_t, gendev);
-	ide_hwif_t *hwif = drive->hwif;
 
 	ide_proc_unregister_device(drive);
 
-	spin_lock_irq(&hwif->lock);
+	blk_cleanup_queue(drive->queue);
+	drive->queue = NULL;
+
 	kfree(drive->id);
 	drive->id = NULL;
+
 	drive->dev_flags &= ~IDE_DFLAG_PRESENT;
-	/* Messed up locking ... */
-	spin_unlock_irq(&hwif->lock);
-	blk_cleanup_queue(drive->queue);
-	spin_lock_irq(&hwif->lock);
-	drive->queue = NULL;
-	spin_unlock_irq(&hwif->lock);
 
 	complete(&drive->gendev_rel_comp);
 }
-- 
cgit v1.2.3


From 41fa9f863baacd32dd049daf8050d55a0c9e6f1a Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:25 +0200
Subject: ide: decrease size of ->pc_buf field in struct ide_atapi_pc

struct ide_atapi_pc is often allocated on the stack and size of ->pc_buf
size is 256 bytes.  However since only ide_floppy_create_read_capacity_cmd()
and idetape_create_inquiry_cmd() require such size allocate buffers for
these pc-s explicitely and decrease ->pc_buf size to 64 bytes.

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-floppy.c       | 5 ++++-
 drivers/ide/ide-floppy_ioctl.c | 5 ++++-
 drivers/ide/ide-tape.c         | 4 ++++
 include/linux/ide.h            | 2 +-
 4 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 7ae66233483..0faae309829 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -385,7 +385,7 @@ static int ide_floppy_get_capacity(ide_drive_t *drive)
 	struct gendisk *disk = floppy->disk;
 	struct ide_atapi_pc pc;
 	u8 *cap_desc;
-	u8 header_len, desc_cnt;
+	u8 pc_buf[256], header_len, desc_cnt;
 	int i, rc = 1, blocks, length;
 
 	drive->bios_cyl = 0;
@@ -395,6 +395,9 @@ static int ide_floppy_get_capacity(ide_drive_t *drive)
 	drive->capacity64 = 0;
 
 	ide_floppy_create_read_capacity_cmd(&pc);
+	pc.buf = &pc_buf[0];
+	pc.buf_size = sizeof(pc_buf);
+
 	if (ide_queue_pc_tail(drive, disk, &pc)) {
 		printk(KERN_ERR PFX "Can't get floppy parameters\n");
 		return 1;
diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c
index 8f8be854603..cd8a42027ed 100644
--- a/drivers/ide/ide-floppy_ioctl.c
+++ b/drivers/ide/ide-floppy_ioctl.c
@@ -36,9 +36,9 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive,
 					    int __user *arg)
 {
 	struct ide_disk_obj *floppy = drive->driver_data;
-	u8 header_len, desc_cnt;
 	int i, blocks, length, u_array_size, u_index;
 	int __user *argp;
+	u8 pc_buf[256], header_len, desc_cnt;
 
 	if (get_user(u_array_size, arg))
 		return -EFAULT;
@@ -47,6 +47,9 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive,
 		return -EINVAL;
 
 	ide_floppy_create_read_capacity_cmd(pc);
+	pc->buf = &pc_buf[0];
+	pc->buf_size = sizeof(pc_buf);
+
 	if (ide_queue_pc_tail(drive, floppy->disk, pc)) {
 		printk(KERN_ERR "ide-floppy: Can't get floppy parameters\n");
 		return -EIO;
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 64dfa7458f8..cafc67d9e2e 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -2014,9 +2014,13 @@ static void idetape_get_inquiry_results(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	struct ide_atapi_pc pc;
+	u8 pc_buf[256];
 	char fw_rev[4], vendor_id[8], product_id[16];
 
 	idetape_create_inquiry_cmd(&pc);
+	pc.buf = &pc_buf[0];
+	pc.buf_size = sizeof(pc_buf);
+
 	if (ide_queue_pc_tail(drive, tape->disk, &pc)) {
 		printk(KERN_ERR "ide-tape: %s: can't get INQUIRY results\n",
 				tape->name);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 03c520917b7..0f48fbd4602 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -377,7 +377,7 @@ enum {
  * With each packet command, we allocate a buffer of IDE_PC_BUFFER_SIZE bytes.
  * This is used for several packet commands (not for READ/WRITE commands).
  */
-#define IDE_PC_BUFFER_SIZE	256
+#define IDE_PC_BUFFER_SIZE	64
 #define ATAPI_WAIT_PC		(60 * HZ)
 
 struct ide_atapi_pc {
-- 
cgit v1.2.3


From 9f5af4d667a6d4ebd66019b4b26b445ddbae6d6c Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:26 +0200
Subject: ide: remove CONFIG_BLK_DEV_IDEDOUBLER config option

Nowadays it is not worth having a separate config option for
Amiga IDE Doubler support so always include it (it still needs
to be explicitly enabled by module parameter).

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/Kconfig | 23 ++++++++---------------
 drivers/ide/gayle.c | 12 +-----------
 2 files changed, 9 insertions(+), 26 deletions(-)

diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index 896424445f6..4d2f2a6b5bd 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -770,27 +770,20 @@ config BLK_DEV_GAYLE
 	  This includes on-board IDE interfaces on some Amiga models (A600,
 	  A1200, A4000, and A4000T), and IDE interfaces on the Zorro expansion
 	  bus (M-Tech E-Matrix 530 expansion card).
-	  Say Y if you have an Amiga with a Gayle IDE interface and want to use
-	  IDE devices (hard disks, CD-ROM drives, etc.) that are connected to
-	  it.
-	  Note that you also have to enable Zorro bus support if you want to
-	  use Gayle IDE interfaces on the Zorro expansion bus.
 
-config BLK_DEV_IDEDOUBLER
-	bool "Amiga IDE Doubler support (EXPERIMENTAL)"
-	depends on BLK_DEV_GAYLE && EXPERIMENTAL
-	---help---
-	  This feature provides support for the so-called `IDE doublers' (made
+	  It also provides support for the so-called `IDE doublers' (made
 	  by various manufacturers, e.g. Eyetech) that can be connected to
 	  the on-board IDE interface of some Amiga models. Using such an IDE
 	  doubler, you can connect up to four instead of two IDE devices to
-	  the Amiga's on-board IDE interface.
+	  the Amiga's on-board IDE interface. The feature is enabled at kernel
+	  runtime using the "gayle.doubler" kernel boot parameter.
 
-	  Note that the normal Amiga Gayle IDE driver may not work correctly
-	  if you have an IDE doubler and don't enable this feature!
+	  Say Y if you have an Amiga with a Gayle IDE interface and want to use
+	  IDE devices (hard disks, CD-ROM drives, etc.) that are connected to
+	  it.
 
-	  Say Y if you have an IDE doubler.  The feature is enabled at kernel
-	  runtime using the "gayle.doubler" kernel boot parameter.
+	  Note that you also have to enable Zorro bus support if you want to
+	  use Gayle IDE interfaces on the Zorro expansion bus.
 
 config BLK_DEV_BUDDHA
 	tristate "Buddha/Catweasel/X-Surf IDE interface support (EXPERIMENTAL)"
diff --git a/drivers/ide/gayle.c b/drivers/ide/gayle.c
index dc778251cb0..c7119516c5a 100644
--- a/drivers/ide/gayle.c
+++ b/drivers/ide/gayle.c
@@ -53,11 +53,6 @@
 
 #define GAYLE_NEXT_PORT	0x1000
 
-#ifndef CONFIG_BLK_DEV_IDEDOUBLER
-#define GAYLE_NUM_HWIFS		1
-#define GAYLE_NUM_PROBE_HWIFS	GAYLE_NUM_HWIFS
-#define GAYLE_HAS_CONTROL_REG	1
-#else /* CONFIG_BLK_DEV_IDEDOUBLER */
 #define GAYLE_NUM_HWIFS		2
 #define GAYLE_NUM_PROBE_HWIFS	(ide_doubler ? GAYLE_NUM_HWIFS : \
 					       GAYLE_NUM_HWIFS-1)
@@ -66,8 +61,6 @@
 static int ide_doubler;
 module_param_named(doubler, ide_doubler, bool, 0);
 MODULE_PARM_DESC(doubler, "enable support for IDE doublers");
-#endif /* CONFIG_BLK_DEV_IDEDOUBLER */
-
 
     /*
      *  Check and acknowledge the interrupt status
@@ -151,10 +144,7 @@ static int __init gayle_init(void)
 found:
 	printk(KERN_INFO "ide: Gayle IDE controller (A%d style%s)\n",
 			 a4000 ? 4000 : 1200,
-#ifdef CONFIG_BLK_DEV_IDEDOUBLER
-			 ide_doubler ? ", IDE doubler" :
-#endif
-			 "");
+			 ide_doubler ? ", IDE doubler" : "");
 
 	if (a4000) {
 	    phys_base = GAYLE_BASE_4000;
-- 
cgit v1.2.3


From d93bc4521c80e9d87767779814e88f6d725453d7 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:26 +0200
Subject: ide-{floppy,tape}: fix padding for PIO transfers

* Return number of bytes left to transfer from idetape_{in,out}put_buffers()
  and number of bytes done from ide_tape_io_buffers().

* Fix padding for PIO transfers in ide_pc_intr() so read/write buffers are
  always completely processed and then the transfer is padded if necessary.

* Remove invalid error messages.

* Remove now superfluous padding from ide{_io_buffers,tape_input_buffers}().

While at it:

* Set pc->bh to NULL in idetape_input_buffers() after all bh-s are done.

* Cache !!(pc->flags & PC_FLAG_WRITING) in local variable in ide_pc_intr().

Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c | 57 +++++++++++++++++--------------------------------
 drivers/ide/ide-tape.c  | 32 +++++++++++++--------------
 2 files changed, 35 insertions(+), 54 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 3f3fc7c7b2f..40f413b20bd 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -110,13 +110,6 @@ int ide_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 		}
 	}
 
-	if (bcount) {
-		printk(KERN_ERR "%s: %d leftover bytes, %s\n", drive->name,
-			bcount, write ? "padding with zeros"
-				      : "discarding data");
-		ide_pad_transfer(drive, write, bcount);
-	}
-
 	return done;
 }
 EXPORT_SYMBOL_GPL(ide_io_buffers);
@@ -330,9 +323,10 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 	struct request *rq = hwif->rq;
 	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
 	xfer_func_t *xferfunc;
-	unsigned int timeout, temp;
+	unsigned int timeout, done;
 	u16 bcount;
 	u8 stat, ireason, dsc = 0;
+	u8 write = !!(pc->flags & PC_FLAG_WRITING);
 
 	debug_log("Enter %s - interrupt handler\n", __func__);
 
@@ -441,8 +435,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		return ide_do_reset(drive);
 	}
 
-	if (((ireason & ATAPI_IO) == ATAPI_IO) ==
-		!!(pc->flags & PC_FLAG_WRITING)) {
+	if (((ireason & ATAPI_IO) == ATAPI_IO) == write) {
 		/* Hopefully, we will never get here */
 		printk(KERN_ERR "%s: We wanted to %s, but the device wants us "
 				"to %s!\n", drive->name,
@@ -451,45 +444,33 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		return ide_do_reset(drive);
 	}
 
-	if (!(pc->flags & PC_FLAG_WRITING)) {
-		/* Reading - Check that we have enough space */
-		temp = pc->xferred + bcount;
-		if (temp > pc->req_xfer) {
-			if (temp > pc->buf_size) {
-				printk(KERN_ERR "%s: The device wants to send "
-						"us more data than expected - "
-						"discarding data\n",
-						drive->name);
-
-				ide_pad_transfer(drive, 0, bcount);
-				goto next_irq;
-			}
-			debug_log("The device wants to send us more data than "
-				  "expected - allowing transfer\n");
-		}
-		xferfunc = tp_ops->input_data;
-	} else
-		xferfunc = tp_ops->output_data;
+	xferfunc = write ? tp_ops->output_data : tp_ops->input_data;
 
 	if ((drive->media == ide_floppy && !pc->buf) ||
 	    (drive->media == ide_tape && pc->bh)) {
-		int done = drive->pc_io_buffers(drive, pc, bcount,
-				  !!(pc->flags & PC_FLAG_WRITING));
+		done = drive->pc_io_buffers(drive, pc, bcount, write);
 
 		/* FIXME: don't do partial completions */
 		if (drive->media == ide_floppy)
 			ide_complete_rq(drive, 0,
 					done ? done : ide_rq_bytes(rq));
-	} else
-		xferfunc(drive, NULL, pc->cur_pos, bcount);
+	} else {
+		done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred);
+		xferfunc(drive, NULL, pc->cur_pos, done);
+	}
 
 	/* Update the current position */
-	pc->xferred += bcount;
-	pc->cur_pos += bcount;
+	pc->xferred += done;
+	pc->cur_pos += done;
+
+	bcount -= done;
+
+	if (bcount)
+		ide_pad_transfer(drive, write, bcount);
+
+	debug_log("[cmd %x] transferred %d bytes, padded %d bytes\n",
+		  rq->cmd[0], done, bcount);
 
-	debug_log("[cmd %x] transferred %d bytes on that intr.\n",
-		  rq->cmd[0], bcount);
-next_irq:
 	/* And set the interrupt handler again */
 	ide_set_handler(drive, ide_pc_intr, timeout);
 	return ide_started;
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index cafc67d9e2e..cb942a9b580 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -297,19 +297,15 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i)
 	return tape;
 }
 
-static void idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
+static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 				  unsigned int bcount)
 {
 	struct idetape_bh *bh = pc->bh;
 	int count;
 
 	while (bcount) {
-		if (bh == NULL) {
-			printk(KERN_ERR "ide-tape: bh == NULL in "
-				"idetape_input_buffers\n");
-			ide_pad_transfer(drive, 0, bcount);
-			return;
-		}
+		if (bh == NULL)
+			break;
 		count = min(
 			(unsigned int)(bh->b_size - atomic_read(&bh->b_count)),
 			bcount);
@@ -323,21 +319,21 @@ static void idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 				atomic_set(&bh->b_count, 0);
 		}
 	}
+
 	pc->bh = bh;
+
+	return bcount;
 }
 
-static void idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
+static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 				   unsigned int bcount)
 {
 	struct idetape_bh *bh = pc->bh;
 	int count;
 
 	while (bcount) {
-		if (bh == NULL) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return;
-		}
+		if (bh == NULL)
+			break;
 		count = min((unsigned int)pc->b_count, (unsigned int)bcount);
 		drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count);
 		bcount -= count;
@@ -352,6 +348,8 @@ static void idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 			}
 		}
 	}
+
+	return bcount;
 }
 
 static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc)
@@ -563,12 +561,14 @@ static void ide_tape_handle_dsc(ide_drive_t *drive)
 static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 				unsigned int bcount, int write)
 {
+	unsigned int bleft;
+
 	if (write)
-		idetape_output_buffers(drive, pc, bcount);
+		bleft = idetape_output_buffers(drive, pc, bcount);
 	else
-		idetape_input_buffers(drive, pc, bcount);
+		bleft = idetape_input_buffers(drive, pc, bcount);
 
-	return bcount;
+	return bcount - bleft;
 }
 
 /*
-- 
cgit v1.2.3


From 349d12a1fe57d49287a539909cf14f362634342d Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:26 +0200
Subject: ide-floppy: use ide_pio_bytes()

* Fix ide_init_sg_cmd() setup for non-fs requests.

* Convert ide_pc_intr() to use ide_pio_bytes() for floppy media.

* Remove no longer needed ide_io_buffers() and sg/sg_cnt fields
  from struct ide_atapi_pc.

* Remove partial completions; kill idefloppy_update_buffers(), as a
  result.

* Add some more debugging statements.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c  | 68 ++++++++++++------------------------------------
 drivers/ide/ide-floppy.c | 24 ++++-------------
 include/linux/ide.h      |  5 ----
 3 files changed, 21 insertions(+), 76 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 40f413b20bd..100e6f94b4f 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -71,49 +71,6 @@ int ide_check_atapi_device(ide_drive_t *drive, const char *s)
 }
 EXPORT_SYMBOL_GPL(ide_check_atapi_device);
 
-/* PIO data transfer routine using the scatter gather table. */
-int ide_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-		    unsigned int bcount, int write)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	xfer_func_t *xf = write ? tp_ops->output_data : tp_ops->input_data;
-	struct scatterlist *sg = pc->sg;
-	char *buf;
-	int count, done = 0;
-
-	while (bcount) {
-		count = min(sg->length - pc->b_count, bcount);
-
-		if (PageHighMem(sg_page(sg))) {
-			unsigned long flags;
-
-			local_irq_save(flags);
-			buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
-			xf(drive, NULL, buf + pc->b_count, count);
-			kunmap_atomic(buf - sg->offset, KM_IRQ0);
-			local_irq_restore(flags);
-		} else {
-			buf = sg_virt(sg);
-			xf(drive, NULL, buf + pc->b_count, count);
-		}
-
-		bcount -= count;
-		pc->b_count += count;
-		done += count;
-
-		if (pc->b_count == sg->length) {
-			if (!--pc->sg_cnt)
-				break;
-			pc->sg = sg = sg_next(sg);
-			pc->b_count = 0;
-		}
-	}
-
-	return done;
-}
-EXPORT_SYMBOL_GPL(ide_io_buffers);
-
 void ide_init_pc(struct ide_atapi_pc *pc)
 {
 	memset(pc, 0, sizeof(*pc));
@@ -353,6 +310,9 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			pc->xferred = pc->req_xfer;
 			if (drive->pc_update_buffers)
 				drive->pc_update_buffers(drive, pc);
+
+			if (drive->media == ide_floppy)
+				ide_complete_rq(drive, 0, blk_rq_bytes(rq));
 		}
 		debug_log("%s: DMA finished\n", drive->name);
 	}
@@ -408,12 +368,19 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			rq->errors = 0;
 			ide_complete_rq(drive, 0, blk_rq_bytes(rq));
 		} else {
+			unsigned int done;
+
 			if (blk_fs_request(rq) == 0 && uptodate <= 0) {
 				if (rq->errors == 0)
 					rq->errors = -EIO;
 			}
-			ide_complete_rq(drive, uptodate ? 0 : -EIO,
-					ide_rq_bytes(rq));
+
+			if (drive->media == ide_tape)
+				done = ide_rq_bytes(rq); /* FIXME */
+			else
+				done = blk_rq_bytes(rq);
+
+			ide_complete_rq(drive, uptodate ? 0 : -EIO, done);
 		}
 
 		return ide_stopped;
@@ -446,14 +413,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 
 	xferfunc = write ? tp_ops->output_data : tp_ops->input_data;
 
-	if ((drive->media == ide_floppy && !pc->buf) ||
-	    (drive->media == ide_tape && pc->bh)) {
+	if (drive->media == ide_floppy && pc->buf == NULL) {
+		done = min_t(unsigned int, bcount, cmd->nleft);
+		ide_pio_bytes(drive, cmd, write, done);
+	} else if (drive->media == ide_tape && pc->bh) {
 		done = drive->pc_io_buffers(drive, pc, bcount, write);
-
-		/* FIXME: don't do partial completions */
-		if (drive->media == ide_floppy)
-			ide_complete_rq(drive, 0,
-					done ? done : ide_rq_bytes(rq));
 	} else {
 		done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred);
 		xferfunc(drive, NULL, pc->cur_pos, done);
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 0faae309829..2b4868d95f8 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -61,16 +61,6 @@
  */
 #define IDEFLOPPY_PC_DELAY	(HZ/20)	/* default delay for ZIP 100 (50ms) */
 
-static void idefloppy_update_buffers(ide_drive_t *drive,
-				struct ide_atapi_pc *pc)
-{
-	struct request *rq = pc->rq;
-	struct bio *bio = rq->bio;
-
-	while ((bio = rq->bio) != NULL)
-		ide_complete_rq(drive, 0, ide_rq_bytes(rq));
-}
-
 static int ide_floppy_callback(ide_drive_t *drive, int dsc)
 {
 	struct ide_disk_obj *floppy = drive->driver_data;
@@ -213,7 +203,6 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive,
 	memcpy(rq->cmd, pc->c, 12);
 
 	pc->rq = rq;
-	pc->b_count = 0;
 	if (rq->cmd_flags & REQ_RW)
 		pc->flags |= PC_FLAG_WRITING;
 	pc->buf = NULL;
@@ -227,7 +216,6 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy,
 	ide_init_pc(pc);
 	memcpy(pc->c, rq->cmd, sizeof(pc->c));
 	pc->rq = rq;
-	pc->b_count = 0;
 	if (rq->data_len && rq_data_dir(rq) == WRITE)
 		pc->flags |= PC_FLAG_WRITING;
 	pc->buf = rq->data;
@@ -244,10 +232,11 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 					     struct request *rq, sector_t block)
 {
 	struct ide_disk_obj *floppy = drive->driver_data;
-	ide_hwif_t *hwif = drive->hwif;
 	struct ide_cmd cmd;
 	struct ide_atapi_pc *pc;
 
+	ide_debug_log(IDE_DBG_FUNC, "enter, cmd: 0x%x\n", rq->cmd[0]);
+
 	if (drive->debug_mask & IDE_DBG_RQ)
 		blk_dump_rq_flags(rq, (rq->rq_disk
 					? rq->rq_disk->disk_name
@@ -294,13 +283,10 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 	cmd.rq = rq;
 
 	if (blk_fs_request(rq) || pc->req_xfer) {
-		ide_init_sg_cmd(&cmd, rq->nr_sectors << 9);
+		ide_init_sg_cmd(&cmd, pc->req_xfer);
 		ide_map_sg(drive, &cmd);
 	}
 
-	pc->sg = hwif->sg_table;
-	pc->sg_cnt = cmd.sg_nents;
-
 	pc->rq = rq;
 
 	return ide_floppy_issue_pc(drive, &cmd, pc);
@@ -388,6 +374,8 @@ static int ide_floppy_get_capacity(ide_drive_t *drive)
 	u8 pc_buf[256], header_len, desc_cnt;
 	int i, rc = 1, blocks, length;
 
+	ide_debug_log(IDE_DBG_FUNC, "enter");
+
 	drive->bios_cyl = 0;
 	drive->bios_head = drive->bios_sect = 0;
 	floppy->blocks = 0;
@@ -488,8 +476,6 @@ static void ide_floppy_setup(ide_drive_t *drive)
 	u16 *id = drive->id;
 
 	drive->pc_callback	 = ide_floppy_callback;
-	drive->pc_update_buffers = idefloppy_update_buffers;
-	drive->pc_io_buffers	 = ide_io_buffers;
 
 	/*
 	 * We used to check revisions here. At this point however I'm giving up.
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 0f48fbd4602..836c4c6cb7e 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -415,9 +415,6 @@ struct ide_atapi_pc {
 	struct idetape_bh *bh;
 	char *b_data;
 
-	struct scatterlist *sg;
-	unsigned int sg_cnt;
-
 	unsigned long timeout;
 };
 
@@ -1177,8 +1174,6 @@ void ide_tf_read(ide_drive_t *, struct ide_cmd *);
 void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int);
 void ide_output_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int);
 
-int ide_io_buffers(ide_drive_t *, struct ide_atapi_pc *, unsigned int, int);
-
 extern void SELECT_DRIVE(ide_drive_t *);
 void SELECT_MASK(ide_drive_t *, int);
 
-- 
cgit v1.2.3


From 985232e388714d4a9e94b4d96ee69b6ff8c9dc31 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:27 +0200
Subject: au1xxx-ide: auide_{in|out}sw() should be static

Make auide_{insw|outsw}() 'static' and mark them 'inline' as there's only one
call site for each: in the driver's {in|out}put_data() methods respectively...

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/au1xxx-ide.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c
index 549bbb2755a..1bfb43d0d3a 100644
--- a/drivers/ide/au1xxx-ide.c
+++ b/drivers/ide/au1xxx-ide.c
@@ -50,7 +50,7 @@ static _auide_hwif auide_hwif;
 
 #if defined(CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA)
 
-void auide_insw(unsigned long port, void *addr, u32 count)
+static inline void auide_insw(unsigned long port, void *addr, u32 count)
 {
 	_auide_hwif *ahwif = &auide_hwif;
 	chan_tab_t *ctp;
@@ -68,7 +68,7 @@ void auide_insw(unsigned long port, void *addr, u32 count)
 	ctp->cur_ptr = au1xxx_ddma_get_nextptr_virt(dp);
 }
 
-void auide_outsw(unsigned long port, void *addr, u32 count)
+static inline void auide_outsw(unsigned long port, void *addr, u32 count)
 {
 	_auide_hwif *ahwif = &auide_hwif;
 	chan_tab_t *ctp;
-- 
cgit v1.2.3


From 8d64fcd9357798ad0d61f8877de13d5e1b1ab510 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:27 +0200
Subject: ide: identify data word 53 bit 1 doesn't cover words 62 and 63 (take
 3)

The IDE code assumed for years that the bit 1 of the identify data word 53 also
covers the validity of the SW/MW DMA information in words 62 and 63, but it has
always covered only words 64 thru 70, with words 62 and 63 being defined in the
original ATA spec, not in ATA-2...

This fix however should only concern *very* old hard disks and rather old CF
cards...

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/cs5530.c      |  3 +--
 drivers/ide/ide-dma-sff.c |  7 +++----
 drivers/ide/ide-dma.c     | 32 ++++++++++++++------------------
 drivers/ide/sc1200.c      |  3 +--
 4 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/drivers/ide/cs5530.c b/drivers/ide/cs5530.c
index 8e8b35a8990..40bf05eddf6 100644
--- a/drivers/ide/cs5530.c
+++ b/drivers/ide/cs5530.c
@@ -92,8 +92,7 @@ static u8 cs5530_udma_filter(ide_drive_t *drive)
 		if ((mateid[ATA_ID_FIELD_VALID] & 4) &&
 		    (mateid[ATA_ID_UDMA_MODES] & 7))
 			goto out;
-		if ((mateid[ATA_ID_FIELD_VALID] & 2) &&
-		    (mateid[ATA_ID_MWDMA_MODES] & 7))
+		if (mateid[ATA_ID_MWDMA_MODES] & 7)
 			mask = 0;
 	}
 out:
diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c
index f8c7d0cd2ff..16fc46edc32 100644
--- a/drivers/ide/ide-dma-sff.c
+++ b/drivers/ide/ide-dma-sff.c
@@ -38,10 +38,9 @@ int config_drive_for_dma(ide_drive_t *drive)
 	 * Enable DMA on any drive that has mode2 DMA
 	 * (multi or single) enabled
 	 */
-	if (id[ATA_ID_FIELD_VALID] & 2)	/* regular DMA */
-		if ((id[ATA_ID_MWDMA_MODES] & 0x404) == 0x404 ||
-		    (id[ATA_ID_SWDMA_MODES] & 0x404) == 0x404)
-			return 1;
+	if ((id[ATA_ID_MWDMA_MODES] & 0x404) == 0x404 ||
+	    (id[ATA_ID_SWDMA_MODES] & 0x404) == 0x404)
+		return 1;
 
 	/* Consult the list of known "good" drives */
 	if (ide_dma_good_drive(drive))
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index a5612eadc30..f9c91752f27 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -245,12 +245,11 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base, u8 req_mode)
 	case XFER_UDMA_0:
 		if ((id[ATA_ID_FIELD_VALID] & 4) == 0)
 			break;
-
+		mask = id[ATA_ID_UDMA_MODES];
 		if (port_ops && port_ops->udma_filter)
-			mask = port_ops->udma_filter(drive);
+			mask &= port_ops->udma_filter(drive);
 		else
-			mask = hwif->ultra_mask;
-		mask &= id[ATA_ID_UDMA_MODES];
+			mask &= hwif->ultra_mask;
 
 		/*
 		 * avoid false cable warning from eighty_ninty_three()
@@ -261,18 +260,15 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base, u8 req_mode)
 		}
 		break;
 	case XFER_MW_DMA_0:
-		if ((id[ATA_ID_FIELD_VALID] & 2) == 0)
-			break;
+		mask = id[ATA_ID_MWDMA_MODES];
 		if (port_ops && port_ops->mdma_filter)
-			mask = port_ops->mdma_filter(drive);
+			mask &= port_ops->mdma_filter(drive);
 		else
-			mask = hwif->mwdma_mask;
-		mask &= id[ATA_ID_MWDMA_MODES];
+			mask &= hwif->mwdma_mask;
 		break;
 	case XFER_SW_DMA_0:
-		if (id[ATA_ID_FIELD_VALID] & 2) {
-			mask = id[ATA_ID_SWDMA_MODES] & hwif->swdma_mask;
-		} else if (id[ATA_ID_OLD_DMA_MODES] >> 8) {
+		mask = id[ATA_ID_SWDMA_MODES];
+		if (!(mask & ATA_SWDMA2) && (id[ATA_ID_OLD_DMA_MODES] >> 8)) {
 			u8 mode = id[ATA_ID_OLD_DMA_MODES] >> 8;
 
 			/*
@@ -280,8 +276,9 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base, u8 req_mode)
 			 * (the maximum allowed mode is XFER_SW_DMA_2)
 			 */
 			if (mode <= 2)
-				mask = ((2 << mode) - 1) & hwif->swdma_mask;
+				mask = (2 << mode) - 1;
 		}
+		mask &= hwif->swdma_mask;
 		break;
 	default:
 		BUG();
@@ -398,11 +395,10 @@ int ide_id_dma_bug(ide_drive_t *drive)
 		if ((id[ATA_ID_UDMA_MODES] >> 8) &&
 		    (id[ATA_ID_MWDMA_MODES] >> 8))
 			goto err_out;
-	} else if (id[ATA_ID_FIELD_VALID] & 2) {
-		if ((id[ATA_ID_MWDMA_MODES] >> 8) &&
-		    (id[ATA_ID_SWDMA_MODES] >> 8))
-			goto err_out;
-	}
+	} else if ((id[ATA_ID_MWDMA_MODES] >> 8) &&
+		   (id[ATA_ID_SWDMA_MODES] >> 8))
+		goto err_out;
+
 	return 0;
 err_out:
 	printk(KERN_ERR "%s: bad DMA info in identify block\n", drive->name);
diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c
index 13e3988f00f..d467478d68d 100644
--- a/drivers/ide/sc1200.c
+++ b/drivers/ide/sc1200.c
@@ -115,8 +115,7 @@ static u8 sc1200_udma_filter(ide_drive_t *drive)
 		if ((mateid[ATA_ID_FIELD_VALID] & 4) &&
 		    (mateid[ATA_ID_UDMA_MODES] & 7))
 			goto out;
-		if ((mateid[ATA_ID_FIELD_VALID] & 2) &&
-		    (mateid[ATA_ID_MWDMA_MODES] & 7))
+		if (mateid[ATA_ID_MWDMA_MODES] & 7)
 			mask = 0;
 	}
 out:
-- 
cgit v1.2.3


From c4199930b119eb9c1ffb102ed57eaac4d4424d08 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:27 +0200
Subject: ide-iops: only clear DMA words on setting DMA mode

The bytes indicating current DMA mode in the identify data words 62, 63, and 88
should only change on setting a DMA mode, so stop clearing them  on setting PIO
mode in ide_config_drive_speed().  While at it, correct SW/MW DMA mode masks...

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-iops.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index 5403e4a44be..0e62698146b 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -386,9 +386,11 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed)
 		return error;
 	}
 
-	id[ATA_ID_UDMA_MODES]  &= ~0xFF00;
-	id[ATA_ID_MWDMA_MODES] &= ~0x0F00;
-	id[ATA_ID_SWDMA_MODES] &= ~0x0F00;
+	if (speed >= XFER_SW_DMA_0) {
+		id[ATA_ID_UDMA_MODES]  &= ~0xFF00;
+		id[ATA_ID_MWDMA_MODES] &= ~0x0700;
+		id[ATA_ID_SWDMA_MODES] &= ~0x0700;
+	}
 
  skip:
 #ifdef CONFIG_BLK_DEV_IDEDMA
-- 
cgit v1.2.3


From 74638c84821c066d02c158bc843c84499ddc9764 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:28 +0200
Subject: ide: add support for CFA specified transfer modes (take 3)

Add support for the CompactFlash specific PIO modes 5/6 and MWDMA modes 3/4.

Since there were no PIO5 capable hard drives produced and one would also need
66 MHz IDE clock to actually get the difference WRT the address setup timings
programmed, I decided to simply replace the old non-standard PIO mode 5 timings
with the CFA specified ones.

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: stf_xl@wp.pl
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-dma.c       |  8 ++++++++
 drivers/ide/ide-iops.c      | 12 +++++++++++-
 drivers/ide/ide-timings.c   | 12 ++++++++++--
 drivers/ide/ide-xfer-mode.c | 15 +++++++++------
 drivers/ide/sl82c105.c      |  3 ++-
 5 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index f9c91752f27..a0b8cab1d9a 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -261,6 +261,14 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base, u8 req_mode)
 		break;
 	case XFER_MW_DMA_0:
 		mask = id[ATA_ID_MWDMA_MODES];
+
+		/* Also look for the CF specific MWDMA modes... */
+		if (ata_id_is_cfa(id) && (id[ATA_ID_CFA_MODES] & 0x38)) {
+			u8 mode = ((id[ATA_ID_CFA_MODES] & 0x38) >> 3) - 1;
+
+			mask |= ((2 << mode) - 1) << 3;
+		}
+
 		if (port_ops && port_ops->mdma_filter)
 			mask &= port_ops->mdma_filter(drive);
 		else
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index 0e62698146b..0caca342802 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -306,6 +306,7 @@ int ide_driveid_update(ide_drive_t *drive)
 	drive->id[ATA_ID_UDMA_MODES]  = id[ATA_ID_UDMA_MODES];
 	drive->id[ATA_ID_MWDMA_MODES] = id[ATA_ID_MWDMA_MODES];
 	drive->id[ATA_ID_SWDMA_MODES] = id[ATA_ID_SWDMA_MODES];
+	drive->id[ATA_ID_CFA_MODES]   = id[ATA_ID_CFA_MODES];
 	/* anything more ? */
 
 	kfree(id);
@@ -390,7 +391,10 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed)
 		id[ATA_ID_UDMA_MODES]  &= ~0xFF00;
 		id[ATA_ID_MWDMA_MODES] &= ~0x0700;
 		id[ATA_ID_SWDMA_MODES] &= ~0x0700;
-	}
+		if (ata_id_is_cfa(id))
+			id[ATA_ID_CFA_MODES] &= ~0x0E00;
+	} else	if (ata_id_is_cfa(id))
+		id[ATA_ID_CFA_MODES] &= ~0x01C0;
 
  skip:
 #ifdef CONFIG_BLK_DEV_IDEDMA
@@ -403,12 +407,18 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed)
 	if (speed >= XFER_UDMA_0) {
 		i = 1 << (speed - XFER_UDMA_0);
 		id[ATA_ID_UDMA_MODES] |= (i << 8 | i);
+	} else if (ata_id_is_cfa(id) && speed >= XFER_MW_DMA_3) {
+		i = speed - XFER_MW_DMA_2;
+		id[ATA_ID_CFA_MODES] |= i << 9;
 	} else if (speed >= XFER_MW_DMA_0) {
 		i = 1 << (speed - XFER_MW_DMA_0);
 		id[ATA_ID_MWDMA_MODES] |= (i << 8 | i);
 	} else if (speed >= XFER_SW_DMA_0) {
 		i = 1 << (speed - XFER_SW_DMA_0);
 		id[ATA_ID_SWDMA_MODES] |= (i << 8 | i);
+	} else if (ata_id_is_cfa(id) && speed >= XFER_PIO_5) {
+		i = speed - XFER_PIO_4;
+		id[ATA_ID_CFA_MODES] |= i << 6;
 	}
 
 	if (!drive->init_speed)
diff --git a/drivers/ide/ide-timings.c b/drivers/ide/ide-timings.c
index 81f527af8fa..001a56365be 100644
--- a/drivers/ide/ide-timings.c
+++ b/drivers/ide/ide-timings.c
@@ -43,6 +43,8 @@ static struct ide_timing ide_timing[] = {
 	{ XFER_UDMA_1,     0,   0,   0,   0,   0,   0,   0,  80 },
 	{ XFER_UDMA_0,     0,   0,   0,   0,   0,   0,   0, 120 },
 
+	{ XFER_MW_DMA_4,  25,   0,   0,   0,  55,  20,  80,   0 },
+	{ XFER_MW_DMA_3,  25,   0,   0,   0,  65,  25, 100,   0 },
 	{ XFER_MW_DMA_2,  25,   0,   0,   0,  70,  25, 120,   0 },
 	{ XFER_MW_DMA_1,  45,   0,   0,   0,  80,  50, 150,   0 },
 	{ XFER_MW_DMA_0,  60,   0,   0,   0, 215, 215, 480,   0 },
@@ -51,7 +53,8 @@ static struct ide_timing ide_timing[] = {
 	{ XFER_SW_DMA_1,  90,   0,   0,   0, 240, 240, 480,   0 },
 	{ XFER_SW_DMA_0, 120,   0,   0,   0, 480, 480, 960,   0 },
 
-	{ XFER_PIO_5,     20,  50,  30, 100,  50,  30, 100,   0 },
+	{ XFER_PIO_6,     10,  55,  20,  80,  55,  20,  80,   0 },
+	{ XFER_PIO_5,     15,  65,  25, 100,  65,  25, 100,   0 },
 	{ XFER_PIO_4,     25,  70,  25, 120,  70,  25, 120,   0 },
 	{ XFER_PIO_3,     30,  80,  70, 180,  80,  70, 180,   0 },
 
@@ -90,6 +93,10 @@ u16 ide_pio_cycle_time(ide_drive_t *drive, u8 pio)
 		/* conservative "downgrade" for all pre-ATA2 drives */
 		if (pio < 3 && cycle < t->cycle)
 			cycle = 0; /* use standard timing */
+
+		/* Use the standard timing for the CF specific modes too */
+		if (pio > 4 && ata_id_is_cfa(id))
+			cycle = 0;
 	}
 
 	return cycle ? cycle : t->cycle;
@@ -161,7 +168,8 @@ int ide_timing_compute(ide_drive_t *drive, u8 speed,
 
 		if (speed <= XFER_PIO_2)
 			p.cycle = p.cyc8b = id[ATA_ID_EIDE_PIO];
-		else if (speed <= XFER_PIO_5)
+		else if ((speed <= XFER_PIO_4) ||
+			 (speed == XFER_PIO_5 && !ata_id_is_cfa(id)))
 			p.cycle = p.cyc8b = id[ATA_ID_EIDE_PIO_IORDY];
 		else if (speed >= XFER_MW_DMA_0 && speed <= XFER_MW_DMA_2)
 			p.cycle = id[ATA_ID_EIDE_DMA_MIN];
diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c
index 6910f6a257e..af44be9d546 100644
--- a/drivers/ide/ide-xfer-mode.c
+++ b/drivers/ide/ide-xfer-mode.c
@@ -9,11 +9,11 @@ static const char *udma_str[] =
 	 { "UDMA/16", "UDMA/25",  "UDMA/33",  "UDMA/44",
 	   "UDMA/66", "UDMA/100", "UDMA/133", "UDMA7" };
 static const char *mwdma_str[] =
-	{ "MWDMA0", "MWDMA1", "MWDMA2" };
+	{ "MWDMA0", "MWDMA1", "MWDMA2", "MWDMA3", "MWDMA4" };
 static const char *swdma_str[] =
 	{ "SWDMA0", "SWDMA1", "SWDMA2" };
 static const char *pio_str[] =
-	{ "PIO0", "PIO1", "PIO2", "PIO3", "PIO4", "PIO5" };
+	{ "PIO0", "PIO1", "PIO2", "PIO3", "PIO4", "PIO5", "PIO6" };
 
 /**
  *	ide_xfer_verbose	-	return IDE mode names
@@ -30,11 +30,11 @@ const char *ide_xfer_verbose(u8 mode)
 
 	if (mode >= XFER_UDMA_0 && mode <= XFER_UDMA_7)
 		s = udma_str[i];
-	else if (mode >= XFER_MW_DMA_0 && mode <= XFER_MW_DMA_2)
+	else if (mode >= XFER_MW_DMA_0 && mode <= XFER_MW_DMA_4)
 		s = mwdma_str[i];
 	else if (mode >= XFER_SW_DMA_0 && mode <= XFER_SW_DMA_2)
 		s = swdma_str[i];
-	else if (mode >= XFER_PIO_0 && mode <= XFER_PIO_5)
+	else if (mode >= XFER_PIO_0 && mode <= XFER_PIO_6)
 		s = pio_str[i & 0x7];
 	else if (mode == XFER_PIO_SLOW)
 		s = "PIO SLOW";
@@ -79,7 +79,10 @@ u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode)
 		}
 
 		if (id[ATA_ID_FIELD_VALID] & 2) {	      /* ATA2? */
-			if (ata_id_has_iordy(id)) {
+			if (ata_id_is_cfa(id) && (id[ATA_ID_CFA_MODES] & 7))
+				pio_mode = 4 + min_t(int, 2,
+						     id[ATA_ID_CFA_MODES] & 7);
+			else if (ata_id_has_iordy(id)) {
 				if (id[ATA_ID_PIO_MODES] & 7) {
 					overridden = 0;
 					if (id[ATA_ID_PIO_MODES] & 4)
@@ -239,7 +242,7 @@ int ide_set_xfer_rate(ide_drive_t *drive, u8 rate)
 
 	BUG_ON(rate < XFER_PIO_0);
 
-	if (rate >= XFER_PIO_0 && rate <= XFER_PIO_5)
+	if (rate >= XFER_PIO_0 && rate <= XFER_PIO_6)
 		return ide_set_pio_mode(drive, rate);
 
 	return ide_set_dma_mode(drive, rate);
diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c
index d6f8977191c..b0a46062533 100644
--- a/drivers/ide/sl82c105.c
+++ b/drivers/ide/sl82c105.c
@@ -61,7 +61,8 @@ static unsigned int get_pio_timings(ide_drive_t *drive, u8 pio)
 	if (cmd_off == 0)
 		cmd_off = 1;
 
-	if (pio > 2 || ata_id_has_iordy(drive->id))
+	if ((pio > 2 || ata_id_has_iordy(drive->id)) &&
+	    !(pio > 4 && ata_id_is_cfa(drive->id)))
 		iordy = 0x40;
 
 	return (cmd_on - 1) << 8 | (cmd_off - 1) | iordy;
-- 
cgit v1.2.3


From 47ab834854d4639fedf2ed2f21b41297f2abe1a7 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:29 +0200
Subject: ide-disk: use ATA_ERR

Make use of ATA_ERR instead of hard-coded value in idedisk_set_max_address()
and idedisk_read_native_max_address().

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-disk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index ca934c8a128..c998cf8e971 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -227,7 +227,7 @@ static u64 idedisk_read_native_max_address(ide_drive_t *drive, int lba48)
 	ide_no_data_taskfile(drive, &cmd);
 
 	/* if OK, compute maximum address value */
-	if ((tf->status & 0x01) == 0)
+	if (!(tf->status & ATA_ERR))
 		addr = ide_get_lba_addr(tf, lba48) + 1;
 
 	return addr;
@@ -267,7 +267,7 @@ static u64 idedisk_set_max_address(ide_drive_t *drive, u64 addr_req, int lba48)
 	ide_no_data_taskfile(drive, &cmd);
 
 	/* if OK, compute maximum address value */
-	if ((tf->status & 0x01) == 0)
+	if (!(tf->status & ATA_ERR))
 		addr_set = ide_get_lba_addr(tf, lba48) + 1;
 
 	return addr_set;
-- 
cgit v1.2.3


From 4d74c3fcf2b90487eacec511bc8c07177711c81c Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:29 +0200
Subject: ide: use ATA_HOB

Make use of ATA_HOB instead of hard-coded value in the tf_read() method.

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/at91_ide.c   | 4 ++--
 drivers/ide/ide-h8300.c  | 4 ++--
 drivers/ide/ide-io-std.c | 4 ++--
 drivers/ide/ns87415.c    | 4 ++--
 drivers/ide/scc_pata.c   | 4 ++--
 drivers/ide/tx4938ide.c  | 4 ++--
 drivers/ide/tx4939ide.c  | 4 ++--
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c
index 6bb45a2b24c..8fc6ae958b0 100644
--- a/drivers/ide/at91_ide.c
+++ b/drivers/ide/at91_ide.c
@@ -242,7 +242,7 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	}
 
 	/* be sure we're looking at the low order bits */
-	ide_mm_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr);
+	ide_mm_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
 		tf->feature = ide_mm_inb(io_ports->feature_addr);
@@ -258,7 +258,7 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 		tf->device = ide_mm_inb(io_ports->device_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
-		ide_mm_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr);
+		ide_mm_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
 			tf->hob_feature = ide_mm_inb(io_ports->feature_addr);
diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c
index ff8339ed59a..7492f28d129 100644
--- a/drivers/ide/ide-h8300.c
+++ b/drivers/ide/ide-h8300.c
@@ -98,7 +98,7 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	}
 
 	/* be sure we're looking at the low order bits */
-	outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr);
+	outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
 		tf->feature = inb(io_ports->feature_addr);
@@ -114,7 +114,7 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 		tf->device = inb(io_ports->device_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
-		outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr);
+		outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
 			tf->hob_feature = inb(io_ports->feature_addr);
diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
index 2d9c6dc3f95..3a867e49a0a 100644
--- a/drivers/ide/ide-io-std.c
+++ b/drivers/ide/ide-io-std.c
@@ -166,7 +166,7 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	}
 
 	/* be sure we're looking at the low order bits */
-	tf_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr);
+	tf_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
 		tf->feature = tf_inb(io_ports->feature_addr);
@@ -182,7 +182,7 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 		tf->device = tf_inb(io_ports->device_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
-		tf_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr);
+		tf_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
 			tf->hob_feature = tf_inb(io_ports->feature_addr);
diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
index cbc1d111038..13a9e00efa1 100644
--- a/drivers/ide/ns87415.c
+++ b/drivers/ide/ns87415.c
@@ -74,7 +74,7 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	}
 
 	/* be sure we're looking at the low order bits */
-	outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr);
+	outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
 		tf->feature = inb(io_ports->feature_addr);
@@ -90,7 +90,7 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 		tf->device = superio_ide_inb(io_ports->device_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
-		outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr);
+		outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
 			tf->hob_feature = inb(io_ports->feature_addr);
diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c
index 542419ffa9b..6e47eac1cd7 100644
--- a/drivers/ide/scc_pata.c
+++ b/drivers/ide/scc_pata.c
@@ -709,7 +709,7 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	}
 
 	/* be sure we're looking at the low order bits */
-	scc_ide_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr);
+	scc_ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
 		tf->feature = scc_ide_inb(io_ports->feature_addr);
@@ -725,7 +725,7 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 		tf->device = scc_ide_inb(io_ports->device_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
-		scc_ide_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr);
+		scc_ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
 			tf->hob_feature = scc_ide_inb(io_ports->feature_addr);
diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c
index 657a61890b1..1c4a78ac1a2 100644
--- a/drivers/ide/tx4938ide.c
+++ b/drivers/ide/tx4938ide.c
@@ -142,7 +142,7 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	}
 
 	/* be sure we're looking at the low order bits */
-	tx4938ide_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr);
+	tx4938ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
 		tf->feature = tx4938ide_inb(io_ports->feature_addr);
@@ -158,7 +158,7 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 		tf->device = tx4938ide_inb(io_ports->device_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
-		tx4938ide_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr);
+		tx4938ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
 			tf->hob_feature =
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index faf0d97b9ce..77aee5b2ce9 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -509,7 +509,7 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	}
 
 	/* be sure we're looking at the low order bits */
-	tx4939ide_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr);
+	tx4939ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
 		tf->feature = tx4939ide_inb(io_ports->feature_addr);
@@ -525,7 +525,7 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 		tf->device = tx4939ide_inb(io_ports->device_addr);
 
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
-		tx4939ide_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr);
+		tx4939ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
 			tf->hob_feature =
-- 
cgit v1.2.3


From ecf3a31d2a08a419bdf919456f1724f5b72bde2c Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:30 +0200
Subject: ide: turn set_irq() method into write_devctl() method

Turn set_irq() method with its software reset hack into write_devctl() method
(for just writing a value into the device control register) at last...

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/at91_ide.c     |  2 +-
 drivers/ide/au1xxx-ide.c   |  3 +--
 drivers/ide/falconide.c    |  3 +--
 drivers/ide/ide-eh.c       |  7 +++----
 drivers/ide/ide-h8300.c    |  3 +--
 drivers/ide/ide-io-std.c   | 16 +++-------------
 drivers/ide/ide-io.c       |  4 +++-
 drivers/ide/ide-iops.c     |  4 ++--
 drivers/ide/ide-pm.c       |  2 +-
 drivers/ide/ide-probe.c    |  6 +++---
 drivers/ide/ide-taskfile.c |  2 +-
 drivers/ide/ns87415.c      |  3 +--
 drivers/ide/pmac.c         | 14 ++------------
 drivers/ide/q40ide.c       |  3 +--
 drivers/ide/scc_pata.c     | 14 ++------------
 drivers/ide/sgiioc4.c      |  3 +--
 drivers/ide/tx4938ide.c    |  3 +--
 drivers/ide/tx4939ide.c    |  6 ++----
 include/linux/ide.h        |  6 ++----
 19 files changed, 32 insertions(+), 72 deletions(-)

diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c
index 8fc6ae958b0..e6e96743aa7 100644
--- a/drivers/ide/at91_ide.c
+++ b/drivers/ide/at91_ide.c
@@ -295,7 +295,7 @@ static const struct ide_tp_ops at91_ide_tp_ops = {
 	.exec_command	= ide_exec_command,
 	.read_status	= ide_read_status,
 	.read_altstatus	= ide_read_altstatus,
-	.set_irq	= ide_set_irq,
+	.write_devctl	= ide_write_devctl,
 
 	.tf_load	= at91_ide_tf_load,
 	.tf_read	= at91_ide_tf_read,
diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c
index 1bfb43d0d3a..2ca10d533da 100644
--- a/drivers/ide/au1xxx-ide.c
+++ b/drivers/ide/au1xxx-ide.c
@@ -467,8 +467,7 @@ static const struct ide_tp_ops au1xxx_tp_ops = {
 	.exec_command		= ide_exec_command,
 	.read_status		= ide_read_status,
 	.read_altstatus		= ide_read_altstatus,
-
-	.set_irq		= ide_set_irq,
+	.write_devctl		= ide_write_devctl,
 
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c
index b368a5effc3..5063be85dc3 100644
--- a/drivers/ide/falconide.c
+++ b/drivers/ide/falconide.c
@@ -89,8 +89,7 @@ static const struct ide_tp_ops falconide_tp_ops = {
 	.exec_command		= ide_exec_command,
 	.read_status		= ide_read_status,
 	.read_altstatus		= ide_read_altstatus,
-
-	.set_irq		= ide_set_irq,
+	.write_devctl		= ide_write_devctl,
 
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index 11664976eea..de4b7f1c9c9 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -401,15 +401,14 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi)
 	 * immediate interrupt due to the edge transition it produces.
 	 * This single interrupt gives us a "fast poll" for drives that
 	 * recover from reset very quickly, saving us the first 50ms wait time.
-	 *
-	 * TODO: add ->softreset method and stop abusing ->set_irq
 	 */
 	/* set SRST and nIEN */
-	tp_ops->set_irq(hwif, 4);
+	tp_ops->write_devctl(hwif, ATA_SRST | ATA_NIEN | ATA_DEVCTL_OBS);
 	/* more than enough time */
 	udelay(10);
 	/* clear SRST, leave nIEN (unless device is on the quirk list) */
-	tp_ops->set_irq(hwif, drive->quirk_list == 2);
+	tp_ops->write_devctl(hwif, (drive->quirk_list == 2 ? 0 : ATA_NIEN) |
+			     ATA_DEVCTL_OBS);
 	/* more than enough time */
 	udelay(10);
 	hwif->poll_timeout = jiffies + WAIT_WORSTCASE;
diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c
index 7492f28d129..a57ccad61ac 100644
--- a/drivers/ide/ide-h8300.c
+++ b/drivers/ide/ide-h8300.c
@@ -159,8 +159,7 @@ static const struct ide_tp_ops h8300_tp_ops = {
 	.exec_command		= ide_exec_command,
 	.read_status		= ide_read_status,
 	.read_altstatus		= ide_read_altstatus,
-
-	.set_irq		= ide_set_irq,
+	.write_devctl		= ide_write_devctl,
 
 	.tf_load		= h8300_tf_load,
 	.tf_read		= h8300_tf_read,
diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
index 3a867e49a0a..bbeedce6b17 100644
--- a/drivers/ide/ide-io-std.c
+++ b/drivers/ide/ide-io-std.c
@@ -64,23 +64,14 @@ u8 ide_read_altstatus(ide_hwif_t *hwif)
 }
 EXPORT_SYMBOL_GPL(ide_read_altstatus);
 
-void ide_set_irq(ide_hwif_t *hwif, int on)
+void ide_write_devctl(ide_hwif_t *hwif, u8 ctl)
 {
-	u8 ctl = ATA_DEVCTL_OBS;
-
-	if (on == 4) { /* hack for SRST */
-		ctl |= 4;
-		on &= ~4;
-	}
-
-	ctl |= on ? 0 : 2;
-
 	if (hwif->host_flags & IDE_HFLAG_MMIO)
 		writeb(ctl, (void __iomem *)hwif->io_ports.ctl_addr);
 	else
 		outb(ctl, hwif->io_ports.ctl_addr);
 }
-EXPORT_SYMBOL_GPL(ide_set_irq);
+EXPORT_SYMBOL_GPL(ide_write_devctl);
 
 void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 {
@@ -312,8 +303,7 @@ const struct ide_tp_ops default_tp_ops = {
 	.exec_command		= ide_exec_command,
 	.read_status		= ide_read_status,
 	.read_altstatus		= ide_read_altstatus,
-
-	.set_irq		= ide_set_irq,
+	.write_devctl		= ide_write_devctl,
 
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 3c52317d852..5b57905a7d7 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -494,7 +494,9 @@ repeat:
 			 * quirk_list may not like intr setups/cleanups
 			 */
 			if (prev_port && prev_port->cur_dev->quirk_list == 0)
-				prev_port->tp_ops->set_irq(prev_port, 0);
+				prev_port->tp_ops->write_devctl(prev_port,
+								ATA_NIEN |
+								ATA_DEVCTL_OBS);
 
 			hwif->host->cur_port = hwif;
 		}
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index 0caca342802..ae227dd8466 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -360,7 +360,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed)
 	SELECT_DRIVE(drive);
 	SELECT_MASK(drive, 1);
 	udelay(1);
-	tp_ops->set_irq(hwif, 0);
+	tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS);
 
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT;
@@ -372,7 +372,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed)
 	tp_ops->exec_command(hwif, ATA_CMD_SET_FEATURES);
 
 	if (drive->quirk_list == 2)
-		tp_ops->set_irq(hwif, 1);
+		tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
 
 	error = __ide_wait_stat(drive, drive->ready_stat,
 				ATA_BUSY | ATA_DRQ | ATA_ERR,
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index ebf2d21ebdc..20553d4c42a 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -233,7 +233,7 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 		if (rc)
 			printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name);
 		SELECT_DRIVE(drive);
-		hwif->tp_ops->set_irq(hwif, 1);
+		hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
 		rc = ide_wait_not_busy(hwif, 100000);
 		if (rc)
 			printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name);
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 7c1f1bf8183..d240f76b0da 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -260,7 +260,7 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id)
 	 * during the identify phase that the IRQ handler isn't expecting.
 	 */
 	if (io_ports->ctl_addr)
-		tp_ops->set_irq(hwif, 0);
+		tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS);
 
 	/* take a deep breath */
 	msleep(50);
@@ -628,7 +628,7 @@ static int ide_port_wait_ready(ide_hwif_t *hwif)
 		if ((drive->dev_flags & IDE_DFLAG_NOPROBE) == 0 ||
 		    (drive->dev_flags & IDE_DFLAG_PRESENT)) {
 			SELECT_DRIVE(drive);
-			hwif->tp_ops->set_irq(hwif, 1);
+			hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
 			mdelay(2);
 			rc = ide_wait_not_busy(hwif, 35000);
 			if (rc)
@@ -845,7 +845,7 @@ static int init_irq (ide_hwif_t *hwif)
 		irq_handler = ide_intr;
 
 	if (io_ports->ctl_addr)
-		hwif->tp_ops->set_irq(hwif, 1);
+		hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
 
 	if (request_irq(hwif->irq, irq_handler, sa, hwif->name, hwif))
 		goto out_up;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index dba68db629b..47f13cd1103 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -80,7 +80,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd)
 
 	if ((cmd->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) {
 		ide_tf_dump(drive->name, tf);
-		tp_ops->set_irq(hwif, 1);
+		tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
 		SELECT_MASK(drive, 0);
 		tp_ops->tf_load(drive, cmd);
 	}
diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
index 13a9e00efa1..00ab0be7335 100644
--- a/drivers/ide/ns87415.c
+++ b/drivers/ide/ns87415.c
@@ -109,8 +109,7 @@ static const struct ide_tp_ops superio_tp_ops = {
 	.exec_command		= ide_exec_command,
 	.read_status		= superio_read_status,
 	.read_altstatus		= ide_read_altstatus,
-
-	.set_irq		= ide_set_irq,
+	.write_devctl		= ide_write_devctl,
 
 	.tf_load		= ide_tf_load,
 	.tf_read		= superio_tf_read,
diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c
index 879c3d8d9f3..7aa45ea37ee 100644
--- a/drivers/ide/pmac.c
+++ b/drivers/ide/pmac.c
@@ -476,17 +476,8 @@ static void pmac_exec_command(ide_hwif_t *hwif, u8 cmd)
 				     + IDE_TIMING_CONFIG));
 }
 
-static void pmac_set_irq(ide_hwif_t *hwif, int on)
+static void pmac_write_devctl(ide_hwif_t *hwif, u8 ctl)
 {
-	u8 ctl = ATA_DEVCTL_OBS;
-
-	if (on == 4) { /* hack for SRST */
-		ctl |= 4;
-		on &= ~4;
-	}
-
-	ctl |= on ? 0 : 2;
-
 	writeb(ctl, (void __iomem *)hwif->io_ports.ctl_addr);
 	(void)readl((void __iomem *)(hwif->io_ports.data_addr
 				     + IDE_TIMING_CONFIG));
@@ -954,8 +945,7 @@ static const struct ide_tp_ops pmac_tp_ops = {
 	.exec_command		= pmac_exec_command,
 	.read_status		= ide_read_status,
 	.read_altstatus		= ide_read_altstatus,
-
-	.set_irq		= pmac_set_irq,
+	.write_devctl		= pmac_write_devctl,
 
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c
index 2a43a2f4963..7fddfd34fcc 100644
--- a/drivers/ide/q40ide.c
+++ b/drivers/ide/q40ide.c
@@ -99,8 +99,7 @@ static const struct ide_tp_ops q40ide_tp_ops = {
 	.exec_command		= ide_exec_command,
 	.read_status		= ide_read_status,
 	.read_altstatus		= ide_read_altstatus,
-
-	.set_irq		= ide_set_irq,
+	.write_devctl		= ide_write_devctl,
 
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c
index 6e47eac1cd7..6ba4983d831 100644
--- a/drivers/ide/scc_pata.c
+++ b/drivers/ide/scc_pata.c
@@ -148,17 +148,8 @@ static u8 scc_dma_sff_read_status(ide_hwif_t *hwif)
 	return (u8)in_be32((void *)(hwif->dma_base + 4));
 }
 
-static void scc_set_irq(ide_hwif_t *hwif, int on)
+static void scc_write_devctl(ide_hwif_t *hwif, u8 ctl)
 {
-	u8 ctl = ATA_DEVCTL_OBS;
-
-	if (on == 4) { /* hack for SRST */
-		ctl |= 4;
-		on &= ~4;
-	}
-
-	ctl |= on ? 0 : 2;
-
 	out_be32((void *)hwif->io_ports.ctl_addr, ctl);
 	eieio();
 	in_be32((void *)(hwif->dma_base + 0x01c));
@@ -843,8 +834,7 @@ static const struct ide_tp_ops scc_tp_ops = {
 	.exec_command		= scc_exec_command,
 	.read_status		= scc_read_status,
 	.read_altstatus		= scc_read_altstatus,
-
-	.set_irq		= scc_set_irq,
+	.write_devctl		= scc_write_devctl,
 
 	.tf_load		= scc_tf_load,
 	.tf_read		= scc_tf_read,
diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c
index 6ef5a567d37..58980fcafc3 100644
--- a/drivers/ide/sgiioc4.c
+++ b/drivers/ide/sgiioc4.c
@@ -503,8 +503,7 @@ static const struct ide_tp_ops sgiioc4_tp_ops = {
 	.exec_command		= ide_exec_command,
 	.read_status		= sgiioc4_read_status,
 	.read_altstatus		= ide_read_altstatus,
-
-	.set_irq		= ide_set_irq,
+	.write_devctl		= ide_write_devctl,
 
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c
index 1c4a78ac1a2..ec3aa32fbbe 100644
--- a/drivers/ide/tx4938ide.c
+++ b/drivers/ide/tx4938ide.c
@@ -204,8 +204,7 @@ static const struct ide_tp_ops tx4938ide_tp_ops = {
 	.exec_command		= ide_exec_command,
 	.read_status		= ide_read_status,
 	.read_altstatus		= ide_read_altstatus,
-
-	.set_irq		= ide_set_irq,
+	.write_devctl		= ide_write_devctl,
 
 	.tf_load		= tx4938ide_tf_load,
 	.tf_read		= tx4938ide_tf_read,
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index 77aee5b2ce9..43bc0372413 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -571,8 +571,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = {
 	.exec_command		= ide_exec_command,
 	.read_status		= ide_read_status,
 	.read_altstatus		= ide_read_altstatus,
-
-	.set_irq		= ide_set_irq,
+	.write_devctl		= ide_write_devctl,
 
 	.tf_load		= tx4939ide_tf_load,
 	.tf_read		= tx4939ide_tf_read,
@@ -595,8 +594,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = {
 	.exec_command		= ide_exec_command,
 	.read_status		= ide_read_status,
 	.read_altstatus		= ide_read_altstatus,
-
-	.set_irq		= ide_set_irq,
+	.write_devctl		= ide_write_devctl,
 
 	.tf_load		= tx4939ide_tf_load,
 	.tf_read		= ide_tf_read,
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 836c4c6cb7e..ccb70abe991 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -655,8 +655,7 @@ struct ide_tp_ops {
 	void	(*exec_command)(struct hwif_s *, u8);
 	u8	(*read_status)(struct hwif_s *);
 	u8	(*read_altstatus)(struct hwif_s *);
-
-	void	(*set_irq)(struct hwif_s *, int);
+	void	(*write_devctl)(struct hwif_s *, u8);
 
 	void	(*tf_load)(ide_drive_t *, struct ide_cmd *);
 	void	(*tf_read)(ide_drive_t *, struct ide_cmd *);
@@ -1165,8 +1164,7 @@ void ide_tf_dump(const char *, struct ide_taskfile *);
 void ide_exec_command(ide_hwif_t *, u8);
 u8 ide_read_status(ide_hwif_t *);
 u8 ide_read_altstatus(ide_hwif_t *);
-
-void ide_set_irq(ide_hwif_t *, int);
+void ide_write_devctl(ide_hwif_t *, u8);
 
 void ide_tf_load(ide_drive_t *, struct ide_cmd *);
 void ide_tf_read(ide_drive_t *, struct ide_cmd *);
-- 
cgit v1.2.3


From 6762511934e6e7287ce3c8baac0d52ef64e3787b Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:30 +0200
Subject: ide: rename IDE_TFLAG_IN_[HOB_]FEATURE

The feature register has never been readable -- when its location is read, one
gets the error register value; hence rename IDE_TFLAG_IN_[HOB_]FEATURE into
IDE_TFLAG_IN_[HOB_]ERROR and introduce the 'hob_error' field into the 'struct
ide_taskfile' (despite the error register not really depending on the HOB bit).

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/at91_ide.c   | 16 ++++++++--------
 drivers/ide/ide-h8300.c  | 16 ++++++++--------
 drivers/ide/ide-io-std.c | 16 ++++++++--------
 drivers/ide/ide-iops.c   |  2 +-
 drivers/ide/ns87415.c    | 16 ++++++++--------
 drivers/ide/scc_pata.c   | 16 ++++++++--------
 drivers/ide/tx4938ide.c  | 17 ++++++++---------
 drivers/ide/tx4939ide.c  | 17 ++++++++---------
 include/linux/ide.h      | 12 ++++++++----
 9 files changed, 65 insertions(+), 63 deletions(-)

diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c
index e6e96743aa7..9dce793d93b 100644
--- a/drivers/ide/at91_ide.c
+++ b/drivers/ide/at91_ide.c
@@ -244,8 +244,8 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	/* be sure we're looking at the low order bits */
 	ide_mm_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
-		tf->feature = ide_mm_inb(io_ports->feature_addr);
+	if (cmd->tf_flags & IDE_TFLAG_IN_ERROR)
+		tf->error  = ide_mm_inb(io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_NSECT)
 		tf->nsect  = ide_mm_inb(io_ports->nsect_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_LBAL)
@@ -260,16 +260,16 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
 		ide_mm_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
-			tf->hob_feature = ide_mm_inb(io_ports->feature_addr);
+		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR)
+			tf->hob_error = ide_mm_inb(io_ports->feature_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT)
-			tf->hob_nsect   = ide_mm_inb(io_ports->nsect_addr);
+			tf->hob_nsect = ide_mm_inb(io_ports->nsect_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL)
-			tf->hob_lbal    = ide_mm_inb(io_ports->lbal_addr);
+			tf->hob_lbal  = ide_mm_inb(io_ports->lbal_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM)
-			tf->hob_lbam    = ide_mm_inb(io_ports->lbam_addr);
+			tf->hob_lbam  = ide_mm_inb(io_ports->lbam_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH)
-			tf->hob_lbah    = ide_mm_inb(io_ports->lbah_addr);
+			tf->hob_lbah  = ide_mm_inb(io_ports->lbah_addr);
 	}
 }
 
diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c
index a57ccad61ac..1d45cd5b6a1 100644
--- a/drivers/ide/ide-h8300.c
+++ b/drivers/ide/ide-h8300.c
@@ -100,8 +100,8 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	/* be sure we're looking at the low order bits */
 	outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
-		tf->feature = inb(io_ports->feature_addr);
+	if (cmd->tf_flags & IDE_TFLAG_IN_ERROR)
+		tf->error  = inb(io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_NSECT)
 		tf->nsect  = inb(io_ports->nsect_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_LBAL)
@@ -116,16 +116,16 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
 		outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
-			tf->hob_feature = inb(io_ports->feature_addr);
+		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR)
+			tf->hob_error = inb(io_ports->feature_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT)
-			tf->hob_nsect   = inb(io_ports->nsect_addr);
+			tf->hob_nsect = inb(io_ports->nsect_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL)
-			tf->hob_lbal    = inb(io_ports->lbal_addr);
+			tf->hob_lbal  = inb(io_ports->lbal_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM)
-			tf->hob_lbam    = inb(io_ports->lbam_addr);
+			tf->hob_lbam  = inb(io_ports->lbam_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH)
-			tf->hob_lbah    = inb(io_ports->lbah_addr);
+			tf->hob_lbah  = inb(io_ports->lbah_addr);
 	}
 }
 
diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
index bbeedce6b17..31f5c5f4c09 100644
--- a/drivers/ide/ide-io-std.c
+++ b/drivers/ide/ide-io-std.c
@@ -159,8 +159,8 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	/* be sure we're looking at the low order bits */
 	tf_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
-		tf->feature = tf_inb(io_ports->feature_addr);
+	if (cmd->tf_flags & IDE_TFLAG_IN_ERROR)
+		tf->error  = tf_inb(io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_NSECT)
 		tf->nsect  = tf_inb(io_ports->nsect_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_LBAL)
@@ -175,16 +175,16 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
 		tf_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
-			tf->hob_feature = tf_inb(io_ports->feature_addr);
+		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR)
+			tf->hob_error = tf_inb(io_ports->feature_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT)
-			tf->hob_nsect   = tf_inb(io_ports->nsect_addr);
+			tf->hob_nsect = tf_inb(io_ports->nsect_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL)
-			tf->hob_lbal    = tf_inb(io_ports->lbal_addr);
+			tf->hob_lbal  = tf_inb(io_ports->lbal_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM)
-			tf->hob_lbam    = tf_inb(io_ports->lbam_addr);
+			tf->hob_lbam  = tf_inb(io_ports->lbam_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH)
-			tf->hob_lbah    = tf_inb(io_ports->lbah_addr);
+			tf->hob_lbah  = tf_inb(io_ports->lbah_addr);
 	}
 }
 EXPORT_SYMBOL_GPL(ide_tf_read);
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index ae227dd8466..6f363a26700 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -55,7 +55,7 @@ u8 ide_read_error(ide_drive_t *drive)
 	struct ide_cmd cmd;
 
 	memset(&cmd, 0, sizeof(cmd));
-	cmd.tf_flags = IDE_TFLAG_IN_FEATURE;
+	cmd.tf_flags = IDE_TFLAG_IN_ERROR;
 
 	drive->hwif->tp_ops->tf_read(drive, &cmd);
 
diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
index 00ab0be7335..0a6cf74c326 100644
--- a/drivers/ide/ns87415.c
+++ b/drivers/ide/ns87415.c
@@ -76,8 +76,8 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	/* be sure we're looking at the low order bits */
 	outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
-		tf->feature = inb(io_ports->feature_addr);
+	if (cmd->tf_flags & IDE_TFLAG_IN_ERROR)
+		tf->error  = inb(io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_NSECT)
 		tf->nsect  = inb(io_ports->nsect_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_LBAL)
@@ -92,16 +92,16 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
 		outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
-			tf->hob_feature = inb(io_ports->feature_addr);
+		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR)
+			tf->hob_error = inb(io_ports->feature_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT)
-			tf->hob_nsect   = inb(io_ports->nsect_addr);
+			tf->hob_nsect = inb(io_ports->nsect_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL)
-			tf->hob_lbal    = inb(io_ports->lbal_addr);
+			tf->hob_lbal  = inb(io_ports->lbal_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM)
-			tf->hob_lbam    = inb(io_ports->lbam_addr);
+			tf->hob_lbam  = inb(io_ports->lbam_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH)
-			tf->hob_lbah    = inb(io_ports->lbah_addr);
+			tf->hob_lbah  = inb(io_ports->lbah_addr);
 	}
 }
 
diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c
index 6ba4983d831..ea0a9752c6f 100644
--- a/drivers/ide/scc_pata.c
+++ b/drivers/ide/scc_pata.c
@@ -702,8 +702,8 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	/* be sure we're looking at the low order bits */
 	scc_ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
-		tf->feature = scc_ide_inb(io_ports->feature_addr);
+	if (cmd->tf_flags & IDE_TFLAG_IN_ERROR)
+		tf->error  = scc_ide_inb(io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_NSECT)
 		tf->nsect  = scc_ide_inb(io_ports->nsect_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_LBAL)
@@ -718,16 +718,16 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
 		scc_ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
-			tf->hob_feature = scc_ide_inb(io_ports->feature_addr);
+		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR)
+			tf->hob_error = scc_ide_inb(io_ports->feature_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT)
-			tf->hob_nsect   = scc_ide_inb(io_ports->nsect_addr);
+			tf->hob_nsect = scc_ide_inb(io_ports->nsect_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL)
-			tf->hob_lbal    = scc_ide_inb(io_ports->lbal_addr);
+			tf->hob_lbal  = scc_ide_inb(io_ports->lbal_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM)
-			tf->hob_lbam    = scc_ide_inb(io_ports->lbam_addr);
+			tf->hob_lbam  = scc_ide_inb(io_ports->lbam_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH)
-			tf->hob_lbah    = scc_ide_inb(io_ports->lbah_addr);
+			tf->hob_lbah  = scc_ide_inb(io_ports->lbah_addr);
 	}
 }
 
diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c
index ec3aa32fbbe..606c37f5267 100644
--- a/drivers/ide/tx4938ide.c
+++ b/drivers/ide/tx4938ide.c
@@ -144,8 +144,8 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	/* be sure we're looking at the low order bits */
 	tx4938ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
-		tf->feature = tx4938ide_inb(io_ports->feature_addr);
+	if (cmd->tf_flags & IDE_TFLAG_IN_ERROR)
+		tf->error  = tx4938ide_inb(io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_NSECT)
 		tf->nsect  = tx4938ide_inb(io_ports->nsect_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_LBAL)
@@ -160,17 +160,16 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
 		tx4938ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
-			tf->hob_feature =
-				tx4938ide_inb(io_ports->feature_addr);
+		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR)
+			tf->hob_error = tx4938ide_inb(io_ports->feature_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT)
-			tf->hob_nsect   = tx4938ide_inb(io_ports->nsect_addr);
+			tf->hob_nsect = tx4938ide_inb(io_ports->nsect_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL)
-			tf->hob_lbal    = tx4938ide_inb(io_ports->lbal_addr);
+			tf->hob_lbal  = tx4938ide_inb(io_ports->lbal_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM)
-			tf->hob_lbam    = tx4938ide_inb(io_ports->lbam_addr);
+			tf->hob_lbam  = tx4938ide_inb(io_ports->lbam_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH)
-			tf->hob_lbah    = tx4938ide_inb(io_ports->lbah_addr);
+			tf->hob_lbah  = tx4938ide_inb(io_ports->lbah_addr);
 	}
 }
 
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index 43bc0372413..f1e9da71110 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -511,8 +511,8 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	/* be sure we're looking at the low order bits */
 	tx4939ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-	if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE)
-		tf->feature = tx4939ide_inb(io_ports->feature_addr);
+	if (cmd->tf_flags & IDE_TFLAG_IN_ERROR)
+		tf->error  = tx4939ide_inb(io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_NSECT)
 		tf->nsect  = tx4939ide_inb(io_ports->nsect_addr);
 	if (cmd->tf_flags & IDE_TFLAG_IN_LBAL)
@@ -527,17 +527,16 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
 		tx4939ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE)
-			tf->hob_feature =
-				tx4939ide_inb(io_ports->feature_addr);
+		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR)
+			tf->hob_error =	tx4939ide_inb(io_ports->feature_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT)
-			tf->hob_nsect   = tx4939ide_inb(io_ports->nsect_addr);
+			tf->hob_nsect = tx4939ide_inb(io_ports->nsect_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL)
-			tf->hob_lbal    = tx4939ide_inb(io_ports->lbal_addr);
+			tf->hob_lbal  = tx4939ide_inb(io_ports->lbal_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM)
-			tf->hob_lbam    = tx4939ide_inb(io_ports->lbam_addr);
+			tf->hob_lbam  = tx4939ide_inb(io_ports->lbam_addr);
 		if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH)
-			tf->hob_lbah    = tx4939ide_inb(io_ports->lbah_addr);
+			tf->hob_lbah  = tx4939ide_inb(io_ports->lbah_addr);
 	}
 }
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index ccb70abe991..e919c865f0c 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -265,7 +265,7 @@ enum {
 	IDE_TFLAG_WRITE			= (1 << 12),
 	IDE_TFLAG_CUSTOM_HANDLER	= (1 << 13),
 	IDE_TFLAG_DMA_PIO_FALLBACK	= (1 << 14),
-	IDE_TFLAG_IN_HOB_FEATURE	= (1 << 15),
+	IDE_TFLAG_IN_HOB_ERROR		= (1 << 15),
 	IDE_TFLAG_IN_HOB_NSECT		= (1 << 16),
 	IDE_TFLAG_IN_HOB_LBAL		= (1 << 17),
 	IDE_TFLAG_IN_HOB_LBAM		= (1 << 18),
@@ -273,10 +273,10 @@ enum {
 	IDE_TFLAG_IN_HOB_LBA		= IDE_TFLAG_IN_HOB_LBAL |
 					  IDE_TFLAG_IN_HOB_LBAM |
 					  IDE_TFLAG_IN_HOB_LBAH,
-	IDE_TFLAG_IN_HOB		= IDE_TFLAG_IN_HOB_FEATURE |
+	IDE_TFLAG_IN_HOB		= IDE_TFLAG_IN_HOB_ERROR |
 					  IDE_TFLAG_IN_HOB_NSECT |
 					  IDE_TFLAG_IN_HOB_LBA,
-	IDE_TFLAG_IN_FEATURE		= (1 << 20),
+	IDE_TFLAG_IN_ERROR		= (1 << 20),
 	IDE_TFLAG_IN_NSECT		= (1 << 21),
 	IDE_TFLAG_IN_LBAL		= (1 << 22),
 	IDE_TFLAG_IN_LBAM		= (1 << 23),
@@ -310,8 +310,12 @@ enum {
 
 struct ide_taskfile {
 	u8	hob_data;	/*  0: high data byte (for TASKFILE IOCTL) */
+				/*  1-5: additional data to support LBA48 */
+	union {
+		u8 hob_error;	/*   read: error */
+		u8 hob_feature;	/*  write: feature */
+	};
 
-	u8	hob_feature;	/*  1-5: additional data to support LBA48 */
 	u8	hob_nsect;
 	u8	hob_lbal;
 	u8	hob_lbam;
-- 
cgit v1.2.3


From deae17fd5d147ae65e277905343b7ea578574d12 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:31 +0200
Subject: ide-io-std: shorten ide_{in|out}put_data()

ide_{in|out|put_data() can be somewhat shortened by merging the paths doing
16-bit I/O...

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-io-std.c | 60 ++++++++++++++++++++++--------------------------
 1 file changed, 28 insertions(+), 32 deletions(-)

diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
index 31f5c5f4c09..96a537da892 100644
--- a/drivers/ide/ide-io-std.c
+++ b/drivers/ide/ide-io-std.c
@@ -216,11 +216,10 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
 	ide_hwif_t *hwif = drive->hwif;
 	struct ide_io_ports *io_ports = &hwif->io_ports;
 	unsigned long data_addr = io_ports->data_addr;
+	unsigned int words = (len + 1) >> 1;
 	u8 io_32bit = drive->io_32bit;
 	u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
 
-	len++;
-
 	if (io_32bit) {
 		unsigned long uninitialized_var(flags);
 
@@ -229,27 +228,26 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
 			ata_vlb_sync(io_ports->nsect_addr);
 		}
 
+		words >>= 1;
 		if (mmio)
-			__ide_mm_insl((void __iomem *)data_addr, buf, len / 4);
+			__ide_mm_insl((void __iomem *)data_addr, buf, words);
 		else
-			insl(data_addr, buf, len / 4);
+			insl(data_addr, buf, words);
 
 		if ((io_32bit & 2) && !mmio)
 			local_irq_restore(flags);
 
-		if ((len & 3) >= 2) {
-			if (mmio)
-				__ide_mm_insw((void __iomem *)data_addr,
-						(u8 *)buf + (len & ~3), 1);
-			else
-				insw(data_addr, (u8 *)buf + (len & ~3), 1);
-		}
-	} else {
-		if (mmio)
-			__ide_mm_insw((void __iomem *)data_addr, buf, len / 2);
-		else
-			insw(data_addr, buf, len / 2);
+		if (((len + 1) & 3) < 2)
+			return;
+
+		buf += len & ~3;
+		words = 1;
 	}
+
+	if (mmio)
+		__ide_mm_insw((void __iomem *)data_addr, buf, words);
+	else
+		insw(data_addr, buf, words);
 }
 EXPORT_SYMBOL_GPL(ide_input_data);
 
@@ -262,11 +260,10 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
 	ide_hwif_t *hwif = drive->hwif;
 	struct ide_io_ports *io_ports = &hwif->io_ports;
 	unsigned long data_addr = io_ports->data_addr;
+	unsigned int words = (len + 1) >> 1;
 	u8 io_32bit = drive->io_32bit;
 	u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
 
-	len++;
-
 	if (io_32bit) {
 		unsigned long uninitialized_var(flags);
 
@@ -275,27 +272,26 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
 			ata_vlb_sync(io_ports->nsect_addr);
 		}
 
+		words >>= 1;
 		if (mmio)
-			__ide_mm_outsl((void __iomem *)data_addr, buf, len / 4);
+			__ide_mm_outsl((void __iomem *)data_addr, buf, words);
 		else
-			outsl(data_addr, buf, len / 4);
+			outsl(data_addr, buf, words);
 
 		if ((io_32bit & 2) && !mmio)
 			local_irq_restore(flags);
 
-		if ((len & 3) >= 2) {
-			if (mmio)
-				__ide_mm_outsw((void __iomem *)data_addr,
-						 (u8 *)buf + (len & ~3), 1);
-			else
-				outsw(data_addr, (u8 *)buf + (len & ~3), 1);
-		}
-	} else {
-		if (mmio)
-			__ide_mm_outsw((void __iomem *)data_addr, buf, len / 2);
-		else
-			outsw(data_addr, buf, len / 2);
+		if (((len + 1) & 3) < 2)
+			return;
+
+		buf += len & ~3;
+		words = 1;
 	}
+
+	if (mmio)
+		__ide_mm_outsw((void __iomem *)data_addr, buf, words);
+	else
+		outsw(data_addr, buf, words);
 }
 EXPORT_SYMBOL_GPL(ide_output_data);
 
-- 
cgit v1.2.3


From bac08cee93f9cb37b40ecfa8eaf1f6d8daf3909b Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:31 +0200
Subject: ide: call {in|out}put_data() methods from tf_{read|load}() methods
 (take 2)

Handle IDE_FTFLAG_{IN|OUT}_DATA flags in tf_{read|load}() methods by calling
{in|out}put_data() methods to transfer 2 bytes -- this will allow us to move
that handling out of those methods altogether...

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/at91_ide.c   | 13 +++++++------
 drivers/ide/ide-h8300.c  | 15 ++++++++++-----
 drivers/ide/ide-io-std.c | 18 ++++++------------
 drivers/ide/ns87415.c    |  8 +++++---
 drivers/ide/scc_pata.c   | 16 ++++++++++------
 drivers/ide/tx4938ide.c  | 15 +++++++--------
 drivers/ide/tx4939ide.c  | 15 +++++++--------
 7 files changed, 52 insertions(+), 48 deletions(-)

diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c
index 9dce793d93b..b7be66d600f 100644
--- a/drivers/ide/at91_ide.c
+++ b/drivers/ide/at91_ide.c
@@ -196,9 +196,9 @@ static void at91_ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 		HIHI = 0xFF;
 
 	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
-		u16 data = (tf->hob_data << 8) | tf->data;
+		u8 data[2] = { tf->data, tf->hob_data };
 
-		at91_ide_output_data(drive, NULL, &data, 2);
+		at91_ide_output_data(drive, cmd, data, 2);
 	}
 
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
@@ -234,11 +234,12 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_taskfile *tf = &cmd->tf;
 
 	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u16 data;
+		u8 data[2];
 
-		at91_ide_input_data(drive, NULL, &data, 2);
-		tf->data = data & 0xff;
-		tf->hob_data = (data >> 8) & 0xff;
+		at91_ide_input_data(drive, cmd, data, 2);
+
+		tf->data = data[0];
+		tf->hob_data = data[1];
 	}
 
 	/* be sure we're looking at the low order bits */
diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c
index 1d45cd5b6a1..6cb1785d32e 100644
--- a/drivers/ide/ide-h8300.c
+++ b/drivers/ide/ide-h8300.c
@@ -54,8 +54,11 @@ static void h8300_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED)
 		HIHI = 0xFF;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA)
-		mm_outw((tf->hob_data << 8) | tf->data, io_ports->data_addr);
+	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
+		u8 data[2] = { tf->data, tf->hob_data };
+
+		h8300_output_data(drive, cmd, data, 2);
+	}
 
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
 		outb(tf->hob_feature, io_ports->feature_addr);
@@ -91,10 +94,12 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_taskfile *tf = &cmd->tf;
 
 	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u16 data = mm_inw(io_ports->data_addr);
+		u8 data[2];
+
+		h8300_input_data(drive, cmd, data, 2);
 
-		tf->data = data & 0xff;
-		tf->hob_data = (data >> 8) & 0xff;
+		tf->data = data[0];
+		tf->hob_data = data[1];
 	}
 
 	/* be sure we're looking at the low order bits */
diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
index 96a537da892..f06940df255 100644
--- a/drivers/ide/ide-io-std.c
+++ b/drivers/ide/ide-io-std.c
@@ -91,12 +91,9 @@ void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 		HIHI = 0xFF;
 
 	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
-		u16 data = (tf->hob_data << 8) | tf->data;
+		u8 data[2] = { tf->data, tf->hob_data };
 
-		if (mmio)
-			writew(data, (void __iomem *)io_ports->data_addr);
-		else
-			outw(data, io_ports->data_addr);
+		ide_output_data(drive, cmd, data, 2);
 	}
 
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
@@ -145,15 +142,12 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	}
 
 	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u16 data;
+		u8 data[2];
 
-		if (mmio)
-			data = readw((void __iomem *)io_ports->data_addr);
-		else
-			data = inw(io_ports->data_addr);
+		ide_input_data(drive, cmd, data, 2);
 
-		tf->data = data & 0xff;
-		tf->hob_data = (data >> 8) & 0xff;
+		tf->data = data[0];
+		tf->hob_data = data[1];
 	}
 
 	/* be sure we're looking at the low order bits */
diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
index 0a6cf74c326..7d64bc079fa 100644
--- a/drivers/ide/ns87415.c
+++ b/drivers/ide/ns87415.c
@@ -67,10 +67,12 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_taskfile *tf = &cmd->tf;
 
 	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u16 data = inw(io_ports->data_addr);
+		u8 data[2];
 
-		tf->data = data & 0xff;
-		tf->hob_data = (data >> 8) & 0xff;
+		ide_input_data(drive, cmd, data, 2);
+
+		tf->data = data[0];
+		tf->hob_data = data[1];
 	}
 
 	/* be sure we're looking at the low order bits */
diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c
index ea0a9752c6f..9c47cbf8c50 100644
--- a/drivers/ide/scc_pata.c
+++ b/drivers/ide/scc_pata.c
@@ -656,9 +656,11 @@ static void scc_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED)
 		HIHI = 0xFF;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA)
-		out_be32((void *)io_ports->data_addr,
-			 (tf->hob_data << 8) | tf->data);
+	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
+		u8 data[2] = { tf->data, tf->hob_data };
+
+		scc_output_data(drive, NULL, data, 2);
+	}
 
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
 		scc_ide_outb(tf->hob_feature, io_ports->feature_addr);
@@ -693,10 +695,12 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_taskfile *tf = &cmd->tf;
 
 	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u16 data = (u16)in_be32((void *)io_ports->data_addr);
+		u8 data[2];
+
+		scc_input_data(drive, cmd, data, 2);
 
-		tf->data = data & 0xff;
-		tf->hob_data = (data >> 8) & 0xff;
+		tf->data = data[0];
+		tf->hob_data = data[1];
 	}
 
 	/* be sure we're looking at the low order bits */
diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c
index 606c37f5267..c075464308a 100644
--- a/drivers/ide/tx4938ide.c
+++ b/drivers/ide/tx4938ide.c
@@ -93,10 +93,9 @@ static void tx4938ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 		HIHI = 0xFF;
 
 	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
-		u16 data = (tf->hob_data << 8) | tf->data;
+		u8 data[2] = { tf->data, tf->hob_data };
 
-		/* no endian swap */
-		__raw_writew(data, (void __iomem *)io_ports->data_addr);
+		hwif->tp_ops->output_data(drive, cmd, data, 2);
 	}
 
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
@@ -133,12 +132,12 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_taskfile *tf = &cmd->tf;
 
 	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u16 data;
+		u8 data[2];
 
-		/* no endian swap */
-		data = __raw_readw((void __iomem *)io_ports->data_addr);
-		tf->data = data & 0xff;
-		tf->hob_data = (data >> 8) & 0xff;
+		hwif->tp_ops->input_data(drive, cmd, data, 2);
+
+		tf->data = data[0];
+		tf->hob_data = data[1];
 	}
 
 	/* be sure we're looking at the low order bits */
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index f1e9da71110..c350d0c7ba4 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -458,10 +458,9 @@ static void tx4939ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 		HIHI = 0xFF;
 
 	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
-		u16 data = (tf->hob_data << 8) | tf->data;
+		u8 data[2] = { tf->data, tf->hob_data };
 
-		/* no endian swap */
-		__raw_writew(data, (void __iomem *)io_ports->data_addr);
+		hwif->tp_ops->output_data(drive, cmd, data, 2);
 	}
 
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
@@ -500,12 +499,12 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_taskfile *tf = &cmd->tf;
 
 	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u16 data;
+		u8 data[2];
 
-		/* no endian swap */
-		data = __raw_readw((void __iomem *)io_ports->data_addr);
-		tf->data = data & 0xff;
-		tf->hob_data = (data >> 8) & 0xff;
+		hwif->tp_ops->input_data(drive, cmd, data, 2);
+
+		tf->data = data[0];
+		tf->hob_data = data[1];
 	}
 
 	/* be sure we're looking at the low order bits */
-- 
cgit v1.2.3


From 35218d1ca808ed19b8c6f079ce91872b3deb2219 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:31 +0200
Subject: ide: move data register access out of tf_{read|load}() methods (take
 2)

Move IDE_FTFLAG_{IN|OUT}_DATA flag handling out of tf_{read|load}() methods
into the only two functions where these flags actually need to be handled:
do_rw_taskfile() and ide_complete_cmd()...

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/at91_ide.c     | 15 ---------------
 drivers/ide/ide-h8300.c    | 15 ---------------
 drivers/ide/ide-io-std.c   | 15 ---------------
 drivers/ide/ide-io.c       | 12 +++++++++++-
 drivers/ide/ide-taskfile.c |  6 ++++++
 drivers/ide/ns87415.c      |  9 ---------
 drivers/ide/scc_pata.c     | 15 ---------------
 drivers/ide/tx4938ide.c    | 15 ---------------
 drivers/ide/tx4939ide.c    | 15 ---------------
 9 files changed, 17 insertions(+), 100 deletions(-)

diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c
index b7be66d600f..04b39ff02d7 100644
--- a/drivers/ide/at91_ide.c
+++ b/drivers/ide/at91_ide.c
@@ -195,12 +195,6 @@ static void at91_ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED)
 		HIHI = 0xFF;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
-		u8 data[2] = { tf->data, tf->hob_data };
-
-		at91_ide_output_data(drive, cmd, data, 2);
-	}
-
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
 		ide_mm_outb(tf->hob_feature, io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT)
@@ -233,15 +227,6 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_io_ports *io_ports = &hwif->io_ports;
 	struct ide_taskfile *tf = &cmd->tf;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u8 data[2];
-
-		at91_ide_input_data(drive, cmd, data, 2);
-
-		tf->data = data[0];
-		tf->hob_data = data[1];
-	}
-
 	/* be sure we're looking at the low order bits */
 	ide_mm_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c
index 6cb1785d32e..8541a9abd7a 100644
--- a/drivers/ide/ide-h8300.c
+++ b/drivers/ide/ide-h8300.c
@@ -54,12 +54,6 @@ static void h8300_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED)
 		HIHI = 0xFF;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
-		u8 data[2] = { tf->data, tf->hob_data };
-
-		h8300_output_data(drive, cmd, data, 2);
-	}
-
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
 		outb(tf->hob_feature, io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT)
@@ -93,15 +87,6 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_io_ports *io_ports = &hwif->io_ports;
 	struct ide_taskfile *tf = &cmd->tf;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u8 data[2];
-
-		h8300_input_data(drive, cmd, data, 2);
-
-		tf->data = data[0];
-		tf->hob_data = data[1];
-	}
-
 	/* be sure we're looking at the low order bits */
 	outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
index f06940df255..7f77bb7db48 100644
--- a/drivers/ide/ide-io-std.c
+++ b/drivers/ide/ide-io-std.c
@@ -90,12 +90,6 @@ void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED)
 		HIHI = 0xFF;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
-		u8 data[2] = { tf->data, tf->hob_data };
-
-		ide_output_data(drive, cmd, data, 2);
-	}
-
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
 		tf_outb(tf->hob_feature, io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT)
@@ -141,15 +135,6 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 		tf_inb  = ide_inb;
 	}
 
-	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u8 data[2];
-
-		ide_input_data(drive, cmd, data, 2);
-
-		tf->data = data[0];
-		tf->hob_data = data[1];
-	}
-
 	/* be sure we're looking at the low order bits */
 	tf_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 5b57905a7d7..5589dce8867 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -73,6 +73,7 @@ EXPORT_SYMBOL_GPL(ide_end_rq);
 
 void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err)
 {
+	const struct ide_tp_ops *tp_ops = drive->hwif->tp_ops;
 	struct ide_taskfile *tf = &cmd->tf;
 	struct request *rq = cmd->rq;
 	u8 tf_cmd = tf->command;
@@ -80,7 +81,16 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err)
 	tf->error = err;
 	tf->status = stat;
 
-	drive->hwif->tp_ops->tf_read(drive, cmd);
+	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
+		u8 data[2];
+
+		tp_ops->input_data(drive, cmd, data, 2);
+
+		tf->data = data[0];
+		tf->hob_data = data[1];
+	}
+
+	tp_ops->tf_read(drive, cmd);
 
 	if ((cmd->tf_flags & IDE_TFLAG_CUSTOM_HANDLER) &&
 	    tf_cmd == ATA_CMD_IDLEIMMEDIATE) {
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 47f13cd1103..243421ce40d 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -82,6 +82,12 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd)
 		ide_tf_dump(drive->name, tf);
 		tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
 		SELECT_MASK(drive, 0);
+
+		if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
+			u8 data[2] = { tf->data, tf->hob_data };
+
+			tp_ops->output_data(drive, cmd, data, 2);
+		}
 		tp_ops->tf_load(drive, cmd);
 	}
 
diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
index 7d64bc079fa..9f6dff83b14 100644
--- a/drivers/ide/ns87415.c
+++ b/drivers/ide/ns87415.c
@@ -66,15 +66,6 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_io_ports *io_ports = &drive->hwif->io_ports;
 	struct ide_taskfile *tf = &cmd->tf;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u8 data[2];
-
-		ide_input_data(drive, cmd, data, 2);
-
-		tf->data = data[0];
-		tf->hob_data = data[1];
-	}
-
 	/* be sure we're looking at the low order bits */
 	outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c
index 9c47cbf8c50..97f8e0ef21b 100644
--- a/drivers/ide/scc_pata.c
+++ b/drivers/ide/scc_pata.c
@@ -656,12 +656,6 @@ static void scc_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED)
 		HIHI = 0xFF;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
-		u8 data[2] = { tf->data, tf->hob_data };
-
-		scc_output_data(drive, NULL, data, 2);
-	}
-
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
 		scc_ide_outb(tf->hob_feature, io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT)
@@ -694,15 +688,6 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_io_ports *io_ports = &drive->hwif->io_ports;
 	struct ide_taskfile *tf = &cmd->tf;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u8 data[2];
-
-		scc_input_data(drive, cmd, data, 2);
-
-		tf->data = data[0];
-		tf->hob_data = data[1];
-	}
-
 	/* be sure we're looking at the low order bits */
 	scc_ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c
index c075464308a..be391b61596 100644
--- a/drivers/ide/tx4938ide.c
+++ b/drivers/ide/tx4938ide.c
@@ -92,12 +92,6 @@ static void tx4938ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED)
 		HIHI = 0xFF;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
-		u8 data[2] = { tf->data, tf->hob_data };
-
-		hwif->tp_ops->output_data(drive, cmd, data, 2);
-	}
-
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
 		tx4938ide_outb(tf->hob_feature, io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT)
@@ -131,15 +125,6 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_io_ports *io_ports = &hwif->io_ports;
 	struct ide_taskfile *tf = &cmd->tf;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u8 data[2];
-
-		hwif->tp_ops->input_data(drive, cmd, data, 2);
-
-		tf->data = data[0];
-		tf->hob_data = data[1];
-	}
-
 	/* be sure we're looking at the low order bits */
 	tx4938ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index c350d0c7ba4..5a614d1c94f 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -457,12 +457,6 @@ static void tx4939ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 	if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED)
 		HIHI = 0xFF;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
-		u8 data[2] = { tf->data, tf->hob_data };
-
-		hwif->tp_ops->output_data(drive, cmd, data, 2);
-	}
-
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE)
 		tx4939ide_outb(tf->hob_feature, io_ports->feature_addr);
 	if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT)
@@ -498,15 +492,6 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct ide_io_ports *io_ports = &hwif->io_ports;
 	struct ide_taskfile *tf = &cmd->tf;
 
-	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u8 data[2];
-
-		hwif->tp_ops->input_data(drive, cmd, data, 2);
-
-		tf->data = data[0];
-		tf->hob_data = data[1];
-	}
-
 	/* be sure we're looking at the low order bits */
 	tx4939ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr);
 
-- 
cgit v1.2.3


From 0f861e8c47ede537a8ad280c61d5d00d541f04db Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 31 Mar 2009 20:15:31 +0200
Subject: MAINTAINERS: move old ide-{floppy,tape} entries to CREDITS (take 2)

Ben Hutchings noticed that MAINTAINERS somehow still contain old
ide-{floppy,tape} entries.  Fix it by moving them to CREDITS (kudos
to Gadi and Paul for all the early hard work on ide-{floppy,tape}).

v2:
Rename IDE/ATAPI CDROM DRIVER entry to IDE/ATAPI DRIVERS one.

Cc: Gadi Oxman <gadio@netvision.net.il>
Cc: Paul Bristow <paul@paulbristow.net>
Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 CREDITS     |  9 +++++++++
 MAINTAINERS | 15 +--------------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/CREDITS b/CREDITS
index e8b7d36611e..9a93e3e26d7 100644
--- a/CREDITS
+++ b/CREDITS
@@ -495,6 +495,11 @@ S: Kopmansg 2
 S: 411 13  Goteborg
 S: Sweden
 
+N: Paul Bristow
+E: paul@paulbristow.net
+W: http://paulbristow.net/linux/idefloppy.html
+D: Maintainer of IDE/ATAPI floppy driver
+
 N: Dominik Brodowski
 E: linux@brodo.de
 W: http://www.brodo.de/
@@ -2642,6 +2647,10 @@ S: C/ Mieses 20, 9-B
 S: Valladolid 47009
 S: Spain
 
+N: Gadi Oxman
+E: gadio@netvision.net.il
+D: Original author and maintainer of IDE/ATAPI floppy/tape drivers
+
 N: Greg Page
 E: gpage@sovereign.org
 D: IPX development and support
diff --git a/MAINTAINERS b/MAINTAINERS
index c5f4e9d27b6..d20c786baf2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2201,25 +2201,12 @@ L:	linux-ide@vger.kernel.org
 T:	quilt kernel.org/pub/linux/kernel/people/bart/pata-2.6/
 S:	Maintained
 
-IDE/ATAPI CDROM DRIVER
+IDE/ATAPI DRIVERS
 P:	Borislav Petkov
 M:	petkovbb@gmail.com
 L:	linux-ide@vger.kernel.org
 S:	Maintained
 
-IDE/ATAPI FLOPPY DRIVERS
-P:	Paul Bristow
-M:	Paul Bristow <paul@paulbristow.net>
-W:	http://paulbristow.net/linux/idefloppy.html
-L:	linux-kernel@vger.kernel.org
-S:	Maintained
-
-IDE/ATAPI TAPE DRIVERS
-P:	Gadi Oxman
-M:	Gadi Oxman <gadio@netvision.net.il>
-L:	linux-kernel@vger.kernel.org
-S:	Maintained
-
 IDLE-I7300
 P:	Andy Henroid
 M:	andrew.d.henroid@intel.com
-- 
cgit v1.2.3


From abb596b25edac1ec1acc4ef53df190771661c3d2 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:32 +0200
Subject: ide: turn selectproc() method into dev_select() method (take 5)

Turn selectproc() method into dev_select() method by teaching it to write to the
device register and moving it from 'struct ide_port_ops' to 'struct ide_tp_ops'.

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: benh@kernel.crashing.org
Cc: petkovbb@gmail.com
[bart: add ->dev_select to at91_ide.c and tx4939.c (__BIG_ENDIAN case)]
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/at91_ide.c   |  1 +
 drivers/ide/au1xxx-ide.c |  1 +
 drivers/ide/falconide.c  |  1 +
 drivers/ide/ht6560b.c    | 20 +++++++++++++++--
 drivers/ide/ide-h8300.c  |  1 +
 drivers/ide/ide-io-std.c | 13 +++++++++++
 drivers/ide/ide-iops.c   | 12 +---------
 drivers/ide/ns87415.c    | 25 ++++++++++++++++-----
 drivers/ide/pmac.c       | 58 ++++++++++++++++++++++++++++++++----------------
 drivers/ide/q40ide.c     |  1 +
 drivers/ide/qd65xx.c     | 21 +++++++++++++++---
 drivers/ide/scc_pata.c   |  1 +
 drivers/ide/sgiioc4.c    |  1 +
 drivers/ide/trm290.c     | 20 +++++++++++++----
 drivers/ide/tx4938ide.c  |  1 +
 drivers/ide/tx4939ide.c  |  4 +++-
 include/linux/ide.h      |  6 ++---
 17 files changed, 139 insertions(+), 48 deletions(-)

diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c
index 04b39ff02d7..8eda552326e 100644
--- a/drivers/ide/at91_ide.c
+++ b/drivers/ide/at91_ide.c
@@ -283,6 +283,7 @@ static const struct ide_tp_ops at91_ide_tp_ops = {
 	.read_altstatus	= ide_read_altstatus,
 	.write_devctl	= ide_write_devctl,
 
+	.dev_select	= ide_dev_select,
 	.tf_load	= at91_ide_tf_load,
 	.tf_read	= at91_ide_tf_read,
 
diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c
index 2ca10d533da..46013644c96 100644
--- a/drivers/ide/au1xxx-ide.c
+++ b/drivers/ide/au1xxx-ide.c
@@ -469,6 +469,7 @@ static const struct ide_tp_ops au1xxx_tp_ops = {
 	.read_altstatus		= ide_read_altstatus,
 	.write_devctl		= ide_write_devctl,
 
+	.dev_select		= ide_dev_select,
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
 
diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c
index 5063be85dc3..afa2af9a362 100644
--- a/drivers/ide/falconide.c
+++ b/drivers/ide/falconide.c
@@ -91,6 +91,7 @@ static const struct ide_tp_ops falconide_tp_ops = {
 	.read_altstatus		= ide_read_altstatus,
 	.write_devctl		= ide_write_devctl,
 
+	.dev_select		= ide_dev_select,
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
 
diff --git a/drivers/ide/ht6560b.c b/drivers/ide/ht6560b.c
index c7e5c2246b7..2fb0f296500 100644
--- a/drivers/ide/ht6560b.c
+++ b/drivers/ide/ht6560b.c
@@ -103,7 +103,7 @@
 /*
  * This routine is invoked from ide.c to prepare for access to a given drive.
  */
-static void ht6560b_selectproc (ide_drive_t *drive)
+static void ht6560b_dev_select(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	unsigned long flags;
@@ -143,6 +143,8 @@ static void ht6560b_selectproc (ide_drive_t *drive)
 #endif
 	}
 	local_irq_restore(flags);
+
+	outb(drive->select | ATA_DEVICE_OBS, hwif->io_ports.device_addr);
 }
 
 /*
@@ -305,15 +307,29 @@ static int probe_ht6560b;
 module_param_named(probe, probe_ht6560b, bool, 0);
 MODULE_PARM_DESC(probe, "probe for HT6560B chipset");
 
+static const struct ide_tp_ops ht6560b_tp_ops = {
+	.exec_command		= ide_exec_command,
+	.read_status		= ide_read_status,
+	.read_altstatus		= ide_read_altstatus,
+	.write_devctl		= ide_write_devctl,
+
+	.dev_select		= ht6560b_dev_select,
+	.tf_load		= ide_tf_load,
+	.tf_read		= ide_tf_read,
+
+	.input_data		= ide_input_data,
+	.output_data		= ide_output_data,
+};
+
 static const struct ide_port_ops ht6560b_port_ops = {
 	.init_dev		= ht6560b_init_dev,
 	.set_pio_mode		= ht6560b_set_pio_mode,
-	.selectproc		= ht6560b_selectproc,
 };
 
 static const struct ide_port_info ht6560b_port_info __initdata = {
 	.name			= DRV_NAME,
 	.chipset		= ide_ht6560b,
+	.tp_ops 		= &ht6560b_tp_ops,
 	.port_ops		= &ht6560b_port_ops,
 	.host_flags		= IDE_HFLAG_SERIALIZE | /* is this needed? */
 				  IDE_HFLAG_NO_DMA |
diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c
index 8541a9abd7a..dac9a6d4496 100644
--- a/drivers/ide/ide-h8300.c
+++ b/drivers/ide/ide-h8300.c
@@ -151,6 +151,7 @@ static const struct ide_tp_ops h8300_tp_ops = {
 	.read_altstatus		= ide_read_altstatus,
 	.write_devctl		= ide_write_devctl,
 
+	.dev_select		= ide_dev_select,
 	.tf_load		= h8300_tf_load,
 	.tf_read		= h8300_tf_read,
 
diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
index 7f77bb7db48..9cac281d82c 100644
--- a/drivers/ide/ide-io-std.c
+++ b/drivers/ide/ide-io-std.c
@@ -73,6 +73,18 @@ void ide_write_devctl(ide_hwif_t *hwif, u8 ctl)
 }
 EXPORT_SYMBOL_GPL(ide_write_devctl);
 
+void ide_dev_select(ide_drive_t *drive)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	u8 select = drive->select | ATA_DEVICE_OBS;
+
+	if (hwif->host_flags & IDE_HFLAG_MMIO)
+		writeb(select, (void __iomem *)hwif->io_ports.device_addr);
+	else
+		outb(select, hwif->io_ports.device_addr);
+}
+EXPORT_SYMBOL_GPL(ide_dev_select);
+
 void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd)
 {
 	ide_hwif_t *hwif = drive->hwif;
@@ -280,6 +292,7 @@ const struct ide_tp_ops default_tp_ops = {
 	.read_altstatus		= ide_read_altstatus,
 	.write_devctl		= ide_write_devctl,
 
+	.dev_select		= ide_dev_select,
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
 
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index 6f363a26700..dfb0ec317fa 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -29,17 +29,7 @@
 
 void SELECT_DRIVE(ide_drive_t *drive)
 {
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_port_ops *port_ops = hwif->port_ops;
-	struct ide_cmd cmd;
-
-	if (port_ops && port_ops->selectproc)
-		port_ops->selectproc(drive);
-
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.tf_flags = IDE_TFLAG_OUT_DEVICE;
-
-	drive->hwif->tp_ops->tf_load(drive, &cmd);
+	drive->hwif->tp_ops->dev_select(drive);
 }
 
 void SELECT_MASK(ide_drive_t *drive, int mask)
diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
index 9f6dff83b14..af1b421eb45 100644
--- a/drivers/ide/ns87415.c
+++ b/drivers/ide/ns87415.c
@@ -98,12 +98,15 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd)
 	}
 }
 
+static void ns87415_dev_select(ide_drive_t *drive);
+
 static const struct ide_tp_ops superio_tp_ops = {
 	.exec_command		= ide_exec_command,
 	.read_status		= superio_read_status,
 	.read_altstatus		= ide_read_altstatus,
 	.write_devctl		= ide_write_devctl,
 
+	.dev_select		= ns87415_dev_select,
 	.tf_load		= ide_tf_load,
 	.tf_read		= superio_tf_read,
 
@@ -182,10 +185,12 @@ static void ns87415_prepare_drive (ide_drive_t *drive, unsigned int use_dma)
 	local_irq_restore(flags);
 }
 
-static void ns87415_selectproc (ide_drive_t *drive)
+static void ns87415_dev_select(ide_drive_t *drive)
 {
 	ns87415_prepare_drive(drive,
 			      !!(drive->dev_flags & IDE_DFLAG_USING_DMA));
+
+	outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr);
 }
 
 static void ns87415_dma_start(ide_drive_t *drive)
@@ -229,7 +234,7 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif)
 	 * Also, leave IRQ masked during drive probing, to prevent infinite
 	 * interrupts from a potentially floating INTA..
 	 *
-	 * IRQs get unmasked in selectproc when drive is first used.
+	 * IRQs get unmasked in dev_select() when drive is first used.
 	 */
 	(void) pci_read_config_dword(dev, 0x40, &ctrl);
 	(void) pci_read_config_byte(dev, 0x09, &progif);
@@ -281,8 +286,18 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif)
 	outb(0x60, hwif->dma_base + ATA_DMA_STATUS);
 }
 
-static const struct ide_port_ops ns87415_port_ops = {
-	.selectproc		= ns87415_selectproc,
+static const struct ide_tp_ops ns87415_tp_ops = {
+	.exec_command		= ide_exec_command,
+	.read_status		= ide_read_status,
+	.read_altstatus		= ide_read_altstatus,
+	.write_devctl		= ide_write_devctl,
+
+	.dev_select		= ns87415_dev_select,
+	.tf_load		= ide_tf_load,
+	.tf_read		= ide_tf_read,
+
+	.input_data		= ide_input_data,
+	.output_data		= ide_output_data,
 };
 
 static const struct ide_dma_ops ns87415_dma_ops = {
@@ -299,7 +314,7 @@ static const struct ide_dma_ops ns87415_dma_ops = {
 static const struct ide_port_info ns87415_chipset __devinitdata = {
 	.name		= DRV_NAME,
 	.init_hwif	= init_hwif_ns87415,
-	.port_ops	= &ns87415_port_ops,
+	.tp_ops 	= &ns87415_tp_ops,
 	.dma_ops	= &ns87415_dma_ops,
 	.host_flags	= IDE_HFLAG_TRUST_BIOS_FOR_DMA |
 			  IDE_HFLAG_NO_ATAPI_DMA,
diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c
index 7aa45ea37ee..24ce1f805cd 100644
--- a/drivers/ide/pmac.c
+++ b/drivers/ide/pmac.c
@@ -404,8 +404,6 @@ kauai_lookup_timing(struct kauai_timing* table, int cycle_time)
 #define IDE_WAKEUP_DELAY	(1*HZ)
 
 static int pmac_ide_init_dma(ide_hwif_t *, const struct ide_port_info *);
-static void pmac_ide_selectproc(ide_drive_t *drive);
-static void pmac_ide_kauai_selectproc(ide_drive_t *drive);
 
 #define PMAC_IDE_REG(x) \
 	((void __iomem *)((drive)->hwif->io_ports.data_addr + (x)))
@@ -415,8 +413,7 @@ static void pmac_ide_kauai_selectproc(ide_drive_t *drive);
  * timing register when selecting that unit. This version is for
  * ASICs with a single timing register
  */
-static void
-pmac_ide_selectproc(ide_drive_t *drive)
+static void pmac_ide_apply_timings(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	pmac_ide_hwif_t *pmif =
@@ -434,8 +431,7 @@ pmac_ide_selectproc(ide_drive_t *drive)
  * timing register when selecting that unit. This version is for
  * ASICs with a dual timing register (Kauai)
  */
-static void
-pmac_ide_kauai_selectproc(ide_drive_t *drive)
+static void pmac_ide_kauai_apply_timings(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	pmac_ide_hwif_t *pmif =
@@ -464,9 +460,25 @@ pmac_ide_do_update_timings(ide_drive_t *drive)
 	if (pmif->kind == controller_sh_ata6 ||
 	    pmif->kind == controller_un_ata6 ||
 	    pmif->kind == controller_k2_ata6)
-		pmac_ide_kauai_selectproc(drive);
+		pmac_ide_kauai_apply_timings(drive);
 	else
-		pmac_ide_selectproc(drive);
+		pmac_ide_apply_timings(drive);
+}
+
+static void pmac_dev_select(ide_drive_t *drive)
+{
+	pmac_ide_apply_timings(drive);
+
+	writeb(drive->select | ATA_DEVICE_OBS,
+	       (void __iomem *)drive->hwif->io_ports.device_addr);
+}
+
+static void pmac_kauai_dev_select(ide_drive_t *drive)
+{
+	pmac_ide_kauai_apply_timings(drive);
+
+	writeb(drive->select | ATA_DEVICE_OBS,
+	       (void __iomem *)drive->hwif->io_ports.device_addr);
 }
 
 static void pmac_exec_command(ide_hwif_t *hwif, u8 cmd)
@@ -947,6 +959,7 @@ static const struct ide_tp_ops pmac_tp_ops = {
 	.read_altstatus		= ide_read_altstatus,
 	.write_devctl		= pmac_write_devctl,
 
+	.dev_select		= pmac_dev_select,
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
 
@@ -954,19 +967,24 @@ static const struct ide_tp_ops pmac_tp_ops = {
 	.output_data		= ide_output_data,
 };
 
-static const struct ide_port_ops pmac_ide_ata6_port_ops = {
-	.init_dev		= pmac_ide_init_dev,
-	.set_pio_mode		= pmac_ide_set_pio_mode,
-	.set_dma_mode		= pmac_ide_set_dma_mode,
-	.selectproc		= pmac_ide_kauai_selectproc,
-	.cable_detect		= pmac_ide_cable_detect,
+static const struct ide_tp_ops pmac_ata6_tp_ops = {
+	.exec_command		= pmac_exec_command,
+	.read_status		= ide_read_status,
+	.read_altstatus		= ide_read_altstatus,
+	.write_devctl		= pmac_write_devctl,
+
+	.dev_select		= pmac_kauai_dev_select,
+	.tf_load		= ide_tf_load,
+	.tf_read		= ide_tf_read,
+
+	.input_data		= ide_input_data,
+	.output_data		= ide_output_data,
 };
 
 static const struct ide_port_ops pmac_ide_ata4_port_ops = {
 	.init_dev		= pmac_ide_init_dev,
 	.set_pio_mode		= pmac_ide_set_pio_mode,
 	.set_dma_mode		= pmac_ide_set_dma_mode,
-	.selectproc		= pmac_ide_selectproc,
 	.cable_detect		= pmac_ide_cable_detect,
 };
 
@@ -974,7 +992,6 @@ static const struct ide_port_ops pmac_ide_port_ops = {
 	.init_dev		= pmac_ide_init_dev,
 	.set_pio_mode		= pmac_ide_set_pio_mode,
 	.set_dma_mode		= pmac_ide_set_dma_mode,
-	.selectproc		= pmac_ide_selectproc,
 };
 
 static const struct ide_dma_ops pmac_dma_ops;
@@ -1011,15 +1028,18 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, hw_regs_t *hw)
 	pmif->broken_dma = pmif->broken_dma_warn = 0;
 	if (of_device_is_compatible(np, "shasta-ata")) {
 		pmif->kind = controller_sh_ata6;
-		d.port_ops = &pmac_ide_ata6_port_ops;
+		d.tp_ops = &pmac_ata6_tp_ops;
+		d.port_ops = &pmac_ide_ata4_port_ops;
 		d.udma_mask = ATA_UDMA6;
 	} else if (of_device_is_compatible(np, "kauai-ata")) {
 		pmif->kind = controller_un_ata6;
-		d.port_ops = &pmac_ide_ata6_port_ops;
+		d.tp_ops = &pmac_ata6_tp_ops;
+		d.port_ops = &pmac_ide_ata4_port_ops;
 		d.udma_mask = ATA_UDMA5;
 	} else if (of_device_is_compatible(np, "K2-UATA")) {
 		pmif->kind = controller_k2_ata6;
-		d.port_ops = &pmac_ide_ata6_port_ops;
+		d.tp_ops = &pmac_ata6_tp_ops;
+		d.port_ops = &pmac_ide_ata4_port_ops;
 		d.udma_mask = ATA_UDMA5;
 	} else if (of_device_is_compatible(np, "keylargo-ata")) {
 		if (strcmp(np->name, "ata-4") == 0) {
diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c
index 7fddfd34fcc..d007e7f6659 100644
--- a/drivers/ide/q40ide.c
+++ b/drivers/ide/q40ide.c
@@ -101,6 +101,7 @@ static const struct ide_tp_ops q40ide_tp_ops = {
 	.read_altstatus		= ide_read_altstatus,
 	.write_devctl		= ide_write_devctl,
 
+	.dev_select		= ide_dev_select,
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
 
diff --git a/drivers/ide/qd65xx.c b/drivers/ide/qd65xx.c
index 08c4fa35e9b..c9a13498689 100644
--- a/drivers/ide/qd65xx.c
+++ b/drivers/ide/qd65xx.c
@@ -90,13 +90,15 @@ static int timings[4]={-1,-1,-1,-1}; /* stores current timing for each timer */
  * This routine is invoked to prepare for access to a given drive.
  */
 
-static void qd65xx_select(ide_drive_t *drive)
+static void qd65xx_dev_select(ide_drive_t *drive)
 {
 	u8 index = ((	(QD_TIMREG(drive)) & 0x80 ) >> 7) |
 			(QD_TIMREG(drive) & 0x02);
 
 	if (timings[index] != QD_TIMING(drive))
 		outb(timings[index] = QD_TIMING(drive), QD_TIMREG(drive));
+
+	outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr);
 }
 
 /*
@@ -309,20 +311,33 @@ static void __init qd6580_init_dev(ide_drive_t *drive)
 	drive->drive_data = (drive->dn & 1) ? t2 : t1;
 }
 
+static const struct ide_tp_ops qd65xx_tp_ops = {
+	.exec_command		= ide_exec_command,
+	.read_status		= ide_read_status,
+	.read_altstatus		= ide_read_altstatus,
+	.write_devctl		= ide_write_devctl,
+
+	.dev_select		= qd65xx_dev_select,
+	.tf_load		= ide_tf_load,
+	.tf_read		= ide_tf_read,
+
+	.input_data		= ide_input_data,
+	.output_data		= ide_output_data,
+};
+
 static const struct ide_port_ops qd6500_port_ops = {
 	.init_dev		= qd6500_init_dev,
 	.set_pio_mode		= qd6500_set_pio_mode,
-	.selectproc		= qd65xx_select,
 };
 
 static const struct ide_port_ops qd6580_port_ops = {
 	.init_dev		= qd6580_init_dev,
 	.set_pio_mode		= qd6580_set_pio_mode,
-	.selectproc		= qd65xx_select,
 };
 
 static const struct ide_port_info qd65xx_port_info __initdata = {
 	.name			= DRV_NAME,
+	.tp_ops 		= &qd65xx_tp_ops,
 	.chipset		= ide_qd65xx,
 	.host_flags		= IDE_HFLAG_IO_32BIT |
 				  IDE_HFLAG_NO_DMA,
diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c
index 97f8e0ef21b..6d8dbd9c10b 100644
--- a/drivers/ide/scc_pata.c
+++ b/drivers/ide/scc_pata.c
@@ -825,6 +825,7 @@ static const struct ide_tp_ops scc_tp_ops = {
 	.read_altstatus		= scc_read_altstatus,
 	.write_devctl		= scc_write_devctl,
 
+	.dev_select		= ide_dev_select,
 	.tf_load		= scc_tf_load,
 	.tf_read		= scc_tf_read,
 
diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c
index 58980fcafc3..e5d2a48a84d 100644
--- a/drivers/ide/sgiioc4.c
+++ b/drivers/ide/sgiioc4.c
@@ -505,6 +505,7 @@ static const struct ide_tp_ops sgiioc4_tp_ops = {
 	.read_altstatus		= ide_read_altstatus,
 	.write_devctl		= ide_write_devctl,
 
+	.dev_select		= ide_dev_select,
 	.tf_load		= ide_tf_load,
 	.tf_read		= ide_tf_read,
 
diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c
index c0528f27fca..4b42ca09153 100644
--- a/drivers/ide/trm290.c
+++ b/drivers/ide/trm290.c
@@ -171,9 +171,11 @@ static void trm290_prepare_drive (ide_drive_t *drive, unsigned int use_dma)
 	local_irq_restore(flags);
 }
 
-static void trm290_selectproc (ide_drive_t *drive)
+static void trm290_dev_select(ide_drive_t *drive)
 {
 	trm290_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA));
+
+	outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr);
 }
 
 static int trm290_dma_check(ide_drive_t *drive, struct ide_cmd *cmd)
@@ -298,8 +300,18 @@ static void __devinit init_hwif_trm290(ide_hwif_t *hwif)
 #endif
 }
 
-static const struct ide_port_ops trm290_port_ops = {
-	.selectproc		= trm290_selectproc,
+static const struct ide_tp_ops trm290_tp_ops = {
+	.exec_command		= ide_exec_command,
+	.read_status		= ide_read_status,
+	.read_altstatus		= ide_read_altstatus,
+	.write_devctl		= ide_write_devctl,
+
+	.dev_select		= trm290_dev_select,
+	.tf_load		= ide_tf_load,
+	.tf_read		= ide_tf_read,
+
+	.input_data		= ide_input_data,
+	.output_data		= ide_output_data,
 };
 
 static struct ide_dma_ops trm290_dma_ops = {
@@ -315,7 +327,7 @@ static struct ide_dma_ops trm290_dma_ops = {
 static const struct ide_port_info trm290_chipset __devinitdata = {
 	.name		= DRV_NAME,
 	.init_hwif	= init_hwif_trm290,
-	.port_ops	= &trm290_port_ops,
+	.tp_ops 	= &trm290_tp_ops,
 	.dma_ops	= &trm290_dma_ops,
 	.host_flags	= IDE_HFLAG_TRM290 |
 			  IDE_HFLAG_NO_ATAPI_DMA |
diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c
index be391b61596..4cb79c4c260 100644
--- a/drivers/ide/tx4938ide.c
+++ b/drivers/ide/tx4938ide.c
@@ -189,6 +189,7 @@ static const struct ide_tp_ops tx4938ide_tp_ops = {
 	.read_altstatus		= ide_read_altstatus,
 	.write_devctl		= ide_write_devctl,
 
+	.dev_select		= ide_dev_select,
 	.tf_load		= tx4938ide_tf_load,
 	.tf_read		= tx4938ide_tf_read,
 
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index 5a614d1c94f..0040a9a3e26 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -429,7 +429,7 @@ static void tx4939ide_tf_load_fixup(ide_drive_t *drive)
 	 * Fix ATA100 CORE System Control Register. (The write to the
 	 * Device/Head register may write wrong data to the System
 	 * Control Register)
-	 * While Sys_Ctl is written here, selectproc is not needed.
+	 * While Sys_Ctl is written here, dev_select() is not needed.
 	 */
 	tx4939ide_writew(sysctl, base, TX4939IDE_Sys_Ctl);
 }
@@ -556,6 +556,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = {
 	.read_altstatus		= ide_read_altstatus,
 	.write_devctl		= ide_write_devctl,
 
+	.dev_select		= ide_dev_select,
 	.tf_load		= tx4939ide_tf_load,
 	.tf_read		= tx4939ide_tf_read,
 
@@ -579,6 +580,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = {
 	.read_altstatus		= ide_read_altstatus,
 	.write_devctl		= ide_write_devctl,
 
+	.dev_select		= ide_dev_select,
 	.tf_load		= tx4939ide_tf_load,
 	.tf_read		= ide_tf_read,
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index e919c865f0c..c69181c61fd 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -603,7 +603,7 @@ struct ide_drive_s {
 
 	unsigned int	bios_cyl;	/* BIOS/fdisk/LILO number of cyls */
 	unsigned int	cyl;		/* "real" number of cyls */
-	unsigned int	drive_data;	/* used by set_pio_mode/selectproc */
+	unsigned int	drive_data;	/* used by set_pio_mode/dev_select() */
 	unsigned int	failures;	/* current failure count */
 	unsigned int	max_failures;	/* maximum allowed failure count */
 	u64		probed_capacity;/* initial reported media capacity (ide-cd only currently) */
@@ -661,6 +661,7 @@ struct ide_tp_ops {
 	u8	(*read_altstatus)(struct hwif_s *);
 	void	(*write_devctl)(struct hwif_s *, u8);
 
+	void	(*dev_select)(ide_drive_t *);
 	void	(*tf_load)(ide_drive_t *, struct ide_cmd *);
 	void	(*tf_read)(ide_drive_t *, struct ide_cmd *);
 
@@ -678,7 +679,6 @@ extern const struct ide_tp_ops default_tp_ops;
  * @init_dev:		host specific initialization of a device
  * @set_pio_mode:	routine to program host for PIO mode
  * @set_dma_mode:	routine to program host for DMA mode
- * @selectproc:		tweaks hardware to select drive
  * @reset_poll:		chipset polling based on hba specifics
  * @pre_reset:		chipset specific changes to default for device-hba resets
  * @resetproc:		routine to reset controller after a disk reset
@@ -695,7 +695,6 @@ struct ide_port_ops {
 	void	(*init_dev)(ide_drive_t *);
 	void	(*set_pio_mode)(ide_drive_t *, const u8);
 	void	(*set_dma_mode)(ide_drive_t *, const u8);
-	void	(*selectproc)(ide_drive_t *);
 	int	(*reset_poll)(ide_drive_t *);
 	void	(*pre_reset)(ide_drive_t *);
 	void	(*resetproc)(ide_drive_t *);
@@ -1170,6 +1169,7 @@ u8 ide_read_status(ide_hwif_t *);
 u8 ide_read_altstatus(ide_hwif_t *);
 void ide_write_devctl(ide_hwif_t *, u8);
 
+void ide_dev_select(ide_drive_t *);
 void ide_tf_load(ide_drive_t *, struct ide_cmd *);
 void ide_tf_read(ide_drive_t *, struct ide_cmd *);
 
-- 
cgit v1.2.3


From fdd88f0af616db59a6a36bdf0185181d2b779f53 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Tue, 31 Mar 2009 20:15:33 +0200
Subject: ide: inline SELECT_DRIVE()

Since SELECT_DRIVE() has boiled down to a mere dev_select() method call, it now
makes sense to just inline it...

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-eh.c    |  7 ++++---
 drivers/ide/ide-io.c    |  2 +-
 drivers/ide/ide-iops.c  |  7 +------
 drivers/ide/ide-pm.c    |  5 +++--
 drivers/ide/ide-probe.c | 15 ++++++++-------
 drivers/ide/ns87415.c   |  2 +-
 include/linux/ide.h     |  1 -
 7 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index de4b7f1c9c9..5d5fb961b5c 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -165,11 +165,12 @@ static ide_startstop_t do_reset1(ide_drive_t *, int);
 static ide_startstop_t atapi_reset_pollfunc(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
+	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
 	u8 stat;
 
-	SELECT_DRIVE(drive);
+	tp_ops->dev_select(drive);
 	udelay(10);
-	stat = hwif->tp_ops->read_status(hwif);
+	stat = tp_ops->read_status(hwif);
 
 	if (OK_STAT(stat, 0, ATA_BUSY))
 		printk(KERN_INFO "%s: ATAPI reset complete\n", drive->name);
@@ -348,7 +349,7 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi)
 	/* For an ATAPI device, first try an ATAPI SRST. */
 	if (drive->media != ide_disk && !do_not_try_atapi) {
 		pre_reset(drive);
-		SELECT_DRIVE(drive);
+		tp_ops->dev_select(drive);
 		udelay(20);
 		tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET);
 		ndelay(400);
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 5589dce8867..1deb6d29b18 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -348,7 +348,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 	if (blk_pm_request(rq))
 		ide_check_pm_state(drive, rq);
 
-	SELECT_DRIVE(drive);
+	drive->hwif->tp_ops->dev_select(drive);
 	if (ide_wait_stat(&startstop, drive, drive->ready_stat,
 			  ATA_BUSY | ATA_DRQ, WAIT_READY)) {
 		printk(KERN_ERR "%s: drive not ready for command\n", drive->name);
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index dfb0ec317fa..27bb70ddd45 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -27,11 +27,6 @@
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
-void SELECT_DRIVE(ide_drive_t *drive)
-{
-	drive->hwif->tp_ops->dev_select(drive);
-}
-
 void SELECT_MASK(ide_drive_t *drive, int mask)
 {
 	const struct ide_port_ops *port_ops = drive->hwif->port_ops;
@@ -347,7 +342,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed)
 	disable_irq_nosync(hwif->irq);
 
 	udelay(1);
-	SELECT_DRIVE(drive);
+	tp_ops->dev_select(drive);
 	SELECT_MASK(drive, 1);
 	udelay(1);
 	tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS);
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 20553d4c42a..bb7858ebb7d 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -223,6 +223,7 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 		 * point.
 		 */
 		ide_hwif_t *hwif = drive->hwif;
+		const struct ide_tp_ops *tp_ops = hwif->tp_ops;
 		struct request_queue *q = drive->queue;
 		unsigned long flags;
 		int rc;
@@ -232,8 +233,8 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 		rc = ide_wait_not_busy(hwif, 35000);
 		if (rc)
 			printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name);
-		SELECT_DRIVE(drive);
-		hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
+		tp_ops->dev_select(drive);
+		tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
 		rc = ide_wait_not_busy(hwif, 100000);
 		if (rc)
 			printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name);
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index d240f76b0da..d8c1c3e735b 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -390,13 +390,13 @@ static int do_probe (ide_drive_t *drive, u8 cmd)
 	 * (e.g. crw9624 as drive0 with disk as slave)
 	 */
 	msleep(50);
-	SELECT_DRIVE(drive);
+	tp_ops->dev_select(drive);
 	msleep(50);
 
 	if (ide_read_device(drive) != drive->select && present == 0) {
 		if (drive->dn & 1) {
 			/* exit with drive0 selected */
-			SELECT_DRIVE(hwif->devices[0]);
+			tp_ops->dev_select(hwif->devices[0]);
 			/* allow ATA_BUSY to assert & clear */
 			msleep(50);
 		}
@@ -422,7 +422,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd)
 			printk(KERN_ERR "%s: no response (status = 0x%02x), "
 					"resetting drive\n", drive->name, stat);
 			msleep(50);
-			SELECT_DRIVE(drive);
+			tp_ops->dev_select(drive);
 			msleep(50);
 			tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET);
 			(void)ide_busy_sleep(hwif, WAIT_WORSTCASE, 0);
@@ -441,7 +441,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd)
 	}
 	if (drive->dn & 1) {
 		/* exit with drive0 selected */
-		SELECT_DRIVE(hwif->devices[0]);
+		tp_ops->dev_select(hwif->devices[0]);
 		msleep(50);
 		/* ensure drive irq is clear */
 		(void)tp_ops->read_status(hwif);
@@ -605,6 +605,7 @@ out:
 
 static int ide_port_wait_ready(ide_hwif_t *hwif)
 {
+	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
 	ide_drive_t *drive;
 	int i, rc;
 
@@ -627,8 +628,8 @@ static int ide_port_wait_ready(ide_hwif_t *hwif)
 		/* Ignore disks that we will not probe for later. */
 		if ((drive->dev_flags & IDE_DFLAG_NOPROBE) == 0 ||
 		    (drive->dev_flags & IDE_DFLAG_PRESENT)) {
-			SELECT_DRIVE(drive);
-			hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
+			tp_ops->dev_select(drive);
+			tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
 			mdelay(2);
 			rc = ide_wait_not_busy(hwif, 35000);
 			if (rc)
@@ -640,7 +641,7 @@ static int ide_port_wait_ready(ide_hwif_t *hwif)
 out:
 	/* Exit function with master reselected (let's be sane) */
 	if (i)
-		SELECT_DRIVE(hwif->devices[0]);
+		tp_ops->dev_select(hwif->devices[0]);
 
 	return rc;
 }
diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
index af1b421eb45..71a39fb3856 100644
--- a/drivers/ide/ns87415.c
+++ b/drivers/ide/ns87415.c
@@ -262,7 +262,7 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif)
 #ifdef __sparc_v9__
 		/*
 		 * XXX: Reset the device, if we don't it will not respond to
-		 *      SELECT_DRIVE() properly during first ide_probe_port().
+		 *      dev_select() properly during first ide_probe_port().
 		 */
 		timeout = 10000;
 		outb(12, hwif->io_ports.ctl_addr);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index c69181c61fd..a5d26f66ef7 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1176,7 +1176,6 @@ void ide_tf_read(ide_drive_t *, struct ide_cmd *);
 void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int);
 void ide_output_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int);
 
-extern void SELECT_DRIVE(ide_drive_t *);
 void SELECT_MASK(ide_drive_t *, int);
 
 u8 ide_read_error(ide_drive_t *);
-- 
cgit v1.2.3


From a9d5a97fa3828e7cbc577805eba3d0a0d35dd5a0 Mon Sep 17 00:00:00 2001
From: TOMARI Hisanobu <posco.grubb@gmail.com>
Date: Tue, 31 Mar 2009 20:15:34 +0200
Subject: ide-pmac: IDE cable detection on Apple PowerBook

As IDE cable used on Apple PowerBook/iBook laptops are always of "Short 40"
type when the firmware says it's 80 conductor one, the cable detection should
return ATA_CBL_PATA40_SHORT on those machines.  This enables to automatically
use UDMA5 even with drives that doesn't correctly detect those cables on Apple
laptops.

Signed-off-by: TOMARI Hisanobu <posco.grubb@gmail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: benh@kernel.crashing.org
[bart: beautify patch description]
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/pmac.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c
index 24ce1f805cd..052b9bf1f8f 100644
--- a/drivers/ide/pmac.c
+++ b/drivers/ide/pmac.c
@@ -919,10 +919,18 @@ static u8 pmac_ide_cable_detect(ide_hwif_t *hwif)
 		(pmac_ide_hwif_t *)dev_get_drvdata(hwif->gendev.parent);
 	struct device_node *np = pmif->node;
 	const char *cable = of_get_property(np, "cable-type", NULL);
+	struct device_node *root = of_find_node_by_path("/");
+	const char *model = of_get_property(root, "model", NULL);
 
 	/* Get cable type from device-tree. */
-	if (cable && !strncmp(cable, "80-", 3))
-		return ATA_CBL_PATA80;
+	if (cable && !strncmp(cable, "80-", 3)) {
+		/* Some drives fail to detect 80c cable in PowerBook */
+		/* These machine use proprietary short IDE cable anyway */
+		if (!strncmp(model, "PowerBook", 9))
+			return ATA_CBL_PATA40_SHORT;
+		else
+			return ATA_CBL_PATA80;
+	}
 
 	/*
 	 * G5's seem to have incorrect cable type in device-tree.
-- 
cgit v1.2.3


From d80c592c38378c88c568b96963f7a98d927d05fa Mon Sep 17 00:00:00 2001
From: Gilles Espinasse <g.esp@free.fr>
Date: Tue, 31 Mar 2009 20:15:34 +0200
Subject: ide: be able to build pmac driver without IDE built-in

No reason to need IDE built-in to be able to compile pmac driver.

Tested to work on 2.6.29-rc8 and 2.6.28.8 with ide and pmac as modules
inside an initramfs.

Signed-off-by: Gilles Espinasse <g.esp@free.fr>
Cc: sam@ravnborg.org
Cc: benh@kernel.crashing.org
[bart: remove now superfluous IDE check]
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index 4d2f2a6b5bd..cf06494bb74 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -681,7 +681,7 @@ endif
 # TODO: BLK_DEV_IDEDMA_PCI -> BLK_DEV_IDEDMA_SFF
 config BLK_DEV_IDE_PMAC
 	tristate "PowerMac on-board IDE support"
-	depends on PPC_PMAC && IDE=y
+	depends on PPC_PMAC
 	select IDE_TIMINGS
 	select BLK_DEV_IDEDMA_PCI
 	help
-- 
cgit v1.2.3


From 5b6c942dd1f13835eff8105ec2aa859544a1498d Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@gmail.com>
Date: Tue, 31 Mar 2009 20:15:34 +0200
Subject: ide-floppy: do not complete rq's prematurely

... and access them afterwards. Simplify rq completing code while at it.

Spotted-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 100e6f94b4f..3e43b889dd6 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -310,16 +310,14 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			pc->xferred = pc->req_xfer;
 			if (drive->pc_update_buffers)
 				drive->pc_update_buffers(drive, pc);
-
-			if (drive->media == ide_floppy)
-				ide_complete_rq(drive, 0, blk_rq_bytes(rq));
 		}
 		debug_log("%s: DMA finished\n", drive->name);
 	}
 
 	/* No more interrupts */
 	if ((stat & ATA_DRQ) == 0) {
-		int uptodate;
+		int uptodate, error;
+		unsigned int done;
 
 		debug_log("Packet command completed, %d bytes transferred\n",
 			  pc->xferred);
@@ -366,9 +364,9 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 
 		if (blk_special_request(rq)) {
 			rq->errors = 0;
-			ide_complete_rq(drive, 0, blk_rq_bytes(rq));
+			done = blk_rq_bytes(rq);
+			error = 0;
 		} else {
-			unsigned int done;
 
 			if (blk_fs_request(rq) == 0 && uptodate <= 0) {
 				if (rq->errors == 0)
@@ -380,9 +378,10 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			else
 				done = blk_rq_bytes(rq);
 
-			ide_complete_rq(drive, uptodate ? 0 : -EIO, done);
+			error = uptodate ? 0 : -EIO;
 		}
 
+		ide_complete_rq(drive, error, done);
 		return ide_stopped;
 	}
 
-- 
cgit v1.2.3


From 5a3f23d515a2ebf0c750db80579ca57b28cbce6d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 31 Mar 2009 13:27:11 -0400
Subject: Btrfs: add extra flushing for renames and truncates

Renames and truncates are both common ways to replace old data with new
data.  The filesystem can make an effort to make sure the new data is
on disk before actually replacing the old data.

This is especially important for rename, which many application use as
though it were atomic for both the data and the metadata involved.  The
current btrfs code will happily replace a file that is fully on disk
with one that was just created and still has pending IO.

If we crash after transaction commit but before the IO is done, we'll end
up replacing a good file with a zero length file.  The solution used
here is to create a list of inodes that need special ordering and force
them to disk before the commit is done.  This is similar to the
ext3 style data=ordering, except it is only done on selected files.

Btrfs is able to get away with this because it does not wait on commits
very often, even for fsync (which use a sub-commit).

For renames, we order the file when it wasn't already
on disk and when it is replacing an existing file.  Larger files
are sent to filemap_flush right away (before the transaction handle is
opened).

For truncates, we order if the file goes from non-zero size down to
zero size.  This is a little different, because at the time of the
truncate the file has no dirty bytes to order.  But, we flag the inode
so that it is added to the ordered list on close (via release method).  We
also immediately add it to the ordered list of the current transaction
so that we can try to flush down any writes the application sneaks in
before commit.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h  |  18 ++++++++
 fs/btrfs/ctree.h        |  35 ++++++++++++++
 fs/btrfs/disk-io.c      |   2 +
 fs/btrfs/file.c         |  26 +++++++++++
 fs/btrfs/inode.c        |  81 ++++++++++++++++++++++++++++++---
 fs/btrfs/ordered-data.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ordered-data.h |   4 ++
 fs/btrfs/transaction.c  |  11 +++++
 8 files changed, 288 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3af4cfb5654..b30986f00b9 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
 	 */
 	struct list_head delalloc_inodes;
 
+	/*
+	 * list for tracking inodes that must be sent to disk before a
+	 * rename or truncate commit
+	 */
+	struct list_head ordered_operations;
+
 	/* the space_info for where this inode's data allocations are done */
 	struct btrfs_space_info *space_info;
 
@@ -122,6 +128,18 @@ struct btrfs_inode {
 	 */
 	u64 last_unlink_trans;
 
+	/*
+	 * ordered_data_close is set by truncate when a file that used
+	 * to have good data has been truncated to zero.  When it is set
+	 * the btrfs file release call will add this inode to the
+	 * ordered operations list so that we make sure to flush out any
+	 * new data the application may have written before commit.
+	 *
+	 * yes, its silly to have a single bitflag, but we might grow more
+	 * of these.
+	 */
+	unsigned ordered_data_close:1;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2737facbd34..f48905ee524 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAX_LEVEL 8
 
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list.  That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
+
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
 
@@ -727,6 +734,15 @@ struct btrfs_fs_info {
 	struct mutex volume_mutex;
 	struct mutex tree_reloc_mutex;
 
+	/*
+	 * this protects the ordered operations list only while we are
+	 * processing all of the entries on it.  This way we make
+	 * sure the commit code doesn't find the list temporarily empty
+	 * because another function happens to be doing non-waiting preflush
+	 * before jumping into the main commit.
+	 */
+	struct mutex ordered_operations_mutex;
+
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -741,9 +757,28 @@ struct btrfs_fs_info {
 	 * ordered extents
 	 */
 	spinlock_t ordered_extent_lock;
+
+	/*
+	 * all of the data=ordered extents pending writeback
+	 * these can span multiple transactions and basically include
+	 * every dirty data page that isn't from nodatacow
+	 */
 	struct list_head ordered_extents;
+
+	/*
+	 * all of the inodes that have delalloc bytes.  It is possible for
+	 * this list to be empty even when there is still dirty data=ordered
+	 * extents waiting to finish IO.
+	 */
 	struct list_head delalloc_inodes;
 
+	/*
+	 * special rename and truncate targets that must be on disk before
+	 * we're allowed to commit.  This is basically the ext3 style
+	 * data=ordered list.
+	 */
+	struct list_head ordered_operations;
+
 	/*
 	 * there is a pool of worker threads for checksumming during writes
 	 * and a pool for checksumming after reads.  This is because readers
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9244cd7313d..1747dfd1865 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1572,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+	INIT_LIST_HEAD(&fs_info->ordered_operations);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 	spin_lock_init(&fs_info->ref_cache_lock);
@@ -1643,6 +1644,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	insert_inode_hash(fs_info->btree_inode);
 
 	mutex_init(&fs_info->trans_mutex);
+	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
 	mutex_init(&fs_info->pinned_mutex);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 32d10a61761..9c9fb46ccd0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1161,6 +1161,20 @@ out_nolock:
 		page_cache_release(pinned[1]);
 	*ppos = pos;
 
+	/*
+	 * we want to make sure fsync finds this change
+	 * but we haven't joined a transaction running right now.
+	 *
+	 * Later on, someone is sure to update the inode and get the
+	 * real transid recorded.
+	 *
+	 * We set last_trans now to the fs_info generation + 1,
+	 * this will either be one more than the running transaction
+	 * or the generation used for the next transaction if there isn't
+	 * one running right now.
+	 */
+	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+
 	if (num_written > 0 && will_write) {
 		struct btrfs_trans_handle *trans;
 
@@ -1194,6 +1208,18 @@ out_nolock:
 
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
+	/*
+	 * ordered_data_close is set by settattr when we are about to truncate
+	 * a file from a non-zero size to a zero size.  This tries to
+	 * flush down new bytes that may have been written if the
+	 * application were using truncate to replace a file in place.
+	 */
+	if (BTRFS_I(inode)->ordered_data_close) {
+		BTRFS_I(inode)->ordered_data_close = 0;
+		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+			filemap_flush(inode->i_mapping);
+	}
 	if (filp->private_data)
 		btrfs_ioctl_trans_end(filp);
 	return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bffd79faffb..1cff528d5b5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (err)
 		return err;
 
-	if (S_ISREG(inode->i_mode) &&
-	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
-		err = btrfs_cont_expand(inode, attr->ia_size);
-		if (err)
-			return err;
+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+		if (attr->ia_size > inode->i_size) {
+			err = btrfs_cont_expand(inode, attr->ia_size);
+			if (err)
+				return err;
+		} else if (inode->i_size > 0 &&
+			   attr->ia_size == 0) {
+
+			/* we're truncating a file that used to have good
+			 * data down to zero.  Make sure it gets into
+			 * the ordered flush list so that any new writes
+			 * get down to disk quickly.
+			 */
+			BTRFS_I(inode)->ordered_data_close = 1;
+		}
 	}
 
 	err = inode_setattr(inode, attr);
@@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+	INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -4419,6 +4430,8 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
+
+	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
@@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 
 	trans = btrfs_start_transaction(root, 1);
+
+	/*
+	 * setattr is responsible for setting the ordered_data_close flag,
+	 * but that is only tested during the last file release.  That
+	 * could happen well after the next commit, leaving a great big
+	 * window where new writes may get lost if someone chooses to write
+	 * to this file after truncating to zero
+	 *
+	 * The inode doesn't have any dirty data here, and so if we commit
+	 * this is a noop.  If someone immediately starts writing to the inode
+	 * it is very likely we'll catch some of their writes in this
+	 * transaction, and the commit will find this file on the ordered
+	 * data list with good things to send down.
+	 *
+	 * This is a best effort solution, there is still a window where
+	 * using truncate to replace the contents of the file will
+	 * end up with a zero length file after a crash.
+	 */
+	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+		btrfs_add_ordered_operation(trans, root, inode);
+
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_i_size_write(inode, inode->i_size);
 
@@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->i_acl = BTRFS_ACL_NOT_CACHED;
 	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
 	INIT_LIST_HEAD(&ei->i_orphan);
+	INIT_LIST_HEAD(&ei->ordered_operations);
 	return &ei->vfs_inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
 {
 	struct btrfs_ordered_extent *ordered;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
@@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode)
 	    BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
 		posix_acl_release(BTRFS_I(inode)->i_default_acl);
 
-	spin_lock(&BTRFS_I(inode)->root->list_lock);
+	/*
+	 * Make sure we're properly removed from the ordered operation
+	 * lists.
+	 */
+	smp_mb();
+	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+		spin_lock(&root->fs_info->ordered_extent_lock);
+		list_del_init(&BTRFS_I(inode)->ordered_operations);
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+	}
+
+	spin_lock(&root->list_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
 		       " list\n", inode->i_ino);
 		dump_stack();
 	}
-	spin_unlock(&BTRFS_I(inode)->root->list_lock);
+	spin_unlock(&root->list_lock);
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4667,8 +4715,27 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (ret)
 		goto out_unlock;
 
+	/*
+	 * we're using rename to replace one file with another.
+	 * and the replacement file is large.  Start IO on it now so
+	 * we don't add too much work to the end of the transaction
+	 */
+	if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+	    new_inode->i_size &&
+	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+		filemap_flush(old_inode->i_mapping);
+
 	trans = btrfs_start_transaction(root, 1);
 
+	/*
+	 * make sure the inode gets flushed if it is replacing
+	 * something.
+	 */
+	if (new_inode && new_inode->i_size &&
+	    old_inode && S_ISREG(old_inode->i_mode)) {
+		btrfs_add_ordered_operation(trans, root, old_inode);
+	}
+
 	/*
 	 * this is an ugly little race, but the rename is required to make
 	 * sure that if we crash, the inode is either at the old name
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0..53c87b197d7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 
 	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
+
+	/*
+	 * we have no more ordered extents for this inode and
+	 * no dirty pages.  We can safely remove it from the
+	 * list of ordered extents
+	 */
+	if (RB_EMPTY_ROOT(&tree->tree) &&
+	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+		list_del_init(&BTRFS_I(inode)->ordered_operations);
+	}
 	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 
 	mutex_unlock(&tree->mutex);
@@ -369,6 +379,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 	return 0;
 }
 
+/*
+ * this is used during transaction commit to write all the inodes
+ * added to the ordered operation list.  These files must be fully on
+ * disk before the transaction commits.
+ *
+ * we have two modes here, one is to just start the IO via filemap_flush
+ * and the other is to wait for all the io.  When we wait, we have an
+ * extra check to make sure the ordered operation list really is empty
+ * before we return
+ */
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct inode *inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	spin_lock(&root->fs_info->ordered_extent_lock);
+again:
+	list_splice_init(&root->fs_info->ordered_operations, &splice);
+
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+				   ordered_operations);
+
+		inode = &btrfs_inode->vfs_inode;
+
+		list_del_init(&btrfs_inode->ordered_operations);
+
+		/*
+		 * the inode may be getting freed (in sys_unlink path).
+		 */
+		inode = igrab(inode);
+
+		if (!wait && inode) {
+			list_add_tail(&BTRFS_I(inode)->ordered_operations,
+			      &root->fs_info->ordered_operations);
+		}
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+
+		if (inode) {
+			if (wait)
+				btrfs_wait_ordered_range(inode, 0, (u64)-1);
+			else
+				filemap_flush(inode->i_mapping);
+			iput(inode);
+		}
+
+		cond_resched();
+		spin_lock(&root->fs_info->ordered_extent_lock);
+	}
+	if (wait && !list_empty(&root->fs_info->ordered_operations))
+		goto again;
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+	return 0;
+}
+
 /*
  * Used to start IO or wait for a given ordered extent to finish.
  *
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 
 	return ret;
 }
+
+/*
+ * add a given inode to the list of inodes that must be fully on
+ * disk before a transaction commit finishes.
+ *
+ * This basically gives us the ext3 style data=ordered mode, and it is mostly
+ * used to make sure renamed files are fully on disk.
+ *
+ * It is a noop if the inode is already fully on disk.
+ *
+ * If trans is not null, we'll do a friendly check for a transaction that
+ * is already flushing things and force the IO down ourselves.
+ */
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *inode)
+{
+	u64 last_mod;
+
+	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
+
+	/*
+	 * if this file hasn't been changed since the last transaction
+	 * commit, we can safely return without doing anything
+	 */
+	if (last_mod < root->fs_info->last_trans_committed)
+		return 0;
+
+	/*
+	 * the transaction is already committing.  Just start the IO and
+	 * don't bother with all of this list nonsense
+	 */
+	if (trans && root->fs_info->running_transaction->blocked) {
+		btrfs_wait_ordered_range(inode, 0, (u64)-1);
+		return 0;
+	}
+
+	spin_lock(&root->fs_info->ordered_extent_lock);
+	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
+		list_add_tail(&BTRFS_I(inode)->ordered_operations,
+			      &root->fs_info->ordered_operations);
+	}
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+
+	return 0;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d..3d31c8827b0 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 			   loff_t end, int sync_mode);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *inode);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9c8f158dd2d..664782c6a2d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -975,6 +975,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	int should_grow = 0;
 	unsigned long now = get_seconds();
 
+	btrfs_run_ordered_operations(root, 0);
+
 	/* make a pass through all the delayed refs we have so far
 	 * any runnings procs may add more while we are here
 	 */
@@ -1056,6 +1058,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 
+		/*
+		 * rename don't use btrfs_join_transaction, so, once we
+		 * set the transaction to blocked above, we aren't going
+		 * to get any new ordered operations.  We can safely run
+		 * it here and no for sure that nothing new will be added
+		 * to the list
+		 */
+		btrfs_run_ordered_operations(root, 1);
+
 		smp_mb();
 		if (cur_trans->num_writers > 1 || should_grow)
 			schedule_timeout(timeout);
-- 
cgit v1.2.3


From d57e62b89796f751c9422801cbcd407a9f8dcdc4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 31 Mar 2009 13:47:50 -0400
Subject: Btrfs: try to free metadata pages when we free btree blocks

COW means we cycle though blocks fairly quickly, and once we
free an extent on disk, it doesn't make much sense to keep the pages around.

This commit tries to immediately free the page when we free the extent,
which lowers our memory footprint significantly.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0c482e0d7c4..f5e7cae63d8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2384,6 +2384,10 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
 			BUG_ON(ret);
+		} else {
+			invalidate_mapping_pages(info->btree_inode->i_mapping,
+			     bytenr >> PAGE_CACHE_SHIFT,
+			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
 		}
 
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
-- 
cgit v1.2.3


From ae149b6bec64a09373ba20fce75f8aa6b14b78fd Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:15 -0700
Subject: proc tty: add struct tty_operations::proc_fops

Used for gradual switch of TTY drivers from using ->read_proc which helps
with gradual switch from ->read_proc for the whole tree.

As side effect, fix possible race condition when ->data initialized after
PDE is hooked into proc tree.

->proc_fops takes precedence over ->read_proc.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_tty.c         | 20 +++++++++++++-------
 include/linux/tty_driver.h |  1 +
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 4a9e0f65ae6..854827b1d46 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -144,16 +144,22 @@ void proc_tty_register_driver(struct tty_driver *driver)
 {
 	struct proc_dir_entry *ent;
 		
-	if (!driver->ops->read_proc || !driver->driver_name ||
-	    driver->proc_entry)
+	if (!driver->driver_name || driver->proc_entry)
 		return;
 
-	ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver);
-	if (!ent)
+	if (driver->ops->proc_fops) {
+		ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
+				       driver->ops->proc_fops, driver);
+		if (!ent)
+			return;
+	} else if (driver->ops->read_proc) {
+		ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver);
+		if (!ent)
+			return;
+		ent->read_proc = driver->ops->read_proc;
+		ent->data = driver;
+	} else
 		return;
-	ent->read_proc = driver->ops->read_proc;
-	ent->data = driver;
-
 	driver->proc_entry = ent;
 }
 
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 08e088334db..c9a69575ded 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -264,6 +264,7 @@ struct tty_operations {
 	int (*poll_get_char)(struct tty_driver *driver, int line);
 	void (*poll_put_char)(struct tty_driver *driver, int line, char ch);
 #endif
+	const struct file_operations *proc_fops;
 };
 
 struct tty_driver {
-- 
cgit v1.2.3


From 444697d61b6d5ae43b317d259db7c362c9d3756a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:15 -0700
Subject: proc tty: switch cyclades to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/cyclades.c | 54 ++++++++++++++++++-------------------------------
 1 file changed, 20 insertions(+), 34 deletions(-)

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 6a59f72a9c2..272db0e2b49 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -657,6 +657,7 @@
 
 #include <linux/stat.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 
 static void cy_throttle(struct tty_struct *tty);
 static void cy_send_xchar(struct tty_struct *tty, char ch);
@@ -868,8 +869,6 @@ static int cyz_issue_cmd(struct cyclades_card *, __u32, __u8, __u32);
 static unsigned detect_isa_irq(void __iomem *);
 #endif				/* CONFIG_ISA */
 
-static int cyclades_get_proc_info(char *, char **, off_t, int, int *, void *);
-
 #ifndef CONFIG_CYZ_INTR
 static void cyz_poll(unsigned long);
 
@@ -5216,31 +5215,22 @@ static struct pci_driver cy_pci_driver = {
 };
 #endif
 
-static int
-cyclades_get_proc_info(char *buf, char **start, off_t offset, int length,
-		int *eof, void *data)
+static int cyclades_proc_show(struct seq_file *m, void *v)
 {
 	struct cyclades_port *info;
 	unsigned int i, j;
-	int len = 0;
-	off_t begin = 0;
-	off_t pos = 0;
-	int size;
 	__u32 cur_jifs = jiffies;
 
-	size = sprintf(buf, "Dev TimeOpen   BytesOut  IdleOut    BytesIn   "
+	seq_puts(m, "Dev TimeOpen   BytesOut  IdleOut    BytesIn   "
 			"IdleIn  Overruns  Ldisc\n");
 
-	pos += size;
-	len += size;
-
 	/* Output one line for each known port */
 	for (i = 0; i < NR_CARDS; i++)
 		for (j = 0; j < cy_card[i].nports; j++) {
 			info = &cy_card[i].ports[j];
 
 			if (info->port.count)
-				size = sprintf(buf + len, "%3d %8lu %10lu %8lu "
+				seq_printf(m, "%3d %8lu %10lu %8lu "
 					"%10lu %8lu %9lu %6ld\n", info->line,
 					(cur_jifs - info->idle_stats.in_use) /
 					HZ, info->idle_stats.xmit_bytes,
@@ -5251,30 +5241,26 @@ cyclades_get_proc_info(char *buf, char **start, off_t offset, int length,
 					/* FIXME: double check locking */
 					(long)info->port.tty->ldisc.ops->num);
 			else
-				size = sprintf(buf + len, "%3d %8lu %10lu %8lu "
+				seq_printf(m, "%3d %8lu %10lu %8lu "
 					"%10lu %8lu %9lu %6ld\n",
 					info->line, 0L, 0L, 0L, 0L, 0L, 0L, 0L);
-			len += size;
-			pos = begin + len;
-
-			if (pos < offset) {
-				len = 0;
-				begin = pos;
-			}
-			if (pos > offset + length)
-				goto done;
 		}
-	*eof = 1;
-done:
-	*start = buf + (offset - begin);	/* Start of wanted data */
-	len -= (offset - begin);	/* Start slop */
-	if (len > length)
-		len = length;	/* Ending slop */
-	if (len < 0)
-		len = 0;
-	return len;
+	return 0;
+}
+
+static int cyclades_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cyclades_proc_show, NULL);
 }
 
+static const struct file_operations cyclades_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cyclades_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /* The serial driver boot-time initialization code!
     Hardware I/O ports are mapped to character special devices on a
     first found, first allocated manner.  That is, this code searches
@@ -5311,9 +5297,9 @@ static const struct tty_operations cy_ops = {
 	.hangup = cy_hangup,
 	.break_ctl = cy_break,
 	.wait_until_sent = cy_wait_until_sent,
-	.read_proc = cyclades_get_proc_info,
 	.tiocmget = cy_tiocmget,
 	.tiocmset = cy_tiocmset,
+	.proc_fops = &cyclades_proc_fops,
 };
 
 static int __init cy_init(void)
-- 
cgit v1.2.3


From cdda7cd92b9c0b8b25c906a1f39c61954432357a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:16 -0700
Subject: proc tty: switch ip2 to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/ip2/ip2main.c | 74 +++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 40 deletions(-)

diff --git a/drivers/char/ip2/ip2main.c b/drivers/char/ip2/ip2main.c
index 70e0ebc30bd..afd9247cf08 100644
--- a/drivers/char/ip2/ip2main.c
+++ b/drivers/char/ip2/ip2main.c
@@ -139,7 +139,7 @@
 #include <linux/seq_file.h>
 
 static const struct file_operations ip2mem_proc_fops;
-static int ip2_read_proc(char *, char **, off_t, int, int *, void * );
+static const struct file_operations ip2_proc_fops;
 
 /********************/
 /* Type Definitions */
@@ -446,9 +446,9 @@ static const struct tty_operations ip2_ops = {
 	.stop            = ip2_stop,
 	.start           = ip2_start,
 	.hangup          = ip2_hangup,
-	.read_proc       = ip2_read_proc,
 	.tiocmget	 = ip2_tiocmget,
 	.tiocmset	 = ip2_tiocmset,
+	.proc_fops	 = &ip2_proc_fops,
 };
 
 /******************************************************************************/
@@ -3029,19 +3029,17 @@ static const struct file_operations ip2mem_proc_fops = {
  * different sources including ip2mkdev.c and a couple of other drivers.
  * The bugs are all mine.  :-)	=mhw=
  */
-static int ip2_read_proc(char *page, char **start, off_t off,
-				int count, int *eof, void *data)
+static int ip2_proc_show(struct seq_file *m, void *v)
 {
 	int	i, j, box;
-	int	len = 0;
 	int	boxes = 0;
 	int	ports = 0;
 	int	tports = 0;
-	off_t	begin = 0;
 	i2eBordStrPtr  pB;
+	char *sep;
 
-	len += sprintf(page, "ip2info: 1.0 driver: %s\n", pcVersion );
-	len += sprintf(page+len, "Driver: SMajor=%d CMajor=%d IMajor=%d MaxBoards=%d MaxBoxes=%d MaxPorts=%d\n",
+	seq_printf(m, "ip2info: 1.0 driver: %s\n", pcVersion);
+	seq_printf(m, "Driver: SMajor=%d CMajor=%d IMajor=%d MaxBoards=%d MaxBoxes=%d MaxPorts=%d\n",
 			IP2_TTY_MAJOR, IP2_CALLOUT_MAJOR, IP2_IPL_MAJOR,
 			IP2_MAX_BOARDS, ABS_MAX_BOXES, ABS_BIGGEST_BOX);
 
@@ -3053,7 +3051,8 @@ static int ip2_read_proc(char *page, char **start, off_t off,
 			switch( pB->i2ePom.e.porID & ~POR_ID_RESERVED ) 
 			{
 			case POR_ID_FIIEX:
-				len += sprintf( page+len, "Board %d: EX ports=", i );
+				seq_printf(m, "Board %d: EX ports=", i);
+				sep = "";
 				for( box = 0; box < ABS_MAX_BOXES; ++box )
 				{
 					ports = 0;
@@ -3065,79 +3064,74 @@ static int ip2_read_proc(char *page, char **start, off_t off,
 							++ports;
 						}
 					}
-					len += sprintf( page+len, "%d,", ports );
+					seq_printf(m, "%s%d", sep, ports);
+					sep = ",";
 					tports += ports;
 				}
-
-				--len;	/* Backup over that last comma */
-
-				len += sprintf( page+len, " boxes=%d width=%d", boxes, pB->i2eDataWidth16 ? 16 : 8 );
+				seq_printf(m, " boxes=%d width=%d", boxes, pB->i2eDataWidth16 ? 16 : 8);
 				break;
 
 			case POR_ID_II_4:
-				len += sprintf(page+len, "Board %d: ISA-4 ports=4 boxes=1", i );
+				seq_printf(m, "Board %d: ISA-4 ports=4 boxes=1", i);
 				tports = ports = 4;
 				break;
 
 			case POR_ID_II_8:
-				len += sprintf(page+len, "Board %d: ISA-8-std ports=8 boxes=1", i );
+				seq_printf(m, "Board %d: ISA-8-std ports=8 boxes=1", i);
 				tports = ports = 8;
 				break;
 
 			case POR_ID_II_8R:
-				len += sprintf(page+len, "Board %d: ISA-8-RJ11 ports=8 boxes=1", i );
+				seq_printf(m, "Board %d: ISA-8-RJ11 ports=8 boxes=1", i);
 				tports = ports = 8;
 				break;
 
 			default:
-				len += sprintf(page+len, "Board %d: unknown", i );
+				seq_printf(m, "Board %d: unknown", i);
 				/* Don't try and probe for minor numbers */
 				tports = ports = 0;
 			}
 
 		} else {
 			/* Don't try and probe for minor numbers */
-			len += sprintf(page+len, "Board %d: vacant", i );
+			seq_printf(m, "Board %d: vacant", i);
 			tports = ports = 0;
 		}
 
 		if( tports ) {
-			len += sprintf(page+len, " minors=" );
-
+			seq_puts(m, " minors=");
+			sep = "";
 			for ( box = 0; box < ABS_MAX_BOXES; ++box )
 			{
 				for ( j = 0; j < ABS_BIGGEST_BOX; ++j )
 				{
 					if ( pB->i2eChannelMap[box] & (1 << j) )
 					{
-						len += sprintf (page+len,"%d,",
+						seq_printf(m, "%s%d", sep,
 							j + ABS_BIGGEST_BOX *
 							(box+i*ABS_MAX_BOXES));
+						sep = ",";
 					}
 				}
 			}
-
-			page[ len - 1 ] = '\n';	/* Overwrite that last comma */
-		} else {
-			len += sprintf (page+len,"\n" );
-		}
-
-		if (len+begin > off+count)
-			break;
-		if (len+begin < off) {
-			begin += len;
-			len = 0;
 		}
+		seq_putc(m, '\n');
 	}
+	return 0;
+ }
 
-	if (i >= IP2_MAX_BOARDS)
-		*eof = 1;
-	if (off >= len+begin)
-		return 0;
+static int ip2_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ip2_proc_show, NULL);
+}
 
-	*start = page + (off-begin);
-	return ((count < begin+len-off) ? count : begin+len-off);
- }
+static const struct file_operations ip2_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ip2_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
  
 /******************************************************************************/
 /* Function:   ip2trace()                                                     */
-- 
cgit v1.2.3


From 5bd6de7dadb8054a558ae4ac29121d8e93493065 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:16 -0700
Subject: proc tty: switch istallion to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/istallion.c | 121 ++++++++++++++++++++---------------------------
 1 file changed, 52 insertions(+), 69 deletions(-)

diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index 5c3dc6b8411..fff19f7e29d 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -24,6 +24,7 @@
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/serial.h>
+#include <linux/seq_file.h>
 #include <linux/cdk.h>
 #include <linux/comstats.h>
 #include <linux/istallion.h>
@@ -613,7 +614,6 @@ static int	stli_breakctl(struct tty_struct *tty, int state);
 static void	stli_waituntilsent(struct tty_struct *tty, int timeout);
 static void	stli_sendxchar(struct tty_struct *tty, char ch);
 static void	stli_hangup(struct tty_struct *tty);
-static int	stli_portinfo(struct stlibrd *brdp, struct stliport *portp, int portnr, char *pos);
 
 static int	stli_brdinit(struct stlibrd *brdp);
 static int	stli_startbrd(struct stlibrd *brdp);
@@ -1893,20 +1893,10 @@ static void stli_sendxchar(struct tty_struct *tty, char ch)
 	stli_cmdwait(brdp, portp, A_PORTCTRL, &actrl, sizeof(asyctrl_t), 0);
 }
 
-/*****************************************************************************/
-
-#define	MAXLINE		80
-
-/*
- *	Format info for a specified port. The line is deliberately limited
- *	to 80 characters. (If it is too long it will be truncated, if too
- *	short then padded with spaces).
- */
-
-static int stli_portinfo(struct stlibrd *brdp, struct stliport *portp, int portnr, char *pos)
+static void stli_portinfo(struct seq_file *m, struct stlibrd *brdp, struct stliport *portp, int portnr)
 {
-	char *sp, *uart;
-	int rc, cnt;
+	char *uart;
+	int rc;
 
 	rc = stli_portcmdstats(NULL, portp);
 
@@ -1918,44 +1908,50 @@ static int stli_portinfo(struct stlibrd *brdp, struct stliport *portp, int portn
 		default:uart = "CD1400"; break;
 		}
 	}
-
-	sp = pos;
-	sp += sprintf(sp, "%d: uart:%s ", portnr, uart);
+	seq_printf(m, "%d: uart:%s ", portnr, uart);
 
 	if ((brdp->state & BST_STARTED) && (rc >= 0)) {
-		sp += sprintf(sp, "tx:%d rx:%d", (int) stli_comstats.txtotal,
+		char sep;
+
+		seq_printf(m, "tx:%d rx:%d", (int) stli_comstats.txtotal,
 			(int) stli_comstats.rxtotal);
 
 		if (stli_comstats.rxframing)
-			sp += sprintf(sp, " fe:%d",
+			seq_printf(m, " fe:%d",
 				(int) stli_comstats.rxframing);
 		if (stli_comstats.rxparity)
-			sp += sprintf(sp, " pe:%d",
+			seq_printf(m, " pe:%d",
 				(int) stli_comstats.rxparity);
 		if (stli_comstats.rxbreaks)
-			sp += sprintf(sp, " brk:%d",
+			seq_printf(m, " brk:%d",
 				(int) stli_comstats.rxbreaks);
 		if (stli_comstats.rxoverrun)
-			sp += sprintf(sp, " oe:%d",
+			seq_printf(m, " oe:%d",
 				(int) stli_comstats.rxoverrun);
 
-		cnt = sprintf(sp, "%s%s%s%s%s ",
-			(stli_comstats.signals & TIOCM_RTS) ? "|RTS" : "",
-			(stli_comstats.signals & TIOCM_CTS) ? "|CTS" : "",
-			(stli_comstats.signals & TIOCM_DTR) ? "|DTR" : "",
-			(stli_comstats.signals & TIOCM_CD) ? "|DCD" : "",
-			(stli_comstats.signals & TIOCM_DSR) ? "|DSR" : "");
-		*sp = ' ';
-		sp += cnt;
+		sep = ' ';
+		if (stli_comstats.signals & TIOCM_RTS) {
+			seq_printf(m, "%c%s", sep, "RTS");
+			sep = '|';
+		}
+		if (stli_comstats.signals & TIOCM_CTS) {
+			seq_printf(m, "%c%s", sep, "CTS");
+			sep = '|';
+		}
+		if (stli_comstats.signals & TIOCM_DTR) {
+			seq_printf(m, "%c%s", sep, "DTR");
+			sep = '|';
+		}
+		if (stli_comstats.signals & TIOCM_CD) {
+			seq_printf(m, "%c%s", sep, "DCD");
+			sep = '|';
+		}
+		if (stli_comstats.signals & TIOCM_DSR) {
+			seq_printf(m, "%c%s", sep, "DSR");
+			sep = '|';
+		}
 	}
-
-	for (cnt = (sp - pos); (cnt < (MAXLINE - 1)); cnt++)
-		*sp++ = ' ';
-	if (cnt >= MAXLINE)
-		pos[(MAXLINE - 2)] = '+';
-	pos[(MAXLINE - 1)] = '\n';
-
-	return(MAXLINE);
+	seq_putc(m, '\n');
 }
 
 /*****************************************************************************/
@@ -1964,26 +1960,15 @@ static int stli_portinfo(struct stlibrd *brdp, struct stliport *portp, int portn
  *	Port info, read from the /proc file system.
  */
 
-static int stli_readproc(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int stli_proc_show(struct seq_file *m, void *v)
 {
 	struct stlibrd *brdp;
 	struct stliport *portp;
 	unsigned int brdnr, portnr, totalport;
-	int curoff, maxoff;
-	char *pos;
 
-	pos = page;
 	totalport = 0;
-	curoff = 0;
-
-	if (off == 0) {
-		pos += sprintf(pos, "%s: version %s", stli_drvtitle,
-			stli_drvversion);
-		while (pos < (page + MAXLINE - 1))
-			*pos++ = ' ';
-		*pos++ = '\n';
-	}
-	curoff =  MAXLINE;
+
+	seq_printf(m, "%s: version %s\n", stli_drvtitle, stli_drvversion);
 
 /*
  *	We scan through for each board, panel and port. The offset is
@@ -1996,33 +1981,31 @@ static int stli_readproc(char *page, char **start, off_t off, int count, int *eo
 		if (brdp->state == 0)
 			continue;
 
-		maxoff = curoff + (brdp->nrports * MAXLINE);
-		if (off >= maxoff) {
-			curoff = maxoff;
-			continue;
-		}
-
 		totalport = brdnr * STL_MAXPORTS;
 		for (portnr = 0; (portnr < brdp->nrports); portnr++,
 		    totalport++) {
 			portp = brdp->ports[portnr];
 			if (portp == NULL)
 				continue;
-			if (off >= (curoff += MAXLINE))
-				continue;
-			if ((pos - page + MAXLINE) > count)
-				goto stli_readdone;
-			pos += stli_portinfo(brdp, portp, totalport, pos);
+			stli_portinfo(m, brdp, portp, totalport);
 		}
 	}
+	return 0;
+}
 
-	*eof = 1;
-
-stli_readdone:
-	*start = page;
-	return(pos - page);
+static int stli_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, stli_proc_show, NULL);
 }
 
+static const struct file_operations stli_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= stli_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /*****************************************************************************/
 
 /*
@@ -4427,9 +4410,9 @@ static const struct tty_operations stli_ops = {
 	.break_ctl = stli_breakctl,
 	.wait_until_sent = stli_waituntilsent,
 	.send_xchar = stli_sendxchar,
-	.read_proc = stli_readproc,
 	.tiocmget = stli_tiocmget,
 	.tiocmset = stli_tiocmset,
+	.proc_fops = &stli_proc_fops,
 };
 
 static const struct tty_port_operations stli_port_ops = {
-- 
cgit v1.2.3


From 87687144b4fce2ad083e689eec8b219054c292ae Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:17 -0700
Subject: proc tty: switch synclink_cs to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/pcmcia/synclink_cs.c | 73 ++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 39 deletions(-)

diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 5608a1e5a3b..19d79fc5446 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -51,6 +51,7 @@
 #include <linux/ptrace.h>
 #include <linux/ioport.h>
 #include <linux/mm.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/netdevice.h>
 #include <linux/vmalloc.h>
@@ -2619,13 +2620,12 @@ cleanup:
  * /proc fs routines....
  */
 
-static inline int line_info(char *buf, MGSLPC_INFO *info)
+static inline void line_info(struct seq_file *m, MGSLPC_INFO *info)
 {
 	char	stat_buf[30];
-	int	ret;
 	unsigned long flags;
 
-	ret = sprintf(buf, "%s:io:%04X irq:%d",
+	seq_printf(m, "%s:io:%04X irq:%d",
 		      info->device_name, info->io_base, info->irq_level);
 
 	/* output current serial signal states */
@@ -2649,75 +2649,70 @@ static inline int line_info(char *buf, MGSLPC_INFO *info)
 		strcat(stat_buf, "|RI");
 
 	if (info->params.mode == MGSL_MODE_HDLC) {
-		ret += sprintf(buf+ret, " HDLC txok:%d rxok:%d",
+		seq_printf(m, " HDLC txok:%d rxok:%d",
 			      info->icount.txok, info->icount.rxok);
 		if (info->icount.txunder)
-			ret += sprintf(buf+ret, " txunder:%d", info->icount.txunder);
+			seq_printf(m, " txunder:%d", info->icount.txunder);
 		if (info->icount.txabort)
-			ret += sprintf(buf+ret, " txabort:%d", info->icount.txabort);
+			seq_printf(m, " txabort:%d", info->icount.txabort);
 		if (info->icount.rxshort)
-			ret += sprintf(buf+ret, " rxshort:%d", info->icount.rxshort);
+			seq_printf(m, " rxshort:%d", info->icount.rxshort);
 		if (info->icount.rxlong)
-			ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxlong);
+			seq_printf(m, " rxlong:%d", info->icount.rxlong);
 		if (info->icount.rxover)
-			ret += sprintf(buf+ret, " rxover:%d", info->icount.rxover);
+			seq_printf(m, " rxover:%d", info->icount.rxover);
 		if (info->icount.rxcrc)
-			ret += sprintf(buf+ret, " rxcrc:%d", info->icount.rxcrc);
+			seq_printf(m, " rxcrc:%d", info->icount.rxcrc);
 	} else {
-		ret += sprintf(buf+ret, " ASYNC tx:%d rx:%d",
+		seq_printf(m, " ASYNC tx:%d rx:%d",
 			      info->icount.tx, info->icount.rx);
 		if (info->icount.frame)
-			ret += sprintf(buf+ret, " fe:%d", info->icount.frame);
+			seq_printf(m, " fe:%d", info->icount.frame);
 		if (info->icount.parity)
-			ret += sprintf(buf+ret, " pe:%d", info->icount.parity);
+			seq_printf(m, " pe:%d", info->icount.parity);
 		if (info->icount.brk)
-			ret += sprintf(buf+ret, " brk:%d", info->icount.brk);
+			seq_printf(m, " brk:%d", info->icount.brk);
 		if (info->icount.overrun)
-			ret += sprintf(buf+ret, " oe:%d", info->icount.overrun);
+			seq_printf(m, " oe:%d", info->icount.overrun);
 	}
 
 	/* Append serial signal status to end */
-	ret += sprintf(buf+ret, " %s\n", stat_buf+1);
+	seq_printf(m, " %s\n", stat_buf+1);
 
-	ret += sprintf(buf+ret, "txactive=%d bh_req=%d bh_run=%d pending_bh=%x\n",
+	seq_printf(m, "txactive=%d bh_req=%d bh_run=%d pending_bh=%x\n",
 		       info->tx_active,info->bh_requested,info->bh_running,
 		       info->pending_bh);
-
-	return ret;
 }
 
 /* Called to print information about devices
  */
-static int mgslpc_read_proc(char *page, char **start, off_t off, int count,
-		 int *eof, void *data)
+static int mgslpc_proc_show(struct seq_file *m, void *v)
 {
-	int len = 0, l;
-	off_t	begin = 0;
 	MGSLPC_INFO *info;
 
-	len += sprintf(page, "synclink driver:%s\n", driver_version);
+	seq_printf(m, "synclink driver:%s\n", driver_version);
 
 	info = mgslpc_device_list;
 	while( info ) {
-		l = line_info(page + len, info);
-		len += l;
-		if (len+begin > off+count)
-			goto done;
-		if (len+begin < off) {
-			begin += len;
-			len = 0;
-		}
+		line_info(m, info);
 		info = info->next_device;
 	}
+	return 0;
+}
 
-	*eof = 1;
-done:
-	if (off >= len+begin)
-		return 0;
-	*start = page + (off-begin);
-	return ((count < begin+len-off) ? count : begin+len-off);
+static int mgslpc_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mgslpc_proc_show, NULL);
 }
 
+static const struct file_operations mgslpc_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= mgslpc_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static int rx_alloc_buffers(MGSLPC_INFO *info)
 {
 	/* each buffer has header and data */
@@ -2861,13 +2856,13 @@ static const struct tty_operations mgslpc_ops = {
 	.send_xchar = mgslpc_send_xchar,
 	.break_ctl = mgslpc_break,
 	.wait_until_sent = mgslpc_wait_until_sent,
-	.read_proc = mgslpc_read_proc,
 	.set_termios = mgslpc_set_termios,
 	.stop = tx_pause,
 	.start = tx_release,
 	.hangup = mgslpc_hangup,
 	.tiocmget = tiocmget,
 	.tiocmset = tiocmset,
+	.proc_fops = &mgslpc_proc_fops,
 };
 
 static void synclink_cs_cleanup(void)
-- 
cgit v1.2.3


From 8561c44c9e8baf02a9e3018f76c53aa99038a499 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:18 -0700
Subject: proc tty: switch stallion to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/stallion.c | 126 +++++++++++++++++++-----------------------------
 1 file changed, 50 insertions(+), 76 deletions(-)

diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c
index e1e0dd89ac9..2ad813a801d 100644
--- a/drivers/char/stallion.c
+++ b/drivers/char/stallion.c
@@ -32,6 +32,7 @@
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/serial.h>
+#include <linux/seq_file.h>
 #include <linux/cd1400.h>
 #include <linux/sc26198.h>
 #include <linux/comstats.h>
@@ -1379,52 +1380,47 @@ static void stl_sendxchar(struct tty_struct *tty, char ch)
 		stl_putchar(tty, ch);
 }
 
-/*****************************************************************************/
-
-#define	MAXLINE		80
-
-/*
- *	Format info for a specified port. The line is deliberately limited
- *	to 80 characters. (If it is too long it will be truncated, if too
- *	short then padded with spaces).
- */
-
-static int stl_portinfo(struct stlport *portp, int portnr, char *pos)
+static void stl_portinfo(struct seq_file *m, struct stlport *portp, int portnr)
 {
-	char	*sp;
-	int	sigs, cnt;
+	int	sigs;
+	char sep;
 
-	sp = pos;
-	sp += sprintf(sp, "%d: uart:%s tx:%d rx:%d",
+	seq_printf(m, "%d: uart:%s tx:%d rx:%d",
 		portnr, (portp->hwid == 1) ? "SC26198" : "CD1400",
 		(int) portp->stats.txtotal, (int) portp->stats.rxtotal);
 
 	if (portp->stats.rxframing)
-		sp += sprintf(sp, " fe:%d", (int) portp->stats.rxframing);
+		seq_printf(m, " fe:%d", (int) portp->stats.rxframing);
 	if (portp->stats.rxparity)
-		sp += sprintf(sp, " pe:%d", (int) portp->stats.rxparity);
+		seq_printf(m, " pe:%d", (int) portp->stats.rxparity);
 	if (portp->stats.rxbreaks)
-		sp += sprintf(sp, " brk:%d", (int) portp->stats.rxbreaks);
+		seq_printf(m, " brk:%d", (int) portp->stats.rxbreaks);
 	if (portp->stats.rxoverrun)
-		sp += sprintf(sp, " oe:%d", (int) portp->stats.rxoverrun);
+		seq_printf(m, " oe:%d", (int) portp->stats.rxoverrun);
 
 	sigs = stl_getsignals(portp);
-	cnt = sprintf(sp, "%s%s%s%s%s ",
-		(sigs & TIOCM_RTS) ? "|RTS" : "",
-		(sigs & TIOCM_CTS) ? "|CTS" : "",
-		(sigs & TIOCM_DTR) ? "|DTR" : "",
-		(sigs & TIOCM_CD) ? "|DCD" : "",
-		(sigs & TIOCM_DSR) ? "|DSR" : "");
-	*sp = ' ';
-	sp += cnt;
-
-	for (cnt = sp - pos; cnt < (MAXLINE - 1); cnt++)
-		*sp++ = ' ';
-	if (cnt >= MAXLINE)
-		pos[(MAXLINE - 2)] = '+';
-	pos[(MAXLINE - 1)] = '\n';
-
-	return MAXLINE;
+	sep = ' ';
+	if (sigs & TIOCM_RTS) {
+		seq_printf(m, "%c%s", sep, "RTS");
+		sep = '|';
+	}
+	if (sigs & TIOCM_CTS) {
+		seq_printf(m, "%c%s", sep, "CTS");
+		sep = '|';
+	}
+	if (sigs & TIOCM_DTR) {
+		seq_printf(m, "%c%s", sep, "DTR");
+		sep = '|';
+	}
+	if (sigs & TIOCM_CD) {
+		seq_printf(m, "%c%s", sep, "DCD");
+		sep = '|';
+	}
+	if (sigs & TIOCM_DSR) {
+		seq_printf(m, "%c%s", sep, "DSR");
+		sep = '|';
+	}
+	seq_putc(m, '\n');
 }
 
 /*****************************************************************************/
@@ -1433,30 +1429,17 @@ static int stl_portinfo(struct stlport *portp, int portnr, char *pos)
  *	Port info, read from the /proc file system.
  */
 
-static int stl_readproc(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int stl_proc_show(struct seq_file *m, void *v)
 {
 	struct stlbrd	*brdp;
 	struct stlpanel	*panelp;
 	struct stlport	*portp;
 	unsigned int	brdnr, panelnr, portnr;
-	int		totalport, curoff, maxoff;
-	char		*pos;
+	int		totalport;
 
-	pr_debug("stl_readproc(page=%p,start=%p,off=%lx,count=%d,eof=%p,"
-		"data=%p\n", page, start, off, count, eof, data);
-
-	pos = page;
 	totalport = 0;
-	curoff = 0;
-
-	if (off == 0) {
-		pos += sprintf(pos, "%s: version %s", stl_drvtitle,
-			stl_drvversion);
-		while (pos < (page + MAXLINE - 1))
-			*pos++ = ' ';
-		*pos++ = '\n';
-	}
-	curoff =  MAXLINE;
+
+	seq_printf(m, "%s: version %s\n", stl_drvtitle, stl_drvversion);
 
 /*
  *	We scan through for each board, panel and port. The offset is
@@ -1469,46 +1452,37 @@ static int stl_readproc(char *page, char **start, off_t off, int count, int *eof
 		if (brdp->state == 0)
 			continue;
 
-		maxoff = curoff + (brdp->nrports * MAXLINE);
-		if (off >= maxoff) {
-			curoff = maxoff;
-			continue;
-		}
-
 		totalport = brdnr * STL_MAXPORTS;
 		for (panelnr = 0; panelnr < brdp->nrpanels; panelnr++) {
 			panelp = brdp->panels[panelnr];
 			if (panelp == NULL)
 				continue;
 
-			maxoff = curoff + (panelp->nrports * MAXLINE);
-			if (off >= maxoff) {
-				curoff = maxoff;
-				totalport += panelp->nrports;
-				continue;
-			}
-
 			for (portnr = 0; portnr < panelp->nrports; portnr++,
 			    totalport++) {
 				portp = panelp->ports[portnr];
 				if (portp == NULL)
 					continue;
-				if (off >= (curoff += MAXLINE))
-					continue;
-				if ((pos - page + MAXLINE) > count)
-					goto stl_readdone;
-				pos += stl_portinfo(portp, totalport, pos);
+				stl_portinfo(m, portp, totalport);
 			}
 		}
 	}
+	return 0;
+}
 
-	*eof = 1;
-
-stl_readdone:
-	*start = page;
-	return pos - page;
+static int stl_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, stl_proc_show, NULL);
 }
 
+static const struct file_operations stl_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= stl_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /*****************************************************************************/
 
 /*
@@ -2566,9 +2540,9 @@ static const struct tty_operations stl_ops = {
 	.break_ctl = stl_breakctl,
 	.wait_until_sent = stl_waituntilsent,
 	.send_xchar = stl_sendxchar,
-	.read_proc = stl_readproc,
 	.tiocmget = stl_tiocmget,
 	.tiocmset = stl_tiocmset,
+	.proc_fops = &stl_proc_fops,
 };
 
 static const struct tty_port_operations stl_port_ops = {
-- 
cgit v1.2.3


From d337829bd841974045846ec8b428f4199453159e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:18 -0700
Subject: proc tty: switch synclink to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/synclink.c | 98 ++++++++++++++++++++-----------------------------
 1 file changed, 39 insertions(+), 59 deletions(-)

diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c
index 0057a8f58cb..afd0b26ca05 100644
--- a/drivers/char/synclink.c
+++ b/drivers/char/synclink.c
@@ -79,6 +79,7 @@
 #include <linux/ptrace.h>
 #include <linux/ioport.h>
 #include <linux/mm.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/netdevice.h>
@@ -3459,18 +3460,17 @@ cleanup:
  * /proc fs routines....
  */
 
-static inline int line_info(char *buf, struct mgsl_struct *info)
+static inline void line_info(struct seq_file *m, struct mgsl_struct *info)
 {
 	char	stat_buf[30];
-	int	ret;
 	unsigned long flags;
 
 	if (info->bus_type == MGSL_BUS_TYPE_PCI) {
-		ret = sprintf(buf, "%s:PCI io:%04X irq:%d mem:%08X lcr:%08X",
+		seq_printf(m, "%s:PCI io:%04X irq:%d mem:%08X lcr:%08X",
 			info->device_name, info->io_base, info->irq_level,
 			info->phys_memory_base, info->phys_lcr_base);
 	} else {
-		ret = sprintf(buf, "%s:(E)ISA io:%04X irq:%d dma:%d",
+		seq_printf(m, "%s:(E)ISA io:%04X irq:%d dma:%d",
 			info->device_name, info->io_base, 
 			info->irq_level, info->dma_level);
 	}
@@ -3497,37 +3497,37 @@ static inline int line_info(char *buf, struct mgsl_struct *info)
 
 	if (info->params.mode == MGSL_MODE_HDLC ||
 	    info->params.mode == MGSL_MODE_RAW ) {
-		ret += sprintf(buf+ret, " HDLC txok:%d rxok:%d",
+		seq_printf(m, " HDLC txok:%d rxok:%d",
 			      info->icount.txok, info->icount.rxok);
 		if (info->icount.txunder)
-			ret += sprintf(buf+ret, " txunder:%d", info->icount.txunder);
+			seq_printf(m, " txunder:%d", info->icount.txunder);
 		if (info->icount.txabort)
-			ret += sprintf(buf+ret, " txabort:%d", info->icount.txabort);
+			seq_printf(m, " txabort:%d", info->icount.txabort);
 		if (info->icount.rxshort)
-			ret += sprintf(buf+ret, " rxshort:%d", info->icount.rxshort);	
+			seq_printf(m, " rxshort:%d", info->icount.rxshort);
 		if (info->icount.rxlong)
-			ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxlong);
+			seq_printf(m, " rxlong:%d", info->icount.rxlong);
 		if (info->icount.rxover)
-			ret += sprintf(buf+ret, " rxover:%d", info->icount.rxover);
+			seq_printf(m, " rxover:%d", info->icount.rxover);
 		if (info->icount.rxcrc)
-			ret += sprintf(buf+ret, " rxcrc:%d", info->icount.rxcrc);
+			seq_printf(m, " rxcrc:%d", info->icount.rxcrc);
 	} else {
-		ret += sprintf(buf+ret, " ASYNC tx:%d rx:%d",
+		seq_printf(m, " ASYNC tx:%d rx:%d",
 			      info->icount.tx, info->icount.rx);
 		if (info->icount.frame)
-			ret += sprintf(buf+ret, " fe:%d", info->icount.frame);
+			seq_printf(m, " fe:%d", info->icount.frame);
 		if (info->icount.parity)
-			ret += sprintf(buf+ret, " pe:%d", info->icount.parity);
+			seq_printf(m, " pe:%d", info->icount.parity);
 		if (info->icount.brk)
-			ret += sprintf(buf+ret, " brk:%d", info->icount.brk);	
+			seq_printf(m, " brk:%d", info->icount.brk);
 		if (info->icount.overrun)
-			ret += sprintf(buf+ret, " oe:%d", info->icount.overrun);
+			seq_printf(m, " oe:%d", info->icount.overrun);
 	}
 	
 	/* Append serial signal status to end */
-	ret += sprintf(buf+ret, " %s\n", stat_buf+1);
+	seq_printf(m, " %s\n", stat_buf+1);
 	
-	ret += sprintf(buf+ret, "txactive=%d bh_req=%d bh_run=%d pending_bh=%x\n",
+	seq_printf(m, "txactive=%d bh_req=%d bh_run=%d pending_bh=%x\n",
 	 info->tx_active,info->bh_requested,info->bh_running,
 	 info->pending_bh);
 	 
@@ -3544,60 +3544,40 @@ static inline int line_info(char *buf, struct mgsl_struct *info)
 	u16 Tmr = usc_InReg( info, TMR );
 	u16 Tccr = usc_InReg( info, TCCR );
 	u16 Ccar = inw( info->io_base + CCAR );
-	ret += sprintf(buf+ret, "tcsr=%04X tdmr=%04X ticr=%04X rcsr=%04X rdmr=%04X\n"
+	seq_printf(m, "tcsr=%04X tdmr=%04X ticr=%04X rcsr=%04X rdmr=%04X\n"
                         "ricr=%04X icr =%04X dccr=%04X tmr=%04X tccr=%04X ccar=%04X\n",
 	 		Tcsr,Tdmr,Ticr,Rscr,Rdmr,Ricr,Icr,Dccr,Tmr,Tccr,Ccar );
 	}
 	spin_unlock_irqrestore(&info->irq_spinlock,flags);
-	
-	return ret;
-	
-}	/* end of line_info() */
+}
 
-/* mgsl_read_proc()
- * 
- * Called to print information about devices
- * 
- * Arguments:
- * 	page	page of memory to hold returned info
- * 	start	
- * 	off
- * 	count
- * 	eof
- * 	data
- * 	
- * Return Value:
- */
-static int mgsl_read_proc(char *page, char **start, off_t off, int count,
-		 int *eof, void *data)
+/* Called to print information about devices */
+static int mgsl_proc_show(struct seq_file *m, void *v)
 {
-	int len = 0, l;
-	off_t	begin = 0;
 	struct mgsl_struct *info;
 	
-	len += sprintf(page, "synclink driver:%s\n", driver_version);
+	seq_printf(m, "synclink driver:%s\n", driver_version);
 	
 	info = mgsl_device_list;
 	while( info ) {
-		l = line_info(page + len, info);
-		len += l;
-		if (len+begin > off+count)
-			goto done;
-		if (len+begin < off) {
-			begin += len;
-			len = 0;
-		}
+		line_info(m, info);
 		info = info->next_device;
 	}
+	return 0;
+}
 
-	*eof = 1;
-done:
-	if (off >= len+begin)
-		return 0;
-	*start = page + (off-begin);
-	return ((count < begin+len-off) ? count : begin+len-off);
-	
-}	/* end of mgsl_read_proc() */
+static int mgsl_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mgsl_proc_show, NULL);
+}
+
+static const struct file_operations mgsl_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= mgsl_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 
 /* mgsl_allocate_dma_buffers()
  * 
@@ -4335,13 +4315,13 @@ static const struct tty_operations mgsl_ops = {
 	.send_xchar = mgsl_send_xchar,
 	.break_ctl = mgsl_break,
 	.wait_until_sent = mgsl_wait_until_sent,
- 	.read_proc = mgsl_read_proc,
 	.set_termios = mgsl_set_termios,
 	.stop = mgsl_stop,
 	.start = mgsl_start,
 	.hangup = mgsl_hangup,
 	.tiocmget = tiocmget,
 	.tiocmset = tiocmset,
+	.proc_fops = &mgsl_proc_fops,
 };
 
 /*
-- 
cgit v1.2.3


From a18c56e5af41a6391a6bee2c26e806e7997f6698 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:19 -0700
Subject: proc tty: switch synclink_gt to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/synclink_gt.c | 74 +++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 40 deletions(-)

diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index efb3dc928a4..6ec6e13d47d 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -60,6 +60,7 @@
 #include <linux/ptrace.h>
 #include <linux/ioport.h>
 #include <linux/mm.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/netdevice.h>
 #include <linux/vmalloc.h>
@@ -154,7 +155,6 @@ static void tx_hold(struct tty_struct *tty);
 static void tx_release(struct tty_struct *tty);
 
 static int  ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg);
-static int  read_proc(char *page, char **start, off_t off, int count,int *eof, void *data);
 static int  chars_in_buffer(struct tty_struct *tty);
 static void throttle(struct tty_struct * tty);
 static void unthrottle(struct tty_struct * tty);
@@ -1229,13 +1229,12 @@ static long slgt_compat_ioctl(struct tty_struct *tty, struct file *file,
 /*
  * proc fs support
  */
-static inline int line_info(char *buf, struct slgt_info *info)
+static inline void line_info(struct seq_file *m, struct slgt_info *info)
 {
 	char stat_buf[30];
-	int ret;
 	unsigned long flags;
 
-	ret = sprintf(buf, "%s: IO=%08X IRQ=%d MaxFrameSize=%u\n",
+	seq_printf(m, "%s: IO=%08X IRQ=%d MaxFrameSize=%u\n",
 		      info->device_name, info->phys_reg_addr,
 		      info->irq_level, info->max_frame_size);
 
@@ -1260,75 +1259,70 @@ static inline int line_info(char *buf, struct slgt_info *info)
 		strcat(stat_buf, "|RI");
 
 	if (info->params.mode != MGSL_MODE_ASYNC) {
-		ret += sprintf(buf+ret, "\tHDLC txok:%d rxok:%d",
+		seq_printf(m, "\tHDLC txok:%d rxok:%d",
 			       info->icount.txok, info->icount.rxok);
 		if (info->icount.txunder)
-			ret += sprintf(buf+ret, " txunder:%d", info->icount.txunder);
+			seq_printf(m, " txunder:%d", info->icount.txunder);
 		if (info->icount.txabort)
-			ret += sprintf(buf+ret, " txabort:%d", info->icount.txabort);
+			seq_printf(m, " txabort:%d", info->icount.txabort);
 		if (info->icount.rxshort)
-			ret += sprintf(buf+ret, " rxshort:%d", info->icount.rxshort);
+			seq_printf(m, " rxshort:%d", info->icount.rxshort);
 		if (info->icount.rxlong)
-			ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxlong);
+			seq_printf(m, " rxlong:%d", info->icount.rxlong);
 		if (info->icount.rxover)
-			ret += sprintf(buf+ret, " rxover:%d", info->icount.rxover);
+			seq_printf(m, " rxover:%d", info->icount.rxover);
 		if (info->icount.rxcrc)
-			ret += sprintf(buf+ret, " rxcrc:%d", info->icount.rxcrc);
+			seq_printf(m, " rxcrc:%d", info->icount.rxcrc);
 	} else {
-		ret += sprintf(buf+ret, "\tASYNC tx:%d rx:%d",
+		seq_printf(m, "\tASYNC tx:%d rx:%d",
 			       info->icount.tx, info->icount.rx);
 		if (info->icount.frame)
-			ret += sprintf(buf+ret, " fe:%d", info->icount.frame);
+			seq_printf(m, " fe:%d", info->icount.frame);
 		if (info->icount.parity)
-			ret += sprintf(buf+ret, " pe:%d", info->icount.parity);
+			seq_printf(m, " pe:%d", info->icount.parity);
 		if (info->icount.brk)
-			ret += sprintf(buf+ret, " brk:%d", info->icount.brk);
+			seq_printf(m, " brk:%d", info->icount.brk);
 		if (info->icount.overrun)
-			ret += sprintf(buf+ret, " oe:%d", info->icount.overrun);
+			seq_printf(m, " oe:%d", info->icount.overrun);
 	}
 
 	/* Append serial signal status to end */
-	ret += sprintf(buf+ret, " %s\n", stat_buf+1);
+	seq_printf(m, " %s\n", stat_buf+1);
 
-	ret += sprintf(buf+ret, "\ttxactive=%d bh_req=%d bh_run=%d pending_bh=%x\n",
+	seq_printf(m, "\ttxactive=%d bh_req=%d bh_run=%d pending_bh=%x\n",
 		       info->tx_active,info->bh_requested,info->bh_running,
 		       info->pending_bh);
-
-	return ret;
 }
 
 /* Called to print information about devices
  */
-static int read_proc(char *page, char **start, off_t off, int count,
-		     int *eof, void *data)
+static int synclink_gt_proc_show(struct seq_file *m, void *v)
 {
-	int len = 0, l;
-	off_t	begin = 0;
 	struct slgt_info *info;
 
-	len += sprintf(page, "synclink_gt driver\n");
+	seq_puts(m, "synclink_gt driver\n");
 
 	info = slgt_device_list;
 	while( info ) {
-		l = line_info(page + len, info);
-		len += l;
-		if (len+begin > off+count)
-			goto done;
-		if (len+begin < off) {
-			begin += len;
-			len = 0;
-		}
+		line_info(m, info);
 		info = info->next_device;
 	}
+	return 0;
+}
 
-	*eof = 1;
-done:
-	if (off >= len+begin)
-		return 0;
-	*start = page + (off-begin);
-	return ((count < begin+len-off) ? count : begin+len-off);
+static int synclink_gt_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, synclink_gt_proc_show, NULL);
 }
 
+static const struct file_operations synclink_gt_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= synclink_gt_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /*
  * return count of bytes in transmit buffer
  */
@@ -3562,13 +3556,13 @@ static const struct tty_operations ops = {
 	.send_xchar = send_xchar,
 	.break_ctl = set_break,
 	.wait_until_sent = wait_until_sent,
- 	.read_proc = read_proc,
 	.set_termios = set_termios,
 	.stop = tx_hold,
 	.start = tx_release,
 	.hangup = hangup,
 	.tiocmget = tiocmget,
 	.tiocmset = tiocmset,
+	.proc_fops = &synclink_gt_proc_fops,
 };
 
 static void slgt_cleanup(void)
-- 
cgit v1.2.3


From e6c8dd8a5c887caaf6ee29f04c7260617cb28295 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:20 -0700
Subject: proc tty: switch synclinkmp to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/synclinkmp.c | 74 ++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 40 deletions(-)

diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c
index 8eb6c89a980..26de60efe4b 100644
--- a/drivers/char/synclinkmp.c
+++ b/drivers/char/synclinkmp.c
@@ -50,6 +50,7 @@
 #include <linux/ptrace.h>
 #include <linux/ioport.h>
 #include <linux/mm.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/netdevice.h>
 #include <linux/vmalloc.h>
@@ -520,7 +521,6 @@ static void tx_hold(struct tty_struct *tty);
 static void tx_release(struct tty_struct *tty);
 
 static int  ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg);
-static int  read_proc(char *page, char **start, off_t off, int count,int *eof, void *data);
 static int  chars_in_buffer(struct tty_struct *tty);
 static void throttle(struct tty_struct * tty);
 static void unthrottle(struct tty_struct * tty);
@@ -1354,13 +1354,12 @@ static int ioctl(struct tty_struct *tty, struct file *file,
  * /proc fs routines....
  */
 
-static inline int line_info(char *buf, SLMP_INFO *info)
+static inline void line_info(struct seq_file *m, SLMP_INFO *info)
 {
 	char	stat_buf[30];
-	int	ret;
 	unsigned long flags;
 
-	ret = sprintf(buf, "%s: SCABase=%08x Mem=%08X StatusControl=%08x LCR=%08X\n"
+	seq_printf(m, "%s: SCABase=%08x Mem=%08X StatusControl=%08x LCR=%08X\n"
 		       "\tIRQ=%d MaxFrameSize=%u\n",
 		info->device_name,
 		info->phys_sca_base,
@@ -1391,75 +1390,70 @@ static inline int line_info(char *buf, SLMP_INFO *info)
 		strcat(stat_buf, "|RI");
 
 	if (info->params.mode == MGSL_MODE_HDLC) {
-		ret += sprintf(buf+ret, "\tHDLC txok:%d rxok:%d",
+		seq_printf(m, "\tHDLC txok:%d rxok:%d",
 			      info->icount.txok, info->icount.rxok);
 		if (info->icount.txunder)
-			ret += sprintf(buf+ret, " txunder:%d", info->icount.txunder);
+			seq_printf(m, " txunder:%d", info->icount.txunder);
 		if (info->icount.txabort)
-			ret += sprintf(buf+ret, " txabort:%d", info->icount.txabort);
+			seq_printf(m, " txabort:%d", info->icount.txabort);
 		if (info->icount.rxshort)
-			ret += sprintf(buf+ret, " rxshort:%d", info->icount.rxshort);
+			seq_printf(m, " rxshort:%d", info->icount.rxshort);
 		if (info->icount.rxlong)
-			ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxlong);
+			seq_printf(m, " rxlong:%d", info->icount.rxlong);
 		if (info->icount.rxover)
-			ret += sprintf(buf+ret, " rxover:%d", info->icount.rxover);
+			seq_printf(m, " rxover:%d", info->icount.rxover);
 		if (info->icount.rxcrc)
-			ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxcrc);
+			seq_printf(m, " rxlong:%d", info->icount.rxcrc);
 	} else {
-		ret += sprintf(buf+ret, "\tASYNC tx:%d rx:%d",
+		seq_printf(m, "\tASYNC tx:%d rx:%d",
 			      info->icount.tx, info->icount.rx);
 		if (info->icount.frame)
-			ret += sprintf(buf+ret, " fe:%d", info->icount.frame);
+			seq_printf(m, " fe:%d", info->icount.frame);
 		if (info->icount.parity)
-			ret += sprintf(buf+ret, " pe:%d", info->icount.parity);
+			seq_printf(m, " pe:%d", info->icount.parity);
 		if (info->icount.brk)
-			ret += sprintf(buf+ret, " brk:%d", info->icount.brk);
+			seq_printf(m, " brk:%d", info->icount.brk);
 		if (info->icount.overrun)
-			ret += sprintf(buf+ret, " oe:%d", info->icount.overrun);
+			seq_printf(m, " oe:%d", info->icount.overrun);
 	}
 
 	/* Append serial signal status to end */
-	ret += sprintf(buf+ret, " %s\n", stat_buf+1);
+	seq_printf(m, " %s\n", stat_buf+1);
 
-	ret += sprintf(buf+ret, "\ttxactive=%d bh_req=%d bh_run=%d pending_bh=%x\n",
+	seq_printf(m, "\ttxactive=%d bh_req=%d bh_run=%d pending_bh=%x\n",
 	 info->tx_active,info->bh_requested,info->bh_running,
 	 info->pending_bh);
-
-	return ret;
 }
 
 /* Called to print information about devices
  */
-static int read_proc(char *page, char **start, off_t off, int count,
-	      int *eof, void *data)
+static int synclinkmp_proc_show(struct seq_file *m, void *v)
 {
-	int len = 0, l;
-	off_t	begin = 0;
 	SLMP_INFO *info;
 
-	len += sprintf(page, "synclinkmp driver:%s\n", driver_version);
+	seq_printf(m, "synclinkmp driver:%s\n", driver_version);
 
 	info = synclinkmp_device_list;
 	while( info ) {
-		l = line_info(page + len, info);
-		len += l;
-		if (len+begin > off+count)
-			goto done;
-		if (len+begin < off) {
-			begin += len;
-			len = 0;
-		}
+		line_info(m, info);
 		info = info->next_device;
 	}
+	return 0;
+}
 
-	*eof = 1;
-done:
-	if (off >= len+begin)
-		return 0;
-	*start = page + (off-begin);
-	return ((count < begin+len-off) ? count : begin+len-off);
+static int synclinkmp_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, synclinkmp_proc_show, NULL);
 }
 
+static const struct file_operations synclinkmp_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= synclinkmp_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /* Return the count of bytes in transmit buffer
  */
 static int chars_in_buffer(struct tty_struct *tty)
@@ -3905,13 +3899,13 @@ static const struct tty_operations ops = {
 	.send_xchar = send_xchar,
 	.break_ctl = set_break,
 	.wait_until_sent = wait_until_sent,
- 	.read_proc = read_proc,
 	.set_termios = set_termios,
 	.stop = tx_hold,
 	.start = tx_release,
 	.hangup = hangup,
 	.tiocmget = tiocmget,
 	.tiocmset = tiocmset,
+	.proc_fops = &synclinkmp_proc_fops,
 };
 
 
-- 
cgit v1.2.3


From 201a50ba6627dd00aa7b7673a5c454ca387095fb Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:20 -0700
Subject: proc tty: switch sdio_uart to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/card/sdio_uart.c | 62 +++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 32 deletions(-)

diff --git a/drivers/mmc/card/sdio_uart.c b/drivers/mmc/card/sdio_uart.c
index 78ad48718ab..36a8d53ad2a 100644
--- a/drivers/mmc/card/sdio_uart.c
+++ b/drivers/mmc/card/sdio_uart.c
@@ -30,6 +30,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/mutex.h>
+#include <linux/seq_file.h>
 #include <linux/serial_reg.h>
 #include <linux/circ_buf.h>
 #include <linux/gfp.h>
@@ -933,67 +934,64 @@ static int sdio_uart_tiocmset(struct tty_struct *tty, struct file *file,
 	return result;
 }
 
-static int sdio_uart_read_proc(char *page, char **start, off_t off,
-			       int count, int *eof, void *data)
+static int sdio_uart_proc_show(struct seq_file *m, void *v)
 {
-	int i, len = 0;
-	off_t begin = 0;
+	int i;
 
-	len += sprintf(page, "serinfo:1.0 driver%s%s revision:%s\n",
+	seq_printf(m, "serinfo:1.0 driver%s%s revision:%s\n",
 		       "", "", "");
-	for (i = 0; i < UART_NR && len < PAGE_SIZE - 96; i++) {
+	for (i = 0; i < UART_NR; i++) {
 		struct sdio_uart_port *port = sdio_uart_port_get(i);
 		if (port) {
-			len += sprintf(page+len, "%d: uart:SDIO", i);
+			seq_printf(m, "%d: uart:SDIO", i);
 			if(capable(CAP_SYS_ADMIN)) {
-				len += sprintf(page + len, " tx:%d rx:%d",
+				seq_printf(m, " tx:%d rx:%d",
 					       port->icount.tx, port->icount.rx);
 				if (port->icount.frame)
-					len += sprintf(page + len, " fe:%d",
+					seq_printf(m, " fe:%d",
 						       port->icount.frame);
 				if (port->icount.parity)
-					len += sprintf(page + len, " pe:%d",
+					seq_printf(m, " pe:%d",
 						       port->icount.parity);
 				if (port->icount.brk)
-					len += sprintf(page + len, " brk:%d",
+					seq_printf(m, " brk:%d",
 						       port->icount.brk);
 				if (port->icount.overrun)
-					len += sprintf(page + len, " oe:%d",
+					seq_printf(m, " oe:%d",
 						       port->icount.overrun);
 				if (port->icount.cts)
-					len += sprintf(page + len, " cts:%d",
+					seq_printf(m, " cts:%d",
 						       port->icount.cts);
 				if (port->icount.dsr)
-					len += sprintf(page + len, " dsr:%d",
+					seq_printf(m, " dsr:%d",
 						       port->icount.dsr);
 				if (port->icount.rng)
-					len += sprintf(page + len, " rng:%d",
+					seq_printf(m, " rng:%d",
 						       port->icount.rng);
 				if (port->icount.dcd)
-					len += sprintf(page + len, " dcd:%d",
+					seq_printf(m, " dcd:%d",
 						       port->icount.dcd);
 			}
-			strcat(page, "\n");
-			len++;
 			sdio_uart_port_put(port);
-		}
-
-		if (len + begin > off + count)
-			goto done;
-		if (len + begin < off) {
-			begin += len;
-			len = 0;
+			seq_putc(m, '\n');
 		}
 	}
-	*eof = 1;
+	return 0;
+}
 
-done:
-	if (off >= len + begin)
-		return 0;
-	*start = page + (off - begin);
-	return (count < begin + len - off) ? count : (begin + len - off);
+static int sdio_uart_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, sdio_uart_proc_show, NULL);
 }
 
+static const struct file_operations sdio_uart_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= sdio_uart_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static const struct tty_operations sdio_uart_ops = {
 	.open			= sdio_uart_open,
 	.close			= sdio_uart_close,
@@ -1007,7 +1005,7 @@ static const struct tty_operations sdio_uart_ops = {
 	.break_ctl		= sdio_uart_break_ctl,
 	.tiocmget		= sdio_uart_tiocmget,
 	.tiocmset		= sdio_uart_tiocmset,
-	.read_proc		= sdio_uart_read_proc,
+	.proc_fops		= &sdio_uart_proc_fops,
 };
 
 static struct tty_driver *sdio_uart_tty_driver;
-- 
cgit v1.2.3


From d196a949ba0fb85121c0dc0720b13380d802dbd6 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:21 -0700
Subject: proc tty: switch serial_core to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/serial_core.c | 76 +++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 40 deletions(-)

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 42f4e66fcca..bf3c0e32a33 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -27,6 +27,8 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/console.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/serial_core.h>
 #include <linux/smp_lock.h>
 #include <linux/device.h>
@@ -1682,20 +1684,20 @@ static const char *uart_type(struct uart_port *port)
 
 #ifdef CONFIG_PROC_FS
 
-static int uart_line_info(char *buf, struct uart_driver *drv, int i)
+static void uart_line_info(struct seq_file *m, struct uart_driver *drv, int i)
 {
 	struct uart_state *state = drv->state + i;
 	int pm_state;
 	struct uart_port *port = state->port;
 	char stat_buf[32];
 	unsigned int status;
-	int mmio, ret;
+	int mmio;
 
 	if (!port)
-		return 0;
+		return;
 
 	mmio = port->iotype >= UPIO_MEM;
-	ret = sprintf(buf, "%d: uart:%s %s%08llX irq:%d",
+	seq_printf(m, "%d: uart:%s %s%08llX irq:%d",
 			port->line, uart_type(port),
 			mmio ? "mmio:0x" : "port:",
 			mmio ? (unsigned long long)port->mapbase
@@ -1703,8 +1705,8 @@ static int uart_line_info(char *buf, struct uart_driver *drv, int i)
 			port->irq);
 
 	if (port->type == PORT_UNKNOWN) {
-		strcat(buf, "\n");
-		return ret + 1;
+		seq_putc(m, '\n');
+		return;
 	}
 
 	if (capable(CAP_SYS_ADMIN)) {
@@ -1719,19 +1721,19 @@ static int uart_line_info(char *buf, struct uart_driver *drv, int i)
 			uart_change_pm(state, pm_state);
 		mutex_unlock(&state->mutex);
 
-		ret += sprintf(buf + ret, " tx:%d rx:%d",
+		seq_printf(m, " tx:%d rx:%d",
 				port->icount.tx, port->icount.rx);
 		if (port->icount.frame)
-			ret += sprintf(buf + ret, " fe:%d",
+			seq_printf(m, " fe:%d",
 				port->icount.frame);
 		if (port->icount.parity)
-			ret += sprintf(buf + ret, " pe:%d",
+			seq_printf(m, " pe:%d",
 				port->icount.parity);
 		if (port->icount.brk)
-			ret += sprintf(buf + ret, " brk:%d",
+			seq_printf(m, " brk:%d",
 				port->icount.brk);
 		if (port->icount.overrun)
-			ret += sprintf(buf + ret, " oe:%d",
+			seq_printf(m, " oe:%d",
 				port->icount.overrun);
 
 #define INFOBIT(bit, str) \
@@ -1753,45 +1755,39 @@ static int uart_line_info(char *buf, struct uart_driver *drv, int i)
 		STATBIT(TIOCM_RNG, "|RI");
 		if (stat_buf[0])
 			stat_buf[0] = ' ';
-		strcat(stat_buf, "\n");
 
-		ret += sprintf(buf + ret, stat_buf);
-	} else {
-		strcat(buf, "\n");
-		ret++;
+		seq_puts(m, stat_buf);
 	}
+	seq_putc(m, '\n');
 #undef STATBIT
 #undef INFOBIT
-	return ret;
 }
 
-static int uart_read_proc(char *page, char **start, off_t off,
-			  int count, int *eof, void *data)
+static int uart_proc_show(struct seq_file *m, void *v)
 {
-	struct tty_driver *ttydrv = data;
+	struct tty_driver *ttydrv = v;
 	struct uart_driver *drv = ttydrv->driver_state;
-	int i, len = 0, l;
-	off_t begin = 0;
+	int i;
 
-	len += sprintf(page, "serinfo:1.0 driver%s%s revision:%s\n",
+	seq_printf(m, "serinfo:1.0 driver%s%s revision:%s\n",
 			"", "", "");
-	for (i = 0; i < drv->nr && len < PAGE_SIZE - 96; i++) {
-		l = uart_line_info(page + len, drv, i);
-		len += l;
-		if (len + begin > off + count)
-			goto done;
-		if (len + begin < off) {
-			begin += len;
-			len = 0;
-		}
-	}
-	*eof = 1;
- done:
-	if (off >= len + begin)
-		return 0;
-	*start = page + (off - begin);
-	return (count < begin + len - off) ? count : (begin + len - off);
+	for (i = 0; i < drv->nr; i++)
+		uart_line_info(m, drv, i);
+	return 0;
 }
+
+static int uart_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, uart_proc_show, PDE(inode)->data);
+}
+
+static const struct file_operations uart_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= uart_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 #endif
 
 #if defined(CONFIG_SERIAL_CORE_CONSOLE) || defined(CONFIG_CONSOLE_POLL)
@@ -2299,7 +2295,7 @@ static const struct tty_operations uart_ops = {
 	.break_ctl	= uart_break_ctl,
 	.wait_until_sent= uart_wait_until_sent,
 #ifdef CONFIG_PROC_FS
-	.read_proc	= uart_read_proc,
+	.proc_fops	= &uart_proc_fops,
 #endif
 	.tiocmget	= uart_tiocmget,
 	.tiocmset	= uart_tiocmset,
-- 
cgit v1.2.3


From 6fd69d3cf1496c8e6751ecb3eae254e1a839bd5d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:21 -0700
Subject: proc tty: switch usb-serial to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/usb-serial.c | 58 +++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 31 deletions(-)

diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 742a5bc44be..2a70563bbee 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -26,6 +26,7 @@
 #include <linux/tty_flip.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/seq_file.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
@@ -421,57 +422,52 @@ static int serial_break(struct tty_struct *tty, int break_state)
 	return 0;
 }
 
-static int serial_read_proc(char *page, char **start, off_t off, int count,
-							int *eof, void *data)
+static int serial_proc_show(struct seq_file *m, void *v)
 {
 	struct usb_serial *serial;
-	int length = 0;
 	int i;
-	off_t begin = 0;
 	char tmp[40];
 
 	dbg("%s", __func__);
-	length += sprintf(page, "usbserinfo:1.0 driver:2.0\n");
-	for (i = 0; i < SERIAL_TTY_MINORS && length < PAGE_SIZE; ++i) {
+	seq_puts(m, "usbserinfo:1.0 driver:2.0\n");
+	for (i = 0; i < SERIAL_TTY_MINORS; ++i) {
 		serial = usb_serial_get_by_index(i);
 		if (serial == NULL)
 			continue;
 
-		length += sprintf(page+length, "%d:", i);
+		seq_printf(m, "%d:", i);
 		if (serial->type->driver.owner)
-			length += sprintf(page+length, " module:%s",
+			seq_printf(m, " module:%s",
 				module_name(serial->type->driver.owner));
-		length += sprintf(page+length, " name:\"%s\"",
+		seq_printf(m, " name:\"%s\"",
 				serial->type->description);
-		length += sprintf(page+length, " vendor:%04x product:%04x",
+		seq_printf(m, " vendor:%04x product:%04x",
 			le16_to_cpu(serial->dev->descriptor.idVendor),
 			le16_to_cpu(serial->dev->descriptor.idProduct));
-		length += sprintf(page+length, " num_ports:%d",
-							serial->num_ports);
-		length += sprintf(page+length, " port:%d",
-							i - serial->minor + 1);
+		seq_printf(m, " num_ports:%d", serial->num_ports);
+		seq_printf(m, " port:%d", i - serial->minor + 1);
 		usb_make_path(serial->dev, tmp, sizeof(tmp));
-		length += sprintf(page+length, " path:%s", tmp);
+		seq_printf(m, " path:%s", tmp);
 
-		length += sprintf(page+length, "\n");
-		if ((length + begin) > (off + count)) {
-			usb_serial_put(serial);
-			goto done;
-		}
-		if ((length + begin) < off) {
-			begin += length;
-			length = 0;
-		}
+		seq_putc(m, '\n');
 		usb_serial_put(serial);
 	}
-	*eof = 1;
-done:
-	if (off >= (length + begin))
-		return 0;
-	*start = page + (off-begin);
-	return (count < begin+length-off) ? count : begin+length-off;
+	return 0;
 }
 
+static int serial_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, serial_proc_show, NULL);
+}
+
+static const struct file_operations serial_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= serial_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static int serial_tiocmget(struct tty_struct *tty, struct file *file)
 {
 	struct usb_serial_port *port = tty->driver_data;
@@ -1113,9 +1109,9 @@ static const struct tty_operations serial_ops = {
 	.unthrottle =		serial_unthrottle,
 	.break_ctl =		serial_break,
 	.chars_in_buffer =	serial_chars_in_buffer,
-	.read_proc =		serial_read_proc,
 	.tiocmget =		serial_tiocmget,
 	.tiocmset =		serial_tiocmset,
+	.proc_fops =		&serial_proc_fops,
 };
 
 struct tty_driver *usb_serial_tty_driver;
-- 
cgit v1.2.3


From 3d3041768296c4993c0aba686bf0775faab5236a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:22 -0700
Subject: proc tty: switch ircomm to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/irda/ircomm/ircomm_tty.c | 256 +++++++++++++++++++++++--------------------
 1 file changed, 138 insertions(+), 118 deletions(-)

diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index 086d5ef098f..811984d9324 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -34,6 +34,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/seq_file.h>
 #include <linux/termios.h>
 #include <linux/tty.h>
 #include <linux/interrupt.h>
@@ -72,8 +73,7 @@ static int ircomm_tty_control_indication(void *instance, void *sap,
 static void ircomm_tty_flow_indication(void *instance, void *sap,
 				       LOCAL_FLOW cmd);
 #ifdef CONFIG_PROC_FS
-static int ircomm_tty_read_proc(char *buf, char **start, off_t offset, int len,
-				int *eof, void *unused);
+static const struct file_operations ircomm_tty_proc_fops;
 #endif /* CONFIG_PROC_FS */
 static struct tty_driver *driver;
 
@@ -98,7 +98,7 @@ static const struct tty_operations ops = {
 	.hangup          = ircomm_tty_hangup,
 	.wait_until_sent = ircomm_tty_wait_until_sent,
 #ifdef CONFIG_PROC_FS
-	.read_proc       = ircomm_tty_read_proc,
+	.proc_fops       = &ircomm_tty_proc_fops,
 #endif /* CONFIG_PROC_FS */
 };
 
@@ -1245,150 +1245,170 @@ static void ircomm_tty_flow_indication(void *instance, void *sap,
 }
 
 #ifdef CONFIG_PROC_FS
-static int ircomm_tty_line_info(struct ircomm_tty_cb *self, char *buf)
+static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
 {
-	int  ret=0;
+	char sep;
 
-	ret += sprintf(buf+ret, "State: %s\n", ircomm_tty_state[self->state]);
+	seq_printf(m, "State: %s\n", ircomm_tty_state[self->state]);
 
-	ret += sprintf(buf+ret, "Service type: ");
+	seq_puts(m, "Service type: ");
 	if (self->service_type & IRCOMM_9_WIRE)
-		ret += sprintf(buf+ret, "9_WIRE");
+		seq_puts(m, "9_WIRE");
 	else if (self->service_type & IRCOMM_3_WIRE)
-		ret += sprintf(buf+ret, "3_WIRE");
+		seq_puts(m, "3_WIRE");
 	else if (self->service_type & IRCOMM_3_WIRE_RAW)
-		ret += sprintf(buf+ret, "3_WIRE_RAW");
+		seq_puts(m, "3_WIRE_RAW");
 	else
-		ret += sprintf(buf+ret, "No common service type!\n");
-	ret += sprintf(buf+ret, "\n");
-
-	ret += sprintf(buf+ret, "Port name: %s\n", self->settings.port_name);
-
-	ret += sprintf(buf+ret, "DTE status: ");
-	if (self->settings.dte & IRCOMM_RTS)
-		ret += sprintf(buf+ret, "RTS|");
-	if (self->settings.dte & IRCOMM_DTR)
-		ret += sprintf(buf+ret, "DTR|");
-	if (self->settings.dte)
-		ret--; /* remove the last | */
-	ret += sprintf(buf+ret, "\n");
-
-	ret += sprintf(buf+ret, "DCE status: ");
-	if (self->settings.dce & IRCOMM_CTS)
-		ret += sprintf(buf+ret, "CTS|");
-	if (self->settings.dce & IRCOMM_DSR)
-		ret += sprintf(buf+ret, "DSR|");
-	if (self->settings.dce & IRCOMM_CD)
-		ret += sprintf(buf+ret, "CD|");
-	if (self->settings.dce & IRCOMM_RI)
-		ret += sprintf(buf+ret, "RI|");
-	if (self->settings.dce)
-		ret--; /* remove the last | */
-	ret += sprintf(buf+ret, "\n");
-
-	ret += sprintf(buf+ret, "Configuration: ");
+		seq_puts(m, "No common service type!\n");
+	seq_putc(m, '\n');
+
+	seq_printf(m, "Port name: %s\n", self->settings.port_name);
+
+	seq_printf(m, "DTE status:");
+	sep = ' ';
+	if (self->settings.dte & IRCOMM_RTS) {
+		seq_printf(m, "%cRTS", sep);
+		sep = '|';
+	}
+	if (self->settings.dte & IRCOMM_DTR) {
+		seq_printf(m, "%cDTR", sep);
+		sep = '|';
+	}
+	seq_putc(m, '\n');
+
+	seq_puts(m, "DCE status:");
+	sep = ' ';
+	if (self->settings.dce & IRCOMM_CTS) {
+		seq_printf(m, "%cCTS", sep);
+		sep = '|';
+	}
+	if (self->settings.dce & IRCOMM_DSR) {
+		seq_printf(m, "%cDSR", sep);
+		sep = '|';
+	}
+	if (self->settings.dce & IRCOMM_CD) {
+		seq_printf(m, "%cCD", sep);
+		sep = '|';
+	}
+	if (self->settings.dce & IRCOMM_RI) {
+		seq_printf(m, "%cRI", sep);
+		sep = '|';
+	}
+	seq_putc(m, '\n');
+
+	seq_puts(m, "Configuration: ");
 	if (!self->settings.null_modem)
-		ret += sprintf(buf+ret, "DTE <-> DCE\n");
+		seq_puts(m, "DTE <-> DCE\n");
 	else
-		ret += sprintf(buf+ret,
-			       "DTE <-> DTE (null modem emulation)\n");
-
-	ret += sprintf(buf+ret, "Data rate: %d\n", self->settings.data_rate);
-
-	ret += sprintf(buf+ret, "Flow control: ");
-	if (self->settings.flow_control & IRCOMM_XON_XOFF_IN)
-		ret += sprintf(buf+ret, "XON_XOFF_IN|");
-	if (self->settings.flow_control & IRCOMM_XON_XOFF_OUT)
-		ret += sprintf(buf+ret, "XON_XOFF_OUT|");
-	if (self->settings.flow_control & IRCOMM_RTS_CTS_IN)
-		ret += sprintf(buf+ret, "RTS_CTS_IN|");
-	if (self->settings.flow_control & IRCOMM_RTS_CTS_OUT)
-		ret += sprintf(buf+ret, "RTS_CTS_OUT|");
-	if (self->settings.flow_control & IRCOMM_DSR_DTR_IN)
-		ret += sprintf(buf+ret, "DSR_DTR_IN|");
-	if (self->settings.flow_control & IRCOMM_DSR_DTR_OUT)
-		ret += sprintf(buf+ret, "DSR_DTR_OUT|");
-	if (self->settings.flow_control & IRCOMM_ENQ_ACK_IN)
-		ret += sprintf(buf+ret, "ENQ_ACK_IN|");
-	if (self->settings.flow_control & IRCOMM_ENQ_ACK_OUT)
-		ret += sprintf(buf+ret, "ENQ_ACK_OUT|");
-	if (self->settings.flow_control)
-		ret--; /* remove the last | */
-	ret += sprintf(buf+ret, "\n");
-
-	ret += sprintf(buf+ret, "Flags: ");
-	if (self->flags & ASYNC_CTS_FLOW)
-		ret += sprintf(buf+ret, "ASYNC_CTS_FLOW|");
-	if (self->flags & ASYNC_CHECK_CD)
-		ret += sprintf(buf+ret, "ASYNC_CHECK_CD|");
-	if (self->flags & ASYNC_INITIALIZED)
-		ret += sprintf(buf+ret, "ASYNC_INITIALIZED|");
-	if (self->flags & ASYNC_LOW_LATENCY)
-		ret += sprintf(buf+ret, "ASYNC_LOW_LATENCY|");
-	if (self->flags & ASYNC_CLOSING)
-		ret += sprintf(buf+ret, "ASYNC_CLOSING|");
-	if (self->flags & ASYNC_NORMAL_ACTIVE)
-		ret += sprintf(buf+ret, "ASYNC_NORMAL_ACTIVE|");
-	if (self->flags)
-		ret--; /* remove the last | */
-	ret += sprintf(buf+ret, "\n");
-
-	ret += sprintf(buf+ret, "Role: %s\n", self->client ?
-		       "client" : "server");
-	ret += sprintf(buf+ret, "Open count: %d\n", self->open_count);
-	ret += sprintf(buf+ret, "Max data size: %d\n", self->max_data_size);
-	ret += sprintf(buf+ret, "Max header size: %d\n", self->max_header_size);
+		seq_puts(m, "DTE <-> DTE (null modem emulation)\n");
+
+	seq_printf(m, "Data rate: %d\n", self->settings.data_rate);
+
+	seq_puts(m, "Flow control:");
+	sep = ' ';
+	if (self->settings.flow_control & IRCOMM_XON_XOFF_IN) {
+		seq_printf(m, "%cXON_XOFF_IN", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_XON_XOFF_OUT) {
+		seq_printf(m, "%cXON_XOFF_OUT", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_RTS_CTS_IN) {
+		seq_printf(m, "%cRTS_CTS_IN", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_RTS_CTS_OUT) {
+		seq_printf(m, "%cRTS_CTS_OUT", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_DSR_DTR_IN) {
+		seq_printf(m, "%cDSR_DTR_IN", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_DSR_DTR_OUT) {
+		seq_printf(m, "%cDSR_DTR_OUT", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_ENQ_ACK_IN) {
+		seq_printf(m, "%cENQ_ACK_IN", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_ENQ_ACK_OUT) {
+		seq_printf(m, "%cENQ_ACK_OUT", sep);
+		sep = '|';
+	}
+	seq_putc(m, '\n');
+
+	seq_puts(m, "Flags:");
+	sep = ' ';
+	if (self->flags & ASYNC_CTS_FLOW) {
+		seq_printf(m, "%cASYNC_CTS_FLOW", sep);
+		sep = '|';
+	}
+	if (self->flags & ASYNC_CHECK_CD) {
+		seq_printf(m, "%cASYNC_CHECK_CD", sep);
+		sep = '|';
+	}
+	if (self->flags & ASYNC_INITIALIZED) {
+		seq_printf(m, "%cASYNC_INITIALIZED", sep);
+		sep = '|';
+	}
+	if (self->flags & ASYNC_LOW_LATENCY) {
+		seq_printf(m, "%cASYNC_LOW_LATENCY", sep);
+		sep = '|';
+	}
+	if (self->flags & ASYNC_CLOSING) {
+		seq_printf(m, "%cASYNC_CLOSING", sep);
+		sep = '|';
+	}
+	if (self->flags & ASYNC_NORMAL_ACTIVE) {
+		seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep);
+		sep = '|';
+	}
+	seq_putc(m, '\n');
+
+	seq_printf(m, "Role: %s\n", self->client ? "client" : "server");
+	seq_printf(m, "Open count: %d\n", self->open_count);
+	seq_printf(m, "Max data size: %d\n", self->max_data_size);
+	seq_printf(m, "Max header size: %d\n", self->max_header_size);
 
 	if (self->tty)
-		ret += sprintf(buf+ret, "Hardware: %s\n",
+		seq_printf(m, "Hardware: %s\n",
 			       self->tty->hw_stopped ? "Stopped" : "Running");
-
-	ret += sprintf(buf+ret, "\n");
-	return ret;
 }
 
-
-/*
- * Function ircomm_tty_read_proc (buf, start, offset, len, eof, unused)
- *
- *
- *
- */
-static int ircomm_tty_read_proc(char *buf, char **start, off_t offset, int len,
-				int *eof, void *unused)
+static int ircomm_tty_proc_show(struct seq_file *m, void *v)
 {
 	struct ircomm_tty_cb *self;
-	int count = 0, l;
-	off_t begin = 0;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ircomm_tty->hb_spinlock, flags);
 
 	self = (struct ircomm_tty_cb *) hashbin_get_first(ircomm_tty);
-	while ((self != NULL) && (count < 4000)) {
+	while (self != NULL) {
 		if (self->magic != IRCOMM_TTY_MAGIC)
 			break;
 
-		l = ircomm_tty_line_info(self, buf + count);
-		count += l;
-		if (count+begin > offset+len)
-			goto done;
-		if (count+begin < offset) {
-			begin += count;
-			count = 0;
-		}
-
+		ircomm_tty_line_info(self, m);
 		self = (struct ircomm_tty_cb *) hashbin_get_next(ircomm_tty);
 	}
-	*eof = 1;
-done:
 	spin_unlock_irqrestore(&ircomm_tty->hb_spinlock, flags);
+	return 0;
+}
 
-	if (offset >= count+begin)
-		return 0;
-	*start = buf + (offset-begin);
-	return ((len < begin+count-offset) ? len : begin+count-offset);
+static int ircomm_tty_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ircomm_tty_proc_show, NULL);
 }
+
+static const struct file_operations ircomm_tty_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ircomm_tty_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 #endif /* CONFIG_PROC_FS */
 
 MODULE_AUTHOR("Dag Brattli <dagb@cs.uit.no>");
-- 
cgit v1.2.3


From d594027d62808f6649b273544ab8c19ed5b98fd1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:23 -0700
Subject: proc tty: switch amiserial to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/amiserial.c | 62 ++++++++++++++++++++++--------------------------
 1 file changed, 28 insertions(+), 34 deletions(-)

diff --git a/drivers/char/amiserial.c b/drivers/char/amiserial.c
index a58869ea851..fd3ebd1be57 100644
--- a/drivers/char/amiserial.c
+++ b/drivers/char/amiserial.c
@@ -79,6 +79,7 @@ static char *serial_version = "4.30";
 #include <linux/ptrace.h>
 #include <linux/ioport.h>
 #include <linux/mm.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/bitops.h>
@@ -1825,14 +1826,13 @@ static int rs_open(struct tty_struct *tty, struct file * filp)
  * /proc fs routines....
  */
 
-static inline int line_info(char *buf, struct serial_state *state)
+static inline void line_info(struct seq_file *m, struct serial_state *state)
 {
 	struct async_struct *info = state->info, scr_info;
 	char	stat_buf[30], control, status;
-	int	ret;
 	unsigned long flags;
 
-	ret = sprintf(buf, "%d: uart:amiga_builtin",state->line);
+	seq_printf(m, "%d: uart:amiga_builtin",state->line);
 
 	/*
 	 * Figure out the current RS-232 lines
@@ -1864,55 +1864,49 @@ static inline int line_info(char *buf, struct serial_state *state)
 		strcat(stat_buf, "|CD");
 
 	if (info->quot) {
-		ret += sprintf(buf+ret, " baud:%d",
-			       state->baud_base / info->quot);
+		seq_printf(m, " baud:%d", state->baud_base / info->quot);
 	}
 
-	ret += sprintf(buf+ret, " tx:%d rx:%d",
-		      state->icount.tx, state->icount.rx);
+	seq_printf(m, " tx:%d rx:%d", state->icount.tx, state->icount.rx);
 
 	if (state->icount.frame)
-		ret += sprintf(buf+ret, " fe:%d", state->icount.frame);
+		seq_printf(m, " fe:%d", state->icount.frame);
 
 	if (state->icount.parity)
-		ret += sprintf(buf+ret, " pe:%d", state->icount.parity);
+		seq_printf(m, " pe:%d", state->icount.parity);
 
 	if (state->icount.brk)
-		ret += sprintf(buf+ret, " brk:%d", state->icount.brk);
+		seq_printf(m, " brk:%d", state->icount.brk);
 
 	if (state->icount.overrun)
-		ret += sprintf(buf+ret, " oe:%d", state->icount.overrun);
+		seq_printf(m, " oe:%d", state->icount.overrun);
 
 	/*
 	 * Last thing is the RS-232 status lines
 	 */
-	ret += sprintf(buf+ret, " %s\n", stat_buf+1);
-	return ret;
+	seq_printf(m, " %s\n", stat_buf+1);
 }
 
-static int rs_read_proc(char *page, char **start, off_t off, int count,
-			int *eof, void *data)
+static int rs_proc_show(struct seq_file *m, void *v)
 {
-	int len = 0, l;
-	off_t	begin = 0;
-
-	len += sprintf(page, "serinfo:1.0 driver:%s\n", serial_version);
-	l = line_info(page + len, &rs_table[0]);
-	len += l;
-	if (len+begin > off+count)
-	  goto done;
-	if (len+begin < off) {
-	  begin += len;
-	  len = 0;
-	}
-	*eof = 1;
-done:
-	if (off >= len+begin)
-		return 0;
-	*start = page + (off-begin);
-	return ((count < begin+len-off) ? count : begin+len-off);
+	seq_printf(m, "serinfo:1.0 driver:%s\n", serial_version);
+	line_info(m, &rs_table[0]);
+	return 0;
 }
 
+static int rs_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rs_proc_show, NULL);
+}
+
+static const struct file_operations rs_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rs_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /*
  * ---------------------------------------------------------------------
  * rs_init() and friends
@@ -1951,9 +1945,9 @@ static const struct tty_operations serial_ops = {
 	.break_ctl = rs_break,
 	.send_xchar = rs_send_xchar,
 	.wait_until_sent = rs_wait_until_sent,
-	.read_proc = rs_read_proc,
 	.tiocmget = rs_tiocmget,
 	.tiocmset = rs_tiocmset,
+	.proc_fops = &rs_proc_fops,
 };
 
 /*
-- 
cgit v1.2.3


From bf54215ef86a1bd83affd8ecdf833c053aefb49d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:23 -0700
Subject: proc tty: switch ia64 simserial to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/hp/sim/simserial.c | 49 ++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c
index 24b1ad5334c..2bef5261d96 100644
--- a/arch/ia64/hp/sim/simserial.c
+++ b/arch/ia64/hp/sim/simserial.c
@@ -24,6 +24,7 @@
 #include <linux/major.h>
 #include <linux/fcntl.h>
 #include <linux/mm.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/capability.h>
 #include <linux/console.h>
@@ -848,38 +849,36 @@ static int rs_open(struct tty_struct *tty, struct file * filp)
  * /proc fs routines....
  */
 
-static inline int line_info(char *buf, struct serial_state *state)
+static inline void line_info(struct seq_file *m, struct serial_state *state)
 {
-	return sprintf(buf, "%d: uart:%s port:%lX irq:%d\n",
+	seq_printf(m, "%d: uart:%s port:%lX irq:%d\n",
 		       state->line, uart_config[state->type].name,
 		       state->port, state->irq);
 }
 
-static int rs_read_proc(char *page, char **start, off_t off, int count,
-		 int *eof, void *data)
+static int rs_proc_show(struct seq_file *m, void *v)
 {
-	int i, len = 0, l;
-	off_t	begin = 0;
-
-	len += sprintf(page, "simserinfo:1.0 driver:%s\n", serial_version);
-	for (i = 0; i < NR_PORTS && len < 4000; i++) {
-		l = line_info(page + len, &rs_table[i]);
-		len += l;
-		if (len+begin > off+count)
-			goto done;
-		if (len+begin < off) {
-			begin += len;
-			len = 0;
-		}
-	}
-	*eof = 1;
-done:
-	if (off >= len+begin)
-		return 0;
-	*start = page + (begin-off);
-	return ((count < begin+len-off) ? count : begin+len-off);
+	int i;
+
+	seq_printf(m, "simserinfo:1.0 driver:%s\n", serial_version);
+	for (i = 0; i < NR_PORTS; i++)
+		line_info(m, &rs_table[i]);
+	return 0;
 }
 
+static int rs_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rs_proc_show, NULL);
+}
+
+static const struct file_operations rs_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rs_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /*
  * ---------------------------------------------------------------------
  * rs_init() and friends
@@ -917,7 +916,7 @@ static const struct tty_operations hp_ops = {
 	.start = rs_start,
 	.hangup = rs_hangup,
 	.wait_until_sent = rs_wait_until_sent,
-	.read_proc = rs_read_proc,
+	.proc_fops = &rs_proc_fops,
 };
 
 /*
-- 
cgit v1.2.3


From 140716934f67a9b28c3f7032c07c20c746d97a31 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:24 -0700
Subject: proc tty: switch xtensa iss console to ->proc_fops

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/xtensa/platforms/iss/console.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/xtensa/platforms/iss/console.c b/arch/xtensa/platforms/iss/console.c
index 25d46c84eb0..4c559cf7da2 100644
--- a/arch/xtensa/platforms/iss/console.c
+++ b/arch/xtensa/platforms/iss/console.c
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/major.h>
 #include <linux/param.h>
+#include <linux/seq_file.h>
 #include <linux/serial.h>
 #include <linux/serialP.h>
 
@@ -176,22 +177,24 @@ static void rs_wait_until_sent(struct tty_struct *tty, int timeout)
 	/* Stub, once again.. */
 }
 
-static int rs_read_proc(char *page, char **start, off_t off, int count,
-			int *eof, void *data)
+static int rs_proc_show(struct seq_file *m, void *v)
 {
-	int len = 0;
-	off_t begin = 0;
-
-	len += sprintf(page, "serinfo:1.0 driver:%s\n", serial_version);
-	*eof = 1;
-
-	if (off >= len + begin)
-		return 0;
+	seq_printf(m, "serinfo:1.0 driver:%s\n", serial_version);
+	return 0;
+}
 
-	*start = page + (off - begin);
-	return ((count < begin + len - off) ? count : begin + len - off);
+static int rs_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rs_proc_show, NULL);
 }
 
+static const struct file_operations rs_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rs_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 
 static struct tty_operations serial_ops = {
 	.open = rs_open,
@@ -203,7 +206,7 @@ static struct tty_operations serial_ops = {
 	.chars_in_buffer = rs_chars_in_buffer,
 	.hangup = rs_hangup,
 	.wait_until_sent = rs_wait_until_sent,
-	.read_proc = rs_read_proc
+	.proc_fops = &rs_proc_fops,
 };
 
 int __init rs_init(void)
-- 
cgit v1.2.3


From 0f043a81ebe84be3576667f04fdda481609e3816 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:19:25 -0700
Subject: proc tty: remove struct tty_operations::read_proc

struct tty_operations::proc_fops took it's place and there is one less
create_proc_read_entry() user now!

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/isdn/capi/capi.c   |  7 -------
 fs/proc/proc_tty.c         | 18 ++++--------------
 include/linux/tty_driver.h |  2 --
 net/bluetooth/rfcomm/tty.c |  6 ------
 4 files changed, 4 insertions(+), 29 deletions(-)

diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index 3e468d2cf73..2d8352419c0 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -1331,12 +1331,6 @@ static void capinc_tty_send_xchar(struct tty_struct *tty, char ch)
 #endif
 }
 
-static int capinc_tty_read_proc(char *page, char **start, off_t off,
-				int count, int *eof, void *data)
-{
-	return 0;
-}
-
 static struct tty_driver *capinc_tty_driver;
 
 static const struct tty_operations capinc_ops = {
@@ -1358,7 +1352,6 @@ static const struct tty_operations capinc_ops = {
 	.flush_buffer = capinc_tty_flush_buffer,
 	.set_ldisc = capinc_tty_set_ldisc,
 	.send_xchar = capinc_tty_send_xchar,
-	.read_proc = capinc_tty_read_proc,
 };
 
 static int capinc_tty_init(void)
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 854827b1d46..83adcc86943 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -144,22 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver)
 {
 	struct proc_dir_entry *ent;
 		
-	if (!driver->driver_name || driver->proc_entry)
+	if (!driver->driver_name || driver->proc_entry ||
+	    !driver->ops->proc_fops)
 		return;
 
-	if (driver->ops->proc_fops) {
-		ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
-				       driver->ops->proc_fops, driver);
-		if (!ent)
-			return;
-	} else if (driver->ops->read_proc) {
-		ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver);
-		if (!ent)
-			return;
-		ent->read_proc = driver->ops->read_proc;
-		ent->data = driver;
-	} else
-		return;
+	ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
+			       driver->ops->proc_fops, driver);
 	driver->proc_entry = ent;
 }
 
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index c9a69575ded..8615d661ab6 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -252,8 +252,6 @@ struct tty_operations {
 	void (*set_ldisc)(struct tty_struct *tty);
 	void (*wait_until_sent)(struct tty_struct *tty, int timeout);
 	void (*send_xchar)(struct tty_struct *tty, char ch);
-	int (*read_proc)(char *page, char **start, off_t off,
-			  int count, int *eof, void *data);
 	int (*tiocmget)(struct tty_struct *tty, struct file *file);
 	int (*tiocmset)(struct tty_struct *tty, struct file *file,
 			unsigned int set, unsigned int clear);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index abdc703a11d..cab71ea2796 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -1093,11 +1093,6 @@ static void rfcomm_tty_hangup(struct tty_struct *tty)
 	}
 }
 
-static int rfcomm_tty_read_proc(char *buf, char **start, off_t offset, int len, int *eof, void *unused)
-{
-	return 0;
-}
-
 static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp)
 {
 	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
@@ -1156,7 +1151,6 @@ static const struct tty_operations rfcomm_ops = {
 	.send_xchar		= rfcomm_tty_send_xchar,
 	.hangup			= rfcomm_tty_hangup,
 	.wait_until_sent	= rfcomm_tty_wait_until_sent,
-	.read_proc		= rfcomm_tty_read_proc,
 	.tiocmget		= rfcomm_tty_tiocmget,
 	.tiocmset		= rfcomm_tty_tiocmset,
 };
-- 
cgit v1.2.3


From ef161a9863b045909142daea9490b067997f3dc5 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 31 Mar 2009 15:19:25 -0700
Subject: mm: mminit_validate_memmodel_limits(): remove redundant test

In case if start_pfn overlap the upper bound no need to test end_pfn again
since we have it already trimmed.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/sparse.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mm/sparse.c b/mm/sparse.c
index 083f5b63e7a..da432d9f0ae 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -164,9 +164,7 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
 		WARN_ON_ONCE(1);
 		*start_pfn = max_sparsemem_pfn;
 		*end_pfn = max_sparsemem_pfn;
-	}
-
-	if (*end_pfn > max_sparsemem_pfn) {
+	} else if (*end_pfn > max_sparsemem_pfn) {
 		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
 			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
 			*start_pfn, *end_pfn, max_sparsemem_pfn);
-- 
cgit v1.2.3


From d086817dc0d42f1be8db4138233d33e1dd16a956 Mon Sep 17 00:00:00 2001
From: MinChan Kim <minchan.kim@gmail.com>
Date: Tue, 31 Mar 2009 15:19:26 -0700
Subject: vmap: remove needless lock and list in vmap

vmap's dirty_list is unused.  It's for optimizing flushing.  but Nick
didn't write the code yet.  so, we don't need it until time as it is
needed.

This patch removes vmap_block's dirty_list and codes related to it.

Signed-off-by: MinChan Kim <minchan.kim@gmail.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af58324c361..fab19876b4d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -671,10 +671,7 @@ struct vmap_block {
 	DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
 	DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
 	union {
-		struct {
-			struct list_head free_list;
-			struct list_head dirty_list;
-		};
+		struct list_head free_list;
 		struct rcu_head rcu_head;
 	};
 };
@@ -741,7 +738,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
 	bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
 	bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
 	INIT_LIST_HEAD(&vb->free_list);
-	INIT_LIST_HEAD(&vb->dirty_list);
 
 	vb_idx = addr_to_vb_idx(va->va_start);
 	spin_lock(&vmap_block_tree_lock);
@@ -772,12 +768,7 @@ static void free_vmap_block(struct vmap_block *vb)
 	struct vmap_block *tmp;
 	unsigned long vb_idx;
 
-	spin_lock(&vb->vbq->lock);
-	if (!list_empty(&vb->free_list))
-		list_del(&vb->free_list);
-	if (!list_empty(&vb->dirty_list))
-		list_del(&vb->dirty_list);
-	spin_unlock(&vb->vbq->lock);
+	BUG_ON(!list_empty(&vb->free_list));
 
 	vb_idx = addr_to_vb_idx(vb->va->va_start);
 	spin_lock(&vmap_block_tree_lock);
@@ -862,11 +853,7 @@ static void vb_free(const void *addr, unsigned long size)
 
 	spin_lock(&vb->lock);
 	bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
-	if (!vb->dirty) {
-		spin_lock(&vb->vbq->lock);
-		list_add(&vb->dirty_list, &vb->vbq->dirty);
-		spin_unlock(&vb->vbq->lock);
-	}
+
 	vb->dirty += 1UL << order;
 	if (vb->dirty == VMAP_BBMAP_BITS) {
 		BUG_ON(vb->free || !list_empty(&vb->free_list));
-- 
cgit v1.2.3


From a12888f772dab4bf5e6f73668dc4f5f6026a7014 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 31 Mar 2009 15:19:27 -0700
Subject: oom_kill: don't call for int_sqrt(0)

There is no need to call for int_sqrt if argument is 0.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Christoph Lameter <cl@linux-foundation.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 40ba05061a4..d3b9bac085b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(zone_scan_lock);
 
 unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
-	unsigned long points, cpu_time, run_time, s;
+	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
 	struct task_struct *child;
 
@@ -110,12 +110,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	else
 		run_time = 0;
 
-	s = int_sqrt(cpu_time);
-	if (s)
-		points /= s;
-	s = int_sqrt(int_sqrt(run_time));
-	if (s)
-		points /= s;
+	if (cpu_time)
+		points /= int_sqrt(cpu_time);
+	if (run_time)
+		points /= int_sqrt(int_sqrt(run_time));
 
 	/*
 	 * Niced processes are most likely less important, so double
-- 
cgit v1.2.3


From 9de1581e75ba9d7979766d4ab6d56f0f2d87f7c6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 31 Mar 2009 15:19:29 -0700
Subject: get_mm_hiwater_xxx: trivial, s/define/inline/

Andrew pointed out get_mm_hiwater_xxx() evaluate "mm" argument thrice/twice,
make them inline.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Hugh Dickins <hugh@veritas.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 29df6374d2d..481fad3a9b4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -391,8 +391,15 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 		(mm)->hiwater_vm = (mm)->total_vm;	\
 } while (0)
 
-#define get_mm_hiwater_rss(mm)	max((mm)->hiwater_rss, get_mm_rss(mm))
-#define get_mm_hiwater_vm(mm)	max((mm)->hiwater_vm, (mm)->total_vm)
+static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
+{
+	return max(mm->hiwater_rss, get_mm_rss(mm));
+}
+
+static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
+{
+	return max(mm->hiwater_vm, mm->total_vm);
+}
 
 extern void set_dumpable(struct mm_struct *mm, int value);
 extern int get_dumpable(struct mm_struct *mm);
-- 
cgit v1.2.3


From a6dc60f8975ad96d162915e07703a4439c80dcf0 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 31 Mar 2009 15:19:30 -0700
Subject: vmscan: rename sc.may_swap to may_unmap

sc.may_swap does not only influence reclaiming of anon pages but pages
mapped into pagetables in general, which also includes mapped file pages.

In shrink_page_list():

		if (!sc->may_swap && page_mapped(page))
			goto keep_locked;

For anon pages, this makes sense as they are always mapped and reclaiming
them always requires swapping.

But mapped file pages are skipped here as well and it has nothing to do
with swapping.

The real effect of the knob is whether mapped pages are unmapped and
reclaimed or not.  Rename it to `may_unmap' to have its name match its
actual meaning more precisely.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: MinChan Kim <minchan.kim@gmail.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 479e4671939..1bca60f0c52 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -60,8 +60,8 @@ struct scan_control {
 
 	int may_writepage;
 
-	/* Can pages be swapped as part of reclaim? */
-	int may_swap;
+	/* Can mapped pages be reclaimed? */
+	int may_unmap;
 
 	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
 	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
@@ -606,7 +606,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (unlikely(!page_evictable(page, NULL)))
 			goto cull_mlocked;
 
-		if (!sc->may_swap && page_mapped(page))
+		if (!sc->may_unmap && page_mapped(page))
 			goto keep_locked;
 
 		/* Double the slab pressure for mapped and swapcache pages */
@@ -1694,7 +1694,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.gfp_mask = gfp_mask,
 		.may_writepage = !laptop_mode,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
-		.may_swap = 1,
+		.may_unmap = 1,
 		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
@@ -1713,7 +1713,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 {
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
-		.may_swap = 1,
+		.may_unmap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = swappiness,
 		.order = 0,
@@ -1723,7 +1723,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 	struct zonelist *zonelist;
 
 	if (noswap)
-		sc.may_swap = 0;
+		sc.may_unmap = 0;
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1762,7 +1762,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
-		.may_swap = 1,
+		.may_unmap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
 		.order = order,
@@ -2110,7 +2110,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	struct reclaim_state reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
-		.may_swap = 0,
+		.may_unmap = 0,
 		.swap_cluster_max = nr_pages,
 		.may_writepage = 1,
 		.isolate_pages = isolate_pages_global,
@@ -2147,7 +2147,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 		/* Force reclaiming mapped pages in the passes #3 and #4 */
 		if (pass > 2)
-			sc.may_swap = 1;
+			sc.may_unmap = 1;
 
 		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
 			unsigned long nr_to_scan = nr_pages - ret;
@@ -2290,7 +2290,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	int priority;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
-- 
cgit v1.2.3


From ee99c71c59f897436ec65debb99372b3146f9985 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 31 Mar 2009 15:19:31 -0700
Subject: mm: introduce for_each_populated_zone() macro

Impact: cleanup

In almost cases, for_each_zone() is used with populated_zone().  It's
because almost function doesn't need memoryless node information.
Therefore, for_each_populated_zone() can help to make code simplify.

This patch has no functional change.

[akpm@linux-foundation.org: small cleanup]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h  |  8 ++++++++
 kernel/power/snapshot.c |  9 +++------
 kernel/power/swsusp.c   | 17 ++++++++---------
 mm/page_alloc.c         | 26 +++++---------------------
 mm/vmscan.c             |  4 +---
 mm/vmstat.c             | 11 ++---------
 6 files changed, 27 insertions(+), 48 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1aca6cebbb7..26ef24076b7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -806,6 +806,14 @@ extern struct zone *next_zone(struct zone *zone);
 	     zone;					\
 	     zone = next_zone(zone))
 
+#define for_each_populated_zone(zone)		        \
+	for (zone = (first_online_pgdat())->node_zones; \
+	     zone;					\
+	     zone = next_zone(zone))			\
+		if (!populated_zone(zone))		\
+			; /* do nothing */		\
+		else
+
 static inline struct zone *zonelist_zone(struct zoneref *zoneref)
 {
 	return zoneref->zone;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f5fc2d7680f..33e2e4a819f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
 
 	INIT_LIST_HEAD(list);
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		unsigned long zone_start, zone_end;
 		struct mem_extent *ext, *cur, *aux;
 
-		if (!populated_zone(zone))
-			continue;
-
 		zone_start = zone->zone_start_pfn;
 		zone_end = zone->zone_start_pfn + zone->spanned_pages;
 
@@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void)
 	struct zone *zone;
 	unsigned int cnt = 0;
 
-	for_each_zone(zone)
-		if (populated_zone(zone) && is_highmem(zone))
+	for_each_populated_zone(zone)
+		if (is_highmem(zone))
 			cnt += zone_page_state(zone, NR_FREE_PAGES);
 
 	return cnt;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index a92c9145155..1ee6636414b 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -229,17 +229,16 @@ int swsusp_shrink_memory(void)
 		size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
 		tmp = size;
 		size += highmem_size;
-		for_each_zone (zone)
-			if (populated_zone(zone)) {
-				tmp += snapshot_additional_pages(zone);
-				if (is_highmem(zone)) {
-					highmem_size -=
+		for_each_populated_zone(zone) {
+			tmp += snapshot_additional_pages(zone);
+			if (is_highmem(zone)) {
+				highmem_size -=
 					zone_page_state(zone, NR_FREE_PAGES);
-				} else {
-					tmp -= zone_page_state(zone, NR_FREE_PAGES);
-					tmp += zone->lowmem_reserve[ZONE_NORMAL];
-				}
+			} else {
+				tmp -= zone_page_state(zone, NR_FREE_PAGES);
+				tmp += zone->lowmem_reserve[ZONE_NORMAL];
 			}
+		}
 
 		if (highmem_size < 0)
 			highmem_size = 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a3803ea8c27..cbd532161f6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -922,13 +922,10 @@ static void drain_pages(unsigned int cpu)
 	unsigned long flags;
 	struct zone *zone;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		struct per_cpu_pageset *pset;
 		struct per_cpu_pages *pcp;
 
-		if (!populated_zone(zone))
-			continue;
-
 		pset = zone_pcp(zone, cpu);
 
 		pcp = &pset->pcp;
@@ -1879,10 +1876,7 @@ void show_free_areas(void)
 	int cpu;
 	struct zone *zone;
 
-	for_each_zone(zone) {
-		if (!populated_zone(zone))
-			continue;
-
+	for_each_populated_zone(zone) {
 		show_node(zone);
 		printk("%s per-cpu:\n", zone->name);
 
@@ -1922,12 +1916,9 @@ void show_free_areas(void)
 		global_page_state(NR_PAGETABLE),
 		global_page_state(NR_BOUNCE));
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		int i;
 
-		if (!populated_zone(zone))
-			continue;
-
 		show_node(zone);
 		printk("%s"
 			" free:%lukB"
@@ -1967,12 +1958,9 @@ void show_free_areas(void)
 		printk("\n");
 	}
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
  		unsigned long nr[MAX_ORDER], flags, order, total = 0;
 
-		if (!populated_zone(zone))
-			continue;
-
 		show_node(zone);
 		printk("%s: ", zone->name);
 
@@ -2784,11 +2772,7 @@ static int __cpuinit process_zones(int cpu)
 
 	node_set_state(node, N_CPU);	/* this node has a cpu */
 
-	for_each_zone(zone) {
-
-		if (!populated_zone(zone))
-			continue;
-
+	for_each_populated_zone(zone) {
 		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
 					 GFP_KERNEL, node);
 		if (!zone_pcp(zone, cpu))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1bca60f0c52..301f057fd11 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2061,11 +2061,9 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 	struct zone *zone;
 	unsigned long ret = 0;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		enum lru_list l;
 
-		if (!populated_zone(zone))
-			continue;
 		if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
 			continue;
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8cd81ea1ddc..9826766f127 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -135,11 +135,7 @@ static void refresh_zone_stat_thresholds(void)
 	int cpu;
 	int threshold;
 
-	for_each_zone(zone) {
-
-		if (!zone->present_pages)
-			continue;
-
+	for_each_populated_zone(zone) {
 		threshold = calculate_threshold(zone);
 
 		for_each_online_cpu(cpu)
@@ -301,12 +297,9 @@ void refresh_cpu_vm_stats(int cpu)
 	int i;
 	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		struct per_cpu_pageset *p;
 
-		if (!populated_zone(zone))
-			continue;
-
 		p = zone_pcp(zone, cpu);
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-- 
cgit v1.2.3


From 0a0dd05dd7e1a800241888cbf515bf8d3dc2e59c Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 31 Mar 2009 15:19:33 -0700
Subject: mm: don't call mark_page_accessed() in do_swap_page()

commit bf3f3bc5e734706730c12a323f9b2068052aa1f0 (mm: don't
mark_page_accessed in fault path) only remove the mark_page_accessed() in
filemap_fault().

Therefore, swap-backed pages and file-backed pages have inconsistent
behavior.  mark_page_accessed() should be removed from do_swap_page().

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 2032ad2fc34..0017111214c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2435,8 +2435,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(PGMAJFAULT);
 	}
 
-	mark_page_accessed(page);
-
 	lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
-- 
cgit v1.2.3


From d979677c4c02f0a72db5a03ecd8184bd9d6695c8 Mon Sep 17 00:00:00 2001
From: MinChan Kim <minchan.kim@gmail.com>
Date: Tue, 31 Mar 2009 15:19:34 -0700
Subject: mm: shrink_all_memory(): use sc.nr_reclaimed

Commit a79311c14eae4bb946a97af25f3e1b17d625985d "vmscan: bail out of
direct reclaim after swap_cluster_max pages" moved the nr_reclaimed
counter into the scan control to accumulate the number of all reclaimed
pages in a reclaim invocation.

shrink_all_memory() can use the same mechanism. it increase code
consistency and redability.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: MinChan Kim <minchan.kim@gmail.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 301f057fd11..b15dcbb9e17 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2050,16 +2050,15 @@ unsigned long global_lru_pages(void)
 #ifdef CONFIG_PM
 /*
  * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
- * from LRU lists system-wide, for given pass and priority, and returns the
- * number of reclaimed pages
+ * from LRU lists system-wide, for given pass and priority.
  *
  * For pass > 3 we also try to shrink the LRU lists that contain a few pages
  */
-static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+static void shrink_all_zones(unsigned long nr_pages, int prio,
 				      int pass, struct scan_control *sc)
 {
 	struct zone *zone;
-	unsigned long ret = 0;
+	unsigned long nr_reclaimed = 0;
 
 	for_each_populated_zone(zone) {
 		enum lru_list l;
@@ -2082,14 +2081,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 
 				zone->lru[l].nr_scan = 0;
 				nr_to_scan = min(nr_pages, lru_pages);
-				ret += shrink_list(l, nr_to_scan, zone,
+				nr_reclaimed += shrink_list(l, nr_to_scan, zone,
 								sc, prio);
-				if (ret >= nr_pages)
-					return ret;
+				if (nr_reclaimed >= nr_pages) {
+					sc->nr_reclaimed = nr_reclaimed;
+					return;
+				}
 			}
 		}
 	}
-	return ret;
+	sc->nr_reclaimed = nr_reclaimed;
 }
 
 /*
@@ -2103,7 +2104,6 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 unsigned long shrink_all_memory(unsigned long nr_pages)
 {
 	unsigned long lru_pages, nr_slab;
-	unsigned long ret = 0;
 	int pass;
 	struct reclaim_state reclaim_state;
 	struct scan_control sc = {
@@ -2125,8 +2125,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 		if (!reclaim_state.reclaimed_slab)
 			break;
 
-		ret += reclaim_state.reclaimed_slab;
-		if (ret >= nr_pages)
+		sc.nr_reclaimed += reclaim_state.reclaimed_slab;
+		if (sc.nr_reclaimed >= nr_pages)
 			goto out;
 
 		nr_slab -= reclaim_state.reclaimed_slab;
@@ -2148,18 +2148,18 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 			sc.may_unmap = 1;
 
 		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
-			unsigned long nr_to_scan = nr_pages - ret;
+			unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
 
 			sc.nr_scanned = 0;
-			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
-			if (ret >= nr_pages)
+			shrink_all_zones(nr_to_scan, prio, pass, &sc);
+			if (sc.nr_reclaimed >= nr_pages)
 				goto out;
 
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(sc.nr_scanned, sc.gfp_mask,
 					global_lru_pages());
-			ret += reclaim_state.reclaimed_slab;
-			if (ret >= nr_pages)
+			sc.nr_reclaimed += reclaim_state.reclaimed_slab;
+			if (sc.nr_reclaimed >= nr_pages)
 				goto out;
 
 			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
@@ -2168,21 +2168,23 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	}
 
 	/*
-	 * If ret = 0, we could not shrink LRUs, but there may be something
-	 * in slab caches
+	 * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be
+	 * something in slab caches
 	 */
-	if (!ret) {
+	if (!sc.nr_reclaimed) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
-			ret += reclaim_state.reclaimed_slab;
-		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+			sc.nr_reclaimed += reclaim_state.reclaimed_slab;
+		} while (sc.nr_reclaimed < nr_pages &&
+				reclaim_state.reclaimed_slab > 0);
 	}
 
+
 out:
 	current->reclaim_state = NULL;
 
-	return ret;
+	return sc.nr_reclaimed;
 }
 #endif
 
-- 
cgit v1.2.3


From 9786bf841da57fac3457a1dac41acb4c1f2eced6 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 31 Mar 2009 15:19:35 -0700
Subject: vmscan: clip swap_cluster_max in shrink_all_memory()

shrink_inactive_list() scans in sc->swap_cluster_max chunks until it hits
the scan limit it was passed.

shrink_inactive_list()
{
	do {
		isolate_pages(swap_cluster_max)
		shrink_page_list()
	} while (nr_scanned < max_scan);
}

This assumes that swap_cluster_max is not bigger than the scan limit
because the latter is checked only after at least one iteration.

In shrink_all_memory() sc->swap_cluster_max is initialized to the overall
reclaim goal in the beginning but not decreased while reclaim is making
progress which leads to subsequent calls to shrink_inactive_list()
reclaiming way too much in the one iteration that is done unconditionally.

Set sc->swap_cluster_max always to the proper goal before doing
  shrink_all_zones()
    shrink_list()
      shrink_inactive_list().

While the current shrink_all_memory() happily reclaims more than actually
requested, this patch fixes it to never exceed the goal:

unpatched
   wanted=10000 reclaimed=13356
   wanted=10000 reclaimed=19711
   wanted=10000 reclaimed=10289
   wanted=10000 reclaimed=17306
   wanted=10000 reclaimed=10700
   wanted=10000 reclaimed=10004
   wanted=10000 reclaimed=13301
   wanted=10000 reclaimed=10976
   wanted=10000 reclaimed=10605
   wanted=10000 reclaimed=10088
   wanted=10000 reclaimed=15000

patched
   wanted=10000 reclaimed=10000
   wanted=10000 reclaimed=9599
   wanted=10000 reclaimed=8476
   wanted=10000 reclaimed=8326
   wanted=10000 reclaimed=10000
   wanted=10000 reclaimed=10000
   wanted=10000 reclaimed=9919
   wanted=10000 reclaimed=10000
   wanted=10000 reclaimed=10000
   wanted=10000 reclaimed=10000
   wanted=10000 reclaimed=10000
   wanted=10000 reclaimed=9624
   wanted=10000 reclaimed=10000
   wanted=10000 reclaimed=10000
   wanted=8500 reclaimed=8092
   wanted=316 reclaimed=316

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: MinChan Kim <minchan.kim@gmail.com>
Acked-by: Nigel Cunningham <ncunningham@crca.org.au>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index b15dcbb9e17..9578063cd94 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2109,7 +2109,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.may_unmap = 0,
-		.swap_cluster_max = nr_pages,
 		.may_writepage = 1,
 		.isolate_pages = isolate_pages_global,
 	};
@@ -2151,6 +2150,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 			unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
 
 			sc.nr_scanned = 0;
+			sc.swap_cluster_max = nr_to_scan;
 			shrink_all_zones(nr_to_scan, prio, pass, &sc);
 			if (sc.nr_reclaimed >= nr_pages)
 				goto out;
-- 
cgit v1.2.3


From bd775c42ea5f7c766d03a287083837cf05e7e738 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 31 Mar 2009 15:19:37 -0700
Subject: mm: add comment why mark_page_accessed() would be better than
 pte_mkyoung() in follow_page()

At first look, mark_page_accessed() in follow_page() seems a bit strange.
It seems pte_mkyoung() would be better consistent with other kernel code.

However, it is intentional. The commit log said:

    ------------------------------------------------
    commit 9e45f61d69be9024a2e6bef3831fb04d90fac7a8
    Author: akpm <akpm>
    Date:   Fri Aug 15 07:24:59 2003 +0000

    [PATCH] Use mark_page_accessed() in follow_page()

    Touching a page via follow_page() counts as a reference so we should be
    either setting the referenced bit in the pte or running mark_page_accessed().

    Altering the pte is tricky because we haven't implemented an atomic
    pte_mkyoung().  And mark_page_accessed() is better anyway because it has more
    aging state: it can move the page onto the active list.

    BKrev: 3f3c8acbplT8FbwBVGtth7QmnqWkIw
    ------------------------------------------------

The atomic issue is still true nowadays. adding comment help to understand
code intention and it would be better.

[akpm@linux-foundation.org: clarify text]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index 0017111214c..5b4ad5e4f98 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1151,6 +1151,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
 			set_page_dirty(page);
+		/*
+		 * pte_mkyoung() would be more correct here, but atomic care
+		 * is needed to avoid losing the dirty bit: it is easier to use
+		 * mark_page_accessed().
+		 */
 		mark_page_accessed(page);
 	}
 unlock:
-- 
cgit v1.2.3


From bd2f6199cf9af472aeefa1b642c9f504f19e6008 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 31 Mar 2009 15:19:38 -0700
Subject: vmscan: respect higher order in zone_reclaim()

During page allocation, there are two stages of direct reclaim that are
applied to each zone in the preferred list.  The first stage using
zone_reclaim() reclaims unmapped file backed pages and slab pages if over
defined limits as these are cheaper to reclaim.  The caller specifies the
order of the target allocation but the scan control is not being correctly
initialised.

The impact is that the correct number of pages are being reclaimed but
that lumpy reclaim is not being applied.  This increases the chances of a
full direct reclaim via try_to_free_pages() is required.

This patch initialises the order field of the scan control as requested by
the caller.

[mel@csn.ul.ie: rewrote changelog]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9578063cd94..51f2df04d7c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2295,6 +2295,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
+		.order = order,
 		.isolate_pages = isolate_pages_global,
 	};
 	unsigned long slab_reclaimable;
-- 
cgit v1.2.3


From e3a7cca1ef4c1af9b0acef9bd66eff6582a737b5 Mon Sep 17 00:00:00 2001
From: Edward Shishkin <edward.shishkin@gmail.com>
Date: Tue, 31 Mar 2009 15:19:39 -0700
Subject: vfs: add/use account_page_dirtied()

Add a helper function account_page_dirtied().  Use that from two
callsites.  reiser4 adds a function which adds a third callsite.

Signed-off-by: Edward Shishkin<edward.shishkin@gmail.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c         |  9 +--------
 include/linux/mm.h  |  1 +
 mm/page-writeback.c | 22 +++++++++++++++-------
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index a2fd743d97c..73abe6d8218 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -621,14 +621,7 @@ static void __set_page_dirty(struct page *page,
 	spin_lock_irq(&mapping->tree_lock);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
-
-		if (mapping_cap_account_dirty(mapping)) {
-			__inc_zone_page_state(page, NR_FILE_DIRTY);
-			__inc_bdi_stat(mapping->backing_dev_info,
-					BDI_RECLAIMABLE);
-			task_dirty_inc(current);
-			task_io_account_write(PAGE_CACHE_SIZE);
-		}
+		account_page_dirtied(page, mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b1ea37fc7a2..2223f8dfa56 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -834,6 +834,7 @@ int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
 int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
+void account_page_dirtied(struct page *page, struct address_space *mapping);
 int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 40ca7cdb653..6aa92b03c74 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1197,6 +1197,20 @@ int __set_page_dirty_no_writeback(struct page *page)
 	return 0;
 }
 
+/*
+ * Helper function for set_page_dirty family.
+ * NOTE: This relies on being atomic wrt interrupts.
+ */
+void account_page_dirtied(struct page *page, struct address_space *mapping)
+{
+	if (mapping_cap_account_dirty(mapping)) {
+		__inc_zone_page_state(page, NR_FILE_DIRTY);
+		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+		task_dirty_inc(current);
+		task_io_account_write(PAGE_CACHE_SIZE);
+	}
+}
+
 /*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
  * its radix tree.
@@ -1226,13 +1240,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 		if (mapping2) { /* Race with truncate? */
 			BUG_ON(mapping2 != mapping);
 			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-			if (mapping_cap_account_dirty(mapping)) {
-				__inc_zone_page_state(page, NR_FILE_DIRTY);
-				__inc_bdi_stat(mapping->backing_dev_info,
-						BDI_RECLAIMABLE);
-				task_dirty_inc(current);
-				task_io_account_write(PAGE_CACHE_SIZE);
-			}
+			account_page_dirtied(page, mapping);
 			radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 		}
-- 
cgit v1.2.3


From 8a0bdec194c21c8fdef840989d0d7b742bb5d4bc Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Tue, 31 Mar 2009 15:19:40 -0700
Subject: mm: fix SHM_HUGETLB to work with users in hugetlb_shm_group

Fix hugetlb subsystem so that non root users belonging to
hugetlb_shm_group can actually allocate hugetlb backed shm.

Currently non root users cannot even map one large page using SHM_HUGETLB
when they belong to the gid in /proc/sys/vm/hugetlb_shm_group.  This is
because allocation size is verified against RLIMIT_MEMLOCK resource limit
even if the user belongs to hugetlb_shm_group.

This patch
1. Fixes hugetlb subsystem so that users with CAP_IPC_LOCK and users
   belonging to hugetlb_shm_group don't need to be restricted with
   RLIMIT_MEMLOCK resource limits
2. This patch also disables mlock based rlimit checking (which will
   be reinstated and marked deprecated in a subsequent patch).

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Reviewed-by: Mel Gorman <mel@csn.ul.ie>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9b800d97a68..bc56df8ce00 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -943,9 +943,7 @@ static struct vfsmount *hugetlbfs_vfsmount;
 
 static int can_do_hugetlb_shm(void)
 {
-	return likely(capable(CAP_IPC_LOCK) ||
-			in_group_p(sysctl_hugetlb_shm_group) ||
-			can_do_mlock());
+	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
 struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
@@ -963,9 +961,6 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 	if (!can_do_hugetlb_shm())
 		return ERR_PTR(-EPERM);
 
-	if (!user_shm_lock(size, user))
-		return ERR_PTR(-ENOMEM);
-
 	root = hugetlbfs_vfsmount->mnt_root;
 	quick_string.name = name;
 	quick_string.len = strlen(quick_string.name);
@@ -1004,7 +999,6 @@ out_inode:
 out_dentry:
 	dput(dentry);
 out_shm_unlock:
-	user_shm_unlock(size, user);
 	return ERR_PTR(error);
 }
 
-- 
cgit v1.2.3


From 2584e517320bd48dc8d20e38a2621a2dbe58fade Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Tue, 31 Mar 2009 15:21:26 -0700
Subject: mm: reintroduce and deprecate rlimit based access for SHM_HUGETLB

Allow non root users with sufficient mlock rlimits to be able to allocate
hugetlb backed shm for now.  Deprecate this though.  This is being
deprecated because the mlock based rlimit checks for SHM_HUGETLB is not
consistent with mmap based huge page allocations.

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Reviewed-by: Mel Gorman <mel@csn.ul.ie>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/feature-removal-schedule.txt | 12 ++++++++++++
 fs/hugetlbfs/inode.c                       | 13 +++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 5e02b83ac12..ea7d1bdad34 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -311,6 +311,18 @@ Who:	Vlad Yasevich <vladislav.yasevich@hp.com>
 
 ---------------------------
 
+What:	Ability for non root users to shm_get hugetlb pages based on mlock
+	resource limits
+When:	2.6.31
+Why:	Non root users need to be part of /proc/sys/vm/hugetlb_shm_group or
+	have CAP_IPC_LOCK to be able to allocate shm segments backed by
+	huge pages.  The mlock based rlimit check to allow shm hugetlb is
+	inconsistent with mmap based allocations.  Hence it is being
+	deprecated.
+Who:	Ravikiran Thirumalai <kiran@scalex86.org>
+
+---------------------------
+
 What:	CONFIG_THERMAL_HWMON
 When:	January 2009
 Why:	This option was introduced just to allow older lm-sensors userspace
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index bc56df8ce00..23a3c76711e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -949,6 +949,7 @@ static int can_do_hugetlb_shm(void)
 struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 {
 	int error = -ENOMEM;
+	int unlock_shm = 0;
 	struct file *file;
 	struct inode *inode;
 	struct dentry *dentry, *root;
@@ -958,8 +959,14 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 	if (!hugetlbfs_vfsmount)
 		return ERR_PTR(-ENOENT);
 
-	if (!can_do_hugetlb_shm())
-		return ERR_PTR(-EPERM);
+	if (!can_do_hugetlb_shm()) {
+		if (user_shm_lock(size, user)) {
+			unlock_shm = 1;
+			WARN_ONCE(1,
+			  "Using mlock ulimits for SHM_HUGETLB deprecated\n");
+		} else
+			return ERR_PTR(-EPERM);
+	}
 
 	root = hugetlbfs_vfsmount->mnt_root;
 	quick_string.name = name;
@@ -999,6 +1006,8 @@ out_inode:
 out_dentry:
 	dput(dentry);
 out_shm_unlock:
+	if (unlock_shm)
+		user_shm_unlock(size, user);
 	return ERR_PTR(error);
 }
 
-- 
cgit v1.2.3


From 2443462b0a04ef0f82ad48f4fd0ef4ac5b24c4b7 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 31 Mar 2009 15:23:12 -0700
Subject: mm: move pagevec stripping to save unlock-relock

In shrink_active_list() after the deactivation loop, we strip buffer heads
from the potentially remaining pages in the pagevec.

Currently, this drops the zone's lru lock for stripping, only to reacquire
it again afterwards to update statistics.

It is not necessary to strip the pages before updating the stats, so move
the whole thing out of the protected region and save the extra locking.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: MinChan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 51f2df04d7c..988aef93301 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1298,14 +1298,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	}
 	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 	pgdeactivate += pgmoved;
-	if (buffer_heads_over_limit) {
-		spin_unlock_irq(&zone->lru_lock);
-		pagevec_strip(&pvec);
-		spin_lock_irq(&zone->lru_lock);
-	}
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	__count_vm_events(PGDEACTIVATE, pgdeactivate);
 	spin_unlock_irq(&zone->lru_lock);
+	if (buffer_heads_over_limit)
+		pagevec_strip(&pvec);
 	if (vm_swap_full())
 		pagevec_swap_free(&pvec);
 
-- 
cgit v1.2.3


From ad1c3544d0a85da7738ce8cff6f8a148da57935c Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 31 Mar 2009 15:23:13 -0700
Subject: mm: don't free swap slots on page deactivation

The pagevec_swap_free() at the end of shrink_active_list() was introduced
in 68a22394 "vmscan: free swap space on swap-in/activation" when
shrink_active_list() was still rotating referenced active pages.

In 7e9cd48 "vmscan: fix pagecache reclaim referenced bit check" this was
changed, the rotating removed but the pagevec_swap_free() after the
rotation loop was forgotten, applying now to the pagevec of the
deactivation loop instead.

Now swap space is freed for deactivated pages.  And only for those that
happen to be on the pagevec after the deactivation loop.

Complete 7e9cd48 and remove the rest of the swap freeing.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 988aef93301..e70fae31e96 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1303,9 +1303,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	spin_unlock_irq(&zone->lru_lock);
 	if (buffer_heads_over_limit)
 		pagevec_strip(&pvec);
-	if (vm_swap_full())
-		pagevec_swap_free(&pvec);
-
 	pagevec_release(&pvec);
 }
 
-- 
cgit v1.2.3


From d1d7487173eab8352125cf6cc271940f24254bd4 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 31 Mar 2009 15:23:14 -0700
Subject: mm: remove pagevec_swap_free()

pagevec_swap_free() is now unused.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h |  1 -
 mm/swap.c               | 23 -----------------------
 2 files changed, 24 deletions(-)

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 7b2886fa7fd..bab82f4c571 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -24,7 +24,6 @@ void __pagevec_release(struct pagevec *pvec);
 void __pagevec_free(struct pagevec *pvec);
 void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
 void pagevec_strip(struct pagevec *pvec);
-void pagevec_swap_free(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned nr_pages);
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
diff --git a/mm/swap.c b/mm/swap.c
index 8adb9feb61e..6e83084c1f6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -456,29 +456,6 @@ void pagevec_strip(struct pagevec *pvec)
 	}
 }
 
-/**
- * pagevec_swap_free - try to free swap space from the pages in a pagevec
- * @pvec: pagevec with swapcache pages to free the swap space of
- *
- * The caller needs to hold an extra reference to each page and
- * not hold the page lock on the pages.  This function uses a
- * trylock on the page lock so it may not always free the swap
- * space associated with a page.
- */
-void pagevec_swap_free(struct pagevec *pvec)
-{
-	int i;
-
-	for (i = 0; i < pagevec_count(pvec); i++) {
-		struct page *page = pvec->pages[i];
-
-		if (PageSwapCache(page) && trylock_page(page)) {
-			try_to_free_swap(page);
-			unlock_page(page);
-		}
-	}
-}
-
 /**
  * pagevec_lookup - gang pagecache lookup
  * @pvec:	Where the resulting pages are placed
-- 
cgit v1.2.3


From e2f17d9459aeccf4e013e31cbd741d6b1858eec4 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 31 Mar 2009 15:23:15 -0700
Subject: hugetlb: chg cannot become less than 0

chg is unsigned, so it cannot be less than 0.

Also, since region_chg returns long, let vma_needs_reservation() forward
this to alloc_huge_page().  Store it as long as well.  all callers cast it
to long anyway.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Johannes Weiner <hannes@saeurebad.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 107da3d809a..28c655ba935 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -918,7 +918,7 @@ static void return_unused_surplus_pages(struct hstate *h,
  * an instantiated the change should be committed via vma_commit_reservation.
  * No action is required on failure.
  */
-static int vma_needs_reservation(struct hstate *h,
+static long vma_needs_reservation(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long addr)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
@@ -933,7 +933,7 @@ static int vma_needs_reservation(struct hstate *h,
 		return 1;
 
 	} else  {
-		int err;
+		long err;
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		struct resv_map *reservations = vma_resv_map(vma);
 
@@ -969,7 +969,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	struct page *page;
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
-	unsigned int chg;
+	long chg;
 
 	/*
 	 * Processes that did not create the mapping will have no reserves and
-- 
cgit v1.2.3


From 610a77e04a8d9fe8764dc484e2182fa251ce1cc2 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 31 Mar 2009 15:23:16 -0700
Subject: memdup_user(): introduce

I notice there are many places doing copy_from_user() which follows
kmalloc():

        dst = kmalloc(len, GFP_KERNEL);
        if (!dst)
                return -ENOMEM;
        if (copy_from_user(dst, src, len)) {
		kfree(dst);
		return -EFAULT
	}

memdup_user() is a wrapper of the above code.  With this new function, we
don't have to write 'len' twice, which can lead to typos/mistakes.  It
also produces smaller code and kernel text.

A quick grep shows 250+ places where memdup_user() *may* be used.  I'll
prepare a patchset to do this conversion.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string.h |  1 +
 mm/util.c              | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/include/linux/string.h b/include/linux/string.h
index d18fc198aa2..8852739f36d 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -12,6 +12,7 @@
 #include <linux/stddef.h>	/* for NULL */
 
 extern char *strndup_user(const char __user *, long);
+extern void *memdup_user(const void __user *, size_t);
 
 /*
  * Include machine specific inline routines
diff --git a/mm/util.c b/mm/util.c
index 37eaccdf305..7c122e49f76 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -69,6 +69,36 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
 }
 EXPORT_SYMBOL(kmemdup);
 
+/**
+ * memdup_user - duplicate memory region from user space
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Returns an ERR_PTR() on failure.
+ */
+void *memdup_user(const void __user *src, size_t len)
+{
+	void *p;
+
+	/*
+	 * Always use GFP_KERNEL, since copy_from_user() can sleep and
+	 * cause pagefault, which makes it pointless to use GFP_NOFS
+	 * or GFP_ATOMIC.
+	 */
+	p = kmalloc_track_caller(len, GFP_KERNEL);
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(p, src, len)) {
+		kfree(p);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return p;
+}
+EXPORT_SYMBOL(memdup_user);
+
 /**
  * __krealloc - like krealloc() but don't free @p.
  * @p: object to reallocate memory for.
-- 
cgit v1.2.3


From 6a11f75b6a17b5d9ac5025f8d048382fd1f47377 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 31 Mar 2009 15:23:17 -0700
Subject: generic debug pagealloc

CONFIG_DEBUG_PAGEALLOC is now supported by x86, powerpc, sparc64, and
s390.  This patch implements it for the rest of the architectures by
filling the pages with poison byte patterns after free_pages() and
verifying the poison patterns before alloc_pages().

This generic one cannot detect invalid page accesses immediately but
invalid read access may cause invalid dereference by poisoned memory and
invalid write access can be detected after a long delay.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/avr32/mm/fault.c            |  18 ------
 arch/powerpc/Kconfig             |   3 +
 arch/powerpc/Kconfig.debug       |   1 +
 arch/s390/Kconfig                |   3 +
 arch/s390/Kconfig.debug          |   1 +
 arch/sparc/Kconfig               |   3 +
 arch/sparc/Kconfig.debug         |   3 +-
 arch/x86/Kconfig                 |   3 +
 arch/x86/Kconfig.debug           |   1 +
 include/linux/mm_types.h         |   4 ++
 include/linux/page-debug-flags.h |  30 +++++++++
 include/linux/poison.h           |   3 +
 lib/Kconfig.debug                |   1 +
 mm/Kconfig.debug                 |  17 ++++++
 mm/Makefile                      |   1 +
 mm/debug-pagealloc.c             | 129 +++++++++++++++++++++++++++++++++++++++
 16 files changed, 202 insertions(+), 19 deletions(-)
 create mode 100644 include/linux/page-debug-flags.h
 create mode 100644 mm/Kconfig.debug
 create mode 100644 mm/debug-pagealloc.c

diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
index ce4e4296b95..62d4abbaa65 100644
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -250,21 +250,3 @@ asmlinkage void do_bus_error(unsigned long addr, int write_access,
 	dump_dtlb();
 	die("Bus Error", regs, SIGKILL);
 }
-
-/*
- * This functionality is currently not possible to implement because
- * we're using segmentation to ensure a fixed mapping of the kernel
- * virtual address space.
- *
- * It would be possible to implement this, but it would require us to
- * disable segmentation at startup and load the kernel mappings into
- * the TLB like any other pages. There will be lots of trickery to
- * avoid recursive invocation of the TLB miss handler, though...
- */
-#ifdef CONFIG_DEBUG_PAGEALLOC
-void kernel_map_pages(struct page *page, int numpages, int enable)
-{
-
-}
-EXPORT_SYMBOL(kernel_map_pages);
-#endif
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index ad6b1c084fe..45192dce65c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -228,6 +228,9 @@ config PPC_OF_PLATFORM_PCI
 	depends on PPC64 # not supported on 32 bits yet
 	default n
 
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+	def_bool y
+
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 22091bbfdc9..6aa0b5e087c 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -30,6 +30,7 @@ config DEBUG_STACK_USAGE
 config DEBUG_PAGEALLOC
         bool "Debug page memory allocations"
         depends on DEBUG_KERNEL && !HIBERNATION
+	depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC
         help
           Unmap pages from the kernel linear mapping after free_pages().
           This results in a large slowdown, but helps to find certain types
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 2a8af5e1634..dcb667c4375 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -72,6 +72,9 @@ config PGSTE
 config VIRT_CPU_ACCOUNTING
 	def_bool y
 
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+	def_bool y
+
 mainmenu "Linux Kernel Configuration"
 
 config S390
diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug
index 4599fa06bd8..7e297a3cde3 100644
--- a/arch/s390/Kconfig.debug
+++ b/arch/s390/Kconfig.debug
@@ -9,6 +9,7 @@ source "lib/Kconfig.debug"
 config DEBUG_PAGEALLOC
 	bool "Debug page memory allocations"
 	depends on DEBUG_KERNEL
+	depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	help
 	  Unmap pages from the kernel linear mapping after free_pages().
 	  This results in a slowdown, but helps to find certain types of
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index c3ea215334f..cc12cd48bbc 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -124,6 +124,9 @@ config ARCH_NO_VIRT_TO_BUS
 config OF
 	def_bool y
 
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+	def_bool y if SPARC64
+
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
diff --git a/arch/sparc/Kconfig.debug b/arch/sparc/Kconfig.debug
index b8a15e271bf..d001b42041a 100644
--- a/arch/sparc/Kconfig.debug
+++ b/arch/sparc/Kconfig.debug
@@ -24,7 +24,8 @@ config STACK_DEBUG
 
 config DEBUG_PAGEALLOC
 	bool "Debug page memory allocations"
-	depends on SPARC64 && DEBUG_KERNEL && !HIBERNATION
+	depends on DEBUG_KERNEL && !HIBERNATION
+	depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	help
 	  Unmap pages from the kernel linear mapping after free_pages().
 	  This results in a large slowdown, but helps to find certain types
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 45161b81631..748e50a1a15 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -165,6 +165,9 @@ config AUDIT_ARCH
 config ARCH_SUPPORTS_OPTIMIZED_INLINING
 	def_bool y
 
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+	def_bool y
+
 # Use the generic interrupt handling code in kernel/irq/:
 config GENERIC_HARDIRQS
 	bool
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index fdb45df608b..a345cb5447a 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -75,6 +75,7 @@ config DEBUG_STACK_USAGE
 config DEBUG_PAGEALLOC
 	bool "Debug page memory allocations"
 	depends on DEBUG_KERNEL
+	depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	---help---
 	  Unmap pages from the kernel linear mapping after free_pages().
 	  This results in a large slowdown, but helps to find certain types
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d84feb7bdbf..ddadb4defe0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -11,6 +11,7 @@
 #include <linux/rwsem.h>
 #include <linux/completion.h>
 #include <linux/cpumask.h>
+#include <linux/page-debug-flags.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -174,6 +175,9 @@ struct vm_area_struct {
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
+#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
+	unsigned long debug_flags;	/* Use atomic bitops on this */
+#endif
 };
 
 struct core_thread {
diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h
new file mode 100644
index 00000000000..b0638fd91e9
--- /dev/null
+++ b/include/linux/page-debug-flags.h
@@ -0,0 +1,30 @@
+#ifndef LINUX_PAGE_DEBUG_FLAGS_H
+#define  LINUX_PAGE_DEBUG_FLAGS_H
+
+/*
+ * page->debug_flags bits:
+ *
+ * PAGE_DEBUG_FLAG_POISON is set for poisoned pages. This is used to
+ * implement generic debug pagealloc feature. The pages are filled with
+ * poison patterns and set this flag after free_pages(). The poisoned
+ * pages are verified whether the patterns are not corrupted and clear
+ * the flag before alloc_pages().
+ */
+
+enum page_debug_flags {
+	PAGE_DEBUG_FLAG_POISON,		/* Page is poisoned */
+};
+
+/*
+ * Ensure that CONFIG_WANT_PAGE_DEBUG_FLAGS reliably
+ * gets turned off when no debug features are enabling it!
+ */
+
+#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
+#if !defined(CONFIG_PAGE_POISONING) \
+/* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */
+#error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features!
+#endif
+#endif /* CONFIG_WANT_PAGE_DEBUG_FLAGS */
+
+#endif /* LINUX_PAGE_DEBUG_FLAGS_H */
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 9f31683728f..6729f7dcd60 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -17,6 +17,9 @@
  */
 #define TIMER_ENTRY_STATIC	((void *) 0x74737461)
 
+/********** mm/debug-pagealloc.c **********/
+#define PAGE_POISON 0xaa
+
 /********** mm/slab.c **********/
 /*
  * Magic nums for obj red zoning.
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 58bfe7e8fab..9638d99644a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -796,6 +796,7 @@ config SYSCTL_SYSCALL_CHECK
 	  to properly maintain and use. This enables checks that help
 	  you to keep things correct.
 
+source mm/Kconfig.debug
 source kernel/trace/Kconfig
 
 config PROVIDE_OHCI1394_DMA_INIT
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
new file mode 100644
index 00000000000..c8d62d49a44
--- /dev/null
+++ b/mm/Kconfig.debug
@@ -0,0 +1,17 @@
+config WANT_PAGE_DEBUG_FLAGS
+	bool
+
+config PAGE_POISONING
+	bool "Debug page memory allocations"
+	depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
+	depends on !HIBERNATION
+	select DEBUG_PAGEALLOC
+	select WANT_PAGE_DEBUG_FLAGS
+	help
+	   Fill the pages with poison patterns after free_pages() and verify
+	   the patterns before alloc_pages(). This results in a large slowdown,
+	   but helps to find certain types of memory corruptions.
+
+	   This option cannot enalbe with hibernation. Otherwise, it will get
+	   wrong messages for memory corruption because the free pages are not
+	   saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 818569b68f4..ec73c68b601 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_FAILSLAB) += failslab.o
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
new file mode 100644
index 00000000000..a1e3324de2b
--- /dev/null
+++ b/mm/debug-pagealloc.c
@@ -0,0 +1,129 @@
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/page-debug-flags.h>
+#include <linux/poison.h>
+
+static inline void set_page_poison(struct page *page)
+{
+	__set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static inline void clear_page_poison(struct page *page)
+{
+	__clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static inline bool page_poison(struct page *page)
+{
+	return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static void poison_highpage(struct page *page)
+{
+	/*
+	 * Page poisoning for highmem pages is not implemented.
+	 *
+	 * This can be called from interrupt contexts.
+	 * So we need to create a new kmap_atomic slot for this
+	 * application and it will need interrupt protection.
+	 */
+}
+
+static void poison_page(struct page *page)
+{
+	void *addr;
+
+	if (PageHighMem(page)) {
+		poison_highpage(page);
+		return;
+	}
+	set_page_poison(page);
+	addr = page_address(page);
+	memset(addr, PAGE_POISON, PAGE_SIZE);
+}
+
+static void poison_pages(struct page *page, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++)
+		poison_page(page + i);
+}
+
+static bool single_bit_flip(unsigned char a, unsigned char b)
+{
+	unsigned char error = a ^ b;
+
+	return error && !(error & (error - 1));
+}
+
+static void check_poison_mem(unsigned char *mem, size_t bytes)
+{
+	unsigned char *start;
+	unsigned char *end;
+
+	for (start = mem; start < mem + bytes; start++) {
+		if (*start != PAGE_POISON)
+			break;
+	}
+	if (start == mem + bytes)
+		return;
+
+	for (end = mem + bytes - 1; end > start; end--) {
+		if (*end != PAGE_POISON)
+			break;
+	}
+
+	if (!printk_ratelimit())
+		return;
+	else if (start == end && single_bit_flip(*start, PAGE_POISON))
+		printk(KERN_ERR "pagealloc: single bit error\n");
+	else
+		printk(KERN_ERR "pagealloc: memory corruption\n");
+
+	print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
+			end - start + 1, 1);
+	dump_stack();
+}
+
+static void unpoison_highpage(struct page *page)
+{
+	/*
+	 * See comment in poison_highpage().
+	 * Highmem pages should not be poisoned for now
+	 */
+	BUG_ON(page_poison(page));
+}
+
+static void unpoison_page(struct page *page)
+{
+	if (PageHighMem(page)) {
+		unpoison_highpage(page);
+		return;
+	}
+	if (page_poison(page)) {
+		void *addr = page_address(page);
+
+		check_poison_mem(addr, PAGE_SIZE);
+		clear_page_poison(page);
+	}
+}
+
+static void unpoison_pages(struct page *page, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++)
+		unpoison_page(page + i);
+}
+
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	if (!debug_pagealloc_enabled)
+		return;
+
+	if (enable)
+		unpoison_pages(page, numpages);
+	else
+		poison_pages(page, numpages);
+}
-- 
cgit v1.2.3


From 704503d836042d4a4c7685b7036e7de0418fbc0f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:23:18 -0700
Subject: mm: fix proc_dointvec_userhz_jiffies "breakage"

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=9838

On i386, HZ=1000, jiffies_to_clock_t() converts time in a somewhat strange
way from the user's point of view:

	# echo 500 >/proc/sys/vm/dirty_writeback_centisecs
	# cat /proc/sys/vm/dirty_writeback_centisecs
	499

So, we have 5000 jiffies converted to only 499 clock ticks and reported
back.

TICK_NSEC = 999848
ACTHZ = 256039

Keeping in-kernel variable in units passed from userspace will fix issue
of course, but this probably won't be right for every sysctl.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/writeback.h |  4 ++--
 kernel/sysctl.c           |  2 +-
 mm/page-writeback.c       | 20 +++++++++++---------
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 7300ecdc480..93445477f86 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -109,8 +109,8 @@ extern int dirty_background_ratio;
 extern unsigned long dirty_background_bytes;
 extern int vm_dirty_ratio;
 extern unsigned long vm_dirty_bytes;
-extern int dirty_writeback_interval;
-extern int dirty_expire_interval;
+extern unsigned int dirty_writeback_interval;
+extern unsigned int dirty_expire_interval;
 extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c5ef44ff850..2e490a389dd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1010,7 +1010,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &dirty_expire_interval,
 		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_userhz_jiffies,
+		.proc_handler	= &proc_dointvec,
 	},
 	{
 		.ctl_name	= VM_NR_PDFLUSH_THREADS,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6aa92b03c74..30351f0063a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -92,14 +92,14 @@ int vm_dirty_ratio = 20;
 unsigned long vm_dirty_bytes;
 
 /*
- * The interval between `kupdate'-style writebacks, in jiffies
+ * The interval between `kupdate'-style writebacks
  */
-int dirty_writeback_interval = 5 * HZ;
+unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */
 
 /*
- * The longest number of jiffies for which data is allowed to remain dirty
+ * The longest time for which data is allowed to remain dirty
  */
-int dirty_expire_interval = 30 * HZ;
+unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */
 
 /*
  * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -770,9 +770,9 @@ static void wb_kupdate(unsigned long arg)
 
 	sync_supers();
 
-	oldest_jif = jiffies - dirty_expire_interval;
+	oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval);
 	start_jif = jiffies;
-	next_jif = start_jif + dirty_writeback_interval;
+	next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
 	nr_to_write = global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
@@ -801,9 +801,10 @@ static void wb_kupdate(unsigned long arg)
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, file, buffer, length, ppos);
 	if (dirty_writeback_interval)
-		mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+		mod_timer(&wb_timer, jiffies +
+			msecs_to_jiffies(dirty_writeback_interval * 10));
 	else
 		del_timer(&wb_timer);
 	return 0;
@@ -905,7 +906,8 @@ void __init page_writeback_init(void)
 {
 	int shift;
 
-	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+	mod_timer(&wb_timer,
+		  jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 
-- 
cgit v1.2.3


From c2fdf3a9b2d52842808a8e551b53b55dd9b45030 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 31 Mar 2009 15:23:19 -0700
Subject: mm: enable hashdist by default on 64bit NUMA

On PowerPC we allocate large boot time hashes on node 0.  This leads to an
imbalance in the free memory, for example on a 64GB box (4 x 16GB nodes):

Free memory:
Node 0: 97.03%
Node 1: 98.54%
Node 2: 98.42%
Node 3: 98.53%

If we switch to using vmalloc (like ia64 and x86-64) things are more
balanced:

Free memory:
Node 0: 97.53%
Node 1: 98.35%
Node 2: 98.33%
Node 3: 98.33%

For many HPC applications we are limited by the free available memory on
the smallest node, so even though the same amount of memory is used the
better balancing helps.

Since all 64bit NUMA capable architectures should have sufficient vmalloc
space, it makes sense to enable it via CONFIG_64BIT.

Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Richard Henderson <rth@twiddle.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 455d83219fa..bc3ab707369 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -146,10 +146,10 @@ extern void *alloc_large_system_hash(const char *tablename,
 
 #define HASH_EARLY	0x00000001	/* Allocating during early boot? */
 
-/* Only NUMA needs hash distribution.
- * IA64 and x86_64 have sufficient vmalloc space.
+/* Only NUMA needs hash distribution. 64bit NUMA architectures have
+ * sufficient vmalloc space.
  */
-#if defined(CONFIG_NUMA) && (defined(CONFIG_IA64) || defined(CONFIG_X86_64))
+#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT)
 #define HASHDIST_DEFAULT 1
 #else
 #define HASHDIST_DEFAULT 0
-- 
cgit v1.2.3


From c2ec175c39f62949438354f603f4aa170846aabb Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 31 Mar 2009 15:23:21 -0700
Subject: mm: page_mkwrite change prototype to match fault

Change the page_mkwrite prototype to take a struct vm_fault, and return
VM_FAULT_xxx flags.  There should be no functional change.

This makes it possible to return much more detailed error information to
the VM (and also can provide more information eg.  virtual_address to the
driver, which might be important in some special cases).

This is required for a subsequent fix.  And will also make it easier to
merge page_mkwrite() with fault() in future.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Artem Bityutskiy <dedekind@infradead.org>
Cc: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/Locking |  2 +-
 drivers/video/fb_defio.c          |  3 ++-
 fs/btrfs/ctree.h                  |  2 +-
 fs/btrfs/inode.c                  |  5 ++++-
 fs/buffer.c                       |  6 +++++-
 fs/ext4/ext4.h                    |  2 +-
 fs/ext4/inode.c                   |  5 ++++-
 fs/fuse/file.c                    |  3 ++-
 fs/gfs2/ops_file.c                |  5 ++++-
 fs/nfs/file.c                     |  5 ++++-
 fs/ocfs2/mmap.c                   |  6 ++++--
 fs/ubifs/file.c                   |  9 ++++++---
 fs/xfs/linux-2.6/xfs_file.c       |  4 ++--
 include/linux/buffer_head.h       |  2 +-
 include/linux/mm.h                |  3 ++-
 mm/memory.c                       | 26 ++++++++++++++++++++++----
 16 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 4e78ce67784..76efe5b71d7 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -505,7 +505,7 @@ prototypes:
 	void (*open)(struct vm_area_struct*);
 	void (*close)(struct vm_area_struct*);
 	int (*fault)(struct vm_area_struct*, struct vm_fault *);
-	int (*page_mkwrite)(struct vm_area_struct *, struct page *);
+	int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
 	int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
 
 locking rules:
diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c
index 082026546ae..0a7a6679ee6 100644
--- a/drivers/video/fb_defio.c
+++ b/drivers/video/fb_defio.c
@@ -85,8 +85,9 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_fsync);
 
 /* vm_ops->page_mkwrite handler */
 static int fb_deferred_io_mkwrite(struct vm_area_struct *vma,
-				  struct page *page)
+				  struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct fb_info *info = vma->vm_private_data;
 	struct fb_deferred_io *fbdefio = info->fbdefio;
 	struct page *cur;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e1d4e30e9d..7dd1b6d0bf3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2060,7 +2060,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index);
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d4f948bc22..ec5423790bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4292,8 +4292,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
  */
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = fdentry(vma->vm_file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -4362,6 +4363,8 @@ again:
 out_unlock:
 	unlock_page(page);
 out:
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 73abe6d8218..6d51a3da362 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2313,9 +2313,10 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
  * unlock the page.
  */
 int
-block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 		   get_block_t get_block)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
@@ -2340,6 +2341,9 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
 		ret = block_commit_write(page, 0, end);
 
 out_unlock:
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
+
 	unlock_page(page);
 	return ret;
 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6083bb38057..990c9400092 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1098,7 +1098,7 @@ extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
-extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t ext4_get_reserved_space(struct inode *inode);
 
 /* ioctl.c */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71d3ecd5db7..dd82ff39006 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5146,8 +5146,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
 	return !buffer_mapped(bh);
 }
 
-int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	loff_t size;
 	unsigned long len;
 	int ret = -EINVAL;
@@ -5199,6 +5200,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		goto out_unlock;
 	ret = 0;
 out_unlock:
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	up_read(&inode->i_alloc_sem);
 	return ret;
 }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 821d10f719b..4e340fedf76 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1234,8 +1234,9 @@ static void fuse_vma_close(struct vm_area_struct *vma)
  * - sync(2)
  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
  */
-static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	/*
 	 * Don't use page->mapping as it may become NULL from a
 	 * concurrent truncate.
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3b9e8de3500..70b9b854894 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page)
  * blocks allocated on disk to back that page.
  */
 
-static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -412,6 +413,8 @@ out_unlock:
 	gfs2_glock_dq(&gh);
 out:
 	gfs2_holder_uninit(&gh);
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 90f292b520d..cec79392e4b 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -451,8 +451,9 @@ const struct address_space_operations nfs_file_aops = {
 	.launder_page = nfs_launder_page,
 };
 
-static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct file *filp = vma->vm_file;
 	struct dentry *dentry = filp->f_path.dentry;
 	unsigned pagelen;
@@ -483,6 +484,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		ret = pagelen;
 out_unlock:
 	unlock_page(page);
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index eea1d24713e..b606496b72e 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -154,8 +154,9 @@ out:
 	return ret;
 }
 
-static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct buffer_head *di_bh = NULL;
 	sigset_t blocked, oldset;
@@ -196,7 +197,8 @@ out:
 	ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
 	if (ret2 < 0)
 		mlog_errno(ret2);
-
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 93b6de51f26..0ff89fe71e5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
  * mmap()d file has taken write protection fault and is being made
  * writable. UBIFS must ensure page is budgeted for.
  */
-static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct timespec now = ubifs_current_time(inode);
@@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
 
 	if (unlikely(c->ro_media))
-		return -EROFS;
+		return VM_FAULT_SIGBUS; /* -EROFS */
 
 	/*
 	 * We have not locked @page so far so we may budget for changing the
@@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		if (err == -ENOSPC)
 			ubifs_warn("out of space for mmapped file "
 				   "(inode number %lu)", inode->i_ino);
-		return err;
+		return VM_FAULT_SIGBUS;
 	}
 
 	lock_page(page);
@@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 out_unlock:
 	unlock_page(page);
 	ubifs_release_budget(c, &req);
+	if (err)
+		err = VM_FAULT_SIGBUS;
 	return err;
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e14c4e3aea0..f4e25544157 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -234,9 +234,9 @@ xfs_file_mmap(
 STATIC int
 xfs_vm_page_mkwrite(
 	struct vm_area_struct	*vma,
-	struct page		*page)
+	struct vm_fault		*vmf)
 {
-	return block_page_mkwrite(vma, page, xfs_get_blocks);
+	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
 }
 
 const struct file_operations xfs_file_operations = {
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index f19fd9045ea..3d7bcde2e33 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -216,7 +216,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t,
 			get_block_t *, loff_t *);
 int generic_cont_expand_simple(struct inode *inode, loff_t size);
 int block_commit_write(struct page *page, unsigned from, unsigned to);
-int block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 				get_block_t get_block);
 void block_sync_page(struct page *);
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2223f8dfa56..aeabe953ba4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -135,6 +135,7 @@ extern pgprot_t protection_map[16];
 
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
+#define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
 
 /*
  * This interface is used by x86 PAT code to identify a pfn mapping that is
@@ -187,7 +188,7 @@ struct vm_operations_struct {
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
-	int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);
+	int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
 
 	/* called by access_process_vm when get_user_pages() fails, typically
 	 * for use by special VMAs that can switch between memory and hardware
diff --git a/mm/memory.c b/mm/memory.c
index 5b4ad5e4f98..cf6873e91c6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1945,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * get_user_pages(.write=1, .force=1).
 		 */
 		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+			struct vm_fault vmf;
+			int tmp;
+
+			vmf.virtual_address = (void __user *)(address &
+								PAGE_MASK);
+			vmf.pgoff = old_page->index;
+			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+			vmf.page = old_page;
+
 			/*
 			 * Notify the address space that the page is about to
 			 * become writable so that it can prohibit this or wait
@@ -1956,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			page_cache_get(old_page);
 			pte_unmap_unlock(page_table, ptl);
 
-			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+			if (unlikely(tmp &
+					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+				ret = tmp;
 				goto unwritable_page;
+			}
 
 			/*
 			 * Since we dropped the lock we need to revalidate
@@ -2106,7 +2119,7 @@ oom:
 
 unwritable_page:
 	page_cache_release(old_page);
-	return VM_FAULT_SIGBUS;
+	return ret;
 }
 
 /*
@@ -2648,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			 * to become writable
 			 */
 			if (vma->vm_ops->page_mkwrite) {
+				int tmp;
+
 				unlock_page(page);
-				if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
-					ret = VM_FAULT_SIGBUS;
+				vmf.flags |= FAULT_FLAG_MKWRITE;
+				tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+				if (unlikely(tmp &
+					  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+					ret = tmp;
 					anon = 1; /* no anon but release vmf.page */
 					goto out_unlocked;
 				}
-- 
cgit v1.2.3


From 56a76f8275c379ed73c8a43cfa1dfa2f5e9cfa19 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 31 Mar 2009 15:23:23 -0700
Subject: fs: fix page_mkwrite error cases in core code and btrfs

page_mkwrite is called with neither the page lock nor the ptl held.  This
means a page can be concurrently truncated or invalidated out from
underneath it.  Callers are supposed to prevent truncate races themselves,
however previously the only thing they can do in case they hit one is to
raise a SIGBUS.  A sigbus is wrong for the case that the page has been
invalidated or truncated within i_size (eg.  hole punched).  Callers may
also have to perform memory allocations in this path, where again, SIGBUS
would be wrong.

The previous patch ("mm: page_mkwrite change prototype to match fault")
made it possible to properly specify errors.  Convert the generic buffer.c
code and btrfs to return sane error values (in the case of page removed
from pagecache, VM_FAULT_NOPAGE will cause the fault handler to exit
without doing anything, and the fault will be retried properly).

This fixes core code, and converts btrfs as a template/example.  All other
filesystems defining their own page_mkwrite should be fixed in a similar
manner.

Acked-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/inode.c | 11 +++++++----
 fs/buffer.c      | 12 ++++++++----
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ec5423790bb..17e608c4dc7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4307,10 +4307,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_end;
 
 	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
-	if (ret)
+	if (ret) {
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else /* -ENOSPC, -EIO, etc */
+			ret = VM_FAULT_SIGBUS;
 		goto out;
+	}
 
-	ret = -EINVAL;
+	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
 	lock_page(page);
 	size = i_size_read(inode);
@@ -4363,8 +4368,6 @@ again:
 out_unlock:
 	unlock_page(page);
 out:
-	if (ret)
-		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 6d51a3da362..0c14f8d52ee 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2320,7 +2320,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
-	int ret = -EINVAL;
+	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 
 	lock_page(page);
 	size = i_size_read(inode);
@@ -2340,10 +2340,14 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
-out_unlock:
-	if (ret)
-		ret = VM_FAULT_SIGBUS;
+	if (unlikely(ret)) {
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else /* -ENOSPC, -EIO, etc */
+			ret = VM_FAULT_SIGBUS;
+	}
 
+out_unlock:
 	unlock_page(page);
 	return ret;
 }
-- 
cgit v1.2.3


From 851a039cc547b33b8139fe6d7c2bbfb158e2724e Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 31 Mar 2009 15:23:24 -0700
Subject: mm: page_mkwrite change prototype to match fault: fix sysfs

Fix warnings and return values in sysfs bin_page_mkwrite(), fixing
fs/sysfs/bin.c: In function `bin_page_mkwrite':
fs/sysfs/bin.c:250: warning: passing argument 2 of `bb->vm_ops->page_mkwrite' from incompatible pointer type
fs/sysfs/bin.c: At top level:
fs/sysfs/bin.c:280: warning: initialization from incompatible pointer type

Expects to have my [PATCH next] sysfs: fix some bin_vm_ops errors

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/sysfs/bin.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 07703d3ff4a..93e0c0281d4 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -234,7 +234,7 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return ret;
 }
 
-static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct file *file = vma->vm_file;
 	struct bin_buffer *bb = file->private_data;
@@ -242,15 +242,15 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	int ret;
 
 	if (!bb->vm_ops)
-		return -EINVAL;
+		return VM_FAULT_SIGBUS;
 
 	if (!bb->vm_ops->page_mkwrite)
 		return 0;
 
 	if (!sysfs_get_active_two(attr_sd))
-		return -EINVAL;
+		return VM_FAULT_SIGBUS;
 
-	ret = bb->vm_ops->page_mkwrite(vma, page);
+	ret = bb->vm_ops->page_mkwrite(vma, vmf);
 
 	sysfs_put_active_two(attr_sd);
 	return ret;
-- 
cgit v1.2.3


From f4112de6b679d84bd9b9681c7504be7bdfb7c7d5 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 31 Mar 2009 15:23:25 -0700
Subject: mm: introduce debug_kmap_atomic

x86 has debug_kmap_atomic_prot() which is error checking function for
kmap_atomic.  It is usefull for the other architectures, although it needs
CONFIG_TRACE_IRQFLAGS_SUPPORT.

This patch exposes it to the other architectures.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/highmem_32.c | 45 +--------------------------------------------
 include/linux/highmem.h  | 12 ++++++++++++
 mm/highmem.c             | 45 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 44 deletions(-)

diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 522db5e3d0b..8126e8d1a2a 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -19,49 +19,6 @@ void kunmap(struct page *page)
 	kunmap_high(page);
 }
 
-static void debug_kmap_atomic_prot(enum km_type type)
-{
-#ifdef CONFIG_DEBUG_HIGHMEM
-	static unsigned warn_count = 10;
-
-	if (unlikely(warn_count == 0))
-		return;
-
-	if (unlikely(in_interrupt())) {
-		if (in_irq()) {
-			if (type != KM_IRQ0 && type != KM_IRQ1 &&
-			    type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
-			    type != KM_BOUNCE_READ) {
-				WARN_ON(1);
-				warn_count--;
-			}
-		} else if (!irqs_disabled()) {	/* softirq */
-			if (type != KM_IRQ0 && type != KM_IRQ1 &&
-			    type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
-			    type != KM_SKB_SUNRPC_DATA &&
-			    type != KM_SKB_DATA_SOFTIRQ &&
-			    type != KM_BOUNCE_READ) {
-				WARN_ON(1);
-				warn_count--;
-			}
-		}
-	}
-
-	if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
-			type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
-		if (!irqs_disabled()) {
-			WARN_ON(1);
-			warn_count--;
-		}
-	} else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
-		if (irq_count() == 0 && !irqs_disabled()) {
-			WARN_ON(1);
-			warn_count--;
-		}
-	}
-#endif
-}
-
 /*
  * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
  * no global lock is needed and because the kmap code must perform a global TLB
@@ -81,7 +38,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 	if (!PageHighMem(page))
 		return page_address(page);
 
-	debug_kmap_atomic_prot(type);
+	debug_kmap_atomic(type);
 
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 13875ce9112..7ff5c55f9b5 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -187,4 +187,16 @@ static inline void copy_highpage(struct page *to, struct page *from)
 	kunmap_atomic(vto, KM_USER1);
 }
 
+#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
+
+void debug_kmap_atomic(enum km_type type);
+
+#else
+
+static inline void debug_kmap_atomic(enum km_type type)
+{
+}
+
+#endif
+
 #endif /* _LINUX_HIGHMEM_H */
diff --git a/mm/highmem.c b/mm/highmem.c
index 910198037bf..68eb1d9b63f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -422,3 +422,48 @@ void __init page_address_init(void)
 }
 
 #endif	/* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+
+#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
+
+void debug_kmap_atomic(enum km_type type)
+{
+	static unsigned warn_count = 10;
+
+	if (unlikely(warn_count == 0))
+		return;
+
+	if (unlikely(in_interrupt())) {
+		if (in_irq()) {
+			if (type != KM_IRQ0 && type != KM_IRQ1 &&
+			    type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
+			    type != KM_BOUNCE_READ) {
+				WARN_ON(1);
+				warn_count--;
+			}
+		} else if (!irqs_disabled()) {	/* softirq */
+			if (type != KM_IRQ0 && type != KM_IRQ1 &&
+			    type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
+			    type != KM_SKB_SUNRPC_DATA &&
+			    type != KM_SKB_DATA_SOFTIRQ &&
+			    type != KM_BOUNCE_READ) {
+				WARN_ON(1);
+				warn_count--;
+			}
+		}
+	}
+
+	if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
+			type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+		if (!irqs_disabled()) {
+			WARN_ON(1);
+			warn_count--;
+		}
+	} else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
+		if (irq_count() == 0 && !irqs_disabled()) {
+			WARN_ON(1);
+			warn_count--;
+		}
+	}
+}
+
+#endif
-- 
cgit v1.2.3


From 7ca43e7564679604d86e9ed834e7bbcffd8a4a3f Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 31 Mar 2009 15:23:25 -0700
Subject: mm: use debug_kmap_atomic

Use debug_kmap_atomic in kmap_atomic, kmap_atomic_pfn, and
iomap_atomic_prot_pfn.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/mm/highmem.c             | 2 ++
 arch/powerpc/include/asm/highmem.h | 2 ++
 arch/sparc/mm/highmem.c            | 1 +
 arch/x86/mm/highmem_32.c           | 1 +
 arch/x86/mm/iomap_32.c             | 2 ++
 include/asm-frv/highmem.h          | 2 ++
 include/asm-mn10300/highmem.h      | 2 ++
 7 files changed, 12 insertions(+)

diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c
index 060d28dca8a..4481656d106 100644
--- a/arch/mips/mm/highmem.c
+++ b/arch/mips/mm/highmem.c
@@ -42,6 +42,7 @@ void *__kmap_atomic(struct page *page, enum km_type type)
 	if (!PageHighMem(page))
 		return page_address(page);
 
+	debug_kmap_atomic(type);
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 #ifdef CONFIG_DEBUG_HIGHMEM
@@ -88,6 +89,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
 
 	pagefault_disable();
 
+	debug_kmap_atomic(type);
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 	set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h
index 545028f8648..684a73f4324 100644
--- a/arch/powerpc/include/asm/highmem.h
+++ b/arch/powerpc/include/asm/highmem.h
@@ -24,6 +24,7 @@
 
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/highmem.h>
 #include <asm/kmap_types.h>
 #include <asm/tlbflush.h>
 #include <asm/page.h>
@@ -94,6 +95,7 @@ static inline void *kmap_atomic_prot(struct page *page, enum km_type type, pgpro
 	if (!PageHighMem(page))
 		return page_address(page);
 
+	debug_kmap_atomic(type);
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 #ifdef CONFIG_DEBUG_HIGHMEM
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c
index 752d0c9fb54..7916feba6e4 100644
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -39,6 +39,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
 	if (!PageHighMem(page))
 		return page_address(page);
 
+	debug_kmap_atomic(type);
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 8126e8d1a2a..5bc5d1688c1 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -40,6 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 
 	debug_kmap_atomic(type);
 
+	debug_kmap_atomic(type);
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 	BUG_ON(!pte_none(*(kmap_pte-idx)));
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 699c9b2895a..bff0c9032f8 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -19,6 +19,7 @@
 #include <asm/iomap.h>
 #include <asm/pat.h>
 #include <linux/module.h>
+#include <linux/highmem.h>
 
 int is_io_mapping_possible(resource_size_t base, unsigned long size)
 {
@@ -71,6 +72,7 @@ iounmap_atomic(void *kvaddr, enum km_type type)
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
 	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
 
+	debug_kmap_atomic(type);
 	/*
 	 * Force other mappings to Oops if they'll try to access this pte
 	 * without first remap it.  Keeping stale mappings around is a bad idea
diff --git a/include/asm-frv/highmem.h b/include/asm-frv/highmem.h
index 26cefcde5ce..68e4677fb9e 100644
--- a/include/asm-frv/highmem.h
+++ b/include/asm-frv/highmem.h
@@ -18,6 +18,7 @@
 #ifdef __KERNEL__
 
 #include <linux/init.h>
+#include <linux/highmem.h>
 #include <asm/mem-layout.h>
 #include <asm/spr-regs.h>
 #include <asm/mb-regs.h>
@@ -116,6 +117,7 @@ static inline void *kmap_atomic(struct page *page, enum km_type type)
 	unsigned long paddr;
 
 	pagefault_disable();
+	debug_kmap_atomic(type);
 	paddr = page_to_phys(page);
 
 	switch (type) {
diff --git a/include/asm-mn10300/highmem.h b/include/asm-mn10300/highmem.h
index 5256854c045..90f2abb04bf 100644
--- a/include/asm-mn10300/highmem.h
+++ b/include/asm-mn10300/highmem.h
@@ -16,6 +16,7 @@
 
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/highmem.h>
 #include <asm/kmap_types.h>
 #include <asm/pgtable.h>
 
@@ -77,6 +78,7 @@ static inline unsigned long kmap_atomic(struct page *page, enum km_type type)
 	if (page < highmem_start_page)
 		return page_address(page);
 
+	debug_kmap_atomic(type);
 	idx = type + KM_TYPE_NR * smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 #if HIGHMEM_DEBUG
-- 
cgit v1.2.3


From 33925b25d2c00a29664f1994ab350a9bff70f7a2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 31 Mar 2009 15:23:26 -0700
Subject: nommu: there is no mlock() for NOMMU, so don't provide the bits

The mlock() facility does not exist for NOMMU since all mappings are
effectively locked anyway, so we don't make the bits available when
they're not useful.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Enrik Berkhan <Enrik.Berkhan@ge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 20 +++++++++++++-------
 mm/Kconfig                 |  8 ++++++++
 mm/internal.h              |  8 +++++---
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 219a523ecdb..61df1779b2a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -96,6 +96,8 @@ enum pageflags {
 	PG_swapbacked,		/* Page is backed by RAM/swap */
 #ifdef CONFIG_UNEVICTABLE_LRU
 	PG_unevictable,		/* Page is "unevictable"  */
+#endif
+#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
 	PG_mlocked,		/* Page is vma mlocked */
 #endif
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
@@ -234,20 +236,20 @@ PAGEFLAG_FALSE(SwapCache)
 #ifdef CONFIG_UNEVICTABLE_LRU
 PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
 	TESTCLEARFLAG(Unevictable, unevictable)
+#else
+PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable)
+	SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable)
+	__CLEARPAGEFLAG_NOOP(Unevictable)
+#endif
 
+#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
 #define MLOCK_PAGES 1
 PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
 	TESTSCFLAG(Mlocked, mlocked)
-
 #else
-
 #define MLOCK_PAGES 0
 PAGEFLAG_FALSE(Mlocked)
 	SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked)
-
-PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable)
-	SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable)
-	__CLEARPAGEFLAG_NOOP(Unevictable)
 #endif
 
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
@@ -367,9 +369,13 @@ static inline void __ClearPageTail(struct page *page)
 
 #ifdef CONFIG_UNEVICTABLE_LRU
 #define __PG_UNEVICTABLE	(1 << PG_unevictable)
-#define __PG_MLOCKED		(1 << PG_mlocked)
 #else
 #define __PG_UNEVICTABLE	0
+#endif
+
+#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
+#define __PG_MLOCKED		(1 << PG_mlocked)
+#else
 #define __PG_MLOCKED		0
 #endif
 
diff --git a/mm/Kconfig b/mm/Kconfig
index a5b77811fdf..8c895973dfb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -214,5 +214,13 @@ config UNEVICTABLE_LRU
 	  will use one page flag and increase the code size a little,
 	  say Y unless you know what you are doing.
 
+config HAVE_MLOCK
+	bool
+	default y if MMU=y
+
+config HAVE_MLOCKED_PAGE_BIT
+	bool
+	default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y
+
 config MMU_NOTIFIER
 	bool
diff --git a/mm/internal.h b/mm/internal.h
index 478223b73a2..987bb03fbdd 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -63,6 +63,7 @@ static inline unsigned long page_order(struct page *page)
 	return page_private(page);
 }
 
+#ifdef CONFIG_HAVE_MLOCK
 extern long mlock_vma_pages_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end);
 extern void munlock_vma_pages_range(struct vm_area_struct *vma,
@@ -71,6 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
 {
 	munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
 }
+#endif
 
 #ifdef CONFIG_UNEVICTABLE_LRU
 /*
@@ -90,7 +92,7 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
 }
 #endif
 
-#ifdef CONFIG_UNEVICTABLE_LRU
+#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
 /*
  * Called only in fault path via page_evictable() for a new page
  * to determine if it's being mapped into a LOCKED vma.
@@ -165,7 +167,7 @@ static inline void free_page_mlock(struct page *page)
 	}
 }
 
-#else /* CONFIG_UNEVICTABLE_LRU */
+#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
 static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
 {
 	return 0;
@@ -175,7 +177,7 @@ static inline void mlock_vma_page(struct page *page) { }
 static inline void mlock_migrate_page(struct page *new, struct page *old) { }
 static inline void free_page_mlock(struct page *page) { }
 
-#endif /* CONFIG_UNEVICTABLE_LRU */
+#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
 
 /*
  * Return the mem_map entry representing the 'offset' subpage within
-- 
cgit v1.2.3


From 71aa653c6bfa6743d838342105ebc067145394e4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 31 Mar 2009 15:23:28 -0700
Subject: nommu: make CONFIG_UNEVICTABLE_LRU available when CONFIG_MMU=n

Make CONFIG_UNEVICTABLE_LRU available when CONFIG_MMU=n.  There's no logical
reason it shouldn't be available, and it can be used for ramfs.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Enrik Berkhan <Enrik.Berkhan@ge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 8c895973dfb..b53427ad30a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -206,7 +206,6 @@ config VIRT_TO_BUS
 config UNEVICTABLE_LRU
 	bool "Add LRU list to track non-evictable pages"
 	default y
-	depends on MMU
 	help
 	  Keeps unevictable pages off of the active and inactive pageout
 	  lists, so kswapd will not waste CPU time or have its balancing
-- 
cgit v1.2.3


From 88c3bd707c2552bcef93cc3724647903aece159d Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 31 Mar 2009 15:23:29 -0700
Subject: vmscan: print shrink_slab symbol name on negative shrinker objects

When a shrinker has a negative number of objects to delete, the symbol
name of the shrinker should be printed, not shrink_slab.  This also makes
the error message slightly more informative.

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index e70fae31e96..f4619c6cd59 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -214,8 +214,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 		do_div(delta, lru_pages + 1);
 		shrinker->nr += delta;
 		if (shrinker->nr < 0) {
-			printk(KERN_ERR "%s: nr=%ld\n",
-					__func__, shrinker->nr);
+			printk(KERN_ERR "shrink_slab: %pF negative objects to "
+			       "delete nr=%ld\n",
+			       shrinker->shrink, shrinker->nr);
 			shrinker->nr = max_pass;
 		}
 
-- 
cgit v1.2.3


From 2678958e1225f350806d90f211a3b475f64aee80 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 31 Mar 2009 15:23:29 -0700
Subject: ramfs-nommu: use generic lru cache

Instead of open-coding the lru-list-add pagevec batching when expanding a
file mapping from zero, defer to the appropriate page cache function that
also takes care of adding the page to the lru list.

This is cleaner, saves code and reduces the stack footprint by 16 words
worth of pagevec.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Nick Piggin <npiggin@suse.de>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.com>
Cc: MinChan Kim <minchan.kim@gmail.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ramfs/file-nommu.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 995ef1d6686..ebb2c417912 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -59,7 +59,6 @@ const struct inode_operations ramfs_file_inode_operations = {
  */
 int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 {
-	struct pagevec lru_pvec;
 	unsigned long npages, xpages, loop, limit;
 	struct page *pages;
 	unsigned order;
@@ -102,24 +101,20 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 	memset(data, 0, newsize);
 
 	/* attach all the pages to the inode's address space */
-	pagevec_init(&lru_pvec, 0);
 	for (loop = 0; loop < npages; loop++) {
 		struct page *page = pages + loop;
 
-		ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL);
+		ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
+					GFP_KERNEL);
 		if (ret < 0)
 			goto add_error;
 
-		if (!pagevec_add(&lru_pvec, page))
-			__pagevec_lru_add_file(&lru_pvec);
-
 		/* prevent the page from being discarded on memory pressure */
 		SetPageDirty(page);
 
 		unlock_page(page);
 	}
 
-	pagevec_lru_add_file(&lru_pvec);
 	return 0;
 
  fsize_exceeded:
@@ -128,10 +123,8 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 	return -EFBIG;
 
  add_error:
-	pagevec_lru_add_file(&lru_pvec);
-	page_cache_release(pages + loop);
-	for (loop++; loop < npages; loop++)
-		__free_page(pages + loop);
+	while (loop < npages)
+		__free_page(pages + loop++);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 327c0e968645f2601a43f5ea7c19c7b3a5fa0a34 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 31 Mar 2009 15:23:31 -0700
Subject: vmscan: fix it to take care of nodemask

try_to_free_pages() is used for the direct reclaim of up to
SWAP_CLUSTER_MAX pages when watermarks are low.  The caller to
alloc_pages_nodemask() can specify a nodemask of nodes that are allowed to
be used but this is not passed to try_to_free_pages().  This can lead to
unnecessary reclaim of pages that are unusable by the caller and int the
worst case lead to allocation failure as progress was not been make where
it is needed.

This patch passes the nodemask used for alloc_pages_nodemask() to
try_to_free_pages().

Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c          |  2 +-
 include/linux/swap.h |  2 +-
 mm/page_alloc.c      |  3 ++-
 mm/vmscan.c          | 13 +++++++++++--
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 0c14f8d52ee..c77b848c3d4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -290,7 +290,7 @@ static void free_more_memory(void)
 						&zone);
 		if (zone)
 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
-						GFP_NOFS);
+						GFP_NOFS, NULL);
 	}
 }
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d3021557887..b8b0c4ce83e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -212,7 +212,7 @@ static inline void lru_cache_add_active_file(struct page *page)
 
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-					gfp_t gfp_mask);
+					gfp_t gfp_mask, nodemask_t *mask);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 						  gfp_t gfp_mask, bool noswap,
 						  unsigned int swappiness);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cbd532161f6..0284e528748 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1582,7 +1582,8 @@ nofail_alloc:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
+	did_some_progress = try_to_free_pages(zonelist, order,
+						gfp_mask, nodemask);
 
 	p->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f4619c6cd59..06e72693b45 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -78,6 +78,12 @@ struct scan_control {
 	/* Which cgroup do we reclaim from */
 	struct mem_cgroup *mem_cgroup;
 
+	/*
+	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
+	 * are scanned.
+	 */
+	nodemask_t	*nodemask;
+
 	/* Pluggable isolate pages callback */
 	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
 			unsigned long *scanned, int order, int mode,
@@ -1538,7 +1544,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 	struct zone *zone;
 
 	sc->all_unreclaimable = 1;
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
+					sc->nodemask) {
 		if (!populated_zone(zone))
 			continue;
 		/*
@@ -1683,7 +1690,7 @@ out:
 }
 
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-								gfp_t gfp_mask)
+				gfp_t gfp_mask, nodemask_t *nodemask)
 {
 	struct scan_control sc = {
 		.gfp_mask = gfp_mask,
@@ -1694,6 +1701,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.order = order,
 		.mem_cgroup = NULL,
 		.isolate_pages = isolate_pages_global,
+		.nodemask = nodemask,
 	};
 
 	return do_try_to_free_pages(zonelist, &sc);
@@ -1714,6 +1722,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.order = 0,
 		.mem_cgroup = mem_cont,
 		.isolate_pages = mem_cgroup_isolate_pages,
+		.nodemask = NULL, /* we don't care the placement */
 	};
 	struct zonelist *zonelist;
 
-- 
cgit v1.2.3


From 9fab5619bdd7f84cdd22cc760778f759f9819a33 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 31 Mar 2009 15:23:33 -0700
Subject: shmem: writepage directly to swap

Synopsis: if shmem_writepage calls swap_writepage directly, most shmem
swap loads benefit, and a catastrophic interaction between SLUB and some
flash storage is avoided.

shmem_writepage() has always been peculiar in making no attempt to write:
it has just transferred a shmem page from file cache to swap cache, then
let that page make its way around the LRU again before being written and
freed.

The idea was that people use tmpfs because they want those pages to stay
in RAM; so although we give it an overflow to swap, we should resist
writing too soon, giving those pages a second chance before they can be
reclaimed.

That was always questionable, and I've toyed with this patch for years;
but never had a clear justification to depart from the original design.

It became more questionable in 2.6.28, when the split LRU patches classed
shmem and tmpfs pages as SwapBacked rather than as file_cache: that in
itself gives them more resistance to reclaim than normal file pages.  I
prepared this patch for 2.6.29, but the merge window arrived before I'd
completed gathering statistics to justify sending it in.

Then while comparing SLQB against SLUB, running SLUB on a laptop I'd
habitually used with SLAB, I found SLUB to run my tmpfs kbuild swapping
tests five times slower than SLAB or SLQB - other machines slower too, but
nowhere near so bad.  Simpler "cp -a" swapping tests showed the same.

slub_max_order=0 brings sanity to all, but heavy swapping is too far from
normal to justify such a tuning.  The crucial factor on that laptop turns
out to be that I'm using an SD card for swap.  What happens is this:

By default, SLUB uses order-2 pages for shmem_inode_cache (and many other
fs inodes), so creating tmpfs files under memory pressure brings lumpy
reclaim into play.  One subpage of the order is chosen from the bottom of
the LRU as usual, then the other three picked out from their random
positions on the LRUs.

In a tmpfs load, many of these pages will be ones which already passed
through shmem_writepage, so already have swap allocated.  And though their
offsets on swap were probably allocated sequentially, now that the pages
are picked off at random, their swap offsets are scattered.

But the flash storage on the SD card is very sensitive to having its
writes merged: once swap is written at scattered offsets, performance
falls apart.  Rotating disk seeks increase too, but less disastrously.

So: stop giving shmem/tmpfs pages a second pass around the LRU, write them
out to swap as soon as their swap has been allocated.

It's surely possible to devise an artificial load which runs faster the
old way, one whose sizing is such that the tmpfs pages on their second
pass are the ones that are wanted again, and other pages not.

But I've not yet found such a load: on all machines, under the loads I've
tried, immediate swap_writepage speeds up shmem swapping: especially when
using the SLUB allocator (and more effectively than slub_max_order=0), but
also with the others; and it also reduces the variance between runs.  How
much faster varies widely: a factor of five is rare, 5% is common.

One load which might have suffered: imagine a swapping shmem load in a
limited mem_cgroup on a machine with plenty of memory.  Before 2.6.29 the
swapcache was not charged, and such a load would have run quickest with
the shmem swapcache never written to swap.  But now swapcache is charged,
so even this load benefits from shmem_writepage directly to swap.

Apologies for the #ifndef CONFIG_SWAP swap_writepage() stub in swap.h:
it's silly because that will never get called; but refactoring shmem.c
sensibly according to CONFIG_SWAP will be a separate task.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 5 +++++
 mm/shmem.c           | 3 +--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index b8b0c4ce83e..62d81435347 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -382,6 +382,11 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 	return NULL;
 }
 
+static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+{
+	return 0;
+}
+
 static inline struct page *lookup_swap_cache(swp_entry_t swp)
 {
 	return NULL;
diff --git a/mm/shmem.c b/mm/shmem.c
index 7ec78e24a30..d94d2e9146b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1068,8 +1068,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		swap_duplicate(swap);
 		BUG_ON(page_mapped(page));
 		page_cache_release(page);	/* pagecache ref */
-		set_page_dirty(page);
-		unlock_page(page);
+		swap_writepage(page, wbc);
 		if (inode) {
 			mutex_lock(&shmem_swaplist_mutex);
 			/* move instead of add in case we're racing */
-- 
cgit v1.2.3


From 0b42afd0a3051952924ed373dcda61beb23fcb58 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 31 Mar 2009 15:23:34 -0700
Subject: alpha: fix macros

When this macros isn't called with 'fixup', e.g.  with foo this will
incorectly expand to foo->foo.bits.errreg

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Richard Henderson <rth@twiddle.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/uaccess.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h
index 22de3b434a2..163f3053001 100644
--- a/arch/alpha/include/asm/uaccess.h
+++ b/arch/alpha/include/asm/uaccess.h
@@ -498,13 +498,13 @@ struct exception_table_entry
 };
 
 /* Returns the new pc */
-#define fixup_exception(map_reg, fixup, pc)			\
+#define fixup_exception(map_reg, _fixup, pc)			\
 ({								\
-	if ((fixup)->fixup.bits.valreg != 31)			\
-		map_reg((fixup)->fixup.bits.valreg) = 0;	\
-	if ((fixup)->fixup.bits.errreg != 31)			\
-		map_reg((fixup)->fixup.bits.errreg) = -EFAULT;	\
-	(pc) + (fixup)->fixup.bits.nextinsn;			\
+	if ((_fixup)->fixup.bits.valreg != 31)			\
+		map_reg((_fixup)->fixup.bits.valreg) = 0;	\
+	if ((_fixup)->fixup.bits.errreg != 31)			\
+		map_reg((_fixup)->fixup.bits.errreg) = -EFAULT;	\
+	(pc) + (_fixup)->fixup.bits.nextinsn;			\
 })
 
 
-- 
cgit v1.2.3


From a94066992b3050a7bd9a82bf73bf19f6052d2f82 Mon Sep 17 00:00:00 2001
From: Cheng Renquan <crquan@gmail.com>
Date: Tue, 31 Mar 2009 15:23:35 -0700
Subject: MAINTAINERS: add the missing linux alpha port mailling list

Signed-off-by: Cheng Renquan <crquan@gmail.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index c5f4e9d27b6..a64f2920a8f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -357,6 +357,7 @@ S:	Odd Fixes for 2.4; Maintained for 2.6.
 P:	Ivan Kokshaysky
 M:	ink@jurassic.park.msu.ru
 S:	Maintained for 2.4; PCI support for 2.6.
+L:	linux-alpha@vger.kernel.org
 
 AMD GEODE CS5536 USB DEVICE CONTROLLER DRIVER
 P:	Thomas Dahlmann
-- 
cgit v1.2.3


From a6209d6d71f2ab8c63cc1587ef65490d83022baf Mon Sep 17 00:00:00 2001
From: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Date: Tue, 31 Mar 2009 15:23:35 -0700
Subject: alpha: xchg/cmpxchg cleanup and fixes

- "_local" versions of xchg/cmpxchg functions duplicate code
  of non-local ones (quite a few pages of assembler), except
  memory barriers. We can generate these two variants from a
  single header file using simple macros;

- convert xchg macro back to inline function using always_inline
  attribute;

- use proper argument types for cmpxchg_u8/u16 functions
  to fix a problem with negative arguments.

Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Richard Henderson <rth@twiddle.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/system.h | 547 ++++------------------------------------
 arch/alpha/include/asm/xchg.h   | 258 +++++++++++++++++++
 2 files changed, 308 insertions(+), 497 deletions(-)
 create mode 100644 arch/alpha/include/asm/xchg.h

diff --git a/arch/alpha/include/asm/system.h b/arch/alpha/include/asm/system.h
index afe20fa58c9..5aa40cca4f2 100644
--- a/arch/alpha/include/asm/system.h
+++ b/arch/alpha/include/asm/system.h
@@ -309,518 +309,71 @@ extern int __min_ipl;
 #define tbia()		__tbi(-2, /* no second argument */)
 
 /*
- * Atomic exchange.
- * Since it can be used to implement critical sections
- * it must clobber "memory" (also for interrupts in UP).
+ * Atomic exchange routines.
  */
 
-static inline unsigned long
-__xchg_u8(volatile char *m, unsigned long val)
-{
-	unsigned long ret, tmp, addr64;
-
-	__asm__ __volatile__(
-	"	andnot	%4,7,%3\n"
-	"	insbl	%1,%4,%1\n"
-	"1:	ldq_l	%2,0(%3)\n"
-	"	extbl	%2,%4,%0\n"
-	"	mskbl	%2,%4,%2\n"
-	"	or	%1,%2,%2\n"
-	"	stq_c	%2,0(%3)\n"
-	"	beq	%2,2f\n"
-#ifdef CONFIG_SMP
-	"	mb\n"
-#endif
-	".subsection 2\n"
-	"2:	br	1b\n"
-	".previous"
-	: "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
-	: "r" ((long)m), "1" (val) : "memory");
-
-	return ret;
-}
-
-static inline unsigned long
-__xchg_u16(volatile short *m, unsigned long val)
-{
-	unsigned long ret, tmp, addr64;
-
-	__asm__ __volatile__(
-	"	andnot	%4,7,%3\n"
-	"	inswl	%1,%4,%1\n"
-	"1:	ldq_l	%2,0(%3)\n"
-	"	extwl	%2,%4,%0\n"
-	"	mskwl	%2,%4,%2\n"
-	"	or	%1,%2,%2\n"
-	"	stq_c	%2,0(%3)\n"
-	"	beq	%2,2f\n"
-#ifdef CONFIG_SMP
-	"	mb\n"
-#endif
-	".subsection 2\n"
-	"2:	br	1b\n"
-	".previous"
-	: "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
-	: "r" ((long)m), "1" (val) : "memory");
-
-	return ret;
-}
-
-static inline unsigned long
-__xchg_u32(volatile int *m, unsigned long val)
-{
-	unsigned long dummy;
-
-	__asm__ __volatile__(
-	"1:	ldl_l %0,%4\n"
-	"	bis $31,%3,%1\n"
-	"	stl_c %1,%2\n"
-	"	beq %1,2f\n"
-#ifdef CONFIG_SMP
-	"	mb\n"
-#endif
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	: "=&r" (val), "=&r" (dummy), "=m" (*m)
-	: "rI" (val), "m" (*m) : "memory");
-
-	return val;
-}
-
-static inline unsigned long
-__xchg_u64(volatile long *m, unsigned long val)
-{
-	unsigned long dummy;
-
-	__asm__ __volatile__(
-	"1:	ldq_l %0,%4\n"
-	"	bis $31,%3,%1\n"
-	"	stq_c %1,%2\n"
-	"	beq %1,2f\n"
-#ifdef CONFIG_SMP
-	"	mb\n"
-#endif
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	: "=&r" (val), "=&r" (dummy), "=m" (*m)
-	: "rI" (val), "m" (*m) : "memory");
+#define __ASM__MB
+#define ____xchg(type, args...)		__xchg ## type ## _local(args)
+#define ____cmpxchg(type, args...)	__cmpxchg ## type ## _local(args)
+#include <asm/xchg.h>
 
-	return val;
-}
-
-/* This function doesn't exist, so you'll get a linker error
-   if something tries to do an invalid xchg().  */
-extern void __xchg_called_with_bad_pointer(void);
-
-#define __xchg(ptr, x, size) \
-({ \
-	unsigned long __xchg__res; \
-	volatile void *__xchg__ptr = (ptr); \
-	switch (size) { \
-		case 1: __xchg__res = __xchg_u8(__xchg__ptr, x); break; \
-		case 2: __xchg__res = __xchg_u16(__xchg__ptr, x); break; \
-		case 4: __xchg__res = __xchg_u32(__xchg__ptr, x); break; \
-		case 8: __xchg__res = __xchg_u64(__xchg__ptr, x); break; \
-		default: __xchg_called_with_bad_pointer(); __xchg__res = x; \
-	} \
-	__xchg__res; \
-})
-
-#define xchg(ptr,x)							     \
-  ({									     \
-     __typeof__(*(ptr)) _x_ = (x);					     \
-     (__typeof__(*(ptr))) __xchg((ptr), (unsigned long)_x_, sizeof(*(ptr))); \
+#define xchg_local(ptr,x)						\
+  ({									\
+     __typeof__(*(ptr)) _x_ = (x);					\
+     (__typeof__(*(ptr))) __xchg_local((ptr), (unsigned long)_x_,	\
+				       sizeof(*(ptr)));			\
   })
 
-static inline unsigned long
-__xchg_u8_local(volatile char *m, unsigned long val)
-{
-	unsigned long ret, tmp, addr64;
-
-	__asm__ __volatile__(
-	"	andnot	%4,7,%3\n"
-	"	insbl	%1,%4,%1\n"
-	"1:	ldq_l	%2,0(%3)\n"
-	"	extbl	%2,%4,%0\n"
-	"	mskbl	%2,%4,%2\n"
-	"	or	%1,%2,%2\n"
-	"	stq_c	%2,0(%3)\n"
-	"	beq	%2,2f\n"
-	".subsection 2\n"
-	"2:	br	1b\n"
-	".previous"
-	: "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
-	: "r" ((long)m), "1" (val) : "memory");
-
-	return ret;
-}
-
-static inline unsigned long
-__xchg_u16_local(volatile short *m, unsigned long val)
-{
-	unsigned long ret, tmp, addr64;
-
-	__asm__ __volatile__(
-	"	andnot	%4,7,%3\n"
-	"	inswl	%1,%4,%1\n"
-	"1:	ldq_l	%2,0(%3)\n"
-	"	extwl	%2,%4,%0\n"
-	"	mskwl	%2,%4,%2\n"
-	"	or	%1,%2,%2\n"
-	"	stq_c	%2,0(%3)\n"
-	"	beq	%2,2f\n"
-	".subsection 2\n"
-	"2:	br	1b\n"
-	".previous"
-	: "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
-	: "r" ((long)m), "1" (val) : "memory");
-
-	return ret;
-}
-
-static inline unsigned long
-__xchg_u32_local(volatile int *m, unsigned long val)
-{
-	unsigned long dummy;
-
-	__asm__ __volatile__(
-	"1:	ldl_l %0,%4\n"
-	"	bis $31,%3,%1\n"
-	"	stl_c %1,%2\n"
-	"	beq %1,2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	: "=&r" (val), "=&r" (dummy), "=m" (*m)
-	: "rI" (val), "m" (*m) : "memory");
-
-	return val;
-}
-
-static inline unsigned long
-__xchg_u64_local(volatile long *m, unsigned long val)
-{
-	unsigned long dummy;
-
-	__asm__ __volatile__(
-	"1:	ldq_l %0,%4\n"
-	"	bis $31,%3,%1\n"
-	"	stq_c %1,%2\n"
-	"	beq %1,2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	: "=&r" (val), "=&r" (dummy), "=m" (*m)
-	: "rI" (val), "m" (*m) : "memory");
-
-	return val;
-}
-
-#define __xchg_local(ptr, x, size) \
-({ \
-	unsigned long __xchg__res; \
-	volatile void *__xchg__ptr = (ptr); \
-	switch (size) { \
-		case 1: __xchg__res = __xchg_u8_local(__xchg__ptr, x); break; \
-		case 2: __xchg__res = __xchg_u16_local(__xchg__ptr, x); break; \
-		case 4: __xchg__res = __xchg_u32_local(__xchg__ptr, x); break; \
-		case 8: __xchg__res = __xchg_u64_local(__xchg__ptr, x); break; \
-		default: __xchg_called_with_bad_pointer(); __xchg__res = x; \
-	} \
-	__xchg__res; \
-})
-
-#define xchg_local(ptr,x)						     \
-  ({									     \
-     __typeof__(*(ptr)) _x_ = (x);					     \
-     (__typeof__(*(ptr))) __xchg_local((ptr), (unsigned long)_x_,	     \
-     		sizeof(*(ptr))); \
+#define cmpxchg_local(ptr, o, n)					\
+  ({									\
+     __typeof__(*(ptr)) _o_ = (o);					\
+     __typeof__(*(ptr)) _n_ = (n);					\
+     (__typeof__(*(ptr))) __cmpxchg_local((ptr), (unsigned long)_o_,	\
+					  (unsigned long)_n_,		\
+					  sizeof(*(ptr)));		\
   })
 
-/* 
- * Atomic compare and exchange.  Compare OLD with MEM, if identical,
- * store NEW in MEM.  Return the initial value in MEM.  Success is
- * indicated by comparing RETURN with OLD.
- *
- * The memory barrier should be placed in SMP only when we actually
- * make the change. If we don't change anything (so if the returned
- * prev is equal to old) then we aren't acquiring anything new and
- * we don't need any memory barrier as far I can tell.
- */
-
-#define __HAVE_ARCH_CMPXCHG 1
-
-static inline unsigned long
-__cmpxchg_u8(volatile char *m, long old, long new)
-{
-	unsigned long prev, tmp, cmp, addr64;
-
-	__asm__ __volatile__(
-	"	andnot	%5,7,%4\n"
-	"	insbl	%1,%5,%1\n"
-	"1:	ldq_l	%2,0(%4)\n"
-	"	extbl	%2,%5,%0\n"
-	"	cmpeq	%0,%6,%3\n"
-	"	beq	%3,2f\n"
-	"	mskbl	%2,%5,%2\n"
-	"	or	%1,%2,%2\n"
-	"	stq_c	%2,0(%4)\n"
-	"	beq	%2,3f\n"
-#ifdef CONFIG_SMP
-	"	mb\n"
-#endif
-	"2:\n"
-	".subsection 2\n"
-	"3:	br	1b\n"
-	".previous"
-	: "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
-	: "r" ((long)m), "Ir" (old), "1" (new) : "memory");
-
-	return prev;
-}
-
-static inline unsigned long
-__cmpxchg_u16(volatile short *m, long old, long new)
-{
-	unsigned long prev, tmp, cmp, addr64;
-
-	__asm__ __volatile__(
-	"	andnot	%5,7,%4\n"
-	"	inswl	%1,%5,%1\n"
-	"1:	ldq_l	%2,0(%4)\n"
-	"	extwl	%2,%5,%0\n"
-	"	cmpeq	%0,%6,%3\n"
-	"	beq	%3,2f\n"
-	"	mskwl	%2,%5,%2\n"
-	"	or	%1,%2,%2\n"
-	"	stq_c	%2,0(%4)\n"
-	"	beq	%2,3f\n"
-#ifdef CONFIG_SMP
-	"	mb\n"
-#endif
-	"2:\n"
-	".subsection 2\n"
-	"3:	br	1b\n"
-	".previous"
-	: "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
-	: "r" ((long)m), "Ir" (old), "1" (new) : "memory");
-
-	return prev;
-}
-
-static inline unsigned long
-__cmpxchg_u32(volatile int *m, int old, int new)
-{
-	unsigned long prev, cmp;
-
-	__asm__ __volatile__(
-	"1:	ldl_l %0,%5\n"
-	"	cmpeq %0,%3,%1\n"
-	"	beq %1,2f\n"
-	"	mov %4,%1\n"
-	"	stl_c %1,%2\n"
-	"	beq %1,3f\n"
-#ifdef CONFIG_SMP
-	"	mb\n"
-#endif
-	"2:\n"
-	".subsection 2\n"
-	"3:	br 1b\n"
-	".previous"
-	: "=&r"(prev), "=&r"(cmp), "=m"(*m)
-	: "r"((long) old), "r"(new), "m"(*m) : "memory");
-
-	return prev;
-}
+#define cmpxchg64_local(ptr, o, n)					\
+  ({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg_local((ptr), (o), (n));					\
+  })
 
-static inline unsigned long
-__cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new)
-{
-	unsigned long prev, cmp;
-
-	__asm__ __volatile__(
-	"1:	ldq_l %0,%5\n"
-	"	cmpeq %0,%3,%1\n"
-	"	beq %1,2f\n"
-	"	mov %4,%1\n"
-	"	stq_c %1,%2\n"
-	"	beq %1,3f\n"
 #ifdef CONFIG_SMP
-	"	mb\n"
+#undef __ASM__MB
+#define __ASM__MB	"\tmb\n"
 #endif
-	"2:\n"
-	".subsection 2\n"
-	"3:	br 1b\n"
-	".previous"
-	: "=&r"(prev), "=&r"(cmp), "=m"(*m)
-	: "r"((long) old), "r"(new), "m"(*m) : "memory");
-
-	return prev;
-}
-
-/* This function doesn't exist, so you'll get a linker error
-   if something tries to do an invalid cmpxchg().  */
-extern void __cmpxchg_called_with_bad_pointer(void);
-
-static __always_inline unsigned long
-__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
-{
-	switch (size) {
-		case 1:
-			return __cmpxchg_u8(ptr, old, new);
-		case 2:
-			return __cmpxchg_u16(ptr, old, new);
-		case 4:
-			return __cmpxchg_u32(ptr, old, new);
-		case 8:
-			return __cmpxchg_u64(ptr, old, new);
-	}
-	__cmpxchg_called_with_bad_pointer();
-	return old;
-}
-
-#define cmpxchg(ptr, o, n)						 \
-  ({									 \
-     __typeof__(*(ptr)) _o_ = (o);					 \
-     __typeof__(*(ptr)) _n_ = (n);					 \
-     (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,		 \
-				    (unsigned long)_n_, sizeof(*(ptr))); \
+#undef ____xchg
+#undef ____cmpxchg
+#define ____xchg(type, args...)		__xchg ##type(args)
+#define ____cmpxchg(type, args...)	__cmpxchg ##type(args)
+#include <asm/xchg.h>
+
+#define xchg(ptr,x)							\
+  ({									\
+     __typeof__(*(ptr)) _x_ = (x);					\
+     (__typeof__(*(ptr))) __xchg((ptr), (unsigned long)_x_,		\
+				 sizeof(*(ptr)));			\
   })
-#define cmpxchg64(ptr, o, n)						 \
-  ({									 \
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				 \
-	cmpxchg((ptr), (o), (n));					 \
-  })
-
-static inline unsigned long
-__cmpxchg_u8_local(volatile char *m, long old, long new)
-{
-	unsigned long prev, tmp, cmp, addr64;
-
-	__asm__ __volatile__(
-	"	andnot	%5,7,%4\n"
-	"	insbl	%1,%5,%1\n"
-	"1:	ldq_l	%2,0(%4)\n"
-	"	extbl	%2,%5,%0\n"
-	"	cmpeq	%0,%6,%3\n"
-	"	beq	%3,2f\n"
-	"	mskbl	%2,%5,%2\n"
-	"	or	%1,%2,%2\n"
-	"	stq_c	%2,0(%4)\n"
-	"	beq	%2,3f\n"
-	"2:\n"
-	".subsection 2\n"
-	"3:	br	1b\n"
-	".previous"
-	: "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
-	: "r" ((long)m), "Ir" (old), "1" (new) : "memory");
-
-	return prev;
-}
-
-static inline unsigned long
-__cmpxchg_u16_local(volatile short *m, long old, long new)
-{
-	unsigned long prev, tmp, cmp, addr64;
-
-	__asm__ __volatile__(
-	"	andnot	%5,7,%4\n"
-	"	inswl	%1,%5,%1\n"
-	"1:	ldq_l	%2,0(%4)\n"
-	"	extwl	%2,%5,%0\n"
-	"	cmpeq	%0,%6,%3\n"
-	"	beq	%3,2f\n"
-	"	mskwl	%2,%5,%2\n"
-	"	or	%1,%2,%2\n"
-	"	stq_c	%2,0(%4)\n"
-	"	beq	%2,3f\n"
-	"2:\n"
-	".subsection 2\n"
-	"3:	br	1b\n"
-	".previous"
-	: "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
-	: "r" ((long)m), "Ir" (old), "1" (new) : "memory");
-
-	return prev;
-}
-
-static inline unsigned long
-__cmpxchg_u32_local(volatile int *m, int old, int new)
-{
-	unsigned long prev, cmp;
-
-	__asm__ __volatile__(
-	"1:	ldl_l %0,%5\n"
-	"	cmpeq %0,%3,%1\n"
-	"	beq %1,2f\n"
-	"	mov %4,%1\n"
-	"	stl_c %1,%2\n"
-	"	beq %1,3f\n"
-	"2:\n"
-	".subsection 2\n"
-	"3:	br 1b\n"
-	".previous"
-	: "=&r"(prev), "=&r"(cmp), "=m"(*m)
-	: "r"((long) old), "r"(new), "m"(*m) : "memory");
-
-	return prev;
-}
-
-static inline unsigned long
-__cmpxchg_u64_local(volatile long *m, unsigned long old, unsigned long new)
-{
-	unsigned long prev, cmp;
-
-	__asm__ __volatile__(
-	"1:	ldq_l %0,%5\n"
-	"	cmpeq %0,%3,%1\n"
-	"	beq %1,2f\n"
-	"	mov %4,%1\n"
-	"	stq_c %1,%2\n"
-	"	beq %1,3f\n"
-	"2:\n"
-	".subsection 2\n"
-	"3:	br 1b\n"
-	".previous"
-	: "=&r"(prev), "=&r"(cmp), "=m"(*m)
-	: "r"((long) old), "r"(new), "m"(*m) : "memory");
-
-	return prev;
-}
-
-static __always_inline unsigned long
-__cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
-		int size)
-{
-	switch (size) {
-		case 1:
-			return __cmpxchg_u8_local(ptr, old, new);
-		case 2:
-			return __cmpxchg_u16_local(ptr, old, new);
-		case 4:
-			return __cmpxchg_u32_local(ptr, old, new);
-		case 8:
-			return __cmpxchg_u64_local(ptr, old, new);
-	}
-	__cmpxchg_called_with_bad_pointer();
-	return old;
-}
 
-#define cmpxchg_local(ptr, o, n)					 \
-  ({									 \
-     __typeof__(*(ptr)) _o_ = (o);					 \
-     __typeof__(*(ptr)) _n_ = (n);					 \
-     (__typeof__(*(ptr))) __cmpxchg_local((ptr), (unsigned long)_o_,	 \
-				    (unsigned long)_n_, sizeof(*(ptr))); \
+#define cmpxchg(ptr, o, n)						\
+  ({									\
+     __typeof__(*(ptr)) _o_ = (o);					\
+     __typeof__(*(ptr)) _n_ = (n);					\
+     (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,		\
+				    (unsigned long)_n_,	sizeof(*(ptr)));\
   })
-#define cmpxchg64_local(ptr, o, n)					 \
-  ({									 \
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				 \
-	cmpxchg_local((ptr), (o), (n));					 \
+
+#define cmpxchg64(ptr, o, n)						\
+  ({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg((ptr), (o), (n));					\
   })
 
+#undef __ASM__MB
+#undef ____cmpxchg
+
+#define __HAVE_ARCH_CMPXCHG 1
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/alpha/include/asm/xchg.h b/arch/alpha/include/asm/xchg.h
new file mode 100644
index 00000000000..beba1b803e0
--- /dev/null
+++ b/arch/alpha/include/asm/xchg.h
@@ -0,0 +1,258 @@
+#ifndef __ALPHA_SYSTEM_H
+#error Do not include xchg.h directly!
+#else
+/*
+ * xchg/xchg_local and cmpxchg/cmpxchg_local share the same code
+ * except that local version do not have the expensive memory barrier.
+ * So this file is included twice from asm/system.h.
+ */
+
+/*
+ * Atomic exchange.
+ * Since it can be used to implement critical sections
+ * it must clobber "memory" (also for interrupts in UP).
+ */
+
+static inline unsigned long
+____xchg(_u8, volatile char *m, unsigned long val)
+{
+	unsigned long ret, tmp, addr64;
+
+	__asm__ __volatile__(
+	"	andnot	%4,7,%3\n"
+	"	insbl	%1,%4,%1\n"
+	"1:	ldq_l	%2,0(%3)\n"
+	"	extbl	%2,%4,%0\n"
+	"	mskbl	%2,%4,%2\n"
+	"	or	%1,%2,%2\n"
+	"	stq_c	%2,0(%3)\n"
+	"	beq	%2,2f\n"
+		__ASM__MB
+	".subsection 2\n"
+	"2:	br	1b\n"
+	".previous"
+	: "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
+	: "r" ((long)m), "1" (val) : "memory");
+
+	return ret;
+}
+
+static inline unsigned long
+____xchg(_u16, volatile short *m, unsigned long val)
+{
+	unsigned long ret, tmp, addr64;
+
+	__asm__ __volatile__(
+	"	andnot	%4,7,%3\n"
+	"	inswl	%1,%4,%1\n"
+	"1:	ldq_l	%2,0(%3)\n"
+	"	extwl	%2,%4,%0\n"
+	"	mskwl	%2,%4,%2\n"
+	"	or	%1,%2,%2\n"
+	"	stq_c	%2,0(%3)\n"
+	"	beq	%2,2f\n"
+		__ASM__MB
+	".subsection 2\n"
+	"2:	br	1b\n"
+	".previous"
+	: "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
+	: "r" ((long)m), "1" (val) : "memory");
+
+	return ret;
+}
+
+static inline unsigned long
+____xchg(_u32, volatile int *m, unsigned long val)
+{
+	unsigned long dummy;
+
+	__asm__ __volatile__(
+	"1:	ldl_l %0,%4\n"
+	"	bis $31,%3,%1\n"
+	"	stl_c %1,%2\n"
+	"	beq %1,2f\n"
+		__ASM__MB
+	".subsection 2\n"
+	"2:	br 1b\n"
+	".previous"
+	: "=&r" (val), "=&r" (dummy), "=m" (*m)
+	: "rI" (val), "m" (*m) : "memory");
+
+	return val;
+}
+
+static inline unsigned long
+____xchg(_u64, volatile long *m, unsigned long val)
+{
+	unsigned long dummy;
+
+	__asm__ __volatile__(
+	"1:	ldq_l %0,%4\n"
+	"	bis $31,%3,%1\n"
+	"	stq_c %1,%2\n"
+	"	beq %1,2f\n"
+		__ASM__MB
+	".subsection 2\n"
+	"2:	br 1b\n"
+	".previous"
+	: "=&r" (val), "=&r" (dummy), "=m" (*m)
+	: "rI" (val), "m" (*m) : "memory");
+
+	return val;
+}
+
+/* This function doesn't exist, so you'll get a linker error
+   if something tries to do an invalid xchg().  */
+extern void __xchg_called_with_bad_pointer(void);
+
+static __always_inline unsigned long
+____xchg(, volatile void *ptr, unsigned long x, int size)
+{
+	switch (size) {
+		case 1:
+			return ____xchg(_u8, ptr, x);
+		case 2:
+			return ____xchg(_u16, ptr, x);
+		case 4:
+			return ____xchg(_u32, ptr, x);
+		case 8:
+			return ____xchg(_u64, ptr, x);
+	}
+	__xchg_called_with_bad_pointer();
+	return x;
+}
+
+/*
+ * Atomic compare and exchange.  Compare OLD with MEM, if identical,
+ * store NEW in MEM.  Return the initial value in MEM.  Success is
+ * indicated by comparing RETURN with OLD.
+ *
+ * The memory barrier should be placed in SMP only when we actually
+ * make the change. If we don't change anything (so if the returned
+ * prev is equal to old) then we aren't acquiring anything new and
+ * we don't need any memory barrier as far I can tell.
+ */
+
+static inline unsigned long
+____cmpxchg(_u8, volatile char *m, unsigned char old, unsigned char new)
+{
+	unsigned long prev, tmp, cmp, addr64;
+
+	__asm__ __volatile__(
+	"	andnot	%5,7,%4\n"
+	"	insbl	%1,%5,%1\n"
+	"1:	ldq_l	%2,0(%4)\n"
+	"	extbl	%2,%5,%0\n"
+	"	cmpeq	%0,%6,%3\n"
+	"	beq	%3,2f\n"
+	"	mskbl	%2,%5,%2\n"
+	"	or	%1,%2,%2\n"
+	"	stq_c	%2,0(%4)\n"
+	"	beq	%2,3f\n"
+		__ASM__MB
+	"2:\n"
+	".subsection 2\n"
+	"3:	br	1b\n"
+	".previous"
+	: "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
+	: "r" ((long)m), "Ir" (old), "1" (new) : "memory");
+
+	return prev;
+}
+
+static inline unsigned long
+____cmpxchg(_u16, volatile short *m, unsigned short old, unsigned short new)
+{
+	unsigned long prev, tmp, cmp, addr64;
+
+	__asm__ __volatile__(
+	"	andnot	%5,7,%4\n"
+	"	inswl	%1,%5,%1\n"
+	"1:	ldq_l	%2,0(%4)\n"
+	"	extwl	%2,%5,%0\n"
+	"	cmpeq	%0,%6,%3\n"
+	"	beq	%3,2f\n"
+	"	mskwl	%2,%5,%2\n"
+	"	or	%1,%2,%2\n"
+	"	stq_c	%2,0(%4)\n"
+	"	beq	%2,3f\n"
+		__ASM__MB
+	"2:\n"
+	".subsection 2\n"
+	"3:	br	1b\n"
+	".previous"
+	: "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
+	: "r" ((long)m), "Ir" (old), "1" (new) : "memory");
+
+	return prev;
+}
+
+static inline unsigned long
+____cmpxchg(_u32, volatile int *m, int old, int new)
+{
+	unsigned long prev, cmp;
+
+	__asm__ __volatile__(
+	"1:	ldl_l %0,%5\n"
+	"	cmpeq %0,%3,%1\n"
+	"	beq %1,2f\n"
+	"	mov %4,%1\n"
+	"	stl_c %1,%2\n"
+	"	beq %1,3f\n"
+		__ASM__MB
+	"2:\n"
+	".subsection 2\n"
+	"3:	br 1b\n"
+	".previous"
+	: "=&r"(prev), "=&r"(cmp), "=m"(*m)
+	: "r"((long) old), "r"(new), "m"(*m) : "memory");
+
+	return prev;
+}
+
+static inline unsigned long
+____cmpxchg(_u64, volatile long *m, unsigned long old, unsigned long new)
+{
+	unsigned long prev, cmp;
+
+	__asm__ __volatile__(
+	"1:	ldq_l %0,%5\n"
+	"	cmpeq %0,%3,%1\n"
+	"	beq %1,2f\n"
+	"	mov %4,%1\n"
+	"	stq_c %1,%2\n"
+	"	beq %1,3f\n"
+		__ASM__MB
+	"2:\n"
+	".subsection 2\n"
+	"3:	br 1b\n"
+	".previous"
+	: "=&r"(prev), "=&r"(cmp), "=m"(*m)
+	: "r"((long) old), "r"(new), "m"(*m) : "memory");
+
+	return prev;
+}
+
+/* This function doesn't exist, so you'll get a linker error
+   if something tries to do an invalid cmpxchg().  */
+extern void __cmpxchg_called_with_bad_pointer(void);
+
+static __always_inline unsigned long
+____cmpxchg(, volatile void *ptr, unsigned long old, unsigned long new,
+	      int size)
+{
+	switch (size) {
+		case 1:
+			return ____cmpxchg(_u8, ptr, old, new);
+		case 2:
+			return ____cmpxchg(_u16, ptr, old, new);
+		case 4:
+			return ____cmpxchg(_u32, ptr, old, new);
+		case 8:
+			return ____cmpxchg(_u64, ptr, old, new);
+	}
+	__cmpxchg_called_with_bad_pointer();
+	return old;
+}
+
+#endif
-- 
cgit v1.2.3


From 5f0e3da6e186598bbd2569410ab60fa645ba00c9 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 31 Mar 2009 15:23:36 -0700
Subject: alpha: convert u64 to unsigned long long

Convert alpha architecture to use u64 as unsigned long long.  This is
being done so that (a) all arches use u64 as unsigned long long and (b)
printk of a u64 as %ll[ux] will not generate format warnings by gcc.

The only gcc cross-compiler that I have is 4.0.2, which generates errors
about miscompiling __weak references, so I have commented out that line in
compiler-gcc4.h so that most of these compile, but more builds and real
machine testing would be Real Good.

[akpm@linux-foundation.org: fix warning]
[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
From: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/machvec.h |  2 +-
 arch/alpha/include/asm/types.h   |  5 +++++
 arch/alpha/kernel/err_ev6.c      |  4 ++--
 arch/alpha/kernel/err_ev7.c      |  6 +++---
 arch/alpha/kernel/err_marvel.c   | 40 ++++++++++++++++++++--------------------
 arch/alpha/kernel/err_titan.c    | 28 ++++++++++++++--------------
 arch/alpha/kernel/pci.c          |  2 +-
 arch/alpha/kernel/pci_iommu.c    | 34 +++++++++++++++++-----------------
 arch/alpha/kernel/proto.h        | 16 ++++++++--------
 arch/alpha/kernel/setup.c        |  2 +-
 arch/alpha/kernel/smc37c669.c    |  4 ++--
 arch/alpha/kernel/sys_jensen.c   |  3 +--
 arch/alpha/kernel/sys_sable.c    |  4 ++--
 arch/alpha/kernel/traps.c        |  2 +-
 14 files changed, 78 insertions(+), 74 deletions(-)

diff --git a/arch/alpha/include/asm/machvec.h b/arch/alpha/include/asm/machvec.h
index fea4ea75b79..13cd4274381 100644
--- a/arch/alpha/include/asm/machvec.h
+++ b/arch/alpha/include/asm/machvec.h
@@ -80,7 +80,7 @@ struct alpha_machine_vector
 	void (*update_irq_hw)(unsigned long, unsigned long, int);
 	void (*ack_irq)(unsigned long);
 	void (*device_interrupt)(unsigned long vector);
-	void (*machine_check)(u64 vector, u64 la);
+	void (*machine_check)(unsigned long vector, unsigned long la);
 
 	void (*smp_callin)(void);
 	void (*init_arch)(void);
diff --git a/arch/alpha/include/asm/types.h b/arch/alpha/include/asm/types.h
index c1541353cce..f072f344497 100644
--- a/arch/alpha/include/asm/types.h
+++ b/arch/alpha/include/asm/types.h
@@ -8,7 +8,12 @@
  * not a major issue.  However, for interoperability, libraries still
  * need to be careful to avoid a name clashes.
  */
+
+#ifdef __KERNEL__
+#include <asm-generic/int-ll64.h>
+#else
 #include <asm-generic/int-l64.h>
+#endif
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/alpha/kernel/err_ev6.c b/arch/alpha/kernel/err_ev6.c
index 11aee012a8a..985e5c1681a 100644
--- a/arch/alpha/kernel/err_ev6.c
+++ b/arch/alpha/kernel/err_ev6.c
@@ -157,8 +157,8 @@ ev6_parse_cbox(u64 c_addr, u64 c1_syn, u64 c2_syn,
 		       err_print_prefix,
 		       streamname[stream], bitsname[bits], sourcename[source]);
 
-	printk("%s    Address: 0x%016lx\n"
-	         "    Syndrome[upper.lower]: %02lx.%02lx\n", 
+	printk("%s    Address: 0x%016llx\n"
+	         "    Syndrome[upper.lower]: %02llx.%02llx\n",
 	       err_print_prefix,
 	       c_addr,
 	       c2_syn, c1_syn);
diff --git a/arch/alpha/kernel/err_ev7.c b/arch/alpha/kernel/err_ev7.c
index 68cd493f54c..73770c6ca01 100644
--- a/arch/alpha/kernel/err_ev7.c
+++ b/arch/alpha/kernel/err_ev7.c
@@ -246,13 +246,13 @@ ev7_process_pal_subpacket(struct el_subpacket *header)
 
 	switch(header->type) {
 	case EL_TYPE__PAL__LOGOUT_FRAME:
-		printk("%s*** MCHK occurred on LPID %ld (RBOX %lx)\n",
+		printk("%s*** MCHK occurred on LPID %ld (RBOX %llx)\n",
 		       err_print_prefix,
 		       packet->by_type.logout.whami, 
 		       packet->by_type.logout.rbox_whami);
 		el_print_timestamp(&packet->by_type.logout.timestamp);
-		printk("%s  EXC_ADDR: %016lx\n"
-		         "  HALT_CODE: %lx\n", 
+		printk("%s  EXC_ADDR: %016llx\n"
+		         "  HALT_CODE: %llx\n",
 		       err_print_prefix,
 		       packet->by_type.logout.exc_addr,
 		       packet->by_type.logout.halt_code);
diff --git a/arch/alpha/kernel/err_marvel.c b/arch/alpha/kernel/err_marvel.c
index 413bf37eb09..6bfd243efba 100644
--- a/arch/alpha/kernel/err_marvel.c
+++ b/arch/alpha/kernel/err_marvel.c
@@ -129,7 +129,7 @@ marvel_print_po7_crrct_sym(u64 crrct_sym)
 
 
 	printk("%s      Correctable Error Symptoms:\n"
-	       "%s        Syndrome: 0x%lx\n",
+	       "%s        Syndrome: 0x%llx\n",
 	       err_print_prefix,
 	       err_print_prefix, EXTRACT(crrct_sym, IO7__PO7_CRRCT_SYM__SYN));
 	marvel_print_err_cyc(EXTRACT(crrct_sym, IO7__PO7_CRRCT_SYM__ERR_CYC));
@@ -186,7 +186,7 @@ marvel_print_po7_uncrr_sym(u64 uncrr_sym, u64 valid_mask)
 	uncrr_sym &= valid_mask;
 
 	if (EXTRACT(valid_mask, IO7__PO7_UNCRR_SYM__SYN))
-		printk("%s        Syndrome: 0x%lx\n",
+		printk("%s        Syndrome: 0x%llx\n",
 		       err_print_prefix, 
 		       EXTRACT(uncrr_sym, IO7__PO7_UNCRR_SYM__SYN));
 
@@ -307,7 +307,7 @@ marvel_print_po7_ugbge_sym(u64 ugbge_sym)
 		sprintf(opcode_str, "BlkIO");
 		break;
 	default:
-		sprintf(opcode_str, "0x%lx\n", 
+		sprintf(opcode_str, "0x%llx\n",
 			EXTRACT(ugbge_sym, IO7__PO7_UGBGE_SYM__UPH_OPCODE));
 		break;
 	}
@@ -321,7 +321,7 @@ marvel_print_po7_ugbge_sym(u64 ugbge_sym)
 	       opcode_str);
 
 	if (0xC5 != EXTRACT(ugbge_sym, IO7__PO7_UGBGE_SYM__UPH_OPCODE))
-		printk("%s        Packet Offset 0x%08lx\n",
+		printk("%s        Packet Offset 0x%08llx\n",
 		       err_print_prefix,
 		       EXTRACT(ugbge_sym, IO7__PO7_UGBGE_SYM__UPH_PKT_OFF));
 }
@@ -480,8 +480,8 @@ marvel_print_po7_err_sum(struct ev7_pal_io_subpacket *io)
 		printk("%s    Lost Error\n", err_print_prefix);
 
 	printk("%s    Failing Packet:\n"
-	       "%s      Cycle 1: %016lx\n"
-	       "%s      Cycle 2: %016lx\n",
+	       "%s      Cycle 1: %016llx\n"
+	       "%s      Cycle 2: %016llx\n",
 	       err_print_prefix,
 	       err_print_prefix, io->po7_err_pkt0,
 	       err_print_prefix, io->po7_err_pkt1);
@@ -515,9 +515,9 @@ marvel_print_pox_tlb_err(u64 tlb_err)
 	if (!(tlb_err & IO7__POX_TLBERR__ERR_VALID))
 		return;
 
-	printk("%s      TLB Error on index 0x%lx:\n"
+	printk("%s      TLB Error on index 0x%llx:\n"
 	       "%s        - %s\n"
-	       "%s        - Addr: 0x%016lx\n",
+	       "%s        - Addr: 0x%016llx\n",
 	       err_print_prefix,
 	       EXTRACT(tlb_err, IO7__POX_TLBERR__ERR_TLB_PTR),
 	       err_print_prefix,
@@ -579,7 +579,7 @@ marvel_print_pox_spl_cmplt(u64 spl_cmplt)
 		sprintf(message, "Uncorrectable Split Write Data Error");
 		break;
 	default:
-		sprintf(message, "%08lx\n", 
+		sprintf(message, "%08llx\n",
 			EXTRACT(spl_cmplt, IO7__POX_SPLCMPLT__MESSAGE));
 		break;
 	}
@@ -620,9 +620,9 @@ marvel_print_pox_trans_sum(u64 trans_sum)
 		return;
 
 	printk("%s      Transaction Summary:\n"
-	       "%s        Command: 0x%lx - %s\n"
-	       "%s        Address: 0x%016lx%s\n"
-	       "%s        PCI-X Master Slot: 0x%lx\n",
+	       "%s        Command: 0x%llx - %s\n"
+	       "%s        Address: 0x%016llx%s\n"
+	       "%s        PCI-X Master Slot: 0x%llx\n",
 	       err_print_prefix, 
 	       err_print_prefix, 
 	       EXTRACT(trans_sum, IO7__POX_TRANSUM__PCIX_CMD),
@@ -964,12 +964,12 @@ marvel_process_io_error(struct ev7_lf_subpackets *lf_subpackets, int print)
 
 #if 0
 		printk("%s  PORT 7 ERROR:\n"
-		       "%s    PO7_ERROR_SUM: %016lx\n"
-		       "%s    PO7_UNCRR_SYM: %016lx\n"
-		       "%s    PO7_CRRCT_SYM: %016lx\n"
-		       "%s    PO7_UGBGE_SYM: %016lx\n"
-		       "%s    PO7_ERR_PKT0:  %016lx\n"
-		       "%s    PO7_ERR_PKT1:  %016lx\n",
+		       "%s    PO7_ERROR_SUM: %016llx\n"
+		       "%s    PO7_UNCRR_SYM: %016llx\n"
+		       "%s    PO7_CRRCT_SYM: %016llx\n"
+		       "%s    PO7_UGBGE_SYM: %016llx\n"
+		       "%s    PO7_ERR_PKT0:  %016llx\n"
+		       "%s    PO7_ERR_PKT1:  %016llx\n",
 		       err_print_prefix,
 		       err_print_prefix, io->po7_error_sum,
 		       err_print_prefix, io->po7_uncrr_sym,
@@ -987,12 +987,12 @@ marvel_process_io_error(struct ev7_lf_subpackets *lf_subpackets, int print)
 		if (!MARVEL_IO_ERR_VALID(io->ports[i].pox_err_sum))
 			continue;
 
-		printk("%s  PID %u PORT %d POx_ERR_SUM: %016lx\n", 
+		printk("%s  PID %u PORT %d POx_ERR_SUM: %016llx\n",
 		       err_print_prefix, 
 		       lf_subpackets->io_pid, i, io->ports[i].pox_err_sum);
 		marvel_print_pox_err(io->ports[i].pox_err_sum, &io->ports[i]);
 
-		printk("%s  [ POx_FIRST_ERR: %016lx ]\n", 
+		printk("%s  [ POx_FIRST_ERR: %016llx ]\n",
 		       err_print_prefix, io->ports[i].pox_first_err);
 		marvel_print_pox_err(io->ports[i].pox_first_err, 
 				     &io->ports[i]);
diff --git a/arch/alpha/kernel/err_titan.c b/arch/alpha/kernel/err_titan.c
index 257449ed15e..c7e28a88d6e 100644
--- a/arch/alpha/kernel/err_titan.c
+++ b/arch/alpha/kernel/err_titan.c
@@ -107,12 +107,12 @@ titan_parse_p_serror(int which, u64 serror, int print)
 	if (!print)
 		return status;
 
-	printk("%s  PChip %d SERROR: %016lx\n", 
+	printk("%s  PChip %d SERROR: %016llx\n",
 	       err_print_prefix, which, serror);
 	if (serror & TITAN__PCHIP_SERROR__ECCMASK) {
 		printk("%s    %sorrectable ECC Error:\n"
 		       "      Source: %-6s  Command: %-8s  Syndrome: 0x%08x\n"
-		       "      Address: 0x%lx\n", 
+		       "      Address: 0x%llx\n",
 		       err_print_prefix,
 		       (serror & TITAN__PCHIP_SERROR__UECC) ? "Unc" : "C",
 		       serror_src[EXTRACT(serror, TITAN__PCHIP_SERROR__SRC)],
@@ -223,7 +223,7 @@ titan_parse_p_perror(int which, int port, u64 perror, int print)
 	if (!print) 
 		return status;
 
-	printk("%s  PChip %d %cPERROR: %016lx\n", 
+	printk("%s  PChip %d %cPERROR: %016llx\n",
 	       err_print_prefix, which, 
 	       port ? 'A' : 'G', perror);
 	if (perror & TITAN__PCHIP_PERROR__IPTPW)
@@ -316,7 +316,7 @@ titan_parse_p_agperror(int which, u64 agperror, int print)
 	addr = EXTRACT(agperror, TITAN__PCHIP_AGPERROR__ADDR) << 3;
 	len = EXTRACT(agperror, TITAN__PCHIP_AGPERROR__LEN);
 
-	printk("%s  PChip %d AGPERROR: %016lx\n", err_print_prefix,
+	printk("%s  PChip %d AGPERROR: %016llx\n", err_print_prefix,
 	       which, agperror);
 	if (agperror & TITAN__PCHIP_AGPERROR__NOWINDOW)
 		printk("%s    No Window\n", err_print_prefix);
@@ -597,16 +597,16 @@ privateer_process_680_frame(struct el_common *mchk_header, int print)
 		return status;
 
 	/* TODO - decode instead of just dumping... */
-	printk("%s  Summary Flags:         %016lx\n"
- 	         "  CChip DIRx:            %016lx\n"
-		 "  System Management IR:  %016lx\n"
-		 "  CPU IR:                %016lx\n"
-		 "  Power Supply IR:       %016lx\n"
-		 "  LM78 Fault Status:     %016lx\n"
-		 "  System Doors:          %016lx\n"
-		 "  Temperature Warning:   %016lx\n"
-		 "  Fan Control:           %016lx\n"
-		 "  Fatal Power Down Code: %016lx\n",
+	printk("%s  Summary Flags:         %016llx\n"
+ 	         "  CChip DIRx:            %016llx\n"
+		 "  System Management IR:  %016llx\n"
+		 "  CPU IR:                %016llx\n"
+		 "  Power Supply IR:       %016llx\n"
+		 "  LM78 Fault Status:     %016llx\n"
+		 "  System Doors:          %016llx\n"
+		 "  Temperature Warning:   %016llx\n"
+		 "  Fan Control:           %016llx\n"
+		 "  Fatal Power Down Code: %016llx\n",
 	       err_print_prefix,
 	       emchk->summary,
 	       emchk->c_dirx,
diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c
index a3b93881140..a91ba28999b 100644
--- a/arch/alpha/kernel/pci.c
+++ b/arch/alpha/kernel/pci.c
@@ -168,7 +168,7 @@ pcibios_align_resource(void *data, struct resource *res,
 		 */
 
 		/* Align to multiple of size of minimum base.  */
-		alignto = max(0x1000UL, align);
+		alignto = max_t(resource_size_t, 0x1000, align);
 		start = ALIGN(start, alignto);
 		if (hose->sparse_mem_base && size <= 7 * 16*MB) {
 			if (((start / (16*MB)) & 0x7) == 0) {
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index b9094da05d7..bfb880af959 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -247,7 +247,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
 	    && paddr + size <= __direct_map_size) {
 		ret = paddr + __direct_map_base;
 
-		DBGA2("pci_map_single: [%p,%lx] -> direct %lx from %p\n",
+		DBGA2("pci_map_single: [%p,%zx] -> direct %llx from %p\n",
 		      cpu_addr, size, ret, __builtin_return_address(0));
 
 		return ret;
@@ -258,7 +258,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
 	if (dac_allowed) {
 		ret = paddr + alpha_mv.pci_dac_offset;
 
-		DBGA2("pci_map_single: [%p,%lx] -> DAC %lx from %p\n",
+		DBGA2("pci_map_single: [%p,%zx] -> DAC %llx from %p\n",
 		      cpu_addr, size, ret, __builtin_return_address(0));
 
 		return ret;
@@ -299,7 +299,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
 	ret = arena->dma_base + dma_ofs * PAGE_SIZE;
 	ret += (unsigned long)cpu_addr & ~PAGE_MASK;
 
-	DBGA2("pci_map_single: [%p,%lx] np %ld -> sg %lx from %p\n",
+	DBGA2("pci_map_single: [%p,%zx] np %ld -> sg %llx from %p\n",
 	      cpu_addr, size, npages, ret, __builtin_return_address(0));
 
 	return ret;
@@ -355,14 +355,14 @@ pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr, size_t size,
 	    && dma_addr < __direct_map_base + __direct_map_size) {
 		/* Nothing to do.  */
 
-		DBGA2("pci_unmap_single: direct [%lx,%lx] from %p\n",
+		DBGA2("pci_unmap_single: direct [%llx,%zx] from %p\n",
 		      dma_addr, size, __builtin_return_address(0));
 
 		return;
 	}
 
 	if (dma_addr > 0xffffffff) {
-		DBGA2("pci64_unmap_single: DAC [%lx,%lx] from %p\n",
+		DBGA2("pci64_unmap_single: DAC [%llx,%zx] from %p\n",
 		      dma_addr, size, __builtin_return_address(0));
 		return;
 	}
@@ -373,9 +373,9 @@ pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr, size_t size,
 
 	dma_ofs = (dma_addr - arena->dma_base) >> PAGE_SHIFT;
 	if (dma_ofs * PAGE_SIZE >= arena->size) {
-		printk(KERN_ERR "Bogus pci_unmap_single: dma_addr %lx "
-		       " base %lx size %x\n", dma_addr, arena->dma_base,
-		       arena->size);
+		printk(KERN_ERR "Bogus pci_unmap_single: dma_addr %llx "
+		       " base %llx size %x\n",
+		       dma_addr, arena->dma_base, arena->size);
 		return;
 		BUG();
 	}
@@ -394,7 +394,7 @@ pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr, size_t size,
 
 	spin_unlock_irqrestore(&arena->lock, flags);
 
-	DBGA2("pci_unmap_single: sg [%lx,%lx] np %ld from %p\n",
+	DBGA2("pci_unmap_single: sg [%llx,%zx] np %ld from %p\n",
 	      dma_addr, size, npages, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(pci_unmap_single);
@@ -444,7 +444,7 @@ try_again:
 		goto try_again;
 	}
 		
-	DBGA2("pci_alloc_consistent: %lx -> [%p,%x] from %p\n",
+	DBGA2("pci_alloc_consistent: %zx -> [%p,%llx] from %p\n",
 	      size, cpu_addr, *dma_addrp, __builtin_return_address(0));
 
 	return cpu_addr;
@@ -464,7 +464,7 @@ pci_free_consistent(struct pci_dev *pdev, size_t size, void *cpu_addr,
 	pci_unmap_single(pdev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
 	free_pages((unsigned long)cpu_addr, get_order(size));
 
-	DBGA2("pci_free_consistent: [%x,%lx] from %p\n",
+	DBGA2("pci_free_consistent: [%llx,%zx] from %p\n",
 	      dma_addr, size, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(pci_free_consistent);
@@ -551,7 +551,7 @@ sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end,
 		out->dma_address = paddr + __direct_map_base;
 		out->dma_length = size;
 
-		DBGA("    sg_fill: [%p,%lx] -> direct %lx\n",
+		DBGA("    sg_fill: [%p,%lx] -> direct %llx\n",
 		     __va(paddr), size, out->dma_address);
 
 		return 0;
@@ -563,7 +563,7 @@ sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end,
 		out->dma_address = paddr + alpha_mv.pci_dac_offset;
 		out->dma_length = size;
 
-		DBGA("    sg_fill: [%p,%lx] -> DAC %lx\n",
+		DBGA("    sg_fill: [%p,%lx] -> DAC %llx\n",
 		     __va(paddr), size, out->dma_address);
 
 		return 0;
@@ -589,7 +589,7 @@ sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end,
 	out->dma_address = arena->dma_base + dma_ofs*PAGE_SIZE + paddr;
 	out->dma_length = size;
 
-	DBGA("    sg_fill: [%p,%lx] -> sg %lx np %ld\n",
+	DBGA("    sg_fill: [%p,%lx] -> sg %llx np %ld\n",
 	     __va(paddr), size, out->dma_address, npages);
 
 	/* All virtually contiguous.  We need to find the length of each
@@ -752,7 +752,7 @@ pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents,
 
 		if (addr > 0xffffffff) {
 			/* It's a DAC address -- nothing to do.  */
-			DBGA("    (%ld) DAC [%lx,%lx]\n",
+			DBGA("    (%ld) DAC [%llx,%zx]\n",
 			      sg - end + nents, addr, size);
 			continue;
 		}
@@ -760,12 +760,12 @@ pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents,
 		if (addr >= __direct_map_base
 		    && addr < __direct_map_base + __direct_map_size) {
 			/* Nothing to do.  */
-			DBGA("    (%ld) direct [%lx,%lx]\n",
+			DBGA("    (%ld) direct [%llx,%zx]\n",
 			      sg - end + nents, addr, size);
 			continue;
 		}
 
-		DBGA("    (%ld) sg [%lx,%lx]\n",
+		DBGA("    (%ld) sg [%llx,%zx]\n",
 		     sg - end + nents, addr, size);
 
 		npages = iommu_num_pages(addr, size, PAGE_SIZE);
diff --git a/arch/alpha/kernel/proto.h b/arch/alpha/kernel/proto.h
index fe14c6747cd..567f2598d09 100644
--- a/arch/alpha/kernel/proto.h
+++ b/arch/alpha/kernel/proto.h
@@ -20,7 +20,7 @@ struct pci_controller;
 extern struct pci_ops apecs_pci_ops;
 extern void apecs_init_arch(void);
 extern void apecs_pci_clr_err(void);
-extern void apecs_machine_check(u64, u64);
+extern void apecs_machine_check(unsigned long vector, unsigned long la_ptr);
 extern void apecs_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t);
 
 /* core_cia.c */
@@ -29,7 +29,7 @@ extern void cia_init_pci(void);
 extern void cia_init_arch(void);
 extern void pyxis_init_arch(void);
 extern void cia_kill_arch(int);
-extern void cia_machine_check(u64, u64);
+extern void cia_machine_check(unsigned long vector, unsigned long la_ptr);
 extern void cia_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t);
 
 /* core_irongate.c */
@@ -42,7 +42,7 @@ extern void irongate_machine_check(u64, u64);
 /* core_lca.c */
 extern struct pci_ops lca_pci_ops;
 extern void lca_init_arch(void);
-extern void lca_machine_check(u64, u64);
+extern void lca_machine_check(unsigned long vector, unsigned long la_ptr);
 extern void lca_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t);
 
 /* core_marvel.c */
@@ -64,7 +64,7 @@ void io7_clear_errors(struct io7 *io7);
 extern struct pci_ops mcpcia_pci_ops;
 extern void mcpcia_init_arch(void);
 extern void mcpcia_init_hoses(void);
-extern void mcpcia_machine_check(u64, u64);
+extern void mcpcia_machine_check(unsigned long vector, unsigned long la_ptr);
 extern void mcpcia_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t);
 
 /* core_polaris.c */
@@ -72,14 +72,14 @@ extern struct pci_ops polaris_pci_ops;
 extern int polaris_read_config_dword(struct pci_dev *, int, u32 *);
 extern int polaris_write_config_dword(struct pci_dev *, int, u32);
 extern void polaris_init_arch(void);
-extern void polaris_machine_check(u64, u64);
+extern void polaris_machine_check(unsigned long vector, unsigned long la_ptr);
 #define polaris_pci_tbi ((void *)0)
 
 /* core_t2.c */
 extern struct pci_ops t2_pci_ops;
 extern void t2_init_arch(void);
 extern void t2_kill_arch(int);
-extern void t2_machine_check(u64, u64);
+extern void t2_machine_check(unsigned long vector, unsigned long la_ptr);
 extern void t2_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t);
 
 /* core_titan.c */
@@ -94,14 +94,14 @@ extern struct _alpha_agp_info *titan_agp_info(void);
 extern struct pci_ops tsunami_pci_ops;
 extern void tsunami_init_arch(void);
 extern void tsunami_kill_arch(int);
-extern void tsunami_machine_check(u64, u64);
+extern void tsunami_machine_check(unsigned long vector, unsigned long la_ptr);
 extern void tsunami_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t);
 
 /* core_wildfire.c */
 extern struct pci_ops wildfire_pci_ops;
 extern void wildfire_init_arch(void);
 extern void wildfire_kill_arch(int);
-extern void wildfire_machine_check(u64, u64);
+extern void wildfire_machine_check(unsigned long vector, unsigned long la_ptr);
 extern void wildfire_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t);
 extern int wildfire_pa_to_nid(unsigned long);
 extern int wildfire_cpuid_to_nid(int);
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 02bee6983ce..80df86cd746 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -1255,7 +1255,7 @@ show_cpuinfo(struct seq_file *f, void *slot)
 		       platform_string(), nr_processors);
 
 #ifdef CONFIG_SMP
-	seq_printf(f, "cpus active\t\t: %d\n"
+	seq_printf(f, "cpus active\t\t: %u\n"
 		      "cpu active mask\t\t: %016lx\n",
 		       num_online_cpus(), cpus_addr(cpu_possible_map)[0]);
 #endif
diff --git a/arch/alpha/kernel/smc37c669.c b/arch/alpha/kernel/smc37c669.c
index fd467b207f0..bca5bda90cd 100644
--- a/arch/alpha/kernel/smc37c669.c
+++ b/arch/alpha/kernel/smc37c669.c
@@ -2542,8 +2542,8 @@ void __init SMC669_Init ( int index )
         SMC37c669_display_device_info( );
 #endif
 	local_irq_restore(flags);
-        printk( "SMC37c669 Super I/O Controller found @ 0x%lx\n",
-		(unsigned long) SMC_base );
+        printk( "SMC37c669 Super I/O Controller found @ 0x%p\n",
+		SMC_base );
     }
     else {
 	local_irq_restore(flags);
diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c
index e2516f9a896..2b5caf3d9b1 100644
--- a/arch/alpha/kernel/sys_jensen.c
+++ b/arch/alpha/kernel/sys_jensen.c
@@ -244,12 +244,11 @@ jensen_init_arch(void)
 }
 
 static void
-jensen_machine_check (u64 vector, u64 la)
+jensen_machine_check(unsigned long vector, unsigned long la)
 {
 	printk(KERN_CRIT "Machine check\n");
 }
 
-
 /*
  * The System Vector
  */
diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c
index d232e42be01..9e263256a42 100644
--- a/arch/alpha/kernel/sys_sable.c
+++ b/arch/alpha/kernel/sys_sable.c
@@ -453,7 +453,7 @@ sable_lynx_enable_irq(unsigned int irq)
 	sable_lynx_irq_swizzle->update_irq_hw(bit, mask);
 	spin_unlock(&sable_lynx_irq_lock);
 #if 0
-	printk("%s: mask 0x%lx bit 0x%x irq 0x%x\n",
+	printk("%s: mask 0x%lx bit 0x%lx irq 0x%x\n",
 	       __func__, mask, bit, irq);
 #endif
 }
@@ -469,7 +469,7 @@ sable_lynx_disable_irq(unsigned int irq)
 	sable_lynx_irq_swizzle->update_irq_hw(bit, mask);
 	spin_unlock(&sable_lynx_irq_lock);
 #if 0
-	printk("%s: mask 0x%lx bit 0x%x irq 0x%x\n",
+	printk("%s: mask 0x%lx bit 0x%lx irq 0x%x\n",
 	       __func__, mask, bit, irq);
 #endif
 }
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index cefc5a355ef..6ee7655b756 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -623,7 +623,7 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg,
 	}
 
 	lock_kernel();
-	printk("Bad unaligned kernel access at %016lx: %p %lx %ld\n",
+	printk("Bad unaligned kernel access at %016lx: %p %lx %lu\n",
 		pc, va, opcode, reg);
 	do_exit(SIGSEGV);
 
-- 
cgit v1.2.3


From a8af78982ff4c0b3731527b0217d286a343a3089 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 31 Mar 2009 15:23:37 -0700
Subject: pm: rework includes, remove arch ifdefs

Make the following header file changes:

 - remove arch ifdefs and asm/suspend.h from linux/suspend.h
 - add asm/suspend.h to disk.c (for arch_prepare_suspend())
 - add linux/io.h to swsusp.c (for ioremap())
 - x86 32/64 bit compile fixes

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/asm-offsets_32.c | 1 +
 arch/x86/kernel/asm-offsets_64.c | 1 +
 arch/x86/power/cpu_32.c          | 1 +
 arch/x86/power/cpu_64.c          | 1 +
 arch/x86/power/hibernate_64.c    | 1 +
 include/linux/suspend.h          | 3 ---
 kernel/power/disk.c              | 1 +
 kernel/power/swsusp.c            | 1 +
 8 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index fbf2f33e308..5a6aa1c1162 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -18,6 +18,7 @@
 #include <asm/thread_info.h>
 #include <asm/bootparam.h>
 #include <asm/elf.h>
+#include <asm/suspend.h>
 
 #include <xen/interface/xen.h>
 
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 8793ab33e2c..e72f062fb4b 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -16,6 +16,7 @@
 #include <asm/thread_info.h>
 #include <asm/ia32.h>
 #include <asm/bootparam.h>
+#include <asm/suspend.h>
 
 #include <xen/interface/xen.h>
 
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index 274d06082f4..ce702c5b3a2 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -12,6 +12,7 @@
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/xcr.h>
+#include <asm/suspend.h>
 
 static struct saved_context saved_context;
 
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c
index e3b6cf70d62..5343540f260 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -15,6 +15,7 @@
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
 #include <asm/xcr.h>
+#include <asm/suspend.h>
 
 static void fix_processor_context(void);
 
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 6dd000dd793..65fdc86e923 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -14,6 +14,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
+#include <asm/suspend.h>
 
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index c7d9bb1832b..3e3a4364cbf 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -1,9 +1,6 @@
 #ifndef _LINUX_SUSPEND_H
 #define _LINUX_SUSPEND_H
 
-#if defined(CONFIG_X86) || defined(CONFIG_FRV) || defined(CONFIG_PPC32) || defined(CONFIG_PPC64)
-#include <asm/suspend.h>
-#endif
 #include <linux/swap.h>
 #include <linux/notifier.h>
 #include <linux/init.h>
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e886d1332a1..f3db382c2b2 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <asm/suspend.h>
 
 #include "power.h"
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 1ee6636414b..78c35047586 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -51,6 +51,7 @@
 #include <linux/highmem.h>
 #include <linux/time.h>
 #include <linux/rbtree.h>
+#include <linux/io.h>
 
 #include "power.h"
 
-- 
cgit v1.2.3


From bf9ed57d35d64dac5d5651478b5530a89b20ea1e Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 31 Mar 2009 15:23:38 -0700
Subject: pm: cleanup includes

Remove unused/duplicate cruft from asm/suspend.h:

 - x86_32: remove unused acpi code
 - powerpc: remove duplicate prototypes, see linux/suspend.h

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/include/asm/suspend.h |  3 ---
 arch/x86/include/asm/suspend_32.h  | 24 ------------------------
 2 files changed, 27 deletions(-)

diff --git a/arch/powerpc/include/asm/suspend.h b/arch/powerpc/include/asm/suspend.h
index cbf2c9404c3..c6efc3466aa 100644
--- a/arch/powerpc/include/asm/suspend.h
+++ b/arch/powerpc/include/asm/suspend.h
@@ -3,7 +3,4 @@
 
 static inline int arch_prepare_suspend(void) { return 0; }
 
-void save_processor_state(void);
-void restore_processor_state(void);
-
 #endif /* __ASM_POWERPC_SUSPEND_H */
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index a5074bd0f8b..48dcfa62ea0 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -24,28 +24,4 @@ struct saved_context {
 	unsigned long return_address;
 } __attribute__((packed));
 
-#ifdef CONFIG_ACPI
-extern unsigned long saved_eip;
-extern unsigned long saved_esp;
-extern unsigned long saved_ebp;
-extern unsigned long saved_ebx;
-extern unsigned long saved_esi;
-extern unsigned long saved_edi;
-
-static inline void acpi_save_register_state(unsigned long return_point)
-{
-	saved_eip = return_point;
-	asm volatile("movl %%esp,%0" : "=m" (saved_esp));
-	asm volatile("movl %%ebp,%0" : "=m" (saved_ebp));
-	asm volatile("movl %%ebx,%0" : "=m" (saved_ebx));
-	asm volatile("movl %%edi,%0" : "=m" (saved_edi));
-	asm volatile("movl %%esi,%0" : "=m" (saved_esi));
-}
-
-#define acpi_restore_register_state()  do {} while (0)
-
-/* routines for saving/restoring kernel state */
-extern int acpi_save_state_mem(void);
-#endif
-
 #endif /* _ASM_X86_SUSPEND_32_H */
-- 
cgit v1.2.3


From 792dd4fc317e1f94a6af111a0979c1c0d8c14453 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 31 Mar 2009 15:23:39 -0700
Subject: ubd: stop defintining MAJOR_NR

MAJOR_NR isn't needed anymore since very early 2.5 kernels.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/ubd_kern.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 0a868118cf0..d42f826a8ab 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -17,7 +17,6 @@
  * James McMechan
  */
 
-#define MAJOR_NR UBD_MAJOR
 #define UBD_SHIFT 4
 
 #include "linux/kernel.h"
@@ -115,7 +114,7 @@ static struct block_device_operations ubd_blops = {
 };
 
 /* Protected by ubd_lock */
-static int fake_major = MAJOR_NR;
+static int fake_major = UBD_MAJOR;
 static struct gendisk *ubd_gendisk[MAX_DEV];
 static struct gendisk *fake_gendisk[MAX_DEV];
 
@@ -299,7 +298,7 @@ static int ubd_setup_common(char *str, int *index_out, char **error_out)
 		}
 
 		mutex_lock(&ubd_lock);
-		if(fake_major != MAJOR_NR){
+		if (fake_major != UBD_MAJOR) {
 			*error_out = "Can't assign a fake major twice";
 			goto out1;
 		}
@@ -818,13 +817,13 @@ static int ubd_disk_register(int major, u64 size, int unit,
 	disk->first_minor = unit << UBD_SHIFT;
 	disk->fops = &ubd_blops;
 	set_capacity(disk, size / 512);
-	if(major == MAJOR_NR)
+	if (major == UBD_MAJOR)
 		sprintf(disk->disk_name, "ubd%c", 'a' + unit);
 	else
 		sprintf(disk->disk_name, "ubd_fake%d", unit);
 
 	/* sysfs register (not for ide fake devices) */
-	if (major == MAJOR_NR) {
+	if (major == UBD_MAJOR) {
 		ubd_devs[unit].pdev.id   = unit;
 		ubd_devs[unit].pdev.name = DRIVER_NAME;
 		ubd_devs[unit].pdev.dev.release = ubd_device_release;
@@ -871,13 +870,13 @@ static int ubd_add(int n, char **error_out)
 	ubd_dev->queue->queuedata = ubd_dev;
 
 	blk_queue_max_hw_segments(ubd_dev->queue, MAX_SG);
-	err = ubd_disk_register(MAJOR_NR, ubd_dev->size, n, &ubd_gendisk[n]);
+	err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]);
 	if(err){
 		*error_out = "Failed to register device";
 		goto out_cleanup;
 	}
 
-	if(fake_major != MAJOR_NR)
+	if (fake_major != UBD_MAJOR)
 		ubd_disk_register(fake_major, ubd_dev->size, n,
 				  &fake_gendisk[n]);
 
@@ -1059,10 +1058,10 @@ static int __init ubd_init(void)
 	char *error;
 	int i, err;
 
-	if (register_blkdev(MAJOR_NR, "ubd"))
+	if (register_blkdev(UBD_MAJOR, "ubd"))
 		return -1;
 
-	if (fake_major != MAJOR_NR) {
+	if (fake_major != UBD_MAJOR) {
 		char name[sizeof("ubd_nnn\0")];
 
 		snprintf(name, sizeof(name), "ubd_%d", fake_major);
-- 
cgit v1.2.3


From dc71768742b39bca298e9ca6c91e575cd4b140e6 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Tue, 31 Mar 2009 15:23:40 -0700
Subject: uml: don't use a too long string literal

uml uses a concatenated string literal to store the contents of .config,
but .config file content is varaible, it can be very long.

Use an array of string literals instead.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/Makefile    | 6 +++---
 arch/um/kernel/config.c.in | 8 ++++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index 499e5e95e60..388ec0a3ea9 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -28,7 +28,7 @@ $(obj)/config.tmp: $(objtree)/.config FORCE
 	$(call if_changed,quote1)
 
 quiet_cmd_quote1 = QUOTE   $@
-      cmd_quote1 = sed -e 's/"/\\"/g' -e 's/^/"/' -e 's/$$/\\n"/' \
+      cmd_quote1 = sed -e 's/"/\\"/g' -e 's/^/"/' -e 's/$$/\\n",/' \
 		   $< > $@
 
 $(obj)/config.c: $(src)/config.c.in $(obj)/config.tmp FORCE
@@ -36,9 +36,9 @@ $(obj)/config.c: $(src)/config.c.in $(obj)/config.tmp FORCE
 
 quiet_cmd_quote2 = QUOTE   $@
       cmd_quote2 = sed -e '/CONFIG/{'          \
-		  -e 's/"CONFIG"\;/""/'        \
+		  -e 's/"CONFIG"//'            \
 		  -e 'r $(obj)/config.tmp'     \
 		  -e 'a \'                     \
-		  -e '""\;'                    \
+		  -e '""'                      \
 		  -e '}'                       \
 		  $< > $@
diff --git a/arch/um/kernel/config.c.in b/arch/um/kernel/config.c.in
index c062cbfe386..bee154d1ba7 100644
--- a/arch/um/kernel/config.c.in
+++ b/arch/um/kernel/config.c.in
@@ -7,11 +7,15 @@
 #include <stdlib.h>
 #include "init.h"
 
-static __initdata char *config = "CONFIG";
+static __initdata const char *config[] = {
+"CONFIG"
+};
 
 static int __init print_config(char *line, int *add)
 {
-	printf("%s", config);
+	int i;
+	for (i = 0; i < sizeof(config)/sizeof(config[0]); i++)
+		printf("%s", config[i]);
 	exit(0);
 }
 
-- 
cgit v1.2.3


From 5062910a06ee979002edbf58ab65481c81242df4 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Tue, 31 Mar 2009 15:23:41 -0700
Subject: uml: improve error messages

These error messages are from check_sysemu(), not check_ptrace().

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/start_up.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index 183db26d01b..02ee9adff54 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -244,7 +244,7 @@ static void __init check_sysemu(void)
 
 	if ((ptrace(PTRACE_OLDSETOPTIONS, pid, 0,
 		   (void *) PTRACE_O_TRACESYSGOOD) < 0))
-		fatal_perror("check_ptrace: PTRACE_OLDSETOPTIONS failed");
+		fatal_perror("check_sysemu: PTRACE_OLDSETOPTIONS failed");
 
 	while (1) {
 		count++;
@@ -252,12 +252,12 @@ static void __init check_sysemu(void)
 			goto fail;
 		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
 		if (n < 0)
-			fatal_perror("check_ptrace : wait failed");
+			fatal_perror("check_sysemu: wait failed");
 
 		if (WIFSTOPPED(status) &&
 		    (WSTOPSIG(status) == (SIGTRAP|0x80))) {
 			if (!count) {
-				non_fatal("check_ptrace : SYSEMU_SINGLESTEP "
+				non_fatal("check_sysemu: SYSEMU_SINGLESTEP "
 					  "doesn't singlestep");
 				goto fail;
 			}
@@ -271,7 +271,7 @@ static void __init check_sysemu(void)
 		else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP))
 			count++;
 		else {
-			non_fatal("check_ptrace : expected SIGTRAP or "
+			non_fatal("check_sysemu: expected SIGTRAP or "
 				  "(SIGTRAP | 0x80), got status = %d\n",
 				  status);
 			goto fail;
-- 
cgit v1.2.3


From 65bd6a9bc7be3f5841dad12f77ce4b3210bd82c5 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Tue, 31 Mar 2009 15:23:41 -0700
Subject: uml: remove useless comments

These comments are useless now, remove them.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/pcap_user.h                | 10 ----------
 arch/um/drivers/port.h                     | 10 ----------
 arch/um/drivers/ssl.h                      | 10 ----------
 arch/um/drivers/stdio_console.h            | 10 ----------
 arch/um/drivers/xterm.h                    | 10 ----------
 arch/um/include/asm/irq_vectors.h          | 10 ----------
 arch/um/include/asm/mmu.h                  | 10 ----------
 arch/um/include/asm/pda.h                  | 10 ----------
 arch/um/include/asm/pgalloc.h              | 10 ----------
 arch/um/include/asm/pgtable-3level.h       | 10 ----------
 arch/um/include/shared/frame_kern.h        | 10 ----------
 arch/um/include/shared/initrd.h            | 10 ----------
 arch/um/include/shared/irq_kern.h          | 10 ----------
 arch/um/include/shared/mem_kern.h          | 10 ----------
 arch/um/include/shared/ubd_user.h          | 10 ----------
 arch/um/kernel/config.c.in                 | 10 ----------
 arch/um/sys-i386/asm/archparam.h           | 10 ----------
 arch/um/sys-i386/shared/sysdep/checksum.h  | 10 ----------
 arch/um/sys-ia64/sysdep/ptrace.h           | 10 ----------
 arch/um/sys-ia64/sysdep/sigcontext.h       | 10 ----------
 arch/um/sys-ia64/sysdep/syscalls.h         | 10 ----------
 arch/um/sys-ppc/miscthings.c               | 11 -----------
 arch/um/sys-ppc/ptrace.c                   | 10 ----------
 arch/um/sys-ppc/ptrace_user.c              | 10 ----------
 arch/um/sys-ppc/shared/sysdep/ptrace.h     | 10 ----------
 arch/um/sys-ppc/shared/sysdep/sigcontext.h | 10 ----------
 arch/um/sys-ppc/shared/sysdep/syscalls.h   | 10 ----------
 arch/um/sys-ppc/sigcontext.c               | 10 ----------
 arch/um/sys-x86_64/asm/archparam.h         | 10 ----------
 arch/um/sys-x86_64/asm/module.h            | 10 ----------
 arch/um/sys-x86_64/mem.c                   |  9 ---------
 31 files changed, 310 deletions(-)

diff --git a/arch/um/drivers/pcap_user.h b/arch/um/drivers/pcap_user.h
index 96b80b565ee..d8ba6153f91 100644
--- a/arch/um/drivers/pcap_user.h
+++ b/arch/um/drivers/pcap_user.h
@@ -19,13 +19,3 @@ extern const struct net_user_info pcap_user_info;
 
 extern int pcap_user_read(int fd, void *buf, int len, struct pcap_data *pri);
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/drivers/port.h b/arch/um/drivers/port.h
index 9117609a575..372a80c0556 100644
--- a/arch/um/drivers/port.h
+++ b/arch/um/drivers/port.h
@@ -18,13 +18,3 @@ extern void port_remove_dev(void *d);
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/drivers/ssl.h b/arch/um/drivers/ssl.h
index 98412aa6660..314d17725ce 100644
--- a/arch/um/drivers/ssl.h
+++ b/arch/um/drivers/ssl.h
@@ -11,13 +11,3 @@ extern void ssl_receive_char(int line, char ch);
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/drivers/stdio_console.h b/arch/um/drivers/stdio_console.h
index 505a3d5bea5..6d8275f71fd 100644
--- a/arch/um/drivers/stdio_console.h
+++ b/arch/um/drivers/stdio_console.h
@@ -9,13 +9,3 @@
 extern void save_console_flags(void);
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/drivers/xterm.h b/arch/um/drivers/xterm.h
index f33a6e77b18..56b9c4aba42 100644
--- a/arch/um/drivers/xterm.h
+++ b/arch/um/drivers/xterm.h
@@ -10,13 +10,3 @@ extern int xterm_fd(int socket, int *pid_out);
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/asm/irq_vectors.h b/arch/um/include/asm/irq_vectors.h
index 62ddba6fc73..272a81e0ce1 100644
--- a/arch/um/include/asm/irq_vectors.h
+++ b/arch/um/include/asm/irq_vectors.h
@@ -8,13 +8,3 @@
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h
index 2cf35c21d69..cf259de5153 100644
--- a/arch/um/include/asm/mmu.h
+++ b/arch/um/include/asm/mmu.h
@@ -10,13 +10,3 @@
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/asm/pda.h b/arch/um/include/asm/pda.h
index 0d8bf33ffd4..ddcd774fc2a 100644
--- a/arch/um/include/asm/pda.h
+++ b/arch/um/include/asm/pda.h
@@ -19,13 +19,3 @@ extern struct foo me;
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h
index 9062a6e7224..718984359f8 100644
--- a/arch/um/include/asm/pgalloc.h
+++ b/arch/um/include/asm/pgalloc.h
@@ -60,13 +60,3 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h
index 0446f456b42..084de4a9fc7 100644
--- a/arch/um/include/asm/pgtable-3level.h
+++ b/arch/um/include/asm/pgtable-3level.h
@@ -134,13 +134,3 @@ static inline pmd_t pfn_pmd(pfn_t page_nr, pgprot_t pgprot)
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/shared/frame_kern.h b/arch/um/include/shared/frame_kern.h
index ce9514f5721..76078490c25 100644
--- a/arch/um/include/shared/frame_kern.h
+++ b/arch/um/include/shared/frame_kern.h
@@ -20,13 +20,3 @@ extern int setup_signal_stack_si(unsigned long stack_top, int sig,
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/shared/initrd.h b/arch/um/include/shared/initrd.h
index 439b9a81498..22673bcc273 100644
--- a/arch/um/include/shared/initrd.h
+++ b/arch/um/include/shared/initrd.h
@@ -10,13 +10,3 @@ extern int load_initrd(char *filename, void *buf, int size);
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/shared/irq_kern.h b/arch/um/include/shared/irq_kern.h
index fba3895274f..b05d22f3d84 100644
--- a/arch/um/include/shared/irq_kern.h
+++ b/arch/um/include/shared/irq_kern.h
@@ -16,13 +16,3 @@ extern int um_request_irq(unsigned int irq, int fd, int type,
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/shared/mem_kern.h b/arch/um/include/shared/mem_kern.h
index cb7e196d366..69be0fd0ce4 100644
--- a/arch/um/include/shared/mem_kern.h
+++ b/arch/um/include/shared/mem_kern.h
@@ -18,13 +18,3 @@ extern void register_remapper(struct remapper *info);
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/shared/ubd_user.h b/arch/um/include/shared/ubd_user.h
index bb66517f073..3845051f1b1 100644
--- a/arch/um/include/shared/ubd_user.h
+++ b/arch/um/include/shared/ubd_user.h
@@ -14,13 +14,3 @@ extern int kernel_fd;
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/kernel/config.c.in b/arch/um/kernel/config.c.in
index bee154d1ba7..b7a43feafde 100644
--- a/arch/um/kernel/config.c.in
+++ b/arch/um/kernel/config.c.in
@@ -24,13 +24,3 @@ __uml_setup("--showconfig", print_config,
 "    Prints the config file that this UML binary was generated from.\n\n"
 );
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-i386/asm/archparam.h b/arch/um/sys-i386/asm/archparam.h
index 93fd723344e..2a18a884ca1 100644
--- a/arch/um/sys-i386/asm/archparam.h
+++ b/arch/um/sys-i386/asm/archparam.h
@@ -14,13 +14,3 @@
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-i386/shared/sysdep/checksum.h b/arch/um/sys-i386/shared/sysdep/checksum.h
index 0cb4645cbeb..ed47445f390 100644
--- a/arch/um/sys-i386/shared/sysdep/checksum.h
+++ b/arch/um/sys-i386/shared/sysdep/checksum.h
@@ -199,13 +199,3 @@ static __inline__ __wsum csum_and_copy_to_user(const void *src,
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-ia64/sysdep/ptrace.h b/arch/um/sys-ia64/sysdep/ptrace.h
index 42dd8fb6f2f..0f0f4e6fd33 100644
--- a/arch/um/sys-ia64/sysdep/ptrace.h
+++ b/arch/um/sys-ia64/sysdep/ptrace.h
@@ -14,13 +14,3 @@ struct sys_pt_regs {
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-ia64/sysdep/sigcontext.h b/arch/um/sys-ia64/sysdep/sigcontext.h
index f15fb25260b..76b43161e77 100644
--- a/arch/um/sys-ia64/sysdep/sigcontext.h
+++ b/arch/um/sys-ia64/sysdep/sigcontext.h
@@ -8,13 +8,3 @@
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-ia64/sysdep/syscalls.h b/arch/um/sys-ia64/sysdep/syscalls.h
index 4a1f46ef1eb..5f6700c4155 100644
--- a/arch/um/sys-ia64/sysdep/syscalls.h
+++ b/arch/um/sys-ia64/sysdep/syscalls.h
@@ -8,13 +8,3 @@
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-ppc/miscthings.c b/arch/um/sys-ppc/miscthings.c
index 373061c5012..1c11aed9c71 100644
--- a/arch/um/sys-ppc/miscthings.c
+++ b/arch/um/sys-ppc/miscthings.c
@@ -40,14 +40,3 @@ void shove_aux_table(unsigned long sp)
 }
 /* END stuff taken from arch/ppc/kernel/process.c */
 
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-ppc/ptrace.c b/arch/um/sys-ppc/ptrace.c
index 8e71b47f2b8..66ef155248f 100644
--- a/arch/um/sys-ppc/ptrace.c
+++ b/arch/um/sys-ppc/ptrace.c
@@ -56,13 +56,3 @@ int peek_user(struct task_struct *child, long addr, long data)
 	return put_user(tmp, (unsigned long *) data);
 }
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-ppc/ptrace_user.c b/arch/um/sys-ppc/ptrace_user.c
index ff0b9c077a1..224d2403c37 100644
--- a/arch/um/sys-ppc/ptrace_user.c
+++ b/arch/um/sys-ppc/ptrace_user.c
@@ -27,13 +27,3 @@ int ptrace_setregs(long pid, unsigned long *regs_in)
     }
     return 0;
 }
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-ppc/shared/sysdep/ptrace.h b/arch/um/sys-ppc/shared/sysdep/ptrace.h
index df2397dba3e..0e3230e937e 100644
--- a/arch/um/sys-ppc/shared/sysdep/ptrace.h
+++ b/arch/um/sys-ppc/shared/sysdep/ptrace.h
@@ -91,13 +91,3 @@ extern void shove_aux_table(unsigned long sp);
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-ppc/shared/sysdep/sigcontext.h b/arch/um/sys-ppc/shared/sysdep/sigcontext.h
index f20d965de9c..b7286f0a1e0 100644
--- a/arch/um/sys-ppc/shared/sysdep/sigcontext.h
+++ b/arch/um/sys-ppc/shared/sysdep/sigcontext.h
@@ -50,13 +50,3 @@
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-ppc/shared/sysdep/syscalls.h b/arch/um/sys-ppc/shared/sysdep/syscalls.h
index 679df351e19..1ff81552251 100644
--- a/arch/um/sys-ppc/shared/sysdep/syscalls.h
+++ b/arch/um/sys-ppc/shared/sysdep/syscalls.h
@@ -41,13 +41,3 @@ int old_mmap(unsigned long addr, unsigned long len,
 
 #define LAST_ARCH_SYSCALL __NR_fadvise64
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-ppc/sigcontext.c b/arch/um/sys-ppc/sigcontext.c
index 4bdc15c89ed..40694d0f3d1 100644
--- a/arch/um/sys-ppc/sigcontext.c
+++ b/arch/um/sys-ppc/sigcontext.c
@@ -2,13 +2,3 @@
 #include "asm/sigcontext.h"
 #include "sysdep/ptrace.h"
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-x86_64/asm/archparam.h b/arch/um/sys-x86_64/asm/archparam.h
index 270ed9586b6..6c083663b8d 100644
--- a/arch/um/sys-x86_64/asm/archparam.h
+++ b/arch/um/sys-x86_64/asm/archparam.h
@@ -14,13 +14,3 @@
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-x86_64/asm/module.h b/arch/um/sys-x86_64/asm/module.h
index 35b5491d3e9..8eb79c2d07d 100644
--- a/arch/um/sys-x86_64/asm/module.h
+++ b/arch/um/sys-x86_64/asm/module.h
@@ -18,13 +18,3 @@ struct mod_arch_specific
 
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/sys-x86_64/mem.c b/arch/um/sys-x86_64/mem.c
index 3f59a0a4f15..3f8df8abf34 100644
--- a/arch/um/sys-x86_64/mem.c
+++ b/arch/um/sys-x86_64/mem.c
@@ -14,12 +14,3 @@ unsigned long vm_data_default_flags = __VM_DATA_DEFAULT_FLAGS;
 unsigned long vm_data_default_flags32 = __VM_DATA_DEFAULT_FLAGS;
 unsigned long vm_force_exec32 = PROT_EXEC;
 
-/* Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
-- 
cgit v1.2.3


From 53d6660836f233df66490707365ab177e5fb2bb4 Mon Sep 17 00:00:00 2001
From: "J. R. Okajima" <hooanon05@yahoo.co.jp>
Date: Tue, 31 Mar 2009 15:23:43 -0700
Subject: loop: add ioctl to resize a loop device

Add the ability to 'resize' the loop device on the fly.

One practical application is a loop file with XFS filesystem, already
mounted: You can easily enlarge the file (append some bytes) and then call
ioctl(fd, LOOP_SET_CAPACITY, new); The loop driver will learn about the
new size and you can use xfs_growfs later on, which will allow you to use
full capacity of the loop file without the need to unmount.

Test app:

#include <linux/fs.h>
#include <linux/loop.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define _GNU_SOURCE
#include <getopt.h>

char *me;

void usage(FILE *f)
{
	fprintf(f, "%s [options] loop_dev [backend_file]\n"
		"-s, --set new_size_in_bytes\n"
		"\twhen backend_file is given, "
		"it will be expanded too while keeping the original contents\n",
		me);
}

struct option opts[] = {
	{
		.name		= "set",
		.has_arg	= 1,
		.flag		= NULL,
		.val		= 's'
	},
	{
		.name		= "help",
		.has_arg	= 0,
		.flag		= NULL,
		.val		= 'h'
	}
};

void err_size(char *name, __u64 old)
{
	fprintf(stderr, "size must be larger than current %s (%llu)\n",
		name, old);
}

int main(int argc, char *argv[])
{
	int fd, err, c, i, bfd;
	ssize_t ssz;
	size_t sz;
	__u64 old, new, append;
	char a[BUFSIZ];
	struct stat st;
	FILE *out;
	char *backend, *dev;

	err = EINVAL;
	out = stderr;
	me = argv[0];
	new = 0;
	while ((c = getopt_long(argc, argv, "s:h", opts, &i)) != -1) {
		switch (c) {
		case 's':
			errno = 0;
			new = strtoull(optarg, NULL, 0);
			if (errno) {
				err = errno;
				perror(argv[i]);
				goto out;
			}
			break;

		case 'h':
			err = 0;
			out = stdout;
			goto err;

		default:
			perror(argv[i]);
			goto err;
		}
	}

	if (optind < argc)
		dev = argv[optind++];
	else
		goto err;

	fd = open(dev, O_RDONLY);
	if (fd < 0) {
		err = errno;
		perror(dev);
		goto out;
	}

	err = ioctl(fd, BLKGETSIZE64, &old);
	if (err) {
		err = errno;
		perror("ioctl BLKGETSIZE64");
		goto out;
	}

	if (!new) {
		printf("%llu\n", old);
		goto out;
	}

	if (new < old) {
		err = EINVAL;
		err_size(dev, old);
		goto out;
	}

	if (optind < argc) {
		backend = argv[optind++];
		bfd = open(backend, O_WRONLY|O_APPEND);
		if (bfd < 0) {
			err = errno;
			perror(backend);
			goto out;
		}
		err = fstat(bfd, &st);
		if (err) {
			err = errno;
			perror(backend);
			goto out;
		}
		if (new < st.st_size) {
			err = EINVAL;
			err_size(backend, st.st_size);
			goto out;
		}
		append = new - st.st_size;
		sz = sizeof(a);
		while (append > 0) {
			if (append < sz)
				sz = append;
			ssz = write(bfd, a, sz);
			if (ssz != sz) {
				err = errno;
				perror(backend);
				goto out;
			}
			append -= sz;
		}
		err = fsync(bfd);
		if (err) {
			err = errno;
			perror(backend);
			goto out;
		}
	}

	err = ioctl(fd, LOOP_SET_CAPACITY, new);
	if (err) {
		err = errno;
		perror("ioctl LOOP_SET_CAPACITY");
	}
	goto out;

 err:
	usage(out);
 out:
	return err;
}

Signed-off-by: J. R. Okajima <hooanon05@yahoo.co.jp>
Signed-off-by: Tomas Matejicek <tomas@slax.org>
Cc: <util-linux-ng@vger.kernel.org>
Cc: Karel Zak <kzak@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Akinobu Mita <akinobu.mita@gmail.com>
Cc: <linux-api@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/loop.c | 30 ++++++++++++++++++++++++++++++
 include/linux/loop.h |  1 +
 2 files changed, 31 insertions(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 2621ed2ce6d..40b17d3b55a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1192,6 +1192,30 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
 	return err;
 }
 
+static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev)
+{
+	int err;
+	sector_t sec;
+	loff_t sz;
+
+	err = -ENXIO;
+	if (unlikely(lo->lo_state != Lo_bound))
+		goto out;
+	err = figure_loop_size(lo);
+	if (unlikely(err))
+		goto out;
+	sec = get_capacity(lo->lo_disk);
+	/* the width of sector_t may be narrow for bit-shift */
+	sz = sec;
+	sz <<= 9;
+	mutex_lock(&bdev->bd_mutex);
+	bd_set_size(bdev, sz);
+	mutex_unlock(&bdev->bd_mutex);
+
+ out:
+	return err;
+}
+
 static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	unsigned int cmd, unsigned long arg)
 {
@@ -1224,6 +1248,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	case LOOP_GET_STATUS64:
 		err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
 		break;
+	case LOOP_SET_CAPACITY:
+		err = -EPERM;
+		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+			err = loop_set_capacity(lo, bdev);
+		break;
 	default:
 		err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
 	}
@@ -1371,6 +1400,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 			lo, (struct compat_loop_info __user *) arg);
 		mutex_unlock(&lo->lo_ctl_mutex);
 		break;
+	case LOOP_SET_CAPACITY:
 	case LOOP_CLR_FD:
 	case LOOP_GET_STATUS64:
 	case LOOP_SET_STATUS64:
diff --git a/include/linux/loop.h b/include/linux/loop.h
index 6ffd6db5bb0..40725447f5e 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -160,5 +160,6 @@ int loop_unregister_transfer(int number);
 #define LOOP_SET_STATUS64	0x4C04
 #define LOOP_GET_STATUS64	0x4C05
 #define LOOP_CHANGE_FD		0x4C06
+#define LOOP_SET_CAPACITY	0x4C07
 
 #endif
-- 
cgit v1.2.3


From 55a63998b8967615a15e2211ba0ff3a84a565824 Mon Sep 17 00:00:00 2001
From: Wolfram Strepp <wstrepp@gmx.de>
Date: Tue, 31 Mar 2009 15:23:45 -0700
Subject: lib/rbtree.c: optimize rb_erase()

Tfour 4 redundant if-conditions in function __rb_erase_color() in
lib/rbtree.c are removed.

In pseudo-source-code, the structure of the code is as follows:

if ((!A || B) && (!C || D)) {
	.
	.
	.
} else {
	if (!C || D) {//if this is true, it implies: (A == true) && (B == false)
		if (A) {//hence this always evaluates to 'true'...
			.
		}
		.
		//at this point, C always becomes true, because of:
		__rb_rotate_right/left();
		//and:
		other = parent->rb_right/left;
	}
	.
	.
	if (C) {//...and this too !
		.
	}
}

Signed-off-by: Wolfram Strepp <wstrepp@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/rbtree.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/lib/rbtree.c b/lib/rbtree.c
index 9956b99649f..f653659e0bc 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -163,17 +163,14 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
 			{
 				if (!other->rb_right || rb_is_black(other->rb_right))
 				{
-					struct rb_node *o_left;
-					if ((o_left = other->rb_left))
-						rb_set_black(o_left);
+					rb_set_black(other->rb_left);
 					rb_set_red(other);
 					__rb_rotate_right(other, root);
 					other = parent->rb_right;
 				}
 				rb_set_color(other, rb_color(parent));
 				rb_set_black(parent);
-				if (other->rb_right)
-					rb_set_black(other->rb_right);
+				rb_set_black(other->rb_right);
 				__rb_rotate_left(parent, root);
 				node = root->rb_node;
 				break;
@@ -200,17 +197,14 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
 			{
 				if (!other->rb_left || rb_is_black(other->rb_left))
 				{
-					register struct rb_node *o_right;
-					if ((o_right = other->rb_right))
-						rb_set_black(o_right);
+					rb_set_black(other->rb_right);
 					rb_set_red(other);
 					__rb_rotate_left(other, root);
 					other = parent->rb_left;
 				}
 				rb_set_color(other, rb_color(parent));
 				rb_set_black(parent);
-				if (other->rb_left)
-					rb_set_black(other->rb_left);
+				rb_set_black(other->rb_left);
 				__rb_rotate_right(parent, root);
 				node = root->rb_node;
 				break;
-- 
cgit v1.2.3


From c2d7543851849a6923680cdd7e1047ed1a84a1c5 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 31 Mar 2009 15:23:46 -0700
Subject: filesystem freeze: allow SysRq emergency thaw to thaw frozen
 filesystems

Now that the filesystem freeze operation has been elevated to the VFS, and
is just an ioctl away, some sort of safety net for unintentionally frozen
root filesystems may be in order.

The timeout thaw originally proposed did not get merged, but perhaps
something like this would be useful in emergencies.

For example, freeze /path/to/mountpoint may freeze your root filesystem if
you forgot that you had that unmounted.

I chose 'j' as the last remaining character other than 'h' which is sort
of reserved for help (because help is generated on any unknown character).

I've tested this on a non-root fs with multiple (nested) freezers, as well
as on a system rendered unresponsive due to a frozen root fs.

[randy.dunlap@oracle.com: emergency thaw only if CONFIG_BLOCK enabled]
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Cc: Takashi Sato <t-sato@yk.jp.nec.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysrq.txt |  5 +++++
 drivers/char/sysrq.c    | 19 ++++++++++++++++++-
 fs/buffer.c             | 33 +++++++++++++++++++++++++++++++++
 include/linux/fs.h      |  1 +
 4 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index 9e592c718af..afa2946892d 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -81,6 +81,8 @@ On all -  write a character to /proc/sysrq-trigger.  e.g.:
 
 'i'     - Send a SIGKILL to all processes, except for init.
 
+'j'     - Forcibly "Just thaw it" - filesystems frozen by the FIFREEZE ioctl.
+
 'k'     - Secure Access Key (SAK) Kills all programs on the current virtual
           console. NOTE: See important comments below in SAK section.
 
@@ -160,6 +162,9 @@ t'E'rm and k'I'll are useful if you have some sort of runaway process you
 are unable to kill any other way, especially if it's spawning other
 processes.
 
+"'J'ust thaw it" is useful if your system becomes unresponsive due to a frozen
+(probably root) filesystem via the FIFREEZE ioctl.
+
 *  Sometimes SysRq seems to get 'stuck' after using it, what can I do?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 That happens to me, also. I've found that tapping shift, alt, and control
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 33a9351c896..5afe7316c72 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -346,6 +346,19 @@ static struct sysrq_key_op sysrq_moom_op = {
 	.enable_mask	= SYSRQ_ENABLE_SIGNAL,
 };
 
+#ifdef CONFIG_BLOCK
+static void sysrq_handle_thaw(int key, struct tty_struct *tty)
+{
+	emergency_thaw_all();
+}
+static struct sysrq_key_op sysrq_thaw_op = {
+	.handler	= sysrq_handle_thaw,
+	.help_msg	= "thaw-filesystems(J)",
+	.action_msg	= "Emergency Thaw of all frozen filesystems",
+	.enable_mask	= SYSRQ_ENABLE_SIGNAL,
+};
+#endif
+
 static void sysrq_handle_kill(int key, struct tty_struct *tty)
 {
 	send_sig_all(SIGKILL);
@@ -396,9 +409,13 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
 	&sysrq_moom_op,			/* f */
 	/* g: May be registered by ppc for kgdb */
 	NULL,				/* g */
-	NULL,				/* h */
+	NULL,				/* h - reserved for help */
 	&sysrq_kill_op,			/* i */
+#ifdef CONFIG_BLOCK
+	&sysrq_thaw_op,			/* j */
+#else
 	NULL,				/* j */
+#endif
 	&sysrq_SAK_op,			/* k */
 #ifdef CONFIG_SMP
 	&sysrq_showallcpus_op,		/* l */
diff --git a/fs/buffer.c b/fs/buffer.c
index c77b848c3d4..f5f8b15a6e4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -547,6 +547,39 @@ repeat:
 	return err;
 }
 
+void do_thaw_all(unsigned long unused)
+{
+	struct super_block *sb;
+	char b[BDEVNAME_SIZE];
+
+	spin_lock(&sb_lock);
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+			printk(KERN_WARNING "Emergency Thaw on %s\n",
+			       bdevname(sb->s_bdev, b));
+		up_read(&sb->s_umount);
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+	printk(KERN_WARNING "Emergency Thaw complete\n");
+}
+
+/**
+ * emergency_thaw_all -- forcibly thaw every frozen filesystem
+ *
+ * Used for emergency unfreeze of all filesystems via SysRq
+ */
+void emergency_thaw_all(void)
+{
+	pdflush_operation(do_thaw_all, 0);
+}
+
 /**
  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
  * @mapping: the mapping which wants those buffers written
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 87e7bfc5ebd..61211ad823f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1878,6 +1878,7 @@ extern struct block_device *open_by_devnum(dev_t, fmode_t);
 extern void invalidate_bdev(struct block_device *);
 extern int sync_blockdev(struct block_device *bdev);
 extern struct super_block *freeze_bdev(struct block_device *);
+extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
 extern int fsync_super(struct super_block *);
-- 
cgit v1.2.3


From fcd5e16286024360340eda61da4e2f5ec152b087 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 31 Mar 2009 15:23:47 -0700
Subject: remove unused include/asm-generic/dma-mapping.h

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/dma-mapping.h | 308 --------------------------------------
 1 file changed, 308 deletions(-)
 delete mode 100644 include/asm-generic/dma-mapping.h

diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
deleted file mode 100644
index 189486c3f92..00000000000
--- a/include/asm-generic/dma-mapping.h
+++ /dev/null
@@ -1,308 +0,0 @@
-/* Copyright (C) 2002 by James.Bottomley@HansenPartnership.com 
- *
- * Implements the generic device dma API via the existing pci_ one
- * for unconverted architectures
- */
-
-#ifndef _ASM_GENERIC_DMA_MAPPING_H
-#define _ASM_GENERIC_DMA_MAPPING_H
-
-
-#ifdef CONFIG_PCI
-
-/* we implement the API below in terms of the existing PCI one,
- * so include it */
-#include <linux/pci.h>
-/* need struct page definitions */
-#include <linux/mm.h>
-
-static inline int
-dma_supported(struct device *dev, u64 mask)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	return pci_dma_supported(to_pci_dev(dev), mask);
-}
-
-static inline int
-dma_set_mask(struct device *dev, u64 dma_mask)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
-}
-
-static inline void *
-dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		   gfp_t flag)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	return pci_alloc_consistent(to_pci_dev(dev), size, dma_handle);
-}
-
-static inline void
-dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
-		    dma_addr_t dma_handle)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	pci_free_consistent(to_pci_dev(dev), size, cpu_addr, dma_handle);
-}
-
-static inline dma_addr_t
-dma_map_single(struct device *dev, void *cpu_addr, size_t size,
-	       enum dma_data_direction direction)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	return pci_map_single(to_pci_dev(dev), cpu_addr, size, (int)direction);
-}
-
-static inline void
-dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
-		 enum dma_data_direction direction)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	pci_unmap_single(to_pci_dev(dev), dma_addr, size, (int)direction);
-}
-
-static inline dma_addr_t
-dma_map_page(struct device *dev, struct page *page,
-	     unsigned long offset, size_t size,
-	     enum dma_data_direction direction)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	return pci_map_page(to_pci_dev(dev), page, offset, size, (int)direction);
-}
-
-static inline void
-dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
-	       enum dma_data_direction direction)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	pci_unmap_page(to_pci_dev(dev), dma_address, size, (int)direction);
-}
-
-static inline int
-dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
-	   enum dma_data_direction direction)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	return pci_map_sg(to_pci_dev(dev), sg, nents, (int)direction);
-}
-
-static inline void
-dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
-	     enum dma_data_direction direction)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	pci_unmap_sg(to_pci_dev(dev), sg, nhwentries, (int)direction);
-}
-
-static inline void
-dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
-			enum dma_data_direction direction)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	pci_dma_sync_single_for_cpu(to_pci_dev(dev), dma_handle,
-				    size, (int)direction);
-}
-
-static inline void
-dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
-			   enum dma_data_direction direction)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	pci_dma_sync_single_for_device(to_pci_dev(dev), dma_handle,
-				       size, (int)direction);
-}
-
-static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
-		    enum dma_data_direction direction)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	pci_dma_sync_sg_for_cpu(to_pci_dev(dev), sg, nelems, (int)direction);
-}
-
-static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
-		       enum dma_data_direction direction)
-{
-	BUG_ON(dev->bus != &pci_bus_type);
-
-	pci_dma_sync_sg_for_device(to_pci_dev(dev), sg, nelems, (int)direction);
-}
-
-static inline int
-dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-	return pci_dma_mapping_error(to_pci_dev(dev), dma_addr);
-}
-
-
-#else
-
-static inline int
-dma_supported(struct device *dev, u64 mask)
-{
-	return 0;
-}
-
-static inline int
-dma_set_mask(struct device *dev, u64 dma_mask)
-{
-	BUG();
-	return 0;
-}
-
-static inline void *
-dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		   gfp_t flag)
-{
-	BUG();
-	return NULL;
-}
-
-static inline void
-dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
-		    dma_addr_t dma_handle)
-{
-	BUG();
-}
-
-static inline dma_addr_t
-dma_map_single(struct device *dev, void *cpu_addr, size_t size,
-	       enum dma_data_direction direction)
-{
-	BUG();
-	return 0;
-}
-
-static inline void
-dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
-		 enum dma_data_direction direction)
-{
-	BUG();
-}
-
-static inline dma_addr_t
-dma_map_page(struct device *dev, struct page *page,
-	     unsigned long offset, size_t size,
-	     enum dma_data_direction direction)
-{
-	BUG();
-	return 0;
-}
-
-static inline void
-dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
-	       enum dma_data_direction direction)
-{
-	BUG();
-}
-
-static inline int
-dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
-	   enum dma_data_direction direction)
-{
-	BUG();
-	return 0;
-}
-
-static inline void
-dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
-	     enum dma_data_direction direction)
-{
-	BUG();
-}
-
-static inline void
-dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
-			enum dma_data_direction direction)
-{
-	BUG();
-}
-
-static inline void
-dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
-			   enum dma_data_direction direction)
-{
-	BUG();
-}
-
-static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
-		    enum dma_data_direction direction)
-{
-	BUG();
-}
-
-static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
-		       enum dma_data_direction direction)
-{
-	BUG();
-}
-
-static inline int
-dma_error(dma_addr_t dma_addr)
-{
-	return 0;
-}
-
-#endif
-
-/* Now for the API extensions over the pci_ one */
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-#define dma_is_consistent(d, h)	(1)
-
-static inline int
-dma_get_cache_alignment(void)
-{
-	/* no easy way to get cache size on all processors, so return
-	 * the maximum possible, to be safe */
-	return (1 << INTERNODE_CACHE_SHIFT);
-}
-
-static inline void
-dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
-			      unsigned long offset, size_t size,
-			      enum dma_data_direction direction)
-{
-	/* just sync everything, that's all the pci API can do */
-	dma_sync_single_for_cpu(dev, dma_handle, offset+size, direction);
-}
-
-static inline void
-dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
-				 unsigned long offset, size_t size,
-				 enum dma_data_direction direction)
-{
-	/* just sync everything, that's all the pci API can do */
-	dma_sync_single_for_device(dev, dma_handle, offset+size, direction);
-}
-
-static inline void
-dma_cache_sync(struct device *dev, void *vaddr, size_t size,
-	       enum dma_data_direction direction)
-{
-	/* could define this in terms of the dma_cache ... operations,
-	 * but if you get this on a platform, you should convert the platform
-	 * to using the generic device DMA API */
-	BUG();
-}
-
-#endif
-
-- 
cgit v1.2.3


From c0aa24ba8962bd1db98938fb2f3773f870896036 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hsweeten@visionengravers.com>
Date: Tue, 31 Mar 2009 15:23:48 -0700
Subject: auxdisplay: remove PARPORT dependency

Remove PARPORT dependency for Auxiliary Display support.

This is not needed since the dependency for the KS0108 driver is
PARPORT_PC.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Cc: Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/auxdisplay/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig
index 14b9d5f4c20..c07e725ea93 100644
--- a/drivers/auxdisplay/Kconfig
+++ b/drivers/auxdisplay/Kconfig
@@ -6,7 +6,6 @@
 #
 
 menuconfig AUXDISPLAY
-	depends on PARPORT
 	bool "Auxiliary Display support"
 	---help---
 	  Say Y here to get to see options for auxiliary display drivers.
@@ -14,7 +13,7 @@ menuconfig AUXDISPLAY
 
 	  If you say N, all options in this submenu will be skipped and disabled.
 
-if AUXDISPLAY && PARPORT
+if AUXDISPLAY
 
 config KS0108
 	tristate "KS0108 LCD Controller"
-- 
cgit v1.2.3


From e0f7ad5f4f5056b20914a35d31abcf29036ca364 Mon Sep 17 00:00:00 2001
From: Michael Buesch <mb@bu3sch.de>
Date: Tue, 31 Mar 2009 15:23:49 -0700
Subject: bcm47xx: fix GPIO API return codes

The GPIO API is supposed to return 0 or a negative error code,
but the SSB GPIO functions return the bitmask of the GPIO register.
Fix this by ignoring the bitmask and always returning 0. The SSB GPIO functions can't fail.

Signed-off-by: Michael Buesch <mb@bu3sch.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/include/asm/mach-bcm47xx/gpio.h | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/mips/include/asm/mach-bcm47xx/gpio.h b/arch/mips/include/asm/mach-bcm47xx/gpio.h
index d8ff4cd89ab..1784fde2e28 100644
--- a/arch/mips/include/asm/mach-bcm47xx/gpio.h
+++ b/arch/mips/include/asm/mach-bcm47xx/gpio.h
@@ -31,24 +31,28 @@ static inline void gpio_set_value(unsigned gpio, int value)
 
 static inline int gpio_direction_input(unsigned gpio)
 {
-	return ssb_gpio_outen(&ssb_bcm47xx, 1 << gpio, 0);
+	ssb_gpio_outen(&ssb_bcm47xx, 1 << gpio, 0);
+	return 0;
 }
 
 static inline int gpio_direction_output(unsigned gpio, int value)
 {
-	return ssb_gpio_outen(&ssb_bcm47xx, 1 << gpio, 1 << gpio);
+	ssb_gpio_outen(&ssb_bcm47xx, 1 << gpio, 1 << gpio);
+	return 0;
 }
 
-static int gpio_intmask(unsigned gpio, int value)
+static inline int gpio_intmask(unsigned gpio, int value)
 {
-	return ssb_gpio_intmask(&ssb_bcm47xx, 1 << gpio,
-				value ? 1 << gpio : 0);
+	ssb_gpio_intmask(&ssb_bcm47xx, 1 << gpio,
+			 value ? 1 << gpio : 0);
+	return 0;
 }
 
-static int gpio_polarity(unsigned gpio, int value)
+static inline int gpio_polarity(unsigned gpio, int value)
 {
-	return ssb_gpio_polarity(&ssb_bcm47xx, 1 << gpio,
-				 value ? 1 << gpio : 0);
+	ssb_gpio_polarity(&ssb_bcm47xx, 1 << gpio,
+			  value ? 1 << gpio : 0);
+	return 0;
 }
 
 
-- 
cgit v1.2.3


From acdd052a285a7b4cc7da4fa7d34ef9fd0a67b2f8 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Tue, 31 Mar 2009 15:23:50 -0700
Subject: init/main.c: fix sparse warnings: context imbalance

Impact: Attribute function 'init_post' with __releases(...).

Fix these sparse warnings:
  init/main.c:805:21: warning: context imbalance in 'init_post' - unexpected unlock
  init/main.c:899:9: warning: context imbalance in 'kernel_init' - wrong count at exit

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/init/main.c b/init/main.c
index d6b388fbffa..07c8658ffca 100644
--- a/init/main.c
+++ b/init/main.c
@@ -793,6 +793,7 @@ static void run_init_process(char *init_filename)
  * makes it inline to init() and it becomes part of init.text section
  */
 static noinline int init_post(void)
+	__releases(kernel_lock)
 {
 	/* need to finish all async __init code before freeing the memory */
 	async_synchronize_full();
-- 
cgit v1.2.3


From 311d07611e8b354cc1ee6546e4c574c01111adc8 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 31 Mar 2009 15:23:51 -0700
Subject: introduce pr_cont() macro

We cover all log-levels by pr_...  macros except KERN_CONT one.  Add it
for convenience.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f81d80f47dc..e720b0da775 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -353,6 +353,8 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
         printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_info(fmt, ...) \
         printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_cont(fmt, ...) \
+	printk(KERN_CONT fmt, ##__VA_ARGS__)
 
 /* If you are writing a driver, please use dev_dbg instead */
 #if defined(DEBUG)
-- 
cgit v1.2.3


From 63cd885426872254e82dac2d9e13ea4f720c21dc Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 31 Mar 2009 15:23:52 -0700
Subject: ntfs: remove private wrapper of endian helpers

The base versions handle constant folding now and are shorter than these
private wrappers, use them directly.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/dir.c     |   4 +-
 fs/ntfs/inode.c   |   3 +-
 fs/ntfs/layout.h  | 329 ++++++++++++++++++++++++++----------------------------
 fs/ntfs/logfile.h |   6 +-
 fs/ntfs/mft.c     |   2 +-
 fs/ntfs/super.c   |  50 ++++-----
 fs/ntfs/usnjrnl.h |  48 ++++----
 7 files changed, 215 insertions(+), 227 deletions(-)

diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 34314b33dbd..5a9e34475e3 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -32,8 +32,8 @@
 /**
  * The little endian Unicode string $I30 as a global constant.
  */
-ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
-		const_cpu_to_le16('3'),	const_cpu_to_le16('0'), 0 };
+ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
+		cpu_to_le16('3'),	cpu_to_le16('0'), 0 };
 
 /**
  * ntfs_lookup_inode_by_name - find an inode in a directory given its name
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 86bef156cf0..82c5085559c 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1975,8 +1975,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 				goto em_put_err_out;
 			next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
 					le16_to_cpu(al_entry->length));
-			if (le32_to_cpu(al_entry->type) >
-					const_le32_to_cpu(AT_DATA))
+			if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
 				goto em_put_err_out;
 			if (AT_DATA != al_entry->type)
 				continue;
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 1e383328ece..50931b1ce4b 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -31,19 +31,8 @@
 
 #include "types.h"
 
-/*
- * Constant endianness conversion defines.
- */
-#define const_le16_to_cpu(x)	__constant_le16_to_cpu(x)
-#define const_le32_to_cpu(x)	__constant_le32_to_cpu(x)
-#define const_le64_to_cpu(x)	__constant_le64_to_cpu(x)
-
-#define const_cpu_to_le16(x)	__constant_cpu_to_le16(x)
-#define const_cpu_to_le32(x)	__constant_cpu_to_le32(x)
-#define const_cpu_to_le64(x)	__constant_cpu_to_le64(x)
-
 /* The NTFS oem_id "NTFS    " */
-#define magicNTFS	const_cpu_to_le64(0x202020205346544eULL)
+#define magicNTFS	cpu_to_le64(0x202020205346544eULL)
 
 /*
  * Location of bootsector on partition:
@@ -114,25 +103,25 @@ typedef struct {
  */
 enum {
 	/* Found in $MFT/$DATA. */
-	magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */
-	magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */
-	magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
+	magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
+	magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
+	magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
 
 	/* Found in $LogFile/$DATA. */
-	magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */
-	magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */
+	magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
+	magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
 
 	/* Found in $LogFile/$DATA.  (May be found in $MFT/$DATA, also?) */
-	magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
+	magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
 
 	/* Found in all ntfs record containing records. */
-	magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector
+	magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
 						       transfer was detected. */
 	/*
 	 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
 	 * thus not initialized.  Page must be initialized before using it.
 	 */
-	magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */
+	magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
 };
 
 typedef le32 NTFS_RECORD_TYPE;
@@ -258,8 +247,8 @@ typedef enum {
  * information about the mft record in which they are present.
  */
 enum {
-	MFT_RECORD_IN_USE	= const_cpu_to_le16(0x0001),
-	MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002),
+	MFT_RECORD_IN_USE	= cpu_to_le16(0x0001),
+	MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
 } __attribute__ ((__packed__));
 
 typedef le16 MFT_RECORD_FLAGS;
@@ -309,7 +298,7 @@ typedef le16 MFT_RECORD_FLAGS;
  * Note: The _LE versions will return a CPU endian formatted value!
  */
 #define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
-#define MFT_REF_MASK_LE const_cpu_to_le64(MFT_REF_MASK_CPU)
+#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
 
 typedef u64 MFT_REF;
 typedef le64 leMFT_REF;
@@ -477,25 +466,25 @@ typedef struct {
  * a revealing choice of symbol I do not know what is... (-;
  */
 enum {
-	AT_UNUSED			= const_cpu_to_le32(         0),
-	AT_STANDARD_INFORMATION		= const_cpu_to_le32(      0x10),
-	AT_ATTRIBUTE_LIST		= const_cpu_to_le32(      0x20),
-	AT_FILE_NAME			= const_cpu_to_le32(      0x30),
-	AT_OBJECT_ID			= const_cpu_to_le32(      0x40),
-	AT_SECURITY_DESCRIPTOR		= const_cpu_to_le32(      0x50),
-	AT_VOLUME_NAME			= const_cpu_to_le32(      0x60),
-	AT_VOLUME_INFORMATION		= const_cpu_to_le32(      0x70),
-	AT_DATA				= const_cpu_to_le32(      0x80),
-	AT_INDEX_ROOT			= const_cpu_to_le32(      0x90),
-	AT_INDEX_ALLOCATION		= const_cpu_to_le32(      0xa0),
-	AT_BITMAP			= const_cpu_to_le32(      0xb0),
-	AT_REPARSE_POINT		= const_cpu_to_le32(      0xc0),
-	AT_EA_INFORMATION		= const_cpu_to_le32(      0xd0),
-	AT_EA				= const_cpu_to_le32(      0xe0),
-	AT_PROPERTY_SET			= const_cpu_to_le32(      0xf0),
-	AT_LOGGED_UTILITY_STREAM	= const_cpu_to_le32(     0x100),
-	AT_FIRST_USER_DEFINED_ATTRIBUTE	= const_cpu_to_le32(    0x1000),
-	AT_END				= const_cpu_to_le32(0xffffffff)
+	AT_UNUSED			= cpu_to_le32(         0),
+	AT_STANDARD_INFORMATION		= cpu_to_le32(      0x10),
+	AT_ATTRIBUTE_LIST		= cpu_to_le32(      0x20),
+	AT_FILE_NAME			= cpu_to_le32(      0x30),
+	AT_OBJECT_ID			= cpu_to_le32(      0x40),
+	AT_SECURITY_DESCRIPTOR		= cpu_to_le32(      0x50),
+	AT_VOLUME_NAME			= cpu_to_le32(      0x60),
+	AT_VOLUME_INFORMATION		= cpu_to_le32(      0x70),
+	AT_DATA				= cpu_to_le32(      0x80),
+	AT_INDEX_ROOT			= cpu_to_le32(      0x90),
+	AT_INDEX_ALLOCATION		= cpu_to_le32(      0xa0),
+	AT_BITMAP			= cpu_to_le32(      0xb0),
+	AT_REPARSE_POINT		= cpu_to_le32(      0xc0),
+	AT_EA_INFORMATION		= cpu_to_le32(      0xd0),
+	AT_EA				= cpu_to_le32(      0xe0),
+	AT_PROPERTY_SET			= cpu_to_le32(      0xf0),
+	AT_LOGGED_UTILITY_STREAM	= cpu_to_le32(     0x100),
+	AT_FIRST_USER_DEFINED_ATTRIBUTE	= cpu_to_le32(    0x1000),
+	AT_END				= cpu_to_le32(0xffffffff)
 };
 
 typedef le32 ATTR_TYPE;
@@ -539,13 +528,13 @@ typedef le32 ATTR_TYPE;
  *	equal then the second le32 values would be compared, etc.
  */
 enum {
-	COLLATION_BINARY		= const_cpu_to_le32(0x00),
-	COLLATION_FILE_NAME		= const_cpu_to_le32(0x01),
-	COLLATION_UNICODE_STRING	= const_cpu_to_le32(0x02),
-	COLLATION_NTOFS_ULONG		= const_cpu_to_le32(0x10),
-	COLLATION_NTOFS_SID		= const_cpu_to_le32(0x11),
-	COLLATION_NTOFS_SECURITY_HASH	= const_cpu_to_le32(0x12),
-	COLLATION_NTOFS_ULONGS		= const_cpu_to_le32(0x13),
+	COLLATION_BINARY		= cpu_to_le32(0x00),
+	COLLATION_FILE_NAME		= cpu_to_le32(0x01),
+	COLLATION_UNICODE_STRING	= cpu_to_le32(0x02),
+	COLLATION_NTOFS_ULONG		= cpu_to_le32(0x10),
+	COLLATION_NTOFS_SID		= cpu_to_le32(0x11),
+	COLLATION_NTOFS_SECURITY_HASH	= cpu_to_le32(0x12),
+	COLLATION_NTOFS_ULONGS		= cpu_to_le32(0x13),
 };
 
 typedef le32 COLLATION_RULE;
@@ -559,25 +548,25 @@ typedef le32 COLLATION_RULE;
  * NT4.
  */
 enum {
-	ATTR_DEF_INDEXABLE	= const_cpu_to_le32(0x02), /* Attribute can be
+	ATTR_DEF_INDEXABLE	= cpu_to_le32(0x02), /* Attribute can be
 					indexed. */
-	ATTR_DEF_MULTIPLE	= const_cpu_to_le32(0x04), /* Attribute type
+	ATTR_DEF_MULTIPLE	= cpu_to_le32(0x04), /* Attribute type
 					can be present multiple times in the
 					mft records of an inode. */
-	ATTR_DEF_NOT_ZERO	= const_cpu_to_le32(0x08), /* Attribute value
+	ATTR_DEF_NOT_ZERO	= cpu_to_le32(0x08), /* Attribute value
 					must contain at least one non-zero
 					byte. */
-	ATTR_DEF_INDEXED_UNIQUE	= const_cpu_to_le32(0x10), /* Attribute must be
+	ATTR_DEF_INDEXED_UNIQUE	= cpu_to_le32(0x10), /* Attribute must be
 					indexed and the attribute value must be
 					unique for the attribute type in all of
 					the mft records of an inode. */
-	ATTR_DEF_NAMED_UNIQUE	= const_cpu_to_le32(0x20), /* Attribute must be
+	ATTR_DEF_NAMED_UNIQUE	= cpu_to_le32(0x20), /* Attribute must be
 					named and the name must be unique for
 					the attribute type in all of the mft
 					records of an inode. */
-	ATTR_DEF_RESIDENT	= const_cpu_to_le32(0x40), /* Attribute must be
+	ATTR_DEF_RESIDENT	= cpu_to_le32(0x40), /* Attribute must be
 					resident. */
-	ATTR_DEF_ALWAYS_LOG	= const_cpu_to_le32(0x80), /* Always log
+	ATTR_DEF_ALWAYS_LOG	= cpu_to_le32(0x80), /* Always log
 					modifications to this attribute,
 					regardless of whether it is resident or
 					non-resident.  Without this, only log
@@ -614,12 +603,12 @@ typedef struct {
  * Attribute flags (16-bit).
  */
 enum {
-	ATTR_IS_COMPRESSED    = const_cpu_to_le16(0x0001),
-	ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method
+	ATTR_IS_COMPRESSED    = cpu_to_le16(0x0001),
+	ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
 							      mask.  Also, first
 							      illegal value. */
-	ATTR_IS_ENCRYPTED     = const_cpu_to_le16(0x4000),
-	ATTR_IS_SPARSE	      = const_cpu_to_le16(0x8000),
+	ATTR_IS_ENCRYPTED     = cpu_to_le16(0x4000),
+	ATTR_IS_SPARSE	      = cpu_to_le16(0x8000),
 } __attribute__ ((__packed__));
 
 typedef le16 ATTR_FLAGS;
@@ -811,32 +800,32 @@ typedef ATTR_RECORD ATTR_REC;
  * flags appear in all of the above.
  */
 enum {
-	FILE_ATTR_READONLY		= const_cpu_to_le32(0x00000001),
-	FILE_ATTR_HIDDEN		= const_cpu_to_le32(0x00000002),
-	FILE_ATTR_SYSTEM		= const_cpu_to_le32(0x00000004),
-	/* Old DOS volid. Unused in NT.	= const_cpu_to_le32(0x00000008), */
+	FILE_ATTR_READONLY		= cpu_to_le32(0x00000001),
+	FILE_ATTR_HIDDEN		= cpu_to_le32(0x00000002),
+	FILE_ATTR_SYSTEM		= cpu_to_le32(0x00000004),
+	/* Old DOS volid. Unused in NT.	= cpu_to_le32(0x00000008), */
 
-	FILE_ATTR_DIRECTORY		= const_cpu_to_le32(0x00000010),
+	FILE_ATTR_DIRECTORY		= cpu_to_le32(0x00000010),
 	/* Note, FILE_ATTR_DIRECTORY is not considered valid in NT.  It is
 	   reserved for the DOS SUBDIRECTORY flag. */
-	FILE_ATTR_ARCHIVE		= const_cpu_to_le32(0x00000020),
-	FILE_ATTR_DEVICE		= const_cpu_to_le32(0x00000040),
-	FILE_ATTR_NORMAL		= const_cpu_to_le32(0x00000080),
+	FILE_ATTR_ARCHIVE		= cpu_to_le32(0x00000020),
+	FILE_ATTR_DEVICE		= cpu_to_le32(0x00000040),
+	FILE_ATTR_NORMAL		= cpu_to_le32(0x00000080),
 
-	FILE_ATTR_TEMPORARY		= const_cpu_to_le32(0x00000100),
-	FILE_ATTR_SPARSE_FILE		= const_cpu_to_le32(0x00000200),
-	FILE_ATTR_REPARSE_POINT		= const_cpu_to_le32(0x00000400),
-	FILE_ATTR_COMPRESSED		= const_cpu_to_le32(0x00000800),
+	FILE_ATTR_TEMPORARY		= cpu_to_le32(0x00000100),
+	FILE_ATTR_SPARSE_FILE		= cpu_to_le32(0x00000200),
+	FILE_ATTR_REPARSE_POINT		= cpu_to_le32(0x00000400),
+	FILE_ATTR_COMPRESSED		= cpu_to_le32(0x00000800),
 
-	FILE_ATTR_OFFLINE		= const_cpu_to_le32(0x00001000),
-	FILE_ATTR_NOT_CONTENT_INDEXED	= const_cpu_to_le32(0x00002000),
-	FILE_ATTR_ENCRYPTED		= const_cpu_to_le32(0x00004000),
+	FILE_ATTR_OFFLINE		= cpu_to_le32(0x00001000),
+	FILE_ATTR_NOT_CONTENT_INDEXED	= cpu_to_le32(0x00002000),
+	FILE_ATTR_ENCRYPTED		= cpu_to_le32(0x00004000),
 
-	FILE_ATTR_VALID_FLAGS		= const_cpu_to_le32(0x00007fb7),
+	FILE_ATTR_VALID_FLAGS		= cpu_to_le32(0x00007fb7),
 	/* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
 	   FILE_ATTR_DEVICE and preserves everything else.  This mask is used
 	   to obtain all flags that are valid for reading. */
-	FILE_ATTR_VALID_SET_FLAGS	= const_cpu_to_le32(0x000031a7),
+	FILE_ATTR_VALID_SET_FLAGS	= cpu_to_le32(0x000031a7),
 	/* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
 	   F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
 	   F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest.  This mask
@@ -846,11 +835,11 @@ enum {
 	 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
 	 * attribute of an mft record.
 	 */
-	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT	= const_cpu_to_le32(0x10000000),
+	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT	= cpu_to_le32(0x10000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this is a directory or not, i.e. whether it has
 	   an index root attribute or not. */
-	FILE_ATTR_DUP_VIEW_INDEX_PRESENT	= const_cpu_to_le32(0x20000000),
+	FILE_ATTR_DUP_VIEW_INDEX_PRESENT	= cpu_to_le32(0x20000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this file has a view index present (eg. object id
 	   index, quota index, one of the security indexes or the encrypting
@@ -1446,42 +1435,42 @@ enum {
 	/* Specific rights for files and directories are as follows: */
 
 	/* Right to read data from the file. (FILE) */
-	FILE_READ_DATA			= const_cpu_to_le32(0x00000001),
+	FILE_READ_DATA			= cpu_to_le32(0x00000001),
 	/* Right to list contents of a directory. (DIRECTORY) */
-	FILE_LIST_DIRECTORY		= const_cpu_to_le32(0x00000001),
+	FILE_LIST_DIRECTORY		= cpu_to_le32(0x00000001),
 
 	/* Right to write data to the file. (FILE) */
-	FILE_WRITE_DATA			= const_cpu_to_le32(0x00000002),
+	FILE_WRITE_DATA			= cpu_to_le32(0x00000002),
 	/* Right to create a file in the directory. (DIRECTORY) */
-	FILE_ADD_FILE			= const_cpu_to_le32(0x00000002),
+	FILE_ADD_FILE			= cpu_to_le32(0x00000002),
 
 	/* Right to append data to the file. (FILE) */
-	FILE_APPEND_DATA		= const_cpu_to_le32(0x00000004),
+	FILE_APPEND_DATA		= cpu_to_le32(0x00000004),
 	/* Right to create a subdirectory. (DIRECTORY) */
-	FILE_ADD_SUBDIRECTORY		= const_cpu_to_le32(0x00000004),
+	FILE_ADD_SUBDIRECTORY		= cpu_to_le32(0x00000004),
 
 	/* Right to read extended attributes. (FILE/DIRECTORY) */
-	FILE_READ_EA			= const_cpu_to_le32(0x00000008),
+	FILE_READ_EA			= cpu_to_le32(0x00000008),
 
 	/* Right to write extended attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_EA			= const_cpu_to_le32(0x00000010),
+	FILE_WRITE_EA			= cpu_to_le32(0x00000010),
 
 	/* Right to execute a file. (FILE) */
-	FILE_EXECUTE			= const_cpu_to_le32(0x00000020),
+	FILE_EXECUTE			= cpu_to_le32(0x00000020),
 	/* Right to traverse the directory. (DIRECTORY) */
-	FILE_TRAVERSE			= const_cpu_to_le32(0x00000020),
+	FILE_TRAVERSE			= cpu_to_le32(0x00000020),
 
 	/*
 	 * Right to delete a directory and all the files it contains (its
 	 * children), even if the files are read-only. (DIRECTORY)
 	 */
-	FILE_DELETE_CHILD		= const_cpu_to_le32(0x00000040),
+	FILE_DELETE_CHILD		= cpu_to_le32(0x00000040),
 
 	/* Right to read file attributes. (FILE/DIRECTORY) */
-	FILE_READ_ATTRIBUTES		= const_cpu_to_le32(0x00000080),
+	FILE_READ_ATTRIBUTES		= cpu_to_le32(0x00000080),
 
 	/* Right to change file attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_ATTRIBUTES		= const_cpu_to_le32(0x00000100),
+	FILE_WRITE_ATTRIBUTES		= cpu_to_le32(0x00000100),
 
 	/*
 	 * The standard rights (bits 16 to 23).  These are independent of the
@@ -1489,27 +1478,27 @@ enum {
 	 */
 
 	/* Right to delete the object. */
-	DELETE				= const_cpu_to_le32(0x00010000),
+	DELETE				= cpu_to_le32(0x00010000),
 
 	/*
 	 * Right to read the information in the object's security descriptor,
 	 * not including the information in the SACL, i.e. right to read the
 	 * security descriptor and owner.
 	 */
-	READ_CONTROL			= const_cpu_to_le32(0x00020000),
+	READ_CONTROL			= cpu_to_le32(0x00020000),
 
 	/* Right to modify the DACL in the object's security descriptor. */
-	WRITE_DAC			= const_cpu_to_le32(0x00040000),
+	WRITE_DAC			= cpu_to_le32(0x00040000),
 
 	/* Right to change the owner in the object's security descriptor. */
-	WRITE_OWNER			= const_cpu_to_le32(0x00080000),
+	WRITE_OWNER			= cpu_to_le32(0x00080000),
 
 	/*
 	 * Right to use the object for synchronization.  Enables a process to
 	 * wait until the object is in the signalled state.  Some object types
 	 * do not support this access right.
 	 */
-	SYNCHRONIZE			= const_cpu_to_le32(0x00100000),
+	SYNCHRONIZE			= cpu_to_le32(0x00100000),
 
 	/*
 	 * The following STANDARD_RIGHTS_* are combinations of the above for
@@ -1517,25 +1506,25 @@ enum {
 	 */
 
 	/* These are currently defined to READ_CONTROL. */
-	STANDARD_RIGHTS_READ		= const_cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_WRITE		= const_cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_EXECUTE		= const_cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_READ		= cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_WRITE		= cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_EXECUTE		= cpu_to_le32(0x00020000),
 
 	/* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
-	STANDARD_RIGHTS_REQUIRED	= const_cpu_to_le32(0x000f0000),
+	STANDARD_RIGHTS_REQUIRED	= cpu_to_le32(0x000f0000),
 
 	/*
 	 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
 	 * SYNCHRONIZE access.
 	 */
-	STANDARD_RIGHTS_ALL		= const_cpu_to_le32(0x001f0000),
+	STANDARD_RIGHTS_ALL		= cpu_to_le32(0x001f0000),
 
 	/*
 	 * The access system ACL and maximum allowed access types (bits 24 to
 	 * 25, bits 26 to 27 are reserved).
 	 */
-	ACCESS_SYSTEM_SECURITY		= const_cpu_to_le32(0x01000000),
-	MAXIMUM_ALLOWED			= const_cpu_to_le32(0x02000000),
+	ACCESS_SYSTEM_SECURITY		= cpu_to_le32(0x01000000),
+	MAXIMUM_ALLOWED			= cpu_to_le32(0x02000000),
 
 	/*
 	 * The generic rights (bits 28 to 31).  These map onto the standard and
@@ -1543,10 +1532,10 @@ enum {
 	 */
 
 	/* Read, write, and execute access. */
-	GENERIC_ALL			= const_cpu_to_le32(0x10000000),
+	GENERIC_ALL			= cpu_to_le32(0x10000000),
 
 	/* Execute access. */
-	GENERIC_EXECUTE			= const_cpu_to_le32(0x20000000),
+	GENERIC_EXECUTE			= cpu_to_le32(0x20000000),
 
 	/*
 	 * Write access.  For files, this maps onto:
@@ -1555,7 +1544,7 @@ enum {
 	 * For directories, the mapping has the same numerical value.  See
 	 * above for the descriptions of the rights granted.
 	 */
-	GENERIC_WRITE			= const_cpu_to_le32(0x40000000),
+	GENERIC_WRITE			= cpu_to_le32(0x40000000),
 
 	/*
 	 * Read access.  For files, this maps onto:
@@ -1564,7 +1553,7 @@ enum {
 	 * For directories, the mapping has the same numberical value.  See
 	 * above for the descriptions of the rights granted.
 	 */
-	GENERIC_READ			= const_cpu_to_le32(0x80000000),
+	GENERIC_READ			= cpu_to_le32(0x80000000),
 };
 
 typedef le32 ACCESS_MASK;
@@ -1604,8 +1593,8 @@ typedef struct {
  * The object ACE flags (32-bit).
  */
 enum {
-	ACE_OBJECT_TYPE_PRESENT			= const_cpu_to_le32(1),
-	ACE_INHERITED_OBJECT_TYPE_PRESENT	= const_cpu_to_le32(2),
+	ACE_OBJECT_TYPE_PRESENT			= cpu_to_le32(1),
+	ACE_INHERITED_OBJECT_TYPE_PRESENT	= cpu_to_le32(2),
 };
 
 typedef le32 OBJECT_ACE_FLAGS;
@@ -1706,23 +1695,23 @@ typedef enum {
  *	expressed as offsets from the beginning of the security descriptor.
  */
 enum {
-	SE_OWNER_DEFAULTED		= const_cpu_to_le16(0x0001),
-	SE_GROUP_DEFAULTED		= const_cpu_to_le16(0x0002),
-	SE_DACL_PRESENT			= const_cpu_to_le16(0x0004),
-	SE_DACL_DEFAULTED		= const_cpu_to_le16(0x0008),
-
-	SE_SACL_PRESENT			= const_cpu_to_le16(0x0010),
-	SE_SACL_DEFAULTED		= const_cpu_to_le16(0x0020),
-
-	SE_DACL_AUTO_INHERIT_REQ	= const_cpu_to_le16(0x0100),
-	SE_SACL_AUTO_INHERIT_REQ	= const_cpu_to_le16(0x0200),
-	SE_DACL_AUTO_INHERITED		= const_cpu_to_le16(0x0400),
-	SE_SACL_AUTO_INHERITED		= const_cpu_to_le16(0x0800),
-
-	SE_DACL_PROTECTED		= const_cpu_to_le16(0x1000),
-	SE_SACL_PROTECTED		= const_cpu_to_le16(0x2000),
-	SE_RM_CONTROL_VALID		= const_cpu_to_le16(0x4000),
-	SE_SELF_RELATIVE		= const_cpu_to_le16(0x8000)
+	SE_OWNER_DEFAULTED		= cpu_to_le16(0x0001),
+	SE_GROUP_DEFAULTED		= cpu_to_le16(0x0002),
+	SE_DACL_PRESENT			= cpu_to_le16(0x0004),
+	SE_DACL_DEFAULTED		= cpu_to_le16(0x0008),
+
+	SE_SACL_PRESENT			= cpu_to_le16(0x0010),
+	SE_SACL_DEFAULTED		= cpu_to_le16(0x0020),
+
+	SE_DACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0100),
+	SE_SACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0200),
+	SE_DACL_AUTO_INHERITED		= cpu_to_le16(0x0400),
+	SE_SACL_AUTO_INHERITED		= cpu_to_le16(0x0800),
+
+	SE_DACL_PROTECTED		= cpu_to_le16(0x1000),
+	SE_SACL_PROTECTED		= cpu_to_le16(0x2000),
+	SE_RM_CONTROL_VALID		= cpu_to_le16(0x4000),
+	SE_SELF_RELATIVE		= cpu_to_le16(0x8000)
 } __attribute__ ((__packed__));
 
 typedef le16 SECURITY_DESCRIPTOR_CONTROL;
@@ -1910,21 +1899,21 @@ typedef struct {
  * Possible flags for the volume (16-bit).
  */
 enum {
-	VOLUME_IS_DIRTY			= const_cpu_to_le16(0x0001),
-	VOLUME_RESIZE_LOG_FILE		= const_cpu_to_le16(0x0002),
-	VOLUME_UPGRADE_ON_MOUNT		= const_cpu_to_le16(0x0004),
-	VOLUME_MOUNTED_ON_NT4		= const_cpu_to_le16(0x0008),
+	VOLUME_IS_DIRTY			= cpu_to_le16(0x0001),
+	VOLUME_RESIZE_LOG_FILE		= cpu_to_le16(0x0002),
+	VOLUME_UPGRADE_ON_MOUNT		= cpu_to_le16(0x0004),
+	VOLUME_MOUNTED_ON_NT4		= cpu_to_le16(0x0008),
 
-	VOLUME_DELETE_USN_UNDERWAY	= const_cpu_to_le16(0x0010),
-	VOLUME_REPAIR_OBJECT_ID		= const_cpu_to_le16(0x0020),
+	VOLUME_DELETE_USN_UNDERWAY	= cpu_to_le16(0x0010),
+	VOLUME_REPAIR_OBJECT_ID		= cpu_to_le16(0x0020),
 
-	VOLUME_CHKDSK_UNDERWAY		= const_cpu_to_le16(0x4000),
-	VOLUME_MODIFIED_BY_CHKDSK	= const_cpu_to_le16(0x8000),
+	VOLUME_CHKDSK_UNDERWAY		= cpu_to_le16(0x4000),
+	VOLUME_MODIFIED_BY_CHKDSK	= cpu_to_le16(0x8000),
 
-	VOLUME_FLAGS_MASK		= const_cpu_to_le16(0xc03f),
+	VOLUME_FLAGS_MASK		= cpu_to_le16(0xc03f),
 
 	/* To make our life easier when checking if we must mount read-only. */
-	VOLUME_MUST_MOUNT_RO_MASK	= const_cpu_to_le16(0xc027),
+	VOLUME_MUST_MOUNT_RO_MASK	= cpu_to_le16(0xc027),
 } __attribute__ ((__packed__));
 
 typedef le16 VOLUME_FLAGS;
@@ -2109,26 +2098,26 @@ typedef struct {
  * The user quota flags.  Names explain meaning.
  */
 enum {
-	QUOTA_FLAG_DEFAULT_LIMITS	= const_cpu_to_le32(0x00000001),
-	QUOTA_FLAG_LIMIT_REACHED	= const_cpu_to_le32(0x00000002),
-	QUOTA_FLAG_ID_DELETED		= const_cpu_to_le32(0x00000004),
+	QUOTA_FLAG_DEFAULT_LIMITS	= cpu_to_le32(0x00000001),
+	QUOTA_FLAG_LIMIT_REACHED	= cpu_to_le32(0x00000002),
+	QUOTA_FLAG_ID_DELETED		= cpu_to_le32(0x00000004),
 
-	QUOTA_FLAG_USER_MASK		= const_cpu_to_le32(0x00000007),
+	QUOTA_FLAG_USER_MASK		= cpu_to_le32(0x00000007),
 	/* This is a bit mask for the user quota flags. */
 
 	/*
 	 * These flags are only present in the quota defaults index entry, i.e.
 	 * in the entry where owner_id = QUOTA_DEFAULTS_ID.
 	 */
-	QUOTA_FLAG_TRACKING_ENABLED	= const_cpu_to_le32(0x00000010),
-	QUOTA_FLAG_ENFORCEMENT_ENABLED	= const_cpu_to_le32(0x00000020),
-	QUOTA_FLAG_TRACKING_REQUESTED	= const_cpu_to_le32(0x00000040),
-	QUOTA_FLAG_LOG_THRESHOLD	= const_cpu_to_le32(0x00000080),
-
-	QUOTA_FLAG_LOG_LIMIT		= const_cpu_to_le32(0x00000100),
-	QUOTA_FLAG_OUT_OF_DATE		= const_cpu_to_le32(0x00000200),
-	QUOTA_FLAG_CORRUPT		= const_cpu_to_le32(0x00000400),
-	QUOTA_FLAG_PENDING_DELETES	= const_cpu_to_le32(0x00000800),
+	QUOTA_FLAG_TRACKING_ENABLED	= cpu_to_le32(0x00000010),
+	QUOTA_FLAG_ENFORCEMENT_ENABLED	= cpu_to_le32(0x00000020),
+	QUOTA_FLAG_TRACKING_REQUESTED	= cpu_to_le32(0x00000040),
+	QUOTA_FLAG_LOG_THRESHOLD	= cpu_to_le32(0x00000080),
+
+	QUOTA_FLAG_LOG_LIMIT		= cpu_to_le32(0x00000100),
+	QUOTA_FLAG_OUT_OF_DATE		= cpu_to_le32(0x00000200),
+	QUOTA_FLAG_CORRUPT		= cpu_to_le32(0x00000400),
+	QUOTA_FLAG_PENDING_DELETES	= cpu_to_le32(0x00000800),
 };
 
 typedef le32 QUOTA_FLAGS;
@@ -2172,9 +2161,9 @@ typedef struct {
  * Predefined owner_id values (32-bit).
  */
 enum {
-	QUOTA_INVALID_ID	= const_cpu_to_le32(0x00000000),
-	QUOTA_DEFAULTS_ID	= const_cpu_to_le32(0x00000001),
-	QUOTA_FIRST_USER_ID	= const_cpu_to_le32(0x00000100),
+	QUOTA_INVALID_ID	= cpu_to_le32(0x00000000),
+	QUOTA_DEFAULTS_ID	= cpu_to_le32(0x00000001),
+	QUOTA_FIRST_USER_ID	= cpu_to_le32(0x00000100),
 };
 
 /*
@@ -2189,14 +2178,14 @@ typedef enum {
  * Index entry flags (16-bit).
  */
 enum {
-	INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a
+	INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
 			sub-node, i.e. a reference to an index block in form of
 			a virtual cluster number (see below). */
-	INDEX_ENTRY_END  = const_cpu_to_le16(2), /* This signifies the last
+	INDEX_ENTRY_END  = cpu_to_le16(2), /* This signifies the last
 			entry in an index block.  The index entry does not
 			represent a file but it can point to a sub-node. */
 
-	INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force
+	INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
 			enum bit width to 16-bit. */
 } __attribute__ ((__packed__));
 
@@ -2334,26 +2323,26 @@ typedef struct {
  * These are the predefined reparse point tags:
  */
 enum {
-	IO_REPARSE_TAG_IS_ALIAS		= const_cpu_to_le32(0x20000000),
-	IO_REPARSE_TAG_IS_HIGH_LATENCY	= const_cpu_to_le32(0x40000000),
-	IO_REPARSE_TAG_IS_MICROSOFT	= const_cpu_to_le32(0x80000000),
+	IO_REPARSE_TAG_IS_ALIAS		= cpu_to_le32(0x20000000),
+	IO_REPARSE_TAG_IS_HIGH_LATENCY	= cpu_to_le32(0x40000000),
+	IO_REPARSE_TAG_IS_MICROSOFT	= cpu_to_le32(0x80000000),
 
-	IO_REPARSE_TAG_RESERVED_ZERO	= const_cpu_to_le32(0x00000000),
-	IO_REPARSE_TAG_RESERVED_ONE	= const_cpu_to_le32(0x00000001),
-	IO_REPARSE_TAG_RESERVED_RANGE	= const_cpu_to_le32(0x00000001),
+	IO_REPARSE_TAG_RESERVED_ZERO	= cpu_to_le32(0x00000000),
+	IO_REPARSE_TAG_RESERVED_ONE	= cpu_to_le32(0x00000001),
+	IO_REPARSE_TAG_RESERVED_RANGE	= cpu_to_le32(0x00000001),
 
-	IO_REPARSE_TAG_NSS		= const_cpu_to_le32(0x68000005),
-	IO_REPARSE_TAG_NSS_RECOVER	= const_cpu_to_le32(0x68000006),
-	IO_REPARSE_TAG_SIS		= const_cpu_to_le32(0x68000007),
-	IO_REPARSE_TAG_DFS		= const_cpu_to_le32(0x68000008),
+	IO_REPARSE_TAG_NSS		= cpu_to_le32(0x68000005),
+	IO_REPARSE_TAG_NSS_RECOVER	= cpu_to_le32(0x68000006),
+	IO_REPARSE_TAG_SIS		= cpu_to_le32(0x68000007),
+	IO_REPARSE_TAG_DFS		= cpu_to_le32(0x68000008),
 
-	IO_REPARSE_TAG_MOUNT_POINT	= const_cpu_to_le32(0x88000003),
+	IO_REPARSE_TAG_MOUNT_POINT	= cpu_to_le32(0x88000003),
 
-	IO_REPARSE_TAG_HSM		= const_cpu_to_le32(0xa8000004),
+	IO_REPARSE_TAG_HSM		= cpu_to_le32(0xa8000004),
 
-	IO_REPARSE_TAG_SYMBOLIC_LINK	= const_cpu_to_le32(0xe8000000),
+	IO_REPARSE_TAG_SYMBOLIC_LINK	= cpu_to_le32(0xe8000000),
 
-	IO_REPARSE_TAG_VALID_VALUES	= const_cpu_to_le32(0xe000ffff),
+	IO_REPARSE_TAG_VALID_VALUES	= cpu_to_le32(0xe000ffff),
 };
 
 /*
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 9468e1c45ae..b5a6f08bd35 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -104,7 +104,7 @@ typedef struct {
  * in this particular client array.  Also inside the client records themselves,
  * this means that there are no client records preceding or following this one.
  */
-#define LOGFILE_NO_CLIENT	const_cpu_to_le16(0xffff)
+#define LOGFILE_NO_CLIENT	cpu_to_le16(0xffff)
 #define LOGFILE_NO_CLIENT_CPU	0xffff
 
 /*
@@ -112,8 +112,8 @@ typedef struct {
  * information about the log file in which they are present.
  */
 enum {
-	RESTART_VOLUME_IS_CLEAN	= const_cpu_to_le16(0x0002),
-	RESTART_SPACE_FILLER	= const_cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
+	RESTART_VOLUME_IS_CLEAN	= cpu_to_le16(0x0002),
+	RESTART_SPACE_FILLER	= cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
 } __attribute__ ((__packed__));
 
 typedef le16 RESTART_AREA_FLAGS;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 17d32ca6bc3..23bf68453d7 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2839,7 +2839,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
 	 */
 
 	/* Mark the mft record as not in use. */
-	m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE));
+	m->flags &= ~MFT_RECORD_IN_USE;
 
 	/* Increment the sequence number, skipping zero, if it is not zero. */
 	old_seq_no = m->sequence_number;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 4a46743b507..f76951dcd4a 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -618,7 +618,7 @@ static bool is_boot_sector_ntfs(const struct super_block *sb,
 	 * many BIOSes will refuse to boot from a bootsector if the magic is
 	 * incorrect, so we emit a warning.
 	 */
-	if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55))
+	if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
 		ntfs_warning(sb, "Invalid end of sector marker.");
 	return true;
 not_ntfs:
@@ -1242,13 +1242,13 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
 	u32 *kaddr, *kend;
 	ntfs_name *name = NULL;
 	int ret = 1;
-	static const ntfschar hiberfil[13] = { const_cpu_to_le16('h'),
-			const_cpu_to_le16('i'), const_cpu_to_le16('b'),
-			const_cpu_to_le16('e'), const_cpu_to_le16('r'),
-			const_cpu_to_le16('f'), const_cpu_to_le16('i'),
-			const_cpu_to_le16('l'), const_cpu_to_le16('.'),
-			const_cpu_to_le16('s'), const_cpu_to_le16('y'),
-			const_cpu_to_le16('s'), 0 };
+	static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
+			cpu_to_le16('i'), cpu_to_le16('b'),
+			cpu_to_le16('e'), cpu_to_le16('r'),
+			cpu_to_le16('f'), cpu_to_le16('i'),
+			cpu_to_le16('l'), cpu_to_le16('.'),
+			cpu_to_le16('s'), cpu_to_le16('y'),
+			cpu_to_le16('s'), 0 };
 
 	ntfs_debug("Entering.");
 	/*
@@ -1296,7 +1296,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
 		goto iput_out;
 	}
 	kaddr = (u32*)page_address(page);
-	if (*(le32*)kaddr == const_cpu_to_le32(0x72626968)/*'hibr'*/) {
+	if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
 		ntfs_debug("Magic \"hibr\" found in hiberfil.sys.  Windows is "
 				"hibernated on the volume.  This is the "
 				"system volume.");
@@ -1337,12 +1337,12 @@ static bool load_and_init_quota(ntfs_volume *vol)
 	MFT_REF mref;
 	struct inode *tmp_ino;
 	ntfs_name *name = NULL;
-	static const ntfschar Quota[7] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('Q'), const_cpu_to_le16('u'),
-			const_cpu_to_le16('o'), const_cpu_to_le16('t'),
-			const_cpu_to_le16('a'), 0 };
-	static ntfschar Q[3] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('Q'), 0 };
+	static const ntfschar Quota[7] = { cpu_to_le16('$'),
+			cpu_to_le16('Q'), cpu_to_le16('u'),
+			cpu_to_le16('o'), cpu_to_le16('t'),
+			cpu_to_le16('a'), 0 };
+	static ntfschar Q[3] = { cpu_to_le16('$'),
+			cpu_to_le16('Q'), 0 };
 
 	ntfs_debug("Entering.");
 	/*
@@ -1416,16 +1416,16 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
 	struct page *page;
 	ntfs_name *name = NULL;
 	USN_HEADER *uh;
-	static const ntfschar UsnJrnl[9] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('U'), const_cpu_to_le16('s'),
-			const_cpu_to_le16('n'), const_cpu_to_le16('J'),
-			const_cpu_to_le16('r'), const_cpu_to_le16('n'),
-			const_cpu_to_le16('l'), 0 };
-	static ntfschar Max[5] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('M'), const_cpu_to_le16('a'),
-			const_cpu_to_le16('x'), 0 };
-	static ntfschar J[3] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('J'), 0 };
+	static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
+			cpu_to_le16('U'), cpu_to_le16('s'),
+			cpu_to_le16('n'), cpu_to_le16('J'),
+			cpu_to_le16('r'), cpu_to_le16('n'),
+			cpu_to_le16('l'), 0 };
+	static ntfschar Max[5] = { cpu_to_le16('$'),
+			cpu_to_le16('M'), cpu_to_le16('a'),
+			cpu_to_le16('x'), 0 };
+	static ntfschar J[3] = { cpu_to_le16('$'),
+			cpu_to_le16('J'), 0 };
 
 	ntfs_debug("Entering.");
 	/*
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 4087fbdac32..00d8e6bd7c3 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -116,27 +116,27 @@ typedef struct {
  * documentation: http://www.linux-ntfs.org/
  */
 enum {
-	USN_REASON_DATA_OVERWRITE	= const_cpu_to_le32(0x00000001),
-	USN_REASON_DATA_EXTEND		= const_cpu_to_le32(0x00000002),
-	USN_REASON_DATA_TRUNCATION	= const_cpu_to_le32(0x00000004),
-	USN_REASON_NAMED_DATA_OVERWRITE	= const_cpu_to_le32(0x00000010),
-	USN_REASON_NAMED_DATA_EXTEND	= const_cpu_to_le32(0x00000020),
-	USN_REASON_NAMED_DATA_TRUNCATION= const_cpu_to_le32(0x00000040),
-	USN_REASON_FILE_CREATE		= const_cpu_to_le32(0x00000100),
-	USN_REASON_FILE_DELETE		= const_cpu_to_le32(0x00000200),
-	USN_REASON_EA_CHANGE		= const_cpu_to_le32(0x00000400),
-	USN_REASON_SECURITY_CHANGE	= const_cpu_to_le32(0x00000800),
-	USN_REASON_RENAME_OLD_NAME	= const_cpu_to_le32(0x00001000),
-	USN_REASON_RENAME_NEW_NAME	= const_cpu_to_le32(0x00002000),
-	USN_REASON_INDEXABLE_CHANGE	= const_cpu_to_le32(0x00004000),
-	USN_REASON_BASIC_INFO_CHANGE	= const_cpu_to_le32(0x00008000),
-	USN_REASON_HARD_LINK_CHANGE	= const_cpu_to_le32(0x00010000),
-	USN_REASON_COMPRESSION_CHANGE	= const_cpu_to_le32(0x00020000),
-	USN_REASON_ENCRYPTION_CHANGE	= const_cpu_to_le32(0x00040000),
-	USN_REASON_OBJECT_ID_CHANGE	= const_cpu_to_le32(0x00080000),
-	USN_REASON_REPARSE_POINT_CHANGE	= const_cpu_to_le32(0x00100000),
-	USN_REASON_STREAM_CHANGE	= const_cpu_to_le32(0x00200000),
-	USN_REASON_CLOSE		= const_cpu_to_le32(0x80000000),
+	USN_REASON_DATA_OVERWRITE	= cpu_to_le32(0x00000001),
+	USN_REASON_DATA_EXTEND		= cpu_to_le32(0x00000002),
+	USN_REASON_DATA_TRUNCATION	= cpu_to_le32(0x00000004),
+	USN_REASON_NAMED_DATA_OVERWRITE	= cpu_to_le32(0x00000010),
+	USN_REASON_NAMED_DATA_EXTEND	= cpu_to_le32(0x00000020),
+	USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
+	USN_REASON_FILE_CREATE		= cpu_to_le32(0x00000100),
+	USN_REASON_FILE_DELETE		= cpu_to_le32(0x00000200),
+	USN_REASON_EA_CHANGE		= cpu_to_le32(0x00000400),
+	USN_REASON_SECURITY_CHANGE	= cpu_to_le32(0x00000800),
+	USN_REASON_RENAME_OLD_NAME	= cpu_to_le32(0x00001000),
+	USN_REASON_RENAME_NEW_NAME	= cpu_to_le32(0x00002000),
+	USN_REASON_INDEXABLE_CHANGE	= cpu_to_le32(0x00004000),
+	USN_REASON_BASIC_INFO_CHANGE	= cpu_to_le32(0x00008000),
+	USN_REASON_HARD_LINK_CHANGE	= cpu_to_le32(0x00010000),
+	USN_REASON_COMPRESSION_CHANGE	= cpu_to_le32(0x00020000),
+	USN_REASON_ENCRYPTION_CHANGE	= cpu_to_le32(0x00040000),
+	USN_REASON_OBJECT_ID_CHANGE	= cpu_to_le32(0x00080000),
+	USN_REASON_REPARSE_POINT_CHANGE	= cpu_to_le32(0x00100000),
+	USN_REASON_STREAM_CHANGE	= cpu_to_le32(0x00200000),
+	USN_REASON_CLOSE		= cpu_to_le32(0x80000000),
 };
 
 typedef le32 USN_REASON_FLAGS;
@@ -148,9 +148,9 @@ typedef le32 USN_REASON_FLAGS;
  *	http://www.linux-ntfs.org/
  */
 enum {
-	USN_SOURCE_DATA_MANAGEMENT	  = const_cpu_to_le32(0x00000001),
-	USN_SOURCE_AUXILIARY_DATA	  = const_cpu_to_le32(0x00000002),
-	USN_SOURCE_REPLICATION_MANAGEMENT = const_cpu_to_le32(0x00000004),
+	USN_SOURCE_DATA_MANAGEMENT	  = cpu_to_le32(0x00000001),
+	USN_SOURCE_AUXILIARY_DATA	  = cpu_to_le32(0x00000002),
+	USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
 };
 
 typedef le32 USN_SOURCE_INFO_FLAGS;
-- 
cgit v1.2.3


From 891f7d73ea30f925596b90bcf21020bfc5d90f3f Mon Sep 17 00:00:00 2001
From: David Altobelli <david.altobelli@hp.com>
Date: Tue, 31 Mar 2009 15:23:53 -0700
Subject: hpilo: reduce frequency of IO operations

Change hpilo open and close logic to spin for 10usec between checking device,
rather than every usec.

Because the loop is coded to take up to 10ms, it seemed prudent to
increase the interval between polling the device, to reduce the load on
the system and allow more other work to happen.

Signed-off-by: David Altobelli <david.altobelli@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/misc/hpilo.c | 6 +++---
 drivers/misc/hpilo.h | 6 +++++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/hpilo.c b/drivers/misc/hpilo.c
index cf991850f01..880ccf39e23 100644
--- a/drivers/misc/hpilo.c
+++ b/drivers/misc/hpilo.c
@@ -209,7 +209,7 @@ static void ilo_ccb_close(struct pci_dev *pdev, struct ccb_data *data)
 	/* give iLO some time to process stop request */
 	for (retries = MAX_WAIT; retries > 0; retries--) {
 		doorbell_set(driver_ccb);
-		udelay(1);
+		udelay(WAIT_TIME);
 		if (!(ioread32(&device_ccb->send_ctrl) & (1 << CTRL_BITPOS_A))
 		    &&
 		    !(ioread32(&device_ccb->recv_ctrl) & (1 << CTRL_BITPOS_A)))
@@ -312,7 +312,7 @@ static int ilo_ccb_open(struct ilo_hwinfo *hw, struct ccb_data *data, int slot)
 	for (i = MAX_WAIT; i > 0; i--) {
 		if (ilo_pkt_dequeue(hw, driver_ccb, SENDQ, &pkt_id, NULL, NULL))
 			break;
-		udelay(1);
+		udelay(WAIT_TIME);
 	}
 
 	if (i) {
@@ -759,7 +759,7 @@ static void __exit ilo_exit(void)
 	class_destroy(ilo_class);
 }
 
-MODULE_VERSION("1.0");
+MODULE_VERSION("1.1");
 MODULE_ALIAS(ILO_NAME);
 MODULE_DESCRIPTION(ILO_NAME);
 MODULE_AUTHOR("David Altobelli <david.altobelli@hp.com>");
diff --git a/drivers/misc/hpilo.h b/drivers/misc/hpilo.h
index b64a20ef07e..03a14c82aad 100644
--- a/drivers/misc/hpilo.h
+++ b/drivers/misc/hpilo.h
@@ -19,8 +19,12 @@
 #define MAX_ILO_DEV	1
 /* max number of files */
 #define MAX_OPEN	(MAX_CCB * MAX_ILO_DEV)
+/* total wait time in usec */
+#define MAX_WAIT_TIME	10000
+/* per spin wait time in usec */
+#define WAIT_TIME	10
 /* spin counter for open/close delay */
-#define MAX_WAIT	10000
+#define MAX_WAIT	(MAX_WAIT_TIME / WAIT_TIME)
 
 /*
  * Per device, used to track global memory allocations.
-- 
cgit v1.2.3


From 3cdbbeebb77348176bd6a03fd86e11bc281c529e Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Tue, 31 Mar 2009 15:23:53 -0700
Subject: drivers/misc/isl29003.c: driver for the ISL29003 ambient light sensor

Add a driver for Intersil's ISL29003 ambient light sensor device plus some
documentation.  Inspired by tsl2550.c, a driver for a similar device.

It is put in drivers/misc for now until the industrial I/O framework gets
merged.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Acked-by: Jonathan Cameron <jic23@cam.ac.uk>
Cc: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/misc-devices/isl29003 |  62 +++++
 drivers/misc/Kconfig                |  10 +
 drivers/misc/Makefile               |   1 +
 drivers/misc/isl29003.c             | 470 ++++++++++++++++++++++++++++++++++++
 4 files changed, 543 insertions(+)
 create mode 100644 Documentation/misc-devices/isl29003
 create mode 100644 drivers/misc/isl29003.c

diff --git a/Documentation/misc-devices/isl29003 b/Documentation/misc-devices/isl29003
new file mode 100644
index 00000000000..c4ff5f38e01
--- /dev/null
+++ b/Documentation/misc-devices/isl29003
@@ -0,0 +1,62 @@
+Kernel driver isl29003
+=====================
+
+Supported chips:
+* Intersil ISL29003
+Prefix: 'isl29003'
+Addresses scanned: none
+Datasheet:
+http://www.intersil.com/data/fn/fn7464.pdf
+
+Author: Daniel Mack <daniel@caiaq.de>
+
+
+Description
+-----------
+The ISL29003 is an integrated light sensor with a 16-bit integrating type
+ADC, I2C user programmable lux range select for optimized counts/lux, and
+I2C multi-function control and monitoring capabilities. The internal ADC
+provides 16-bit resolution while rejecting 50Hz and 60Hz flicker caused by
+artificial light sources.
+
+The driver allows to set the lux range, the bit resolution, the operational
+mode (see below) and the power state of device and can read the current lux
+value, of course.
+
+
+Detection
+---------
+
+The ISL29003 does not have an ID register which could be used to identify
+it, so the detection routine will just try to read from the configured I2C
+addess and consider the device to be present as soon as it ACKs the
+transfer.
+
+
+Sysfs entries
+-------------
+
+range:
+	0: 0 lux to 1000 lux (default)
+	1: 0 lux to 4000 lux
+	2: 0 lux to 16,000 lux
+	3: 0 lux to 64,000 lux
+
+resolution:
+	0: 2^16 cycles (default)
+	1: 2^12 cycles
+	2: 2^8 cycles
+	3: 2^4 cycles
+
+mode:
+	0: diode1's current (unsigned 16bit) (default)
+	1: diode1's current (unsigned 16bit)
+	2: difference between diodes (l1 - l2, signed 15bit)
+
+power_state:
+	0: device is disabled (default)
+	1: device is enabled
+
+lux (read only):
+	returns the value from the last sensor reading
+
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 1c484084ed4..5f3bff43462 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -223,6 +223,16 @@ config DELL_LAPTOP
 	This driver adds support for rfkill and backlight control to Dell
 	laptops.
 
+config ISL29003
+	tristate "Intersil ISL29003 ambient light sensor"
+	depends on I2C && SYSFS
+	help
+	  If you say yes here you get support for the Intersil ISL29003
+	  ambient light sensor.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called isl29003.
+
 source "drivers/misc/c2port/Kconfig"
 source "drivers/misc/eeprom/Kconfig"
 
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index bc119983055..7871f05dcb9 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -18,5 +18,6 @@ obj-$(CONFIG_KGDB_TESTS)	+= kgdbts.o
 obj-$(CONFIG_SGI_XP)		+= sgi-xp/
 obj-$(CONFIG_SGI_GRU)		+= sgi-gru/
 obj-$(CONFIG_HP_ILO)		+= hpilo.o
+obj-$(CONFIG_ISL29003)		+= isl29003.o
 obj-$(CONFIG_C2PORT)		+= c2port/
 obj-y				+= eeprom/
diff --git a/drivers/misc/isl29003.c b/drivers/misc/isl29003.c
new file mode 100644
index 00000000000..2e2a5923d4c
--- /dev/null
+++ b/drivers/misc/isl29003.c
@@ -0,0 +1,470 @@
+/*
+ *  isl29003.c - Linux kernel module for
+ * 	Intersil ISL29003 ambient light sensor
+ *
+ *  See file:Documentation/misc-devices/isl29003
+ *
+ *  Copyright (c) 2009 Daniel Mack <daniel@caiaq.de>
+ *
+ *  Based on code written by
+ *  	Rodolfo Giometti <giometti@linux.it>
+ *  	Eurotech S.p.A. <info@eurotech.it>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+
+#define ISL29003_DRV_NAME	"isl29003"
+#define DRIVER_VERSION		"1.0"
+
+#define ISL29003_REG_COMMAND		0x00
+#define ISL29003_ADC_ENABLED		(1 << 7)
+#define ISL29003_ADC_PD			(1 << 6)
+#define ISL29003_TIMING_INT		(1 << 5)
+#define ISL29003_MODE_SHIFT		(2)
+#define ISL29003_MODE_MASK		(0x3 << ISL29003_MODE_SHIFT)
+#define ISL29003_RES_SHIFT		(0)
+#define ISL29003_RES_MASK		(0x3 << ISL29003_RES_SHIFT)
+
+#define ISL29003_REG_CONTROL		0x01
+#define ISL29003_INT_FLG		(1 << 5)
+#define ISL29003_RANGE_SHIFT		(2)
+#define ISL29003_RANGE_MASK		(0x3 << ISL29003_RANGE_SHIFT)
+#define ISL29003_INT_PERSISTS_SHIFT	(0)
+#define ISL29003_INT_PERSISTS_MASK	(0xf << ISL29003_INT_PERSISTS_SHIFT)
+
+#define ISL29003_REG_IRQ_THRESH_HI	0x02
+#define ISL29003_REG_IRQ_THRESH_LO	0x03
+#define ISL29003_REG_LSB_SENSOR		0x04
+#define ISL29003_REG_MSB_SENSOR		0x05
+#define ISL29003_REG_LSB_TIMER		0x06
+#define ISL29003_REG_MSB_TIMER		0x07
+
+#define ISL29003_NUM_CACHABLE_REGS	4
+
+struct isl29003_data {
+	struct i2c_client *client;
+	struct mutex lock;
+	u8 reg_cache[ISL29003_NUM_CACHABLE_REGS];
+};
+
+static int gain_range[] = {
+	1000, 4000, 16000, 64000
+};
+
+/*
+ * register access helpers
+ */
+
+static int __isl29003_read_reg(struct i2c_client *client,
+			       u32 reg, u8 mask, u8 shift)
+{
+	struct isl29003_data *data = i2c_get_clientdata(client);
+	return (data->reg_cache[reg] & mask) >> shift;
+}
+
+static int __isl29003_write_reg(struct i2c_client *client,
+				u32 reg, u8 mask, u8 shift, u8 val)
+{
+	struct isl29003_data *data = i2c_get_clientdata(client);
+	int ret = 0;
+	u8 tmp;
+
+	if (reg >= ISL29003_NUM_CACHABLE_REGS)
+		return -EINVAL;
+
+	mutex_lock(&data->lock);
+
+	tmp = data->reg_cache[reg];
+	tmp &= ~mask;
+	tmp |= val << shift;
+
+	ret = i2c_smbus_write_byte_data(client, reg, tmp);
+	if (!ret)
+		data->reg_cache[reg] = tmp;
+
+	mutex_unlock(&data->lock);
+	return ret;
+}
+
+/*
+ * internally used functions
+ */
+
+/* range */
+static int isl29003_get_range(struct i2c_client *client)
+{
+	return __isl29003_read_reg(client, ISL29003_REG_CONTROL,
+		ISL29003_RANGE_MASK, ISL29003_RANGE_SHIFT);
+}
+
+static int isl29003_set_range(struct i2c_client *client, int range)
+{
+	return __isl29003_write_reg(client, ISL29003_REG_CONTROL,
+		ISL29003_RANGE_MASK, ISL29003_RANGE_SHIFT, range);
+}
+
+/* resolution */
+static int isl29003_get_resolution(struct i2c_client *client)
+{
+	return __isl29003_read_reg(client, ISL29003_REG_COMMAND,
+		ISL29003_RES_MASK, ISL29003_RES_SHIFT);
+}
+
+static int isl29003_set_resolution(struct i2c_client *client, int res)
+{
+	return __isl29003_write_reg(client, ISL29003_REG_COMMAND,
+		ISL29003_RES_MASK, ISL29003_RES_SHIFT, res);
+}
+
+/* mode */
+static int isl29003_get_mode(struct i2c_client *client)
+{
+	return __isl29003_read_reg(client, ISL29003_REG_COMMAND,
+		ISL29003_RES_MASK, ISL29003_RES_SHIFT);
+}
+
+static int isl29003_set_mode(struct i2c_client *client, int mode)
+{
+	return __isl29003_write_reg(client, ISL29003_REG_COMMAND,
+		ISL29003_RES_MASK, ISL29003_RES_SHIFT, mode);
+}
+
+/* power_state */
+static int isl29003_set_power_state(struct i2c_client *client, int state)
+{
+	return __isl29003_write_reg(client, ISL29003_REG_COMMAND,
+				ISL29003_ADC_ENABLED | ISL29003_ADC_PD, 0,
+				state ? ISL29003_ADC_ENABLED : ISL29003_ADC_PD);
+}
+
+static int isl29003_get_power_state(struct i2c_client *client)
+{
+	struct isl29003_data *data = i2c_get_clientdata(client);
+	u8 cmdreg = data->reg_cache[ISL29003_REG_COMMAND];
+	return ~cmdreg & ISL29003_ADC_PD;
+}
+
+static int isl29003_get_adc_value(struct i2c_client *client)
+{
+	struct isl29003_data *data = i2c_get_clientdata(client);
+	int lsb, msb, range, bitdepth;
+
+	mutex_lock(&data->lock);
+	lsb = i2c_smbus_read_byte_data(client, ISL29003_REG_LSB_SENSOR);
+
+	if (lsb < 0) {
+		mutex_unlock(&data->lock);
+		return lsb;
+	}
+
+	msb = i2c_smbus_read_byte_data(client, ISL29003_REG_MSB_SENSOR);
+	mutex_unlock(&data->lock);
+
+	if (msb < 0)
+		return msb;
+
+	range = isl29003_get_range(client);
+	bitdepth = (4 - isl29003_get_resolution(client)) * 4;
+	return (((msb << 8) | lsb) * gain_range[range]) >> bitdepth;
+}
+
+/*
+ * sysfs layer
+ */
+
+/* range */
+static ssize_t isl29003_show_range(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	return sprintf(buf, "%i\n", isl29003_get_range(client));
+}
+
+static ssize_t isl29003_store_range(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	unsigned long val;
+	int ret;
+
+	if ((strict_strtoul(buf, 10, &val) < 0) || (val > 3))
+		return -EINVAL;
+
+	ret = isl29003_set_range(client, val);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+static DEVICE_ATTR(range, S_IWUSR | S_IRUGO,
+		   isl29003_show_range, isl29003_store_range);
+
+
+/* resolution */
+static ssize_t isl29003_show_resolution(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	return sprintf(buf, "%d\n", isl29003_get_resolution(client));
+}
+
+static ssize_t isl29003_store_resolution(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	unsigned long val;
+	int ret;
+
+	if ((strict_strtoul(buf, 10, &val) < 0) || (val > 3))
+		return -EINVAL;
+
+	ret = isl29003_set_resolution(client, val);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+static DEVICE_ATTR(resolution, S_IWUSR | S_IRUGO,
+		   isl29003_show_resolution, isl29003_store_resolution);
+
+/* mode */
+static ssize_t isl29003_show_mode(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	return sprintf(buf, "%d\n", isl29003_get_mode(client));
+}
+
+static ssize_t isl29003_store_mode(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	unsigned long val;
+	int ret;
+
+	if ((strict_strtoul(buf, 10, &val) < 0) || (val > 2))
+		return -EINVAL;
+
+	ret = isl29003_set_mode(client, val);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO,
+		   isl29003_show_mode, isl29003_store_mode);
+
+
+/* power state */
+static ssize_t isl29003_show_power_state(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	return sprintf(buf, "%d\n", isl29003_get_power_state(client));
+}
+
+static ssize_t isl29003_store_power_state(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	unsigned long val;
+	int ret;
+
+	if ((strict_strtoul(buf, 10, &val) < 0) || (val > 1))
+		return -EINVAL;
+
+	ret = isl29003_set_power_state(client, val);
+	return ret ? ret : count;
+}
+
+static DEVICE_ATTR(power_state, S_IWUSR | S_IRUGO,
+		   isl29003_show_power_state, isl29003_store_power_state);
+
+
+/* lux */
+static ssize_t isl29003_show_lux(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+
+	/* No LUX data if not operational */
+	if (!isl29003_get_power_state(client))
+		return -EBUSY;
+
+	return sprintf(buf, "%d\n", isl29003_get_adc_value(client));
+}
+
+static DEVICE_ATTR(lux, S_IRUGO, isl29003_show_lux, NULL);
+
+static struct attribute *isl29003_attributes[] = {
+	&dev_attr_range.attr,
+	&dev_attr_resolution.attr,
+	&dev_attr_mode.attr,
+	&dev_attr_power_state.attr,
+	&dev_attr_lux.attr,
+	NULL
+};
+
+static const struct attribute_group isl29003_attr_group = {
+	.attrs = isl29003_attributes,
+};
+
+static int isl29003_init_client(struct i2c_client *client)
+{
+	struct isl29003_data *data = i2c_get_clientdata(client);
+	int i;
+
+	/* read all the registers once to fill the cache.
+	 * if one of the reads fails, we consider the init failed */
+	for (i = 0; i < ARRAY_SIZE(data->reg_cache); i++) {
+		int v = i2c_smbus_read_byte_data(client, i);
+		if (v < 0)
+			return -ENODEV;
+
+		data->reg_cache[i] = v;
+	}
+
+	/* set defaults */
+	isl29003_set_range(client, 0);
+	isl29003_set_resolution(client, 0);
+	isl29003_set_mode(client, 0);
+	isl29003_set_power_state(client, 0);
+
+	return 0;
+}
+
+/*
+ * I2C layer
+ */
+
+static int __devinit isl29003_probe(struct i2c_client *client,
+				    const struct i2c_device_id *id)
+{
+	struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
+	struct isl29003_data *data;
+	int err = 0;
+
+	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE))
+		return -EIO;
+
+	data = kzalloc(sizeof(struct isl29003_data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->client = client;
+	i2c_set_clientdata(client, data);
+	mutex_init(&data->lock);
+
+	/* initialize the ISL29003 chip */
+	err = isl29003_init_client(client);
+	if (err)
+		goto exit_kfree;
+
+	/* register sysfs hooks */
+	err = sysfs_create_group(&client->dev.kobj, &isl29003_attr_group);
+	if (err)
+		goto exit_kfree;
+
+	dev_info(&client->dev, "driver version %s enabled\n", DRIVER_VERSION);
+	return 0;
+
+exit_kfree:
+	kfree(data);
+	return err;
+}
+
+static int __devexit isl29003_remove(struct i2c_client *client)
+{
+	sysfs_remove_group(&client->dev.kobj, &isl29003_attr_group);
+	isl29003_set_power_state(client, 0);
+	kfree(i2c_get_clientdata(client));
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int isl29003_suspend(struct i2c_client *client, pm_message_t mesg)
+{
+	return isl29003_set_power_state(client, 0);
+}
+
+static int isl29003_resume(struct i2c_client *client)
+{
+	int i;
+	struct isl29003_data *data = i2c_get_clientdata(client);
+
+	/* restore registers from cache */
+	for (i = 0; i < ARRAY_SIZE(data->reg_cache); i++)
+		if (!i2c_smbus_write_byte_data(client, i, data->reg_cache[i]))
+			return -EIO;
+
+	return 0;
+}
+
+#else
+#define isl29003_suspend	NULL
+#define isl29003_resume		NULL
+#endif /* CONFIG_PM */
+
+static const struct i2c_device_id isl29003_id[] = {
+	{ "isl29003", 0 },
+	{}
+};
+MODULE_DEVICE_TABLE(i2c, isl29003_id);
+
+static struct i2c_driver isl29003_driver = {
+	.driver = {
+		.name	= ISL29003_DRV_NAME,
+		.owner	= THIS_MODULE,
+	},
+	.suspend = isl29003_suspend,
+	.resume	= isl29003_resume,
+	.probe	= isl29003_probe,
+	.remove	= __devexit_p(isl29003_remove),
+	.id_table = isl29003_id,
+};
+
+static int __init isl29003_init(void)
+{
+	return i2c_add_driver(&isl29003_driver);
+}
+
+static void __exit isl29003_exit(void)
+{
+	i2c_del_driver(&isl29003_driver);
+}
+
+MODULE_AUTHOR("Daniel Mack <daniel@caiaq.de>");
+MODULE_DESCRIPTION("ISL29003 ambient light sensor driver");
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(DRIVER_VERSION);
+
+module_init(isl29003_init);
+module_exit(isl29003_exit);
+
-- 
cgit v1.2.3


From 5071f97ec6d74f006072de0ce89b67c8792fe5a1 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:10 -0700
Subject: epoll: fix epoll's own poll

Fix a bug inside the epoll's f_op->poll() code, that returns POLLIN even
though there are no actual ready monitored fds.  The bug shows up if you
add an epoll fd inside another fd container (poll, select, epoll).

The problem is that callback-based wake ups used by epoll does not carry
(patches will follow, to fix this) any information about the events that
actually happened.  So the callback code, since it can't call the file*
->poll() inside the callback, chains the file* into a ready-list.

So, suppose you added an fd with EPOLLOUT only, and some data shows up on
the fd, the file* mapped by the fd will be added into the ready-list (via
wakeup callback).  During normal epoll_wait() use, this condition is
sorted out at the time we're actually able to call the file*'s
f_op->poll().

Inside the old epoll's f_op->poll() though, only a quick check
!list_empty(ready-list) was performed, and this could have led to
reporting POLLIN even though no ready fds would show up at a following
epoll_wait().  In order to correctly report the ready status for an epoll
fd, the ready-list must be checked to see if any really available fd+event
would be ready in a following epoll_wait().

Operation (calling f_op->poll() from inside f_op->poll()) that, like wake
ups, must be handled with care because of the fact that epoll fds can be
added to other epoll fds.

Test code:

/*
 *  epoll_test by Davide Libenzi (Simple code to test epoll internals)
 *  Copyright (C) 2008  Davide Libenzi
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <limits.h>
#include <poll.h>
#include <sys/epoll.h>
#include <sys/wait.h>

#define EPWAIT_TIMEO	(1 * 1000)
#ifndef POLLRDHUP
#define POLLRDHUP 0x2000
#endif

#define EPOLL_MAX_CHAIN	100L

#define EPOLL_TF_LOOP (1 << 0)

struct epoll_test_cfg {
	long size;
	long flags;
};

static int xepoll_create(int n) {
	int epfd;

	if ((epfd = epoll_create(n)) == -1) {
		perror("epoll_create");
		exit(2);
	}

	return epfd;
}

static void xepoll_ctl(int epfd, int cmd, int fd, struct epoll_event *evt) {
	if (epoll_ctl(epfd, cmd, fd, evt) < 0) {
		perror("epoll_ctl");
		exit(3);
	}
}

static void xpipe(int *fds) {
	if (pipe(fds)) {
		perror("pipe");
		exit(4);
	}
}

static pid_t xfork(void) {
	pid_t pid;

	if ((pid = fork()) == (pid_t) -1) {
		perror("pipe");
		exit(5);
	}

	return pid;
}

static int run_forked_proc(int (*proc)(void *), void *data) {
	int status;
	pid_t pid;

	if ((pid = xfork()) == 0)
		exit((*proc)(data));
	if (waitpid(pid, &status, 0) != pid) {
		perror("waitpid");
		return -1;
	}

	return WIFEXITED(status) ? WEXITSTATUS(status): -2;
}

static int check_events(int fd, int timeo) {
	struct pollfd pfd;

	fprintf(stdout, "Checking events for fd %d\n", fd);
	memset(&pfd, 0, sizeof(pfd));
	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT;
	if (poll(&pfd, 1, timeo) < 0) {
		perror("poll()");
		return 0;
	}
	if (pfd.revents & POLLIN)
		fprintf(stdout, "\tPOLLIN\n");
	if (pfd.revents & POLLOUT)
		fprintf(stdout, "\tPOLLOUT\n");
	if (pfd.revents & POLLERR)
		fprintf(stdout, "\tPOLLERR\n");
	if (pfd.revents & POLLHUP)
		fprintf(stdout, "\tPOLLHUP\n");
	if (pfd.revents & POLLRDHUP)
		fprintf(stdout, "\tPOLLRDHUP\n");

	return pfd.revents;
}

static int epoll_test_tty(void *data) {
	int epfd, ifd = fileno(stdin), res;
	struct epoll_event evt;

	if (check_events(ifd, 0) != POLLOUT) {
		fprintf(stderr, "Something is cooking on STDIN (%d)\n", ifd);
		return 1;
	}
	epfd = xepoll_create(1);
	fprintf(stdout, "Created epoll fd (%d)\n", epfd);
	memset(&evt, 0, sizeof(evt));
	evt.events = EPOLLIN;
	xepoll_ctl(epfd, EPOLL_CTL_ADD, ifd, &evt);
	if (check_events(epfd, 0) & POLLIN) {
		res = epoll_wait(epfd, &evt, 1, 0);
		if (res == 0) {
			fprintf(stderr, "Epoll fd (%d) is ready when it shouldn't!\n",
				epfd);
			return 2;
		}
	}

	return 0;
}

static int epoll_wakeup_chain(void *data) {
	struct epoll_test_cfg *tcfg = data;
	int i, res, epfd, bfd, nfd, pfds[2];
	pid_t pid;
	struct epoll_event evt;

	memset(&evt, 0, sizeof(evt));
	evt.events = EPOLLIN;

	epfd = bfd = xepoll_create(1);

	for (i = 0; i < tcfg->size; i++) {
		nfd = xepoll_create(1);
		xepoll_ctl(bfd, EPOLL_CTL_ADD, nfd, &evt);
		bfd = nfd;
	}
	xpipe(pfds);
	if (tcfg->flags & EPOLL_TF_LOOP)
	{
		xepoll_ctl(bfd, EPOLL_CTL_ADD, epfd, &evt);
		/*
		 * If we're testing for loop, we want that the wakeup
		 * triggered by the write to the pipe done in the child
		 * process, triggers a fake event. So we add the pipe
		 * read size with EPOLLOUT events. This will trigger
		 * an addition to the ready-list, but no real events
		 * will be there. The the epoll kernel code will proceed
		 * in calling f_op->poll() of the epfd, triggering the
		 * loop we want to test.
		 */
		evt.events = EPOLLOUT;
	}
	xepoll_ctl(bfd, EPOLL_CTL_ADD, pfds[0], &evt);

	/*
	 * The pipe write must come after the poll(2) call inside
	 * check_events(). This tests the nested wakeup code in
	 * fs/eventpoll.c:ep_poll_safewake()
	 * By having the check_events() (hence poll(2)) happens first,
	 * we have poll wait queue filled up, and the write(2) in the
	 * child will trigger the wakeup chain.
	 */
	if ((pid = xfork()) == 0) {
		sleep(1);
		write(pfds[1], "w", 1);
		exit(0);
	}

	res = check_events(epfd, 2000) & POLLIN;

	if (waitpid(pid, NULL, 0) != pid) {
		perror("waitpid");
		return -1;
	}

	return res;
}

static int epoll_poll_chain(void *data) {
	struct epoll_test_cfg *tcfg = data;
	int i, res, epfd, bfd, nfd, pfds[2];
	pid_t pid;
	struct epoll_event evt;

	memset(&evt, 0, sizeof(evt));
	evt.events = EPOLLIN;

	epfd = bfd = xepoll_create(1);

	for (i = 0; i < tcfg->size; i++) {
		nfd = xepoll_create(1);
		xepoll_ctl(bfd, EPOLL_CTL_ADD, nfd, &evt);
		bfd = nfd;
	}
	xpipe(pfds);
	if (tcfg->flags & EPOLL_TF_LOOP)
	{
		xepoll_ctl(bfd, EPOLL_CTL_ADD, epfd, &evt);
		/*
		 * If we're testing for loop, we want that the wakeup
		 * triggered by the write to the pipe done in the child
		 * process, triggers a fake event. So we add the pipe
		 * read size with EPOLLOUT events. This will trigger
		 * an addition to the ready-list, but no real events
		 * will be there. The the epoll kernel code will proceed
		 * in calling f_op->poll() of the epfd, triggering the
		 * loop we want to test.
		 */
		evt.events = EPOLLOUT;
	}
	xepoll_ctl(bfd, EPOLL_CTL_ADD, pfds[0], &evt);

	/*
	 * The pipe write mush come before the poll(2) call inside
	 * check_events(). This tests the nested f_op->poll calls code in
	 * fs/eventpoll.c:ep_eventpoll_poll()
	 * By having the pipe write(2) happen first, we make the kernel
	 * epoll code to load the ready lists, and the following poll(2)
	 * done inside check_events() will test nested poll code in
	 * ep_eventpoll_poll().
	 */
	if ((pid = xfork()) == 0) {
		write(pfds[1], "w", 1);
		exit(0);
	}
	sleep(1);
	res = check_events(epfd, 1000) & POLLIN;

	if (waitpid(pid, NULL, 0) != pid) {
		perror("waitpid");
		return -1;
	}

	return res;
}

int main(int ac, char **av) {
	int error;
	struct epoll_test_cfg tcfg;

	fprintf(stdout, "\n********** Testing TTY events\n");
	error = run_forked_proc(epoll_test_tty, NULL);
	fprintf(stdout, error == 0 ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = 3;
	tcfg.flags = 0;
	fprintf(stdout, "\n********** Testing short wakeup chain\n");
	error = run_forked_proc(epoll_wakeup_chain, &tcfg);
	fprintf(stdout, error == POLLIN ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = EPOLL_MAX_CHAIN;
	tcfg.flags = 0;
	fprintf(stdout, "\n********** Testing long wakeup chain (HOLD ON)\n");
	error = run_forked_proc(epoll_wakeup_chain, &tcfg);
	fprintf(stdout, error == 0 ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = 3;
	tcfg.flags = 0;
	fprintf(stdout, "\n********** Testing short poll chain\n");
	error = run_forked_proc(epoll_poll_chain, &tcfg);
	fprintf(stdout, error == POLLIN ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = EPOLL_MAX_CHAIN;
	tcfg.flags = 0;
	fprintf(stdout, "\n********** Testing long poll chain (HOLD ON)\n");
	error = run_forked_proc(epoll_poll_chain, &tcfg);
	fprintf(stdout, error == 0 ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = 3;
	tcfg.flags = EPOLL_TF_LOOP;
	fprintf(stdout, "\n********** Testing loopy wakeup chain (HOLD ON)\n");
	error = run_forked_proc(epoll_wakeup_chain, &tcfg);
	fprintf(stdout, error == 0 ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = 3;
	tcfg.flags = EPOLL_TF_LOOP;
	fprintf(stdout, "\n********** Testing loopy poll chain (HOLD ON)\n");
	error = run_forked_proc(epoll_poll_chain, &tcfg);
	fprintf(stdout, error == 0 ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	return 0;
}

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Pavel Pisa <pisa@cmp.felk.cvut.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 511 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 304 insertions(+), 207 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c5c424f23fd..8a23a91e137 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
 /*
- *  fs/eventpoll.c (Efficent event polling implementation)
- *  Copyright (C) 2001,...,2007	 Davide Libenzi
+ *  fs/eventpoll.c (Efficient event retrieval implementation)
+ *  Copyright (C) 2001,...,2009	 Davide Libenzi
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -92,8 +92,8 @@
 /* Epoll private bits inside the event mask */
 #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
 
-/* Maximum number of poll wake up nests we are allowing */
-#define EP_MAX_POLLWAKE_NESTS 4
+/* Maximum number of nesting allowed inside epoll sets */
+#define EP_MAX_NESTS 4
 
 /* Maximum msec timeout value storeable in a long int */
 #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
@@ -110,24 +110,21 @@ struct epoll_filefd {
 };
 
 /*
- * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
- * It is used to keep track on all tasks that are currently inside the wake_up() code
- * to 1) short-circuit the one coming from the same task and same wait queue head
- * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
- * 3) let go the ones coming from other tasks.
+ * Structure used to track possible nested calls, for too deep recursions
+ * and loop cycles.
  */
-struct wake_task_node {
+struct nested_call_node {
 	struct list_head llink;
 	struct task_struct *task;
-	wait_queue_head_t *wq;
+	void *cookie;
 };
 
 /*
- * This is used to implement the safe poll wake up avoiding to reenter
- * the poll callback from inside wake_up().
+ * This structure is used as collector for nested calls, to check for
+ * maximum recursion dept and loop cycles.
  */
-struct poll_safewake {
-	struct list_head wake_task_list;
+struct nested_calls {
+	struct list_head tasks_call_list;
 	spinlock_t lock;
 };
 
@@ -231,6 +228,12 @@ struct ep_pqueue {
 	struct epitem *epi;
 };
 
+/* Used by the ep_send_events() function as callback private data */
+struct ep_send_events_data {
+	int maxevents;
+	struct epoll_event __user *events;
+};
+
 /*
  * Configuration options available inside /proc/sys/fs/epoll/
  */
@@ -242,8 +245,11 @@ static int max_user_watches __read_mostly;
  */
 static DEFINE_MUTEX(epmutex);
 
-/* Safe wake up implementation */
-static struct poll_safewake psw;
+/* Used for safe wake up implementation */
+static struct nested_calls poll_safewake_ncalls;
+
+/* Used to call file's f_op->poll() under the nested calls boundaries */
+static struct nested_calls poll_readywalk_ncalls;
 
 /* Slab cache used to allocate "struct epitem" */
 static struct kmem_cache *epi_cache __read_mostly;
@@ -312,64 +318,96 @@ static inline int ep_op_has_event(int op)
 }
 
 /* Initialize the poll safe wake up structure */
-static void ep_poll_safewake_init(struct poll_safewake *psw)
+static void ep_nested_calls_init(struct nested_calls *ncalls)
 {
-
-	INIT_LIST_HEAD(&psw->wake_task_list);
-	spin_lock_init(&psw->lock);
+	INIT_LIST_HEAD(&ncalls->tasks_call_list);
+	spin_lock_init(&ncalls->lock);
 }
 
-/*
- * Perform a safe wake up of the poll wait list. The problem is that
- * with the new callback'd wake up system, it is possible that the
- * poll callback is reentered from inside the call to wake_up() done
- * on the poll wait queue head. The rule is that we cannot reenter the
- * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times,
- * and we cannot reenter the same wait queue head at all. This will
- * enable to have a hierarchy of epoll file descriptor of no more than
- * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock
- * because this one gets called by the poll callback, that in turn is called
- * from inside a wake_up(), that might be called from irq context.
+/**
+ * ep_call_nested - Perform a bound (possibly) nested call, by checking
+ *                  that the recursion limit is not exceeded, and that
+ *                  the same nested call (by the meaning of same cookie) is
+ *                  no re-entered.
+ *
+ * @ncalls: Pointer to the nested_calls structure to be used for this call.
+ * @max_nests: Maximum number of allowed nesting calls.
+ * @nproc: Nested call core function pointer.
+ * @priv: Opaque data to be passed to the @nproc callback.
+ * @cookie: Cookie to be used to identify this nested call.
+ *
+ * Returns: Returns the code returned by the @nproc callback, or -1 if
+ *          the maximum recursion limit has been exceeded.
  */
-static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
+static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
+			  int (*nproc)(void *, void *, int), void *priv,
+			  void *cookie)
 {
-	int wake_nests = 0;
+	int error, call_nests = 0;
 	unsigned long flags;
 	struct task_struct *this_task = current;
-	struct list_head *lsthead = &psw->wake_task_list;
-	struct wake_task_node *tncur;
-	struct wake_task_node tnode;
+	struct list_head *lsthead = &ncalls->tasks_call_list;
+	struct nested_call_node *tncur;
+	struct nested_call_node tnode;
 
-	spin_lock_irqsave(&psw->lock, flags);
+	spin_lock_irqsave(&ncalls->lock, flags);
 
-	/* Try to see if the current task is already inside this wakeup call */
+	/*
+	 * Try to see if the current task is already inside this wakeup call.
+	 * We use a list here, since the population inside this set is always
+	 * very much limited.
+	 */
 	list_for_each_entry(tncur, lsthead, llink) {
-
-		if (tncur->wq == wq ||
-		    (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
+		if (tncur->task == this_task &&
+		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
 			/*
 			 * Ops ... loop detected or maximum nest level reached.
 			 * We abort this wake by breaking the cycle itself.
 			 */
-			spin_unlock_irqrestore(&psw->lock, flags);
-			return;
+			spin_unlock_irqrestore(&ncalls->lock, flags);
+
+			return -1;
 		}
 	}
 
-	/* Add the current task to the list */
+	/* Add the current task and cookie to the list */
 	tnode.task = this_task;
-	tnode.wq = wq;
+	tnode.cookie = cookie;
 	list_add(&tnode.llink, lsthead);
 
-	spin_unlock_irqrestore(&psw->lock, flags);
+	spin_unlock_irqrestore(&ncalls->lock, flags);
 
-	/* Do really wake up now */
-	wake_up_nested(wq, 1 + wake_nests);
+	/* Call the nested function */
+	error = (*nproc)(priv, cookie, call_nests);
 
 	/* Remove the current task from the list */
-	spin_lock_irqsave(&psw->lock, flags);
+	spin_lock_irqsave(&ncalls->lock, flags);
 	list_del(&tnode.llink);
-	spin_unlock_irqrestore(&psw->lock, flags);
+	spin_unlock_irqrestore(&ncalls->lock, flags);
+
+	return error;
+}
+
+static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
+{
+	wake_up_nested((wait_queue_head_t *) cookie, 1 + call_nests);
+	return 0;
+}
+
+/*
+ * Perform a safe wake up of the poll wait list. The problem is that
+ * with the new callback'd wake up system, it is possible that the
+ * poll callback is reentered from inside the call to wake_up() done
+ * on the poll wait queue head. The rule is that we cannot reenter the
+ * wake up code from the same task more than EP_MAX_NESTS times,
+ * and we cannot reenter the same wait queue head at all. This will
+ * enable to have a hierarchy of epoll file descriptor of no more than
+ * EP_MAX_NESTS deep.
+ */
+static void ep_poll_safewake(wait_queue_head_t *wq)
+{
+	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
+		       ep_poll_wakeup_proc, NULL, wq);
 }
 
 /*
@@ -397,6 +435,104 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 	}
 }
 
+/**
+ * ep_scan_ready_list - Scans the ready list in a way that makes possible for
+ *                      the scan code, to call f_op->poll(). Also allows for
+ *                      O(NumReady) performance.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @sproc: Pointer to the scan callback.
+ * @priv: Private opaque data passed to the @sproc callback.
+ *
+ * Returns: The same integer error code returned by the @sproc callback.
+ */
+static int ep_scan_ready_list(struct eventpoll *ep,
+			      int (*sproc)(struct eventpoll *,
+					   struct list_head *, void *),
+			      void *priv)
+{
+	int error, pwake = 0;
+	unsigned long flags;
+	struct epitem *epi, *nepi;
+	struct list_head txlist;
+
+	INIT_LIST_HEAD(&txlist);
+
+	/*
+	 * We need to lock this because we could be hit by
+	 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
+	 */
+	mutex_lock(&ep->mtx);
+
+	/*
+	 * Steal the ready list, and re-init the original one to the
+	 * empty list. Also, set ep->ovflist to NULL so that events
+	 * happening while looping w/out locks, are not lost. We cannot
+	 * have the poll callback to queue directly on ep->rdllist,
+	 * because we want the "sproc" callback to be able to do it
+	 * in a lockless way.
+	 */
+	spin_lock_irqsave(&ep->lock, flags);
+	list_splice(&ep->rdllist, &txlist);
+	INIT_LIST_HEAD(&ep->rdllist);
+	ep->ovflist = NULL;
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	/*
+	 * Now call the callback function.
+	 */
+	error = (*sproc)(ep, &txlist, priv);
+
+	spin_lock_irqsave(&ep->lock, flags);
+	/*
+	 * During the time we spent inside the "sproc" callback, some
+	 * other events might have been queued by the poll callback.
+	 * We re-insert them inside the main ready-list here.
+	 */
+	for (nepi = ep->ovflist; (epi = nepi) != NULL;
+	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+		/*
+		 * We need to check if the item is already in the list.
+		 * During the "sproc" callback execution time, items are
+		 * queued into ->ovflist but the "txlist" might already
+		 * contain them, and the list_splice() below takes care of them.
+		 */
+		if (!ep_is_linked(&epi->rdllink))
+			list_add_tail(&epi->rdllink, &ep->rdllist);
+	}
+	/*
+	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
+	 * releasing the lock, events will be queued in the normal way inside
+	 * ep->rdllist.
+	 */
+	ep->ovflist = EP_UNACTIVE_PTR;
+
+	/*
+	 * Quickly re-inject items left on "txlist".
+	 */
+	list_splice(&txlist, &ep->rdllist);
+
+	if (!list_empty(&ep->rdllist)) {
+		/*
+		 * Wake up (if active) both the eventpoll wait list and the ->poll()
+		 * wait list (delayed after we release the lock).
+		 */
+		if (waitqueue_active(&ep->wq))
+			wake_up_locked(&ep->wq);
+		if (waitqueue_active(&ep->poll_wait))
+			pwake++;
+	}
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	mutex_unlock(&ep->mtx);
+
+	/* We have to call this outside the lock */
+	if (pwake)
+		ep_poll_safewake(&ep->poll_wait);
+
+	return error;
+}
+
 /*
  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources. Must be called with "mtx" held.
@@ -447,7 +583,7 @@ static void ep_free(struct eventpoll *ep)
 
 	/* We need to release all tasks waiting for these file */
 	if (waitqueue_active(&ep->poll_wait))
-		ep_poll_safewake(&psw, &ep->poll_wait);
+		ep_poll_safewake(&ep->poll_wait);
 
 	/*
 	 * We need to lock this because we could be hit by
@@ -496,22 +632,49 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv)
+{
+	struct epitem *epi, *tmp;
+
+	list_for_each_entry_safe(epi, tmp, head, rdllink) {
+		if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		    epi->event.events)
+			return POLLIN | POLLRDNORM;
+		else
+			/*
+			 * Item has been dropped into the ready list by the poll
+			 * callback, but it's not actually ready, as far as
+			 * caller requested events goes. We can remove it here.
+			 */
+			list_del_init(&epi->rdllink);
+	}
+
+	return 0;
+}
+
+static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
+{
+	return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
+}
+
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
-	unsigned int pollflags = 0;
-	unsigned long flags;
+	int pollflags;
 	struct eventpoll *ep = file->private_data;
 
 	/* Insert inside our poll wait queue */
 	poll_wait(file, &ep->poll_wait, wait);
 
-	/* Check our condition */
-	spin_lock_irqsave(&ep->lock, flags);
-	if (!list_empty(&ep->rdllist))
-		pollflags = POLLIN | POLLRDNORM;
-	spin_unlock_irqrestore(&ep->lock, flags);
+	/*
+	 * Proceed to find out if wanted events are really available inside
+	 * the ready list. This need to be done under ep_call_nested()
+	 * supervision, since the call to f_op->poll() done on listed files
+	 * could re-enter here.
+	 */
+	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
+				   ep_poll_readyevents_proc, ep, ep);
 
-	return pollflags;
+	return pollflags != -1 ? pollflags: 0;
 }
 
 /* File callbacks that implement the eventpoll file behaviour */
@@ -541,7 +704,7 @@ void eventpoll_release_file(struct file *file)
 	 * We don't want to get "file->f_lock" because it is not
 	 * necessary. It is not necessary because we're in the "struct file"
 	 * cleanup path, and this means that noone is using this file anymore.
-	 * So, for example, epoll_ctl() cannot hit here sicne if we reach this
+	 * So, for example, epoll_ctl() cannot hit here since if we reach this
 	 * point, the file counter already went to zero and fget() would fail.
 	 * The only hit might come from ep_free() but by holding the mutex
 	 * will correctly serialize the operation. We do need to acquire
@@ -670,12 +833,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	}
 
 	/* If this file is already in the ready list we exit soon */
-	if (ep_is_linked(&epi->rdllink))
-		goto is_linked;
-
-	list_add_tail(&epi->rdllink, &ep->rdllist);
+	if (!ep_is_linked(&epi->rdllink))
+		list_add_tail(&epi->rdllink, &ep->rdllist);
 
-is_linked:
 	/*
 	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 	 * wait list.
@@ -690,7 +850,7 @@ out_unlock:
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(&psw, &ep->poll_wait);
+		ep_poll_safewake(&ep->poll_wait);
 
 	return 1;
 }
@@ -712,10 +872,9 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 		add_wait_queue(whead, &pwq->wait);
 		list_add_tail(&pwq->llink, &epi->pwqlist);
 		epi->nwait++;
-	} else {
+	} else
 		/* We have to signal that an error occurred */
 		epi->nwait = -1;
-	}
 }
 
 static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
@@ -817,7 +976,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(&psw, &ep->poll_wait);
+		ep_poll_safewake(&ep->poll_wait);
 
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
 		     current, ep, tfile, fd));
@@ -891,137 +1050,74 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(&psw, &ep->poll_wait);
+		ep_poll_safewake(&ep->poll_wait);
 
 	return 0;
 }
 
-static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
-			  int maxevents)
+static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, void *priv)
 {
-	int eventcnt, error = -EFAULT, pwake = 0;
-	unsigned int revents;
-	unsigned long flags;
-	struct epitem *epi, *nepi;
-	struct list_head txlist;
-
-	INIT_LIST_HEAD(&txlist);
-
-	/*
-	 * We need to lock this because we could be hit by
-	 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
-	 */
-	mutex_lock(&ep->mtx);
-
-	/*
-	 * Steal the ready list, and re-init the original one to the
-	 * empty list. Also, set ep->ovflist to NULL so that events
-	 * happening while looping w/out locks, are not lost. We cannot
-	 * have the poll callback to queue directly on ep->rdllist,
-	 * because we are doing it in the loop below, in a lockless way.
-	 */
-	spin_lock_irqsave(&ep->lock, flags);
-	list_splice(&ep->rdllist, &txlist);
-	INIT_LIST_HEAD(&ep->rdllist);
-	ep->ovflist = NULL;
-	spin_unlock_irqrestore(&ep->lock, flags);
+	struct ep_send_events_data *esed = priv;
+	int eventcnt;
+  	unsigned int revents;
+	struct epitem *epi;
+	struct epoll_event __user *uevent;
 
-	/*
-	 * We can loop without lock because this is a task private list.
-	 * We just splice'd out the ep->rdllist in ep_collect_ready_items().
-	 * Items cannot vanish during the loop because we are holding "mtx".
-	 */
-	for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) {
-		epi = list_first_entry(&txlist, struct epitem, rdllink);
+  	/*
+	 * We can loop without lock because we are passed a task private list.
+	 * Items cannot vanish during the loop because ep_scan_ready_list() is
+	 * holding "mtx" during this call.
+  	 */
+	for (eventcnt = 0, uevent = esed->events;
+	     !list_empty(head) && eventcnt < esed->maxevents;) {
+		epi = list_first_entry(head, struct epitem, rdllink);
 
 		list_del_init(&epi->rdllink);
 
-		/*
-		 * Get the ready file event set. We can safely use the file
-		 * because we are holding the "mtx" and this will guarantee
-		 * that both the file and the item will not vanish.
-		 */
-		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
-		revents &= epi->event.events;
-
-		/*
-		 * Is the event mask intersect the caller-requested one,
-		 * deliver the event to userspace. Again, we are holding
-		 * "mtx", so no operations coming from userspace can change
-		 * the item.
-		 */
-		if (revents) {
-			if (__put_user(revents,
-				       &events[eventcnt].events) ||
-			    __put_user(epi->event.data,
-				       &events[eventcnt].data))
-				goto errxit;
-			if (epi->event.events & EPOLLONESHOT)
-				epi->event.events &= EP_PRIVATE_BITS;
-			eventcnt++;
-		}
-		/*
-		 * At this point, noone can insert into ep->rdllist besides
-		 * us. The epoll_ctl() callers are locked out by us holding
-		 * "mtx" and the poll callback will queue them in ep->ovflist.
-		 */
-		if (!(epi->event.events & EPOLLET) &&
-		    (revents & epi->event.events))
-			list_add_tail(&epi->rdllink, &ep->rdllist);
-	}
-	error = 0;
-
-errxit:
-
-	spin_lock_irqsave(&ep->lock, flags);
-	/*
-	 * During the time we spent in the loop above, some other events
-	 * might have been queued by the poll callback. We re-insert them
-	 * inside the main ready-list here.
-	 */
-	for (nepi = ep->ovflist; (epi = nepi) != NULL;
-	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
-		/*
-		 * If the above loop quit with errors, the epoll item might still
-		 * be linked to "txlist", and the list_splice() done below will
-		 * take care of those cases.
-		 */
-		if (!ep_is_linked(&epi->rdllink))
-			list_add_tail(&epi->rdllink, &ep->rdllist);
-	}
-	/*
-	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
-	 * releasing the lock, events will be queued in the normal way inside
-	 * ep->rdllist.
-	 */
-	ep->ovflist = EP_UNACTIVE_PTR;
-
-	/*
-	 * In case of error in the event-send loop, or in case the number of
-	 * ready events exceeds the userspace limit, we need to splice the
-	 * "txlist" back inside ep->rdllist.
-	 */
-	list_splice(&txlist, &ep->rdllist);
-
-	if (!list_empty(&ep->rdllist)) {
-		/*
-		 * Wake up (if active) both the eventpoll wait list and the ->poll()
-		 * wait list (delayed after we release the lock).
-		 */
-		if (waitqueue_active(&ep->wq))
-			wake_up_locked(&ep->wq);
-		if (waitqueue_active(&ep->poll_wait))
-			pwake++;
-	}
-	spin_unlock_irqrestore(&ep->lock, flags);
+  		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+  			epi->event.events;
+
+  		/*
+		 * If the event mask intersect the caller-requested one,
+		 * deliver the event to userspace. Again, ep_scan_ready_list()
+		 * is holding "mtx", so no operations coming from userspace
+		 * can change the item.
+  		 */
+  		if (revents) {
+			if (__put_user(revents, &uevent->events) ||
+			    __put_user(epi->event.data, &uevent->data))
+				return eventcnt ? eventcnt: -EFAULT;
+  			eventcnt++;
+			uevent++;
+  			if (epi->event.events & EPOLLONESHOT)
+  				epi->event.events &= EP_PRIVATE_BITS;
+  			else if (!(epi->event.events & EPOLLET))
+  				/*
+				 * If this file has been added with Level Trigger
+				 * mode, we need to insert back inside the ready
+				 * list, so that the next call to epoll_wait()
+				 * will check again the events availability.
+  				 * At this point, noone can insert into ep->rdllist
+  				 * besides us. The epoll_ctl() callers are locked
+				 * out by ep_scan_ready_list() holding "mtx" and
+				 * the poll callback will queue them in ep->ovflist.
+  				 */
+  				list_add_tail(&epi->rdllink, &ep->rdllist);
+  		}
+  	}
+
+	return eventcnt;
+}
 
-	mutex_unlock(&ep->mtx);
+static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
+			  int maxevents)
+{
+	struct ep_send_events_data esed;
 
-	/* We have to call this outside the lock */
-	if (pwake)
-		ep_poll_safewake(&psw, &ep->poll_wait);
+	esed.maxevents = maxevents;
+	esed.events = events;
 
-	return eventcnt == 0 ? error: eventcnt;
+	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
 }
 
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
@@ -1033,7 +1129,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	wait_queue_t wait;
 
 	/*
-	 * Calculate the timeout by checking for the "infinite" value ( -1 )
+	 * Calculate the timeout by checking for the "infinite" value (-1)
 	 * and the overflow condition. The passed timeout is in milliseconds,
 	 * that why (t * HZ) / 1000.
 	 */
@@ -1076,9 +1172,8 @@ retry:
 
 		set_current_state(TASK_RUNNING);
 	}
-
 	/* Is it worth to try to dig for events ? */
-	eavail = !list_empty(&ep->rdllist);
+	eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
 
 	spin_unlock_irqrestore(&ep->lock, flags);
 
@@ -1099,41 +1194,40 @@ retry:
  */
 SYSCALL_DEFINE1(epoll_create1, int, flags)
 {
-	int error, fd = -1;
-	struct eventpoll *ep;
+	int error;
+	struct eventpoll *ep = NULL;
 
 	/* Check the EPOLL_* constant for consistency.  */
 	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
 
-	if (flags & ~EPOLL_CLOEXEC)
-		return -EINVAL;
-
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
 		     current, flags));
 
+	error = -EINVAL;
+	if (flags & ~EPOLL_CLOEXEC)
+		goto error_return;
+
 	/*
-	 * Create the internal data structure ( "struct eventpoll" ).
+	 * Create the internal data structure ("struct eventpoll").
 	 */
 	error = ep_alloc(&ep);
-	if (error < 0) {
-		fd = error;
+	if (error < 0)
 		goto error_return;
-	}
 
 	/*
 	 * Creates all the items needed to setup an eventpoll file. That is,
 	 * a file structure and a free file descriptor.
 	 */
-	fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
-			      flags & O_CLOEXEC);
-	if (fd < 0)
+	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+				 flags & O_CLOEXEC);
+	if (error < 0)
 		ep_free(ep);
 
 error_return:
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-		     current, flags, fd));
+		     current, flags, error));
 
-	return fd;
+	return error;
 }
 
 SYSCALL_DEFINE1(epoll_create, int, size)
@@ -1359,7 +1453,10 @@ static int __init eventpoll_init(void)
 		EP_ITEM_COST;
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
-	ep_poll_safewake_init(&psw);
+	ep_nested_calls_init(&poll_safewake_ncalls);
+
+	/* Initialize the structure used to perform file's f_op->poll() calls */
+	ep_nested_calls_init(&poll_readywalk_ncalls);
 
 	/* Allocates slab cache used to allocate "struct epitem" items */
 	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
-- 
cgit v1.2.3


From 296e236e96dddef351a1809c0d414bcddfcf3800 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:11 -0700
Subject: epoll: fix epoll's own poll (update)

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Pavel Pisa <pisa@cmp.felk.cvut.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 110 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 57 insertions(+), 53 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8a23a91e137..e24d137c93b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -454,9 +454,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	int error, pwake = 0;
 	unsigned long flags;
 	struct epitem *epi, *nepi;
-	struct list_head txlist;
-
-	INIT_LIST_HEAD(&txlist);
+	LIST_HEAD(txlist);
 
 	/*
 	 * We need to lock this because we could be hit by
@@ -473,8 +471,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	 * in a lockless way.
 	 */
 	spin_lock_irqsave(&ep->lock, flags);
-	list_splice(&ep->rdllist, &txlist);
-	INIT_LIST_HEAD(&ep->rdllist);
+	list_splice_init(&ep->rdllist, &txlist);
 	ep->ovflist = NULL;
 	spin_unlock_irqrestore(&ep->lock, flags);
 
@@ -514,8 +511,8 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 
 	if (!list_empty(&ep->rdllist)) {
 		/*
-		 * Wake up (if active) both the eventpoll wait list and the ->poll()
-		 * wait list (delayed after we release the lock).
+		 * Wake up (if active) both the eventpoll wait list and
+		 * the ->poll() wait list (delayed after we release the lock).
 		 */
 		if (waitqueue_active(&ep->wq))
 			wake_up_locked(&ep->wq);
@@ -632,7 +629,8 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv)
+static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+			       void *priv)
 {
 	struct epitem *epi, *tmp;
 
@@ -640,13 +638,14 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, voi
 		if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
 		    epi->event.events)
 			return POLLIN | POLLRDNORM;
-		else
+		else {
 			/*
 			 * Item has been dropped into the ready list by the poll
 			 * callback, but it's not actually ready, as far as
 			 * caller requested events goes. We can remove it here.
 			 */
 			list_del_init(&epi->rdllink);
+		}
 	}
 
 	return 0;
@@ -674,7 +673,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
 				   ep_poll_readyevents_proc, ep, ep);
 
-	return pollflags != -1 ? pollflags: 0;
+	return pollflags != -1 ? pollflags : 0;
 }
 
 /* File callbacks that implement the eventpoll file behaviour */
@@ -872,9 +871,10 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 		add_wait_queue(whead, &pwq->wait);
 		list_add_tail(&pwq->llink, &epi->pwqlist);
 		epi->nwait++;
-	} else
+	} else {
 		/* We have to signal that an error occurred */
 		epi->nwait = -1;
+	}
 }
 
 static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
@@ -1055,62 +1055,65 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	return 0;
 }
 
-static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, void *priv)
+static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
+			       void *priv)
 {
 	struct ep_send_events_data *esed = priv;
 	int eventcnt;
-  	unsigned int revents;
+	unsigned int revents;
 	struct epitem *epi;
 	struct epoll_event __user *uevent;
 
-  	/*
+	/*
 	 * We can loop without lock because we are passed a task private list.
 	 * Items cannot vanish during the loop because ep_scan_ready_list() is
 	 * holding "mtx" during this call.
-  	 */
+	 */
 	for (eventcnt = 0, uevent = esed->events;
 	     !list_empty(head) && eventcnt < esed->maxevents;) {
 		epi = list_first_entry(head, struct epitem, rdllink);
 
 		list_del_init(&epi->rdllink);
 
-  		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
-  			epi->event.events;
+		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+			epi->event.events;
 
-  		/*
+		/*
 		 * If the event mask intersect the caller-requested one,
 		 * deliver the event to userspace. Again, ep_scan_ready_list()
 		 * is holding "mtx", so no operations coming from userspace
 		 * can change the item.
-  		 */
-  		if (revents) {
+		 */
+		if (revents) {
 			if (__put_user(revents, &uevent->events) ||
 			    __put_user(epi->event.data, &uevent->data))
-				return eventcnt ? eventcnt: -EFAULT;
-  			eventcnt++;
+				return eventcnt ? eventcnt : -EFAULT;
+			eventcnt++;
 			uevent++;
-  			if (epi->event.events & EPOLLONESHOT)
-  				epi->event.events &= EP_PRIVATE_BITS;
-  			else if (!(epi->event.events & EPOLLET))
-  				/*
-				 * If this file has been added with Level Trigger
-				 * mode, we need to insert back inside the ready
-				 * list, so that the next call to epoll_wait()
-				 * will check again the events availability.
-  				 * At this point, noone can insert into ep->rdllist
-  				 * besides us. The epoll_ctl() callers are locked
-				 * out by ep_scan_ready_list() holding "mtx" and
-				 * the poll callback will queue them in ep->ovflist.
-  				 */
-  				list_add_tail(&epi->rdllink, &ep->rdllist);
-  		}
-  	}
+			if (epi->event.events & EPOLLONESHOT)
+				epi->event.events &= EP_PRIVATE_BITS;
+			else if (!(epi->event.events & EPOLLET)) {
+				/*
+				 * If this file has been added with Level
+				 * Trigger mode, we need to insert back inside
+				 * the ready list, so that the next call to
+				 * epoll_wait() will check again the events
+				 * availability. At this point, noone can insert
+				 * into ep->rdllist besides us. The epoll_ctl()
+				 * callers are locked out by
+				 * ep_scan_ready_list() holding "mtx" and the
+				 * poll callback will queue them in ep->ovflist.
+				 */
+				list_add_tail(&epi->rdllink, &ep->rdllist);
+			}
+		}
+	}
 
 	return eventcnt;
 }
 
-static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
-			  int maxevents)
+static int ep_send_events(struct eventpoll *ep,
+			  struct epoll_event __user *events, int maxevents)
 {
 	struct ep_send_events_data esed;
 
@@ -1194,40 +1197,41 @@ retry:
  */
 SYSCALL_DEFINE1(epoll_create1, int, flags)
 {
-	int error;
-	struct eventpoll *ep = NULL;
+	int error, fd = -1;
+	struct eventpoll *ep;
 
 	/* Check the EPOLL_* constant for consistency.  */
 	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
 
+	if (flags & ~EPOLL_CLOEXEC)
+		return -EINVAL;
+
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
 		     current, flags));
 
-	error = -EINVAL;
-	if (flags & ~EPOLL_CLOEXEC)
-		goto error_return;
-
 	/*
-	 * Create the internal data structure ("struct eventpoll").
+	 * Create the internal data structure ( "struct eventpoll" ).
 	 */
 	error = ep_alloc(&ep);
-	if (error < 0)
+	if (error < 0) {
+		fd = error;
 		goto error_return;
+	}
 
 	/*
 	 * Creates all the items needed to setup an eventpoll file. That is,
 	 * a file structure and a free file descriptor.
 	 */
-	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
-				 flags & O_CLOEXEC);
-	if (error < 0)
+	fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+			      flags & O_CLOEXEC);
+	if (fd < 0)
 		ep_free(ep);
 
 error_return:
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-		     current, flags, error));
+		     current, flags, fd));
 
-	return error;
+	return fd;
 }
 
 SYSCALL_DEFINE1(epoll_create, int, size)
-- 
cgit v1.2.3


From bb57c3edcd2fc51d95914c39448f36e43af9d6af Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:12 -0700
Subject: epoll: remove debugging code

Remove debugging code from epoll.  There's no need for it to be included
into mainline code.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 78 +++++++++-------------------------------------------------
 1 file changed, 11 insertions(+), 67 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index e24d137c93b..205a1e1c77c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -71,24 +71,6 @@
  * a better scalability.
  */
 
-#define DEBUG_EPOLL 0
-
-#if DEBUG_EPOLL > 0
-#define DPRINTK(x) printk x
-#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
-#else /* #if DEBUG_EPOLL > 0 */
-#define DPRINTK(x) (void) 0
-#define DNPRINTK(n, x) (void) 0
-#endif /* #if DEBUG_EPOLL > 0 */
-
-#define DEBUG_EPI 0
-
-#if DEBUG_EPI != 0
-#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
-#else /* #if DEBUG_EPI != 0 */
-#define EPI_SLAB_DEBUG 0
-#endif /* #if DEBUG_EPI != 0 */
-
 /* Epoll private bits inside the event mask */
 #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
 
@@ -567,9 +549,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 	atomic_dec(&ep->user->epoll_watches);
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
-		     current, ep, file));
-
 	return 0;
 }
 
@@ -625,7 +604,6 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	if (ep)
 		ep_free(ep);
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
 	return 0;
 }
 
@@ -750,8 +728,6 @@ static int ep_alloc(struct eventpoll **pep)
 
 	*pep = ep;
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
-		     current, ep));
 	return 0;
 
 free_uid:
@@ -785,9 +761,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 		}
 	}
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
-		     current, file, epir));
-
 	return epir;
 }
 
@@ -803,9 +776,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	struct epitem *epi = ep_item_from_wait(wait);
 	struct eventpoll *ep = epi->ep;
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
-		     current, epi->ffd.file, epi, ep));
-
 	spin_lock_irqsave(&ep->lock, flags);
 
 	/*
@@ -978,9 +948,6 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	if (pwake)
 		ep_poll_safewake(&ep->poll_wait);
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
-		     current, ep, tfile, fd));
-
 	return 0;
 
 error_unregister:
@@ -1197,41 +1164,30 @@ retry:
  */
 SYSCALL_DEFINE1(epoll_create1, int, flags)
 {
-	int error, fd = -1;
-	struct eventpoll *ep;
+	int error;
+	struct eventpoll *ep = NULL;
 
 	/* Check the EPOLL_* constant for consistency.  */
 	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
 
 	if (flags & ~EPOLL_CLOEXEC)
 		return -EINVAL;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
-		     current, flags));
-
 	/*
-	 * Create the internal data structure ( "struct eventpoll" ).
+	 * Create the internal data structure ("struct eventpoll").
 	 */
 	error = ep_alloc(&ep);
-	if (error < 0) {
-		fd = error;
-		goto error_return;
-	}
-
+	if (error < 0)
+		return error;
 	/*
 	 * Creates all the items needed to setup an eventpoll file. That is,
 	 * a file structure and a free file descriptor.
 	 */
-	fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
-			      flags & O_CLOEXEC);
-	if (fd < 0)
+	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+				 flags & O_CLOEXEC);
+	if (error < 0)
 		ep_free(ep);
 
-error_return:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-		     current, flags, fd));
-
-	return fd;
+	return error;
 }
 
 SYSCALL_DEFINE1(epoll_create, int, size)
@@ -1256,9 +1212,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	struct epitem *epi;
 	struct epoll_event epds;
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
-		     current, epfd, op, fd, event));
-
 	error = -EFAULT;
 	if (ep_op_has_event(op) &&
 	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
@@ -1335,8 +1288,6 @@ error_tgt_fput:
 error_fput:
 	fput(file);
 error_return:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
-		     current, epfd, op, fd, event, error));
 
 	return error;
 }
@@ -1352,9 +1303,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 	struct file *file;
 	struct eventpoll *ep;
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
-		     current, epfd, events, maxevents, timeout));
-
 	/* The maximum number of event must be greater than zero */
 	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
 		return -EINVAL;
@@ -1391,8 +1339,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 error_fput:
 	fput(file);
 error_return:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
-		     current, epfd, events, maxevents, timeout, error));
 
 	return error;
 }
@@ -1464,13 +1410,11 @@ static int __init eventpoll_init(void)
 
 	/* Allocates slab cache used to allocate "struct epitem" items */
 	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
-			0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
-			NULL);
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 
 	/* Allocates slab cache used to allocate "struct eppoll_entry" */
 	pwq_cache = kmem_cache_create("eventpoll_pwq",
-			sizeof(struct eppoll_entry), 0,
-			EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
+			sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
 
 	return 0;
 }
-- 
cgit v1.2.3


From abff55cee1039b5a3b96f7a5eb6e65b9f247a274 Mon Sep 17 00:00:00 2001
From: Tony Battersby <tonyb@cybernetics.com>
Date: Tue, 31 Mar 2009 15:24:13 -0700
Subject: epoll: don't use current in irq context

ep_call_nested() (formerly ep_poll_safewake()) uses "current" (without
dereferencing it) to detect callback recursion, but it may be called from
irq context where the use of current is generally discouraged.  It would
be better to use get_cpu() and put_cpu() to detect the callback recursion.

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 205a1e1c77c..db4365f8a75 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -97,8 +97,8 @@ struct epoll_filefd {
  */
 struct nested_call_node {
 	struct list_head llink;
-	struct task_struct *task;
 	void *cookie;
+	int cpu;
 };
 
 /*
@@ -327,7 +327,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 {
 	int error, call_nests = 0;
 	unsigned long flags;
-	struct task_struct *this_task = current;
+	int this_cpu = get_cpu();
 	struct list_head *lsthead = &ncalls->tasks_call_list;
 	struct nested_call_node *tncur;
 	struct nested_call_node tnode;
@@ -340,20 +340,19 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	 * very much limited.
 	 */
 	list_for_each_entry(tncur, lsthead, llink) {
-		if (tncur->task == this_task &&
+		if (tncur->cpu == this_cpu &&
 		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
 			/*
 			 * Ops ... loop detected or maximum nest level reached.
 			 * We abort this wake by breaking the cycle itself.
 			 */
-			spin_unlock_irqrestore(&ncalls->lock, flags);
-
-			return -1;
+			error = -1;
+			goto out_unlock;
 		}
 	}
 
 	/* Add the current task and cookie to the list */
-	tnode.task = this_task;
+	tnode.cpu = this_cpu;
 	tnode.cookie = cookie;
 	list_add(&tnode.llink, lsthead);
 
@@ -365,8 +364,10 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	/* Remove the current task from the list */
 	spin_lock_irqsave(&ncalls->lock, flags);
 	list_del(&tnode.llink);
+ out_unlock:
 	spin_unlock_irqrestore(&ncalls->lock, flags);
 
+	put_cpu();
 	return error;
 }
 
-- 
cgit v1.2.3


From d0305882825784e74f68a56eee6c3a812a99f235 Mon Sep 17 00:00:00 2001
From: Tony Battersby <tonyb@cybernetics.com>
Date: Tue, 31 Mar 2009 15:24:14 -0700
Subject: epoll: remember the event if epoll_wait returns -EFAULT

If epoll_wait returns -EFAULT, the event that was being returned when the
fault was encountered will be forgotten.  This is not a big deal since
EFAULT will happen only if a buggy userspace program passes in a bad
address, in which case what happens later usually doesn't matter.
However, it is easy to remember the event for later, and this patch makes
a simple change to do that.

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index db4365f8a75..c806a0c4383 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1054,8 +1054,10 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 		 */
 		if (revents) {
 			if (__put_user(revents, &uevent->events) ||
-			    __put_user(epi->event.data, &uevent->data))
+			    __put_user(epi->event.data, &uevent->data)) {
+				list_add(&epi->rdllink, head);
 				return eventcnt ? eventcnt : -EFAULT;
+			}
 			eventcnt++;
 			uevent++;
 			if (epi->event.events & EPOLLONESHOT)
-- 
cgit v1.2.3


From d1bc90dd5d037079f96b3327f943eb6ae8ef7491 Mon Sep 17 00:00:00 2001
From: Tony Battersby <tonyb@cybernetics.com>
Date: Tue, 31 Mar 2009 15:24:15 -0700
Subject: epoll: remove unnecessary xchg

xchg in ep_unregister_pollwait() is unnecessary because it is protected by
either epmutex or ep->mtx (the same protection as ep_remove()).

If xchg was necessary, it would be insufficient to protect against
problems: if multiple concurrent calls to ep_unregister_pollwait() were
possible then a second caller that returns without doing anything because
nwait == 0 could return before the waitqueues are removed by the first
caller, which looks like it could lead to problematic races with
ep_poll_callback().

So remove xchg and add comments about the locking.

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c806a0c4383..64c55037c13 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -394,27 +394,21 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
 }
 
 /*
- * This function unregister poll callbacks from the associated file descriptor.
- * Since this must be called without holding "ep->lock" the atomic exchange trick
- * will protect us from multiple unregister.
+ * This function unregisters poll callbacks from the associated file
+ * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
+ * ep_free).
  */
 static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 {
-	int nwait;
 	struct list_head *lsthead = &epi->pwqlist;
 	struct eppoll_entry *pwq;
 
-	/* This is called without locks, so we need the atomic exchange */
-	nwait = xchg(&epi->nwait, 0);
-
-	if (nwait) {
-		while (!list_empty(lsthead)) {
-			pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
+	while (!list_empty(lsthead)) {
+		pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
 
-			list_del_init(&pwq->llink);
-			remove_wait_queue(pwq->whead, &pwq->wait);
-			kmem_cache_free(pwq_cache, pwq);
-		}
+		list_del(&pwq->llink);
+		remove_wait_queue(pwq->whead, &pwq->wait);
+		kmem_cache_free(pwq_cache, pwq);
 	}
 }
 
-- 
cgit v1.2.3


From e057e15ff66a620eda4c407486cbb8f8fbb7d878 Mon Sep 17 00:00:00 2001
From: Tony Battersby <tonyb@cybernetics.com>
Date: Tue, 31 Mar 2009 15:24:15 -0700
Subject: epoll: clean up ep_modify

ep_modify() doesn't need to set event.data from within the ep->lock
spinlock as the comment suggests.  The only place event.data is used is
ep_send_events_proc(), and this is protected by ep->mtx instead of
ep->lock.  Also update the comment for mutex_lock() at the top of
ep_scan_ready_list(), which mentions epoll_ctl(EPOLL_CTL_DEL) but not
epoll_ctl(EPOLL_CTL_MOD).

ep_modify() can also use spin_lock_irq() instead of spin_lock_irqsave().

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 64c55037c13..744377c0186 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -435,7 +435,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 
 	/*
 	 * We need to lock this because we could be hit by
-	 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
+	 * eventpoll_release_file() and epoll_ctl().
 	 */
 	mutex_lock(&ep->mtx);
 
@@ -972,15 +972,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 {
 	int pwake = 0;
 	unsigned int revents;
-	unsigned long flags;
 
 	/*
-	 * Set the new event interest mask before calling f_op->poll(), otherwise
-	 * a potential race might occur. In fact if we do this operation inside
-	 * the lock, an event might happen between the f_op->poll() call and the
-	 * new event set registering.
+	 * Set the new event interest mask before calling f_op->poll();
+	 * otherwise we might miss an event that happens between the
+	 * f_op->poll() call and the new event set registering.
 	 */
 	epi->event.events = event->events;
+	epi->event.data = event->data; /* protected by mtx */
 
 	/*
 	 * Get current event bits. We can safely use the file* here because
@@ -988,16 +987,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 */
 	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
 
-	spin_lock_irqsave(&ep->lock, flags);
-
-	/* Copy the data member from inside the lock */
-	epi->event.data = event->data;
-
 	/*
 	 * If the item is "hot" and it is not registered inside the ready
 	 * list, push it inside.
 	 */
 	if (revents & event->events) {
+		spin_lock_irq(&ep->lock);
 		if (!ep_is_linked(&epi->rdllink)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
 
@@ -1007,8 +1002,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 			if (waitqueue_active(&ep->poll_wait))
 				pwake++;
 		}
+		spin_unlock_irq(&ep->lock);
 	}
-	spin_unlock_irqrestore(&ep->lock, flags);
 
 	/* We have to call this outside the lock */
 	if (pwake)
-- 
cgit v1.2.3


From 4f0989dbfa8d18dd17c32120aac1eb3e906a62a2 Mon Sep 17 00:00:00 2001
From: Tony Battersby <tonyb@cybernetics.com>
Date: Tue, 31 Mar 2009 15:24:16 -0700
Subject: epoll: use real type instead of void *

eventpoll.c uses void * in one place for no obvious reason; change it to
use the real type instead.

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 744377c0186..336fdb8ed47 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -192,7 +192,7 @@ struct eppoll_entry {
 	struct list_head llink;
 
 	/* The "base" pointer is set to the container "struct epitem" */
-	void *base;
+	struct epitem *base;
 
 	/*
 	 * Wait queue item that will be linked to the target file wait
-- 
cgit v1.2.3


From bcd0b235bf3808dec5115c381cd55568f63b85f0 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:18 -0700
Subject: eventfd: improve support for semaphore-like behavior

People started using eventfd in a semaphore-like way where before they
were using pipes.

That is, counter-based resource access.  Where a "wait()" returns
immediately by decrementing the counter by one, if counter is greater than
zero.  Otherwise will wait.  And where a "post(count)" will add count to
the counter releasing the appropriate amount of waiters.  If eventfd the
"post" (write) part is fine, while the "wait" (read) does not dequeue 1,
but the whole counter value.

The problem with eventfd is that a read() on the fd returns and wipes the
whole counter, making the use of it as semaphore a little bit more
cumbersome.  You can do a read() followed by a write() of COUNTER-1, but
IMO it's pretty easy and cheap to make this work w/out extra steps.  This
patch introduces a new eventfd flag that tells eventfd to only dequeue 1
from the counter, allowing simple read/write to make it behave like a
semaphore.  Simple test here:

http://www.xmailserver.org/eventfd-sem.c

To be back-compatible with earlier kernels, userspace applications should
probe for the availability of this feature via

#ifdef EFD_SEMAPHORE
	fd = eventfd2 (CNT, EFD_SEMAPHORE);
	if (fd == -1 && errno == EINVAL)
		<fallback>
#else
		<fallback>
#endif

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: <linux-api@vger.kernel.org>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventfd.c            | 20 +++++++++++---------
 include/linux/eventfd.h | 12 +++++++++++-
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 5de2c2db3aa..91c0829a703 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -28,6 +28,7 @@ struct eventfd_ctx {
 	 * issue a wakeup.
 	 */
 	__u64 count;
+	unsigned int flags;
 };
 
 /*
@@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
 {
 	struct eventfd_ctx *ctx = file->private_data;
 	ssize_t res;
-	__u64 ucnt;
+	__u64 ucnt = 0;
 	DECLARE_WAITQUEUE(wait, current);
 
 	if (count < sizeof(ucnt))
 		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
 	res = -EAGAIN;
-	ucnt = ctx->count;
-	if (ucnt > 0)
+	if (ctx->count > 0)
 		res = sizeof(ucnt);
 	else if (!(file->f_flags & O_NONBLOCK)) {
 		__add_wait_queue(&ctx->wqh, &wait);
 		for (res = 0;;) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (ctx->count > 0) {
-				ucnt = ctx->count;
 				res = sizeof(ucnt);
 				break;
 			}
@@ -117,8 +116,9 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
 		__remove_wait_queue(&ctx->wqh, &wait);
 		__set_current_state(TASK_RUNNING);
 	}
-	if (res > 0) {
-		ctx->count = 0;
+	if (likely(res > 0)) {
+		ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+		ctx->count -= ucnt;
 		if (waitqueue_active(&ctx->wqh))
 			wake_up_locked(&ctx->wqh);
 	}
@@ -166,7 +166,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 		__remove_wait_queue(&ctx->wqh, &wait);
 		__set_current_state(TASK_RUNNING);
 	}
-	if (res > 0) {
+	if (likely(res > 0)) {
 		ctx->count += ucnt;
 		if (waitqueue_active(&ctx->wqh))
 			wake_up_locked(&ctx->wqh);
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
 
-	if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK))
+	if (flags & ~EFD_FLAGS_SET)
 		return -EINVAL;
 
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
@@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
+	ctx->flags = flags;
 
 	/*
 	 * When we call this, the initialization must be complete, since
 	 * anon_inode_getfd() will install the fd.
 	 */
 	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
-			      flags & (O_CLOEXEC | O_NONBLOCK));
+			      flags & EFD_SHARED_FCNTL_FLAGS);
 	if (fd < 0)
 		kfree(ctx);
 	return fd;
@@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
 {
 	return sys_eventfd2(count, 0);
 }
+
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index a667637b54e..f45a8ae5f82 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -13,10 +13,20 @@
 /* For O_CLOEXEC and O_NONBLOCK */
 #include <linux/fcntl.h>
 
-/* Flags for eventfd2.  */
+/*
+ * CAREFUL: Check include/asm-generic/fcntl.h when defining
+ * new flags, since they might collide with O_* ones. We want
+ * to re-use O_* flags that couldn't possibly have a meaning
+ * from eventfd, in order to leave a free define-space for
+ * shared O_* flags.
+ */
+#define EFD_SEMAPHORE (1 << 0)
 #define EFD_CLOEXEC O_CLOEXEC
 #define EFD_NONBLOCK O_NONBLOCK
 
+#define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
+#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
+
 struct file *eventfd_fget(int fd);
 int eventfd_signal(struct file *file, int n);
 
-- 
cgit v1.2.3


From 4ede816ac36e027db5fe0051ad9c73f76db63772 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:20 -0700
Subject: epoll keyed wakeups: add __wake_up_locked_key() and
 __wake_up_sync_key()

This patchset introduces wakeup hints for some of the most popular (from
epoll POV) devices, so that epoll code can avoid spurious wakeups on its
waiters.

The problem with epoll is that the callback-based wakeups do not, ATM,
carry any information about the events the wakeup is related to.  So the
only choice epoll has (not being able to call f_op->poll() from inside the
callback), is to add the file* to a ready-list and resolve the real events
later on, at epoll_wait() (or its own f_op->poll()) time.  This can cause
spurious wakeups, since the wake_up() itself might be for an event the
caller is not interested into.

The rate of these spurious wakeup can be pretty high in case of many
network sockets being monitored.

By allowing devices to report the events the wakeups refer to (at least
the two major classes - POLLIN/POLLOUT), we are able to spare useless
wakeups by proper handling inside the epoll's poll callback.

Epoll will have in any case to call f_op->poll() on the file* later on,
since the change to be done in order to have the full event set sent via
wakeup, is too invasive for the way our f_op->poll() system works (the
full event set is calculated inside the poll function - there are too many
of them to even start thinking the change - also poll/select would need
change too).

Epoll is changed in a way that both devices which send event hints, and
the ones that don't, are correctly handled.  The former will gain some
efficiency though.

As a general rule for devices, would be to add an event mask by using
key-aware wakeup macros, when making up poll wait queues.  I tested it
(together with the epoll's poll fix patch Andrew has in -mm) and wakeups
for the supported devices are correctly filtered.

Test program available here:

http://www.xmailserver.org/epoll_test.c

This patch:

Nothing revolutionary here.  Just using the available "key" that our
wakeup core already support.  The __wake_up_locked_key() was no brainer,
since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers
around __wake_up_common().

The __wake_up_sync() function had a body, so the choice was between
borrowing the body for __wake_up_sync_key() and calling it from
__wake_up_sync(), or make an inline and calling it from both.  I chose the
former since in most archs it all resolves to "mov $0, REG; jmp ADDR".

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/wait.h |  7 +++++--
 kernel/sched.c       | 23 +++++++++++++++++++----
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index a210ede73b5..0d2eeb03a71 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -135,8 +135,11 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
 void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode);
-extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
+			void *key);
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode);
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
 void __wake_up_bit(wait_queue_head_t *, void *, int);
 int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned);
 int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned);
diff --git a/kernel/sched.c b/kernel/sched.c
index 196d48babbe..73513f4e19d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5196,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
 	__wake_up_common(q, mode, 1, 0, NULL);
 }
 
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+	__wake_up_common(q, mode, 1, 0, key);
+}
+
 /**
- * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
  * @q: the waitqueue
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
  *
  * The sync wakeup differs that the waker knows that it will schedule
  * away soon, so while the target thread will be woken up, it will not
@@ -5209,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
  *
  * On UP it can prevent extra preemption.
  */
-void
-__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, void *key)
 {
 	unsigned long flags;
 	int sync = 1;
@@ -5222,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 		sync = 0;
 
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive, sync, NULL);
+	__wake_up_common(q, mode, nr_exclusive, sync, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+	__wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
 /**
-- 
cgit v1.2.3


From c0da37753695e010776ccf2200a5731e0f88a9f3 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:20 -0700
Subject: epoll keyed wakeups: introduce new *_poll() wakeup macros

Introduce new wakeup macros that allow passing an event mask to the wakeup
targets.  They exactly mimic their non-_poll() counterpart, with the added
event mask passing capability.  I did add only the ones currently
requested, avoiding the _nr() and _all() for the moment.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/wait.h | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 0d2eeb03a71..5d631c17eae 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -158,21 +158,17 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 #define wake_up_interruptible_all(x)	__wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
 #define wake_up_interruptible_sync(x)	__wake_up_sync((x), TASK_INTERRUPTIBLE, 1)
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
 /*
- * macro to avoid include hell
+ * Wakeup macros to be used to report events to the targets.
  */
-#define wake_up_nested(x, s)						\
-do {									\
-	unsigned long flags;						\
-									\
-	spin_lock_irqsave_nested(&(x)->lock, flags, (s));		\
-	wake_up_locked(x); 						\
-	spin_unlock_irqrestore(&(x)->lock, flags);			\
-} while (0)
-#else
-#define wake_up_nested(x, s)		wake_up(x)
-#endif
+#define wake_up_poll(x, m)				\
+	__wake_up(x, TASK_NORMAL, 1, (void *) (m))
+#define wake_up_locked_poll(x, m)				\
+	__wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
+#define wake_up_interruptible_poll(x, m)			\
+	__wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
+#define wake_up_interruptible_sync_poll(x, m)				\
+	__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))
 
 #define __wait_event(wq, condition) 					\
 do {									\
-- 
cgit v1.2.3


From 37e5540b3c9d838eb20f2ca8ea2eb8072271e403 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:21 -0700
Subject: epoll keyed wakeups: make sockets use keyed wakeups

Add support for event-aware wakeups to the sockets code.  Events are
delivered to the wakeup target, so that epoll can avoid spurious wakeups
for non-interesting events.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/core/sock.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 0620046e4eb..7dbf3ffb35c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1677,7 +1677,7 @@ static void sock_def_error_report(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible(sk->sk_sleep);
+		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
 	read_unlock(&sk->sk_callback_lock);
 }
@@ -1686,7 +1686,8 @@ static void sock_def_readable(struct sock *sk, int len)
 {
 	read_lock(&sk->sk_callback_lock);
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible_sync(sk->sk_sleep);
+		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
+						POLLRDNORM | POLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
 }
@@ -1700,7 +1701,8 @@ static void sock_def_write_space(struct sock *sk)
 	 */
 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
 		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-			wake_up_interruptible_sync(sk->sk_sleep);
+			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
+						POLLWRNORM | POLLWRBAND);
 
 		/* Should agree with poll, otherwise some programs break */
 		if (sock_writeable(sk))
-- 
cgit v1.2.3


From 2dfa4eeab0fc7e8633974f2770945311b31eedf6 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:22 -0700
Subject: epoll keyed wakeups: teach epoll about hints coming with the wakeup
 key

Use the events hint now sent by some devices, to avoid unnecessary wakeups
for events that are of no interest for the caller.  This code handles both
devices that are sending keyed events, and the ones that are not (and
event the ones that sometimes send events, and sometimes don't).

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 336fdb8ed47..a89f370fadb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -371,9 +371,28 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	return error;
 }
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
+				     unsigned long events, int subclass)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
+	wake_up_locked_poll(wqueue, events);
+	spin_unlock_irqrestore(&wqueue->lock, flags);
+}
+#else
+static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
+				     unsigned long events, int subclass)
+{
+	wake_up_poll(wqueue, events);
+}
+#endif
+
 static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
 {
-	wake_up_nested((wait_queue_head_t *) cookie, 1 + call_nests);
+	ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
+			  1 + call_nests);
 	return 0;
 }
 
@@ -782,6 +801,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	if (!(epi->event.events & ~EP_PRIVATE_BITS))
 		goto out_unlock;
 
+	/*
+	 * Check the events coming with the callback. At this stage, not
+	 * every device reports the events in the "key" parameter of the
+	 * callback. We need to be able to handle both cases here, hence the
+	 * test for "key" != NULL before the event match test.
+	 */
+	if (key && !((unsigned long) key & epi->event.events))
+		goto out_unlock;
+
 	/*
 	 * If we are trasfering events to userspace, we can hold no locks
 	 * (because we're accessing user memory, and because of linux f_op->poll()
@@ -1254,7 +1282,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	case EPOLL_CTL_ADD:
 		if (!epi) {
 			epds.events |= POLLERR | POLLHUP;
-
 			error = ep_insert(ep, &epds, tfile, fd);
 		} else
 			error = -EEXIST;
-- 
cgit v1.2.3


From 395108880efff4a4ffa1ffa554477f7f5ba6a031 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:23 -0700
Subject: epoll keyed wakeups: make eventfd use keyed wakeups

Introduce keyed event wakeups inside the eventfd code.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventfd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 91c0829a703..2a701d593d3 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -51,7 +51,7 @@ int eventfd_signal(struct file *file, int n)
 		n = (int) (ULLONG_MAX - ctx->count);
 	ctx->count += n;
 	if (waitqueue_active(&ctx->wqh))
-		wake_up_locked(&ctx->wqh);
+		wake_up_locked_poll(&ctx->wqh, POLLIN);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return n;
@@ -120,7 +120,7 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
 		ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
 		ctx->count -= ucnt;
 		if (waitqueue_active(&ctx->wqh))
-			wake_up_locked(&ctx->wqh);
+			wake_up_locked_poll(&ctx->wqh, POLLOUT);
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
 	if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
@@ -169,7 +169,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 	if (likely(res > 0)) {
 		ctx->count += ucnt;
 		if (waitqueue_active(&ctx->wqh))
-			wake_up_locked(&ctx->wqh);
+			wake_up_locked_poll(&ctx->wqh, POLLIN);
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
 
-- 
cgit v1.2.3


From 4b19449db074eec86ae31a96d3cdca4aa7f138ab Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:24 -0700
Subject: epoll keyed wakeups: make tty use keyed wakeups

Introduce keyed event wakeups inside the TTY code.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_io.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 224f271d8cb..33dac94922a 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -464,7 +464,7 @@ void tty_wakeup(struct tty_struct *tty)
 			tty_ldisc_deref(ld);
 		}
 	}
-	wake_up_interruptible(&tty->write_wait);
+	wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
 }
 
 EXPORT_SYMBOL_GPL(tty_wakeup);
@@ -587,8 +587,8 @@ static void do_tty_hangup(struct work_struct *work)
 	 * FIXME: Once we trust the LDISC code better we can wait here for
 	 * ldisc completion and fix the driver call race
 	 */
-	wake_up_interruptible(&tty->write_wait);
-	wake_up_interruptible(&tty->read_wait);
+	wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
+	wake_up_interruptible_poll(&tty->read_wait, POLLIN);
 	/*
 	 * Shutdown the current line discipline, and reset it to
 	 * N_TTY.
@@ -879,7 +879,7 @@ void stop_tty(struct tty_struct *tty)
 	if (tty->link && tty->link->packet) {
 		tty->ctrl_status &= ~TIOCPKT_START;
 		tty->ctrl_status |= TIOCPKT_STOP;
-		wake_up_interruptible(&tty->link->read_wait);
+		wake_up_interruptible_poll(&tty->link->read_wait, POLLIN);
 	}
 	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
 	if (tty->ops->stop)
@@ -913,7 +913,7 @@ void start_tty(struct tty_struct *tty)
 	if (tty->link && tty->link->packet) {
 		tty->ctrl_status &= ~TIOCPKT_STOP;
 		tty->ctrl_status |= TIOCPKT_START;
-		wake_up_interruptible(&tty->link->read_wait);
+		wake_up_interruptible_poll(&tty->link->read_wait, POLLIN);
 	}
 	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
 	if (tty->ops->start)
@@ -970,7 +970,7 @@ static ssize_t tty_read(struct file *file, char __user *buf, size_t count,
 void tty_write_unlock(struct tty_struct *tty)
 {
 	mutex_unlock(&tty->atomic_write_lock);
-	wake_up_interruptible(&tty->write_wait);
+	wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
 }
 
 int tty_write_lock(struct tty_struct *tty, int ndelay)
@@ -1623,21 +1623,21 @@ void tty_release_dev(struct file *filp)
 
 		if (tty_closing) {
 			if (waitqueue_active(&tty->read_wait)) {
-				wake_up(&tty->read_wait);
+				wake_up_poll(&tty->read_wait, POLLIN);
 				do_sleep++;
 			}
 			if (waitqueue_active(&tty->write_wait)) {
-				wake_up(&tty->write_wait);
+				wake_up_poll(&tty->write_wait, POLLOUT);
 				do_sleep++;
 			}
 		}
 		if (o_tty_closing) {
 			if (waitqueue_active(&o_tty->read_wait)) {
-				wake_up(&o_tty->read_wait);
+				wake_up_poll(&o_tty->read_wait, POLLIN);
 				do_sleep++;
 			}
 			if (waitqueue_active(&o_tty->write_wait)) {
-				wake_up(&o_tty->write_wait);
+				wake_up_poll(&o_tty->write_wait, POLLOUT);
 				do_sleep++;
 			}
 		}
-- 
cgit v1.2.3


From 2b872903c5d66bccdf4306a35e3e94d4da8555d3 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 31 Mar 2009 15:24:25 -0700
Subject: hp_accel: small documentation updates

Fix english in Documentation, add "how to test" description.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: Eric Piel <eric.piel@tremplin-utc.net>
Cc: Vladimir Botka <vbotka@suse.cz>
Cc: <Quoc.Pham@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/hwmon/lis3lv02d | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/Documentation/hwmon/lis3lv02d b/Documentation/hwmon/lis3lv02d
index 287f8c90265..effe949a728 100644
--- a/Documentation/hwmon/lis3lv02d
+++ b/Documentation/hwmon/lis3lv02d
@@ -1,11 +1,11 @@
 Kernel driver lis3lv02d
-==================
+=======================
 
 Supported chips:
 
   * STMicroelectronics LIS3LV02DL and LIS3LV02DQ
 
-Author:
+Authors:
         Yan Burman <burman.yan@gmail.com>
 	Eric Piel <eric.piel@tremplin-utc.net>
 
@@ -15,7 +15,7 @@ Description
 
 This driver provides support for the accelerometer found in various HP
 laptops sporting the feature officially called "HP Mobile Data
-Protection System 3D" or "HP 3D DriveGuard". It detect automatically
+Protection System 3D" or "HP 3D DriveGuard". It detects automatically
 laptops with this sensor. Known models (for now the HP 2133, nc6420,
 nc2510, nc8510, nc84x0, nw9440 and nx9420) will have their axis
 automatically oriented on standard way (eg: you can directly play
@@ -27,7 +27,7 @@ position - 3D position that the accelerometer reports. Format: "(x,y,z)"
 calibrate - read: values (x, y, z) that are used as the base for input
 		  class device operation.
             write: forces the base to be recalibrated with the current
-		  position.
+		   position.
 rate - reports the sampling rate of the accelerometer device in HZ
 
 This driver also provides an absolute input class device, allowing
@@ -48,7 +48,7 @@ For better compatibility between the various laptops. The values reported by
 the accelerometer are converted into a "standard" organisation of the axes
 (aka "can play neverball out of the box"):
  * When the laptop is horizontal the position reported is about 0 for X and Y
-and a positive value for Z
+	and a positive value for Z
  * If the left side is elevated, X increases (becomes positive)
  * If the front side (where the touchpad is) is elevated, Y decreases
 	(becomes negative)
@@ -59,3 +59,13 @@ email to the authors to add it to the database.  When reporting a new
 laptop, please include the output of "dmidecode" plus the value of
 /sys/devices/platform/lis3lv02d/position in these four cases.
 
+Q&A
+---
+
+Q: How do I safely simulate freefall? I have an HP "portable
+workstation" which has about 3.5kg and a plastic case, so letting it
+fall to the ground is out of question...
+
+A: The sensor is pretty sensitive, so your hands can do it. Lift it
+into free space, follow the fall with your hands for like 10
+centimeters. That should be enough to trigger the detection.
-- 
cgit v1.2.3


From be84cfc588b19f14764d78556dc7b630ee8c914c Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 31 Mar 2009 15:24:26 -0700
Subject: hp_accel: adev is poor name of exported symbol

As Andrew noted, adev is pretty poor name for symbol being exported.
Rename it to lis3.

Signed-off-by: Pavel Machek <pavel@ucw.cz>
Cc: Eric Piel <eric.piel@tremplin-utc.net>
Cc: Vladimir Botka <vbotka@suse.cz>
Cc: <Quoc.Pham@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hwmon/hp_accel.c  |  48 ++++++-------
 drivers/hwmon/lis3lv02d.c | 172 ++++++++++++++++++++++++----------------------
 drivers/hwmon/lis3lv02d.h |   2 +-
 3 files changed, 116 insertions(+), 106 deletions(-)

diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c
index 29c83b5b969..ee1ed4270df 100644
--- a/drivers/hwmon/hp_accel.c
+++ b/drivers/hwmon/hp_accel.c
@@ -140,7 +140,7 @@ acpi_status lis3lv02d_acpi_write(acpi_handle handle, int reg, u8 val)
 
 static int lis3lv02d_dmi_matched(const struct dmi_system_id *dmi)
 {
-	adev.ac = *((struct axis_conversion *)dmi->driver_data);
+	lis3_dev.ac = *((struct axis_conversion *)dmi->driver_data);
 	printk(KERN_INFO DRIVER_NAME ": hardware type %s found.\n", dmi->ident);
 
 	return 1;
@@ -214,7 +214,7 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = {
 
 static void hpled_set(struct delayed_led_classdev *led_cdev, enum led_brightness value)
 {
-	acpi_handle handle = adev.device->handle;
+	acpi_handle handle = lis3_dev.device->handle;
 	unsigned long long ret; /* Not used when writing */
 	union acpi_object in_obj[1];
 	struct acpi_object_list args = { 1, in_obj };
@@ -254,7 +254,7 @@ static void lis3lv02d_enum_resources(struct acpi_device *device)
 	acpi_status status;
 
 	status = acpi_walk_resources(device->handle, METHOD_NAME__CRS,
-					lis3lv02d_get_resource, &adev.irq);
+					lis3lv02d_get_resource, &lis3_dev.irq);
 	if (ACPI_FAILURE(status))
 		printk(KERN_DEBUG DRIVER_NAME ": Error getting resources\n");
 }
@@ -263,8 +263,8 @@ static s16 lis3lv02d_read_16(acpi_handle handle, int reg)
 {
 	u8 lo, hi;
 
-	adev.read(handle, reg - 1, &lo);
-	adev.read(handle, reg, &hi);
+	lis3_dev.read(handle, reg - 1, &lo);
+	lis3_dev.read(handle, reg, &hi);
 	/* In "12 bit right justified" mode, bit 6, bit 7, bit 8 = bit 5 */
 	return (s16)((hi << 8) | lo);
 }
@@ -272,7 +272,7 @@ static s16 lis3lv02d_read_16(acpi_handle handle, int reg)
 static s16 lis3lv02d_read_8(acpi_handle handle, int reg)
 {
 	s8 lo;
-	adev.read(handle, reg, &lo);
+	lis3_dev.read(handle, reg, &lo);
 	return lo;
 }
 
@@ -283,29 +283,29 @@ static int lis3lv02d_add(struct acpi_device *device)
 	if (!device)
 		return -EINVAL;
 
-	adev.device = device;
-	adev.init = lis3lv02d_acpi_init;
-	adev.read = lis3lv02d_acpi_read;
-	adev.write = lis3lv02d_acpi_write;
+	lis3_dev.device = device;
+	lis3_dev.init = lis3lv02d_acpi_init;
+	lis3_dev.read = lis3lv02d_acpi_read;
+	lis3_dev.write = lis3lv02d_acpi_write;
 	strcpy(acpi_device_name(device), DRIVER_NAME);
 	strcpy(acpi_device_class(device), ACPI_MDPS_CLASS);
-	device->driver_data = &adev;
+	device->driver_data = &lis3_dev;
 
-	lis3lv02d_acpi_read(device->handle, WHO_AM_I, &adev.whoami);
-	switch (adev.whoami) {
+	lis3lv02d_acpi_read(device->handle, WHO_AM_I, &lis3_dev.whoami);
+	switch (lis3_dev.whoami) {
 	case LIS_DOUBLE_ID:
 		printk(KERN_INFO DRIVER_NAME ": 2-byte sensor found\n");
-		adev.read_data = lis3lv02d_read_16;
-		adev.mdps_max_val = 2048;
+		lis3_dev.read_data = lis3lv02d_read_16;
+		lis3_dev.mdps_max_val = 2048;
 		break;
 	case LIS_SINGLE_ID:
 		printk(KERN_INFO DRIVER_NAME ": 1-byte sensor found\n");
-		adev.read_data = lis3lv02d_read_8;
-		adev.mdps_max_val = 128;
+		lis3_dev.read_data = lis3lv02d_read_8;
+		lis3_dev.mdps_max_val = 128;
 		break;
 	default:
 		printk(KERN_ERR DRIVER_NAME
-			": unknown sensor type 0x%X\n", adev.whoami);
+			": unknown sensor type 0x%X\n", lis3_dev.whoami);
 		return -EINVAL;
 	}
 
@@ -313,7 +313,7 @@ static int lis3lv02d_add(struct acpi_device *device)
 	if (dmi_check_system(lis3lv02d_dmi_ids) == 0) {
 		printk(KERN_INFO DRIVER_NAME ": laptop model unknown, "
 				 "using default axes configuration\n");
-		adev.ac = lis3lv02d_axis_normal;
+		lis3_dev.ac = lis3lv02d_axis_normal;
 	}
 
 	INIT_WORK(&hpled_led.work, delayed_set_status_worker);
@@ -322,9 +322,9 @@ static int lis3lv02d_add(struct acpi_device *device)
 		return ret;
 
 	/* obtain IRQ number of our device from ACPI */
-	lis3lv02d_enum_resources(adev.device);
+	lis3lv02d_enum_resources(lis3_dev.device);
 
-	ret = lis3lv02d_init_device(&adev);
+	ret = lis3lv02d_init_device(&lis3_dev);
 	if (ret) {
 		flush_work(&hpled_led.work);
 		led_classdev_unregister(&hpled_led.led_classdev);
@@ -360,12 +360,12 @@ static int lis3lv02d_suspend(struct acpi_device *device, pm_message_t state)
 static int lis3lv02d_resume(struct acpi_device *device)
 {
 	/* put back the device in the right state (ACPI might turn it on) */
-	mutex_lock(&adev.lock);
-	if (adev.usage > 0)
+	mutex_lock(&lis3_dev.lock);
+	if (lis3_dev.usage > 0)
 		lis3lv02d_poweron(device->handle);
 	else
 		lis3lv02d_poweroff(device->handle);
-	mutex_unlock(&adev.lock);
+	mutex_unlock(&lis3_dev.lock);
 	return 0;
 }
 #else
diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c
index 8bb2158f045..4888ac5ba8c 100644
--- a/drivers/hwmon/lis3lv02d.c
+++ b/drivers/hwmon/lis3lv02d.c
@@ -53,14 +53,24 @@
  * joystick.
  */
 
-struct acpi_lis3lv02d adev = {
-	.misc_wait   = __WAIT_QUEUE_HEAD_INITIALIZER(adev.misc_wait),
+struct acpi_lis3lv02d lis3_dev = {
+	.misc_wait   = __WAIT_QUEUE_HEAD_INITIALIZER(lis3_dev.misc_wait),
 };
 
-EXPORT_SYMBOL_GPL(adev);
+EXPORT_SYMBOL_GPL(lis3_dev);
 
 static int lis3lv02d_add_fs(struct acpi_device *device);
 
+static s16 lis3lv02d_read_16(acpi_handle handle, int reg)
+{
+	u8 lo, hi;
+
+	lis3_dev.read(handle, reg, &lo);
+	lis3_dev.read(handle, reg + 1, &hi);
+	/* In "12 bit right justified" mode, bit 6, bit 7, bit 8 = bit 5 */
+	return (s16)((hi << 8) | lo);
+}
+
 /**
  * lis3lv02d_get_axis - For the given axis, give the value converted
  * @axis:      1,2,3 - can also be negative
@@ -89,25 +99,25 @@ static void lis3lv02d_get_xyz(acpi_handle handle, int *x, int *y, int *z)
 {
 	int position[3];
 
-	position[0] = adev.read_data(handle, OUTX);
-	position[1] = adev.read_data(handle, OUTY);
-	position[2] = adev.read_data(handle, OUTZ);
+	position[0] = lis3_dev.read_data(handle, OUTX);
+	position[1] = lis3_dev.read_data(handle, OUTY);
+	position[2] = lis3_dev.read_data(handle, OUTZ);
 
-	*x = lis3lv02d_get_axis(adev.ac.x, position);
-	*y = lis3lv02d_get_axis(adev.ac.y, position);
-	*z = lis3lv02d_get_axis(adev.ac.z, position);
+	*x = lis3lv02d_get_axis(lis3_dev.ac.x, position);
+	*y = lis3lv02d_get_axis(lis3_dev.ac.y, position);
+	*z = lis3lv02d_get_axis(lis3_dev.ac.z, position);
 }
 
 void lis3lv02d_poweroff(acpi_handle handle)
 {
-	adev.is_on = 0;
+	lis3_dev.is_on = 0;
 }
 EXPORT_SYMBOL_GPL(lis3lv02d_poweroff);
 
 void lis3lv02d_poweron(acpi_handle handle)
 {
-	adev.is_on = 1;
-	adev.init(handle);
+	lis3_dev.is_on = 1;
+	lis3_dev.init(handle);
 }
 EXPORT_SYMBOL_GPL(lis3lv02d_poweron);
 
@@ -147,10 +157,10 @@ static irqreturn_t lis302dl_interrupt(int irq, void *dummy)
 	 * the lid is closed. This leads to interrupts as soon as a little move
 	 * is done.
 	 */
-	atomic_inc(&adev.count);
+	atomic_inc(&lis3_dev.count);
 
-	wake_up_interruptible(&adev.misc_wait);
-	kill_fasync(&adev.async_queue, SIGIO, POLL_IN);
+	wake_up_interruptible(&lis3_dev.misc_wait);
+	kill_fasync(&lis3_dev.async_queue, SIGIO, POLL_IN);
 	return IRQ_HANDLED;
 }
 
@@ -158,10 +168,10 @@ static int lis3lv02d_misc_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
-	if (test_and_set_bit(0, &adev.misc_opened))
+	if (test_and_set_bit(0, &lis3_dev.misc_opened))
 		return -EBUSY; /* already open */
 
-	atomic_set(&adev.count, 0);
+	atomic_set(&lis3_dev.count, 0);
 
 	/*
 	 * The sensor can generate interrupts for free-fall and direction
@@ -174,25 +184,25 @@ static int lis3lv02d_misc_open(struct inode *inode, struct file *file)
 	 * io-apic is not configurable (and generates a warning) but I keep it
 	 * in case of support for other hardware.
 	 */
-	ret = request_irq(adev.irq, lis302dl_interrupt, IRQF_TRIGGER_RISING,
-			  DRIVER_NAME, &adev);
+	ret = request_irq(lis3_dev.irq, lis302dl_interrupt, IRQF_TRIGGER_RISING,
+			  DRIVER_NAME, &lis3_dev);
 
 	if (ret) {
-		clear_bit(0, &adev.misc_opened);
-		printk(KERN_ERR DRIVER_NAME ": IRQ%d allocation failed\n", adev.irq);
+		clear_bit(0, &lis3_dev.misc_opened);
+		printk(KERN_ERR DRIVER_NAME ": IRQ%d allocation failed\n", lis3_dev.irq);
 		return -EBUSY;
 	}
-	lis3lv02d_increase_use(&adev);
-	printk("lis3: registered interrupt %d\n", adev.irq);
+	lis3lv02d_increase_use(&lis3_dev);
+	printk("lis3: registered interrupt %d\n", lis3_dev.irq);
 	return 0;
 }
 
 static int lis3lv02d_misc_release(struct inode *inode, struct file *file)
 {
-	fasync_helper(-1, file, 0, &adev.async_queue);
-	lis3lv02d_decrease_use(&adev);
-	free_irq(adev.irq, &adev);
-	clear_bit(0, &adev.misc_opened); /* release the device */
+	fasync_helper(-1, file, 0, &lis3_dev.async_queue);
+	lis3lv02d_decrease_use(&lis3_dev);
+	free_irq(lis3_dev.irq, &lis3_dev);
+	clear_bit(0, &lis3_dev.misc_opened); /* release the device */
 	return 0;
 }
 
@@ -207,10 +217,10 @@ static ssize_t lis3lv02d_misc_read(struct file *file, char __user *buf,
 	if (count < 1)
 		return -EINVAL;
 
-	add_wait_queue(&adev.misc_wait, &wait);
+	add_wait_queue(&lis3_dev.misc_wait, &wait);
 	while (true) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		data = atomic_xchg(&adev.count, 0);
+		data = atomic_xchg(&lis3_dev.count, 0);
 		if (data)
 			break;
 
@@ -240,22 +250,22 @@ static ssize_t lis3lv02d_misc_read(struct file *file, char __user *buf,
 
 out:
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&adev.misc_wait, &wait);
+	remove_wait_queue(&lis3_dev.misc_wait, &wait);
 
 	return retval;
 }
 
 static unsigned int lis3lv02d_misc_poll(struct file *file, poll_table *wait)
 {
-	poll_wait(file, &adev.misc_wait, wait);
-	if (atomic_read(&adev.count))
+	poll_wait(file, &lis3_dev.misc_wait, wait);
+	if (atomic_read(&lis3_dev.count))
 		return POLLIN | POLLRDNORM;
 	return 0;
 }
 
 static int lis3lv02d_misc_fasync(int fd, struct file *file, int on)
 {
-	return fasync_helper(fd, file, on, &adev.async_queue);
+	return fasync_helper(fd, file, on, &lis3_dev.async_queue);
 }
 
 static const struct file_operations lis3lv02d_misc_fops = {
@@ -283,12 +293,12 @@ static int lis3lv02d_joystick_kthread(void *data)
 	int x, y, z;
 
 	while (!kthread_should_stop()) {
-		lis3lv02d_get_xyz(adev.device->handle, &x, &y, &z);
-		input_report_abs(adev.idev, ABS_X, x - adev.xcalib);
-		input_report_abs(adev.idev, ABS_Y, y - adev.ycalib);
-		input_report_abs(adev.idev, ABS_Z, z - adev.zcalib);
+		lis3lv02d_get_xyz(lis3_dev.device->handle, &x, &y, &z);
+		input_report_abs(lis3_dev.idev, ABS_X, x - lis3_dev.xcalib);
+		input_report_abs(lis3_dev.idev, ABS_Y, y - lis3_dev.ycalib);
+		input_report_abs(lis3_dev.idev, ABS_Z, z - lis3_dev.zcalib);
 
-		input_sync(adev.idev);
+		input_sync(lis3_dev.idev);
 
 		try_to_freeze();
 		msleep_interruptible(MDPS_POLL_INTERVAL);
@@ -299,11 +309,11 @@ static int lis3lv02d_joystick_kthread(void *data)
 
 static int lis3lv02d_joystick_open(struct input_dev *input)
 {
-	lis3lv02d_increase_use(&adev);
-	adev.kthread = kthread_run(lis3lv02d_joystick_kthread, NULL, "klis3lv02d");
-	if (IS_ERR(adev.kthread)) {
-		lis3lv02d_decrease_use(&adev);
-		return PTR_ERR(adev.kthread);
+	lis3lv02d_increase_use(&lis3_dev);
+	lis3_dev.kthread = kthread_run(lis3lv02d_joystick_kthread, NULL, "klis3lv02d");
+	if (IS_ERR(lis3_dev.kthread)) {
+		lis3lv02d_decrease_use(&lis3_dev);
+		return PTR_ERR(lis3_dev.kthread);
 	}
 
 	return 0;
@@ -311,45 +321,45 @@ static int lis3lv02d_joystick_open(struct input_dev *input)
 
 static void lis3lv02d_joystick_close(struct input_dev *input)
 {
-	kthread_stop(adev.kthread);
-	lis3lv02d_decrease_use(&adev);
+	kthread_stop(lis3_dev.kthread);
+	lis3lv02d_decrease_use(&lis3_dev);
 }
 
 static inline void lis3lv02d_calibrate_joystick(void)
 {
-	lis3lv02d_get_xyz(adev.device->handle, &adev.xcalib, &adev.ycalib, &adev.zcalib);
+	lis3lv02d_get_xyz(lis3_dev.device->handle, &lis3_dev.xcalib, &lis3_dev.ycalib, &lis3_dev.zcalib);
 }
 
 int lis3lv02d_joystick_enable(void)
 {
 	int err;
 
-	if (adev.idev)
+	if (lis3_dev.idev)
 		return -EINVAL;
 
-	adev.idev = input_allocate_device();
-	if (!adev.idev)
+	lis3_dev.idev = input_allocate_device();
+	if (!lis3_dev.idev)
 		return -ENOMEM;
 
 	lis3lv02d_calibrate_joystick();
 
-	adev.idev->name       = "ST LIS3LV02DL Accelerometer";
-	adev.idev->phys       = DRIVER_NAME "/input0";
-	adev.idev->id.bustype = BUS_HOST;
-	adev.idev->id.vendor  = 0;
-	adev.idev->dev.parent = &adev.pdev->dev;
-	adev.idev->open       = lis3lv02d_joystick_open;
-	adev.idev->close      = lis3lv02d_joystick_close;
+	lis3_dev.idev->name       = "ST LIS3LV02DL Accelerometer";
+	lis3_dev.idev->phys       = DRIVER_NAME "/input0";
+	lis3_dev.idev->id.bustype = BUS_HOST;
+	lis3_dev.idev->id.vendor  = 0;
+	lis3_dev.idev->dev.parent = &lis3_dev.pdev->dev;
+	lis3_dev.idev->open       = lis3lv02d_joystick_open;
+	lis3_dev.idev->close      = lis3lv02d_joystick_close;
 
-	set_bit(EV_ABS, adev.idev->evbit);
-	input_set_abs_params(adev.idev, ABS_X, -adev.mdps_max_val, adev.mdps_max_val, 3, 3);
-	input_set_abs_params(adev.idev, ABS_Y, -adev.mdps_max_val, adev.mdps_max_val, 3, 3);
-	input_set_abs_params(adev.idev, ABS_Z, -adev.mdps_max_val, adev.mdps_max_val, 3, 3);
+	set_bit(EV_ABS, lis3_dev.idev->evbit);
+	input_set_abs_params(lis3_dev.idev, ABS_X, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3);
+	input_set_abs_params(lis3_dev.idev, ABS_Y, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3);
+	input_set_abs_params(lis3_dev.idev, ABS_Z, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3);
 
-	err = input_register_device(adev.idev);
+	err = input_register_device(lis3_dev.idev);
 	if (err) {
-		input_free_device(adev.idev);
-		adev.idev = NULL;
+		input_free_device(lis3_dev.idev);
+		lis3_dev.idev = NULL;
 	}
 
 	return err;
@@ -358,12 +368,12 @@ EXPORT_SYMBOL_GPL(lis3lv02d_joystick_enable);
 
 void lis3lv02d_joystick_disable(void)
 {
-	if (!adev.idev)
+	if (!lis3_dev.idev)
 		return;
 
 	misc_deregister(&lis3lv02d_misc_device);
-	input_unregister_device(adev.idev);
-	adev.idev = NULL;
+	input_unregister_device(lis3_dev.idev);
+	lis3_dev.idev = NULL;
 }
 EXPORT_SYMBOL_GPL(lis3lv02d_joystick_disable);
 
@@ -404,25 +414,25 @@ static ssize_t lis3lv02d_position_show(struct device *dev,
 {
 	int x, y, z;
 
-	lis3lv02d_increase_use(&adev);
-	lis3lv02d_get_xyz(adev.device->handle, &x, &y, &z);
-	lis3lv02d_decrease_use(&adev);
+	lis3lv02d_increase_use(&lis3_dev);
+	lis3lv02d_get_xyz(lis3_dev.device->handle, &x, &y, &z);
+	lis3lv02d_decrease_use(&lis3_dev);
 	return sprintf(buf, "(%d,%d,%d)\n", x, y, z);
 }
 
 static ssize_t lis3lv02d_calibrate_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	return sprintf(buf, "(%d,%d,%d)\n", adev.xcalib, adev.ycalib, adev.zcalib);
+	return sprintf(buf, "(%d,%d,%d)\n", lis3_dev.xcalib, lis3_dev.ycalib, lis3_dev.zcalib);
 }
 
 static ssize_t lis3lv02d_calibrate_store(struct device *dev,
 				struct device_attribute *attr,
 				const char *buf, size_t count)
 {
-	lis3lv02d_increase_use(&adev);
+	lis3lv02d_increase_use(&lis3_dev);
 	lis3lv02d_calibrate_joystick();
-	lis3lv02d_decrease_use(&adev);
+	lis3lv02d_decrease_use(&lis3_dev);
 	return count;
 }
 
@@ -434,9 +444,9 @@ static ssize_t lis3lv02d_rate_show(struct device *dev,
 	u8 ctrl;
 	int val;
 
-	lis3lv02d_increase_use(&adev);
-	adev.read(adev.device->handle, CTRL_REG1, &ctrl);
-	lis3lv02d_decrease_use(&adev);
+	lis3lv02d_increase_use(&lis3_dev);
+	lis3_dev.read(lis3_dev.device->handle, CTRL_REG1, &ctrl);
+	lis3lv02d_decrease_use(&lis3_dev);
 	val = (ctrl & (CTRL1_DF0 | CTRL1_DF1)) >> 4;
 	return sprintf(buf, "%d\n", lis3lv02dl_df_val[val]);
 }
@@ -460,17 +470,17 @@ static struct attribute_group lis3lv02d_attribute_group = {
 
 static int lis3lv02d_add_fs(struct acpi_device *device)
 {
-	adev.pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0);
-	if (IS_ERR(adev.pdev))
-		return PTR_ERR(adev.pdev);
+	lis3_dev.pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0);
+	if (IS_ERR(lis3_dev.pdev))
+		return PTR_ERR(lis3_dev.pdev);
 
-	return sysfs_create_group(&adev.pdev->dev.kobj, &lis3lv02d_attribute_group);
+	return sysfs_create_group(&lis3_dev.pdev->dev.kobj, &lis3lv02d_attribute_group);
 }
 
 int lis3lv02d_remove_fs(void)
 {
-	sysfs_remove_group(&adev.pdev->dev.kobj, &lis3lv02d_attribute_group);
-	platform_device_unregister(adev.pdev);
+	sysfs_remove_group(&lis3_dev.pdev->dev.kobj, &lis3lv02d_attribute_group);
+	platform_device_unregister(lis3_dev.pdev);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(lis3lv02d_remove_fs);
diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h
index 75972bf372f..017fb2b754d 100644
--- a/drivers/hwmon/lis3lv02d.h
+++ b/drivers/hwmon/lis3lv02d.h
@@ -194,4 +194,4 @@ void lis3lv02d_poweroff(acpi_handle handle);
 void lis3lv02d_poweron(acpi_handle handle);
 int lis3lv02d_remove_fs(void);
 
-extern struct acpi_lis3lv02d adev;
+extern struct acpi_lis3lv02d lis3_dev;
-- 
cgit v1.2.3


From 061603275814544842e7df77d1157eff18565997 Mon Sep 17 00:00:00 2001
From: Davide Rizzo <elpa.rizzo@gmail.com>
Date: Tue, 31 Mar 2009 15:24:27 -0700
Subject: hwmon: LM95241 driver

An hwmon driver for the National Semiconductor LM95241 triple temperature
sensors chip

Signed-off-by: Davide Rizzo <elpa-rizzo@gmail.com>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: "Mark M. Hoffman" <mhoffman@lightlink.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hwmon/Kconfig   |   9 +
 drivers/hwmon/Makefile  |   1 +
 drivers/hwmon/lm95241.c | 527 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 537 insertions(+)
 create mode 100644 drivers/hwmon/lm95241.c

diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 51ff9b3d7ea..8a91c997155 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -582,6 +582,15 @@ config SENSORS_LTC4245
 	  This driver can also be built as a module. If so, the module will
 	  be called ltc4245.
 
+config SENSORS_LM95241
+	tristate "National Semiconductor LM95241 sensor chip"
+	depends on I2C
+	help
+	  If you say yes here you get support for LM95241 sensor chip.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called lm95241.
+
 config SENSORS_MAX1111
 	tristate "Maxim MAX1111 Multichannel, Serial 8-bit ADC chip"
 	depends on SPI_MASTER
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index e332d626792..81c88822a3e 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -64,6 +64,7 @@ obj-$(CONFIG_SENSORS_LM87)	+= lm87.o
 obj-$(CONFIG_SENSORS_LM90)	+= lm90.o
 obj-$(CONFIG_SENSORS_LM92)	+= lm92.o
 obj-$(CONFIG_SENSORS_LM93)	+= lm93.o
+obj-$(CONFIG_SENSORS_LM95241)	+= lm95241.o
 obj-$(CONFIG_SENSORS_LTC4245)	+= ltc4245.o
 obj-$(CONFIG_SENSORS_MAX1111)	+= max1111.o
 obj-$(CONFIG_SENSORS_MAX1619)	+= max1619.o
diff --git a/drivers/hwmon/lm95241.c b/drivers/hwmon/lm95241.c
new file mode 100644
index 00000000000..091d95f38aa
--- /dev/null
+++ b/drivers/hwmon/lm95241.c
@@ -0,0 +1,527 @@
+/*
+ * lm95241.c - Part of lm_sensors, Linux kernel modules for hardware
+ *             monitoring
+ * Copyright (C) 2008 Davide Rizzo <elpa-rizzo@gmail.com>
+ *
+ * Based on the max1619 driver. The LM95241 is a sensor chip made by National
+ *   Semiconductors.
+ * It reports up to three temperatures (its own plus up to
+ * two external ones). Complete datasheet can be
+ * obtained from National's website at:
+ *   http://www.national.com/ds.cgi/LM/LM95241.pdf
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/i2c.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/sysfs.h>
+
+static const unsigned short normal_i2c[] = {
+	0x19, 0x2a, 0x2b, I2C_CLIENT_END};
+
+/* Insmod parameters */
+I2C_CLIENT_INSMOD_1(lm95241);
+
+/* LM95241 registers */
+#define LM95241_REG_R_MAN_ID		0xFE
+#define LM95241_REG_R_CHIP_ID		0xFF
+#define LM95241_REG_R_STATUS		0x02
+#define LM95241_REG_RW_CONFIG		0x03
+#define LM95241_REG_RW_REM_FILTER	0x06
+#define LM95241_REG_RW_TRUTHERM		0x07
+#define LM95241_REG_W_ONE_SHOT  	0x0F
+#define LM95241_REG_R_LOCAL_TEMPH	0x10
+#define LM95241_REG_R_REMOTE1_TEMPH	0x11
+#define LM95241_REG_R_REMOTE2_TEMPH	0x12
+#define LM95241_REG_R_LOCAL_TEMPL	0x20
+#define LM95241_REG_R_REMOTE1_TEMPL	0x21
+#define LM95241_REG_R_REMOTE2_TEMPL	0x22
+#define LM95241_REG_RW_REMOTE_MODEL	0x30
+
+/* LM95241 specific bitfields */
+#define CFG_STOP 0x40
+#define CFG_CR0076 0x00
+#define CFG_CR0182 0x10
+#define CFG_CR1000 0x20
+#define CFG_CR2700 0x30
+#define R1MS_SHIFT 0
+#define R2MS_SHIFT 2
+#define R1MS_MASK (0x01 << (R1MS_SHIFT))
+#define R2MS_MASK (0x01 << (R2MS_SHIFT))
+#define R1DF_SHIFT 1
+#define R2DF_SHIFT 2
+#define R1DF_MASK (0x01 << (R1DF_SHIFT))
+#define R2DF_MASK (0x01 << (R2DF_SHIFT))
+#define R1FE_MASK 0x01
+#define R2FE_MASK 0x05
+#define TT1_SHIFT 0
+#define TT2_SHIFT 4
+#define TT_OFF 0
+#define TT_ON 1
+#define TT_MASK 7
+#define MANUFACTURER_ID 0x01
+#define DEFAULT_REVISION 0xA4
+
+/* Conversions and various macros */
+#define TEMP_FROM_REG(val_h, val_l) (((val_h) & 0x80 ? (val_h) - 0x100 : \
+    (val_h)) * 1000 + (val_l) * 1000 / 256)
+
+/* Functions declaration */
+static int lm95241_attach_adapter(struct i2c_adapter *adapter);
+static int lm95241_detect(struct i2c_adapter *adapter, int address,
+			  int kind);
+static void lm95241_init_client(struct i2c_client *client);
+static int lm95241_detach_client(struct i2c_client *client);
+static struct lm95241_data *lm95241_update_device(struct device *dev);
+
+/* Driver data (common to all clients) */
+static struct i2c_driver lm95241_driver = {
+	.driver = {
+		.name   = "lm95241",
+	},
+	.attach_adapter = lm95241_attach_adapter,
+	.detach_client  = lm95241_detach_client,
+};
+
+/* Client data (each client gets its own) */
+struct lm95241_data {
+	struct i2c_client client;
+	struct device *hwmon_dev;
+	struct mutex update_lock;
+	unsigned long last_updated, rate; /* in jiffies */
+	char valid; /* zero until following fields are valid */
+	/* registers values */
+	u8 local_h, local_l; /* local */
+	u8 remote1_h, remote1_l; /* remote1 */
+	u8 remote2_h, remote2_l; /* remote2 */
+	u8 config, model, trutherm;
+};
+
+/* Sysfs stuff */
+#define show_temp(value) \
+static ssize_t show_##value(struct device *dev, \
+    struct device_attribute *attr, char *buf) \
+{ \
+	struct lm95241_data *data = lm95241_update_device(dev); \
+	snprintf(buf, PAGE_SIZE - 1, "%d\n", \
+		TEMP_FROM_REG(data->value##_h, data->value##_l)); \
+	return strlen(buf); \
+}
+show_temp(local);
+show_temp(remote1);
+show_temp(remote2);
+
+static ssize_t show_rate(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct lm95241_data *data = lm95241_update_device(dev);
+
+	snprintf(buf, PAGE_SIZE - 1, "%lu\n", 1000 * data->rate / HZ);
+	return strlen(buf);
+}
+
+static ssize_t set_rate(struct device *dev, struct device_attribute *attr,
+			const char *buf, size_t count)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct lm95241_data *data = i2c_get_clientdata(client);
+
+	strict_strtol(buf, 10, &data->rate);
+	data->rate = data->rate * HZ / 1000;
+
+	return count;
+}
+
+#define show_type(flag) \
+static ssize_t show_type##flag(struct device *dev, \
+				   struct device_attribute *attr, char *buf) \
+{ \
+	struct i2c_client *client = to_i2c_client(dev); \
+	struct lm95241_data *data = i2c_get_clientdata(client); \
+\
+	snprintf(buf, PAGE_SIZE - 1, \
+		data->model & R##flag##MS_MASK ? "1\n" : "2\n"); \
+	return strlen(buf); \
+}
+show_type(1);
+show_type(2);
+
+#define show_min(flag) \
+static ssize_t show_min##flag(struct device *dev, \
+    struct device_attribute *attr, char *buf) \
+{ \
+	struct i2c_client *client = to_i2c_client(dev); \
+	struct lm95241_data *data = i2c_get_clientdata(client); \
+\
+	snprintf(buf, PAGE_SIZE - 1, \
+		data->config & R##flag##DF_MASK ?	\
+		"-127000\n" : "0\n"); \
+	return strlen(buf); \
+}
+show_min(1);
+show_min(2);
+
+#define show_max(flag) \
+static ssize_t show_max##flag(struct device *dev, \
+    struct device_attribute *attr, char *buf) \
+{ \
+	struct i2c_client *client = to_i2c_client(dev); \
+	struct lm95241_data *data = i2c_get_clientdata(client); \
+\
+	snprintf(buf, PAGE_SIZE - 1, \
+		data->config & R##flag##DF_MASK ? \
+		"127000\n" : "255000\n"); \
+	return strlen(buf); \
+}
+show_max(1);
+show_max(2);
+
+#define set_type(flag) \
+static ssize_t set_type##flag(struct device *dev, \
+				  struct device_attribute *attr, \
+				  const char *buf, size_t count) \
+{ \
+	struct i2c_client *client = to_i2c_client(dev); \
+	struct lm95241_data *data = i2c_get_clientdata(client); \
+\
+	long val; \
+	strict_strtol(buf, 10, &val); \
+\
+	if ((val == 1) || (val == 2)) { \
+\
+		mutex_lock(&data->update_lock); \
+\
+		data->trutherm &= ~(TT_MASK << TT##flag##_SHIFT); \
+		if (val == 1) { \
+			data->model |= R##flag##MS_MASK; \
+			data->trutherm |= (TT_ON << TT##flag##_SHIFT); \
+		} \
+		else { \
+			data->model &= ~R##flag##MS_MASK; \
+			data->trutherm |= (TT_OFF << TT##flag##_SHIFT); \
+		} \
+\
+		data->valid = 0; \
+\
+		i2c_smbus_write_byte_data(client, LM95241_REG_RW_REMOTE_MODEL, \
+					  data->model); \
+		i2c_smbus_write_byte_data(client, LM95241_REG_RW_TRUTHERM, \
+					  data->trutherm); \
+\
+		mutex_unlock(&data->update_lock); \
+\
+	} \
+	return count; \
+}
+set_type(1);
+set_type(2);
+
+#define set_min(flag) \
+static ssize_t set_min##flag(struct device *dev, \
+	struct device_attribute *devattr, const char *buf, size_t count) \
+{ \
+	struct i2c_client *client = to_i2c_client(dev); \
+	struct lm95241_data *data = i2c_get_clientdata(client); \
+\
+	long val; \
+	strict_strtol(buf, 10, &val); \
+\
+	mutex_lock(&data->update_lock); \
+\
+	if (val < 0) \
+		data->config |= R##flag##DF_MASK; \
+	else \
+		data->config &= ~R##flag##DF_MASK; \
+\
+	data->valid = 0; \
+\
+	i2c_smbus_write_byte_data(client, LM95241_REG_RW_CONFIG, \
+		data->config); \
+\
+	mutex_unlock(&data->update_lock); \
+\
+	return count; \
+}
+set_min(1);
+set_min(2);
+
+#define set_max(flag) \
+static ssize_t set_max##flag(struct device *dev, \
+	struct device_attribute *devattr, const char *buf, size_t count) \
+{ \
+	struct i2c_client *client = to_i2c_client(dev); \
+	struct lm95241_data *data = i2c_get_clientdata(client); \
+\
+	long val; \
+	strict_strtol(buf, 10, &val); \
+\
+	mutex_lock(&data->update_lock); \
+\
+	if (val <= 127000) \
+		data->config |= R##flag##DF_MASK; \
+	else \
+		data->config &= ~R##flag##DF_MASK; \
+\
+	data->valid = 0; \
+\
+	i2c_smbus_write_byte_data(client, LM95241_REG_RW_CONFIG, \
+		data->config); \
+\
+	mutex_unlock(&data->update_lock); \
+\
+	return count; \
+}
+set_max(1);
+set_max(2);
+
+static DEVICE_ATTR(temp1_input, S_IRUGO, show_local, NULL);
+static DEVICE_ATTR(temp2_input, S_IRUGO, show_remote1, NULL);
+static DEVICE_ATTR(temp3_input, S_IRUGO, show_remote2, NULL);
+static DEVICE_ATTR(temp2_type, S_IWUSR | S_IRUGO, show_type1, set_type1);
+static DEVICE_ATTR(temp3_type, S_IWUSR | S_IRUGO, show_type2, set_type2);
+static DEVICE_ATTR(temp2_min, S_IWUSR | S_IRUGO, show_min1, set_min1);
+static DEVICE_ATTR(temp3_min, S_IWUSR | S_IRUGO, show_min2, set_min2);
+static DEVICE_ATTR(temp2_max, S_IWUSR | S_IRUGO, show_max1, set_max1);
+static DEVICE_ATTR(temp3_max, S_IWUSR | S_IRUGO, show_max2, set_max2);
+static DEVICE_ATTR(rate, S_IWUSR | S_IRUGO, show_rate, set_rate);
+
+static struct attribute *lm95241_attributes[] = {
+	&dev_attr_temp1_input.attr,
+	&dev_attr_temp2_input.attr,
+	&dev_attr_temp3_input.attr,
+	&dev_attr_temp2_type.attr,
+	&dev_attr_temp3_type.attr,
+	&dev_attr_temp2_min.attr,
+	&dev_attr_temp3_min.attr,
+	&dev_attr_temp2_max.attr,
+	&dev_attr_temp3_max.attr,
+	&dev_attr_rate.attr,
+	NULL
+};
+
+static const struct attribute_group lm95241_group = {
+	.attrs = lm95241_attributes,
+};
+
+/* Init/exit code */
+static int lm95241_attach_adapter(struct i2c_adapter *adapter)
+{
+	if (!(adapter->class & I2C_CLASS_HWMON))
+		return 0;
+	return i2c_probe(adapter, &addr_data, lm95241_detect);
+}
+
+/*
+ * The following function does more than just detection. If detection
+ * succeeds, it also registers the new chip.
+ */
+static int lm95241_detect(struct i2c_adapter *adapter, int address, int kind)
+{
+	struct i2c_client *new_client;
+	struct lm95241_data *data;
+	int err = 0;
+	const char *name = "";
+
+	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA))
+		goto exit;
+
+	data = kzalloc(sizeof(struct lm95241_data), GFP_KERNEL);
+	if (!data) {
+		err = -ENOMEM;
+		goto exit;
+	}
+
+	/* The common I2C client data is placed right before the
+	   LM95241-specific data. */
+	new_client = &data->client;
+	i2c_set_clientdata(new_client, data);
+	new_client->addr = address;
+	new_client->adapter = adapter;
+	new_client->driver = &lm95241_driver;
+	new_client->flags = 0;
+
+	/*
+	 * Now we do the remaining detection. A negative kind means that
+	 * the driver was loaded with no force parameter (default), so we
+	 * must both detect and identify the chip. A zero kind means that
+	 * the driver was loaded with the force parameter, the detection
+	 * step shall be skipped. A positive kind means that the driver
+	 * was loaded with the force parameter and a given kind of chip is
+	 * requested, so both the detection and the identification steps
+	 * are skipped.
+	 */
+	if (kind < 0) {	/* detection */
+		if ((i2c_smbus_read_byte_data(new_client, LM95241_REG_R_MAN_ID)
+		     != MANUFACTURER_ID)
+		|| (i2c_smbus_read_byte_data(new_client, LM95241_REG_R_CHIP_ID)
+		    < DEFAULT_REVISION)) {
+			dev_dbg(&adapter->dev,
+				"LM95241 detection failed at 0x%02x.\n",
+				address);
+			goto exit_free;
+		}
+	}
+
+	if (kind <= 0) { /* identification */
+		if ((i2c_smbus_read_byte_data(new_client, LM95241_REG_R_MAN_ID)
+		     == MANUFACTURER_ID)
+		&& (i2c_smbus_read_byte_data(new_client, LM95241_REG_R_CHIP_ID)
+		    >= DEFAULT_REVISION)) {
+
+			kind = lm95241;
+
+			if (kind <= 0) { /* identification failed */
+				dev_info(&adapter->dev, "Unsupported chip\n");
+				goto exit_free;
+			}
+		}
+	}
+
+	if (kind == lm95241)
+		name = "lm95241";
+
+	/* We can fill in the remaining client fields */
+	strlcpy(new_client->name, name, I2C_NAME_SIZE);
+	data->valid = 0;
+	mutex_init(&data->update_lock);
+
+	/* Tell the I2C layer a new client has arrived */
+	err = i2c_attach_client(new_client);
+	if (err)
+		goto exit_free;
+
+	/* Initialize the LM95241 chip */
+	lm95241_init_client(new_client);
+
+	/* Register sysfs hooks */
+	err = sysfs_create_group(&new_client->dev.kobj, &lm95241_group);
+	if (err)
+		goto exit_detach;
+
+	data->hwmon_dev = hwmon_device_register(&new_client->dev);
+	if (IS_ERR(data->hwmon_dev)) {
+		err = PTR_ERR(data->hwmon_dev);
+		goto exit_remove_files;
+	}
+
+	return 0;
+
+exit_remove_files:
+	sysfs_remove_group(&new_client->dev.kobj, &lm95241_group);
+exit_detach:
+	i2c_detach_client(new_client);
+exit_free:
+	kfree(data);
+exit:
+	return err;
+}
+
+static void lm95241_init_client(struct i2c_client *client)
+{
+	struct lm95241_data *data = i2c_get_clientdata(client);
+
+	data->rate = HZ;    /* 1 sec default */
+	data->valid = 0;
+	data->config = CFG_CR0076;
+	data->model = 0;
+	data->trutherm = (TT_OFF << TT1_SHIFT) | (TT_OFF << TT2_SHIFT);
+
+	i2c_smbus_write_byte_data(client, LM95241_REG_RW_CONFIG,
+				  data->config);
+	i2c_smbus_write_byte_data(client, LM95241_REG_RW_REM_FILTER,
+				  R1FE_MASK | R2FE_MASK);
+	i2c_smbus_write_byte_data(client, LM95241_REG_RW_TRUTHERM,
+				  data->trutherm);
+	i2c_smbus_write_byte_data(client, LM95241_REG_RW_REMOTE_MODEL,
+				  data->model);
+}
+
+static int lm95241_detach_client(struct i2c_client *client)
+{
+	struct lm95241_data *data = i2c_get_clientdata(client);
+	int err;
+
+	hwmon_device_unregister(data->hwmon_dev);
+	sysfs_remove_group(&client->dev.kobj, &lm95241_group);
+
+	err = i2c_detach_client(client);
+	if (err)
+		return err;
+
+	kfree(data);
+	return 0;
+}
+
+static struct lm95241_data *lm95241_update_device(struct device *dev)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct lm95241_data *data = i2c_get_clientdata(client);
+
+	mutex_lock(&data->update_lock);
+
+	if (time_after(jiffies, data->last_updated + data->rate) ||
+	    !data->valid) {
+		dev_dbg(&client->dev, "Updating lm95241 data.\n");
+		data->local_h =
+			i2c_smbus_read_byte_data(client,
+						 LM95241_REG_R_LOCAL_TEMPH);
+		data->local_l =
+			i2c_smbus_read_byte_data(client,
+						 LM95241_REG_R_LOCAL_TEMPL);
+		data->remote1_h =
+			i2c_smbus_read_byte_data(client,
+						 LM95241_REG_R_REMOTE1_TEMPH);
+		data->remote1_l =
+			i2c_smbus_read_byte_data(client,
+						 LM95241_REG_R_REMOTE1_TEMPL);
+		data->remote2_h =
+			i2c_smbus_read_byte_data(client,
+						 LM95241_REG_R_REMOTE2_TEMPH);
+		data->remote2_l =
+			i2c_smbus_read_byte_data(client,
+						 LM95241_REG_R_REMOTE2_TEMPL);
+		data->last_updated = jiffies;
+		data->valid = 1;
+	}
+
+	mutex_unlock(&data->update_lock);
+
+	return data;
+}
+
+static int __init sensors_lm95241_init(void)
+{
+	return i2c_add_driver(&lm95241_driver);
+}
+
+static void __exit sensors_lm95241_exit(void)
+{
+	i2c_del_driver(&lm95241_driver);
+}
+
+MODULE_AUTHOR("Davide Rizzo <elpa-rizzo@gmail.com>");
+MODULE_DESCRIPTION("LM95241 sensor driver");
+MODULE_LICENSE("GPL");
+
+module_init(sensors_lm95241_init);
+module_exit(sensors_lm95241_exit);
-- 
cgit v1.2.3


From 72f5de92e199f96cfcea125aefc76c138d8c553c Mon Sep 17 00:00:00 2001
From: Ira Snyder <iws@ovro.caltech.edu>
Date: Tue, 31 Mar 2009 15:24:29 -0700
Subject: hwmon: Add LTC4215 driver

Add Linux support for the Linear Technology LTC4215 Hot Swap controller
I2C monitoring interface.

I have tested the driver with my board, and it appears to work fine.  With
the power supplies disabled, it reads 11.93V input, 1.93V output, no
current and no power.  With the supplies enabled, it reads 11.93V input,
11.98V output, no current, no power.  I'm not drawing any current at the
moment, so this is reasonable.  The value in the sense register never
reads anything except 0, so I expect to get zero from the current and
power calculations.

I didn't attempt to support changing any of the chip's settings or
enabling the FET.  I'm not sure even how to do that and still fit within
the hwmon framework.  :)

Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: "Mark M. Hoffman" <mhoffman@lightlink.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/hwmon/ltc4215 |  50 ++++++
 drivers/hwmon/Kconfig       |  11 ++
 drivers/hwmon/Makefile      |   1 +
 drivers/hwmon/ltc4215.c     | 364 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 426 insertions(+)
 create mode 100644 Documentation/hwmon/ltc4215
 create mode 100644 drivers/hwmon/ltc4215.c

diff --git a/Documentation/hwmon/ltc4215 b/Documentation/hwmon/ltc4215
new file mode 100644
index 00000000000..2e6a21eb656
--- /dev/null
+++ b/Documentation/hwmon/ltc4215
@@ -0,0 +1,50 @@
+Kernel driver ltc4215
+=====================
+
+Supported chips:
+  * Linear Technology LTC4215
+    Prefix: 'ltc4215'
+    Addresses scanned: 0x44
+    Datasheet:
+        http://www.linear.com/pc/downloadDocument.do?navId=H0,C1,C1003,C1006,C1163,P17572,D12697
+
+Author: Ira W. Snyder <iws@ovro.caltech.edu>
+
+
+Description
+-----------
+
+The LTC4215 controller allows a board to be safely inserted and removed
+from a live backplane.
+
+
+Usage Notes
+-----------
+
+This driver does not probe for LTC4215 devices, due to the fact that some
+of the possible addresses are unfriendly to probing. You will need to use
+the "force" parameter to tell the driver where to find the device.
+
+Example: the following will load the driver for an LTC4215 at address 0x44
+on I2C bus #0:
+$ modprobe ltc4215 force=0,0x44
+
+
+Sysfs entries
+-------------
+
+The LTC4215 has built-in limits for overvoltage, undervoltage, and
+undercurrent warnings. This makes it very likely that the reference
+circuit will be used.
+
+in1_input		input voltage
+in2_input		output voltage
+
+in1_min_alarm		input undervoltage alarm
+in1_max_alarm		input overvoltage alarm
+
+curr1_input		current
+curr1_max_alarm		overcurrent alarm
+
+power1_input		power usage
+power1_alarm		power bad alarm
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 8a91c997155..bc6810c22dd 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -571,6 +571,17 @@ config SENSORS_LM93
 	  This driver can also be built as a module.  If so, the module
 	  will be called lm93.
 
+config SENSORS_LTC4215
+	tristate "Linear Technology LTC4215"
+	depends on I2C && EXPERIMENTAL
+	default n
+	help
+	  If you say yes here you get support for Linear Technology LTC4215
+	  Hot Swap Controller I2C interface.
+
+	  This driver can also be built as a module. If so, the module will
+	  be called ltc4215.
+
 config SENSORS_LTC4245
 	tristate "Linear Technology LTC4245"
 	depends on I2C && EXPERIMENTAL
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index 81c88822a3e..4da261d16ab 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -65,6 +65,7 @@ obj-$(CONFIG_SENSORS_LM90)	+= lm90.o
 obj-$(CONFIG_SENSORS_LM92)	+= lm92.o
 obj-$(CONFIG_SENSORS_LM93)	+= lm93.o
 obj-$(CONFIG_SENSORS_LM95241)	+= lm95241.o
+obj-$(CONFIG_SENSORS_LTC4215)	+= ltc4215.o
 obj-$(CONFIG_SENSORS_LTC4245)	+= ltc4245.o
 obj-$(CONFIG_SENSORS_MAX1111)	+= max1111.o
 obj-$(CONFIG_SENSORS_MAX1619)	+= max1619.o
diff --git a/drivers/hwmon/ltc4215.c b/drivers/hwmon/ltc4215.c
new file mode 100644
index 00000000000..9386e2a3921
--- /dev/null
+++ b/drivers/hwmon/ltc4215.c
@@ -0,0 +1,364 @@
+/*
+ * Driver for Linear Technology LTC4215 I2C Hot Swap Controller
+ *
+ * Copyright (C) 2009 Ira W. Snyder <iws@ovro.caltech.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * Datasheet:
+ * http://www.linear.com/pc/downloadDocument.do?navId=H0,C1,C1003,C1006,C1163,P17572,D12697
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+
+static const unsigned short normal_i2c[] = { I2C_CLIENT_END };
+
+/* Insmod parameters */
+I2C_CLIENT_INSMOD_1(ltc4215);
+
+/* Here are names of the chip's registers (a.k.a. commands) */
+enum ltc4215_cmd {
+	LTC4215_CONTROL			= 0x00, /* rw */
+	LTC4215_ALERT			= 0x01, /* rw */
+	LTC4215_STATUS			= 0x02, /* ro */
+	LTC4215_FAULT			= 0x03, /* rw */
+	LTC4215_SENSE			= 0x04, /* rw */
+	LTC4215_SOURCE			= 0x05, /* rw */
+	LTC4215_ADIN			= 0x06, /* rw */
+};
+
+struct ltc4215_data {
+	struct device *hwmon_dev;
+
+	struct mutex update_lock;
+	bool valid;
+	unsigned long last_updated; /* in jiffies */
+
+	/* Registers */
+	u8 regs[7];
+};
+
+static struct ltc4215_data *ltc4215_update_device(struct device *dev)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct ltc4215_data *data = i2c_get_clientdata(client);
+	s32 val;
+	int i;
+
+	mutex_lock(&data->update_lock);
+
+	/* The chip's A/D updates 10 times per second */
+	if (time_after(jiffies, data->last_updated + HZ / 10) || !data->valid) {
+
+		dev_dbg(&client->dev, "Starting ltc4215 update\n");
+
+		/* Read all registers */
+		for (i = 0; i < ARRAY_SIZE(data->regs); i++) {
+			val = i2c_smbus_read_byte_data(client, i);
+			if (unlikely(val < 0))
+				data->regs[i] = 0;
+			else
+				data->regs[i] = val;
+		}
+
+		data->last_updated = jiffies;
+		data->valid = 1;
+	}
+
+	mutex_unlock(&data->update_lock);
+
+	return data;
+}
+
+/* Return the voltage from the given register in millivolts */
+static int ltc4215_get_voltage(struct device *dev, u8 reg)
+{
+	struct ltc4215_data *data = ltc4215_update_device(dev);
+	const u8 regval = data->regs[reg];
+	u32 voltage = 0;
+
+	switch (reg) {
+	case LTC4215_SENSE:
+		/* 151 uV per increment */
+		voltage = regval * 151 / 1000;
+		break;
+	case LTC4215_SOURCE:
+		/* 60.5 mV per increment */
+		voltage = regval * 605 / 10;
+		break;
+	case LTC4215_ADIN:
+		/* The ADIN input is divided by 12.5, and has 4.82 mV
+		 * per increment, so we have the additional multiply */
+		voltage = regval * 482 * 125 / 1000;
+		break;
+	default:
+		/* If we get here, the developer messed up */
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	return voltage;
+}
+
+/* Return the current from the sense resistor in mA */
+static unsigned int ltc4215_get_current(struct device *dev)
+{
+	struct ltc4215_data *data = ltc4215_update_device(dev);
+
+	/* The strange looking conversions that follow are fixed-point
+	 * math, since we cannot do floating point in the kernel.
+	 *
+	 * Step 1: convert sense register to microVolts
+	 * Step 2: convert voltage to milliAmperes
+	 *
+	 * If you play around with the V=IR equation, you come up with
+	 * the following: X uV / Y mOhm == Z mA
+	 *
+	 * With the resistors that are fractions of a milliOhm, we multiply
+	 * the voltage and resistance by 10, to shift the decimal point.
+	 * Now we can use the normal division operator again.
+	 */
+
+	/* Calculate voltage in microVolts (151 uV per increment) */
+	const unsigned int voltage = data->regs[LTC4215_SENSE] * 151;
+
+	/* Calculate current in milliAmperes (4 milliOhm sense resistor) */
+	const unsigned int curr = voltage / 4;
+
+	return curr;
+}
+
+static ssize_t ltc4215_show_voltage(struct device *dev,
+				    struct device_attribute *da,
+				    char *buf)
+{
+	struct sensor_device_attribute *attr = to_sensor_dev_attr(da);
+	const int voltage = ltc4215_get_voltage(dev, attr->index);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", voltage);
+}
+
+static ssize_t ltc4215_show_current(struct device *dev,
+				    struct device_attribute *da,
+				    char *buf)
+{
+	const unsigned int curr = ltc4215_get_current(dev);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", curr);
+}
+
+static ssize_t ltc4215_show_power(struct device *dev,
+				  struct device_attribute *da,
+				  char *buf)
+{
+	const unsigned int curr = ltc4215_get_current(dev);
+	const int output_voltage = ltc4215_get_voltage(dev, LTC4215_ADIN);
+
+	/* current in mA * voltage in mV == power in uW */
+	const unsigned int power = abs(output_voltage * curr);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", power);
+}
+
+static ssize_t ltc4215_show_alarm(struct device *dev,
+					  struct device_attribute *da,
+					  char *buf)
+{
+	struct sensor_device_attribute_2 *attr = to_sensor_dev_attr_2(da);
+	struct ltc4215_data *data = ltc4215_update_device(dev);
+	const u8 reg = data->regs[attr->index];
+	const u32 mask = attr->nr;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", (reg & mask) ? 1 : 0);
+}
+
+/* These macros are used below in constructing device attribute objects
+ * for use with sysfs_create_group() to make a sysfs device file
+ * for each register.
+ */
+
+#define LTC4215_VOLTAGE(name, ltc4215_cmd_idx) \
+	static SENSOR_DEVICE_ATTR(name, S_IRUGO, \
+	ltc4215_show_voltage, NULL, ltc4215_cmd_idx)
+
+#define LTC4215_CURRENT(name) \
+	static SENSOR_DEVICE_ATTR(name, S_IRUGO, \
+	ltc4215_show_current, NULL, 0);
+
+#define LTC4215_POWER(name) \
+	static SENSOR_DEVICE_ATTR(name, S_IRUGO, \
+	ltc4215_show_power, NULL, 0);
+
+#define LTC4215_ALARM(name, mask, reg) \
+	static SENSOR_DEVICE_ATTR_2(name, S_IRUGO, \
+	ltc4215_show_alarm, NULL, (mask), reg)
+
+/* Construct a sensor_device_attribute structure for each register */
+
+/* Current */
+LTC4215_CURRENT(curr1_input);
+LTC4215_ALARM(curr1_max_alarm,	(1 << 2),	LTC4215_STATUS);
+
+/* Power (virtual) */
+LTC4215_POWER(power1_input);
+LTC4215_ALARM(power1_alarm,	(1 << 3),	LTC4215_STATUS);
+
+/* Input Voltage */
+LTC4215_VOLTAGE(in1_input,			LTC4215_ADIN);
+LTC4215_ALARM(in1_max_alarm,	(1 << 0),	LTC4215_STATUS);
+LTC4215_ALARM(in1_min_alarm,	(1 << 1),	LTC4215_STATUS);
+
+/* Output Voltage */
+LTC4215_VOLTAGE(in2_input,			LTC4215_SOURCE);
+
+/* Finally, construct an array of pointers to members of the above objects,
+ * as required for sysfs_create_group()
+ */
+static struct attribute *ltc4215_attributes[] = {
+	&sensor_dev_attr_curr1_input.dev_attr.attr,
+	&sensor_dev_attr_curr1_max_alarm.dev_attr.attr,
+
+	&sensor_dev_attr_power1_input.dev_attr.attr,
+	&sensor_dev_attr_power1_alarm.dev_attr.attr,
+
+	&sensor_dev_attr_in1_input.dev_attr.attr,
+	&sensor_dev_attr_in1_max_alarm.dev_attr.attr,
+	&sensor_dev_attr_in1_min_alarm.dev_attr.attr,
+
+	&sensor_dev_attr_in2_input.dev_attr.attr,
+
+	NULL,
+};
+
+static const struct attribute_group ltc4215_group = {
+	.attrs = ltc4215_attributes,
+};
+
+static int ltc4215_probe(struct i2c_client *client,
+			 const struct i2c_device_id *id)
+{
+	struct ltc4215_data *data;
+	int ret;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data) {
+		ret = -ENOMEM;
+		goto out_kzalloc;
+	}
+
+	i2c_set_clientdata(client, data);
+	mutex_init(&data->update_lock);
+
+	/* Initialize the LTC4215 chip */
+	/* TODO */
+
+	/* Register sysfs hooks */
+	ret = sysfs_create_group(&client->dev.kobj, &ltc4215_group);
+	if (ret)
+		goto out_sysfs_create_group;
+
+	data->hwmon_dev = hwmon_device_register(&client->dev);
+	if (IS_ERR(data->hwmon_dev)) {
+		ret = PTR_ERR(data->hwmon_dev);
+		goto out_hwmon_device_register;
+	}
+
+	return 0;
+
+out_hwmon_device_register:
+	sysfs_remove_group(&client->dev.kobj, &ltc4215_group);
+out_sysfs_create_group:
+	kfree(data);
+out_kzalloc:
+	return ret;
+}
+
+static int ltc4215_remove(struct i2c_client *client)
+{
+	struct ltc4215_data *data = i2c_get_clientdata(client);
+
+	hwmon_device_unregister(data->hwmon_dev);
+	sysfs_remove_group(&client->dev.kobj, &ltc4215_group);
+
+	kfree(data);
+
+	return 0;
+}
+
+static int ltc4215_detect(struct i2c_client *client,
+			  int kind,
+			  struct i2c_board_info *info)
+{
+	struct i2c_adapter *adapter = client->adapter;
+
+	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA))
+		return -ENODEV;
+
+	if (kind < 0) {		/* probed detection - check the chip type */
+		s32 v;		/* 8 bits from the chip, or -ERRNO */
+
+		/*
+		 * Register 0x01 bit b7 is reserved, expect 0
+		 * Register 0x03 bit b6 and b7 are reserved, expect 0
+		 */
+		v = i2c_smbus_read_byte_data(client, LTC4215_ALERT);
+		if (v < 0 || (v & (1 << 7)) != 0)
+			return -ENODEV;
+
+		v = i2c_smbus_read_byte_data(client, LTC4215_FAULT);
+		if (v < 0 || (v & ((1 << 6) | (1 << 7))) != 0)
+				return -ENODEV;
+	}
+
+	strlcpy(info->type, "ltc4215", I2C_NAME_SIZE);
+	dev_info(&adapter->dev, "ltc4215 %s at address 0x%02x\n",
+			kind < 0 ? "probed" : "forced",
+			client->addr);
+
+	return 0;
+}
+
+static const struct i2c_device_id ltc4215_id[] = {
+	{ "ltc4215", ltc4215 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, ltc4215_id);
+
+/* This is the driver that will be inserted */
+static struct i2c_driver ltc4215_driver = {
+	.class		= I2C_CLASS_HWMON,
+	.driver = {
+		.name	= "ltc4215",
+	},
+	.probe		= ltc4215_probe,
+	.remove		= ltc4215_remove,
+	.id_table	= ltc4215_id,
+	.detect		= ltc4215_detect,
+	.address_data	= &addr_data,
+};
+
+static int __init ltc4215_init(void)
+{
+	return i2c_add_driver(&ltc4215_driver);
+}
+
+static void __exit ltc4215_exit(void)
+{
+	i2c_del_driver(&ltc4215_driver);
+}
+
+MODULE_AUTHOR("Ira W. Snyder <iws@ovro.caltech.edu>");
+MODULE_DESCRIPTION("LTC4215 driver");
+MODULE_LICENSE("GPL");
+
+module_init(ltc4215_init);
+module_exit(ltc4215_exit);
-- 
cgit v1.2.3


From 9d7639d33a348d5637daa50bba27497e9f5dac64 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 31 Mar 2009 15:24:29 -0700
Subject: hp_accel: add two more axis information

Add two more laptops to whitelist.

Signed-off-by: Michal Marek <mmarek@suse.cz>
Signed-off-by: Pavel Machek <pavel@ucw.cz>
Cc: Daniel Mack <daniel@caiaq.de>
Cc: Eric Piel <eric.piel@tremplin-utc.net>
Cc: Vladimir Botka <vbotka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hwmon/hp_accel.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c
index ee1ed4270df..448a462ad78 100644
--- a/drivers/hwmon/hp_accel.c
+++ b/drivers/hwmon/hp_accel.c
@@ -187,6 +187,7 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = {
 	AXIS_DMI_MATCH("NC2510", "HP Compaq 2510", y_inverted),
 	AXIS_DMI_MATCH("NC8510", "HP Compaq 8510", xy_swap_inverted),
 	AXIS_DMI_MATCH("HP2133", "HP 2133", xy_rotated_left),
+	AXIS_DMI_MATCH("HP2140", "HP 2140", xy_swap_inverted),
 	AXIS_DMI_MATCH("NC653x", "HP Compaq 653", xy_rotated_left_usd),
 	AXIS_DMI_MATCH("NC673x", "HP Compaq 673", xy_rotated_left_usd),
 	AXIS_DMI_MATCH("NC651xx", "HP Compaq 651", xy_rotated_right),
@@ -201,6 +202,7 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = {
 			PRODUCT_NAME, "HP Pavilion dv5",
 			BOARD_NAME, "3600",
 			y_inverted),
+	AXIS_DMI_MATCH("DV7", "HP Pavilion dv7", x_inverted),
 	{ NULL, }
 /* Laptop models without axis info (yet):
  * "NC6910" "HP Compaq 6910"
-- 
cgit v1.2.3


From 12a324b6a3758f04a952b99d75a625732d767585 Mon Sep 17 00:00:00 2001
From: Luca Cappa <lcappa@gmail.com>
Date: Tue, 31 Mar 2009 15:24:31 -0700
Subject: hp_accel: axis conversion for hp compaq 8710w

I have a laptop HP Compaq 8710W, I compiled into my kernel the LIS3LV02DL
and HP_ACCEL module drivers.  While loading it cannot recognize the laptop
model, so i am sending the necessary information to update the database of
axis orientations.

>When the laptop is horizontal the position reported is about 0 for X and Y
>and a positive value for Z
Yes, it is about 0,0,1000, the actual reading says: (-17,-26,1018);

> If the left side is elevated, X increases (becomes positive)
Yes, X goes toward to positive 1000.

>If the front side (where the touchpad is) is elevated, Y decreases (becomes negative)
No, Y goes toward to positive 1000.

>If the laptop is put upside-down, Z becomes negative
Yes, the laptop on a table Z gives 1000, and if upsidedown the Z reads
-1000.

So in few words the Y axis is inverted.

Cc: Eric Piel <eric.piel@tremplin-utc.net>
Signed-off-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hwmon/hp_accel.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c
index 448a462ad78..de26b1c3a96 100644
--- a/drivers/hwmon/hp_accel.c
+++ b/drivers/hwmon/hp_accel.c
@@ -203,6 +203,7 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = {
 			BOARD_NAME, "3600",
 			y_inverted),
 	AXIS_DMI_MATCH("DV7", "HP Pavilion dv7", x_inverted),
+	AXIS_DMI_MATCH("HP8710", "HP Compaq 8710", y_inverted),
 	{ NULL, }
 /* Laptop models without axis info (yet):
  * "NC6910" "HP Compaq 6910"
-- 
cgit v1.2.3


From ab337a632783c251a3c3852aec0ead8a0281cbdd Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Tue, 31 Mar 2009 15:24:31 -0700
Subject: lis3: reorder functions to make forward decl obsolete

Move lis3lv02d_init_device() down so that the forward declaration of
lis3lv02d_add_fs() becomes unnecessary.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Eric Piel <eric.piel@tremplin-utc.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hwmon/lis3lv02d.c | 64 +++++++++++++++++++++++------------------------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c
index 4888ac5ba8c..eeae7c9600e 100644
--- a/drivers/hwmon/lis3lv02d.c
+++ b/drivers/hwmon/lis3lv02d.c
@@ -59,8 +59,6 @@ struct acpi_lis3lv02d lis3_dev = {
 
 EXPORT_SYMBOL_GPL(lis3_dev);
 
-static int lis3lv02d_add_fs(struct acpi_device *device);
-
 static s16 lis3lv02d_read_16(acpi_handle handle, int reg)
 {
 	u8 lo, hi;
@@ -377,37 +375,6 @@ void lis3lv02d_joystick_disable(void)
 }
 EXPORT_SYMBOL_GPL(lis3lv02d_joystick_disable);
 
-/*
- * Initialise the accelerometer and the various subsystems.
- * Should be rather independant of the bus system.
- */
-int lis3lv02d_init_device(struct acpi_lis3lv02d *dev)
-{
-	mutex_init(&dev->lock);
-	lis3lv02d_add_fs(dev->device);
-	lis3lv02d_increase_use(dev);
-
-	if (lis3lv02d_joystick_enable())
-		printk(KERN_ERR DRIVER_NAME ": joystick initialization failed\n");
-
-	printk("lis3_init_device: irq %d\n", dev->irq);
-
-	/* if we did not get an IRQ from ACPI - we have nothing more to do */
-	if (!dev->irq) {
-		printk(KERN_ERR DRIVER_NAME
-			": No IRQ in ACPI. Disabling /dev/freefall\n");
-		goto out;
-	}
-
-	printk("lis3: registering device\n");
-	if (misc_register(&lis3lv02d_misc_device))
-		printk(KERN_ERR DRIVER_NAME ": misc_register failed\n");
-out:
-	lis3lv02d_decrease_use(dev);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(lis3lv02d_init_device);
-
 /* Sysfs stuff */
 static ssize_t lis3lv02d_position_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
@@ -485,6 +452,37 @@ int lis3lv02d_remove_fs(void)
 }
 EXPORT_SYMBOL_GPL(lis3lv02d_remove_fs);
 
+/*
+ * Initialise the accelerometer and the various subsystems.
+ * Should be rather independant of the bus system.
+ */
+int lis3lv02d_init_device(struct acpi_lis3lv02d *dev)
+{
+	mutex_init(&dev->lock);
+	lis3lv02d_add_fs(dev->device);
+	lis3lv02d_increase_use(dev);
+
+	if (lis3lv02d_joystick_enable())
+		printk(KERN_ERR DRIVER_NAME ": joystick initialization failed\n");
+
+	printk("lis3_init_device: irq %d\n", dev->irq);
+
+	/* if we did not get an IRQ from ACPI - we have nothing more to do */
+	if (!dev->irq) {
+		printk(KERN_ERR DRIVER_NAME
+			": No IRQ in ACPI. Disabling /dev/freefall\n");
+		goto out;
+	}
+
+	printk("lis3: registering device\n");
+	if (misc_register(&lis3lv02d_misc_device))
+		printk(KERN_ERR DRIVER_NAME ": misc_register failed\n");
+out:
+	lis3lv02d_decrease_use(dev);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(lis3lv02d_init_device);
+
 MODULE_DESCRIPTION("ST LIS3LV02Dx three-axis digital accelerometer driver");
 MODULE_AUTHOR("Yan Burman, Eric Piel, Pavel Machek");
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From a38da2ed74f628c1f3e907c772be21a66eccab9c Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Tue, 31 Mar 2009 15:24:32 -0700
Subject: lis3: solve dependency between core and ACPI

This solves the dependency between lis3lv02d.[ch] and ACPI specific
methods.  It introduces a ->bus_priv pointer to the device struct which is
casted to 'struct acpi_device' in the ACIP layer.  Changed hp_accel.c
accordingly.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Eric Piel <eric.piel@tremplin-utc.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hwmon/hp_accel.c  | 99 ++++++++++++++++++-----------------------------
 drivers/hwmon/lis3lv02d.c | 86 ++++++++++++++++++++++++++--------------
 drivers/hwmon/lis3lv02d.h | 20 +++++-----
 3 files changed, 105 insertions(+), 100 deletions(-)

diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c
index de26b1c3a96..55d3dc565be 100644
--- a/drivers/hwmon/hp_accel.c
+++ b/drivers/hwmon/hp_accel.c
@@ -85,25 +85,31 @@ MODULE_DEVICE_TABLE(acpi, lis3lv02d_device_ids);
 
 /**
  * lis3lv02d_acpi_init - ACPI _INI method: initialize the device.
- * @handle: the handle of the device
+ * @lis3: pointer to the device struct
  *
- * Returns AE_OK on success.
+ * Returns 0 on success.
  */
-acpi_status lis3lv02d_acpi_init(acpi_handle handle)
+int lis3lv02d_acpi_init(struct lis3lv02d *lis3)
 {
-	return acpi_evaluate_object(handle, METHOD_NAME__INI, NULL, NULL);
+	struct acpi_device *dev = lis3->bus_priv;
+	if (acpi_evaluate_object(dev->handle, METHOD_NAME__INI,
+				 NULL, NULL) != AE_OK)
+		return -EINVAL;
+
+	return 0;
 }
 
 /**
  * lis3lv02d_acpi_read - ACPI ALRD method: read a register
- * @handle: the handle of the device
+ * @lis3: pointer to the device struct
  * @reg:    the register to read
  * @ret:    result of the operation
  *
- * Returns AE_OK on success.
+ * Returns 0 on success.
  */
-acpi_status lis3lv02d_acpi_read(acpi_handle handle, int reg, u8 *ret)
+int lis3lv02d_acpi_read(struct lis3lv02d *lis3, int reg, u8 *ret)
 {
+	struct acpi_device *dev = lis3->bus_priv;
 	union acpi_object arg0 = { ACPI_TYPE_INTEGER };
 	struct acpi_object_list args = { 1, &arg0 };
 	unsigned long long lret;
@@ -111,21 +117,22 @@ acpi_status lis3lv02d_acpi_read(acpi_handle handle, int reg, u8 *ret)
 
 	arg0.integer.value = reg;
 
-	status = acpi_evaluate_integer(handle, "ALRD", &args, &lret);
+	status = acpi_evaluate_integer(dev->handle, "ALRD", &args, &lret);
 	*ret = lret;
-	return status;
+	return (status != AE_OK) ? -EINVAL : 0;
 }
 
 /**
  * lis3lv02d_acpi_write - ACPI ALWR method: write to a register
- * @handle: the handle of the device
+ * @lis3: pointer to the device struct
  * @reg:    the register to write to
  * @val:    the value to write
  *
- * Returns AE_OK on success.
+ * Returns 0 on success.
  */
-acpi_status lis3lv02d_acpi_write(acpi_handle handle, int reg, u8 val)
+int lis3lv02d_acpi_write(struct lis3lv02d *lis3, int reg, u8 val)
 {
+	struct acpi_device *dev = lis3->bus_priv;
 	unsigned long long ret; /* Not used when writting */
 	union acpi_object in_obj[2];
 	struct acpi_object_list args = { 2, in_obj };
@@ -135,7 +142,10 @@ acpi_status lis3lv02d_acpi_write(acpi_handle handle, int reg, u8 val)
 	in_obj[1].type          = ACPI_TYPE_INTEGER;
 	in_obj[1].integer.value = val;
 
-	return acpi_evaluate_integer(handle, "ALWR", &args, &ret);
+	if (acpi_evaluate_integer(dev->handle, "ALWR", &args, &ret) != AE_OK)
+		return -EINVAL;
+
+	return 0;
 }
 
 static int lis3lv02d_dmi_matched(const struct dmi_system_id *dmi)
@@ -217,7 +227,7 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = {
 
 static void hpled_set(struct delayed_led_classdev *led_cdev, enum led_brightness value)
 {
-	acpi_handle handle = lis3_dev.device->handle;
+	struct acpi_device *dev = lis3_dev.bus_priv;
 	unsigned long long ret; /* Not used when writing */
 	union acpi_object in_obj[1];
 	struct acpi_object_list args = { 1, in_obj };
@@ -225,7 +235,7 @@ static void hpled_set(struct delayed_led_classdev *led_cdev, enum led_brightness
 	in_obj[0].type          = ACPI_TYPE_INTEGER;
 	in_obj[0].integer.value = !!value;
 
-	acpi_evaluate_integer(handle, "ALED", &args, &ret);
+	acpi_evaluate_integer(dev->handle, "ALED", &args, &ret);
 }
 
 static struct delayed_led_classdev hpled_led = {
@@ -262,23 +272,6 @@ static void lis3lv02d_enum_resources(struct acpi_device *device)
 		printk(KERN_DEBUG DRIVER_NAME ": Error getting resources\n");
 }
 
-static s16 lis3lv02d_read_16(acpi_handle handle, int reg)
-{
-	u8 lo, hi;
-
-	lis3_dev.read(handle, reg - 1, &lo);
-	lis3_dev.read(handle, reg, &hi);
-	/* In "12 bit right justified" mode, bit 6, bit 7, bit 8 = bit 5 */
-	return (s16)((hi << 8) | lo);
-}
-
-static s16 lis3lv02d_read_8(acpi_handle handle, int reg)
-{
-	s8 lo;
-	lis3_dev.read(handle, reg, &lo);
-	return lo;
-}
-
 static int lis3lv02d_add(struct acpi_device *device)
 {
 	int ret;
@@ -286,7 +279,7 @@ static int lis3lv02d_add(struct acpi_device *device)
 	if (!device)
 		return -EINVAL;
 
-	lis3_dev.device = device;
+	lis3_dev.bus_priv = device;
 	lis3_dev.init = lis3lv02d_acpi_init;
 	lis3_dev.read = lis3lv02d_acpi_read;
 	lis3_dev.write = lis3lv02d_acpi_write;
@@ -294,23 +287,8 @@ static int lis3lv02d_add(struct acpi_device *device)
 	strcpy(acpi_device_class(device), ACPI_MDPS_CLASS);
 	device->driver_data = &lis3_dev;
 
-	lis3lv02d_acpi_read(device->handle, WHO_AM_I, &lis3_dev.whoami);
-	switch (lis3_dev.whoami) {
-	case LIS_DOUBLE_ID:
-		printk(KERN_INFO DRIVER_NAME ": 2-byte sensor found\n");
-		lis3_dev.read_data = lis3lv02d_read_16;
-		lis3_dev.mdps_max_val = 2048;
-		break;
-	case LIS_SINGLE_ID:
-		printk(KERN_INFO DRIVER_NAME ": 1-byte sensor found\n");
-		lis3_dev.read_data = lis3lv02d_read_8;
-		lis3_dev.mdps_max_val = 128;
-		break;
-	default:
-		printk(KERN_ERR DRIVER_NAME
-			": unknown sensor type 0x%X\n", lis3_dev.whoami);
-		return -EINVAL;
-	}
+	/* obtain IRQ number of our device from ACPI */
+	lis3lv02d_enum_resources(device);
 
 	/* If possible use a "standard" axes order */
 	if (dmi_check_system(lis3lv02d_dmi_ids) == 0) {
@@ -319,18 +297,17 @@ static int lis3lv02d_add(struct acpi_device *device)
 		lis3_dev.ac = lis3lv02d_axis_normal;
 	}
 
-	INIT_WORK(&hpled_led.work, delayed_set_status_worker);
-	ret = led_classdev_register(NULL, &hpled_led.led_classdev);
+	/* call the core layer do its init */
+	ret = lis3lv02d_init_device(&lis3_dev);
 	if (ret)
 		return ret;
 
-	/* obtain IRQ number of our device from ACPI */
-	lis3lv02d_enum_resources(lis3_dev.device);
-
-	ret = lis3lv02d_init_device(&lis3_dev);
+	INIT_WORK(&hpled_led.work, delayed_set_status_worker);
+	ret = led_classdev_register(NULL, &hpled_led.led_classdev);
 	if (ret) {
+		lis3lv02d_joystick_disable();
+		lis3lv02d_poweroff(&lis3_dev);
 		flush_work(&hpled_led.work);
-		led_classdev_unregister(&hpled_led.led_classdev);
 		return ret;
 	}
 
@@ -343,7 +320,7 @@ static int lis3lv02d_remove(struct acpi_device *device, int type)
 		return -EINVAL;
 
 	lis3lv02d_joystick_disable();
-	lis3lv02d_poweroff(device->handle);
+	lis3lv02d_poweroff(&lis3_dev);
 
 	flush_work(&hpled_led.work);
 	led_classdev_unregister(&hpled_led.led_classdev);
@@ -356,7 +333,7 @@ static int lis3lv02d_remove(struct acpi_device *device, int type)
 static int lis3lv02d_suspend(struct acpi_device *device, pm_message_t state)
 {
 	/* make sure the device is off when we suspend */
-	lis3lv02d_poweroff(device->handle);
+	lis3lv02d_poweroff(&lis3_dev);
 	return 0;
 }
 
@@ -365,9 +342,9 @@ static int lis3lv02d_resume(struct acpi_device *device)
 	/* put back the device in the right state (ACPI might turn it on) */
 	mutex_lock(&lis3_dev.lock);
 	if (lis3_dev.usage > 0)
-		lis3lv02d_poweron(device->handle);
+		lis3lv02d_poweron(&lis3_dev);
 	else
-		lis3lv02d_poweroff(device->handle);
+		lis3lv02d_poweroff(&lis3_dev);
 	mutex_unlock(&lis3_dev.lock);
 	return 0;
 }
diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c
index eeae7c9600e..778eb779598 100644
--- a/drivers/hwmon/lis3lv02d.c
+++ b/drivers/hwmon/lis3lv02d.c
@@ -36,7 +36,6 @@
 #include <linux/freezer.h>
 #include <linux/uaccess.h>
 #include <linux/miscdevice.h>
-#include <acpi/acpi_drivers.h>
 #include <asm/atomic.h>
 #include "lis3lv02d.h"
 
@@ -53,18 +52,27 @@
  * joystick.
  */
 
-struct acpi_lis3lv02d lis3_dev = {
+struct lis3lv02d lis3_dev = {
 	.misc_wait   = __WAIT_QUEUE_HEAD_INITIALIZER(lis3_dev.misc_wait),
 };
 
 EXPORT_SYMBOL_GPL(lis3_dev);
 
-static s16 lis3lv02d_read_16(acpi_handle handle, int reg)
+static s16 lis3lv02d_read_8(struct lis3lv02d *lis3, int reg)
+{
+	s8 lo;
+	if (lis3->read(lis3, reg, &lo) < 0)
+		return 0;
+
+	return lo;
+}
+
+static s16 lis3lv02d_read_16(struct lis3lv02d *lis3, int reg)
 {
 	u8 lo, hi;
 
-	lis3_dev.read(handle, reg, &lo);
-	lis3_dev.read(handle, reg + 1, &hi);
+	lis3->read(lis3, reg - 1, &lo);
+	lis3->read(lis3, reg, &hi);
 	/* In "12 bit right justified" mode, bit 6, bit 7, bit 8 = bit 5 */
 	return (s16)((hi << 8) | lo);
 }
@@ -86,36 +94,36 @@ static inline int lis3lv02d_get_axis(s8 axis, int hw_values[3])
 
 /**
  * lis3lv02d_get_xyz - Get X, Y and Z axis values from the accelerometer
- * @handle: the handle to the device
- * @x:      where to store the X axis value
- * @y:      where to store the Y axis value
- * @z:      where to store the Z axis value
+ * @lis3: pointer to the device struct
+ * @x:    where to store the X axis value
+ * @y:    where to store the Y axis value
+ * @z:    where to store the Z axis value
  *
  * Note that 40Hz input device can eat up about 10% CPU at 800MHZ
  */
-static void lis3lv02d_get_xyz(acpi_handle handle, int *x, int *y, int *z)
+static void lis3lv02d_get_xyz(struct lis3lv02d *lis3, int *x, int *y, int *z)
 {
 	int position[3];
 
-	position[0] = lis3_dev.read_data(handle, OUTX);
-	position[1] = lis3_dev.read_data(handle, OUTY);
-	position[2] = lis3_dev.read_data(handle, OUTZ);
+	position[0] = lis3_dev.read_data(lis3, OUTX);
+	position[1] = lis3_dev.read_data(lis3, OUTY);
+	position[2] = lis3_dev.read_data(lis3, OUTZ);
 
 	*x = lis3lv02d_get_axis(lis3_dev.ac.x, position);
 	*y = lis3lv02d_get_axis(lis3_dev.ac.y, position);
 	*z = lis3lv02d_get_axis(lis3_dev.ac.z, position);
 }
 
-void lis3lv02d_poweroff(acpi_handle handle)
+void lis3lv02d_poweroff(struct lis3lv02d *lis3)
 {
 	lis3_dev.is_on = 0;
 }
 EXPORT_SYMBOL_GPL(lis3lv02d_poweroff);
 
-void lis3lv02d_poweron(acpi_handle handle)
+void lis3lv02d_poweron(struct lis3lv02d *lis3)
 {
 	lis3_dev.is_on = 1;
-	lis3_dev.init(handle);
+	lis3_dev.init(lis3);
 }
 EXPORT_SYMBOL_GPL(lis3lv02d_poweron);
 
@@ -124,13 +132,13 @@ EXPORT_SYMBOL_GPL(lis3lv02d_poweron);
  * device will always be on until a call to lis3lv02d_decrease_use(). Not to be
  * used from interrupt context.
  */
-static void lis3lv02d_increase_use(struct acpi_lis3lv02d *dev)
+static void lis3lv02d_increase_use(struct lis3lv02d *dev)
 {
 	mutex_lock(&dev->lock);
 	dev->usage++;
 	if (dev->usage == 1) {
 		if (!dev->is_on)
-			lis3lv02d_poweron(dev->device->handle);
+			lis3lv02d_poweron(dev);
 	}
 	mutex_unlock(&dev->lock);
 }
@@ -139,12 +147,12 @@ static void lis3lv02d_increase_use(struct acpi_lis3lv02d *dev)
  * To be called whenever a usage of the device is stopped.
  * It will make sure to turn off the device when there is not usage.
  */
-static void lis3lv02d_decrease_use(struct acpi_lis3lv02d *dev)
+static void lis3lv02d_decrease_use(struct lis3lv02d *dev)
 {
 	mutex_lock(&dev->lock);
 	dev->usage--;
 	if (dev->usage == 0)
-		lis3lv02d_poweroff(dev->device->handle);
+		lis3lv02d_poweroff(dev);
 	mutex_unlock(&dev->lock);
 }
 
@@ -291,7 +299,7 @@ static int lis3lv02d_joystick_kthread(void *data)
 	int x, y, z;
 
 	while (!kthread_should_stop()) {
-		lis3lv02d_get_xyz(lis3_dev.device->handle, &x, &y, &z);
+		lis3lv02d_get_xyz(&lis3_dev, &x, &y, &z);
 		input_report_abs(lis3_dev.idev, ABS_X, x - lis3_dev.xcalib);
 		input_report_abs(lis3_dev.idev, ABS_Y, y - lis3_dev.ycalib);
 		input_report_abs(lis3_dev.idev, ABS_Z, z - lis3_dev.zcalib);
@@ -325,7 +333,8 @@ static void lis3lv02d_joystick_close(struct input_dev *input)
 
 static inline void lis3lv02d_calibrate_joystick(void)
 {
-	lis3lv02d_get_xyz(lis3_dev.device->handle, &lis3_dev.xcalib, &lis3_dev.ycalib, &lis3_dev.zcalib);
+	lis3lv02d_get_xyz(&lis3_dev,
+		&lis3_dev.xcalib, &lis3_dev.ycalib, &lis3_dev.zcalib);
 }
 
 int lis3lv02d_joystick_enable(void)
@@ -382,7 +391,7 @@ static ssize_t lis3lv02d_position_show(struct device *dev,
 	int x, y, z;
 
 	lis3lv02d_increase_use(&lis3_dev);
-	lis3lv02d_get_xyz(lis3_dev.device->handle, &x, &y, &z);
+	lis3lv02d_get_xyz(&lis3_dev, &x, &y, &z);
 	lis3lv02d_decrease_use(&lis3_dev);
 	return sprintf(buf, "(%d,%d,%d)\n", x, y, z);
 }
@@ -412,7 +421,7 @@ static ssize_t lis3lv02d_rate_show(struct device *dev,
 	int val;
 
 	lis3lv02d_increase_use(&lis3_dev);
-	lis3_dev.read(lis3_dev.device->handle, CTRL_REG1, &ctrl);
+	lis3_dev.read(&lis3_dev, CTRL_REG1, &ctrl);
 	lis3lv02d_decrease_use(&lis3_dev);
 	val = (ctrl & (CTRL1_DF0 | CTRL1_DF1)) >> 4;
 	return sprintf(buf, "%d\n", lis3lv02dl_df_val[val]);
@@ -435,7 +444,7 @@ static struct attribute_group lis3lv02d_attribute_group = {
 };
 
 
-static int lis3lv02d_add_fs(struct acpi_device *device)
+static int lis3lv02d_add_fs(struct lis3lv02d *lis3)
 {
 	lis3_dev.pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0);
 	if (IS_ERR(lis3_dev.pdev))
@@ -456,10 +465,29 @@ EXPORT_SYMBOL_GPL(lis3lv02d_remove_fs);
  * Initialise the accelerometer and the various subsystems.
  * Should be rather independant of the bus system.
  */
-int lis3lv02d_init_device(struct acpi_lis3lv02d *dev)
+int lis3lv02d_init_device(struct lis3lv02d *dev)
 {
+	dev->whoami = lis3lv02d_read_8(dev, WHO_AM_I);
+
+	switch (dev->whoami) {
+	case LIS_DOUBLE_ID:
+		printk(KERN_INFO DRIVER_NAME ": 2-byte sensor found\n");
+		dev->read_data = lis3lv02d_read_16;
+		dev->mdps_max_val = 2048;
+		break;
+	case LIS_SINGLE_ID:
+		printk(KERN_INFO DRIVER_NAME ": 1-byte sensor found\n");
+		dev->read_data = lis3lv02d_read_8;
+		dev->mdps_max_val = 128;
+		break;
+	default:
+		printk(KERN_ERR DRIVER_NAME
+			": unknown sensor type 0x%X\n", lis3_dev.whoami);
+		return -EINVAL;
+	}
+
 	mutex_init(&dev->lock);
-	lis3lv02d_add_fs(dev->device);
+	lis3lv02d_add_fs(dev);
 	lis3lv02d_increase_use(dev);
 
 	if (lis3lv02d_joystick_enable())
@@ -467,10 +495,10 @@ int lis3lv02d_init_device(struct acpi_lis3lv02d *dev)
 
 	printk("lis3_init_device: irq %d\n", dev->irq);
 
-	/* if we did not get an IRQ from ACPI - we have nothing more to do */
+	/* bail if we did not get an IRQ from the bus layer */
 	if (!dev->irq) {
 		printk(KERN_ERR DRIVER_NAME
-			": No IRQ in ACPI. Disabling /dev/freefall\n");
+			": No IRQ. Disabling /dev/freefall\n");
 		goto out;
 	}
 
diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h
index 017fb2b754d..745ec96806d 100644
--- a/drivers/hwmon/lis3lv02d.h
+++ b/drivers/hwmon/lis3lv02d.h
@@ -159,14 +159,14 @@ struct axis_conversion {
 	s8	z;
 };
 
-struct acpi_lis3lv02d {
-	struct acpi_device	*device;   /* The ACPI device */
-	acpi_status (*init) (acpi_handle handle);
-	acpi_status (*write) (acpi_handle handle, int reg, u8 val);
-	acpi_status (*read) (acpi_handle handle, int reg, u8 *ret);
+struct lis3lv02d {
+	void			*bus_priv; /* used by the bus layer only */
+	int (*init) (struct lis3lv02d *lis3);
+	int (*write) (struct lis3lv02d *lis3, int reg, u8 val);
+	int (*read) (struct lis3lv02d *lis3, int reg, u8 *ret);
 
 	u8			whoami;    /* 3Ah: 2-byte registries, 3Bh: 1-byte registries */
-	s16 (*read_data) (acpi_handle handle, int reg);
+	s16 (*read_data) (struct lis3lv02d *lis3, int reg);
 	int			mdps_max_val;
 
 	struct input_dev	*idev;     /* input device */
@@ -187,11 +187,11 @@ struct acpi_lis3lv02d {
 	unsigned long		misc_opened; /* bit0: whether the device is open */
 };
 
-int lis3lv02d_init_device(struct acpi_lis3lv02d *dev);
+int lis3lv02d_init_device(struct lis3lv02d *lis3);
 int lis3lv02d_joystick_enable(void);
 void lis3lv02d_joystick_disable(void);
-void lis3lv02d_poweroff(acpi_handle handle);
-void lis3lv02d_poweron(acpi_handle handle);
+void lis3lv02d_poweroff(struct lis3lv02d *lis3);
+void lis3lv02d_poweron(struct lis3lv02d *lis3);
 int lis3lv02d_remove_fs(void);
 
-extern struct acpi_lis3lv02d lis3_dev;
+extern struct lis3lv02d lis3_dev;
-- 
cgit v1.2.3


From bb233fdfc7b7cefe45bfa2e8d1b24e79c60a48e5 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Tue, 31 Mar 2009 15:24:33 -0700
Subject: lis3: SPI transport layer

Make use of the new abstraction layer and add a new transport layer for
spi.  Works fine on a PXA based board.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Eric Piel <eric.piel@tremplin-utc.net>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hwmon/Kconfig         |  16 ++++++
 drivers/hwmon/Makefile        |   1 +
 drivers/hwmon/lis3lv02d_spi.c | 114 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 131 insertions(+)
 create mode 100644 drivers/hwmon/lis3lv02d_spi.c

diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index bc6810c22dd..ce52bf2f235 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -932,6 +932,22 @@ config SENSORS_LIS3LV02D
 	  Say Y here if you have an applicable laptop and want to experience
 	  the awesome power of lis3lv02d.
 
+config SENSORS_LIS3_SPI
+	tristate "STMicroeletronics LIS3LV02Dx three-axis digital accelerometer (SPI)"
+	depends on !ACPI && SPI_MASTER && INPUT
+	default n
+	help
+	  This driver provides support for the LIS3LV02Dx accelerometer connected
+	  via SPI. The accelerometer data is readable via
+	  /sys/devices/platform/lis3lv02d.
+
+	  This driver also provides an absolute input class device, allowing
+	  the laptop to act as a pinball machine-esque joystick.
+
+	  This driver can also be built as modules.  If so, the core module
+	  will be called lis3lv02d and a specific module for the SPI transport
+	  is called lis3lv02d_spi.
+
 config SENSORS_APPLESMC
 	tristate "Apple SMC (Motion sensor, light sensor, keyboard backlight)"
 	depends on INPUT && X86
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index 4da261d16ab..3a6b1f06f8f 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_SENSORS_IBMPEX)	+= ibmpex.o
 obj-$(CONFIG_SENSORS_IT87)	+= it87.o
 obj-$(CONFIG_SENSORS_K8TEMP)	+= k8temp.o
 obj-$(CONFIG_SENSORS_LIS3LV02D) += lis3lv02d.o hp_accel.o
+obj-$(CONFIG_SENSORS_LIS3_SPI)	+= lis3lv02d.o lis3lv02d_spi.o
 obj-$(CONFIG_SENSORS_LM63)	+= lm63.o
 obj-$(CONFIG_SENSORS_LM70)	+= lm70.o
 obj-$(CONFIG_SENSORS_LM75)	+= lm75.o
diff --git a/drivers/hwmon/lis3lv02d_spi.c b/drivers/hwmon/lis3lv02d_spi.c
new file mode 100644
index 00000000000..07ae74b0e19
--- /dev/null
+++ b/drivers/hwmon/lis3lv02d_spi.c
@@ -0,0 +1,114 @@
+/*
+ * lis3lv02d_spi - SPI glue layer for lis3lv02d
+ *
+ * Copyright (c) 2009 Daniel Mack <daniel@caiaq.de>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  publishhed by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/workqueue.h>
+#include <linux/spi/spi.h>
+
+#include "lis3lv02d.h"
+
+#define DRV_NAME 	"lis3lv02d_spi"
+#define LIS3_SPI_READ	0x80
+
+static int lis3_spi_read(struct lis3lv02d *lis3, int reg, u8 *v)
+{
+	struct spi_device *spi = lis3->bus_priv;
+	int ret = spi_w8r8(spi, reg | LIS3_SPI_READ);
+	if (ret < 0)
+		return -EINVAL;
+
+	*v = (u8) ret;
+	return 0;
+}
+
+static int lis3_spi_write(struct lis3lv02d *lis3, int reg, u8 val)
+{
+	u8 tmp[2] = { reg, val };
+	struct spi_device *spi = lis3->bus_priv;
+	return spi_write(spi, tmp, sizeof(tmp));
+}
+
+static int lis3_spi_init(struct lis3lv02d *lis3)
+{
+	u8 reg;
+	int ret;
+
+	/* power up the device */
+	ret = lis3->read(lis3, CTRL_REG1, &reg);
+	if (ret < 0)
+		return ret;
+
+	reg |= CTRL1_PD0;
+	return lis3->write(lis3, CTRL_REG1, reg);
+}
+
+static struct axis_conversion lis3lv02d_axis_normal = { 1, 2, 3 };
+
+static int __devinit lis302dl_spi_probe(struct spi_device *spi)
+{
+	int ret;
+
+	spi->bits_per_word = 8;
+	spi->mode = SPI_MODE_0;
+	ret = spi_setup(spi);
+	if (ret < 0)
+		return ret;
+
+	lis3_dev.bus_priv = spi;
+	lis3_dev.init = lis3_spi_init;
+	lis3_dev.read = lis3_spi_read;
+	lis3_dev.write = lis3_spi_write;
+	lis3_dev.irq = spi->irq;
+	lis3_dev.ac = lis3lv02d_axis_normal;
+	spi_set_drvdata(spi, &lis3_dev);
+
+	ret = lis3lv02d_init_device(&lis3_dev);
+	return ret;
+}
+
+static int __devexit lis302dl_spi_remove(struct spi_device *spi)
+{
+	struct lis3lv02d *lis3 = spi_get_drvdata(spi);
+	lis3lv02d_joystick_disable();
+	lis3lv02d_poweroff(lis3);
+	return 0;
+}
+
+static struct spi_driver lis302dl_spi_driver = {
+	.driver	 = {
+		.name   = DRV_NAME,
+		.owner  = THIS_MODULE,
+	},
+	.probe	= lis302dl_spi_probe,
+	.remove	= __devexit_p(lis302dl_spi_remove),
+};
+
+static int __init lis302dl_init(void)
+{
+	return spi_register_driver(&lis302dl_spi_driver);
+}
+
+static void __exit lis302dl_exit(void)
+{
+	spi_unregister_driver(&lis302dl_spi_driver);
+}
+
+module_init(lis302dl_init);
+module_exit(lis302dl_exit);
+
+MODULE_AUTHOR("Daniel Mack <daniel@caiaq.de>");
+MODULE_DESCRIPTION("lis3lv02d SPI glue layer");
+MODULE_LICENSE("GPL");
+
-- 
cgit v1.2.3


From c3b1b1cbf002e65a3cabd479e68b5f35886a26db Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Tue, 31 Mar 2009 15:24:34 -0700
Subject: ramfs: add support for "mode=" mount option

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=12843

"I use ramfs instead of tmpfs for /tmp because I don't use swap on my
laptop.  Some apps need 1777 mode for /tmp directory, but ramfs does not
support 'mode=' mount option."

Reported-by: Avan Anishchuk <matimatik@gmail.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ramfs/inode.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 86 insertions(+), 8 deletions(-)

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b7e6ac706b8..a404fb88e45 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -33,12 +33,15 @@
 #include <linux/backing-dev.h>
 #include <linux/ramfs.h>
 #include <linux/sched.h>
+#include <linux/parser.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
 /* some random number */
 #define RAMFS_MAGIC	0x858458f6
 
+#define RAMFS_DEFAULT_MODE	0755
+
 static const struct super_operations ramfs_ops;
 static const struct inode_operations ramfs_dir_inode_operations;
 
@@ -158,12 +161,75 @@ static const struct inode_operations ramfs_dir_inode_operations = {
 static const struct super_operations ramfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
+	.show_options	= generic_show_options,
+};
+
+struct ramfs_mount_opts {
+	umode_t mode;
+};
+
+enum {
+	Opt_mode,
+	Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_mode, "mode=%o"},
+	{Opt_err, NULL}
+};
+
+struct ramfs_fs_info {
+	struct ramfs_mount_opts mount_opts;
 };
 
+static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
+{
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	int token;
+	char *p;
+
+	opts->mode = RAMFS_DEFAULT_MODE;
+
+	while ((p = strsep(&data, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->mode = option & S_IALLUGO;
+			break;
+		default:
+			printk(KERN_ERR "ramfs: bad mount option: %s\n", p);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 {
-	struct inode * inode;
-	struct dentry * root;
+	struct ramfs_fs_info *fsi;
+	struct inode *inode = NULL;
+	struct dentry *root;
+	int err;
+
+	save_mount_options(sb, data);
+
+	fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
+	if (!fsi) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	sb->s_fs_info = fsi;
+
+	err = ramfs_parse_options(data, &fsi->mount_opts);
+	if (err)
+		goto fail;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -171,17 +237,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	sb->s_magic = RAMFS_MAGIC;
 	sb->s_op = &ramfs_ops;
 	sb->s_time_gran = 1;
-	inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0);
-	if (!inode)
-		return -ENOMEM;
+	inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
+	if (!inode) {
+		err = -ENOMEM;
+		goto fail;
+	}
 
 	root = d_alloc_root(inode);
 	if (!root) {
-		iput(inode);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto fail;
 	}
 	sb->s_root = root;
 	return 0;
+fail:
+	kfree(fsi);
+	iput(inode);
+	return err;
 }
 
 int ramfs_get_sb(struct file_system_type *fs_type,
@@ -197,10 +269,16 @@ static int rootfs_get_sb(struct file_system_type *fs_type,
 			    mnt);
 }
 
+static void ramfs_kill_sb(struct super_block *sb)
+{
+	kfree(sb->s_fs_info);
+	kill_litter_super(sb);
+}
+
 static struct file_system_type ramfs_fs_type = {
 	.name		= "ramfs",
 	.get_sb		= ramfs_get_sb,
-	.kill_sb	= kill_litter_super,
+	.kill_sb	= ramfs_kill_sb,
 };
 static struct file_system_type rootfs_fs_type = {
 	.name		= "rootfs",
-- 
cgit v1.2.3


From 34c8a20c6ee6af25ee35da9ca15ba81faacfc73d Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 31 Mar 2009 15:24:35 -0700
Subject: spi_mpc83xx: fix sparse warnings

The patch fixes following sparse warnings:

  CHECK   spi_mpc83xx.c
spi_mpc83xx.c:145:1: warning: symbol 'mpc83xx_spi_rx_buf_u8' was not declared. Should it be static?
spi_mpc83xx.c:146:1: warning: symbol 'mpc83xx_spi_rx_buf_u16' was not declared. Should it be static?
spi_mpc83xx.c:147:1: warning: symbol 'mpc83xx_spi_rx_buf_u32' was not declared. Should it be static?
spi_mpc83xx.c:148:1: warning: symbol 'mpc83xx_spi_tx_buf_u8' was not declared. Should it be static?
spi_mpc83xx.c:149:1: warning: symbol 'mpc83xx_spi_tx_buf_u16' was not declared. Should it be static?
spi_mpc83xx.c:150:1: warning: symbol 'mpc83xx_spi_tx_buf_u32' was not declared. Should it be static?
spi_mpc83xx.c:175:32: warning: incorrect type in initializer (different address spaces)
spi_mpc83xx.c:175:32:    expected void *tmp_ptr
spi_mpc83xx.c:175:32:    got unsigned int [noderef] <asn:2>*<noident>
spi_mpc83xx.c:183:26: warning: incorrect type in argument 1 (different address spaces)
spi_mpc83xx.c:183:26:    expected unsigned int [noderef] [usertype] <asn:2>*reg
spi_mpc83xx.c:183:26:    got void *tmp_ptr
spi_mpc83xx.c:184:26: warning: incorrect type in argument 1 (different address spaces)
spi_mpc83xx.c:184:26:    expected unsigned int [noderef] [usertype] <asn:2>*reg
spi_mpc83xx.c:184:26:    got void *tmp_ptr
spi_mpc83xx.c:287:31: warning: incorrect type in initializer (different address spaces)
spi_mpc83xx.c:287:31:    expected void *tmp_ptr
spi_mpc83xx.c:287:31:    got unsigned int [noderef] <asn:2>*<noident>
spi_mpc83xx.c:295:25: warning: incorrect type in argument 1 (different address spaces)
spi_mpc83xx.c:295:25:    expected unsigned int [noderef] [usertype] <asn:2>*reg
spi_mpc83xx.c:295:25:    got void *tmp_ptr
spi_mpc83xx.c:296:25: warning: incorrect type in argument 1 (different address spaces)
spi_mpc83xx.c:296:25:    expected unsigned int [noderef] [usertype] <asn:2>*reg
spi_mpc83xx.c:296:25:    got void *tmp_ptr
spi_mpc83xx.c:486:13: warning: symbol 'mpc83xx_spi_irq' was not declared. Should it be static?

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Cc: David Brownell <david-b@pacbell.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Kumar Gala <galak@gate.crashing.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/spi/spi_mpc83xx.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c
index 44a2b46ccb7..df642002900 100644
--- a/drivers/spi/spi_mpc83xx.c
+++ b/drivers/spi/spi_mpc83xx.c
@@ -123,6 +123,7 @@ static inline u32 mpc83xx_spi_read_reg(__be32 __iomem * reg)
 }
 
 #define MPC83XX_SPI_RX_BUF(type) 					  \
+static									  \
 void mpc83xx_spi_rx_buf_##type(u32 data, struct mpc83xx_spi *mpc83xx_spi) \
 {									  \
 	type * rx = mpc83xx_spi->rx;					  \
@@ -131,6 +132,7 @@ void mpc83xx_spi_rx_buf_##type(u32 data, struct mpc83xx_spi *mpc83xx_spi) \
 }
 
 #define MPC83XX_SPI_TX_BUF(type)				\
+static								\
 u32 mpc83xx_spi_tx_buf_##type(struct mpc83xx_spi *mpc83xx_spi)	\
 {								\
 	u32 data;						\
@@ -172,7 +174,7 @@ static void mpc83xx_spi_chipselect(struct spi_device *spi, int value)
 
 		if (cs->hw_mode != regval) {
 			unsigned long flags;
-			void *tmp_ptr = &mpc83xx_spi->base->mode;
+			__be32 __iomem *mode = &mpc83xx_spi->base->mode;
 
 			regval = cs->hw_mode;
 			/* Turn off IRQs locally to minimize time that
@@ -180,8 +182,8 @@ static void mpc83xx_spi_chipselect(struct spi_device *spi, int value)
 			 */
 			local_irq_save(flags);
 			/* Turn off SPI unit prior changing mode */
-			mpc83xx_spi_write_reg(tmp_ptr, regval & ~SPMODE_ENABLE);
-			mpc83xx_spi_write_reg(tmp_ptr, regval);
+			mpc83xx_spi_write_reg(mode, regval & ~SPMODE_ENABLE);
+			mpc83xx_spi_write_reg(mode, regval);
 			local_irq_restore(flags);
 		}
 		if (mpc83xx_spi->activate_cs)
@@ -284,7 +286,7 @@ int mpc83xx_spi_setup_transfer(struct spi_device *spi, struct spi_transfer *t)
 	regval =  mpc83xx_spi_read_reg(&mpc83xx_spi->base->mode);
 	if (cs->hw_mode != regval) {
 		unsigned long flags;
-		void *tmp_ptr = &mpc83xx_spi->base->mode;
+		__be32 __iomem *mode = &mpc83xx_spi->base->mode;
 
 		regval = cs->hw_mode;
 		/* Turn off IRQs locally to minimize time
@@ -292,8 +294,8 @@ int mpc83xx_spi_setup_transfer(struct spi_device *spi, struct spi_transfer *t)
 		 */
 		local_irq_save(flags);
 		/* Turn off SPI unit prior changing mode */
-		mpc83xx_spi_write_reg(tmp_ptr, regval & ~SPMODE_ENABLE);
-		mpc83xx_spi_write_reg(tmp_ptr, regval);
+		mpc83xx_spi_write_reg(mode, regval & ~SPMODE_ENABLE);
+		mpc83xx_spi_write_reg(mode, regval);
 		local_irq_restore(flags);
 	}
 	return 0;
@@ -483,7 +485,7 @@ static int mpc83xx_spi_setup(struct spi_device *spi)
 	return 0;
 }
 
-irqreturn_t mpc83xx_spi_irq(s32 irq, void *context_data)
+static irqreturn_t mpc83xx_spi_irq(s32 irq, void *context_data)
 {
 	struct mpc83xx_spi *mpc83xx_spi = context_data;
 	u32 event;
-- 
cgit v1.2.3


From 364fdbc00fbdd409ade63500710123fe323aa164 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 31 Mar 2009 15:24:36 -0700
Subject: spi_mpc83xx: rework chip selects handling

The main purpose of this patch is to pass 'struct spi_device' to the chip
select handling routines.  This is needed so that we could implement
full-fledged OpenFirmware support for this driver.

While at it, also:
- Replace two {de,activate}_cs routines by single cs_contol().
- Don't duplicate platform data callbacks in mpc83xx_spi struct.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Cc: David Brownell <david-b@pacbell.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Kumar Gala <galak@gate.crashing.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/platforms/83xx/mpc832x_rdb.c | 16 ++++------------
 arch/powerpc/sysdev/fsl_soc.c             | 14 ++++++--------
 arch/powerpc/sysdev/fsl_soc.h             |  5 +++--
 drivers/spi/spi_mpc83xx.c                 | 20 +++++++-------------
 include/linux/fsl_devices.h               |  5 +++--
 5 files changed, 23 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
index 2a1295f1983..28e23cde64a 100644
--- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c
+++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
@@ -39,16 +39,10 @@
 #endif
 
 #ifdef CONFIG_QUICC_ENGINE
-static void mpc83xx_spi_activate_cs(u8 cs, u8 polarity)
+static void mpc83xx_spi_cs_control(struct spi_device *spi, bool on)
 {
-	pr_debug("%s %d %d\n", __func__, cs, polarity);
-	par_io_data_set(3, 13, polarity);
-}
-
-static void mpc83xx_spi_deactivate_cs(u8 cs, u8 polarity)
-{
-	pr_debug("%s %d %d\n", __func__, cs, polarity);
-	par_io_data_set(3, 13, !polarity);
+	pr_debug("%s %d %d\n", __func__, spi->chip_select, on);
+	par_io_data_set(3, 13, on);
 }
 
 static struct mmc_spi_platform_data mpc832x_mmc_pdata = {
@@ -74,9 +68,7 @@ static int __init mpc832x_spi_init(void)
 	par_io_config_pin(3, 14, 2, 0, 0, 0); /* SD_INSERT, I */
 	par_io_config_pin(3, 15, 2, 0, 0, 0); /* SD_PROTECT,I */
 
-	return fsl_spi_init(&mpc832x_spi_boardinfo, 1,
-			    mpc83xx_spi_activate_cs,
-			    mpc83xx_spi_deactivate_cs);
+	return fsl_spi_init(&mpc832x_spi_boardinfo, 1, mpc83xx_spi_cs_control);
 }
 machine_device_initcall(mpc832x_rdb, mpc832x_spi_init);
 #endif /* CONFIG_QUICC_ENGINE */
diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c
index a01c89d3f9b..a46c1c86793 100644
--- a/arch/powerpc/sysdev/fsl_soc.c
+++ b/arch/powerpc/sysdev/fsl_soc.c
@@ -420,8 +420,8 @@ arch_initcall(fsl_usb_of_init);
 static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk,
 				   struct spi_board_info *board_infos,
 				   unsigned int num_board_infos,
-				   void (*activate_cs)(u8 cs, u8 polarity),
-				   void (*deactivate_cs)(u8 cs, u8 polarity))
+				   void (*cs_control)(struct spi_device *dev,
+						      bool on))
 {
 	struct device_node *np;
 	unsigned int i = 0;
@@ -433,8 +433,7 @@ static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk,
 		struct resource res[2];
 		struct platform_device *pdev;
 		struct fsl_spi_platform_data pdata = {
-			.activate_cs = activate_cs,
-			.deactivate_cs = deactivate_cs,
+			.cs_control = cs_control,
 		};
 
 		memset(res, 0, sizeof(res));
@@ -501,8 +500,7 @@ next:
 
 int __init fsl_spi_init(struct spi_board_info *board_infos,
 			unsigned int num_board_infos,
-			void (*activate_cs)(u8 cs, u8 polarity),
-			void (*deactivate_cs)(u8 cs, u8 polarity))
+			void (*cs_control)(struct spi_device *spi, bool on))
 {
 	u32 sysclk = -1;
 	int ret;
@@ -518,10 +516,10 @@ int __init fsl_spi_init(struct spi_board_info *board_infos,
 	}
 
 	ret = of_fsl_spi_probe(NULL, "fsl,spi", sysclk, board_infos,
-			       num_board_infos, activate_cs, deactivate_cs);
+			       num_board_infos, cs_control);
 	if (!ret)
 		of_fsl_spi_probe("spi", "fsl_spi", sysclk, board_infos,
-				 num_board_infos, activate_cs, deactivate_cs);
+				 num_board_infos, cs_control);
 
 	return spi_register_board_info(board_infos, num_board_infos);
 }
diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h
index 9c744e4285a..b5f3456780b 100644
--- a/arch/powerpc/sysdev/fsl_soc.h
+++ b/arch/powerpc/sysdev/fsl_soc.h
@@ -4,6 +4,8 @@
 
 #include <asm/mmu.h>
 
+struct spi_device;
+
 extern phys_addr_t get_immrbase(void);
 #if defined(CONFIG_CPM2) || defined(CONFIG_QUICC_ENGINE) || defined(CONFIG_8xx)
 extern u32 get_brgfreq(void);
@@ -19,8 +21,7 @@ struct device_node;
 
 extern int fsl_spi_init(struct spi_board_info *board_infos,
 			unsigned int num_board_infos,
-			void (*activate_cs)(u8 cs, u8 polarity),
-			void (*deactivate_cs)(u8 cs, u8 polarity));
+			void (*cs_control)(struct spi_device *spi, bool on));
 
 extern void fsl_rstcr_restart(char *cmd);
 
diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c
index df642002900..b95085a46f9 100644
--- a/drivers/spi/spi_mpc83xx.c
+++ b/drivers/spi/spi_mpc83xx.c
@@ -89,9 +89,6 @@ struct mpc83xx_spi {
 
 	bool qe_mode;
 
-	void (*activate_cs) (u8 cs, u8 polarity);
-	void (*deactivate_cs) (u8 cs, u8 polarity);
-
 	u8 busy;
 
 	struct workqueue_struct *workqueue;
@@ -153,15 +150,14 @@ MPC83XX_SPI_TX_BUF(u32)
 
 static void mpc83xx_spi_chipselect(struct spi_device *spi, int value)
 {
-	struct mpc83xx_spi *mpc83xx_spi;
-	u8 pol = spi->mode & SPI_CS_HIGH ? 1 : 0;
+	struct mpc83xx_spi *mpc83xx_spi = spi_master_get_devdata(spi->master);
+	struct fsl_spi_platform_data *pdata = spi->dev.parent->platform_data;
+	bool pol = spi->mode & SPI_CS_HIGH;
 	struct spi_mpc83xx_cs	*cs = spi->controller_state;
 
-	mpc83xx_spi = spi_master_get_devdata(spi->master);
-
 	if (value == BITBANG_CS_INACTIVE) {
-		if (mpc83xx_spi->deactivate_cs)
-			mpc83xx_spi->deactivate_cs(spi->chip_select, pol);
+		if (pdata->cs_control)
+			pdata->cs_control(spi, !pol);
 	}
 
 	if (value == BITBANG_CS_ACTIVE) {
@@ -186,8 +182,8 @@ static void mpc83xx_spi_chipselect(struct spi_device *spi, int value)
 			mpc83xx_spi_write_reg(mode, regval);
 			local_irq_restore(flags);
 		}
-		if (mpc83xx_spi->activate_cs)
-			mpc83xx_spi->activate_cs(spi->chip_select, pol);
+		if (pdata->cs_control)
+			pdata->cs_control(spi, pol);
 	}
 }
 
@@ -582,8 +578,6 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev)
 	master->cleanup = mpc83xx_spi_cleanup;
 
 	mpc83xx_spi = spi_master_get_devdata(master);
-	mpc83xx_spi->activate_cs = pdata->activate_cs;
-	mpc83xx_spi->deactivate_cs = pdata->deactivate_cs;
 	mpc83xx_spi->qe_mode = pdata->qe_mode;
 	mpc83xx_spi->get_rx = mpc83xx_spi_rx_buf_u8;
 	mpc83xx_spi->get_tx = mpc83xx_spi_tx_buf_u8;
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index d9051d717d2..7bc1b643d37 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -95,14 +95,15 @@ struct fsl_usb2_platform_data {
 #define FSL_USB2_PORT0_ENABLED	0x00000001
 #define FSL_USB2_PORT1_ENABLED	0x00000002
 
+struct spi_device;
+
 struct fsl_spi_platform_data {
 	u32 	initial_spmode;	/* initial SPMODE value */
 	u16	bus_num;
 	bool	qe_mode;
 	/* board specific information */
 	u16	max_chipselect;
-	void	(*activate_cs)(u8 cs, u8 polarity);
-	void	(*deactivate_cs)(u8 cs, u8 polarity);
+	void	(*cs_control)(struct spi_device *spi, bool on);
 	u32	sysclk;
 };
 
-- 
cgit v1.2.3


From 35b4b3c0c1265f1a7342574be393c157601401f0 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 31 Mar 2009 15:24:37 -0700
Subject: spi_mpc83xx: add OF platform driver bindings

Implement full support for OF SPI bindings.  Now the driver can manage its
own chip selects without any help from the board files and/or fsl_soc
constructors.

The "legacy" code is well isolated and could be removed as time goes by.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Cc: David Brownell <david-b@pacbell.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Kumar Gala <galak@gate.crashing.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/spi/spi_mpc83xx.c   | 330 +++++++++++++++++++++++++++++++++++++++-----
 include/linux/fsl_devices.h |   2 +-
 2 files changed, 295 insertions(+), 37 deletions(-)

diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c
index b95085a46f9..f4573a96af2 100644
--- a/drivers/spi/spi_mpc83xx.c
+++ b/drivers/spi/spi_mpc83xx.c
@@ -14,6 +14,8 @@
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/err.h>
 #include <linux/completion.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
@@ -23,7 +25,13 @@
 #include <linux/spi/spi_bitbang.h>
 #include <linux/platform_device.h>
 #include <linux/fsl_devices.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/gpio.h>
+#include <linux/of_gpio.h>
+#include <linux/of_spi.h>
 
+#include <sysdev/fsl_soc.h>
 #include <asm/irq.h>
 #include <asm/io.h>
 
@@ -79,7 +87,7 @@ struct mpc83xx_spi {
 	u32(*get_tx) (struct mpc83xx_spi *);
 
 	unsigned int count;
-	int irq;
+	unsigned int irq;
 
 	unsigned nsecs;		/* (clock cycle time)/2 */
 
@@ -543,36 +551,23 @@ static void mpc83xx_spi_cleanup(struct spi_device *spi)
 	kfree(spi->controller_state);
 }
 
-static int __init mpc83xx_spi_probe(struct platform_device *dev)
+static struct spi_master * __devinit
+mpc83xx_spi_probe(struct device *dev, struct resource *mem, unsigned int irq)
 {
+	struct fsl_spi_platform_data *pdata = dev->platform_data;
 	struct spi_master *master;
 	struct mpc83xx_spi *mpc83xx_spi;
-	struct fsl_spi_platform_data *pdata;
-	struct resource *r;
 	u32 regval;
 	int ret = 0;
 
-	/* Get resources(memory, IRQ) associated with the device */
-	master = spi_alloc_master(&dev->dev, sizeof(struct mpc83xx_spi));
-
+	master = spi_alloc_master(dev, sizeof(struct mpc83xx_spi));
 	if (master == NULL) {
 		ret = -ENOMEM;
 		goto err;
 	}
 
-	platform_set_drvdata(dev, master);
-	pdata = dev->dev.platform_data;
-
-	if (pdata == NULL) {
-		ret = -ENODEV;
-		goto free_master;
-	}
+	dev_set_drvdata(dev, master);
 
-	r = platform_get_resource(dev, IORESOURCE_MEM, 0);
-	if (r == NULL) {
-		ret = -ENODEV;
-		goto free_master;
-	}
 	master->setup = mpc83xx_spi_setup;
 	master->transfer = mpc83xx_spi_transfer;
 	master->cleanup = mpc83xx_spi_cleanup;
@@ -592,18 +587,13 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev)
 
 	init_completion(&mpc83xx_spi->done);
 
-	mpc83xx_spi->base = ioremap(r->start, r->end - r->start + 1);
+	mpc83xx_spi->base = ioremap(mem->start, mem->end - mem->start + 1);
 	if (mpc83xx_spi->base == NULL) {
 		ret = -ENOMEM;
 		goto put_master;
 	}
 
-	mpc83xx_spi->irq = platform_get_irq(dev, 0);
-
-	if (mpc83xx_spi->irq < 0) {
-		ret = -ENXIO;
-		goto unmap_io;
-	}
+	mpc83xx_spi->irq = irq;
 
 	/* Register for SPI Interrupt */
 	ret = request_irq(mpc83xx_spi->irq, mpc83xx_spi_irq,
@@ -645,9 +635,9 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev)
 
 	printk(KERN_INFO
 	       "%s: MPC83xx SPI Controller driver at 0x%p (irq = %d)\n",
-	       dev_name(&dev->dev), mpc83xx_spi->base, mpc83xx_spi->irq);
+	       dev_name(dev), mpc83xx_spi->base, mpc83xx_spi->irq);
 
-	return ret;
+	return master;
 
 unreg_master:
 	destroy_workqueue(mpc83xx_spi->workqueue);
@@ -657,18 +647,16 @@ unmap_io:
 	iounmap(mpc83xx_spi->base);
 put_master:
 	spi_master_put(master);
-free_master:
-	kfree(master);
 err:
-	return ret;
+	return ERR_PTR(ret);
 }
 
-static int __exit mpc83xx_spi_remove(struct platform_device *dev)
+static int __devexit mpc83xx_spi_remove(struct device *dev)
 {
 	struct mpc83xx_spi *mpc83xx_spi;
 	struct spi_master *master;
 
-	master = platform_get_drvdata(dev);
+	master = dev_get_drvdata(dev);
 	mpc83xx_spi = spi_master_get_devdata(master);
 
 	flush_workqueue(mpc83xx_spi->workqueue);
@@ -681,23 +669,293 @@ static int __exit mpc83xx_spi_remove(struct platform_device *dev)
 	return 0;
 }
 
+struct mpc83xx_spi_probe_info {
+	struct fsl_spi_platform_data pdata;
+	int *gpios;
+	bool *alow_flags;
+};
+
+static struct mpc83xx_spi_probe_info *
+to_of_pinfo(struct fsl_spi_platform_data *pdata)
+{
+	return container_of(pdata, struct mpc83xx_spi_probe_info, pdata);
+}
+
+static void mpc83xx_spi_cs_control(struct spi_device *spi, bool on)
+{
+	struct device *dev = spi->dev.parent;
+	struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(dev->platform_data);
+	u16 cs = spi->chip_select;
+	int gpio = pinfo->gpios[cs];
+	bool alow = pinfo->alow_flags[cs];
+
+	gpio_set_value(gpio, on ^ alow);
+}
+
+static int of_mpc83xx_spi_get_chipselects(struct device *dev)
+{
+	struct device_node *np = dev_archdata_get_node(&dev->archdata);
+	struct fsl_spi_platform_data *pdata = dev->platform_data;
+	struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(pdata);
+	unsigned int ngpios;
+	int i = 0;
+	int ret;
+
+	ngpios = of_gpio_count(np);
+	if (!ngpios) {
+		/*
+		 * SPI w/o chip-select line. One SPI device is still permitted
+		 * though.
+		 */
+		pdata->max_chipselect = 1;
+		return 0;
+	}
+
+	pinfo->gpios = kmalloc(ngpios * sizeof(pinfo->gpios), GFP_KERNEL);
+	if (!pinfo->gpios)
+		return -ENOMEM;
+	memset(pinfo->gpios, -1, ngpios * sizeof(pinfo->gpios));
+
+	pinfo->alow_flags = kzalloc(ngpios * sizeof(pinfo->alow_flags),
+				    GFP_KERNEL);
+	if (!pinfo->alow_flags) {
+		ret = -ENOMEM;
+		goto err_alloc_flags;
+	}
+
+	for (; i < ngpios; i++) {
+		int gpio;
+		enum of_gpio_flags flags;
+
+		gpio = of_get_gpio_flags(np, i, &flags);
+		if (!gpio_is_valid(gpio)) {
+			dev_err(dev, "invalid gpio #%d: %d\n", i, gpio);
+			goto err_loop;
+		}
+
+		ret = gpio_request(gpio, dev_name(dev));
+		if (ret) {
+			dev_err(dev, "can't request gpio #%d: %d\n", i, ret);
+			goto err_loop;
+		}
+
+		pinfo->gpios[i] = gpio;
+		pinfo->alow_flags[i] = flags & OF_GPIO_ACTIVE_LOW;
+
+		ret = gpio_direction_output(pinfo->gpios[i],
+					    pinfo->alow_flags[i]);
+		if (ret) {
+			dev_err(dev, "can't set output direction for gpio "
+				"#%d: %d\n", i, ret);
+			goto err_loop;
+		}
+	}
+
+	pdata->max_chipselect = ngpios;
+	pdata->cs_control = mpc83xx_spi_cs_control;
+
+	return 0;
+
+err_loop:
+	while (i >= 0) {
+		if (gpio_is_valid(pinfo->gpios[i]))
+			gpio_free(pinfo->gpios[i]);
+		i--;
+	}
+
+	kfree(pinfo->alow_flags);
+	pinfo->alow_flags = NULL;
+err_alloc_flags:
+	kfree(pinfo->gpios);
+	pinfo->gpios = NULL;
+	return ret;
+}
+
+static int of_mpc83xx_spi_free_chipselects(struct device *dev)
+{
+	struct fsl_spi_platform_data *pdata = dev->platform_data;
+	struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(pdata);
+	int i;
+
+	if (!pinfo->gpios)
+		return 0;
+
+	for (i = 0; i < pdata->max_chipselect; i++) {
+		if (gpio_is_valid(pinfo->gpios[i]))
+			gpio_free(pinfo->gpios[i]);
+	}
+
+	kfree(pinfo->gpios);
+	kfree(pinfo->alow_flags);
+	return 0;
+}
+
+static int __devinit of_mpc83xx_spi_probe(struct of_device *ofdev,
+					  const struct of_device_id *ofid)
+{
+	struct device *dev = &ofdev->dev;
+	struct device_node *np = ofdev->node;
+	struct mpc83xx_spi_probe_info *pinfo;
+	struct fsl_spi_platform_data *pdata;
+	struct spi_master *master;
+	struct resource mem;
+	struct resource irq;
+	const void *prop;
+	int ret = -ENOMEM;
+
+	pinfo = kzalloc(sizeof(*pinfo), GFP_KERNEL);
+	if (!pinfo)
+		return -ENOMEM;
+
+	pdata = &pinfo->pdata;
+	dev->platform_data = pdata;
+
+	/* Allocate bus num dynamically. */
+	pdata->bus_num = -1;
+
+	/* SPI controller is either clocked from QE or SoC clock. */
+	pdata->sysclk = get_brgfreq();
+	if (pdata->sysclk == -1) {
+		pdata->sysclk = fsl_get_sys_freq();
+		if (pdata->sysclk == -1) {
+			ret = -ENODEV;
+			goto err_clk;
+		}
+	}
+
+	prop = of_get_property(np, "mode", NULL);
+	if (prop && !strcmp(prop, "cpu-qe"))
+		pdata->qe_mode = 1;
+
+	ret = of_mpc83xx_spi_get_chipselects(dev);
+	if (ret)
+		goto err;
+
+	ret = of_address_to_resource(np, 0, &mem);
+	if (ret)
+		goto err;
+
+	ret = of_irq_to_resource(np, 0, &irq);
+	if (!ret) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	master = mpc83xx_spi_probe(dev, &mem, irq.start);
+	if (IS_ERR(master)) {
+		ret = PTR_ERR(master);
+		goto err;
+	}
+
+	of_register_spi_devices(master, np);
+
+	return 0;
+
+err:
+	of_mpc83xx_spi_free_chipselects(dev);
+err_clk:
+	kfree(pinfo);
+	return ret;
+}
+
+static int __devexit of_mpc83xx_spi_remove(struct of_device *ofdev)
+{
+	int ret;
+
+	ret = mpc83xx_spi_remove(&ofdev->dev);
+	if (ret)
+		return ret;
+	of_mpc83xx_spi_free_chipselects(&ofdev->dev);
+	return 0;
+}
+
+static const struct of_device_id of_mpc83xx_spi_match[] = {
+	{ .compatible = "fsl,spi" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, of_mpc83xx_spi_match);
+
+static struct of_platform_driver of_mpc83xx_spi_driver = {
+	.name		= "mpc83xx_spi",
+	.match_table	= of_mpc83xx_spi_match,
+	.probe		= of_mpc83xx_spi_probe,
+	.remove		= __devexit_p(of_mpc83xx_spi_remove),
+};
+
+#ifdef CONFIG_MPC832x_RDB
+/*
+ * 				XXX XXX XXX
+ * This is "legacy" platform driver, was used by the MPC8323E-RDB boards
+ * only. The driver should go away soon, since newer MPC8323E-RDB's device
+ * tree can work with OpenFirmware driver. But for now we support old trees
+ * as well.
+ */
+static int __devinit plat_mpc83xx_spi_probe(struct platform_device *pdev)
+{
+	struct resource *mem;
+	unsigned int irq;
+	struct spi_master *master;
+
+	if (!pdev->dev.platform_data)
+		return -EINVAL;
+
+	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!mem)
+		return -EINVAL;
+
+	irq = platform_get_irq(pdev, 0);
+	if (!irq)
+		return -EINVAL;
+
+	master = mpc83xx_spi_probe(&pdev->dev, mem, irq);
+	if (IS_ERR(master))
+		return PTR_ERR(master);
+	return 0;
+}
+
+static int __devexit plat_mpc83xx_spi_remove(struct platform_device *pdev)
+{
+	return mpc83xx_spi_remove(&pdev->dev);
+}
+
 MODULE_ALIAS("platform:mpc83xx_spi");
 static struct platform_driver mpc83xx_spi_driver = {
-	.remove = __exit_p(mpc83xx_spi_remove),
+	.probe = plat_mpc83xx_spi_probe,
+	.remove = __exit_p(plat_mpc83xx_spi_remove),
 	.driver = {
 		.name = "mpc83xx_spi",
 		.owner = THIS_MODULE,
 	},
 };
 
+static bool legacy_driver_failed;
+
+static void __init legacy_driver_register(void)
+{
+	legacy_driver_failed = platform_driver_register(&mpc83xx_spi_driver);
+}
+
+static void __exit legacy_driver_unregister(void)
+{
+	if (legacy_driver_failed)
+		return;
+	platform_driver_unregister(&mpc83xx_spi_driver);
+}
+#else
+static void __init legacy_driver_register(void) {}
+static void __exit legacy_driver_unregister(void) {}
+#endif /* CONFIG_MPC832x_RDB */
+
 static int __init mpc83xx_spi_init(void)
 {
-	return platform_driver_probe(&mpc83xx_spi_driver, mpc83xx_spi_probe);
+	legacy_driver_register();
+	return of_register_platform_driver(&of_mpc83xx_spi_driver);
 }
 
 static void __exit mpc83xx_spi_exit(void)
 {
-	platform_driver_unregister(&mpc83xx_spi_driver);
+	of_unregister_platform_driver(&of_mpc83xx_spi_driver);
+	legacy_driver_unregister();
 }
 
 module_init(mpc83xx_spi_init);
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 7bc1b643d37..7ef1caf5026 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -99,7 +99,7 @@ struct spi_device;
 
 struct fsl_spi_platform_data {
 	u32 	initial_spmode;	/* initial SPMODE value */
-	u16	bus_num;
+	s16	bus_num;
 	bool	qe_mode;
 	/* board specific information */
 	u16	max_chipselect;
-- 
cgit v1.2.3


From 3f1c6ebf57b815ad709e89291e446935fee78f75 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 31 Mar 2009 15:24:38 -0700
Subject: powerpc: add mmc-spi-slot bindings

The bindings describes a case where MMC/SD/SDIO slot directly connected to
a SPI bus.  Such setups are widely used on embedded PowerPC boards.

The patch also adds the mmc-spi-slot entry to the OpenFirmware modalias
table.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Cc: David Brownell <david-b@pacbell.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Kumar Gala <galak@gate.crashing.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .../powerpc/dts-bindings/mmc-spi-slot.txt          | 23 ++++++++++++++++++++++
 drivers/of/base.c                                  |  1 +
 2 files changed, 24 insertions(+)
 create mode 100644 Documentation/powerpc/dts-bindings/mmc-spi-slot.txt

diff --git a/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt b/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt
new file mode 100644
index 00000000000..c39ac289195
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt
@@ -0,0 +1,23 @@
+MMC/SD/SDIO slot directly connected to a SPI bus
+
+Required properties:
+- compatible : should be "mmc-spi-slot".
+- reg : should specify SPI address (chip-select number).
+- spi-max-frequency : maximum frequency for this device (Hz).
+- voltage-ranges : two cells are required, first cell specifies minimum
+  slot voltage (mV), second cell specifies maximum slot voltage (mV).
+  Several ranges could be specified.
+- gpios : (optional) may specify GPIOs in this order: Card-Detect GPIO,
+  Write-Protect GPIO.
+
+Example:
+
+	mmc-slot@0 {
+		compatible = "fsl,mpc8323rdb-mmc-slot",
+			     "mmc-spi-slot";
+		reg = <0>;
+		gpios = <&qe_pio_d 14 1
+			 &qe_pio_d 15 0>;
+		voltage-ranges = <3300 3300>;
+		spi-max-frequency = <50000000>;
+	};
diff --git a/drivers/of/base.c b/drivers/of/base.c
index cd17092b82b..41c5dfd8535 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -446,6 +446,7 @@ struct of_modalias_table {
 };
 static struct of_modalias_table of_modalias_table[] = {
 	{ "fsl,mcu-mpc8349emitx", "mcu-mpc8349emitx" },
+	{ "mmc-spi-slot", "mmc_spi" },
 };
 
 /**
-- 
cgit v1.2.3


From 754582853120a9ec8b8293b5147b605b1c6a39f1 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 31 Mar 2009 15:24:39 -0700
Subject: powerpc/83xx: add mmc-spi support via the device tree for
 MPC8323E-RDB

- Add gpio-controller node to manage QE GPIO Bank D;
- Add mmc-spi node;
- Modify board file so that it won't use legacy SPI support with the new
  device trees.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Cc: David Brownell <david-b@pacbell.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Kumar Gala <galak@gate.crashing.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/boot/dts/mpc832x_rdb.dts     | 24 ++++++++++++++++++++++++
 arch/powerpc/platforms/83xx/mpc832x_rdb.c |  6 ++++++
 2 files changed, 30 insertions(+)

diff --git a/arch/powerpc/boot/dts/mpc832x_rdb.dts b/arch/powerpc/boot/dts/mpc832x_rdb.dts
index dea30910c13..4319bd70a58 100644
--- a/arch/powerpc/boot/dts/mpc832x_rdb.dts
+++ b/arch/powerpc/boot/dts/mpc832x_rdb.dts
@@ -152,10 +152,21 @@
 		};
 
 		par_io@1400 {
+			#address-cells = <1>;
+			#size-cells = <1>;
 			reg = <0x1400 0x100>;
+			ranges = <3 0x1448 0x18>;
+			compatible = "fsl,mpc8323-qe-pario";
 			device_type = "par_io";
 			num-ports = <7>;
 
+			qe_pio_d: gpio-controller@1448 {
+				#gpio-cells = <2>;
+				compatible = "fsl,mpc8323-qe-pario-bank";
+				reg = <3 0x18>;
+				gpio-controller;
+			};
+
 			ucc2pio:ucc_pin@02 {
 				pio-map = <
 			/* port  pin  dir  open_drain  assignment  has_irq */
@@ -225,12 +236,25 @@
 		};
 
 		spi@4c0 {
+			#address-cells = <1>;
+			#size-cells = <0>;
 			cell-index = <0>;
 			compatible = "fsl,spi";
 			reg = <0x4c0 0x40>;
 			interrupts = <2>;
 			interrupt-parent = <&qeic>;
+			gpios = <&qe_pio_d 13 0>;
 			mode = "cpu-qe";
+
+			mmc-slot@0 {
+				compatible = "fsl,mpc8323rdb-mmc-slot",
+					     "mmc-spi-slot";
+				reg = <0>;
+				gpios = <&qe_pio_d 14 1
+					 &qe_pio_d 15 0>;
+				voltage-ranges = <3300 3300>;
+				spi-max-frequency = <50000000>;
+			};
 		};
 
 		spi@500 {
diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
index 28e23cde64a..e1e7eeb7d3a 100644
--- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c
+++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
@@ -68,6 +68,12 @@ static int __init mpc832x_spi_init(void)
 	par_io_config_pin(3, 14, 2, 0, 0, 0); /* SD_INSERT, I */
 	par_io_config_pin(3, 15, 2, 0, 0, 0); /* SD_PROTECT,I */
 
+	/*
+	 * Don't bother with legacy stuff when device tree contains
+	 * mmc-spi-slot node.
+	 */
+	if (of_find_compatible_node(NULL, NULL, "mmc-spi-slot"))
+		return 0;
 	return fsl_spi_init(&mpc832x_spi_boardinfo, 1, mpc83xx_spi_cs_control);
 }
 machine_device_initcall(mpc832x_rdb, mpc832x_spi_init);
-- 
cgit v1.2.3


From e2801806de1c9c1d03b7d1bfcb2e01dd4d389e80 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 31 Mar 2009 15:24:40 -0700
Subject: powerpc/fsl_soc: isolate legacy fsl_spi support to mpc832x_rdb boards

The advantages of this:
- Don't encourage legacy support;
- Less external symbols, less code to compile-in for !MPC832x_RDB
  platforms.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Cc: David Brownell <david-b@pacbell.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Kumar Gala <galak@gate.crashing.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/platforms/83xx/mpc832x_rdb.c | 107 ++++++++++++++++++++++++++++++
 arch/powerpc/sysdev/fsl_soc.c             | 107 ------------------------------
 arch/powerpc/sysdev/fsl_soc.h             |   4 --
 3 files changed, 107 insertions(+), 111 deletions(-)

diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
index e1e7eeb7d3a..567ded7c3b9 100644
--- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c
+++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
@@ -20,6 +20,7 @@
 #include <linux/spi/mmc_spi.h>
 #include <linux/mmc/host.h>
 #include <linux/of_platform.h>
+#include <linux/fsl_devices.h>
 
 #include <asm/time.h>
 #include <asm/ipic.h>
@@ -39,6 +40,112 @@
 #endif
 
 #ifdef CONFIG_QUICC_ENGINE
+static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk,
+				   struct spi_board_info *board_infos,
+				   unsigned int num_board_infos,
+				   void (*cs_control)(struct spi_device *dev,
+						      bool on))
+{
+	struct device_node *np;
+	unsigned int i = 0;
+
+	for_each_compatible_node(np, type, compatible) {
+		int ret;
+		unsigned int j;
+		const void *prop;
+		struct resource res[2];
+		struct platform_device *pdev;
+		struct fsl_spi_platform_data pdata = {
+			.cs_control = cs_control,
+		};
+
+		memset(res, 0, sizeof(res));
+
+		pdata.sysclk = sysclk;
+
+		prop = of_get_property(np, "reg", NULL);
+		if (!prop)
+			goto err;
+		pdata.bus_num = *(u32 *)prop;
+
+		prop = of_get_property(np, "cell-index", NULL);
+		if (prop)
+			i = *(u32 *)prop;
+
+		prop = of_get_property(np, "mode", NULL);
+		if (prop && !strcmp(prop, "cpu-qe"))
+			pdata.qe_mode = 1;
+
+		for (j = 0; j < num_board_infos; j++) {
+			if (board_infos[j].bus_num == pdata.bus_num)
+				pdata.max_chipselect++;
+		}
+
+		if (!pdata.max_chipselect)
+			continue;
+
+		ret = of_address_to_resource(np, 0, &res[0]);
+		if (ret)
+			goto err;
+
+		ret = of_irq_to_resource(np, 0, &res[1]);
+		if (ret == NO_IRQ)
+			goto err;
+
+		pdev = platform_device_alloc("mpc83xx_spi", i);
+		if (!pdev)
+			goto err;
+
+		ret = platform_device_add_data(pdev, &pdata, sizeof(pdata));
+		if (ret)
+			goto unreg;
+
+		ret = platform_device_add_resources(pdev, res,
+						    ARRAY_SIZE(res));
+		if (ret)
+			goto unreg;
+
+		ret = platform_device_add(pdev);
+		if (ret)
+			goto unreg;
+
+		goto next;
+unreg:
+		platform_device_del(pdev);
+err:
+		pr_err("%s: registration failed\n", np->full_name);
+next:
+		i++;
+	}
+
+	return i;
+}
+
+static int __init fsl_spi_init(struct spi_board_info *board_infos,
+			       unsigned int num_board_infos,
+			       void (*cs_control)(struct spi_device *spi,
+						  bool on))
+{
+	u32 sysclk = -1;
+	int ret;
+
+	/* SPI controller is either clocked from QE or SoC clock */
+	sysclk = get_brgfreq();
+	if (sysclk == -1) {
+		sysclk = fsl_get_sys_freq();
+		if (sysclk == -1)
+			return -ENODEV;
+	}
+
+	ret = of_fsl_spi_probe(NULL, "fsl,spi", sysclk, board_infos,
+			       num_board_infos, cs_control);
+	if (!ret)
+		of_fsl_spi_probe("spi", "fsl_spi", sysclk, board_infos,
+				 num_board_infos, cs_control);
+
+	return spi_register_board_info(board_infos, num_board_infos);
+}
+
 static void mpc83xx_spi_cs_control(struct spi_device *spi, bool on)
 {
 	pr_debug("%s %d %d\n", __func__, spi->chip_select, on);
diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c
index a46c1c86793..afe8dbc964a 100644
--- a/arch/powerpc/sysdev/fsl_soc.c
+++ b/arch/powerpc/sysdev/fsl_soc.c
@@ -417,113 +417,6 @@ err:
 
 arch_initcall(fsl_usb_of_init);
 
-static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk,
-				   struct spi_board_info *board_infos,
-				   unsigned int num_board_infos,
-				   void (*cs_control)(struct spi_device *dev,
-						      bool on))
-{
-	struct device_node *np;
-	unsigned int i = 0;
-
-	for_each_compatible_node(np, type, compatible) {
-		int ret;
-		unsigned int j;
-		const void *prop;
-		struct resource res[2];
-		struct platform_device *pdev;
-		struct fsl_spi_platform_data pdata = {
-			.cs_control = cs_control,
-		};
-
-		memset(res, 0, sizeof(res));
-
-		pdata.sysclk = sysclk;
-
-		prop = of_get_property(np, "reg", NULL);
-		if (!prop)
-			goto err;
-		pdata.bus_num = *(u32 *)prop;
-
-		prop = of_get_property(np, "cell-index", NULL);
-		if (prop)
-			i = *(u32 *)prop;
-
-		prop = of_get_property(np, "mode", NULL);
-		if (prop && !strcmp(prop, "cpu-qe"))
-			pdata.qe_mode = 1;
-
-		for (j = 0; j < num_board_infos; j++) {
-			if (board_infos[j].bus_num == pdata.bus_num)
-				pdata.max_chipselect++;
-		}
-
-		if (!pdata.max_chipselect)
-			continue;
-
-		ret = of_address_to_resource(np, 0, &res[0]);
-		if (ret)
-			goto err;
-
-		ret = of_irq_to_resource(np, 0, &res[1]);
-		if (ret == NO_IRQ)
-			goto err;
-
-		pdev = platform_device_alloc("mpc83xx_spi", i);
-		if (!pdev)
-			goto err;
-
-		ret = platform_device_add_data(pdev, &pdata, sizeof(pdata));
-		if (ret)
-			goto unreg;
-
-		ret = platform_device_add_resources(pdev, res,
-						    ARRAY_SIZE(res));
-		if (ret)
-			goto unreg;
-
-		ret = platform_device_add(pdev);
-		if (ret)
-			goto unreg;
-
-		goto next;
-unreg:
-		platform_device_del(pdev);
-err:
-		pr_err("%s: registration failed\n", np->full_name);
-next:
-		i++;
-	}
-
-	return i;
-}
-
-int __init fsl_spi_init(struct spi_board_info *board_infos,
-			unsigned int num_board_infos,
-			void (*cs_control)(struct spi_device *spi, bool on))
-{
-	u32 sysclk = -1;
-	int ret;
-
-#ifdef CONFIG_QUICC_ENGINE
-	/* SPI controller is either clocked from QE or SoC clock */
-	sysclk = get_brgfreq();
-#endif
-	if (sysclk == -1) {
-		sysclk = fsl_get_sys_freq();
-		if (sysclk == -1)
-			return -ENODEV;
-	}
-
-	ret = of_fsl_spi_probe(NULL, "fsl,spi", sysclk, board_infos,
-			       num_board_infos, cs_control);
-	if (!ret)
-		of_fsl_spi_probe("spi", "fsl_spi", sysclk, board_infos,
-				 num_board_infos, cs_control);
-
-	return spi_register_board_info(board_infos, num_board_infos);
-}
-
 #if defined(CONFIG_PPC_85xx) || defined(CONFIG_PPC_86xx)
 static __be32 __iomem *rstcr;
 
diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h
index b5f3456780b..42381bb6cd5 100644
--- a/arch/powerpc/sysdev/fsl_soc.h
+++ b/arch/powerpc/sysdev/fsl_soc.h
@@ -19,10 +19,6 @@ extern u32 fsl_get_sys_freq(void);
 struct spi_board_info;
 struct device_node;
 
-extern int fsl_spi_init(struct spi_board_info *board_infos,
-			unsigned int num_board_infos,
-			void (*cs_control)(struct spi_device *spi, bool on));
-
 extern void fsl_rstcr_restart(char *cmd);
 
 #if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE)
-- 
cgit v1.2.3


From 00fcf2cb6f6bb421851c3ba062c0a36760ea6e53 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 31 Mar 2009 15:24:42 -0700
Subject: ecryptfs: use kzfree()

Use kzfree() instead of memset() + kfree().

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/keystore.c  | 3 +--
 fs/ecryptfs/messaging.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e4a6223c314..af737bb56cb 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -740,8 +740,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 out_release_free_unlock:
 	crypto_free_hash(s->hash_desc.tfm);
 out_free_unlock:
-	memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
-	kfree(s->block_aligned_filename);
+	kzfree(s->block_aligned_filename);
 out_unlock:
 	mutex_unlock(s->tfm_mutex);
 out:
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 96ef51489e0..295e7fa5675 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -291,8 +291,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
 	if (daemon->user_ns)
 		put_user_ns(daemon->user_ns);
 	mutex_unlock(&daemon->mux);
-	memset(daemon, 0, sizeof(*daemon));
-	kfree(daemon);
+	kzfree(daemon);
 out:
 	return rc;
 }
-- 
cgit v1.2.3


From 56fcef75117a153f298b3fe54af31053f53997dd Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 31 Mar 2009 15:24:43 -0700
Subject: autofs4: cleanup expire code duplication

A significant portion of the autofs_dev_ioctl_expire() and
autofs4_expire_multi() functions is duplicated code.  This patch cleans that
up.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/autofs_i.h  |  2 ++
 fs/autofs4/dev-ioctl.c | 29 +----------------------------
 fs/autofs4/expire.c    | 27 +++++++++++++++++----------
 3 files changed, 20 insertions(+), 38 deletions(-)

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a76803108d0..b7ff33c6310 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -186,6 +186,8 @@ int autofs4_expire_wait(struct dentry *dentry);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
 			struct autofs_sb_info *,
 			struct autofs_packet_expire __user *);
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when);
 int autofs4_expire_multi(struct super_block *, struct vfsmount *,
 			struct autofs_sb_info *, int __user *);
 struct dentry *autofs4_expire_direct(struct super_block *sb,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 025e105bffe..9e5ae8a4f5c 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -525,40 +525,13 @@ static int autofs_dev_ioctl_expire(struct file *fp,
 				   struct autofs_sb_info *sbi,
 				   struct autofs_dev_ioctl *param)
 {
-	struct dentry *dentry;
 	struct vfsmount *mnt;
-	int err = -EAGAIN;
 	int how;
 
 	how = param->expire.how;
 	mnt = fp->f_path.mnt;
 
-	if (autofs_type_trigger(sbi->type))
-		dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
-	else
-		dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
-
-	if (dentry) {
-		struct autofs_info *ino = autofs4_dentry_ino(dentry);
-
-		/*
-		 * This is synchronous because it makes the daemon a
-		 * little easier
-		*/
-		err = autofs4_wait(sbi, dentry, NFY_EXPIRE);
-
-		spin_lock(&sbi->fs_lock);
-		if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
-			sbi->sb->s_root->d_mounted++;
-		}
-		ino->flags &= ~AUTOFS_INF_EXPIRING;
-		complete_all(&ino->expire_complete);
-		spin_unlock(&sbi->fs_lock);
-		dput(dentry);
-	}
-
-	return err;
+	return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how);
 }
 
 /* Check if autofs mount point is in use */
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index e3bd50776f9..75f7ddacf7d 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -478,22 +478,16 @@ int autofs4_expire_run(struct super_block *sb,
 	return ret;
 }
 
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
-   more to be done */
-int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
-			struct autofs_sb_info *sbi, int __user *arg)
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when)
 {
 	struct dentry *dentry;
 	int ret = -EAGAIN;
-	int do_now = 0;
-
-	if (arg && get_user(do_now, arg))
-		return -EFAULT;
 
 	if (autofs_type_trigger(sbi->type))
-		dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
+		dentry = autofs4_expire_direct(sb, mnt, sbi, when);
 	else
-		dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
+		dentry = autofs4_expire_indirect(sb, mnt, sbi, when);
 
 	if (dentry) {
 		struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -516,3 +510,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	return ret;
 }
 
+/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
+   more to be done */
+int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			struct autofs_sb_info *sbi, int __user *arg)
+{
+	int do_now = 0;
+
+	if (arg && get_user(do_now, arg))
+		return -EFAULT;
+
+	return autofs4_do_expire_multi(sb, mnt, sbi, do_now);
+}
+
-- 
cgit v1.2.3


From 8f63aaa8b9239475fc580d4450f1141496655305 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 31 Mar 2009 15:24:45 -0700
Subject: autofs4: fix lookup deadlock

A deadlock can occur when user space uses a signal (autofs version 4 uses
SIGCHLD for this) to effect expire completion.

The order of events is:

Expire process completes, but before being able to send SIGCHLD to it's parent
...

Another process walks onto a different mount point and drops the directory
inode semaphore prior to sending the request to the daemon as it must ...

A third process does an lstat on on the expired mount point causing it to wait
on expire completion (unfortunately) holding the directory semaphore.

The mount request then arrives at the daemon which does an lstat and,
deadlock.

For some time I was concerned about releasing the directory semaphore around
the expire wait in autofs4_lookup as well as for the mount call back.  I
finally realized that the last round of changes in this function made the
expiring dentry and the lookup dentry separate and distinct so the check and
possible wait can be done anywhere prior to the mount call back.  This patch
moves the check to just before the mount call back and inside the directory
inode mutex release.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/root.c | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 74b1469a950..e383bf0334f 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -485,22 +485,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
 		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name);
-	if (expiring) {
-		/*
-		 * If we are racing with expire the request might not
-		 * be quite complete but the directory has been removed
-		 * so it must have been successful, so just wait for it.
-		 */
-		ino = autofs4_dentry_ino(expiring);
-		autofs4_expire_wait(expiring);
-		spin_lock(&sbi->lookup_lock);
-		if (!list_empty(&ino->expiring))
-			list_del_init(&ino->expiring);
-		spin_unlock(&sbi->lookup_lock);
-		dput(expiring);
-	}
-
 	unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
 	if (unhashed)
 		dentry = unhashed;
@@ -538,14 +522,31 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	}
 
 	if (!oz_mode) {
+		mutex_unlock(&dir->i_mutex);
+		expiring = autofs4_lookup_expiring(sbi,
+						   dentry->d_parent,
+						   &dentry->d_name);
+		if (expiring) {
+			/*
+			 * If we are racing with expire the request might not
+			 * be quite complete but the directory has been removed
+			 * so it must have been successful, so just wait for it.
+			 */
+			ino = autofs4_dentry_ino(expiring);
+			autofs4_expire_wait(expiring);
+			spin_lock(&sbi->lookup_lock);
+			if (!list_empty(&ino->expiring))
+				list_del_init(&ino->expiring);
+			spin_unlock(&sbi->lookup_lock);
+			dput(expiring);
+		}
+
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
 		spin_unlock(&dentry->d_lock);
-		if (dentry->d_op && dentry->d_op->d_revalidate) {
-			mutex_unlock(&dir->i_mutex);
+		if (dentry->d_op && dentry->d_op->d_revalidate)
 			(dentry->d_op->d_revalidate)(dentry, nd);
-			mutex_lock(&dir->i_mutex);
-		}
+		mutex_lock(&dir->i_mutex);
 	}
 
 	/*
-- 
cgit v1.2.3


From 79955898f961a870cbcc58f6ae13f3741a909da5 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 31 Mar 2009 15:24:45 -0700
Subject: autofs4: fix kernel includes

autofs_dev-ioctl.h is included by both the kernel module and user space tools
and it includes two kernel header files.  Compiles work if the kernel headers
are installed but fail otherwise.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/auto_dev-ioctl.h | 7 ++++++-
 include/linux/auto_fs.h        | 6 ++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h
index 91a773993a5..850f39b33e7 100644
--- a/include/linux/auto_dev-ioctl.h
+++ b/include/linux/auto_dev-ioctl.h
@@ -10,8 +10,13 @@
 #ifndef _LINUX_AUTO_DEV_IOCTL_H
 #define _LINUX_AUTO_DEV_IOCTL_H
 
+#include <linux/auto_fs.h>
+
+#ifdef __KERNEL__
 #include <linux/string.h>
-#include <linux/types.h>
+#else
+#include <string.h>
+#endif /* __KERNEL__ */
 
 #define AUTOFS_DEVICE_NAME		"autofs"
 
diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h
index c21e5972a3e..63265852b7d 100644
--- a/include/linux/auto_fs.h
+++ b/include/linux/auto_fs.h
@@ -17,11 +17,13 @@
 #ifdef __KERNEL__
 #include <linux/fs.h>
 #include <linux/limits.h>
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#else
 #include <asm/types.h>
+#include <sys/ioctl.h>
 #endif /* __KERNEL__ */
 
-#include <linux/ioctl.h>
-
 /* This file describes autofs v3 */
 #define AUTOFS_PROTO_VERSION	3
 
-- 
cgit v1.2.3


From 47367a3ba425d70467af0009782098235ddbf204 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Tue, 31 Mar 2009 15:24:46 -0700
Subject: rtc: convert wm8350 use new alarm and update operations

These are the only two ioctls so the ioctl() function is also removed.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Cc: Acked-by: Alessandro Zummo <a.zummo@towertech.it>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-wm8350.c | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c
index 5c5e3aa9138..616630d1e31 100644
--- a/drivers/rtc/rtc-wm8350.c
+++ b/drivers/rtc/rtc-wm8350.c
@@ -236,6 +236,17 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350)
 	return 0;
 }
 
+static int wm8350_rtc_alarm_irq_enable(struct device *dev,
+				       unsigned int enabled)
+{
+	struct wm8350 *wm8350 = dev_get_drvdata(dev);
+
+	if (enabled)
+		return wm8350_rtc_start_alarm(wm8350);
+	else
+		return wm8350_rtc_stop_alarm(wm8350);
+}
+
 static int wm8350_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
 	struct wm8350 *wm8350 = dev_get_drvdata(dev);
@@ -291,30 +302,15 @@ static int wm8350_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 	return ret;
 }
 
-/*
- * Handle commands from user-space
- */
-static int wm8350_rtc_ioctl(struct device *dev, unsigned int cmd,
-			    unsigned long arg)
+static int wm8350_rtc_update_irq_enable(struct device *dev,
+					unsigned int enabled)
 {
 	struct wm8350 *wm8350 = dev_get_drvdata(dev);
 
-	switch (cmd) {
-	case RTC_AIE_OFF:
-		return wm8350_rtc_stop_alarm(wm8350);
-	case RTC_AIE_ON:
-		return wm8350_rtc_start_alarm(wm8350);
-
-	case RTC_UIE_OFF:
-		wm8350_mask_irq(wm8350, WM8350_IRQ_RTC_SEC);
-		break;
-	case RTC_UIE_ON:
+	if (enabled)
 		wm8350_unmask_irq(wm8350, WM8350_IRQ_RTC_SEC);
-		break;
-
-	default:
-		return -ENOIOCTLCMD;
-	}
+	else
+		wm8350_mask_irq(wm8350, WM8350_IRQ_RTC_SEC);
 
 	return 0;
 }
@@ -345,11 +341,12 @@ static void wm8350_rtc_update_handler(struct wm8350 *wm8350, int irq,
 }
 
 static const struct rtc_class_ops wm8350_rtc_ops = {
-	.ioctl = wm8350_rtc_ioctl,
 	.read_time = wm8350_rtc_readtime,
 	.set_time = wm8350_rtc_settime,
 	.read_alarm = wm8350_rtc_readalarm,
 	.set_alarm = wm8350_rtc_setalarm,
+	.alarm_irq_enable = wm8350_rtc_alarm_irq_enable,
+	.update_irq_enable = wm8350_rtc_update_irq_enable,
 };
 
 #ifdef CONFIG_PM
-- 
cgit v1.2.3


From 78d89ef40c2ff7265df077e20c4d76be7d415204 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 31 Mar 2009 15:24:48 -0700
Subject: rtc: convert LEAP_YEAR into an inline

- the LEAP_YEAR macro is buggy - it references its arg multiple times.
  Fix this by turning it into a C function.

- give it a more approriate name

- Move it to rtc.h so that other .c files can use it, instead of copying it.

Cc: dann frazier <dannf@hp.com>
Acked-by: Alessandro Zummo <alessandro.zummo@towertech.it>
Cc: stephane eranian <eranian@googlemail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-lib.c | 7 +++----
 include/linux/rtc.h   | 6 ++++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/rtc/rtc-lib.c b/drivers/rtc/rtc-lib.c
index dd70bf73ce9..773851f338b 100644
--- a/drivers/rtc/rtc-lib.c
+++ b/drivers/rtc/rtc-lib.c
@@ -26,14 +26,13 @@ static const unsigned short rtc_ydays[2][13] = {
 };
 
 #define LEAPS_THRU_END_OF(y) ((y)/4 - (y)/100 + (y)/400)
-#define LEAP_YEAR(year) ((!(year % 4) && (year % 100)) || !(year % 400))
 
 /*
  * The number of days in the month.
  */
 int rtc_month_days(unsigned int month, unsigned int year)
 {
-	return rtc_days_in_month[month] + (LEAP_YEAR(year) && month == 1);
+	return rtc_days_in_month[month] + (is_leap_year(year) && month == 1);
 }
 EXPORT_SYMBOL(rtc_month_days);
 
@@ -42,7 +41,7 @@ EXPORT_SYMBOL(rtc_month_days);
  */
 int rtc_year_days(unsigned int day, unsigned int month, unsigned int year)
 {
-	return rtc_ydays[LEAP_YEAR(year)][month] + day-1;
+	return rtc_ydays[is_leap_year(year)][month] + day-1;
 }
 EXPORT_SYMBOL(rtc_year_days);
 
@@ -66,7 +65,7 @@ void rtc_time_to_tm(unsigned long time, struct rtc_time *tm)
 		- LEAPS_THRU_END_OF(1970 - 1);
 	if (days < 0) {
 		year -= 1;
-		days += 365 + LEAP_YEAR(year);
+		days += 365 + is_leap_year(year);
 	}
 	tm->tm_year = year - 1900;
 	tm->tm_yday = days + 1;
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 4046b75563c..60f88a7fb13 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -99,6 +99,7 @@ struct rtc_pll_info {
 
 #ifdef __KERNEL__
 
+#include <linux/types.h>
 #include <linux/interrupt.h>
 
 extern int rtc_month_days(unsigned int month, unsigned int year);
@@ -232,6 +233,11 @@ int rtc_register(rtc_task_t *task);
 int rtc_unregister(rtc_task_t *task);
 int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg);
 
+static inline bool is_leap_year(unsigned int year)
+{
+	return (!(year % 4) && (year % 100)) || !(year % 400);
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_RTC_H_ */
-- 
cgit v1.2.3


From 5e3fd9e5810f141c9c70c36992d4ed72b3aa1fed Mon Sep 17 00:00:00 2001
From: dann frazier <dannf@dannf.org>
Date: Tue, 31 Mar 2009 15:24:48 -0700
Subject: rtc: add platform driver for EFI

Munge Stephane Eranian's efirtc.c code into an rtc platform driver

[akpm@linux-foundation.org: use is_leap_year()]
Signed-off-by: dann frazier <dannf@hp.com>
Cc: Alessandro Zummo <alessandro.zummo@towertech.it>
Cc: stephane eranian <eranian@googlemail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/time.c |  16 ++++
 drivers/rtc/Kconfig     |  10 +++
 drivers/rtc/Makefile    |   1 +
 drivers/rtc/rtc-efi.c   | 235 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 262 insertions(+)
 create mode 100644 drivers/rtc/rtc-efi.c

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index f0ebb342409..d6747bae52d 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -20,6 +20,7 @@
 #include <linux/efi.h>
 #include <linux/timex.h>
 #include <linux/clocksource.h>
+#include <linux/platform_device.h>
 
 #include <asm/machvec.h>
 #include <asm/delay.h>
@@ -405,6 +406,21 @@ static struct irqaction timer_irqaction = {
 	.name =		"timer"
 };
 
+static struct platform_device rtc_efi_dev = {
+	.name = "rtc-efi",
+	.id = -1,
+};
+
+static int __init rtc_init(void)
+{
+	if (platform_device_register(&rtc_efi_dev) < 0)
+		printk(KERN_ERR "unable to register rtc device...\n");
+
+	/* not necessarily an error */
+	return 0;
+}
+module_init(rtc_init);
+
 void __init
 time_init (void)
 {
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 81450fbd8b1..d669b916927 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -440,6 +440,16 @@ config RTC_DRV_DS1742
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-ds1742.
 
+config RTC_DRV_EFI
+	tristate "EFI RTC"
+	depends on IA64
+	help
+	  If you say yes here you will get support for the EFI
+	  Real Time Clock.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called rtc-efi.
+
 config RTC_DRV_STK17TA8
 	tristate "Simtek STK17TA8"
 	depends on RTC_CLASS
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 0e697aa51ca..e7b09986d26 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_RTC_DRV_DS1553)	+= rtc-ds1553.o
 obj-$(CONFIG_RTC_DRV_DS1672)	+= rtc-ds1672.o
 obj-$(CONFIG_RTC_DRV_DS1742)	+= rtc-ds1742.o
 obj-$(CONFIG_RTC_DRV_DS3234)	+= rtc-ds3234.o
+obj-$(CONFIG_RTC_DRV_EFI)	+= rtc-efi.o
 obj-$(CONFIG_RTC_DRV_EP93XX)	+= rtc-ep93xx.o
 obj-$(CONFIG_RTC_DRV_FM3130)	+= rtc-fm3130.o
 obj-$(CONFIG_RTC_DRV_ISL1208)	+= rtc-isl1208.o
diff --git a/drivers/rtc/rtc-efi.c b/drivers/rtc/rtc-efi.c
new file mode 100644
index 00000000000..550292304b0
--- /dev/null
+++ b/drivers/rtc/rtc-efi.c
@@ -0,0 +1,235 @@
+/*
+ * rtc-efi: RTC Class Driver for EFI-based systems
+ *
+ * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
+ *
+ * Author: dann frazier <dannf@hp.com>
+ * Based on efirtc.c by Stephane Eranian
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+#include <linux/efi.h>
+
+#define EFI_ISDST (EFI_TIME_ADJUST_DAYLIGHT|EFI_TIME_IN_DAYLIGHT)
+/*
+ * EFI Epoch is 1/1/1998
+ */
+#define EFI_RTC_EPOCH		1998
+
+/*
+ * returns day of the year [0-365]
+ */
+static inline int
+compute_yday(efi_time_t *eft)
+{
+	/* efi_time_t.month is in the [1-12] so, we need -1 */
+	return rtc_year_days(eft->day - 1, eft->month - 1, eft->year);
+}
+/*
+ * returns day of the week [0-6] 0=Sunday
+ *
+ * Don't try to provide a year that's before 1998, please !
+ */
+static int
+compute_wday(efi_time_t *eft)
+{
+	int y;
+	int ndays = 0;
+
+	if (eft->year < 1998) {
+		printk(KERN_ERR "efirtc: EFI year < 1998, invalid date\n");
+		return -1;
+	}
+
+	for (y = EFI_RTC_EPOCH; y < eft->year; y++)
+		ndays += 365 + (is_leap_year(y) ? 1 : 0);
+
+	ndays += compute_yday(eft);
+
+	/*
+	 * 4=1/1/1998 was a Thursday
+	 */
+	return (ndays + 4) % 7;
+}
+
+static void
+convert_to_efi_time(struct rtc_time *wtime, efi_time_t *eft)
+{
+	eft->year	= wtime->tm_year + 1900;
+	eft->month	= wtime->tm_mon + 1;
+	eft->day	= wtime->tm_mday;
+	eft->hour	= wtime->tm_hour;
+	eft->minute	= wtime->tm_min;
+	eft->second 	= wtime->tm_sec;
+	eft->nanosecond = 0;
+	eft->daylight	= wtime->tm_isdst ? EFI_ISDST : 0;
+	eft->timezone	= EFI_UNSPECIFIED_TIMEZONE;
+}
+
+static void
+convert_from_efi_time(efi_time_t *eft, struct rtc_time *wtime)
+{
+	memset(wtime, 0, sizeof(*wtime));
+	wtime->tm_sec  = eft->second;
+	wtime->tm_min  = eft->minute;
+	wtime->tm_hour = eft->hour;
+	wtime->tm_mday = eft->day;
+	wtime->tm_mon  = eft->month - 1;
+	wtime->tm_year = eft->year - 1900;
+
+	/* day of the week [0-6], Sunday=0 */
+	wtime->tm_wday = compute_wday(eft);
+
+	/* day in the year [1-365]*/
+	wtime->tm_yday = compute_yday(eft);
+
+
+	switch (eft->daylight & EFI_ISDST) {
+	case EFI_ISDST:
+		wtime->tm_isdst = 1;
+		break;
+	case EFI_TIME_ADJUST_DAYLIGHT:
+		wtime->tm_isdst = 0;
+		break;
+	default:
+		wtime->tm_isdst = -1;
+	}
+}
+
+static int efi_read_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
+{
+	efi_time_t eft;
+	efi_status_t status;
+
+	/*
+	 * As of EFI v1.10, this call always returns an unsupported status
+	 */
+	status = efi.get_wakeup_time((efi_bool_t *)&wkalrm->enabled,
+				     (efi_bool_t *)&wkalrm->pending, &eft);
+
+	if (status != EFI_SUCCESS)
+		return -EINVAL;
+
+	convert_from_efi_time(&eft, &wkalrm->time);
+
+	return rtc_valid_tm(&wkalrm->time);
+}
+
+static int efi_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
+{
+	efi_time_t eft;
+	efi_status_t status;
+
+	convert_to_efi_time(&wkalrm->time, &eft);
+
+	/*
+	 * XXX Fixme:
+	 * As of EFI 0.92 with the firmware I have on my
+	 * machine this call does not seem to work quite
+	 * right
+	 *
+	 * As of v1.10, this call always returns an unsupported status
+	 */
+	status = efi.set_wakeup_time((efi_bool_t)wkalrm->enabled, &eft);
+
+	printk(KERN_WARNING "write status is %d\n", (int)status);
+
+	return status == EFI_SUCCESS ? 0 : -EINVAL;
+}
+
+static int efi_read_time(struct device *dev, struct rtc_time *tm)
+{
+	efi_status_t status;
+	efi_time_t eft;
+	efi_time_cap_t cap;
+
+	status = efi.get_time(&eft, &cap);
+
+	if (status != EFI_SUCCESS) {
+		/* should never happen */
+		printk(KERN_ERR "efitime: can't read time\n");
+		return -EINVAL;
+	}
+
+	convert_from_efi_time(&eft, tm);
+
+	return rtc_valid_tm(tm);
+}
+
+static int efi_set_time(struct device *dev, struct rtc_time *tm)
+{
+	efi_status_t status;
+	efi_time_t eft;
+
+	convert_to_efi_time(tm, &eft);
+
+	status = efi.set_time(&eft);
+
+	return status == EFI_SUCCESS ? 0 : -EINVAL;
+}
+
+static const struct rtc_class_ops efi_rtc_ops = {
+	.read_time = efi_read_time,
+	.set_time = efi_set_time,
+	.read_alarm = efi_read_alarm,
+	.set_alarm = efi_set_alarm,
+};
+
+static int __init efi_rtc_probe(struct platform_device *dev)
+{
+	struct rtc_device *rtc;
+
+	rtc = rtc_device_register("rtc-efi", &dev->dev, &efi_rtc_ops,
+					THIS_MODULE);
+	if (IS_ERR(rtc))
+		return PTR_ERR(rtc);
+
+	platform_set_drvdata(dev, rtc);
+
+	return 0;
+}
+
+static int __exit efi_rtc_remove(struct platform_device *dev)
+{
+	struct rtc_device *rtc = platform_get_drvdata(dev);
+
+	rtc_device_unregister(rtc);
+
+	return 0;
+}
+
+static struct platform_driver efi_rtc_driver = {
+	.driver = {
+		.name = "rtc-efi",
+		.owner = THIS_MODULE,
+	},
+	.probe = efi_rtc_probe,
+	.remove = __exit_p(efi_rtc_remove),
+};
+
+static int __init efi_rtc_init(void)
+{
+	return platform_driver_probe(&efi_rtc_driver, efi_rtc_probe);
+}
+
+static void __exit efi_rtc_exit(void)
+{
+	platform_driver_unregister(&efi_rtc_driver);
+}
+
+module_init(efi_rtc_init);
+module_exit(efi_rtc_exit);
+
+MODULE_AUTHOR("dann frazier <dannf@hp.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("EFI RTC driver");
-- 
cgit v1.2.3


From 93d456d9802a40859ecc3d67be8c759b03aa487d Mon Sep 17 00:00:00 2001
From: dann frazier <dannf@hp.com>
Date: Tue, 31 Mar 2009 15:24:49 -0700
Subject: rtc-parisc: add a missing include for linux/rtc.h

Signed-off-by: dann frazier <dannf@hp.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-parisc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c
index c6bfa6fe1a2..319bb5d445e 100644
--- a/drivers/rtc/rtc-parisc.c
+++ b/drivers/rtc/rtc-parisc.c
@@ -7,6 +7,7 @@
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/platform_device.h>
+#include <linux/rtc.h>
 
 #include <asm/rtc.h>
 
-- 
cgit v1.2.3


From 05439f1f89aebbdb791c49e980f0f31652e4055b Mon Sep 17 00:00:00 2001
From: dann frazier <dannf@hp.com>
Date: Tue, 31 Mar 2009 15:24:50 -0700
Subject: rtc-parisc: remove redundant locking

The RTC subsystem proides ops locking, no need to implement our own

Signed-off-by: dann frazier <dannf@hp.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-parisc.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c
index 319bb5d445e..cb087ad407f 100644
--- a/drivers/rtc/rtc-parisc.c
+++ b/drivers/rtc/rtc-parisc.c
@@ -14,17 +14,13 @@
 /* as simple as can be, and no simpler. */
 struct parisc_rtc {
 	struct rtc_device *rtc;
-	spinlock_t lock;
 };
 
 static int parisc_get_time(struct device *dev, struct rtc_time *tm)
 {
-	struct parisc_rtc *p = dev_get_drvdata(dev);
-	unsigned long flags, ret;
+	unsigned long ret;
 
-	spin_lock_irqsave(&p->lock, flags);
 	ret = get_rtc_time(tm);
-	spin_unlock_irqrestore(&p->lock, flags);
 
 	if (ret & RTC_BATT_BAD)
 		return -EOPNOTSUPP;
@@ -34,13 +30,9 @@ static int parisc_get_time(struct device *dev, struct rtc_time *tm)
 
 static int parisc_set_time(struct device *dev, struct rtc_time *tm)
 {
-	struct parisc_rtc *p = dev_get_drvdata(dev);
-	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&p->lock, flags);
 	ret = set_rtc_time(tm);
-	spin_unlock_irqrestore(&p->lock, flags);
 
 	if (ret < 0)
 		return -EOPNOTSUPP;
@@ -61,8 +53,6 @@ static int __devinit parisc_rtc_probe(struct platform_device *dev)
 	if (!p)
 		return -ENOMEM;
 
-	spin_lock_init(&p->lock);
-
 	p->rtc = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops,
 					THIS_MODULE);
 	if (IS_ERR(p->rtc)) {
-- 
cgit v1.2.3


From 6b318f66dca829a72c974083cc10fd5556eec6f1 Mon Sep 17 00:00:00 2001
From: dann frazier <dannf@hp.com>
Date: Tue, 31 Mar 2009 15:24:51 -0700
Subject: rtc-parisc: remove struct parisc_rtc

parisc_rtc now only includes an rtc_device pointer, so let's
just use the rtc_device type directly.

Signed-off-by: dann frazier <dannf@hp.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-parisc.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c
index cb087ad407f..ee4e9a3fb58 100644
--- a/drivers/rtc/rtc-parisc.c
+++ b/drivers/rtc/rtc-parisc.c
@@ -11,11 +11,6 @@
 
 #include <asm/rtc.h>
 
-/* as simple as can be, and no simpler. */
-struct parisc_rtc {
-	struct rtc_device *rtc;
-};
-
 static int parisc_get_time(struct device *dev, struct rtc_time *tm)
 {
 	unsigned long ret;
@@ -47,16 +42,16 @@ static const struct rtc_class_ops parisc_rtc_ops = {
 
 static int __devinit parisc_rtc_probe(struct platform_device *dev)
 {
-	struct parisc_rtc *p;
+	struct rtc_device *p;
 
 	p = kzalloc(sizeof (*p), GFP_KERNEL);
 	if (!p)
 		return -ENOMEM;
 
-	p->rtc = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops,
-					THIS_MODULE);
-	if (IS_ERR(p->rtc)) {
-		int err = PTR_ERR(p->rtc);
+	p = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops,
+				THIS_MODULE);
+	if (IS_ERR(p)) {
+		int err = PTR_ERR(p);
 		kfree(p);
 		return err;
 	}
@@ -68,9 +63,9 @@ static int __devinit parisc_rtc_probe(struct platform_device *dev)
 
 static int __devexit parisc_rtc_remove(struct platform_device *dev)
 {
-	struct parisc_rtc *p = platform_get_drvdata(dev);
+	struct rtc_device *p = platform_get_drvdata(dev);
 
-	rtc_device_unregister(p->rtc);
+	rtc_device_unregister(p);
 	kfree(p);
 
 	return 0;
-- 
cgit v1.2.3


From f62bacd4d48a1a6b8931a0140fb2324a06dd89fe Mon Sep 17 00:00:00 2001
From: dann frazier <dannf@hp.com>
Date: Tue, 31 Mar 2009 15:24:52 -0700
Subject: rtc-parisc: use rtc_valid_tm() in parisc_get_time

Use the return value of rtc_valid_tm() instead of just returning 0.

Signed-off-by: dann frazier <dannf@hp.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-parisc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c
index ee4e9a3fb58..0477cc129c6 100644
--- a/drivers/rtc/rtc-parisc.c
+++ b/drivers/rtc/rtc-parisc.c
@@ -20,7 +20,7 @@ static int parisc_get_time(struct device *dev, struct rtc_time *tm)
 	if (ret & RTC_BATT_BAD)
 		return -EOPNOTSUPP;
 
-	return 0;
+	return rtc_valid_tm(tm);
 }
 
 static int parisc_set_time(struct device *dev, struct rtc_time *tm)
-- 
cgit v1.2.3


From 2b93cff4dc184bf7b4858dc7a9bd2e8d33c1a3eb Mon Sep 17 00:00:00 2001
From: dann frazier <dannf@hp.com>
Date: Tue, 31 Mar 2009 15:24:52 -0700
Subject: rtc-parisc: use platform_driver_probe

This isn't a hotpluggable device, so call platform_driver_probe
directly in parisc_rtc_init

Signed-off-by: dann frazier <dannf@hp.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-parisc.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c
index 0477cc129c6..a2ca07ad42c 100644
--- a/drivers/rtc/rtc-parisc.c
+++ b/drivers/rtc/rtc-parisc.c
@@ -40,19 +40,14 @@ static const struct rtc_class_ops parisc_rtc_ops = {
 	.set_time = parisc_set_time,
 };
 
-static int __devinit parisc_rtc_probe(struct platform_device *dev)
+static int __init parisc_rtc_probe(struct platform_device *dev)
 {
 	struct rtc_device *p;
 
-	p = kzalloc(sizeof (*p), GFP_KERNEL);
-	if (!p)
-		return -ENOMEM;
-
 	p = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops,
 				THIS_MODULE);
 	if (IS_ERR(p)) {
 		int err = PTR_ERR(p);
-		kfree(p);
 		return err;
 	}
 
@@ -61,12 +56,11 @@ static int __devinit parisc_rtc_probe(struct platform_device *dev)
 	return 0;
 }
 
-static int __devexit parisc_rtc_remove(struct platform_device *dev)
+static int __exit parisc_rtc_remove(struct platform_device *dev)
 {
 	struct rtc_device *p = platform_get_drvdata(dev);
 
 	rtc_device_unregister(p);
-	kfree(p);
 
 	return 0;
 }
@@ -82,7 +76,7 @@ static struct platform_driver parisc_rtc_driver = {
 
 static int __init parisc_rtc_init(void)
 {
-	return platform_driver_register(&parisc_rtc_driver);
+	return platform_driver_probe(&parisc_rtc_driver, parisc_rtc_probe);
 }
 
 static void __exit parisc_rtc_fini(void)
-- 
cgit v1.2.3


From d09c091b6a8b2b73381e514d68c73580f2294b03 Mon Sep 17 00:00:00 2001
From: dann frazier <dannf@hp.com>
Date: Tue, 31 Mar 2009 15:24:53 -0700
Subject: rtc-parisc: declare rtc_parisc_dev as static

Signed-off-by: dann frazier <dannf@hp.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/parisc/kernel/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 9d46c43a415..a479d08d5ed 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -216,7 +216,7 @@ void __init start_cpu_itimer(void)
 	per_cpu(cpu_data, cpu).it_value = next_tick;
 }
 
-struct platform_device rtc_parisc_dev = {
+static struct platform_device rtc_parisc_dev = {
 	.name = "rtc-parisc",
 	.id = -1,
 };
-- 
cgit v1.2.3


From cd875d4767f821dabd0feb668623a42e9d48158a Mon Sep 17 00:00:00 2001
From: dann frazier <dannf@hp.com>
Date: Tue, 31 Mar 2009 15:24:54 -0700
Subject: rtc-parisc: remove unnecessary ret variable

Signed-off-by: dann frazier <dannf@hp.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/parisc/kernel/time.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index a479d08d5ed..e75cae6072c 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -223,10 +223,7 @@ static struct platform_device rtc_parisc_dev = {
 
 static int __init rtc_init(void)
 {
-	int ret;
-
-	ret = platform_device_register(&rtc_parisc_dev);
-	if (ret < 0)
+	if (platform_device_register(&rtc_parisc_dev) < 0)
 		printk(KERN_ERR "unable to register rtc device...\n");
 
 	/* not necessarily an error */
-- 
cgit v1.2.3


From a8c20cd3f7e2e223898c53adfb74420db5d9ac47 Mon Sep 17 00:00:00 2001
From: dann frazier <dannf@hp.com>
Date: Tue, 31 Mar 2009 15:24:55 -0700
Subject: rtc-parisc: remove a couple unnecessary variables

Signed-off-by: dann frazier <dannf@hp.com>
Reviewed-by: Grant Grundler <grundler@parisc-linux.org>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-parisc.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c
index a2ca07ad42c..c29e91814c9 100644
--- a/drivers/rtc/rtc-parisc.c
+++ b/drivers/rtc/rtc-parisc.c
@@ -25,11 +25,7 @@ static int parisc_get_time(struct device *dev, struct rtc_time *tm)
 
 static int parisc_set_time(struct device *dev, struct rtc_time *tm)
 {
-	int ret;
-
-	ret = set_rtc_time(tm);
-
-	if (ret < 0)
+	if (set_rtc_time(tm) < 0)
 		return -EOPNOTSUPP;
 
 	return 0;
@@ -46,10 +42,8 @@ static int __init parisc_rtc_probe(struct platform_device *dev)
 
 	p = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops,
 				THIS_MODULE);
-	if (IS_ERR(p)) {
-		int err = PTR_ERR(p);
-		return err;
-	}
+	if (IS_ERR(p))
+		return PTR_ERR(p);
 
 	platform_set_drvdata(dev, p);
 
-- 
cgit v1.2.3


From b250c96ea9d7bc0b9ac3ff6e878b254b0b0b6abc Mon Sep 17 00:00:00 2001
From: dann frazier <dannf@hp.com>
Date: Tue, 31 Mar 2009 15:24:55 -0700
Subject: rtc-parisc: rename p pointer to rtc

Signed-off-by: dann frazier <dannf@hp.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-parisc.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c
index c29e91814c9..b966f56da97 100644
--- a/drivers/rtc/rtc-parisc.c
+++ b/drivers/rtc/rtc-parisc.c
@@ -38,23 +38,23 @@ static const struct rtc_class_ops parisc_rtc_ops = {
 
 static int __init parisc_rtc_probe(struct platform_device *dev)
 {
-	struct rtc_device *p;
+	struct rtc_device *rtc;
 
-	p = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops,
-				THIS_MODULE);
-	if (IS_ERR(p))
-		return PTR_ERR(p);
+	rtc = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops,
+				  THIS_MODULE);
+	if (IS_ERR(rtc))
+		return PTR_ERR(rtc);
 
-	platform_set_drvdata(dev, p);
+	platform_set_drvdata(dev, rtc);
 
 	return 0;
 }
 
 static int __exit parisc_rtc_remove(struct platform_device *dev)
 {
-	struct rtc_device *p = platform_get_drvdata(dev);
+	struct rtc_device *rtc = platform_get_drvdata(dev);
 
-	rtc_device_unregister(p);
+	rtc_device_unregister(rtc);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 30e7b039b1f9a6d5a4e50df5469a4f347ea1aa77 Mon Sep 17 00:00:00 2001
From: Ed Swierk <eswierk@aristanetworks.com>
Date: Tue, 31 Mar 2009 15:24:56 -0700
Subject: rtc-ds1307: true SMBus compatibility

Allow the rtc-ds1307 driver to work with SMBus controllers like nforce2
that do not support i2c block transfers.

Signed-off-by: Ed Swierk <eswierk@aristanetworks.com>
Acked-by: Jean Delvare <khali@linux-fr.org>
Acked-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: BARRE Sebastien <sbarre@sdelcc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-ds1307.c | 109 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 97 insertions(+), 12 deletions(-)

diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index 7e5155e88ac..1c975e6439b 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -94,6 +94,10 @@ struct ds1307 {
 	struct i2c_client	*client;
 	struct rtc_device	*rtc;
 	struct work_struct	work;
+	s32 (*read_block_data)(struct i2c_client *client, u8 command,
+			       u8 length, u8 *values);
+	s32 (*write_block_data)(struct i2c_client *client, u8 command,
+				u8 length, const u8 *values);
 };
 
 struct chip_desc {
@@ -132,6 +136,79 @@ MODULE_DEVICE_TABLE(i2c, ds1307_id);
 
 /*----------------------------------------------------------------------*/
 
+#define BLOCK_DATA_MAX_TRIES 10
+
+static s32 ds1307_read_block_data_once(struct i2c_client *client, u8 command,
+				  u8 length, u8 *values)
+{
+	s32 i, data;
+
+	for (i = 0; i < length; i++) {
+		data = i2c_smbus_read_byte_data(client, command + i);
+		if (data < 0)
+			return data;
+		values[i] = data;
+	}
+	return i;
+}
+
+static s32 ds1307_read_block_data(struct i2c_client *client, u8 command,
+				  u8 length, u8 *values)
+{
+	u8 oldvalues[I2C_SMBUS_BLOCK_MAX];
+	s32 ret;
+	int tries = 0;
+
+	dev_dbg(&client->dev, "ds1307_read_block_data (length=%d)\n", length);
+	ret = ds1307_read_block_data_once(client, command, length, values);
+	if (ret < 0)
+		return ret;
+	do {
+		if (++tries > BLOCK_DATA_MAX_TRIES) {
+			dev_err(&client->dev,
+				"ds1307_read_block_data failed\n");
+			return -EIO;
+		}
+		memcpy(oldvalues, values, length);
+		ret = ds1307_read_block_data_once(client, command, length,
+						  values);
+		if (ret < 0)
+			return ret;
+	} while (memcmp(oldvalues, values, length));
+	return length;
+}
+
+static s32 ds1307_write_block_data(struct i2c_client *client, u8 command,
+				   u8 length, const u8 *values)
+{
+	u8 currvalues[I2C_SMBUS_BLOCK_MAX];
+	int tries = 0;
+
+	dev_dbg(&client->dev, "ds1307_write_block_data (length=%d)\n", length);
+	do {
+		s32 i, ret;
+
+		if (++tries > BLOCK_DATA_MAX_TRIES) {
+			dev_err(&client->dev,
+				"ds1307_write_block_data failed\n");
+			return -EIO;
+		}
+		for (i = 0; i < length; i++) {
+			ret = i2c_smbus_write_byte_data(client, command + i,
+							values[i]);
+			if (ret < 0)
+				return ret;
+		}
+		ret = ds1307_read_block_data_once(client, command, length,
+						  currvalues);
+		if (ret < 0)
+			return ret;
+	} while (memcmp(currvalues, values, length));
+	return length;
+}
+
+/*----------------------------------------------------------------------*/
+
 /*
  * The IRQ logic includes a "real" handler running in IRQ context just
  * long enough to schedule this workqueue entry.   We need a task context
@@ -202,7 +279,7 @@ static int ds1307_get_time(struct device *dev, struct rtc_time *t)
 	int		tmp;
 
 	/* read the RTC date and time registers all at once */
-	tmp = i2c_smbus_read_i2c_block_data(ds1307->client,
+	tmp = ds1307->read_block_data(ds1307->client,
 		DS1307_REG_SECS, 7, ds1307->regs);
 	if (tmp != 7) {
 		dev_err(dev, "%s error %d\n", "read", tmp);
@@ -279,7 +356,7 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t)
 		"write", buf[0], buf[1], buf[2], buf[3],
 		buf[4], buf[5], buf[6]);
 
-	result = i2c_smbus_write_i2c_block_data(ds1307->client, 0, 7, buf);
+	result = ds1307->write_block_data(ds1307->client, 0, 7, buf);
 	if (result < 0) {
 		dev_err(dev, "%s error %d\n", "write", result);
 		return result;
@@ -297,7 +374,7 @@ static int ds1337_read_alarm(struct device *dev, struct rtc_wkalrm *t)
 		return -EINVAL;
 
 	/* read all ALARM1, ALARM2, and status registers at once */
-	ret = i2c_smbus_read_i2c_block_data(client,
+	ret = ds1307->read_block_data(client,
 			DS1339_REG_ALARM1_SECS, 9, ds1307->regs);
 	if (ret != 9) {
 		dev_err(dev, "%s error %d\n", "alarm read", ret);
@@ -356,7 +433,7 @@ static int ds1337_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 		t->enabled, t->pending);
 
 	/* read current status of both alarms and the chip */
-	ret = i2c_smbus_read_i2c_block_data(client,
+	ret = ds1307->read_block_data(client,
 			DS1339_REG_ALARM1_SECS, 9, buf);
 	if (ret != 9) {
 		dev_err(dev, "%s error %d\n", "alarm write", ret);
@@ -391,7 +468,7 @@ static int ds1337_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 	}
 	buf[8] = status & ~(DS1337_BIT_A1I | DS1337_BIT_A2I);
 
-	ret = i2c_smbus_write_i2c_block_data(client,
+	ret = ds1307->write_block_data(client,
 			DS1339_REG_ALARM1_SECS, 9, buf);
 	if (ret < 0) {
 		dev_err(dev, "can't set alarm time\n");
@@ -479,7 +556,7 @@ ds1307_nvram_read(struct kobject *kobj, struct bin_attribute *attr,
 	if (unlikely(!count))
 		return count;
 
-	result = i2c_smbus_read_i2c_block_data(client, 8 + off, count, buf);
+	result = ds1307->read_block_data(client, 8 + off, count, buf);
 	if (result < 0)
 		dev_err(&client->dev, "%s error %d\n", "nvram read", result);
 	return result;
@@ -490,9 +567,11 @@ ds1307_nvram_write(struct kobject *kobj, struct bin_attribute *attr,
 		char *buf, loff_t off, size_t count)
 {
 	struct i2c_client	*client;
+	struct ds1307		*ds1307;
 	int			result;
 
 	client = kobj_to_i2c_client(kobj);
+	ds1307 = i2c_get_clientdata(client);
 
 	if (unlikely(off >= NVRAM_SIZE))
 		return -EFBIG;
@@ -501,7 +580,7 @@ ds1307_nvram_write(struct kobject *kobj, struct bin_attribute *attr,
 	if (unlikely(!count))
 		return count;
 
-	result = i2c_smbus_write_i2c_block_data(client, 8 + off, count, buf);
+	result = ds1307->write_block_data(client, 8 + off, count, buf);
 	if (result < 0) {
 		dev_err(&client->dev, "%s error %d\n", "nvram write", result);
 		return result;
@@ -535,9 +614,8 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 	int			want_irq = false;
 	unsigned char		*buf;
 
-	if (!i2c_check_functionality(adapter,
-			I2C_FUNC_SMBUS_WRITE_BYTE_DATA |
-			I2C_FUNC_SMBUS_I2C_BLOCK))
+	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA)
+	    && !i2c_check_functionality(adapter, I2C_FUNC_SMBUS_I2C_BLOCK))
 		return -EIO;
 
 	if (!(ds1307 = kzalloc(sizeof(struct ds1307), GFP_KERNEL)))
@@ -547,6 +625,13 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 	i2c_set_clientdata(client, ds1307);
 	ds1307->type = id->driver_data;
 	buf = ds1307->regs;
+	if (i2c_check_functionality(adapter, I2C_FUNC_SMBUS_I2C_BLOCK)) {
+		ds1307->read_block_data = i2c_smbus_read_i2c_block_data;
+		ds1307->write_block_data = i2c_smbus_write_i2c_block_data;
+	} else {
+		ds1307->read_block_data = ds1307_read_block_data;
+		ds1307->write_block_data = ds1307_write_block_data;
+	}
 
 	switch (ds1307->type) {
 	case ds_1337:
@@ -557,7 +642,7 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 			want_irq = true;
 		}
 		/* get registers that the "rtc" read below won't read... */
-		tmp = i2c_smbus_read_i2c_block_data(ds1307->client,
+		tmp = ds1307->read_block_data(ds1307->client,
 				DS1337_REG_CONTROL, 2, buf);
 		if (tmp != 2) {
 			pr_debug("read error %d\n", tmp);
@@ -595,7 +680,7 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 
 read_rtc:
 	/* read RTC registers */
-	tmp = i2c_smbus_read_i2c_block_data(ds1307->client, 0, 8, buf);
+	tmp = ds1307->read_block_data(ds1307->client, 0, 8, buf);
 	if (tmp != 8) {
 		pr_debug("read error %d\n", tmp);
 		err = -EIO;
-- 
cgit v1.2.3


From a216685818a54b4f15235068b53908f954850251 Mon Sep 17 00:00:00 2001
From: Matthias Fuchs <matthias.fuchs@esd-electronics.com>
Date: Tue, 31 Mar 2009 15:24:58 -0700
Subject: rtc: add EPSON RX8025 support to DS1307 RTC driver

Add support for the EPSON RX8025 RTC.  The date/time registers of this
chip are compatible with the DS1307.

Signed-off-by: Matthias Fuchs <matthias.fuchs@esd-electronics.com>
Signed-off-by: Alessandro Zummo <a.zummo@towertech.it>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/Kconfig      |  7 +++--
 drivers/rtc/rtc-ds1307.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index d669b916927..09d5cd33a3f 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -129,13 +129,14 @@ comment "I2C RTC drivers"
 if I2C
 
 config RTC_DRV_DS1307
-	tristate "Dallas/Maxim DS1307/37/38/39/40, ST M41T00"
+	tristate "Dallas/Maxim DS1307/37/38/39/40, ST M41T00, EPSON RX-8025"
 	help
 	  If you say yes here you get support for various compatible RTC
 	  chips (often with battery backup) connected with I2C. This driver
 	  should handle DS1307, DS1337, DS1338, DS1339, DS1340, ST M41T00,
-	  and probably other chips. In some cases the RTC must already
-	  have been initialized (by manufacturing or a bootloader).
+	  EPSON RX-8025 and probably other chips. In some cases the RTC
+	  must already have been initialized (by manufacturing or a
+	  bootloader).
 
 	  The first seven registers on these chips hold an RTC, and other
 	  registers may add features such as NVRAM, a trickle charger for
diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index 1c975e6439b..2c4a65302a9 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -3,6 +3,7 @@
  *
  *  Copyright (C) 2005 James Chapman (ds1337 core)
  *  Copyright (C) 2006 David Brownell
+ *  Copyright (C) 2009 Matthias Fuchs (rx8025 support)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -31,6 +32,7 @@ enum ds_type {
 	ds_1339,
 	ds_1340,
 	m41t00,
+	rx_8025,
 	// rs5c372 too?  different address...
 };
 
@@ -83,6 +85,12 @@ enum ds_type {
 #define DS1339_REG_ALARM1_SECS	0x07
 #define DS1339_REG_TRICKLE	0x10
 
+#define RX8025_REG_CTRL1	0x0e
+#	define RX8025_BIT_2412		0x20
+#define RX8025_REG_CTRL2	0x0f
+#	define RX8025_BIT_PON		0x10
+#	define RX8025_BIT_VDET		0x40
+#	define RX8025_BIT_XST		0x20
 
 
 struct ds1307 {
@@ -121,6 +129,8 @@ static const struct chip_desc chips[] = {
 [ds_1340] = {
 },
 [m41t00] = {
+},
+[rx_8025] = {
 }, };
 
 static const struct i2c_device_id ds1307_id[] = {
@@ -130,6 +140,7 @@ static const struct i2c_device_id ds1307_id[] = {
 	{ "ds1339", ds_1339 },
 	{ "ds1340", ds_1340 },
 	{ "m41t00", m41t00 },
+	{ "rx8025", rx_8025 },
 	{ }
 };
 MODULE_DEVICE_TABLE(i2c, ds1307_id);
@@ -674,6 +685,72 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 			dev_warn(&client->dev, "SET TIME!\n");
 		}
 		break;
+
+	case rx_8025:
+		tmp = i2c_smbus_read_i2c_block_data(ds1307->client,
+				RX8025_REG_CTRL1 << 4 | 0x08, 2, buf);
+		if (tmp != 2) {
+			pr_debug("read error %d\n", tmp);
+			err = -EIO;
+			goto exit_free;
+		}
+
+		/* oscillator off?  turn it on, so clock can tick. */
+		if (!(ds1307->regs[1] & RX8025_BIT_XST)) {
+			ds1307->regs[1] |= RX8025_BIT_XST;
+			i2c_smbus_write_byte_data(client,
+						  RX8025_REG_CTRL2 << 4 | 0x08,
+						  ds1307->regs[1]);
+			dev_warn(&client->dev,
+				 "oscillator stop detected - SET TIME!\n");
+		}
+
+		if (ds1307->regs[1] & RX8025_BIT_PON) {
+			ds1307->regs[1] &= ~RX8025_BIT_PON;
+			i2c_smbus_write_byte_data(client,
+						  RX8025_REG_CTRL2 << 4 | 0x08,
+						  ds1307->regs[1]);
+			dev_warn(&client->dev, "power-on detected\n");
+		}
+
+		if (ds1307->regs[1] & RX8025_BIT_VDET) {
+			ds1307->regs[1] &= ~RX8025_BIT_VDET;
+			i2c_smbus_write_byte_data(client,
+						  RX8025_REG_CTRL2 << 4 | 0x08,
+						  ds1307->regs[1]);
+			dev_warn(&client->dev, "voltage drop detected\n");
+		}
+
+		/* make sure we are running in 24hour mode */
+		if (!(ds1307->regs[0] & RX8025_BIT_2412)) {
+			u8 hour;
+
+			/* switch to 24 hour mode */
+			i2c_smbus_write_byte_data(client,
+						  RX8025_REG_CTRL1 << 4 | 0x08,
+						  ds1307->regs[0] |
+						  RX8025_BIT_2412);
+
+			tmp = i2c_smbus_read_i2c_block_data(ds1307->client,
+					RX8025_REG_CTRL1 << 4 | 0x08, 2, buf);
+			if (tmp != 2) {
+				pr_debug("read error %d\n", tmp);
+				err = -EIO;
+				goto exit_free;
+			}
+
+			/* correct hour */
+			hour = bcd2bin(ds1307->regs[DS1307_REG_HOUR]);
+			if (hour == 12)
+				hour = 0;
+			if (ds1307->regs[DS1307_REG_HOUR] & DS1307_BIT_PM)
+				hour += 12;
+
+			i2c_smbus_write_byte_data(client,
+						  DS1307_REG_HOUR << 4 | 0x08,
+						  hour);
+		}
+		break;
 	default:
 		break;
 	}
@@ -734,6 +811,7 @@ read_rtc:
 			dev_warn(&client->dev, "SET TIME!\n");
 		}
 		break;
+	case rx_8025:
 	case ds_1337:
 	case ds_1339:
 		break;
@@ -747,6 +825,8 @@ read_rtc:
 		 * systems that will run through year 2100.
 		 */
 		break;
+	case rx_8025:
+		break;
 	default:
 		if (!(tmp & DS1307_BIT_12HR))
 			break;
-- 
cgit v1.2.3


From 62da659a7057f7227a99a42eea6aa606b09c1e8c Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 31 Mar 2009 15:24:59 -0700
Subject: rtc-wm8350: retries will reach -1

With a postfix decrement retries will reach -1 rather than 0, so the
warning and error-out will not occur.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-wm8350.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c
index 616630d1e31..c91edc572eb 100644
--- a/drivers/rtc/rtc-wm8350.c
+++ b/drivers/rtc/rtc-wm8350.c
@@ -122,7 +122,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm)
 	do {
 		rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL);
 		schedule_timeout_uninterruptible(msecs_to_jiffies(1));
-	} while (retries-- && !(rtc_ctrl & WM8350_RTC_STS));
+	} while (--retries && !(rtc_ctrl & WM8350_RTC_STS));
 
 	if (!retries) {
 		dev_err(dev, "timed out on set confirmation\n");
@@ -437,7 +437,7 @@ static int wm8350_rtc_probe(struct platform_device *pdev)
 		do {
 			timectl = wm8350_reg_read(wm8350,
 						  WM8350_RTC_TIME_CONTROL);
-		} while (timectl & WM8350_RTC_STS && retries--);
+		} while (timectl & WM8350_RTC_STS && --retries);
 
 		if (retries == 0) {
 			dev_err(&pdev->dev, "failed to start: timeout\n");
-- 
cgit v1.2.3


From c08cf9daf66844c60ebe9f89885d3a3e1893e61f Mon Sep 17 00:00:00 2001
From: Mike Rapoport <mike@compulab.co.il>
Date: Tue, 31 Mar 2009 15:24:59 -0700
Subject: rtc-v3020: coding style cleanup

Signed-off-by: Mike Rapoport <mike@compulab.co.il>
Acked-by: Alessandro Zummo <a.zummo@towertech.it>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-v3020.c | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)

diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c
index 14d4f036a76..66955cc9c74 100644
--- a/drivers/rtc/rtc-v3020.c
+++ b/drivers/rtc/rtc-v3020.c
@@ -28,7 +28,7 @@
 #include <linux/rtc-v3020.h>
 #include <linux/delay.h>
 
-#include <asm/io.h>
+#include <linux/io.h>
 
 #undef DEBUG
 
@@ -63,7 +63,7 @@ static void v3020_set_reg(struct v3020 *chip, unsigned char address,
 
 static unsigned char v3020_get_reg(struct v3020 *chip, unsigned char address)
 {
-	unsigned int data=0;
+	unsigned int data = 0;
 	int i;
 
 	for (i = 0; i < 4; i++) {
@@ -106,16 +106,14 @@ static int v3020_read_time(struct device *dev, struct rtc_time *dt)
 	tmp = v3020_get_reg(chip, V3020_YEAR);
 	dt->tm_year = bcd2bin(tmp)+100;
 
-#ifdef DEBUG
-	printk("\n%s : Read RTC values\n",__func__);
-	printk("tm_hour: %i\n",dt->tm_hour);
-	printk("tm_min : %i\n",dt->tm_min);
-	printk("tm_sec : %i\n",dt->tm_sec);
-	printk("tm_year: %i\n",dt->tm_year);
-	printk("tm_mon : %i\n",dt->tm_mon);
-	printk("tm_mday: %i\n",dt->tm_mday);
-	printk("tm_wday: %i\n",dt->tm_wday);
-#endif
+	dev_dbg(dev, "\n%s : Read RTC values\n", __func__);
+	dev_dbg(dev, "tm_hour: %i\n", dt->tm_hour);
+	dev_dbg(dev, "tm_min : %i\n", dt->tm_min);
+	dev_dbg(dev, "tm_sec : %i\n", dt->tm_sec);
+	dev_dbg(dev, "tm_year: %i\n", dt->tm_year);
+	dev_dbg(dev, "tm_mon : %i\n", dt->tm_mon);
+	dev_dbg(dev, "tm_mday: %i\n", dt->tm_mday);
+	dev_dbg(dev, "tm_wday: %i\n", dt->tm_wday);
 
 	return 0;
 }
@@ -125,15 +123,13 @@ static int v3020_set_time(struct device *dev, struct rtc_time *dt)
 {
 	struct v3020 *chip = dev_get_drvdata(dev);
 
-#ifdef DEBUG
-	printk("\n%s : Setting RTC values\n",__func__);
-	printk("tm_sec : %i\n",dt->tm_sec);
-	printk("tm_min : %i\n",dt->tm_min);
-	printk("tm_hour: %i\n",dt->tm_hour);
-	printk("tm_mday: %i\n",dt->tm_mday);
-	printk("tm_wday: %i\n",dt->tm_wday);
-	printk("tm_year: %i\n",dt->tm_year);
-#endif
+	dev_dbg(dev, "\n%s : Setting RTC values\n", __func__);
+	dev_dbg(dev, "tm_sec : %i\n", dt->tm_sec);
+	dev_dbg(dev, "tm_min : %i\n", dt->tm_min);
+	dev_dbg(dev, "tm_hour: %i\n", dt->tm_hour);
+	dev_dbg(dev, "tm_mday: %i\n", dt->tm_mday);
+	dev_dbg(dev, "tm_wday: %i\n", dt->tm_wday);
+	dev_dbg(dev, "tm_year: %i\n", dt->tm_year);
 
 	/* Write all the values to ram... */
 	v3020_set_reg(chip, V3020_SECONDS, 	bin2bcd(dt->tm_sec));
@@ -191,7 +187,7 @@ static int rtc_probe(struct platform_device *pdev)
 	/* Test chip by doing a write/read sequence
 	 * to the chip ram */
 	v3020_set_reg(chip, V3020_SECONDS, 0x33);
-	if(v3020_get_reg(chip, V3020_SECONDS) != 0x33) {
+	if (v3020_get_reg(chip, V3020_SECONDS) != 0x33) {
 		retval = -ENODEV;
 		goto err_io;
 	}
-- 
cgit v1.2.3


From fa7af8b1bb6dfca7a0c8541683a9bfffbc8dd345 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 31 Mar 2009 15:25:00 -0700
Subject: rtc: test before subtraction on unsigned

new_alarm is unsigned so test before the subtraction.

[akpm@linux-foundation.org: time-wrapping fix]
Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: David Brownell <david-b@pacbell.net>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-ds1374.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/rtc/rtc-ds1374.c b/drivers/rtc/rtc-ds1374.c
index a5b0fc09f0c..4d32e328f6c 100644
--- a/drivers/rtc/rtc-ds1374.c
+++ b/drivers/rtc/rtc-ds1374.c
@@ -222,16 +222,16 @@ static int ds1374_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
 	rtc_tm_to_time(&alarm->time, &new_alarm);
 	rtc_tm_to_time(&now, &itime);
 
-	new_alarm -= itime;
-
 	/* This can happen due to races, in addition to dates that are
 	 * truly in the past.  To avoid requiring the caller to check for
 	 * races, dates in the past are assumed to be in the recent past
 	 * (i.e. not something that we'd rather the caller know about via
 	 * an error), and the alarm is set to go off as soon as possible.
 	 */
-	if (new_alarm <= 0)
+	if (time_before_eq(new_alarm, itime))
 		new_alarm = 1;
+	else
+		new_alarm -= itime;
 
 	mutex_lock(&ds1374->mutex);
 
-- 
cgit v1.2.3


From 6e6fe42227e23a379d3c70f6ff257131399e4075 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 31 Mar 2009 15:25:01 -0700
Subject: drivers/video/uvesafb.c: don't use gfp_any()

GFP_KERNEL is legal here - we don't need to use gfp_any().

Cc: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/uvesafb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/video/uvesafb.c b/drivers/video/uvesafb.c
index 74ae7589900..398fd25e081 100644
--- a/drivers/video/uvesafb.c
+++ b/drivers/video/uvesafb.c
@@ -189,7 +189,7 @@ static int uvesafb_exec(struct uvesafb_ktask *task)
 	uvfb_tasks[seq] = task;
 	mutex_unlock(&uvfb_lock);
 
-	err = cn_netlink_send(m, 0, gfp_any());
+	err = cn_netlink_send(m, 0, GFP_KERNEL);
 	if (err == -ESRCH) {
 		/*
 		 * Try to start the userspace helper if sending
-- 
cgit v1.2.3


From d5cb78feee7b6631f578e12bda1e86eea7923637 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 31 Mar 2009 15:25:02 -0700
Subject: atyfb: fix header file trailing whitespace

Fix trailing whitespace because quilt complained about it.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/video/aty128.h |   2 +-
 include/video/radeon.h | 564 ++++++++++++++++++++++++-------------------------
 2 files changed, 283 insertions(+), 283 deletions(-)

diff --git a/include/video/aty128.h b/include/video/aty128.h
index 51ac69f05bd..f0851e3bb7c 100644
--- a/include/video/aty128.h
+++ b/include/video/aty128.h
@@ -415,7 +415,7 @@
 #define PWR_MGT_SLOWDOWN_MCLK			0x00002000
 
 #define PMI_PMSCR_REG				0x60
-                                                                                
+
 /* used by ATI bug fix for hardware ROM */
 #define RAGE128_MPP_TB_CONFIG                   0x01c0
 
diff --git a/include/video/radeon.h b/include/video/radeon.h
index e072b16b39a..56b188abfb5 100644
--- a/include/video/radeon.h
+++ b/include/video/radeon.h
@@ -5,12 +5,12 @@
 #define RADEON_REGSIZE			0x4000
 
 
-#define MM_INDEX                               0x0000  
-#define MM_DATA                                0x0004  
-#define BUS_CNTL                               0x0030  
-#define HI_STAT                                0x004C  
+#define MM_INDEX                               0x0000
+#define MM_DATA                                0x0004
+#define BUS_CNTL                               0x0030
+#define HI_STAT                                0x004C
 #define BUS_CNTL1                              0x0034
-#define I2C_CNTL_1			       0x0094  
+#define I2C_CNTL_1			       0x0094
 #define CNFG_CNTL                              0x00E0
 #define CNFG_MEMSIZE                           0x00F8
 #define CNFG_APER_0_BASE                       0x0100
@@ -18,8 +18,8 @@
 #define CNFG_APER_SIZE                         0x0108
 #define CNFG_REG_1_BASE                        0x010C
 #define CNFG_REG_APER_SIZE                     0x0110
-#define PAD_AGPINPUT_DELAY                     0x0164  
-#define PAD_CTLR_STRENGTH                      0x0168  
+#define PAD_AGPINPUT_DELAY                     0x0164
+#define PAD_CTLR_STRENGTH                      0x0168
 #define PAD_CTLR_UPDATE                        0x016C
 #define PAD_CTLR_MISC                          0x0aa0
 #define AGP_CNTL                               0x0174
@@ -27,171 +27,171 @@
 #define CAP0_TRIG_CNTL			       0x0950
 #define CAP1_TRIG_CNTL		               0x09c0
 #define VIPH_CONTROL			       0x0C40
-#define VENDOR_ID                              0x0F00  
-#define DEVICE_ID                              0x0F02  
-#define COMMAND                                0x0F04  
-#define STATUS                                 0x0F06  
-#define REVISION_ID                            0x0F08  
-#define REGPROG_INF                            0x0F09  
-#define SUB_CLASS                              0x0F0A  
-#define BASE_CODE                              0x0F0B  
-#define CACHE_LINE                             0x0F0C  
-#define LATENCY                                0x0F0D  
-#define HEADER                                 0x0F0E  
-#define BIST                                   0x0F0F  
-#define REG_MEM_BASE                           0x0F10  
-#define REG_IO_BASE                            0x0F14  
+#define VENDOR_ID                              0x0F00
+#define DEVICE_ID                              0x0F02
+#define COMMAND                                0x0F04
+#define STATUS                                 0x0F06
+#define REVISION_ID                            0x0F08
+#define REGPROG_INF                            0x0F09
+#define SUB_CLASS                              0x0F0A
+#define BASE_CODE                              0x0F0B
+#define CACHE_LINE                             0x0F0C
+#define LATENCY                                0x0F0D
+#define HEADER                                 0x0F0E
+#define BIST                                   0x0F0F
+#define REG_MEM_BASE                           0x0F10
+#define REG_IO_BASE                            0x0F14
 #define REG_REG_BASE                           0x0F18
 #define ADAPTER_ID                             0x0F2C
 #define BIOS_ROM                               0x0F30
-#define CAPABILITIES_PTR                       0x0F34  
-#define INTERRUPT_LINE                         0x0F3C  
-#define INTERRUPT_PIN                          0x0F3D  
-#define MIN_GRANT                              0x0F3E  
-#define MAX_LATENCY                            0x0F3F  
-#define ADAPTER_ID_W                           0x0F4C  
-#define PMI_CAP_ID                             0x0F50  
-#define PMI_NXT_CAP_PTR                        0x0F51  
-#define PMI_PMC_REG                            0x0F52  
-#define PM_STATUS                              0x0F54  
-#define PMI_DATA                               0x0F57  
-#define AGP_CAP_ID                             0x0F58  
-#define AGP_STATUS                             0x0F5C  
-#define AGP_COMMAND                            0x0F60  
+#define CAPABILITIES_PTR                       0x0F34
+#define INTERRUPT_LINE                         0x0F3C
+#define INTERRUPT_PIN                          0x0F3D
+#define MIN_GRANT                              0x0F3E
+#define MAX_LATENCY                            0x0F3F
+#define ADAPTER_ID_W                           0x0F4C
+#define PMI_CAP_ID                             0x0F50
+#define PMI_NXT_CAP_PTR                        0x0F51
+#define PMI_PMC_REG                            0x0F52
+#define PM_STATUS                              0x0F54
+#define PMI_DATA                               0x0F57
+#define AGP_CAP_ID                             0x0F58
+#define AGP_STATUS                             0x0F5C
+#define AGP_COMMAND                            0x0F60
 #define AIC_CTRL                               0x01D0
 #define AIC_STAT                               0x01D4
 #define AIC_PT_BASE                            0x01D8
-#define AIC_LO_ADDR                            0x01DC  
-#define AIC_HI_ADDR                            0x01E0  
-#define AIC_TLB_ADDR                           0x01E4  
-#define AIC_TLB_DATA                           0x01E8  
-#define DAC_CNTL                               0x0058  
+#define AIC_LO_ADDR                            0x01DC
+#define AIC_HI_ADDR                            0x01E0
+#define AIC_TLB_ADDR                           0x01E4
+#define AIC_TLB_DATA                           0x01E8
+#define DAC_CNTL                               0x0058
 #define DAC_CNTL2                              0x007c
-#define CRTC_GEN_CNTL                          0x0050  
-#define MEM_CNTL                               0x0140  
+#define CRTC_GEN_CNTL                          0x0050
+#define MEM_CNTL                               0x0140
 #define MC_CNTL                                0x0140
-#define EXT_MEM_CNTL                           0x0144  
+#define EXT_MEM_CNTL                           0x0144
 #define MC_TIMING_CNTL                         0x0144
-#define MC_AGP_LOCATION                        0x014C  
-#define MEM_IO_CNTL_A0                         0x0178  
+#define MC_AGP_LOCATION                        0x014C
+#define MEM_IO_CNTL_A0                         0x0178
 #define MEM_REFRESH_CNTL                       0x0178
-#define MEM_INIT_LATENCY_TIMER                 0x0154  
+#define MEM_INIT_LATENCY_TIMER                 0x0154
 #define MC_INIT_GFX_LAT_TIMER                  0x0154
-#define MEM_SDRAM_MODE_REG                     0x0158  
-#define AGP_BASE                               0x0170  
-#define MEM_IO_CNTL_A1                         0x017C  
+#define MEM_SDRAM_MODE_REG                     0x0158
+#define AGP_BASE                               0x0170
+#define MEM_IO_CNTL_A1                         0x017C
 #define MC_READ_CNTL_AB                        0x017C
 #define MEM_IO_CNTL_B0                         0x0180
 #define MC_INIT_MISC_LAT_TIMER                 0x0180
 #define MEM_IO_CNTL_B1                         0x0184
 #define MC_IOPAD_CNTL                          0x0184
 #define MC_DEBUG                               0x0188
-#define MC_STATUS                              0x0150  
-#define MEM_IO_OE_CNTL                         0x018C  
+#define MC_STATUS                              0x0150
+#define MEM_IO_OE_CNTL                         0x018C
 #define MC_CHIP_IO_OE_CNTL_AB                  0x018C
-#define MC_FB_LOCATION                         0x0148  
-#define HOST_PATH_CNTL                         0x0130  
-#define MEM_VGA_WP_SEL                         0x0038  
-#define MEM_VGA_RP_SEL                         0x003C  
-#define HDP_DEBUG                              0x0138  
+#define MC_FB_LOCATION                         0x0148
+#define HOST_PATH_CNTL                         0x0130
+#define MEM_VGA_WP_SEL                         0x0038
+#define MEM_VGA_RP_SEL                         0x003C
+#define HDP_DEBUG                              0x0138
 #define SW_SEMAPHORE                           0x013C
-#define CRTC2_GEN_CNTL                         0x03f8  
+#define CRTC2_GEN_CNTL                         0x03f8
 #define CRTC2_DISPLAY_BASE_ADDR                0x033c
-#define SURFACE_CNTL                           0x0B00  
-#define SURFACE0_LOWER_BOUND                   0x0B04  
-#define SURFACE1_LOWER_BOUND                   0x0B14  
-#define SURFACE2_LOWER_BOUND                   0x0B24  
-#define SURFACE3_LOWER_BOUND                   0x0B34  
-#define SURFACE4_LOWER_BOUND                   0x0B44  
+#define SURFACE_CNTL                           0x0B00
+#define SURFACE0_LOWER_BOUND                   0x0B04
+#define SURFACE1_LOWER_BOUND                   0x0B14
+#define SURFACE2_LOWER_BOUND                   0x0B24
+#define SURFACE3_LOWER_BOUND                   0x0B34
+#define SURFACE4_LOWER_BOUND                   0x0B44
 #define SURFACE5_LOWER_BOUND                   0x0B54
 #define SURFACE6_LOWER_BOUND                   0x0B64
 #define SURFACE7_LOWER_BOUND                   0x0B74
-#define SURFACE0_UPPER_BOUND                   0x0B08  
-#define SURFACE1_UPPER_BOUND                   0x0B18  
-#define SURFACE2_UPPER_BOUND                   0x0B28  
-#define SURFACE3_UPPER_BOUND                   0x0B38  
-#define SURFACE4_UPPER_BOUND                   0x0B48  
-#define SURFACE5_UPPER_BOUND                   0x0B58  
-#define SURFACE6_UPPER_BOUND                   0x0B68  
-#define SURFACE7_UPPER_BOUND                   0x0B78  
-#define SURFACE0_INFO                          0x0B0C  
-#define SURFACE1_INFO                          0x0B1C  
-#define SURFACE2_INFO                          0x0B2C  
-#define SURFACE3_INFO                          0x0B3C  
-#define SURFACE4_INFO                          0x0B4C  
-#define SURFACE5_INFO                          0x0B5C  
+#define SURFACE0_UPPER_BOUND                   0x0B08
+#define SURFACE1_UPPER_BOUND                   0x0B18
+#define SURFACE2_UPPER_BOUND                   0x0B28
+#define SURFACE3_UPPER_BOUND                   0x0B38
+#define SURFACE4_UPPER_BOUND                   0x0B48
+#define SURFACE5_UPPER_BOUND                   0x0B58
+#define SURFACE6_UPPER_BOUND                   0x0B68
+#define SURFACE7_UPPER_BOUND                   0x0B78
+#define SURFACE0_INFO                          0x0B0C
+#define SURFACE1_INFO                          0x0B1C
+#define SURFACE2_INFO                          0x0B2C
+#define SURFACE3_INFO                          0x0B3C
+#define SURFACE4_INFO                          0x0B4C
+#define SURFACE5_INFO                          0x0B5C
 #define SURFACE6_INFO                          0x0B6C
 #define SURFACE7_INFO                          0x0B7C
 #define SURFACE_ACCESS_FLAGS                   0x0BF8
-#define SURFACE_ACCESS_CLR                     0x0BFC  
-#define GEN_INT_CNTL                           0x0040  
-#define GEN_INT_STATUS                         0x0044  
+#define SURFACE_ACCESS_CLR                     0x0BFC
+#define GEN_INT_CNTL                           0x0040
+#define GEN_INT_STATUS                         0x0044
 #define CRTC_EXT_CNTL                          0x0054
-#define RB3D_CNTL			       0x1C3C  
-#define WAIT_UNTIL                             0x1720  
-#define ISYNC_CNTL                             0x1724  
-#define RBBM_GUICNTL                           0x172C  
-#define RBBM_STATUS                            0x0E40  
-#define RBBM_STATUS_alt_1                      0x1740  
-#define RBBM_CNTL                              0x00EC  
-#define RBBM_CNTL_alt_1                        0x0E44  
-#define RBBM_SOFT_RESET                        0x00F0  
-#define RBBM_SOFT_RESET_alt_1                  0x0E48  
-#define NQWAIT_UNTIL                           0x0E50  
+#define RB3D_CNTL			       0x1C3C
+#define WAIT_UNTIL                             0x1720
+#define ISYNC_CNTL                             0x1724
+#define RBBM_GUICNTL                           0x172C
+#define RBBM_STATUS                            0x0E40
+#define RBBM_STATUS_alt_1                      0x1740
+#define RBBM_CNTL                              0x00EC
+#define RBBM_CNTL_alt_1                        0x0E44
+#define RBBM_SOFT_RESET                        0x00F0
+#define RBBM_SOFT_RESET_alt_1                  0x0E48
+#define NQWAIT_UNTIL                           0x0E50
 #define RBBM_DEBUG                             0x0E6C
 #define RBBM_CMDFIFO_ADDR                      0x0E70
 #define RBBM_CMDFIFO_DATAL                     0x0E74
-#define RBBM_CMDFIFO_DATAH                     0x0E78  
-#define RBBM_CMDFIFO_STAT                      0x0E7C  
-#define CRTC_STATUS                            0x005C  
-#define GPIO_VGA_DDC                           0x0060  
-#define GPIO_DVI_DDC                           0x0064  
-#define GPIO_MONID                             0x0068  
+#define RBBM_CMDFIFO_DATAH                     0x0E78
+#define RBBM_CMDFIFO_STAT                      0x0E7C
+#define CRTC_STATUS                            0x005C
+#define GPIO_VGA_DDC                           0x0060
+#define GPIO_DVI_DDC                           0x0064
+#define GPIO_MONID                             0x0068
 #define GPIO_CRT2_DDC                          0x006c
-#define PALETTE_INDEX                          0x00B0  
-#define PALETTE_DATA                           0x00B4  
-#define PALETTE_30_DATA                        0x00B8  
-#define CRTC_H_TOTAL_DISP                      0x0200  
-#define CRTC_H_SYNC_STRT_WID                   0x0204  
-#define CRTC_V_TOTAL_DISP                      0x0208  
-#define CRTC_V_SYNC_STRT_WID                   0x020C  
-#define CRTC_VLINE_CRNT_VLINE                  0x0210  
+#define PALETTE_INDEX                          0x00B0
+#define PALETTE_DATA                           0x00B4
+#define PALETTE_30_DATA                        0x00B8
+#define CRTC_H_TOTAL_DISP                      0x0200
+#define CRTC_H_SYNC_STRT_WID                   0x0204
+#define CRTC_V_TOTAL_DISP                      0x0208
+#define CRTC_V_SYNC_STRT_WID                   0x020C
+#define CRTC_VLINE_CRNT_VLINE                  0x0210
 #define CRTC_CRNT_FRAME                        0x0214
 #define CRTC_GUI_TRIG_VLINE                    0x0218
 #define CRTC_DEBUG                             0x021C
-#define CRTC_OFFSET_RIGHT                      0x0220  
-#define CRTC_OFFSET                            0x0224  
-#define CRTC_OFFSET_CNTL                       0x0228  
-#define CRTC_PITCH                             0x022C  
-#define OVR_CLR                                0x0230  
-#define OVR_WID_LEFT_RIGHT                     0x0234  
-#define OVR_WID_TOP_BOTTOM                     0x0238  
-#define DISPLAY_BASE_ADDR                      0x023C  
-#define SNAPSHOT_VH_COUNTS                     0x0240  
-#define SNAPSHOT_F_COUNT                       0x0244  
-#define N_VIF_COUNT                            0x0248  
-#define SNAPSHOT_VIF_COUNT                     0x024C  
-#define FP_CRTC_H_TOTAL_DISP                   0x0250  
-#define FP_CRTC_V_TOTAL_DISP                   0x0254  
+#define CRTC_OFFSET_RIGHT                      0x0220
+#define CRTC_OFFSET                            0x0224
+#define CRTC_OFFSET_CNTL                       0x0228
+#define CRTC_PITCH                             0x022C
+#define OVR_CLR                                0x0230
+#define OVR_WID_LEFT_RIGHT                     0x0234
+#define OVR_WID_TOP_BOTTOM                     0x0238
+#define DISPLAY_BASE_ADDR                      0x023C
+#define SNAPSHOT_VH_COUNTS                     0x0240
+#define SNAPSHOT_F_COUNT                       0x0244
+#define N_VIF_COUNT                            0x0248
+#define SNAPSHOT_VIF_COUNT                     0x024C
+#define FP_CRTC_H_TOTAL_DISP                   0x0250
+#define FP_CRTC_V_TOTAL_DISP                   0x0254
 #define CRT_CRTC_H_SYNC_STRT_WID               0x0258
 #define CRT_CRTC_V_SYNC_STRT_WID               0x025C
 #define CUR_OFFSET                             0x0260
-#define CUR_HORZ_VERT_POSN                     0x0264  
-#define CUR_HORZ_VERT_OFF                      0x0268  
-#define CUR_CLR0                               0x026C  
-#define CUR_CLR1                               0x0270  
-#define FP_HORZ_VERT_ACTIVE                    0x0278  
-#define CRTC_MORE_CNTL                         0x027C  
+#define CUR_HORZ_VERT_POSN                     0x0264
+#define CUR_HORZ_VERT_OFF                      0x0268
+#define CUR_CLR0                               0x026C
+#define CUR_CLR1                               0x0270
+#define FP_HORZ_VERT_ACTIVE                    0x0278
+#define CRTC_MORE_CNTL                         0x027C
 #define CRTC_H_CUTOFF_ACTIVE_EN                (1<<4)
 #define CRTC_V_CUTOFF_ACTIVE_EN                (1<<5)
-#define DAC_EXT_CNTL                           0x0280  
-#define FP_GEN_CNTL                            0x0284  
-#define FP_HORZ_STRETCH                        0x028C  
-#define FP_VERT_STRETCH                        0x0290  
-#define FP_H_SYNC_STRT_WID                     0x02C4  
-#define FP_V_SYNC_STRT_WID                     0x02C8  
-#define AUX_WINDOW_HORZ_CNTL                   0x02D8  
-#define AUX_WINDOW_VERT_CNTL                   0x02DC  
+#define DAC_EXT_CNTL                           0x0280
+#define FP_GEN_CNTL                            0x0284
+#define FP_HORZ_STRETCH                        0x028C
+#define FP_VERT_STRETCH                        0x0290
+#define FP_H_SYNC_STRT_WID                     0x02C4
+#define FP_V_SYNC_STRT_WID                     0x02C8
+#define AUX_WINDOW_HORZ_CNTL                   0x02D8
+#define AUX_WINDOW_VERT_CNTL                   0x02DC
 //#define DDA_CONFIG			       0x02e0
 //#define DDA_ON_OFF			       0x02e4
 #define DVI_I2C_CNTL_1			       0x02e4
@@ -199,192 +199,192 @@
 #define GRPH2_BUFFER_CNTL                      0x03F0
 #define VGA_BUFFER_CNTL                        0x02F4
 #define OV0_Y_X_START                          0x0400
-#define OV0_Y_X_END                            0x0404  
-#define OV0_PIPELINE_CNTL                      0x0408  
-#define OV0_REG_LOAD_CNTL                      0x0410  
-#define OV0_SCALE_CNTL                         0x0420  
-#define OV0_V_INC                              0x0424  
-#define OV0_P1_V_ACCUM_INIT                    0x0428  
-#define OV0_P23_V_ACCUM_INIT                   0x042C  
-#define OV0_P1_BLANK_LINES_AT_TOP              0x0430  
-#define OV0_P23_BLANK_LINES_AT_TOP             0x0434  
-#define OV0_BASE_ADDR                          0x043C  
-#define OV0_VID_BUF0_BASE_ADRS                 0x0440  
-#define OV0_VID_BUF1_BASE_ADRS                 0x0444  
-#define OV0_VID_BUF2_BASE_ADRS                 0x0448  
-#define OV0_VID_BUF3_BASE_ADRS                 0x044C  
+#define OV0_Y_X_END                            0x0404
+#define OV0_PIPELINE_CNTL                      0x0408
+#define OV0_REG_LOAD_CNTL                      0x0410
+#define OV0_SCALE_CNTL                         0x0420
+#define OV0_V_INC                              0x0424
+#define OV0_P1_V_ACCUM_INIT                    0x0428
+#define OV0_P23_V_ACCUM_INIT                   0x042C
+#define OV0_P1_BLANK_LINES_AT_TOP              0x0430
+#define OV0_P23_BLANK_LINES_AT_TOP             0x0434
+#define OV0_BASE_ADDR                          0x043C
+#define OV0_VID_BUF0_BASE_ADRS                 0x0440
+#define OV0_VID_BUF1_BASE_ADRS                 0x0444
+#define OV0_VID_BUF2_BASE_ADRS                 0x0448
+#define OV0_VID_BUF3_BASE_ADRS                 0x044C
 #define OV0_VID_BUF4_BASE_ADRS                 0x0450
 #define OV0_VID_BUF5_BASE_ADRS                 0x0454
 #define OV0_VID_BUF_PITCH0_VALUE               0x0460
-#define OV0_VID_BUF_PITCH1_VALUE               0x0464  
-#define OV0_AUTO_FLIP_CNTRL                    0x0470  
-#define OV0_DEINTERLACE_PATTERN                0x0474  
-#define OV0_SUBMIT_HISTORY                     0x0478  
-#define OV0_H_INC                              0x0480  
-#define OV0_STEP_BY                            0x0484  
-#define OV0_P1_H_ACCUM_INIT                    0x0488  
-#define OV0_P23_H_ACCUM_INIT                   0x048C  
-#define OV0_P1_X_START_END                     0x0494  
-#define OV0_P2_X_START_END                     0x0498  
-#define OV0_P3_X_START_END                     0x049C  
-#define OV0_FILTER_CNTL                        0x04A0  
-#define OV0_FOUR_TAP_COEF_0                    0x04B0  
-#define OV0_FOUR_TAP_COEF_1                    0x04B4  
+#define OV0_VID_BUF_PITCH1_VALUE               0x0464
+#define OV0_AUTO_FLIP_CNTRL                    0x0470
+#define OV0_DEINTERLACE_PATTERN                0x0474
+#define OV0_SUBMIT_HISTORY                     0x0478
+#define OV0_H_INC                              0x0480
+#define OV0_STEP_BY                            0x0484
+#define OV0_P1_H_ACCUM_INIT                    0x0488
+#define OV0_P23_H_ACCUM_INIT                   0x048C
+#define OV0_P1_X_START_END                     0x0494
+#define OV0_P2_X_START_END                     0x0498
+#define OV0_P3_X_START_END                     0x049C
+#define OV0_FILTER_CNTL                        0x04A0
+#define OV0_FOUR_TAP_COEF_0                    0x04B0
+#define OV0_FOUR_TAP_COEF_1                    0x04B4
 #define OV0_FOUR_TAP_COEF_2                    0x04B8
 #define OV0_FOUR_TAP_COEF_3                    0x04BC
 #define OV0_FOUR_TAP_COEF_4                    0x04C0
-#define OV0_FLAG_CNTRL                         0x04DC  
-#define OV0_SLICE_CNTL                         0x04E0  
-#define OV0_VID_KEY_CLR_LOW                    0x04E4  
-#define OV0_VID_KEY_CLR_HIGH                   0x04E8  
-#define OV0_GRPH_KEY_CLR_LOW                   0x04EC  
-#define OV0_GRPH_KEY_CLR_HIGH                  0x04F0  
-#define OV0_KEY_CNTL                           0x04F4  
-#define OV0_TEST                               0x04F8  
-#define SUBPIC_CNTL                            0x0540  
-#define SUBPIC_DEFCOLCON                       0x0544  
-#define SUBPIC_Y_X_START                       0x054C  
-#define SUBPIC_Y_X_END                         0x0550  
-#define SUBPIC_V_INC                           0x0554  
-#define SUBPIC_H_INC                           0x0558  
+#define OV0_FLAG_CNTRL                         0x04DC
+#define OV0_SLICE_CNTL                         0x04E0
+#define OV0_VID_KEY_CLR_LOW                    0x04E4
+#define OV0_VID_KEY_CLR_HIGH                   0x04E8
+#define OV0_GRPH_KEY_CLR_LOW                   0x04EC
+#define OV0_GRPH_KEY_CLR_HIGH                  0x04F0
+#define OV0_KEY_CNTL                           0x04F4
+#define OV0_TEST                               0x04F8
+#define SUBPIC_CNTL                            0x0540
+#define SUBPIC_DEFCOLCON                       0x0544
+#define SUBPIC_Y_X_START                       0x054C
+#define SUBPIC_Y_X_END                         0x0550
+#define SUBPIC_V_INC                           0x0554
+#define SUBPIC_H_INC                           0x0558
 #define SUBPIC_BUF0_OFFSET                     0x055C
 #define SUBPIC_BUF1_OFFSET                     0x0560
 #define SUBPIC_LC0_OFFSET                      0x0564
-#define SUBPIC_LC1_OFFSET                      0x0568  
-#define SUBPIC_PITCH                           0x056C  
-#define SUBPIC_BTN_HLI_COLCON                  0x0570  
-#define SUBPIC_BTN_HLI_Y_X_START               0x0574  
-#define SUBPIC_BTN_HLI_Y_X_END                 0x0578  
-#define SUBPIC_PALETTE_INDEX                   0x057C  
-#define SUBPIC_PALETTE_DATA                    0x0580  
-#define SUBPIC_H_ACCUM_INIT                    0x0584  
-#define SUBPIC_V_ACCUM_INIT                    0x0588  
-#define DISP_MISC_CNTL                         0x0D00  
-#define DAC_MACRO_CNTL                         0x0D04  
-#define DISP_PWR_MAN                           0x0D08  
-#define DISP_TEST_DEBUG_CNTL                   0x0D10  
-#define DISP_HW_DEBUG                          0x0D14  
+#define SUBPIC_LC1_OFFSET                      0x0568
+#define SUBPIC_PITCH                           0x056C
+#define SUBPIC_BTN_HLI_COLCON                  0x0570
+#define SUBPIC_BTN_HLI_Y_X_START               0x0574
+#define SUBPIC_BTN_HLI_Y_X_END                 0x0578
+#define SUBPIC_PALETTE_INDEX                   0x057C
+#define SUBPIC_PALETTE_DATA                    0x0580
+#define SUBPIC_H_ACCUM_INIT                    0x0584
+#define SUBPIC_V_ACCUM_INIT                    0x0588
+#define DISP_MISC_CNTL                         0x0D00
+#define DAC_MACRO_CNTL                         0x0D04
+#define DISP_PWR_MAN                           0x0D08
+#define DISP_TEST_DEBUG_CNTL                   0x0D10
+#define DISP_HW_DEBUG                          0x0D14
 #define DAC_CRC_SIG1                           0x0D18
 #define DAC_CRC_SIG2                           0x0D1C
 #define OV0_LIN_TRANS_A                        0x0D20
-#define OV0_LIN_TRANS_B                        0x0D24  
-#define OV0_LIN_TRANS_C                        0x0D28  
-#define OV0_LIN_TRANS_D                        0x0D2C  
-#define OV0_LIN_TRANS_E                        0x0D30  
-#define OV0_LIN_TRANS_F                        0x0D34  
-#define OV0_GAMMA_0_F                          0x0D40  
-#define OV0_GAMMA_10_1F                        0x0D44  
-#define OV0_GAMMA_20_3F                        0x0D48  
-#define OV0_GAMMA_40_7F                        0x0D4C  
-#define OV0_GAMMA_380_3BF                      0x0D50  
-#define OV0_GAMMA_3C0_3FF                      0x0D54  
-#define DISP_MERGE_CNTL                        0x0D60  
-#define DISP_OUTPUT_CNTL                       0x0D64  
-#define DISP_LIN_TRANS_GRPH_A                  0x0D80  
+#define OV0_LIN_TRANS_B                        0x0D24
+#define OV0_LIN_TRANS_C                        0x0D28
+#define OV0_LIN_TRANS_D                        0x0D2C
+#define OV0_LIN_TRANS_E                        0x0D30
+#define OV0_LIN_TRANS_F                        0x0D34
+#define OV0_GAMMA_0_F                          0x0D40
+#define OV0_GAMMA_10_1F                        0x0D44
+#define OV0_GAMMA_20_3F                        0x0D48
+#define OV0_GAMMA_40_7F                        0x0D4C
+#define OV0_GAMMA_380_3BF                      0x0D50
+#define OV0_GAMMA_3C0_3FF                      0x0D54
+#define DISP_MERGE_CNTL                        0x0D60
+#define DISP_OUTPUT_CNTL                       0x0D64
+#define DISP_LIN_TRANS_GRPH_A                  0x0D80
 #define DISP_LIN_TRANS_GRPH_B                  0x0D84
 #define DISP_LIN_TRANS_GRPH_C                  0x0D88
 #define DISP_LIN_TRANS_GRPH_D                  0x0D8C
-#define DISP_LIN_TRANS_GRPH_E                  0x0D90  
-#define DISP_LIN_TRANS_GRPH_F                  0x0D94  
-#define DISP_LIN_TRANS_VID_A                   0x0D98  
-#define DISP_LIN_TRANS_VID_B                   0x0D9C  
-#define DISP_LIN_TRANS_VID_C                   0x0DA0  
-#define DISP_LIN_TRANS_VID_D                   0x0DA4  
-#define DISP_LIN_TRANS_VID_E                   0x0DA8  
-#define DISP_LIN_TRANS_VID_F                   0x0DAC  
-#define RMX_HORZ_FILTER_0TAP_COEF              0x0DB0  
-#define RMX_HORZ_FILTER_1TAP_COEF              0x0DB4  
-#define RMX_HORZ_FILTER_2TAP_COEF              0x0DB8  
-#define RMX_HORZ_PHASE                         0x0DBC  
-#define DAC_EMBEDDED_SYNC_CNTL                 0x0DC0  
-#define DAC_BROAD_PULSE                        0x0DC4  
+#define DISP_LIN_TRANS_GRPH_E                  0x0D90
+#define DISP_LIN_TRANS_GRPH_F                  0x0D94
+#define DISP_LIN_TRANS_VID_A                   0x0D98
+#define DISP_LIN_TRANS_VID_B                   0x0D9C
+#define DISP_LIN_TRANS_VID_C                   0x0DA0
+#define DISP_LIN_TRANS_VID_D                   0x0DA4
+#define DISP_LIN_TRANS_VID_E                   0x0DA8
+#define DISP_LIN_TRANS_VID_F                   0x0DAC
+#define RMX_HORZ_FILTER_0TAP_COEF              0x0DB0
+#define RMX_HORZ_FILTER_1TAP_COEF              0x0DB4
+#define RMX_HORZ_FILTER_2TAP_COEF              0x0DB8
+#define RMX_HORZ_PHASE                         0x0DBC
+#define DAC_EMBEDDED_SYNC_CNTL                 0x0DC0
+#define DAC_BROAD_PULSE                        0x0DC4
 #define DAC_SKEW_CLKS                          0x0DC8
 #define DAC_INCR                               0x0DCC
 #define DAC_NEG_SYNC_LEVEL                     0x0DD0
-#define DAC_POS_SYNC_LEVEL                     0x0DD4  
-#define DAC_BLANK_LEVEL                        0x0DD8  
-#define CLOCK_CNTL_INDEX                       0x0008  
-#define CLOCK_CNTL_DATA                        0x000C  
-#define CP_RB_CNTL                             0x0704  
-#define CP_RB_BASE                             0x0700  
-#define CP_RB_RPTR_ADDR                        0x070C  
-#define CP_RB_RPTR                             0x0710  
-#define CP_RB_WPTR                             0x0714  
-#define CP_RB_WPTR_DELAY                       0x0718  
-#define CP_IB_BASE                             0x0738  
-#define CP_IB_BUFSZ                            0x073C  
-#define SCRATCH_REG0                           0x15E0  
-#define GUI_SCRATCH_REG0                       0x15E0  
-#define SCRATCH_REG1                           0x15E4  
-#define GUI_SCRATCH_REG1                       0x15E4  
+#define DAC_POS_SYNC_LEVEL                     0x0DD4
+#define DAC_BLANK_LEVEL                        0x0DD8
+#define CLOCK_CNTL_INDEX                       0x0008
+#define CLOCK_CNTL_DATA                        0x000C
+#define CP_RB_CNTL                             0x0704
+#define CP_RB_BASE                             0x0700
+#define CP_RB_RPTR_ADDR                        0x070C
+#define CP_RB_RPTR                             0x0710
+#define CP_RB_WPTR                             0x0714
+#define CP_RB_WPTR_DELAY                       0x0718
+#define CP_IB_BASE                             0x0738
+#define CP_IB_BUFSZ                            0x073C
+#define SCRATCH_REG0                           0x15E0
+#define GUI_SCRATCH_REG0                       0x15E0
+#define SCRATCH_REG1                           0x15E4
+#define GUI_SCRATCH_REG1                       0x15E4
 #define SCRATCH_REG2                           0x15E8
 #define GUI_SCRATCH_REG2                       0x15E8
 #define SCRATCH_REG3                           0x15EC
-#define GUI_SCRATCH_REG3                       0x15EC  
-#define SCRATCH_REG4                           0x15F0  
-#define GUI_SCRATCH_REG4                       0x15F0  
-#define SCRATCH_REG5                           0x15F4  
-#define GUI_SCRATCH_REG5                       0x15F4  
-#define SCRATCH_UMSK                           0x0770  
-#define SCRATCH_ADDR                           0x0774  
-#define DP_BRUSH_FRGD_CLR                      0x147C  
+#define GUI_SCRATCH_REG3                       0x15EC
+#define SCRATCH_REG4                           0x15F0
+#define GUI_SCRATCH_REG4                       0x15F0
+#define SCRATCH_REG5                           0x15F4
+#define GUI_SCRATCH_REG5                       0x15F4
+#define SCRATCH_UMSK                           0x0770
+#define SCRATCH_ADDR                           0x0774
+#define DP_BRUSH_FRGD_CLR                      0x147C
 #define DP_BRUSH_BKGD_CLR                      0x1478
 #define DST_LINE_START                         0x1600
-#define DST_LINE_END                           0x1604  
-#define SRC_OFFSET                             0x15AC  
+#define DST_LINE_END                           0x1604
+#define SRC_OFFSET                             0x15AC
 #define SRC_PITCH                              0x15B0
 #define SRC_TILE                               0x1704
 #define SRC_PITCH_OFFSET                       0x1428
-#define SRC_X                                  0x1414  
-#define SRC_Y                                  0x1418  
-#define SRC_X_Y                                0x1590  
-#define SRC_Y_X                                0x1434  
+#define SRC_X                                  0x1414
+#define SRC_Y                                  0x1418
+#define SRC_X_Y                                0x1590
+#define SRC_Y_X                                0x1434
 #define DST_Y_X				       0x1438
 #define DST_WIDTH_HEIGHT		       0x1598
 #define DST_HEIGHT_WIDTH		       0x143c
 #define DST_OFFSET                             0x1404
-#define SRC_CLUT_ADDRESS                       0x1780  
-#define SRC_CLUT_DATA                          0x1784  
-#define SRC_CLUT_DATA_RD                       0x1788  
-#define HOST_DATA0                             0x17C0  
-#define HOST_DATA1                             0x17C4  
-#define HOST_DATA2                             0x17C8  
-#define HOST_DATA3                             0x17CC  
-#define HOST_DATA4                             0x17D0  
-#define HOST_DATA5                             0x17D4  
-#define HOST_DATA6                             0x17D8  
+#define SRC_CLUT_ADDRESS                       0x1780
+#define SRC_CLUT_DATA                          0x1784
+#define SRC_CLUT_DATA_RD                       0x1788
+#define HOST_DATA0                             0x17C0
+#define HOST_DATA1                             0x17C4
+#define HOST_DATA2                             0x17C8
+#define HOST_DATA3                             0x17CC
+#define HOST_DATA4                             0x17D0
+#define HOST_DATA5                             0x17D4
+#define HOST_DATA6                             0x17D8
 #define HOST_DATA7                             0x17DC
 #define HOST_DATA_LAST                         0x17E0
 #define DP_SRC_ENDIAN                          0x15D4
-#define DP_SRC_FRGD_CLR                        0x15D8  
-#define DP_SRC_BKGD_CLR                        0x15DC  
-#define SC_LEFT                                0x1640  
-#define SC_RIGHT                               0x1644  
-#define SC_TOP                                 0x1648  
-#define SC_BOTTOM                              0x164C  
-#define SRC_SC_RIGHT                           0x1654  
-#define SRC_SC_BOTTOM                          0x165C  
-#define DP_CNTL                                0x16C0  
-#define DP_CNTL_XDIR_YDIR_YMAJOR               0x16D0  
-#define DP_DATATYPE                            0x16C4  
-#define DP_MIX                                 0x16C8  
-#define DP_WRITE_MSK                           0x16CC  
-#define DP_XOP                                 0x17F8  
+#define DP_SRC_FRGD_CLR                        0x15D8
+#define DP_SRC_BKGD_CLR                        0x15DC
+#define SC_LEFT                                0x1640
+#define SC_RIGHT                               0x1644
+#define SC_TOP                                 0x1648
+#define SC_BOTTOM                              0x164C
+#define SRC_SC_RIGHT                           0x1654
+#define SRC_SC_BOTTOM                          0x165C
+#define DP_CNTL                                0x16C0
+#define DP_CNTL_XDIR_YDIR_YMAJOR               0x16D0
+#define DP_DATATYPE                            0x16C4
+#define DP_MIX                                 0x16C8
+#define DP_WRITE_MSK                           0x16CC
+#define DP_XOP                                 0x17F8
 #define CLR_CMP_CLR_SRC                        0x15C4
 #define CLR_CMP_CLR_DST                        0x15C8
 #define CLR_CMP_CNTL                           0x15C0
-#define CLR_CMP_MSK                            0x15CC  
-#define DSTCACHE_MODE                          0x1710  
-#define DSTCACHE_CTLSTAT                       0x1714  
-#define DEFAULT_PITCH_OFFSET                   0x16E0  
-#define DEFAULT_SC_BOTTOM_RIGHT                0x16E8  
+#define CLR_CMP_MSK                            0x15CC
+#define DSTCACHE_MODE                          0x1710
+#define DSTCACHE_CTLSTAT                       0x1714
+#define DEFAULT_PITCH_OFFSET                   0x16E0
+#define DEFAULT_SC_BOTTOM_RIGHT                0x16E8
 #define DEFAULT_SC_TOP_LEFT                    0x16EC
 #define SRC_PITCH_OFFSET                       0x1428
 #define DST_PITCH_OFFSET                       0x142C
-#define DP_GUI_MASTER_CNTL                     0x146C  
-#define SC_TOP_LEFT                            0x16EC  
-#define SC_BOTTOM_RIGHT                        0x16F0  
-#define SRC_SC_BOTTOM_RIGHT                    0x16F4  
+#define DP_GUI_MASTER_CNTL                     0x146C
+#define SC_TOP_LEFT                            0x16EC
+#define SC_BOTTOM_RIGHT                        0x16F0
+#define SRC_SC_BOTTOM_RIGHT                    0x16F4
 #define RB2D_DSTCACHE_MODE		       0x3428
 #define RB2D_DSTCACHE_CTLSTAT_broken	       0x342C /* do not use */
 #define LVDS_GEN_CNTL			       0x02d0
@@ -686,7 +686,7 @@
 #define VERT_FP_LOOP_STRETCH			   (0x7 << 28)
 #define VERT_STRETCH_RESERVED			   0xf1000000
 
-/* DAC_CNTL bit constants */   
+/* DAC_CNTL bit constants */
 #define DAC_8BIT_EN                                0x00000100
 #define DAC_4BPP_PIX_ORDER                         0x00000200
 #define DAC_CRC_EN                                 0x00080000
@@ -700,7 +700,7 @@
 #define DAC_CMP_EN                                 (1 <<  3)
 #define DAC_CMP_OUTPUT                             (1 <<  7)
 
-/* DAC_CNTL2 bit constants */   
+/* DAC_CNTL2 bit constants */
 #define DAC2_EXPAND_MODE			   (1 << 14)
 #define DAC2_CMP_EN                                (1 << 7)
 #define DAC2_PALETTE_ACCESS_CNTL                   (1 << 5)
-- 
cgit v1.2.3


From 75ed3a17a5bc0ecff5c256cfb81ed06f8a6fbb54 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@poczta.fm>
Date: Tue, 31 Mar 2009 15:25:03 -0700
Subject: cirrusfb: convert printks to dev_foo

Convert all printks to dev_dbg, dev_info or dev_err.  Kill some excessive
debug information and code in the process.

[akpm@linux-foundation.org: printk fixes]
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Krzysztof Helt <krzysztof.h1@poczta.fm>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 455 ++++++++++++++++-------------------------------
 1 file changed, 158 insertions(+), 297 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index a2aa6ddffbe..3b4b0f1e061 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -34,8 +34,6 @@
  *
  */
 
-#define CIRRUSFB_VERSION "2.0-pre2"
-
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
@@ -72,20 +70,9 @@
  *
  */
 
-/* enable debug output? */
-/* #define CIRRUSFB_DEBUG 1 */
-
 /* disable runtime assertions? */
 /* #define CIRRUSFB_NDEBUG */
 
-/* debug output */
-#ifdef CIRRUSFB_DEBUG
-#define DPRINTK(fmt, args...) \
-	printk(KERN_DEBUG "%s: " fmt, __func__ , ## args)
-#else
-#define DPRINTK(fmt, args...)
-#endif
-
 /* debugging assertions */
 #ifndef CIRRUSFB_NDEBUG
 #define assert(expr) \
@@ -150,7 +137,7 @@ static const struct cirrusfb_board_info_rec {
 		.maxclock		= {
 			/* guess */
 			/* the SD64/P4 have a higher max. videoclock */
-			140000, 140000, 140000, 140000, 140000,
+			135100, 135100, 85500, 85500, 0
 		},
 		.init_sr07		= true,
 		.init_sr1f		= true,
@@ -426,11 +413,10 @@ static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel,
 static void bestclock(long freq, int *nom, int *den, int *div);
 
 #ifdef CIRRUSFB_DEBUG
-static void cirrusfb_dump(void);
-static void cirrusfb_dbg_reg_dump(caddr_t regbase);
-static void cirrusfb_dbg_print_regs(caddr_t regbase,
+static void cirrusfb_dbg_reg_dump(struct fb_info *info, caddr_t regbase);
+static void cirrusfb_dbg_print_regs(struct fb_info *info,
+				    caddr_t regbase,
 				    enum cirrusfb_dbg_reg_class reg_class, ...);
-static void cirrusfb_dbg_print_byte(const char *name, unsigned char val);
 #endif /* CIRRUSFB_DEBUG */
 
 /*** END   PROTOTYPES ********************************************************/
@@ -460,23 +446,24 @@ static int cirrusfb_release(struct fb_info *info, int user)
 /**** BEGIN Hardware specific Routines **************************************/
 
 /* Check if the MCLK is not a better clock source */
-static int cirrusfb_check_mclk(struct cirrusfb_info *cinfo, long freq)
+static int cirrusfb_check_mclk(struct fb_info *info, long freq)
 {
+	struct cirrusfb_info *cinfo = info->par;
 	long mclk = vga_rseq(cinfo->regbase, CL_SEQR1F) & 0x3f;
 
 	/* Read MCLK value */
 	mclk = (14318 * mclk) >> 3;
-	DPRINTK("Read MCLK of %ld kHz\n", mclk);
+	dev_dbg(info->device, "Read MCLK of %ld kHz\n", mclk);
 
 	/* Determine if we should use MCLK instead of VCLK, and if so, what we
 	 * should divide it by to get VCLK
 	 */
 
 	if (abs(freq - mclk) < 250) {
-		DPRINTK("Using VCLK = MCLK\n");
+		dev_dbg(info->device, "Using VCLK = MCLK\n");
 		return 1;
 	} else if (abs(freq - (mclk / 2)) < 250) {
-		DPRINTK("Using VCLK = MCLK/2\n");
+		dev_dbg(info->device, "Using VCLK = MCLK/2\n");
 		return 2;
 	}
 
@@ -490,56 +477,6 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 	/* memory size in pixels */
 	unsigned pixels = info->screen_size * 8 / var->bits_per_pixel;
 
-	switch (var->bits_per_pixel) {
-	case 1:
-		pixels /= 4;
-		break;		/* 8 pixel per byte, only 1/4th of mem usable */
-	case 8:
-	case 16:
-	case 32:
-		break;		/* 1 pixel == 1 byte */
-	default:
-		printk(KERN_ERR "cirrusfb: mode %dx%dx%d rejected..."
-			"color depth not supported.\n",
-			var->xres, var->yres, var->bits_per_pixel);
-		DPRINTK("EXIT - EINVAL error\n");
-		return -EINVAL;
-	}
-
-	if (var->xres_virtual < var->xres)
-		var->xres_virtual = var->xres;
-	/* use highest possible virtual resolution */
-	if (var->yres_virtual == -1) {
-		var->yres_virtual = pixels / var->xres_virtual;
-
-		printk(KERN_INFO "cirrusfb: virtual resolution set to "
-			"maximum of %dx%d\n", var->xres_virtual,
-			var->yres_virtual);
-	}
-	if (var->yres_virtual < var->yres)
-		var->yres_virtual = var->yres;
-
-	if (var->xres_virtual * var->yres_virtual > pixels) {
-		printk(KERN_ERR "cirrusfb: mode %dx%dx%d rejected... "
-		      "virtual resolution too high to fit into video memory!\n",
-			var->xres_virtual, var->yres_virtual,
-			var->bits_per_pixel);
-		DPRINTK("EXIT - EINVAL error\n");
-		return -EINVAL;
-	}
-
-
-	if (var->xoffset < 0)
-		var->xoffset = 0;
-	if (var->yoffset < 0)
-		var->yoffset = 0;
-
-	/* truncate xoffset and yoffset to maximum if too high */
-	if (var->xoffset > var->xres_virtual - var->xres)
-		var->xoffset = var->xres_virtual - var->xres - 1;
-	if (var->yoffset > var->yres_virtual - var->yres)
-		var->yoffset = var->yres_virtual - var->yres - 1;
-
 	switch (var->bits_per_pixel) {
 	case 1:
 		var->red.offset = 0;
@@ -586,12 +523,46 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 		break;
 
 	default:
-		DPRINTK("Unsupported bpp size: %d\n", var->bits_per_pixel);
+		dev_dbg(info->device,
+			"Unsupported bpp size: %d\n", var->bits_per_pixel);
 		assert(false);
 		/* should never occur */
 		break;
 	}
 
+	if (var->xres_virtual < var->xres)
+		var->xres_virtual = var->xres;
+	/* use highest possible virtual resolution */
+	if (var->yres_virtual == -1) {
+		var->yres_virtual = pixels / var->xres_virtual;
+
+		dev_info(info->device,
+			 "virtual resolution set to maximum of %dx%d\n",
+			 var->xres_virtual, var->yres_virtual);
+	}
+	if (var->yres_virtual < var->yres)
+		var->yres_virtual = var->yres;
+
+	if (var->xres_virtual * var->yres_virtual > pixels) {
+		dev_err(info->device, "mode %dx%dx%d rejected... "
+		      "virtual resolution too high to fit into video memory!\n",
+			var->xres_virtual, var->yres_virtual,
+			var->bits_per_pixel);
+		return -EINVAL;
+	}
+
+
+	if (var->xoffset < 0)
+		var->xoffset = 0;
+	if (var->yoffset < 0)
+		var->yoffset = 0;
+
+	/* truncate xoffset and yoffset to maximum if too high */
+	if (var->xoffset > var->xres_virtual - var->xres)
+		var->xoffset = var->xres_virtual - var->xres - 1;
+	if (var->yoffset > var->yres_virtual - var->yres)
+		var->yoffset = var->yres_virtual - var->yres - 1;
+
 	var->red.msb_right =
 	    var->green.msb_right =
 	    var->blue.msb_right =
@@ -606,9 +577,8 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 		yres = (yres + 1) / 2;
 
 	if (yres >= 1280) {
-		printk(KERN_ERR "cirrusfb: ERROR: VerticalTotal >= 1280; "
+		dev_err(info->device, "ERROR: VerticalTotal >= 1280; "
 			"special treatment required! (TODO)\n");
-		DPRINTK("EXIT - EINVAL error\n");
 		return -EINVAL;
 	}
 
@@ -642,7 +612,8 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var,
 		break;
 
 	default:
-		DPRINTK("Unsupported bpp size: %d\n", var->bits_per_pixel);
+		dev_dbg(info->device,
+			"Unsupported bpp size: %d\n", var->bits_per_pixel);
 		assert(false);
 		/* should never occur */
 		break;
@@ -653,7 +624,7 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var,
 	/* convert from ps to kHz */
 	freq = PICOS2KHZ(var->pixclock);
 
-	DPRINTK("desired pixclock: %ld kHz\n", freq);
+	dev_dbg(info->device, "desired pixclock: %ld kHz\n", freq);
 
 	maxclock = cirrusfb_board_info[cinfo->btype].maxclock[maxclockidx];
 	regs->multiplexing = 0;
@@ -668,9 +639,9 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var,
 			break;
 
 		default:
-			printk(KERN_ERR "cirrusfb: Frequency greater "
-				"than maxclock (%ld kHz)\n", maxclock);
-			DPRINTK("EXIT - return -EINVAL\n");
+			dev_err(info->device,
+				"Frequency greater than maxclock (%ld kHz)\n",
+				maxclock);
 			return -EINVAL;
 		}
 	}
@@ -689,16 +660,17 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var,
 	return 0;
 }
 
-static void cirrusfb_set_mclk_as_source(const struct cirrusfb_info *cinfo,
-					int div)
+static void cirrusfb_set_mclk_as_source(const struct fb_info *info, int div)
 {
+	struct cirrusfb_info *cinfo = info->par;
 	unsigned char old1f, old1e;
+
 	assert(cinfo != NULL);
 	old1f = vga_rseq(cinfo->regbase, CL_SEQR1F) & ~0x40;
 
 	if (div) {
-		DPRINTK("Set %s as pixclock source.\n",
-					(div == 2) ? "MCLK/2" : "MCLK");
+		dev_dbg(info->device, "Set %s as pixclock source.\n",
+			(div == 2) ? "MCLK/2" : "MCLK");
 		old1f |= 0x40;
 		old1e = vga_rseq(cinfo->regbase, CL_SEQR1E) & ~0x1;
 		if (div == 2)
@@ -728,17 +700,16 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	long freq;
 	int nom, den, div;
 
-	DPRINTK("ENTER\n");
-	DPRINTK("Requested mode: %dx%dx%d\n",
+	dev_dbg(info->device, "Requested mode: %dx%dx%d\n",
 	       var->xres, var->yres, var->bits_per_pixel);
-	DPRINTK("pixclock: %d\n", var->pixclock);
+	dev_dbg(info->device, "pixclock: %d\n", var->pixclock);
 
 	init_vgachip(info);
 
 	err = cirrusfb_decode_var(var, &regs, info);
 	if (err) {
 		/* should never happen */
-		DPRINTK("mode change aborted.  invalid var.\n");
+		dev_dbg(info->device, "mode change aborted.  invalid var.\n");
 		return -EINVAL;
 	}
 
@@ -789,30 +760,30 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	vga_wcrt(regbase, VGA_CRTC_V_SYNC_END, 0x20);	/* previously: 0x00) */
 
 	/* if debugging is enabled, all parameters get output before writing */
-	DPRINTK("CRT0: %d\n", htotal);
+	dev_dbg(info->device, "CRT0: %d\n", htotal);
 	vga_wcrt(regbase, VGA_CRTC_H_TOTAL, htotal);
 
-	DPRINTK("CRT1: %d\n", hdispend);
+	dev_dbg(info->device, "CRT1: %d\n", hdispend);
 	vga_wcrt(regbase, VGA_CRTC_H_DISP, hdispend);
 
-	DPRINTK("CRT2: %d\n", var->xres / 8);
+	dev_dbg(info->device, "CRT2: %d\n", var->xres / 8);
 	vga_wcrt(regbase, VGA_CRTC_H_BLANK_START, var->xres / 8);
 
 	/*  + 128: Compatible read */
-	DPRINTK("CRT3: 128+%d\n", (htotal + 5) % 32);
+	dev_dbg(info->device, "CRT3: 128+%d\n", (htotal + 5) % 32);
 	vga_wcrt(regbase, VGA_CRTC_H_BLANK_END,
 		 128 + ((htotal + 5) % 32));
 
-	DPRINTK("CRT4: %d\n", hsyncstart);
+	dev_dbg(info->device, "CRT4: %d\n", hsyncstart);
 	vga_wcrt(regbase, VGA_CRTC_H_SYNC_START, hsyncstart);
 
 	tmp = hsyncend % 32;
 	if ((htotal + 5) & 32)
 		tmp += 128;
-	DPRINTK("CRT5: %d\n", tmp);
+	dev_dbg(info->device, "CRT5: %d\n", tmp);
 	vga_wcrt(regbase, VGA_CRTC_H_SYNC_END, tmp);
 
-	DPRINTK("CRT6: %d\n", vtotal & 0xff);
+	dev_dbg(info->device, "CRT6: %d\n", vtotal & 0xff);
 	vga_wcrt(regbase, VGA_CRTC_V_TOTAL, vtotal & 0xff);
 
 	tmp = 16;		/* LineCompare bit #9 */
@@ -830,7 +801,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		tmp |= 64;
 	if (vsyncstart & 512)
 		tmp |= 128;
-	DPRINTK("CRT7: %d\n", tmp);
+	dev_dbg(info->device, "CRT7: %d\n", tmp);
 	vga_wcrt(regbase, VGA_CRTC_OVERFLOW, tmp);
 
 	tmp = 0x40;		/* LineCompare bit #8 */
@@ -838,25 +809,25 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		tmp |= 0x20;
 	if (var->vmode & FB_VMODE_DOUBLE)
 		tmp |= 0x80;
-	DPRINTK("CRT9: %d\n", tmp);
+	dev_dbg(info->device, "CRT9: %d\n", tmp);
 	vga_wcrt(regbase, VGA_CRTC_MAX_SCAN, tmp);
 
-	DPRINTK("CRT10: %d\n", vsyncstart & 0xff);
+	dev_dbg(info->device, "CRT10: %d\n", vsyncstart & 0xff);
 	vga_wcrt(regbase, VGA_CRTC_V_SYNC_START, vsyncstart & 0xff);
 
-	DPRINTK("CRT11: 64+32+%d\n", vsyncend % 16);
+	dev_dbg(info->device, "CRT11: 64+32+%d\n", vsyncend % 16);
 	vga_wcrt(regbase, VGA_CRTC_V_SYNC_END, vsyncend % 16 + 64 + 32);
 
-	DPRINTK("CRT12: %d\n", vdispend & 0xff);
+	dev_dbg(info->device, "CRT12: %d\n", vdispend & 0xff);
 	vga_wcrt(regbase, VGA_CRTC_V_DISP_END, vdispend & 0xff);
 
-	DPRINTK("CRT15: %d\n", (vdispend + 1) & 0xff);
+	dev_dbg(info->device, "CRT15: %d\n", (vdispend + 1) & 0xff);
 	vga_wcrt(regbase, VGA_CRTC_V_BLANK_START, (vdispend + 1) & 0xff);
 
-	DPRINTK("CRT16: %d\n", vtotal & 0xff);
+	dev_dbg(info->device, "CRT16: %d\n", vtotal & 0xff);
 	vga_wcrt(regbase, VGA_CRTC_V_BLANK_END, vtotal & 0xff);
 
-	DPRINTK("CRT18: 0xff\n");
+	dev_dbg(info->device, "CRT18: 0xff\n");
 	vga_wcrt(regbase, VGA_CRTC_LINE_COMPARE, 0xff);
 
 	tmp = 0;
@@ -871,12 +842,15 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	if (vtotal & 512)
 		tmp |= 128;
 
-	DPRINTK("CRT1a: %d\n", tmp);
+	dev_dbg(info->device, "CRT1a: %d\n", tmp);
 	vga_wcrt(regbase, CL_CRT1A, tmp);
 
 	freq = PICOS2KHZ(var->pixclock);
 	bestclock(freq, &nom, &den, &div);
 
+	dev_dbg(info->device, "VCLK freq: %ld kHz  nom: %d  den: %d  div: %d\n",
+		freq, nom, den, div);
+
 	/* set VCLK0 */
 	/* hardware RefClock: 14.31818 MHz */
 	/* formula: VClk = (OSC * N) / (D * (1+P)) */
@@ -886,10 +860,10 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		/* if freq is close to mclk or mclk/2 select mclk
 		 * as clock source
 		 */
-		int divMCLK = cirrusfb_check_mclk(cinfo, freq);
+		int divMCLK = cirrusfb_check_mclk(info, freq);
 		if (divMCLK)  {
 			nom = 0;
-			cirrusfb_set_mclk_as_source(cinfo, divMCLK);
+			cirrusfb_set_mclk_as_source(info, divMCLK);
 		}
 	}
 	if (nom) {
@@ -904,7 +878,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		    (cinfo->btype == BT_GD5480))
 			tmp |= 0x80;
 
-		DPRINTK("CL_SEQR1B: %ld\n", (long) tmp);
+		dev_dbg(info->device, "CL_SEQR1B: %ld\n", (long) tmp);
 		vga_wseq(regbase, CL_SEQR1B, tmp);
 	}
 
@@ -952,7 +926,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 	/* programming for different color depths */
 	if (var->bits_per_pixel == 1) {
-		DPRINTK("cirrusfb: preparing for 1 bit deep display\n");
+		dev_dbg(info->device, "preparing for 1 bit deep display\n");
 		vga_wgfx(regbase, VGA_GFX_MODE, 0);	/* mode register */
 
 		/* SR07 */
@@ -964,20 +938,18 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		case BT_PICASSO4:
 		case BT_ALPINE:
 		case BT_GD5480:
-			DPRINTK(" (for GD54xx)\n");
 			vga_wseq(regbase, CL_SEQR7,
-				  regs.multiplexing ?
+				 regs.multiplexing ?
 					bi->sr07_1bpp_mux : bi->sr07_1bpp);
 			break;
 
 		case BT_LAGUNA:
-			DPRINTK(" (for GD546x)\n");
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) & ~0x01);
 			break;
 
 		default:
-			printk(KERN_WARNING "cirrusfb: unknown Board\n");
+			dev_warn(info->device, "unknown Board\n");
 			break;
 		}
 
@@ -987,14 +959,12 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			/* setting the SEQRF on SD64 is not necessary
 			 * (only during init)
 			 */
-			DPRINTK("(for SD64)\n");
 			/*  MCLK select */
 			vga_wseq(regbase, CL_SEQR1F, 0x1a);
 			break;
 
 		case BT_PICCOLO:
 		case BT_SPECTRUM:
-			DPRINTK("(for Piccolo/Spectrum)\n");
 			/* ### ueberall 0x22? */
 			/* ##vorher 1c MCLK select */
 			vga_wseq(regbase, CL_SEQR1F, 0x22);
@@ -1003,7 +973,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			break;
 
 		case BT_PICASSO:
-			DPRINTK("(for Picasso)\n");
 			/* ##vorher 22 MCLK select */
 			vga_wseq(regbase, CL_SEQR1F, 0x22);
 			/* ## vorher d0 avoid FIFO underruns..? */
@@ -1014,12 +983,11 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		case BT_ALPINE:
 		case BT_GD5480:
 		case BT_LAGUNA:
-			DPRINTK(" (for GD54xx)\n");
 			/* do nothing */
 			break;
 
 		default:
-			printk(KERN_WARNING "cirrusfb: unknown Board\n");
+			dev_warn(info->device, "unknown Board\n");
 			break;
 		}
 
@@ -1045,7 +1013,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	 */
 
 	else if (var->bits_per_pixel == 8) {
-		DPRINTK("cirrusfb: preparing for 8 bit deep display\n");
+		dev_dbg(info->device, "preparing for 8 bit deep display\n");
 		switch (cinfo->btype) {
 		case BT_SD64:
 		case BT_PICCOLO:
@@ -1054,20 +1022,18 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		case BT_PICASSO4:
 		case BT_ALPINE:
 		case BT_GD5480:
-			DPRINTK(" (for GD54xx)\n");
 			vga_wseq(regbase, CL_SEQR7,
 				  regs.multiplexing ?
 					bi->sr07_8bpp_mux : bi->sr07_8bpp);
 			break;
 
 		case BT_LAGUNA:
-			DPRINTK(" (for GD546x)\n");
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) | 0x01);
 			break;
 
 		default:
-			printk(KERN_WARNING "cirrusfb: unknown Board\n");
+			dev_warn(info->device, "unknown Board\n");
 			break;
 		}
 
@@ -1095,18 +1061,16 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			break;
 
 		case BT_ALPINE:
-			DPRINTK(" (for GD543x)\n");
 			/* We already set SRF and SR1F */
 			break;
 
 		case BT_GD5480:
 		case BT_LAGUNA:
-			DPRINTK(" (for GD54xx)\n");
 			/* do nothing */
 			break;
 
 		default:
-			printk(KERN_WARNING "cirrusfb: unknown Board\n");
+			dev_warn(info->device, "unknown board\n");
 			break;
 		}
 
@@ -1134,7 +1098,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	 */
 
 	else if (var->bits_per_pixel == 16) {
-		DPRINTK("cirrusfb: preparing for 16 bit deep display\n");
+		dev_dbg(info->device, "preparing for 16 bit deep display\n");
 		switch (cinfo->btype) {
 		case BT_SD64:
 			/* Extended Sequencer Mode: 256c col. mode */
@@ -1166,24 +1130,21 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			break;
 
 		case BT_ALPINE:
-			DPRINTK(" (for GD543x)\n");
 			vga_wseq(regbase, CL_SEQR7, 0xa7);
 			break;
 
 		case BT_GD5480:
-			DPRINTK(" (for GD5480)\n");
 			vga_wseq(regbase, CL_SEQR7, 0x17);
 			/* We already set SRF and SR1F */
 			break;
 
 		case BT_LAGUNA:
-			DPRINTK(" (for GD546x)\n");
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) & ~0x01);
 			break;
 
 		default:
-			printk(KERN_WARNING "CIRRUSFB: unknown Board\n");
+			dev_warn(info->device, "unknown Board\n");
 			break;
 		}
 
@@ -1211,7 +1172,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	 */
 
 	else if (var->bits_per_pixel == 32) {
-		DPRINTK("cirrusfb: preparing for 32 bit deep display\n");
+		dev_dbg(info->device, "preparing for 32 bit deep display\n");
 		switch (cinfo->btype) {
 		case BT_SD64:
 			/* Extended Sequencer Mode: 256c col. mode */
@@ -1243,24 +1204,21 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			break;
 
 		case BT_ALPINE:
-			DPRINTK(" (for GD543x)\n");
 			vga_wseq(regbase, CL_SEQR7, 0xa9);
 			break;
 
 		case BT_GD5480:
-			DPRINTK(" (for GD5480)\n");
 			vga_wseq(regbase, CL_SEQR7, 0x19);
 			/* We already set SRF and SR1F */
 			break;
 
 		case BT_LAGUNA:
-			DPRINTK(" (for GD546x)\n");
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) & ~0x01);
 			break;
 
 		default:
-			printk(KERN_WARNING "cirrusfb: unknown Board\n");
+			dev_warn(info->device, "unknown Board\n");
 			break;
 		}
 
@@ -1284,8 +1242,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	 */
 
 	else
-		printk(KERN_ERR "cirrusfb: What's this?? "
-			" requested color depth == %d.\n",
+		dev_err(info->device,
+			"What's this? requested color depth == %d.\n",
 			var->bits_per_pixel);
 
 	vga_wcrt(regbase, VGA_CRTC_OFFSET, offset & 0xff);
@@ -1355,7 +1313,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 */
 
 	vga_wseq(regbase, VGA_SEQ_CLOCK_MODE, tmp);
-	DPRINTK("CL_SEQR1: %d\n", tmp);
+	dev_dbg(info->device, "CL_SEQR1: %d\n", tmp);
 
 	cinfo->currentmode = regs;
 
@@ -1363,10 +1321,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	cirrusfb_pan_display(var, info);
 
 #ifdef CIRRUSFB_DEBUG
-	cirrusfb_dump();
+	cirrusfb_dbg_reg_dump(info, NULL);
 #endif
 
-	DPRINTK("EXIT\n");
 	return 0;
 }
 
@@ -1424,8 +1381,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 	unsigned char tmp = 0, tmp2 = 0, xpix;
 	struct cirrusfb_info *cinfo = info->par;
 
-	DPRINTK("ENTER\n");
-	DPRINTK("virtual offset: (%d,%d)\n", var->xoffset, var->yoffset);
+	dev_dbg(info->device,
+		"virtual offset: (%d,%d)\n", var->xoffset, var->yoffset);
 
 	/* no range checks for xoffset and yoffset,   */
 	/* as fb_pan_display has already done this */
@@ -1481,7 +1438,6 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 
 	cirrusfb_WaitBLT(cinfo->regbase);
 
-	DPRINTK("EXIT\n");
 	return 0;
 }
 
@@ -1502,11 +1458,11 @@ static int cirrusfb_blank(int blank_mode, struct fb_info *info)
 	struct cirrusfb_info *cinfo = info->par;
 	int current_mode = cinfo->blank_mode;
 
-	DPRINTK("ENTER, blank mode = %d\n", blank_mode);
+	dev_dbg(info->device, "ENTER, blank mode = %d\n", blank_mode);
 
 	if (info->state != FBINFO_STATE_RUNNING ||
 	    current_mode == blank_mode) {
-		DPRINTK("EXIT, returning 0\n");
+		dev_dbg(info->device, "EXIT, returning 0\n");
 		return 0;
 	}
 
@@ -1543,12 +1499,12 @@ static int cirrusfb_blank(int blank_mode, struct fb_info *info)
 		vga_wgfx(cinfo->regbase, CL_GRE, 0x06);
 		break;
 	default:
-		DPRINTK("EXIT, returning 1\n");
+		dev_dbg(info->device, "EXIT, returning 1\n");
 		return 1;
 	}
 
 	cinfo->blank_mode = blank_mode;
-	DPRINTK("EXIT, returning 0\n");
+	dev_dbg(info->device, "EXIT, returning 0\n");
 
 	/* Let fbcon do a soft blank for us */
 	return (blank_mode == FB_BLANK_NORMAL) ? 1 : 0;
@@ -1562,8 +1518,6 @@ static void init_vgachip(struct fb_info *info)
 	struct cirrusfb_info *cinfo = info->par;
 	const struct cirrusfb_board_info_rec *bi;
 
-	DPRINTK("ENTER\n");
-
 	assert(cinfo != NULL);
 
 	bi = &cirrusfb_board_info[cinfo->btype];
@@ -1609,7 +1563,7 @@ static void init_vgachip(struct fb_info *info)
 		break;
 
 	default:
-		printk(KERN_ERR "cirrusfb: Warning: Unknown board type\n");
+		dev_err(info->device, "Warning: Unknown board type\n");
 		break;
 	}
 
@@ -1798,8 +1752,6 @@ static void init_vgachip(struct fb_info *info)
 
 	/* misc... */
 	WHDR(cinfo, 0);	/* Hidden DAC register: - */
-
-	DPRINTK("EXIT\n");
 	return;
 }
 
@@ -1808,8 +1760,6 @@ static void switch_monitor(struct cirrusfb_info *cinfo, int on)
 #ifdef CONFIG_ZORRO /* only works on Zorro boards */
 	static int IsOn = 0;	/* XXX not ok for multiple boards */
 
-	DPRINTK("ENTER\n");
-
 	if (cinfo->btype == BT_PICASSO4)
 		return;		/* nothing to switch */
 	if (cinfo->btype == BT_ALPINE)
@@ -1819,8 +1769,6 @@ static void switch_monitor(struct cirrusfb_info *cinfo, int on)
 	if (cinfo->btype == BT_PICASSO) {
 		if ((on && !IsOn) || (!on && IsOn))
 			WSFR(cinfo, 0xff);
-
-		DPRINTK("EXIT\n");
 		return;
 	}
 	if (on) {
@@ -1847,11 +1795,10 @@ static void switch_monitor(struct cirrusfb_info *cinfo, int on)
 		case BT_SPECTRUM:
 			WSFR(cinfo, 0x4f);
 			break;
-		default: /* do nothing */ break;
+		default: /* do nothing */
+			break;
 		}
 	}
-
-	DPRINTK("EXIT\n");
 #endif /* CONFIG_ZORRO */
 }
 
@@ -1953,12 +1900,8 @@ static void cirrusfb_imageblit(struct fb_info *info,
 #define PREP_IO_BASE    ((volatile unsigned char *) 0x80000000)
 static void get_prep_addrs(unsigned long *display, unsigned long *registers)
 {
-	DPRINTK("ENTER\n");
-
 	*display = PREP_VIDEO_BASE;
 	*registers = (unsigned long) PREP_IO_BASE;
-
-	DPRINTK("EXIT\n");
 }
 
 #endif				/* CONFIG_PPC_PREP */
@@ -1970,13 +1913,12 @@ static int release_io_ports;
  * based on the DRAM bandwidth bit and DRAM bank switching bit.  This
  * works with 1MB, 2MB and 4MB configurations (which the Motorola boards
  * seem to have. */
-static unsigned int __devinit cirrusfb_get_memsize(u8 __iomem *regbase)
+static unsigned int __devinit cirrusfb_get_memsize(struct fb_info *info,
+						   u8 __iomem *regbase)
 {
 	unsigned long mem;
 	unsigned char SRF;
 
-	DPRINTK("ENTER\n");
-
 	SRF = vga_rseq(regbase, CL_SEQRF);
 	switch ((SRF & 0x18)) {
 	case 0x08:
@@ -1992,7 +1934,7 @@ static unsigned int __devinit cirrusfb_get_memsize(u8 __iomem *regbase)
 		mem = 2048 * 1024;
 		break;
 	default:
-		printk(KERN_WARNING "CLgenfb: Unknown memory size!\n");
+		dev_warn(info->device, "CLgenfb: Unknown memory size!\n");
 		mem = 1024 * 1024;
 	}
 	if (SRF & 0x80)
@@ -2002,8 +1944,6 @@ static unsigned int __devinit cirrusfb_get_memsize(u8 __iomem *regbase)
 		mem *= 2;
 
 	/* TODO: Handling of GD5446/5480 (see XF86 sources ...) */
-
-	DPRINTK("EXIT\n");
 	return mem;
 }
 
@@ -2014,8 +1954,6 @@ static void get_pci_addrs(const struct pci_dev *pdev,
 	assert(display != NULL);
 	assert(registers != NULL);
 
-	DPRINTK("ENTER\n");
-
 	*display = 0;
 	*registers = 0;
 
@@ -2030,8 +1968,6 @@ static void get_pci_addrs(const struct pci_dev *pdev,
 	}
 
 	assert(*display != 0);
-
-	DPRINTK("EXIT\n");
 }
 
 static void cirrusfb_pci_unmap(struct fb_info *info)
@@ -2117,11 +2053,6 @@ static int __devinit cirrusfb_register(struct fb_info *info)
 	int err;
 	enum cirrus_board btype;
 
-	DPRINTK("ENTER\n");
-
-	printk(KERN_INFO "cirrusfb: Driver for Cirrus Logic based "
-		"graphic boards, v" CIRRUSFB_VERSION "\n");
-
 	btype = cinfo->btype;
 
 	/* sanity checks */
@@ -2130,11 +2061,11 @@ static int __devinit cirrusfb_register(struct fb_info *info)
 	/* set all the vital stuff */
 	cirrusfb_set_fbinfo(info);
 
-	DPRINTK("cirrusfb: (RAM start set to: 0x%p)\n", info->screen_base);
+	dev_dbg(info->device, "(RAM start set to: 0x%p)\n", info->screen_base);
 
 	err = fb_find_mode(&info->var, info, mode_option, NULL, 0, NULL, 8);
 	if (!err) {
-		DPRINTK("wrong initial video mode\n");
+		dev_dbg(info->device, "wrong initial video mode\n");
 		err = -EINVAL;
 		goto err_dealloc_cmap;
 	}
@@ -2144,18 +2075,18 @@ static int __devinit cirrusfb_register(struct fb_info *info)
 	err = cirrusfb_decode_var(&info->var, &cinfo->currentmode, info);
 	if (err < 0) {
 		/* should never happen */
-		DPRINTK("choking on default var... umm, no good.\n");
+		dev_dbg(info->device,
+			"choking on default var... umm, no good.\n");
 		goto err_dealloc_cmap;
 	}
 
 	err = register_framebuffer(info);
 	if (err < 0) {
-		printk(KERN_ERR "cirrusfb: could not register "
-			"fb device; err = %d!\n", err);
+		dev_err(info->device,
+			"could not register fb device; err = %d!\n", err);
 		goto err_dealloc_cmap;
 	}
 
-	DPRINTK("EXIT, returning 0\n");
 	return 0;
 
 err_dealloc_cmap:
@@ -2168,17 +2099,13 @@ err_dealloc_cmap:
 static void __devexit cirrusfb_cleanup(struct fb_info *info)
 {
 	struct cirrusfb_info *cinfo = info->par;
-	DPRINTK("ENTER\n");
 
 	switch_monitor(cinfo, 0);
-
 	unregister_framebuffer(info);
 	fb_dealloc_cmap(&info->cmap);
-	printk("Framebuffer unregistered\n");
+	dev_dbg(info->device, "Framebuffer unregistered\n");
 	cinfo->unmap(info);
 	framebuffer_release(info);
-
-	DPRINTK("EXIT\n");
 }
 
 #ifdef CONFIG_PCI
@@ -2207,9 +2134,11 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev,
 	cinfo = info->par;
 	cinfo->btype = btype = (enum cirrus_board) ent->driver_data;
 
-	DPRINTK(" Found PCI device, base address 0 is 0x%x, btype set to %d\n",
-		pdev->resource[0].start, btype);
-	DPRINTK(" base address 1 is 0x%x\n", pdev->resource[1].start);
+	dev_dbg(info->device,
+		" Found PCI device, base address 0 is 0x%Lx, btype set to %d\n",
+		(unsigned long long)pdev->resource[0].start, btype);
+	dev_dbg(info->device, " base address 1 is 0x%Lx\n",
+		(unsigned long long)pdev->resource[1].start);
 
 	if (isPReP) {
 		pci_write_config_dword(pdev, PCI_BASE_ADDRESS_0, 0x00000000);
@@ -2219,30 +2148,29 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev,
 	/* PReP dies if we ioremap the IO registers, but it works w/out... */
 		cinfo->regbase = (char __iomem *) info->fix.mmio_start;
 	} else {
-		DPRINTK("Attempt to get PCI info for Cirrus Graphics Card\n");
+		dev_dbg(info->device,
+			"Attempt to get PCI info for Cirrus Graphics Card\n");
 		get_pci_addrs(pdev, &board_addr, &info->fix.mmio_start);
 		/* FIXME: this forces VGA.  alternatives? */
 		cinfo->regbase = NULL;
 	}
 
-	DPRINTK("Board address: 0x%lx, register address: 0x%lx\n",
+	dev_dbg(info->device, "Board address: 0x%lx, register address: 0x%lx\n",
 		board_addr, info->fix.mmio_start);
 
 	board_size = (btype == BT_GD5480) ?
-		32 * MB_ : cirrusfb_get_memsize(cinfo->regbase);
+		32 * MB_ : cirrusfb_get_memsize(info, cinfo->regbase);
 
 	ret = pci_request_regions(pdev, "cirrusfb");
 	if (ret < 0) {
-		printk(KERN_ERR "cirrusfb: cannot reserve region 0x%lx, "
-		       "abort\n",
-		       board_addr);
+		dev_err(info->device, "cannot reserve region 0x%lx, abort\n",
+			board_addr);
 		goto err_release_fb;
 	}
 #if 0 /* if the system didn't claim this region, we would... */
 	if (!request_mem_region(0xA0000, 65535, "cirrusfb")) {
-		printk(KERN_ERR "cirrusfb: cannot reserve region 0x%lx, abort\n"
-,
-		       0xA0000L);
+		dev_err(info->device, "cannot reserve region 0x%lx, abort\n",
+			0xA0000L);
 		ret = -EBUSY;
 		goto err_release_regions;
 	}
@@ -2260,9 +2188,9 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev,
 	info->screen_size = board_size;
 	cinfo->unmap = cirrusfb_pci_unmap;
 
-	printk(KERN_INFO "RAM (%lu kB) at 0x%lx, Cirrus "
-			"Logic chipset on PCI bus\n",
-			info->screen_size >> 10, board_addr);
+	dev_info(info->device,
+		 "Cirrus Logic chipset on PCI bus, RAM (%lu kB) at 0x%lx\n",
+		 info->screen_size >> 10, board_addr);
 	pci_set_drvdata(pdev, info);
 
 	ret = cirrusfb_register(info);
@@ -2288,11 +2216,8 @@ err_out:
 static void __devexit cirrusfb_pci_unregister(struct pci_dev *pdev)
 {
 	struct fb_info *info = pci_get_drvdata(pdev);
-	DPRINTK("ENTER\n");
 
 	cirrusfb_cleanup(info);
-
-	DPRINTK("EXIT\n");
 }
 
 static struct pci_driver cirrusfb_pci_driver = {
@@ -2324,8 +2249,6 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z,
 	if (cirrusfb_zorro_table2[btype].id2)
 		z2 = zorro_find_device(cirrusfb_zorro_table2[btype].id2, NULL);
 	size = cirrusfb_zorro_table2[btype].size;
-	printk(KERN_INFO "cirrusfb: %s board detected; ",
-	       cirrusfb_board_info[btype].name);
 
 	info = framebuffer_alloc(sizeof(struct cirrusfb_info), &z->dev);
 	if (!info) {
@@ -2334,6 +2257,9 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z,
 		goto err_out;
 	}
 
+	dev_info(info->device, "%s board detected\n",
+		 cirrusfb_board_info[btype].name);
+
 	cinfo = info->par;
 	cinfo->btype = btype;
 
@@ -2345,19 +2271,16 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z,
 	info->screen_size = size;
 
 	if (!zorro_request_device(z, "cirrusfb")) {
-		printk(KERN_ERR "cirrusfb: cannot reserve region 0x%lx, "
-		       "abort\n",
-		       board_addr);
+		dev_err(info->device, "cannot reserve region 0x%lx, abort\n",
+			board_addr);
 		ret = -EBUSY;
 		goto err_release_fb;
 	}
 
-	printk(" RAM (%lu MB) at $%lx, ", board_size / MB_, board_addr);
-
 	ret = -EIO;
 
 	if (btype == BT_PICASSO4) {
-		printk(KERN_INFO " REG at $%lx\n", board_addr + 0x600000);
+		dev_info(info->device, " REG at $%lx\n", board_addr + 0x600000);
 
 		/* To be precise, for the P4 this is not the */
 		/* begin of the board, but the begin of RAM. */
@@ -2367,7 +2290,7 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z,
 		if (!cinfo->regbase)
 			goto err_release_region;
 
-		DPRINTK("cirrusfb: Virtual address for board set to: $%p\n",
+		dev_dbg(info->device, "Virtual address for board set to: $%p\n",
 			cinfo->regbase);
 		cinfo->regbase += 0x600000;
 		info->fix.mmio_start = board_addr + 0x600000;
@@ -2377,8 +2300,8 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z,
 		if (!info->screen_base)
 			goto err_unmap_regbase;
 	} else {
-		printk(KERN_INFO " REG at $%lx\n",
-			(unsigned long) z2->resource.start);
+		dev_info(info->device, " REG at $%lx\n",
+			 (unsigned long) z2->resource.start);
 
 		info->fix.smem_start = board_addr;
 		if (board_addr > 0x01000000)
@@ -2392,12 +2315,15 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z,
 		cinfo->regbase = (caddr_t) ZTWO_VADDR(z2->resource.start);
 		info->fix.mmio_start = z2->resource.start;
 
-		DPRINTK("cirrusfb: Virtual address for board set to: $%p\n",
+		dev_dbg(info->device, "Virtual address for board set to: $%p\n",
 			cinfo->regbase);
 	}
 	cinfo->unmap = cirrusfb_zorro_unmap;
 
-	printk(KERN_INFO "Cirrus Logic chipset on Zorro bus\n");
+	dev_info(info->device,
+		 "Cirrus Logic chipset on Zorro bus, RAM (%lu MB) at $%lx\n",
+		 board_size / MB_, board_addr);
+
 	zorro_set_drvdata(z, info);
 
 	ret = cirrusfb_register(info);
@@ -2424,11 +2350,8 @@ err_out:
 void __devexit cirrusfb_zorro_unregister(struct zorro_dev *z)
 {
 	struct fb_info *info = zorro_get_drvdata(z);
-	DPRINTK("ENTER\n");
 
 	cirrusfb_cleanup(info);
-
-	DPRINTK("EXIT\n");
 }
 
 static struct zorro_driver cirrusfb_zorro_driver = {
@@ -2461,11 +2384,10 @@ static int __init cirrusfb_init(void)
 }
 
 #ifndef MODULE
-static int __init cirrusfb_setup(char *options) {
+static int __init cirrusfb_setup(char *options)
+{
 	char *this_opt;
 
-	DPRINTK("ENTER\n");
-
 	if (!options || !*options)
 		return 0;
 
@@ -2473,8 +2395,6 @@ static int __init cirrusfb_setup(char *options) {
 		if (!*this_opt)
 			continue;
 
-		DPRINTK("cirrusfb_setup: option '%s'\n", this_opt);
-
 		if (!strcmp(this_opt, "noaccel"))
 			noaccel = 1;
 		else if (!strncmp(this_opt, "mode:", 5))
@@ -2560,8 +2480,6 @@ static void AttrOn(const struct cirrusfb_info *cinfo)
 {
 	assert(cinfo != NULL);
 
-	DPRINTK("ENTER\n");
-
 	if (vga_rcrt(cinfo->regbase, CL_CRT24) & 0x80) {
 		/* if we're just in "write value" mode, write back the */
 		/* same value as before to not modify anything */
@@ -2574,8 +2492,6 @@ static void AttrOn(const struct cirrusfb_info *cinfo)
 
 	/* dummy write on Reg0 to be on "write index" mode next time */
 	vga_w(cinfo->regbase, VGA_ATT_IW, 0x00);
-
-	DPRINTK("EXIT\n");
 }
 
 /*** WHDR() - write into the Hidden DAC register ***/
@@ -2723,8 +2639,6 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel,
 	u_long nsrc, ndest;
 	u_char bltmode;
 
-	DPRINTK("ENTER\n");
-
 	nwidth = width - 1;
 	nheight = height - 1;
 
@@ -2813,8 +2727,6 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel,
 
 	/* and finally: GO! */
 	vga_wgfx(regbase, CL_GR31, 0x02);	/* BLT Start/status */
-
-	DPRINTK("EXIT\n");
 }
 
 /*******************************************************************
@@ -2831,8 +2743,6 @@ static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel,
 	u_long ndest;
 	u_char op;
 
-	DPRINTK("ENTER\n");
-
 	nwidth = width - 1;
 	nheight = height - 1;
 
@@ -2896,8 +2806,6 @@ static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel,
 
 	/* and finally: GO! */
 	vga_wgfx(regbase, CL_GR31, 0x02);	/* BLT Start/status */
-
-	DPRINTK("EXIT\n");
 }
 
 /**************************************************************************
@@ -2917,8 +2825,6 @@ static void bestclock(long freq, int *nom, int *den, int *div)
 	*den = 0;
 	*div = 0;
 
-	DPRINTK("ENTER\n");
-
 	if (freq < 8000)
 		freq = 8000;
 
@@ -2960,12 +2866,6 @@ static void bestclock(long freq, int *nom, int *den, int *div)
 			}
 		}
 	}
-
-	DPRINTK("Best possible values for given frequency:\n");
-	DPRINTK("	freq: %ld kHz  nom: %d  den: %d  div: %d\n",
-		freq, *nom, *den, *div);
-
-	DPRINTK("EXIT\n");
 }
 
 /* -------------------------------------------------------------------------
@@ -2977,32 +2877,6 @@ static void bestclock(long freq, int *nom, int *den, int *div)
 
 #ifdef CIRRUSFB_DEBUG
 
-/**
- * cirrusfb_dbg_print_byte
- * @name: name associated with byte value to be displayed
- * @val: byte value to be displayed
- *
- * DESCRIPTION:
- * Display an indented string, along with a hexidecimal byte value, and
- * its decoded bits.  Bits 7 through 0 are listed in left-to-right
- * order.
- */
-
-static
-void cirrusfb_dbg_print_byte(const char *name, unsigned char val)
-{
-	DPRINTK("%8s = 0x%02X (bits 7-0: %c%c%c%c%c%c%c%c)\n",
-		name, val,
-		val & 0x80 ? '1' : '0',
-		val & 0x40 ? '1' : '0',
-		val & 0x20 ? '1' : '0',
-		val & 0x10 ? '1' : '0',
-		val & 0x08 ? '1' : '0',
-		val & 0x04 ? '1' : '0',
-		val & 0x02 ? '1' : '0',
-		val & 0x01 ? '1' : '0');
-}
-
 /**
  * cirrusfb_dbg_print_regs
  * @base: If using newmmio, the newmmio base address, otherwise %NULL
@@ -3014,9 +2888,9 @@ void cirrusfb_dbg_print_byte(const char *name, unsigned char val)
  * used at the given @base address to query the information.
  */
 
-static
-void cirrusfb_dbg_print_regs(caddr_t regbase,
-			     enum cirrusfb_dbg_reg_class reg_class, ...)
+static void cirrusfb_dbg_print_regs(struct fb_info *info,
+				    caddr_t regbase,
+				    enum cirrusfb_dbg_reg_class reg_class, ...)
 {
 	va_list list;
 	unsigned char val = 0;
@@ -3042,7 +2916,7 @@ void cirrusfb_dbg_print_regs(caddr_t regbase,
 			break;
 		}
 
-		cirrusfb_dbg_print_byte(name, val);
+		dev_dbg(info->device, "%8s = 0x%02X\n", name, val);
 
 		name = va_arg(list, char *);
 	}
@@ -3050,18 +2924,6 @@ void cirrusfb_dbg_print_regs(caddr_t regbase,
 	va_end(list);
 }
 
-/**
- * cirrusfb_dump
- * @cirrusfbinfo:
- *
- * DESCRIPTION:
- */
-
-static void cirrusfb_dump(void)
-{
-	cirrusfb_dbg_reg_dump(NULL);
-}
-
 /**
  * cirrusfb_dbg_reg_dump
  * @base: If using newmmio, the newmmio base address, otherwise %NULL
@@ -3072,12 +2934,11 @@ static void cirrusfb_dump(void)
  * used at the given @base address to query the information.
  */
 
-static
-void cirrusfb_dbg_reg_dump(caddr_t regbase)
+static void cirrusfb_dbg_reg_dump(struct fb_info *info, caddr_t regbase)
 {
-	DPRINTK("CIRRUSFB VGA CRTC register dump:\n");
+	dev_dbg(info->device, "VGA CRTC register dump:\n");
 
-	cirrusfb_dbg_print_regs(regbase, CRT,
+	cirrusfb_dbg_print_regs(info, regbase, CRT,
 			   "CR00", 0x00,
 			   "CR01", 0x01,
 			   "CR02", 0x02,
@@ -3127,11 +2988,11 @@ void cirrusfb_dbg_reg_dump(caddr_t regbase)
 			   "CR3F", 0x3F,
 			   NULL);
 
-	DPRINTK("\n");
+	dev_dbg(info->device, "\n");
 
-	DPRINTK("CIRRUSFB VGA SEQ register dump:\n");
+	dev_dbg(info->device, "VGA SEQ register dump:\n");
 
-	cirrusfb_dbg_print_regs(regbase, SEQ,
+	cirrusfb_dbg_print_regs(info, regbase, SEQ,
 			   "SR00", 0x00,
 			   "SR01", 0x01,
 			   "SR02", 0x02,
@@ -3160,7 +3021,7 @@ void cirrusfb_dbg_reg_dump(caddr_t regbase)
 			   "SR1F", 0x1F,
 			   NULL);
 
-	DPRINTK("\n");
+	dev_dbg(info->device, "\n");
 }
 
 #endif				/* CIRRUSFB_DEBUG */
-- 
cgit v1.2.3


From 55a4ea6ab0fff0c02f101a60d2ba4f1794990499 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:04 -0700
Subject: cirrusfb: fix Laguna chipset memory detection and clock setting

Fix memory detection and clock setting for Cirrus Laguna chipsets
(GD5464/GD5465).  The changes are done after the Xorg code.

The driver still does not display anything on the GD5465 but it switches
resolutions correctly at least.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 65 ++++++++++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 27 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 3b4b0f1e061..dd09bae910f 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -867,19 +867,24 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		}
 	}
 	if (nom) {
-		vga_wseq(regbase, CL_SEQRB, nom);
 		tmp = den << 1;
 		if (div != 0)
 			tmp |= 1;
-
 		/* 6 bit denom; ONLY 5434!!! (bugged me 10 days) */
 		if ((cinfo->btype == BT_SD64) ||
 		    (cinfo->btype == BT_ALPINE) ||
 		    (cinfo->btype == BT_GD5480))
 			tmp |= 0x80;
 
-		dev_dbg(info->device, "CL_SEQR1B: %ld\n", (long) tmp);
-		vga_wseq(regbase, CL_SEQR1B, tmp);
+		dev_dbg(info->device, "CL_SEQR1B: %d\n", (int) tmp);
+		/* Laguna chipset has reversed clock registers */
+		if (cinfo->btype == BT_LAGUNA) {
+			vga_wseq(regbase, CL_SEQRE, tmp);
+			vga_wseq(regbase, CL_SEQR1E, nom);
+		} else {
+			vga_wseq(regbase, CL_SEQRB, nom);
+			vga_wseq(regbase, CL_SEQR1B, tmp);
+		}
 	}
 
 	if (yres >= 1024)
@@ -1917,31 +1922,37 @@ static unsigned int __devinit cirrusfb_get_memsize(struct fb_info *info,
 						   u8 __iomem *regbase)
 {
 	unsigned long mem;
-	unsigned char SRF;
+	struct cirrusfb_info *cinfo = info->par;
 
-	SRF = vga_rseq(regbase, CL_SEQRF);
-	switch ((SRF & 0x18)) {
-	case 0x08:
-		mem = 512 * 1024;
-		break;
-	case 0x10:
-		mem = 1024 * 1024;
-		break;
-	/* 64-bit DRAM data bus width; assume 2MB. Also indicates 2MB memory
-	 * on the 5430.
-	 */
-	case 0x18:
-		mem = 2048 * 1024;
-		break;
-	default:
-		dev_warn(info->device, "CLgenfb: Unknown memory size!\n");
-		mem = 1024 * 1024;
+	if (cinfo->btype == BT_LAGUNA) {
+		unsigned char SR14 = vga_rseq(regbase, CL_SEQR14);
+
+		mem = ((SR14 & 7) + 1) << 20;
+	} else {
+		unsigned char SRF = vga_rseq(regbase, CL_SEQRF);
+		switch ((SRF & 0x18)) {
+		case 0x08:
+			mem = 512 * 1024;
+			break;
+		case 0x10:
+			mem = 1024 * 1024;
+			break;
+		/* 64-bit DRAM data bus width; assume 2MB.
+		 * Also indicates 2MB memory on the 5430.
+		 */
+		case 0x18:
+			mem = 2048 * 1024;
+			break;
+		default:
+			dev_warn(info->device, "Unknown memory size!\n");
+			mem = 1024 * 1024;
+		}
+		/* If DRAM bank switching is enabled, there must be
+		 * twice as much memory installed. (4MB on the 5434)
+		 */
+		if (SRF & 0x80)
+			mem *= 2;
 	}
-	if (SRF & 0x80)
-	/* If DRAM bank switching is enabled, there must be twice as much
-	 * memory installed. (4MB on the 5434)
-	 */
-		mem *= 2;
 
 	/* TODO: Handling of GD5446/5480 (see XF86 sources ...) */
 	return mem;
-- 
cgit v1.2.3


From 213d4bdd8cd405d9ba59ee78165b8c870f83a018 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:04 -0700
Subject: cirrusfb: add Laguna additional overflow register

Add additional overflow register setting for Laguna chips.

Also, simplify some code in the cirrusfb_pan_display() and
cirrusfb_blank().

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 68 ++++++++++++++++++++++++++++++------------------
 include/video/cirrus.h   |  1 +
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index dd09bae910f..119e49ed621 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -1259,13 +1259,32 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	/* screen start addr #16-18, fastpagemode cycles */
 	vga_wcrt(regbase, CL_CRT1B, tmp);
 
-	if (cinfo->btype == BT_SD64 ||
-	    cinfo->btype == BT_PICASSO4 ||
-	    cinfo->btype == BT_ALPINE ||
-	    cinfo->btype == BT_GD5480)
-		/* screen start address bit 19 */
+	/* screen start address bit 19 */
+	if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19)
 		vga_wcrt(regbase, CL_CRT1D, 0x00);
 
+	if (cinfo->btype == BT_LAGUNA ||
+	    cinfo->btype == BT_GD5480) {
+
+		tmp = 0;
+		if ((htotal + 5) & 256)
+			tmp |= 128;
+		if (hdispend & 256)
+			tmp |= 64;
+		if (hsyncstart & 256)
+			tmp |= 48;
+		if (vtotal & 1024)
+			tmp |= 8;
+		if (vdispend & 1024)
+			tmp |= 4;
+		if (vsyncstart & 1024)
+			tmp |= 3;
+
+		vga_wcrt(regbase, CL_CRT1E, tmp);
+		dev_dbg(info->device, "CRT1e: %d\n", tmp);
+	}
+
+
 	/* text cursor location high */
 	vga_wcrt(regbase, VGA_CRTC_CURSOR_HI, 0);
 	/* text cursor location low */
@@ -1383,7 +1402,7 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 	int xoffset = 0;
 	int yoffset = 0;
 	unsigned long base;
-	unsigned char tmp = 0, tmp2 = 0, xpix;
+	unsigned char tmp, xpix;
 	struct cirrusfb_info *cinfo = info->par;
 
 	dev_dbg(info->device,
@@ -1418,6 +1437,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 	vga_wcrt(cinfo->regbase, VGA_CRTC_START_HI,
 		 (unsigned char) (base >> 8));
 
+	/* 0xf2 is %11110010, exclude tmp bits */
+	tmp = vga_rcrt(cinfo->regbase, CL_CRT1B) & 0xf2;
 	/* construct bits 16, 17 and 18 of screen start address */
 	if (base & 0x10000)
 		tmp |= 0x01;
@@ -1426,9 +1447,7 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 	if (base & 0x40000)
 		tmp |= 0x08;
 
-	/* 0xf2 is %11110010, exclude tmp bits */
-	tmp2 = (vga_rcrt(cinfo->regbase, CL_CRT1B) & 0xf2) | tmp;
-	vga_wcrt(cinfo->regbase, CL_CRT1B, tmp2);
+	vga_wcrt(cinfo->regbase, CL_CRT1B, tmp);
 
 	/* construct bit 19 of screen start address */
 	if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19)
@@ -1473,47 +1492,44 @@ static int cirrusfb_blank(int blank_mode, struct fb_info *info)
 
 	/* Undo current */
 	if (current_mode == FB_BLANK_NORMAL ||
-	    current_mode == FB_BLANK_UNBLANK) {
-		/* unblank the screen */
-		val = vga_rseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE);
+	    current_mode == FB_BLANK_UNBLANK)
 		/* clear "FullBandwidth" bit */
-		vga_wseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE, val & 0xdf);
-		/* and undo VESA suspend trickery */
-		vga_wgfx(cinfo->regbase, CL_GRE, 0x00);
-	}
-
-	/* set new */
-	if (blank_mode > FB_BLANK_NORMAL) {
-		/* blank the screen */
-		val = vga_rseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE);
+		val = 0;
+	else
 		/* set "FullBandwidth" bit */
-		vga_wseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE, val | 0x20);
-	}
+		val = 0x20;
+
+	val |= vga_rseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE) & 0xdf;
+	vga_wseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE, val);
 
 	switch (blank_mode) {
 	case FB_BLANK_UNBLANK:
 	case FB_BLANK_NORMAL:
+		val = 0x00;
 		break;
 	case FB_BLANK_VSYNC_SUSPEND:
-		vga_wgfx(cinfo->regbase, CL_GRE, 0x04);
+		val = 0x04;
 		break;
 	case FB_BLANK_HSYNC_SUSPEND:
-		vga_wgfx(cinfo->regbase, CL_GRE, 0x02);
+		val = 0x02;
 		break;
 	case FB_BLANK_POWERDOWN:
-		vga_wgfx(cinfo->regbase, CL_GRE, 0x06);
+		val = 0x06;
 		break;
 	default:
 		dev_dbg(info->device, "EXIT, returning 1\n");
 		return 1;
 	}
 
+	vga_wgfx(cinfo->regbase, CL_GRE, val);
+
 	cinfo->blank_mode = blank_mode;
 	dev_dbg(info->device, "EXIT, returning 0\n");
 
 	/* Let fbcon do a soft blank for us */
 	return (blank_mode == FB_BLANK_NORMAL) ? 1 : 0;
 }
+
 /**** END   Hardware specific Routines **************************************/
 /****************************************************************************/
 /**** BEGIN Internal Routines ***********************************************/
diff --git a/include/video/cirrus.h b/include/video/cirrus.h
index b2776b6c867..a1b7985b6b2 100644
--- a/include/video/cirrus.h
+++ b/include/video/cirrus.h
@@ -71,6 +71,7 @@
 #define CL_CRT1B	0x1b	/* Extended Display Controls */
 #define CL_CRT1C	0x1c	/* Sync adjust and genlock register */
 #define CL_CRT1D	0x1d	/* Overlay Extended Control register */
+#define CL_CRT1E	0x1e	/* Another overflow register */
 #define CL_CRT25	0x25	/* Part Status Register */
 #define CL_CRT27	0x27	/* ID Register */
 #define CL_CRT51	0x51	/* P4 disable "flicker fixer" */
-- 
cgit v1.2.3


From 6e30fc086d000d15abfe5550cc8b286335f7e132 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:05 -0700
Subject: cirrusfb: add mmio registers for Laguna chipsets

The Laguna chipsets use special registers which are available through the
mmio area.  The cirrusfb driver does not use memory mapped registers for
the PCI cards.

Add the memory mapped area for Laguna chipsets and add basic usage of the
special Laguna registers after SVGALIB code.

This gives readable console at 16bpp on the GD-5465 (Laguna AGP).  The
8bpp and 32bpp depths are still broken.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 119e49ed621..378d60e0190 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -327,6 +327,7 @@ enum cirrusfb_dbg_reg_class {
 /* info about board */
 struct cirrusfb_info {
 	u8 __iomem *regbase;
+	u8 __iomem *laguna_mmio;
 	enum cirrus_board btype;
 	unsigned char SFR;	/* Shadow of special function register */
 
@@ -699,6 +700,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	int yres, vdispend, vsyncstart, vsyncend, vtotal;
 	long freq;
 	int nom, den, div;
+	unsigned int control, format, threshold;
 
 	dev_dbg(info->device, "Requested mode: %dx%dx%d\n",
 	       var->xres, var->yres, var->bits_per_pixel);
@@ -866,6 +868,23 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			cirrusfb_set_mclk_as_source(info, divMCLK);
 		}
 	}
+	if (cinfo->btype == BT_LAGUNA) {
+		long pcifc = fb_readl(cinfo->laguna_mmio + 0x3fc);
+		unsigned char tile = fb_readb(cinfo->laguna_mmio + 0x407);
+		unsigned short tile_control;
+
+		tile_control = fb_readw(cinfo->laguna_mmio + 0x2c4);
+		fb_writew(tile_control & ~0x80, cinfo->laguna_mmio + 0x2c4);
+
+		fb_writel(pcifc | 0x10000000l, cinfo->laguna_mmio + 0x3fc);
+		fb_writeb(tile & 0x3f, cinfo->laguna_mmio + 0x407);
+		control = fb_readw(cinfo->laguna_mmio + 0x402);
+		threshold = fb_readw(cinfo->laguna_mmio + 0xea);
+		control &= ~0x6800;
+		format = 0;
+		threshold &= 0xffe0;
+		threshold &= 0x3fbf;
+	}
 	if (nom) {
 		tmp = den << 1;
 		if (div != 0)
@@ -1035,6 +1054,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		case BT_LAGUNA:
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) | 0x01);
+			threshold |= 0x10;
 			break;
 
 		default:
@@ -1146,6 +1166,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		case BT_LAGUNA:
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) & ~0x01);
+			control |= 0x2000;
+			format |= 0x1400;
+			threshold |= 0x10;
 			break;
 
 		default:
@@ -1220,6 +1243,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		case BT_LAGUNA:
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) & ~0x01);
+			control |= 0x6000;
+			format |= 0x3400;
+			threshold |= 0x20;
 			break;
 
 		default:
@@ -1327,6 +1353,12 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	/* graphics cursor attributes: nothing special */
 	vga_wseq(regbase, CL_SEQR12, 0x0);
 
+	if (cinfo->btype == BT_LAGUNA) {
+		/* no tiles */
+		fb_writew(control | 0x1000, cinfo->laguna_mmio + 0x402);
+		fb_writew(format, cinfo->laguna_mmio + 0xc0);
+		fb_writew(threshold, cinfo->laguna_mmio + 0xea);
+	}
 	/* finally, turn on everything - turn off "FullBandwidth" bit */
 	/* also, set "DotClock%2" bit where requested */
 	tmp = 0x01;
@@ -2000,7 +2032,10 @@ static void get_pci_addrs(const struct pci_dev *pdev,
 static void cirrusfb_pci_unmap(struct fb_info *info)
 {
 	struct pci_dev *pdev = to_pci_dev(info->device);
+	struct cirrusfb_info *cinfo = info->par;
 
+	if (cinfo->laguna_mmio == NULL)
+		iounmap(cinfo->laguna_mmio);
 	iounmap(info->screen_base);
 #if 0 /* if system didn't claim this region, we would... */
 	release_mem_region(0xA0000, 65535);
@@ -2180,6 +2215,7 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev,
 		get_pci_addrs(pdev, &board_addr, &info->fix.mmio_start);
 		/* FIXME: this forces VGA.  alternatives? */
 		cinfo->regbase = NULL;
+		cinfo->laguna_mmio = ioremap(info->fix.mmio_start, 0x1000);
 	}
 
 	dev_dbg(info->device, "Board address: 0x%lx, register address: 0x%lx\n",
@@ -2234,6 +2270,8 @@ err_release_regions:
 #endif
 	pci_release_regions(pdev);
 err_release_fb:
+	if (cinfo->laguna_mmio == NULL)
+		iounmap(cinfo->laguna_mmio);
 	framebuffer_release(info);
 err_disable:
 err_out:
-- 
cgit v1.2.3


From 6683e01e2c950f635a6c0e2bbc80db1b1838311f Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:06 -0700
Subject: cirrusfb: do not calculate line length twice

A line length is calculated twice: first in the cirrusfb_decode_var() then
in the cirrusfb_set_par_foo().

Use the first calculated value.  A nice side effect is that 32bpp mode
works now.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Cc: Geert Uytterhoeven <geert.uytterhoeven@gmail.com>
Cc: Arthur Marsh <arthur.marsh@internode.on.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 378d60e0190..53572c0f347 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -694,7 +694,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	struct cirrusfb_regs regs;
 	u8 __iomem *regbase = cinfo->regbase;
 	unsigned char tmp;
-	int offset = 0, err;
+	int err;
+	int pitch;
 	const struct cirrusfb_board_info_rec *bi;
 	int hdispend, hsyncstart, hsyncend, htotal;
 	int yres, vdispend, vsyncstart, vsyncend, vtotal;
@@ -1027,7 +1028,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x06);
 		/* plane mask: only write to first plane */
 		vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0x01);
-		offset = var->xres_virtual / 16;
 	}
 
 	/******************************************************
@@ -1113,7 +1113,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a);
 		/* plane mask: enable writing to all 4 planes */
 		vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff);
-		offset = var->xres_virtual / 8;
 	}
 
 	/******************************************************
@@ -1190,7 +1189,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a);
 		/* plane mask: enable writing to all 4 planes */
 		vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff);
-		offset = var->xres_virtual / 4;
 	}
 
 	/******************************************************
@@ -1263,7 +1261,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a);
 		/* plane mask: enable writing to all 4 planes */
 		vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff);
-		offset = var->xres_virtual / 4;
 	}
 
 	/******************************************************
@@ -1277,9 +1274,10 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			"What's this? requested color depth == %d.\n",
 			var->bits_per_pixel);
 
-	vga_wcrt(regbase, VGA_CRTC_OFFSET, offset & 0xff);
+	pitch = info->fix.line_length >> 3;
+	vga_wcrt(regbase, VGA_CRTC_OFFSET, pitch & 0xff);
 	tmp = 0x22;
-	if (offset & 0x100)
+	if (pitch & 0x100)
 		tmp |= 0x10;	/* offset overflow bit */
 
 	/* screen start addr #16-18, fastpagemode cycles */
@@ -1287,7 +1285,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 	/* screen start address bit 19 */
 	if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19)
-		vga_wcrt(regbase, CL_CRT1D, 0x00);
+		vga_wcrt(regbase, CL_CRT1D, (pitch >> 9) & 1);
 
 	if (cinfo->btype == BT_LAGUNA ||
 	    cinfo->btype == BT_GD5480) {
-- 
cgit v1.2.3


From c4dec3962d6bff26010fcfc61500c1241469a6e0 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:07 -0700
Subject: cirrusfb: use 5-6-5 RGB for 16bpp mode

Use the 5-6-5 RGB mode instead of the 5-5-5 mode at 16bpp depth.

It fixes colors in the 16bpp modes on Cirrus Laguna chips.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Cc: Geert Uytterhoeven <geert.uytterhoeven@gmail.com>
Cc: Arthur Marsh <arthur.marsh@internode.on.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 53572c0f347..d844c41e201 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -499,12 +499,12 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 			var->green.offset = -3;
 			var->blue.offset = 8;
 		} else {
-			var->red.offset = 10;
+			var->red.offset = 11;
 			var->green.offset = 5;
 			var->blue.offset = 0;
 		}
 		var->red.length = 5;
-		var->green.length = 5;
+		var->green.length = 6;
 		var->blue.length = 5;
 		break;
 
@@ -1180,7 +1180,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		/* pixel mask: pass-through all planes */
 		WGen(cinfo, VGA_PEL_MSK, 0xff);
 #ifdef CONFIG_PCI
-		WHDR(cinfo, 0xc0);	/* Copy Xbh */
+		WHDR(cinfo, 0xc1);	/* Copy Xbh */
 #elif defined(CONFIG_ZORRO)
 		/* FIXME: CONFIG_PCI and CONFIG_ZORRO may be defined both */
 		WHDR(cinfo, 0xa0);	/* hidden dac reg: nothing special */
-- 
cgit v1.2.3


From 48c329e906f834711906ab4b0986ea0e857aff16 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:08 -0700
Subject: cirrusfb: various improvements

Various improvements to the code:
- kill a structure with only one
  field: multiplexing and use the
  field directly
- move the cirrusfb_ops structure
  down the file to kill forward
  declarations
- move cirrusfb_init() to kill
  forward declaration
- kill register loads done already
  in the init_vgachip()
- kill assigments done by higher
  layer in the cirrusfb_pan_display()
- do not overwrite line pitch bit in
  the CL_CRT1D register
- kill btype variables if they were
  used only once or twice
- add cpu_relax() in the busy waiting
  loop

The fix to the CL_CRT1D register handling makess the 1024x768 32bpp mode
work.  Previously, only lower resolution modes have worked with 32bpp.

Signed-off-by: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 202 ++++++++++++++---------------------------------
 1 file changed, 58 insertions(+), 144 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index d844c41e201..9e52db7d333 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -313,10 +313,6 @@ static const struct {
 };
 #endif /* CONFIG_ZORRO */
 
-struct cirrusfb_regs {
-	int multiplexing;
-};
-
 #ifdef CIRRUSFB_DEBUG
 enum cirrusfb_dbg_reg_class {
 	CRT,
@@ -331,7 +327,7 @@ struct cirrusfb_info {
 	enum cirrus_board btype;
 	unsigned char SFR;	/* Shadow of special function register */
 
-	struct cirrusfb_regs currentmode;
+	int multiplexing;
 	int blank_mode;
 	u32 pseudo_palette[16];
 
@@ -345,43 +341,8 @@ static char *mode_option __devinitdata = "640x480@60";
 /**** BEGIN PROTOTYPES ******************************************************/
 
 /*--- Interface used by the world ------------------------------------------*/
-static int cirrusfb_init(void);
-#ifndef MODULE
-static int cirrusfb_setup(char *options);
-#endif
-
-static int cirrusfb_open(struct fb_info *info, int user);
-static int cirrusfb_release(struct fb_info *info, int user);
-static int cirrusfb_setcolreg(unsigned regno, unsigned red, unsigned green,
-			      unsigned blue, unsigned transp,
-			      struct fb_info *info);
-static int cirrusfb_check_var(struct fb_var_screeninfo *var,
-			      struct fb_info *info);
-static int cirrusfb_set_par(struct fb_info *info);
 static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 				struct fb_info *info);
-static int cirrusfb_blank(int blank_mode, struct fb_info *info);
-static void cirrusfb_fillrect(struct fb_info *info,
-			      const struct fb_fillrect *region);
-static void cirrusfb_copyarea(struct fb_info *info,
-			      const struct fb_copyarea *area);
-static void cirrusfb_imageblit(struct fb_info *info,
-			       const struct fb_image *image);
-
-/* function table of the above functions */
-static struct fb_ops cirrusfb_ops = {
-	.owner		= THIS_MODULE,
-	.fb_open	= cirrusfb_open,
-	.fb_release	= cirrusfb_release,
-	.fb_setcolreg	= cirrusfb_setcolreg,
-	.fb_check_var	= cirrusfb_check_var,
-	.fb_set_par	= cirrusfb_set_par,
-	.fb_pan_display = cirrusfb_pan_display,
-	.fb_blank	= cirrusfb_blank,
-	.fb_fillrect	= cirrusfb_fillrect,
-	.fb_copyarea	= cirrusfb_copyarea,
-	.fb_imageblit	= cirrusfb_imageblit,
-};
 
 /*--- Internal routines ----------------------------------------------------*/
 static void init_vgachip(struct fb_info *info);
@@ -587,7 +548,6 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 }
 
 static int cirrusfb_decode_var(const struct fb_var_screeninfo *var,
-				struct cirrusfb_regs *regs,
 				struct fb_info *info)
 {
 	long freq;
@@ -628,7 +588,7 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var,
 	dev_dbg(info->device, "desired pixclock: %ld kHz\n", freq);
 
 	maxclock = cirrusfb_board_info[cinfo->btype].maxclock[maxclockidx];
-	regs->multiplexing = 0;
+	cinfo->multiplexing = 0;
 
 	/* If the frequency is greater than we can support, we might be able
 	 * to use multiplexing for the video mode */
@@ -636,7 +596,7 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var,
 		switch (cinfo->btype) {
 		case BT_ALPINE:
 		case BT_GD5480:
-			regs->multiplexing = 1;
+			cinfo->multiplexing = 1;
 			break;
 
 		default:
@@ -691,7 +651,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 {
 	struct cirrusfb_info *cinfo = info->par;
 	struct fb_var_screeninfo *var = &info->var;
-	struct cirrusfb_regs regs;
 	u8 __iomem *regbase = cinfo->regbase;
 	unsigned char tmp;
 	int err;
@@ -709,7 +668,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 	init_vgachip(info);
 
-	err = cirrusfb_decode_var(var, &regs, info);
+	err = cirrusfb_decode_var(var, info);
 	if (err) {
 		/* should never happen */
 		dev_dbg(info->device, "mode change aborted.  invalid var.\n");
@@ -753,7 +712,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		vsyncend /= 2;
 		vdispend /= 2;
 	}
-	if (regs.multiplexing) {
+	if (cinfo->multiplexing) {
 		htotal /= 2;
 		hsyncstart /= 2;
 		hsyncend /= 2;
@@ -964,7 +923,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		case BT_ALPINE:
 		case BT_GD5480:
 			vga_wseq(regbase, CL_SEQR7,
-				 regs.multiplexing ?
+				 cinfo->multiplexing ?
 					bi->sr07_1bpp_mux : bi->sr07_1bpp);
 			break;
 
@@ -1018,7 +977,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 		/* pixel mask: pass-through for first plane */
 		WGen(cinfo, VGA_PEL_MSK, 0x01);
-		if (regs.multiplexing)
+		if (cinfo->multiplexing)
 			/* hidden dac reg: 1280x1024 */
 			WHDR(cinfo, 0x4a);
 		else
@@ -1047,7 +1006,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		case BT_ALPINE:
 		case BT_GD5480:
 			vga_wseq(regbase, CL_SEQR7,
-				  regs.multiplexing ?
+				  cinfo->multiplexing ?
 					bi->sr07_8bpp_mux : bi->sr07_8bpp);
 			break;
 
@@ -1101,18 +1060,12 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 		/* mode register: 256 color mode */
 		vga_wgfx(regbase, VGA_GFX_MODE, 64);
-		/* pixel mask: pass-through all planes */
-		WGen(cinfo, VGA_PEL_MSK, 0xff);
-		if (regs.multiplexing)
+		if (cinfo->multiplexing)
 			/* hidden dac reg: 1280x1024 */
 			WHDR(cinfo, 0x4a);
 		else
 			/* hidden dac: nothing */
 			WHDR(cinfo, 0);
-		/* memory mode: chain4, ext. memory */
-		vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a);
-		/* plane mask: enable writing to all 4 planes */
-		vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff);
 	}
 
 	/******************************************************
@@ -1177,18 +1130,12 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 		/* mode register: 256 color mode */
 		vga_wgfx(regbase, VGA_GFX_MODE, 64);
-		/* pixel mask: pass-through all planes */
-		WGen(cinfo, VGA_PEL_MSK, 0xff);
 #ifdef CONFIG_PCI
 		WHDR(cinfo, 0xc1);	/* Copy Xbh */
 #elif defined(CONFIG_ZORRO)
 		/* FIXME: CONFIG_PCI and CONFIG_ZORRO may be defined both */
 		WHDR(cinfo, 0xa0);	/* hidden dac reg: nothing special */
 #endif
-		/* memory mode: chain4, ext. memory */
-		vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a);
-		/* plane mask: enable writing to all 4 planes */
-		vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff);
 	}
 
 	/******************************************************
@@ -1253,14 +1200,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 		/* mode register: 256 color mode */
 		vga_wgfx(regbase, VGA_GFX_MODE, 64);
-		/* pixel mask: pass-through all planes */
-		WGen(cinfo, VGA_PEL_MSK, 0xff);
 		/* hidden dac reg: 8-8-8 mode (24 or 32) */
 		WHDR(cinfo, 0xc5);
-		/* memory mode: chain4, ext. memory */
-		vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a);
-		/* plane mask: enable writing to all 4 planes */
-		vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff);
 	}
 
 	/******************************************************
@@ -1309,48 +1250,13 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	}
 
 
-	/* text cursor location high */
-	vga_wcrt(regbase, VGA_CRTC_CURSOR_HI, 0);
-	/* text cursor location low */
-	vga_wcrt(regbase, VGA_CRTC_CURSOR_LO, 0);
-	/* underline row scanline = at very bottom */
-	vga_wcrt(regbase, VGA_CRTC_UNDERLINE, 0);
-
-	/* controller mode */
-	vga_wattr(regbase, VGA_ATC_MODE, 1);
-	/* overscan (border) color */
-	vga_wattr(regbase, VGA_ATC_OVERSCAN, 0);
-	/* color plane enable */
-	vga_wattr(regbase, VGA_ATC_PLANE_ENABLE, 15);
 	/* pixel panning */
 	vga_wattr(regbase, CL_AR33, 0);
-	/* color select */
-	vga_wattr(regbase, VGA_ATC_COLOR_PAGE, 0);
 
 	/* [ EGS: SetOffset(); ] */
 	/* From SetOffset(): Turn on VideoEnable bit in Attribute controller */
 	AttrOn(cinfo);
 
-	/* set/reset register */
-	vga_wgfx(regbase, VGA_GFX_SR_VALUE, 0);
-	/* set/reset enable */
-	vga_wgfx(regbase, VGA_GFX_SR_ENABLE, 0);
-	/* color compare */
-	vga_wgfx(regbase, VGA_GFX_COMPARE_VALUE, 0);
-	/* data rotate */
-	vga_wgfx(regbase, VGA_GFX_DATA_ROTATE, 0);
-	/* read map select */
-	vga_wgfx(regbase, VGA_GFX_PLANE_READ, 0);
-	/* miscellaneous register */
-	vga_wgfx(regbase, VGA_GFX_MISC, 1);
-	/* color don't care */
-	vga_wgfx(regbase, VGA_GFX_COMPARE_MASK, 15);
-	/* bit mask */
-	vga_wgfx(regbase, VGA_GFX_BIT_MASK, 255);
-
-	/* graphics cursor attributes: nothing special */
-	vga_wseq(regbase, CL_SEQR12, 0x0);
-
 	if (cinfo->btype == BT_LAGUNA) {
 		/* no tiles */
 		fb_writew(control | 0x1000, cinfo->laguna_mmio + 0x402);
@@ -1369,8 +1275,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	vga_wseq(regbase, VGA_SEQ_CLOCK_MODE, tmp);
 	dev_dbg(info->device, "CL_SEQR1: %d\n", tmp);
 
-	cinfo->currentmode = regs;
-
 	/* pan to requested offset */
 	cirrusfb_pan_display(var, info);
 
@@ -1443,9 +1347,6 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 	if (var->vmode & FB_VMODE_YWRAP)
 		return -EINVAL;
 
-	info->var.xoffset = var->xoffset;
-	info->var.yoffset = var->yoffset;
-
 	xoffset = var->xoffset * info->var.bits_per_pixel / 8;
 	yoffset = var->yoffset;
 
@@ -1480,8 +1381,11 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 	vga_wcrt(cinfo->regbase, CL_CRT1B, tmp);
 
 	/* construct bit 19 of screen start address */
-	if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19)
-		vga_wcrt(cinfo->regbase, CL_CRT1D, (base >> 12) & 0x80);
+	if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19) {
+		tmp = vga_rcrt(cinfo->regbase, CL_CRT1D) & ~0x80;
+		tmp |= (base >> 12) & 0x80;
+		vga_wcrt(cinfo->regbase, CL_CRT1D, tmp);
+	}
 
 	/* write pixel panning value to AR33; this does not quite work in 8bpp
 	 *
@@ -1670,8 +1574,8 @@ static void init_vgachip(struct fb_info *info)
 	vga_wseq(cinfo->regbase, VGA_SEQ_PLANE_WRITE, 0xff);
 	/* character map select: doesn't even matter in gx mode */
 	vga_wseq(cinfo->regbase, VGA_SEQ_CHARACTER_MAP, 0x00);
-	/* memory mode: chain-4, no odd/even, ext. memory */
-	vga_wseq(cinfo->regbase, VGA_SEQ_MEMORY_MODE, 0x0e);
+	/* memory mode: chain4, ext. memory */
+	vga_wseq(cinfo->regbase, VGA_SEQ_MEMORY_MODE, 0x0a);
 
 	/* controller-internal base address of video memory */
 	if (bi->init_sr07)
@@ -1784,7 +1688,6 @@ static void init_vgachip(struct fb_info *info)
 	vga_wattr(cinfo->regbase, VGA_ATC_OVERSCAN, 0x00);
 	/* Color Plane enable: Enable all 4 planes */
 	vga_wattr(cinfo->regbase, VGA_ATC_PLANE_ENABLE, 0x0f);
-/* ###  vga_wattr(cinfo->regbase, CL_AR33, 0x00); * Pixel Panning: - */
 	/* Color Select: - */
 	vga_wattr(cinfo->regbase, VGA_ATC_COLOR_PAGE, 0x00);
 
@@ -2063,6 +1966,21 @@ static void cirrusfb_zorro_unmap(struct fb_info *info)
 }
 #endif /* CONFIG_ZORRO */
 
+/* function table of the above functions */
+static struct fb_ops cirrusfb_ops = {
+	.owner		= THIS_MODULE,
+	.fb_open	= cirrusfb_open,
+	.fb_release	= cirrusfb_release,
+	.fb_setcolreg	= cirrusfb_setcolreg,
+	.fb_check_var	= cirrusfb_check_var,
+	.fb_set_par	= cirrusfb_set_par,
+	.fb_pan_display = cirrusfb_pan_display,
+	.fb_blank	= cirrusfb_blank,
+	.fb_fillrect	= cirrusfb_fillrect,
+	.fb_copyarea	= cirrusfb_copyarea,
+	.fb_imageblit	= cirrusfb_imageblit,
+};
+
 static int __devinit cirrusfb_set_fbinfo(struct fb_info *info)
 {
 	struct cirrusfb_info *cinfo = info->par;
@@ -2111,12 +2029,9 @@ static int __devinit cirrusfb_register(struct fb_info *info)
 {
 	struct cirrusfb_info *cinfo = info->par;
 	int err;
-	enum cirrus_board btype;
-
-	btype = cinfo->btype;
 
 	/* sanity checks */
-	assert(btype != BT_NONE);
+	assert(cinfo->btype != BT_NONE);
 
 	/* set all the vital stuff */
 	cirrusfb_set_fbinfo(info);
@@ -2132,7 +2047,7 @@ static int __devinit cirrusfb_register(struct fb_info *info)
 
 	info->var.activate = FB_ACTIVATE_NOW;
 
-	err = cirrusfb_decode_var(&info->var, &cinfo->currentmode, info);
+	err = cirrusfb_decode_var(&info->var, info);
 	if (err < 0) {
 		/* should never happen */
 		dev_dbg(info->device,
@@ -2174,7 +2089,6 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev,
 {
 	struct cirrusfb_info *cinfo;
 	struct fb_info *info;
-	enum cirrus_board btype;
 	unsigned long board_addr, board_size;
 	int ret;
 
@@ -2192,11 +2106,11 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev,
 	}
 
 	cinfo = info->par;
-	cinfo->btype = btype = (enum cirrus_board) ent->driver_data;
+	cinfo->btype = (enum cirrus_board) ent->driver_data;
 
 	dev_dbg(info->device,
 		" Found PCI device, base address 0 is 0x%Lx, btype set to %d\n",
-		(unsigned long long)pdev->resource[0].start, btype);
+		(unsigned long long)pdev->resource[0].start,  cinfo->btype);
 	dev_dbg(info->device, " base address 1 is 0x%Lx\n",
 		(unsigned long long)pdev->resource[1].start);
 
@@ -2219,7 +2133,7 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev,
 	dev_dbg(info->device, "Board address: 0x%lx, register address: 0x%lx\n",
 		board_addr, info->fix.mmio_start);
 
-	board_size = (btype == BT_GD5480) ?
+	board_size = (cinfo->btype == BT_GD5480) ?
 		32 * MB_ : cirrusfb_get_memsize(info, cinfo->regbase);
 
 	ret = pci_request_regions(pdev, "cirrusfb");
@@ -2425,27 +2339,6 @@ static struct zorro_driver cirrusfb_zorro_driver = {
 };
 #endif /* CONFIG_ZORRO */
 
-static int __init cirrusfb_init(void)
-{
-	int error = 0;
-
-#ifndef MODULE
-	char *option = NULL;
-
-	if (fb_get_options("cirrusfb", &option))
-		return -ENODEV;
-	cirrusfb_setup(option);
-#endif
-
-#ifdef CONFIG_ZORRO
-	error |= zorro_register_driver(&cirrusfb_zorro_driver);
-#endif
-#ifdef CONFIG_PCI
-	error |= pci_register_driver(&cirrusfb_pci_driver);
-#endif
-	return error;
-}
-
 #ifndef MODULE
 static int __init cirrusfb_setup(char *options)
 {
@@ -2477,6 +2370,27 @@ MODULE_AUTHOR("Copyright 1999,2000 Jeff Garzik <jgarzik@pobox.com>");
 MODULE_DESCRIPTION("Accelerated FBDev driver for Cirrus Logic chips");
 MODULE_LICENSE("GPL");
 
+static int __init cirrusfb_init(void)
+{
+	int error = 0;
+
+#ifndef MODULE
+	char *option = NULL;
+
+	if (fb_get_options("cirrusfb", &option))
+		return -ENODEV;
+	cirrusfb_setup(option);
+#endif
+
+#ifdef CONFIG_ZORRO
+	error |= zorro_register_driver(&cirrusfb_zorro_driver);
+#endif
+#ifdef CONFIG_PCI
+	error |= pci_register_driver(&cirrusfb_pci_driver);
+#endif
+	return error;
+}
+
 static void __exit cirrusfb_exit(void)
 {
 #ifdef CONFIG_PCI
@@ -2683,7 +2597,7 @@ static void cirrusfb_WaitBLT(u8 __iomem *regbase)
 {
 	/* now busy-wait until we're done */
 	while (vga_rgfx(regbase, CL_GR31) & 0x08)
-		/* do nothing */ ;
+		cpu_relax();
 }
 
 /*******************************************************************
-- 
cgit v1.2.3


From 1b48cb563d59e03dbf530174f30c0ed3b6fba513 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:08 -0700
Subject: cirrusfb: Laguna chipset 8bpp fix

Fix 8bpp mode by adding handling of the Laguna chipsets to various places
and stop trashing a HDR register which probably does not exist on the
Laguna.

Fix compilation warnings about uninitialized variables also.

Finally, all 8bpp, 16bpp and 32bpp modes work on the Laguna chipset.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 34 +++++++++++++++++++++-------------
 include/video/cirrus.h   |  1 -
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 9e52db7d333..12f45520e5d 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -660,7 +660,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	int yres, vdispend, vsyncstart, vsyncend, vtotal;
 	long freq;
 	int nom, den, div;
-	unsigned int control, format, threshold;
+	unsigned int control = 0, format = 0, threshold = 0;
 
 	dev_dbg(info->device, "Requested mode: %dx%dx%d\n",
 	       var->xres, var->yres, var->bits_per_pixel);
@@ -842,8 +842,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		threshold = fb_readw(cinfo->laguna_mmio + 0xea);
 		control &= ~0x6800;
 		format = 0;
-		threshold &= 0xffe0;
-		threshold &= 0x3fbf;
+		threshold &= 0xffe0 & 0x3fbf;
 	}
 	if (nom) {
 		tmp = den << 1;
@@ -893,6 +892,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		tmp |= 0x40;
 	if (var->sync & FB_SYNC_VERT_HIGH_ACT)
 		tmp |= 0x80;
+	if (cinfo->btype == BT_LAGUNA)
+		tmp |= 0xc;
 	WGen(cinfo, VGA_MIS_W, tmp);
 
 	/* Screen A Preset Row-Scan register */
@@ -1228,9 +1229,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19)
 		vga_wcrt(regbase, CL_CRT1D, (pitch >> 9) & 1);
 
-	if (cinfo->btype == BT_LAGUNA ||
-	    cinfo->btype == BT_GD5480) {
-
+	if (cinfo->btype == BT_LAGUNA) {
 		tmp = 0;
 		if ((htotal + 5) & 256)
 			tmp |= 128;
@@ -1360,7 +1359,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 		xpix = (unsigned char) ((xoffset % 4) * 2);
 	}
 
-	cirrusfb_WaitBLT(cinfo->regbase); /* make sure all the BLT's are done */
+	if (cinfo->btype != BT_LAGUNA)
+		cirrusfb_WaitBLT(cinfo->regbase);
 
 	/* lower 8 + 8 bits of screen start address */
 	vga_wcrt(cinfo->regbase, VGA_CRTC_START_LO,
@@ -1394,7 +1394,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 	if (info->var.bits_per_pixel == 1)
 		vga_wattr(cinfo->regbase, CL_AR33, xpix);
 
-	cirrusfb_WaitBLT(cinfo->regbase);
+	if (cinfo->btype != BT_LAGUNA)
+		cirrusfb_WaitBLT(cinfo->regbase);
 
 	return 0;
 }
@@ -1513,6 +1514,7 @@ static void init_vgachip(struct fb_info *info)
 		vga_wgfx(cinfo->regbase, CL_GR2F, 0x00);
 		break;
 
+	case BT_LAGUNA:
 	case BT_ALPINE:
 		/* Nothing to do to reset the board. */
 		break;
@@ -1538,7 +1540,7 @@ static void init_vgachip(struct fb_info *info)
 			WGen(cinfo, CL_VSSM2, 0x01);
 
 		/* reset sequencer logic */
-		vga_wseq(cinfo->regbase, CL_SEQR0, 0x03);
+		vga_wseq(cinfo->regbase, VGA_SEQ_RESET, 0x03);
 
 		/* FullBandwidth (video off) and 8/9 dot clock */
 		vga_wseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE, 0x21);
@@ -1560,6 +1562,7 @@ static void init_vgachip(struct fb_info *info)
 			vga_wseq(cinfo->regbase, CL_SEQRF, 0x98);
 			break;
 		case BT_ALPINE:
+		case BT_LAGUNA:
 			break;
 		case BT_SD64:
 			vga_wseq(cinfo->regbase, CL_SEQRF, 0xb8);
@@ -1648,7 +1651,8 @@ static void init_vgachip(struct fb_info *info)
 	vga_wgfx(cinfo->regbase, VGA_GFX_COMPARE_MASK, 0x0f);
 	/* Bit Mask: no mask at all */
 	vga_wgfx(cinfo->regbase, VGA_GFX_BIT_MASK, 0xff);
-	if (cinfo->btype == BT_ALPINE)
+
+	if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_LAGUNA)
 		/* (5434 can't have bit 3 set for bitblt) */
 		vga_wgfx(cinfo->regbase, CL_GRB, 0x20);
 	else
@@ -1845,7 +1849,8 @@ static void cirrusfb_imageblit(struct fb_info *info,
 {
 	struct cirrusfb_info *cinfo = info->par;
 
-	cirrusfb_WaitBLT(cinfo->regbase);
+	if (cinfo->btype != BT_LAGUNA)
+		cirrusfb_WaitBLT(cinfo->regbase);
 	cfb_imageblit(info, image);
 }
 
@@ -1992,7 +1997,7 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info)
 		    | FBINFO_HWACCEL_YPAN
 		    | FBINFO_HWACCEL_FILLRECT
 		    | FBINFO_HWACCEL_COPYAREA;
-	if (noaccel)
+	if (noaccel || cinfo->btype == BT_LAGUNA)
 		info->flags |= FBINFO_HWACCEL_DISABLED;
 	info->fbops = &cirrusfb_ops;
 	if (cinfo->btype == BT_GD5480) {
@@ -2481,6 +2486,8 @@ static void WHDR(const struct cirrusfb_info *cinfo, unsigned char val)
 {
 	unsigned char dummy;
 
+	if (cinfo->btype == BT_LAGUNA)
+		return;
 	if (cinfo->btype == BT_PICASSO) {
 		/* Klaus' hint for correct access to HDR on some boards */
 		/* first write 0 to pixel mask (3c6) */
@@ -2548,7 +2555,8 @@ static void WClut(struct cirrusfb_info *cinfo, unsigned char regnum, unsigned ch
 	vga_w(cinfo->regbase, VGA_PEL_IW, regnum);
 
 	if (cinfo->btype == BT_PICASSO || cinfo->btype == BT_PICASSO4 ||
-	    cinfo->btype == BT_ALPINE || cinfo->btype == BT_GD5480) {
+	    cinfo->btype == BT_ALPINE || cinfo->btype == BT_GD5480 ||
+	    cinfo->btype == BT_LAGUNA) {
 		/* but DAC data register IS, at least for Picasso II */
 		if (cinfo->btype == BT_PICASSO)
 			data += 0xfff;
diff --git a/include/video/cirrus.h b/include/video/cirrus.h
index a1b7985b6b2..9a5e9ee3078 100644
--- a/include/video/cirrus.h
+++ b/include/video/cirrus.h
@@ -32,7 +32,6 @@
 #define CL_VSSM2	0x3c3	/* Motherboard Sleep */
 
 /*** VGA Sequencer Registers ***/
-#define CL_SEQR0	0x0	/* Reset */
 /* the following are from the "extension registers" group */
 #define CL_SEQR6	0x6	/* Unlock ALL Extensions */
 #define CL_SEQR7	0x7	/* Extended Sequencer Mode */
-- 
cgit v1.2.3


From 99a4584752bb41330342a427d014482525de7433 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:09 -0700
Subject: cirrusfb: check_var improvements

Break cirrusfb_decode_var() function into two parts:
cirrusfb_check_pixclock() which can be called from the
cirrusfb_check_var() aand merge rest into the cirrusfb_set_par_foo().
This allows rejecting modes with too high pixclock before before any
change to hardware state (and a console is messed up).

Also, fix RGB field's lengths for 8bpp modes to correct ones so X11 works
with fbdev driver with cirrusfb.

Kill some redundant function calls or register loads.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 195 ++++++++++++++++++-----------------------------
 1 file changed, 74 insertions(+), 121 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 12f45520e5d..bab713b63a0 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -432,6 +432,53 @@ static int cirrusfb_check_mclk(struct fb_info *info, long freq)
 	return 0;
 }
 
+static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var,
+				   struct fb_info *info)
+{
+	long freq;
+	long maxclock;
+	struct cirrusfb_info *cinfo = info->par;
+	unsigned maxclockidx = var->bits_per_pixel >> 3;
+
+	/* convert from ps to kHz */
+	freq = PICOS2KHZ(var->pixclock);
+
+	dev_dbg(info->device, "desired pixclock: %ld kHz\n", freq);
+
+	maxclock = cirrusfb_board_info[cinfo->btype].maxclock[maxclockidx];
+	cinfo->multiplexing = 0;
+
+	/* If the frequency is greater than we can support, we might be able
+	 * to use multiplexing for the video mode */
+	if (freq > maxclock) {
+		switch (cinfo->btype) {
+		case BT_ALPINE:
+		case BT_GD5480:
+			cinfo->multiplexing = 1;
+			break;
+
+		default:
+			dev_err(info->device,
+				"Frequency greater than maxclock (%ld kHz)\n",
+				maxclock);
+			return -EINVAL;
+		}
+	}
+#if 0
+	/* TODO: If we have a 1MB 5434, we need to put ourselves in a mode where
+	 * the VCLK is double the pixel clock. */
+	switch (var->bits_per_pixel) {
+	case 16:
+	case 32:
+		if (var->xres <= 800)
+			/* Xbh has this type of clock for 32-bit */
+			freq /= 2;
+		break;
+	}
+#endif
+	return 0;
+}
+
 static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 			      struct fb_info *info)
 {
@@ -449,7 +496,7 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 
 	case 8:
 		var->red.offset = 0;
-		var->red.length = 6;
+		var->red.length = 8;
 		var->green = var->red;
 		var->blue = var->red;
 		break;
@@ -513,7 +560,6 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 		return -EINVAL;
 	}
 
-
 	if (var->xoffset < 0)
 		var->xoffset = 0;
 	if (var->yoffset < 0)
@@ -544,80 +590,9 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 		return -EINVAL;
 	}
 
-	return 0;
-}
-
-static int cirrusfb_decode_var(const struct fb_var_screeninfo *var,
-				struct fb_info *info)
-{
-	long freq;
-	long maxclock;
-	int maxclockidx = var->bits_per_pixel >> 3;
-	struct cirrusfb_info *cinfo = info->par;
-
-	switch (var->bits_per_pixel) {
-	case 1:
-		info->fix.line_length = var->xres_virtual / 8;
-		info->fix.visual = FB_VISUAL_MONO10;
-		break;
-
-	case 8:
-		info->fix.line_length = var->xres_virtual;
-		info->fix.visual = FB_VISUAL_PSEUDOCOLOR;
-		break;
-
-	case 16:
-	case 32:
-		info->fix.line_length = var->xres_virtual * maxclockidx;
-		info->fix.visual = FB_VISUAL_TRUECOLOR;
-		break;
-
-	default:
-		dev_dbg(info->device,
-			"Unsupported bpp size: %d\n", var->bits_per_pixel);
-		assert(false);
-		/* should never occur */
-		break;
-	}
-
-	info->fix.type = FB_TYPE_PACKED_PIXELS;
-
-	/* convert from ps to kHz */
-	freq = PICOS2KHZ(var->pixclock);
-
-	dev_dbg(info->device, "desired pixclock: %ld kHz\n", freq);
-
-	maxclock = cirrusfb_board_info[cinfo->btype].maxclock[maxclockidx];
-	cinfo->multiplexing = 0;
-
-	/* If the frequency is greater than we can support, we might be able
-	 * to use multiplexing for the video mode */
-	if (freq > maxclock) {
-		switch (cinfo->btype) {
-		case BT_ALPINE:
-		case BT_GD5480:
-			cinfo->multiplexing = 1;
-			break;
+	if (cirrusfb_check_pixclock(var, info))
+		return -EINVAL;
 
-		default:
-			dev_err(info->device,
-				"Frequency greater than maxclock (%ld kHz)\n",
-				maxclock);
-			return -EINVAL;
-		}
-	}
-#if 0
-	/* TODO: If we have a 1MB 5434, we need to put ourselves in a mode where
-	 * the VCLK is double the pixel clock. */
-	switch (var->bits_per_pixel) {
-	case 16:
-	case 32:
-		if (var->xres <= 800)
-			/* Xbh has this type of clock for 32-bit */
-			freq /= 2;
-		break;
-	}
-#endif
 	return 0;
 }
 
@@ -653,7 +628,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	struct fb_var_screeninfo *var = &info->var;
 	u8 __iomem *regbase = cinfo->regbase;
 	unsigned char tmp;
-	int err;
 	int pitch;
 	const struct cirrusfb_board_info_rec *bi;
 	int hdispend, hsyncstart, hsyncend, htotal;
@@ -664,16 +638,28 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 	dev_dbg(info->device, "Requested mode: %dx%dx%d\n",
 	       var->xres, var->yres, var->bits_per_pixel);
-	dev_dbg(info->device, "pixclock: %d\n", var->pixclock);
 
-	init_vgachip(info);
+	switch (var->bits_per_pixel) {
+	case 1:
+		info->fix.line_length = var->xres_virtual / 8;
+		info->fix.visual = FB_VISUAL_MONO10;
+		break;
 
-	err = cirrusfb_decode_var(var, info);
-	if (err) {
-		/* should never happen */
-		dev_dbg(info->device, "mode change aborted.  invalid var.\n");
-		return -EINVAL;
+	case 8:
+		info->fix.line_length = var->xres_virtual;
+		info->fix.visual = FB_VISUAL_PSEUDOCOLOR;
+		break;
+
+	case 16:
+	case 32:
+		info->fix.line_length = var->xres_virtual *
+					var->bits_per_pixel >> 3;
+		info->fix.visual = FB_VISUAL_TRUECOLOR;
+		break;
 	}
+	info->fix.type = FB_TYPE_PACKED_PIXELS;
+
+	init_vgachip(info);
 
 	bi = &cirrusfb_board_info[cinfo->btype];
 
@@ -873,9 +859,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		 * address wrap, no compat. */
 		vga_wcrt(regbase, VGA_CRTC_MODE, 0xc3);
 
-/* HAEH?	vga_wcrt(regbase, VGA_CRTC_V_SYNC_END, 0x20);
- * previously: 0x00  unlock VGA_CRTC_H_TOTAL..CRT7 */
-
 	/* don't know if it would hurt to also program this if no interlaced */
 	/* mode is used, but I feel better this way.. :-) */
 	if (var->vmode & FB_VMODE_INTERLACED)
@@ -883,8 +866,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	else
 		vga_wcrt(regbase, VGA_CRTC_REGS, 0x00);	/* interlace control */
 
-	vga_wseq(regbase, VGA_SEQ_CHARACTER_MAP, 0);
-
 	/* adjust horizontal/vertical sync type (low/high) */
 	/* enable display memory & CRTC I/O address for color mode */
 	tmp = 0x03;
@@ -896,8 +877,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		tmp |= 0xc;
 	WGen(cinfo, VGA_MIS_W, tmp);
 
-	/* Screen A Preset Row-Scan register */
-	vga_wcrt(regbase, VGA_CRTC_PRESET_ROW, 0);
 	/* text cursor on and start line */
 	vga_wcrt(regbase, VGA_CRTC_CURSOR_START, 0);
 	/* text cursor end line */
@@ -1248,7 +1227,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		dev_dbg(info->device, "CRT1e: %d\n", tmp);
 	}
 
-
 	/* pixel panning */
 	vga_wattr(regbase, CL_AR33, 0);
 
@@ -1274,9 +1252,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	vga_wseq(regbase, VGA_SEQ_CLOCK_MODE, tmp);
 	dev_dbg(info->device, "CL_SEQR1: %d\n", tmp);
 
-	/* pan to requested offset */
-	cirrusfb_pan_display(var, info);
-
 #ifdef CIRRUSFB_DEBUG
 	cirrusfb_dbg_reg_dump(info, NULL);
 #endif
@@ -1332,8 +1307,7 @@ static int cirrusfb_setcolreg(unsigned regno, unsigned red, unsigned green,
 static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 				struct fb_info *info)
 {
-	int xoffset = 0;
-	int yoffset = 0;
+	int xoffset;
 	unsigned long base;
 	unsigned char tmp, xpix;
 	struct cirrusfb_info *cinfo = info->par;
@@ -1347,9 +1321,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 		return -EINVAL;
 
 	xoffset = var->xoffset * info->var.bits_per_pixel / 8;
-	yoffset = var->yoffset;
 
-	base = yoffset * info->fix.line_length + xoffset;
+	base = var->yoffset * info->fix.line_length + xoffset;
 
 	if (info->var.bits_per_pixel == 1) {
 		/* base is already correct */
@@ -1363,10 +1336,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 		cirrusfb_WaitBLT(cinfo->regbase);
 
 	/* lower 8 + 8 bits of screen start address */
-	vga_wcrt(cinfo->regbase, VGA_CRTC_START_LO,
-		 (unsigned char) (base & 0xff));
-	vga_wcrt(cinfo->regbase, VGA_CRTC_START_HI,
-		 (unsigned char) (base >> 8));
+	vga_wcrt(cinfo->regbase, VGA_CRTC_START_LO, base & 0xff);
+	vga_wcrt(cinfo->regbase, VGA_CRTC_START_HI, (base >> 8) & 0xff);
 
 	/* 0xf2 is %11110010, exclude tmp bits */
 	tmp = vga_rcrt(cinfo->regbase, CL_CRT1B) & 0xf2;
@@ -1544,10 +1515,6 @@ static void init_vgachip(struct fb_info *info)
 
 		/* FullBandwidth (video off) and 8/9 dot clock */
 		vga_wseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE, 0x21);
-		/* polarity (-/-), disable access to display memory,
-		 * VGA_CRTC_START_HI base address: color
-		 */
-		WGen(cinfo, VGA_MIS_W, 0xc1);
 
 		/* "magic cookie" - doesn't make any sense to me.. */
 /*      vga_wgfx(cinfo->regbase, CL_GRA, 0xce);   */
@@ -1614,10 +1581,6 @@ static void init_vgachip(struct fb_info *info)
 	vga_wcrt(cinfo->regbase, VGA_CRTC_CURSOR_START, 0x20);
 	/* Text cursor end: - */
 	vga_wcrt(cinfo->regbase, VGA_CRTC_CURSOR_END, 0x00);
-	/* Screen start address high: 0 */
-	vga_wcrt(cinfo->regbase, VGA_CRTC_START_HI, 0x00);
-	/* Screen start address low: 0 */
-	vga_wcrt(cinfo->regbase, VGA_CRTC_START_LO, 0x00);
 	/* text cursor location high: 0 */
 	vga_wcrt(cinfo->regbase, VGA_CRTC_CURSOR_HI, 0x00);
 	/* text cursor location low: 0 */
@@ -1625,10 +1588,6 @@ static void init_vgachip(struct fb_info *info)
 
 	/* Underline Row scanline: - */
 	vga_wcrt(cinfo->regbase, VGA_CRTC_UNDERLINE, 0x00);
-	/* mode control: timing enable, byte mode, no compat modes */
-	vga_wcrt(cinfo->regbase, VGA_CRTC_MODE, 0xc3);
-	/* Line Compare: not needed */
-	vga_wcrt(cinfo->regbase, VGA_CRTC_LINE_COMPARE, 0x00);
 	/* ### add 0x40 for text modes with > 30 MHz pixclock */
 	/* ext. display controls: ext.adr. wrap */
 	vga_wcrt(cinfo->regbase, CL_CRT1B, 0x02);
@@ -1697,12 +1656,6 @@ static void init_vgachip(struct fb_info *info)
 
 	WGen(cinfo, VGA_PEL_MSK, 0xff);	/* Pixel mask: no mask */
 
-	if (cinfo->btype != BT_ALPINE && cinfo->btype != BT_GD5480)
-	/* polarity (-/-), enable display mem,
-	 * VGA_CRTC_START_HI i/o base = color
-	 */
-		WGen(cinfo, VGA_MIS_W, 0xc3);
-
 	/* BLT Start/status: Blitter reset */
 	vga_wgfx(cinfo->regbase, CL_GR31, 0x04);
 	/* - " -	   : "end-of-reset" */
@@ -2052,7 +2005,7 @@ static int __devinit cirrusfb_register(struct fb_info *info)
 
 	info->var.activate = FB_ACTIVATE_NOW;
 
-	err = cirrusfb_decode_var(&info->var, info);
+	err = cirrusfb_check_var(&info->var, info);
 	if (err < 0) {
 		/* should never happen */
 		dev_dbg(info->device,
-- 
cgit v1.2.3


From 78d780e07247d52d3943b019bf9459bc9e95de1e Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:10 -0700
Subject: cirrusfb: various Laguna fixes

- The Laguna GD5465 (AGP) has one register more than non-AGP chips.
  Recognize the AGP version and write a tile control register only on the
  AGP version.  Tested only on an AGP card.

- Bump up RAMDAC frequencies after X11 code.  This allow to drive a flat
  panel resolution 1680x1050 at 16bpp from the 4MB card.

- Fix screen start address overflow bits on Laguna cards (CRT1D
  register).

- Fix exit path in the cirrusfb_pci_register() in case of error.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 84 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 57 insertions(+), 27 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index bab713b63a0..cac4e2b0cba 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -102,7 +102,8 @@ enum cirrus_board {
 	BT_PICASSO4,	/* GD5446 */
 	BT_ALPINE,	/* GD543x/4x */
 	BT_GD5480,
-	BT_LAGUNA,	/* GD546x */
+	BT_LAGUNA,	/* GD5462/64 */
+	BT_LAGUNAB,	/* GD5465 */
 };
 
 /*
@@ -234,8 +235,18 @@ static const struct cirrusfb_board_info_rec {
 	[BT_LAGUNA] = {
 		.name			= "CL Laguna",
 		.maxclock		= {
-			/* guess */
-			135100, 135100, 135100, 135100, 135100,
+			/* taken from X11 code */
+			170000, 170000, 170000, 170000, 135100,
+		},
+		.init_sr07		= false,
+		.init_sr1f		= false,
+		.scrn_start_bit19	= true,
+	},
+	[BT_LAGUNAB] = {
+		.name			= "CL Laguna AGP",
+		.maxclock		= {
+			/* taken from X11 code */
+			170000, 250000, 170000, 170000, 135100,
 		},
 		.init_sr07		= false,
 		.init_sr1f		= false,
@@ -258,7 +269,7 @@ static struct pci_device_id cirrusfb_pci_table[] = {
 	CHIP(PCI_DEVICE_ID_CIRRUS_5446, BT_PICASSO4), /* Picasso 4 is 5446 */
 	CHIP(PCI_DEVICE_ID_CIRRUS_5462, BT_LAGUNA), /* CL Laguna */
 	CHIP(PCI_DEVICE_ID_CIRRUS_5464, BT_LAGUNA), /* CL Laguna 3D */
-	CHIP(PCI_DEVICE_ID_CIRRUS_5465, BT_LAGUNA), /* CL Laguna 3DA*/
+	CHIP(PCI_DEVICE_ID_CIRRUS_5465, BT_LAGUNAB), /* CL Laguna 3DA*/
 	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, cirrusfb_pci_table);
@@ -385,6 +396,11 @@ static void cirrusfb_dbg_print_regs(struct fb_info *info,
 /*****************************************************************************/
 /*** BEGIN Interface Used by the World ***************************************/
 
+static inline int is_laguna(const struct cirrusfb_info *cinfo)
+{
+	return cinfo->btype == BT_LAGUNA || cinfo->btype == BT_LAGUNAB;
+}
+
 static int opencount;
 
 /*--- Open /dev/fbx ---------------------------------------------------------*/
@@ -814,13 +830,16 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			cirrusfb_set_mclk_as_source(info, divMCLK);
 		}
 	}
-	if (cinfo->btype == BT_LAGUNA) {
+	if (is_laguna(cinfo)) {
 		long pcifc = fb_readl(cinfo->laguna_mmio + 0x3fc);
 		unsigned char tile = fb_readb(cinfo->laguna_mmio + 0x407);
 		unsigned short tile_control;
 
-		tile_control = fb_readw(cinfo->laguna_mmio + 0x2c4);
-		fb_writew(tile_control & ~0x80, cinfo->laguna_mmio + 0x2c4);
+		if (cinfo->btype == BT_LAGUNAB) {
+			tile_control = fb_readw(cinfo->laguna_mmio + 0x2c4);
+			tile_control &= ~0x80;
+			fb_writew(tile_control, cinfo->laguna_mmio + 0x2c4);
+		}
 
 		fb_writel(pcifc | 0x10000000l, cinfo->laguna_mmio + 0x3fc);
 		fb_writeb(tile & 0x3f, cinfo->laguna_mmio + 0x407);
@@ -842,7 +861,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 		dev_dbg(info->device, "CL_SEQR1B: %d\n", (int) tmp);
 		/* Laguna chipset has reversed clock registers */
-		if (cinfo->btype == BT_LAGUNA) {
+		if (is_laguna(cinfo)) {
 			vga_wseq(regbase, CL_SEQRE, tmp);
 			vga_wseq(regbase, CL_SEQR1E, nom);
 		} else {
@@ -873,7 +892,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		tmp |= 0x40;
 	if (var->sync & FB_SYNC_VERT_HIGH_ACT)
 		tmp |= 0x80;
-	if (cinfo->btype == BT_LAGUNA)
+	if (is_laguna(cinfo))
 		tmp |= 0xc;
 	WGen(cinfo, VGA_MIS_W, tmp);
 
@@ -908,6 +927,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			break;
 
 		case BT_LAGUNA:
+		case BT_LAGUNAB:
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) & ~0x01);
 			break;
@@ -947,6 +967,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		case BT_ALPINE:
 		case BT_GD5480:
 		case BT_LAGUNA:
+		case BT_LAGUNAB:
 			/* do nothing */
 			break;
 
@@ -991,6 +1012,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			break;
 
 		case BT_LAGUNA:
+		case BT_LAGUNAB:
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) | 0x01);
 			threshold |= 0x10;
@@ -1030,6 +1052,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 		case BT_GD5480:
 		case BT_LAGUNA:
+		case BT_LAGUNAB:
 			/* do nothing */
 			break;
 
@@ -1096,6 +1119,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			break;
 
 		case BT_LAGUNA:
+		case BT_LAGUNAB:
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) & ~0x01);
 			control |= 0x2000;
@@ -1166,6 +1190,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			break;
 
 		case BT_LAGUNA:
+		case BT_LAGUNAB:
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) & ~0x01);
 			control |= 0x6000;
@@ -1208,7 +1233,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19)
 		vga_wcrt(regbase, CL_CRT1D, (pitch >> 9) & 1);
 
-	if (cinfo->btype == BT_LAGUNA) {
+	if (is_laguna(cinfo)) {
 		tmp = 0;
 		if ((htotal + 5) & 256)
 			tmp |= 128;
@@ -1234,7 +1259,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	/* From SetOffset(): Turn on VideoEnable bit in Attribute controller */
 	AttrOn(cinfo);
 
-	if (cinfo->btype == BT_LAGUNA) {
+	if (is_laguna(cinfo)) {
 		/* no tiles */
 		fb_writew(control | 0x1000, cinfo->laguna_mmio + 0x402);
 		fb_writew(format, cinfo->laguna_mmio + 0xc0);
@@ -1332,7 +1357,7 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 		xpix = (unsigned char) ((xoffset % 4) * 2);
 	}
 
-	if (cinfo->btype != BT_LAGUNA)
+	if (!is_laguna(cinfo))
 		cirrusfb_WaitBLT(cinfo->regbase);
 
 	/* lower 8 + 8 bits of screen start address */
@@ -1353,8 +1378,11 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 
 	/* construct bit 19 of screen start address */
 	if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19) {
-		tmp = vga_rcrt(cinfo->regbase, CL_CRT1D) & ~0x80;
-		tmp |= (base >> 12) & 0x80;
+		tmp = vga_rcrt(cinfo->regbase, CL_CRT1D);
+		if (is_laguna(cinfo))
+			tmp = (tmp & ~0x18) | ((base >> 16) & 0x18);
+		else
+			tmp = (tmp & ~0x80) | ((base >> 12) & 0x80);
 		vga_wcrt(cinfo->regbase, CL_CRT1D, tmp);
 	}
 
@@ -1365,7 +1393,7 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 	if (info->var.bits_per_pixel == 1)
 		vga_wattr(cinfo->regbase, CL_AR33, xpix);
 
-	if (cinfo->btype != BT_LAGUNA)
+	if (!is_laguna(cinfo))
 		cirrusfb_WaitBLT(cinfo->regbase);
 
 	return 0;
@@ -1486,6 +1514,7 @@ static void init_vgachip(struct fb_info *info)
 		break;
 
 	case BT_LAGUNA:
+	case BT_LAGUNAB:
 	case BT_ALPINE:
 		/* Nothing to do to reset the board. */
 		break;
@@ -1530,6 +1559,7 @@ static void init_vgachip(struct fb_info *info)
 			break;
 		case BT_ALPINE:
 		case BT_LAGUNA:
+		case BT_LAGUNAB:
 			break;
 		case BT_SD64:
 			vga_wseq(cinfo->regbase, CL_SEQRF, 0xb8);
@@ -1611,7 +1641,7 @@ static void init_vgachip(struct fb_info *info)
 	/* Bit Mask: no mask at all */
 	vga_wgfx(cinfo->regbase, VGA_GFX_BIT_MASK, 0xff);
 
-	if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_LAGUNA)
+	if (cinfo->btype == BT_ALPINE || is_laguna(cinfo))
 		/* (5434 can't have bit 3 set for bitblt) */
 		vga_wgfx(cinfo->regbase, CL_GRB, 0x20);
 	else
@@ -1802,7 +1832,7 @@ static void cirrusfb_imageblit(struct fb_info *info,
 {
 	struct cirrusfb_info *cinfo = info->par;
 
-	if (cinfo->btype != BT_LAGUNA)
+	if (!is_laguna(cinfo))
 		cirrusfb_WaitBLT(cinfo->regbase);
 	cfb_imageblit(info, image);
 }
@@ -1831,7 +1861,7 @@ static unsigned int __devinit cirrusfb_get_memsize(struct fb_info *info,
 	unsigned long mem;
 	struct cirrusfb_info *cinfo = info->par;
 
-	if (cinfo->btype == BT_LAGUNA) {
+	if (is_laguna(cinfo)) {
 		unsigned char SR14 = vga_rseq(regbase, CL_SEQR14);
 
 		mem = ((SR14 & 7) + 1) << 20;
@@ -1950,7 +1980,7 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info)
 		    | FBINFO_HWACCEL_YPAN
 		    | FBINFO_HWACCEL_FILLRECT
 		    | FBINFO_HWACCEL_COPYAREA;
-	if (noaccel || cinfo->btype == BT_LAGUNA)
+	if (noaccel || is_laguna(cinfo))
 		info->flags |= FBINFO_HWACCEL_DISABLED;
 	info->fbops = &cirrusfb_ops;
 	if (cinfo->btype == BT_GD5480) {
@@ -2060,7 +2090,7 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev,
 	if (!info) {
 		printk(KERN_ERR "cirrusfb: could not allocate memory\n");
 		ret = -ENOMEM;
-		goto err_disable;
+		goto err_out;
 	}
 
 	cinfo = info->par;
@@ -2127,10 +2157,11 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev,
 	pci_set_drvdata(pdev, info);
 
 	ret = cirrusfb_register(info);
-	if (ret)
-		iounmap(info->screen_base);
-	return ret;
+	if (!ret)
+		return 0;
 
+	pci_set_drvdata(pdev, NULL);
+	iounmap(info->screen_base);
 err_release_legacy:
 	if (release_io_ports)
 		release_region(0x3C0, 32);
@@ -2140,10 +2171,9 @@ err_release_regions:
 #endif
 	pci_release_regions(pdev);
 err_release_fb:
-	if (cinfo->laguna_mmio == NULL)
+	if (cinfo->laguna_mmio != NULL)
 		iounmap(cinfo->laguna_mmio);
 	framebuffer_release(info);
-err_disable:
 err_out:
 	return ret;
 }
@@ -2439,7 +2469,7 @@ static void WHDR(const struct cirrusfb_info *cinfo, unsigned char val)
 {
 	unsigned char dummy;
 
-	if (cinfo->btype == BT_LAGUNA)
+	if (is_laguna(cinfo))
 		return;
 	if (cinfo->btype == BT_PICASSO) {
 		/* Klaus' hint for correct access to HDR on some boards */
@@ -2509,7 +2539,7 @@ static void WClut(struct cirrusfb_info *cinfo, unsigned char regnum, unsigned ch
 
 	if (cinfo->btype == BT_PICASSO || cinfo->btype == BT_PICASSO4 ||
 	    cinfo->btype == BT_ALPINE || cinfo->btype == BT_GD5480 ||
-	    cinfo->btype == BT_LAGUNA) {
+	    is_laguna(cinfo)) {
 		/* but DAC data register IS, at least for Picasso II */
 		if (cinfo->btype == BT_PICASSO)
 			data += 0xfff;
-- 
cgit v1.2.3


From 8343c89c4f1aac4fced7bb6919b0bdd0c13edcdc Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:11 -0700
Subject: cirrusfb: acceleration improvements

- Fix color expansion for 16bpp and 32bpp modes in the
  cirrusfb_RectFill().

- Make a function with a common blitter code (cirrusfb_set_blitter).

- Add fb_sync function to allow a higher layer synchronize with the
  blitter.

- Kill one redundant blitter reset.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 186 +++++++++++++++++++----------------------------
 1 file changed, 75 insertions(+), 111 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index cac4e2b0cba..e4ac2f6d4d9 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -381,7 +381,7 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel,
 static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel,
 			      u_short x, u_short y,
 			      u_short width, u_short height,
-			      u_char color, u_short line_length);
+			      u32 color, u_short line_length);
 
 static void bestclock(long freq, int *nom, int *den, int *div);
 
@@ -1550,9 +1550,6 @@ static void init_vgachip(struct fb_info *info)
 		/* unlock all extension registers */
 		vga_wseq(cinfo->regbase, CL_SEQR6, 0x12);
 
-		/* reset blitter */
-		vga_wgfx(cinfo->regbase, CL_GR31, 0x04);
-
 		switch (cinfo->btype) {
 		case BT_GD5480:
 			vga_wseq(cinfo->regbase, CL_SEQRF, 0x98);
@@ -1747,6 +1744,17 @@ static void switch_monitor(struct cirrusfb_info *cinfo, int on)
 /* Linux 2.6-style  accelerated functions */
 /******************************************/
 
+static int cirrusfb_sync(struct fb_info *info)
+{
+	struct cirrusfb_info *cinfo = info->par;
+
+	if (!is_laguna(cinfo)) {
+		while (vga_rgfx(cinfo->regbase, CL_GR31) & 0x03)
+			cpu_relax();
+	}
+	return 0;
+}
+
 static void cirrusfb_fillrect(struct fb_info *info,
 			      const struct fb_fillrect *region)
 {
@@ -1966,6 +1974,7 @@ static struct fb_ops cirrusfb_ops = {
 	.fb_blank	= cirrusfb_blank,
 	.fb_fillrect	= cirrusfb_fillrect,
 	.fb_copyarea	= cirrusfb_copyarea,
+	.fb_sync	= cirrusfb_sync,
 	.fb_imageblit	= cirrusfb_imageblit,
 };
 
@@ -2586,7 +2595,6 @@ static void RClut(struct cirrusfb_info *cinfo, unsigned char regnum, unsigned ch
 /* FIXME: use interrupts instead */
 static void cirrusfb_WaitBLT(u8 __iomem *regbase)
 {
-	/* now busy-wait until we're done */
 	while (vga_rgfx(regbase, CL_GR31) & 0x08)
 		cpu_relax();
 }
@@ -2597,58 +2605,12 @@ static void cirrusfb_WaitBLT(u8 __iomem *regbase)
 	perform accelerated "scrolling"
 ********************************************************************/
 
-static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel,
-			    u_short curx, u_short cury,
-			    u_short destx, u_short desty,
-			    u_short width, u_short height,
-			    u_short line_length)
-{
-	u_short nwidth, nheight;
-	u_long nsrc, ndest;
-	u_char bltmode;
-
-	nwidth = width - 1;
-	nheight = height - 1;
-
-	bltmode = 0x00;
-	/* if source adr < dest addr, do the Blt backwards */
-	if (cury <= desty) {
-		if (cury == desty) {
-			/* if src and dest are on the same line, check x */
-			if (curx < destx)
-				bltmode |= 0x01;
-		} else
-			bltmode |= 0x01;
-	}
-	if (!bltmode) {
-		/* standard case: forward blitting */
-		nsrc = (cury * line_length) + curx;
-		ndest = (desty * line_length) + destx;
-	} else {
-		/* this means start addresses are at the end,
-		 * counting backwards
-		 */
-		nsrc = cury * line_length + curx +
-			nheight * line_length + nwidth;
-		ndest = desty * line_length + destx +
-			nheight * line_length + nwidth;
-	}
-
-	/*
-	   run-down of registers to be programmed:
-	   destination pitch
-	   source pitch
-	   BLT width/height
-	   source start
-	   destination start
-	   BLT mode
-	   BLT ROP
-	   VGA_GFX_SR_VALUE / VGA_GFX_SR_ENABLE: "fill color"
-	   start/stop
-	 */
-
-	cirrusfb_WaitBLT(regbase);
+static void cirrusfb_set_blitter(u8 __iomem *regbase,
+			    u_short nwidth, u_short nheight,
+			    u_long nsrc, u_long ndest,
+			    u_short bltmode, u_short line_length)
 
+{
 	/* pitch: set to line_length */
 	/* dest pitch low */
 	vga_wgfx(regbase, CL_GR24, line_length & 0xff);
@@ -2694,56 +2656,67 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel,
 	vga_wgfx(regbase, CL_GR32, 0x0d);	/* BLT ROP */
 
 	/* and finally: GO! */
-	vga_wgfx(regbase, CL_GR31, 0x02);	/* BLT Start/status */
+	vga_wgfx(regbase, CL_GR31, 0x82);	/* BLT Start/status */
 }
 
 /*******************************************************************
-	cirrusfb_RectFill()
+	cirrusfb_BitBLT()
 
-	perform accelerated rectangle fill
+	perform accelerated "scrolling"
 ********************************************************************/
 
-static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel,
-		     u_short x, u_short y, u_short width, u_short height,
-		     u_char color, u_short line_length)
+static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel,
+			    u_short curx, u_short cury,
+			    u_short destx, u_short desty,
+			    u_short width, u_short height,
+			    u_short line_length)
 {
-	u_short nwidth, nheight;
-	u_long ndest;
-	u_char op;
-
-	nwidth = width - 1;
-	nheight = height - 1;
+	u_short nwidth = width - 1;
+	u_short nheight = height - 1;
+	u_long nsrc, ndest;
+	u_char bltmode;
 
-	ndest = (y * line_length) + x;
+	bltmode = 0x00;
+	/* if source adr < dest addr, do the Blt backwards */
+	if (cury <= desty) {
+		if (cury == desty) {
+			/* if src and dest are on the same line, check x */
+			if (curx < destx)
+				bltmode |= 0x01;
+		} else
+			bltmode |= 0x01;
+	}
+	/* standard case: forward blitting */
+	nsrc = (cury * line_length) + curx;
+	ndest = (desty * line_length) + destx;
+	if (bltmode) {
+		/* this means start addresses are at the end,
+		 * counting backwards
+		 */
+		nsrc += nheight * line_length + nwidth;
+		ndest += nheight * line_length + nwidth;
+	}
 
 	cirrusfb_WaitBLT(regbase);
 
-	/* pitch: set to line_length */
-	vga_wgfx(regbase, CL_GR24, line_length & 0xff);	/* dest pitch low */
-	vga_wgfx(regbase, CL_GR25, line_length >> 8);	/* dest pitch hi */
-	vga_wgfx(regbase, CL_GR26, line_length & 0xff);	/* source pitch low */
-	vga_wgfx(regbase, CL_GR27, line_length >> 8);	/* source pitch hi */
+	cirrusfb_set_blitter(regbase, nwidth, nheight,
+			    nsrc, ndest, bltmode, line_length);
+}
 
-	/* BLT width: actual number of pixels - 1 */
-	vga_wgfx(regbase, CL_GR20, nwidth & 0xff);	/* BLT width low */
-	vga_wgfx(regbase, CL_GR21, nwidth >> 8);	/* BLT width hi */
+/*******************************************************************
+	cirrusfb_RectFill()
 
-	/* BLT height: actual number of lines -1 */
-	vga_wgfx(regbase, CL_GR22, nheight & 0xff);	/* BLT height low */
-	vga_wgfx(regbase, CL_GR23, nheight >> 8);	/* BLT width hi */
+	perform accelerated rectangle fill
+********************************************************************/
 
-	/* BLT destination */
-	/* BLT dest low */
-	vga_wgfx(regbase, CL_GR28, (u_char) (ndest & 0xff));
-	/* BLT dest mid */
-	vga_wgfx(regbase, CL_GR29, (u_char) (ndest >> 8));
-	/* BLT dest hi */
-	vga_wgfx(regbase, CL_GR2A, (u_char) (ndest >> 16));
+static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel,
+		     u_short x, u_short y, u_short width, u_short height,
+		     u32 color, u_short line_length)
+{
+	u_long ndest = (y * line_length) + x;
+	u_char op;
 
-	/* BLT source: set to 0 (is a dummy here anyway) */
-	vga_wgfx(regbase, CL_GR2C, 0x00);	/* BLT src low */
-	vga_wgfx(regbase, CL_GR2D, 0x00);	/* BLT src mid */
-	vga_wgfx(regbase, CL_GR2E, 0x00);	/* BLT src hi */
+	cirrusfb_WaitBLT(regbase);
 
 	/* This is a ColorExpand Blt, using the */
 	/* same color for foreground and background */
@@ -2751,29 +2724,20 @@ static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel,
 	vga_wgfx(regbase, VGA_GFX_SR_ENABLE, color);	/* background color */
 
 	op = 0xc0;
-	if (bits_per_pixel == 16) {
-		vga_wgfx(regbase, CL_GR10, color);	/* foreground color */
-		vga_wgfx(regbase, CL_GR11, color);	/* background color */
-		op = 0x50;
+	if (bits_per_pixel >= 16) {
+		vga_wgfx(regbase, CL_GR10, color >> 8);	/* foreground color */
+		vga_wgfx(regbase, CL_GR11, color >> 8);	/* background color */
 		op = 0xd0;
-	} else if (bits_per_pixel == 32) {
-		vga_wgfx(regbase, CL_GR10, color);	/* foreground color */
-		vga_wgfx(regbase, CL_GR11, color);	/* background color */
-		vga_wgfx(regbase, CL_GR12, color);	/* foreground color */
-		vga_wgfx(regbase, CL_GR13, color);	/* background color */
-		vga_wgfx(regbase, CL_GR14, 0);	/* foreground color */
-		vga_wgfx(regbase, CL_GR15, 0);	/* background color */
-		op = 0x50;
+	}
+	if (bits_per_pixel == 32) {
+		vga_wgfx(regbase, CL_GR12, color >> 16);/* foreground color */
+		vga_wgfx(regbase, CL_GR13, color >> 16);/* background color */
+		vga_wgfx(regbase, CL_GR14, color >> 24);/* foreground color */
+		vga_wgfx(regbase, CL_GR15, color >> 24);/* background color */
 		op = 0xf0;
 	}
-	/* BLT mode: color expand, Enable 8x8 copy (faster?) */
-	vga_wgfx(regbase, CL_GR30, op);	/* BLT mode */
-
-	/* BLT ROP: SrcCopy */
-	vga_wgfx(regbase, CL_GR32, 0x0d);	/* BLT ROP */
-
-	/* and finally: GO! */
-	vga_wgfx(regbase, CL_GR31, 0x02);	/* BLT Start/status */
+	cirrusfb_set_blitter(regbase, width - 1, height - 1,
+			    0, ndest, op, line_length);
 }
 
 /**************************************************************************
-- 
cgit v1.2.3


From 9e848062533207130667f6eaa748549367ccbedf Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:11 -0700
Subject: cirrusfb: add imageblit function

Add hardware color expansion (imageblit) function.  It roughly doubles
scrolling speed of my Alpine card (GD5430).

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 64 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index e4ac2f6d4d9..a2a09b39c59 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -381,7 +381,8 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel,
 static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel,
 			      u_short x, u_short y,
 			      u_short width, u_short height,
-			      u32 color, u_short line_length);
+			      u32 fg_color, u32 bg_color,
+			      u_short line_length, u_char blitmode);
 
 static void bestclock(long freq, int *nom, int *den, int *div);
 
@@ -1790,8 +1791,8 @@ static void cirrusfb_fillrect(struct fb_info *info,
 			  info->var.bits_per_pixel,
 			  (region->dx * m) / 8, region->dy,
 			  (region->width * m) / 8, region->height,
-			  color,
-			  info->fix.line_length);
+			  color, color,
+			  info->fix.line_length, 0x40);
 }
 
 static void cirrusfb_copyarea(struct fb_info *info,
@@ -1840,9 +1841,33 @@ static void cirrusfb_imageblit(struct fb_info *info,
 {
 	struct cirrusfb_info *cinfo = info->par;
 
-	if (!is_laguna(cinfo))
+	if (info->state != FBINFO_STATE_RUNNING)
+		return;
+	if (info->flags & FBINFO_HWACCEL_DISABLED)
+		cfb_imageblit(info, image);
+	else {
+		unsigned size = ((image->width + 7) >> 3) * image->height;
+		int m = info->var.bits_per_pixel;
+		u32 fg, bg;
+
+		if (info->var.bits_per_pixel == 8) {
+			fg = image->fg_color;
+			bg = image->bg_color;
+		} else {
+			fg = ((u32 *)(info->pseudo_palette))[image->fg_color];
+			bg = ((u32 *)(info->pseudo_palette))[image->bg_color];
+		}
 		cirrusfb_WaitBLT(cinfo->regbase);
-	cfb_imageblit(info, image);
+		/* byte rounded scanlines */
+		vga_wgfx(cinfo->regbase, CL_GR33, 0x00);
+		cirrusfb_RectFill(cinfo->regbase,
+				  info->var.bits_per_pixel,
+				  (image->dx * m) / 8, image->dy,
+				  (image->width * m) / 8, image->height,
+				  fg, bg,
+				  info->fix.line_length, 0x04);
+		memcpy(info->screen_base, image->data, size);
+	}
 }
 
 #ifdef CONFIG_PPC_PREP
@@ -1988,10 +2013,12 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info)
 		    | FBINFO_HWACCEL_XPAN
 		    | FBINFO_HWACCEL_YPAN
 		    | FBINFO_HWACCEL_FILLRECT
+		    | FBINFO_HWACCEL_IMAGEBLIT
 		    | FBINFO_HWACCEL_COPYAREA;
 	if (noaccel || is_laguna(cinfo))
 		info->flags |= FBINFO_HWACCEL_DISABLED;
 	info->fbops = &cirrusfb_ops;
+
 	if (cinfo->btype == BT_GD5480) {
 		if (var->bits_per_pixel == 16)
 			info->screen_base += 1 * MB_;
@@ -2711,7 +2738,8 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel,
 
 static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel,
 		     u_short x, u_short y, u_short width, u_short height,
-		     u32 color, u_short line_length)
+		     u32 fg_color, u32 bg_color, u_short line_length,
+		     u_char blitmode)
 {
 	u_long ndest = (y * line_length) + x;
 	u_char op;
@@ -2720,24 +2748,24 @@ static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel,
 
 	/* This is a ColorExpand Blt, using the */
 	/* same color for foreground and background */
-	vga_wgfx(regbase, VGA_GFX_SR_VALUE, color);	/* foreground color */
-	vga_wgfx(regbase, VGA_GFX_SR_ENABLE, color);	/* background color */
+	vga_wgfx(regbase, VGA_GFX_SR_VALUE, bg_color);
+	vga_wgfx(regbase, VGA_GFX_SR_ENABLE, fg_color);
 
-	op = 0xc0;
+	op = 0x80;
 	if (bits_per_pixel >= 16) {
-		vga_wgfx(regbase, CL_GR10, color >> 8);	/* foreground color */
-		vga_wgfx(regbase, CL_GR11, color >> 8);	/* background color */
-		op = 0xd0;
+		vga_wgfx(regbase, CL_GR10, bg_color >> 8);
+		vga_wgfx(regbase, CL_GR11, fg_color >> 8);
+		op = 0x90;
 	}
 	if (bits_per_pixel == 32) {
-		vga_wgfx(regbase, CL_GR12, color >> 16);/* foreground color */
-		vga_wgfx(regbase, CL_GR13, color >> 16);/* background color */
-		vga_wgfx(regbase, CL_GR14, color >> 24);/* foreground color */
-		vga_wgfx(regbase, CL_GR15, color >> 24);/* background color */
-		op = 0xf0;
+		vga_wgfx(regbase, CL_GR12, bg_color >> 16);
+		vga_wgfx(regbase, CL_GR13, fg_color >> 16);
+		vga_wgfx(regbase, CL_GR14, bg_color >> 24);
+		vga_wgfx(regbase, CL_GR15, fg_color >> 24);
+		op = 0xb0;
 	}
 	cirrusfb_set_blitter(regbase, width - 1, height - 1,
-			    0, ndest, op, line_length);
+			    0, ndest, op | blitmode, line_length);
 }
 
 /**************************************************************************
-- 
cgit v1.2.3


From bc5d8ac02f24d68efe8e267c96dd75c0531009ab Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:12 -0700
Subject: cirrusfb: fix error paths in cirrusfb_xxx_register()

Balance iomap and iounmap and alloc and free calls in case of error druing
device register (probing).

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index a2a09b39c59..ffc514df245 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -2090,8 +2090,6 @@ static int __devinit cirrusfb_register(struct fb_info *info)
 
 err_dealloc_cmap:
 	fb_dealloc_cmap(&info->cmap);
-	cinfo->unmap(info);
-	framebuffer_release(info);
 	return err;
 }
 
@@ -2328,18 +2326,15 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z,
 	zorro_set_drvdata(z, info);
 
 	ret = cirrusfb_register(info);
-	if (ret) {
-		if (btype == BT_PICASSO4) {
-			iounmap(info->screen_base);
-			iounmap(cinfo->regbase - 0x600000);
-		} else if (board_addr > 0x01000000)
-			iounmap(info->screen_base);
-	}
-	return ret;
+	if (!ret)
+		return 0;
+
+	if (btype == BT_PICASSO4 || board_addr > 0x01000000)
+		iounmap(info->screen_base);
 
 err_unmap_regbase:
-	/* Parental advisory: explicit hack */
-	iounmap(cinfo->regbase - 0x600000);
+	if (btype == BT_PICASSO4)
+		iounmap(cinfo->regbase - 0x600000);
 err_release_region:
 	release_region(board_addr, board_size);
 err_release_fb:
-- 
cgit v1.2.3


From 527410ff7fc5d45fe41523c0ba061113dea22017 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:13 -0700
Subject: cirrusfb: GD5446 fixes

Various fixes to make Cirrus GD5446 chip work.

Another Cirrus chip works with the cirrusfb.  The gd5446 seems very
similar to Alpine chips.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index ffc514df245..6603273f4ce 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -198,9 +198,11 @@ static const struct cirrusfb_board_info_rec {
 		.init_sr07		= true,
 		.init_sr1f		= false,
 		.scrn_start_bit19	= true,
-		.sr07			= 0x20,
-		.sr07_1bpp		= 0x20,
-		.sr07_8bpp		= 0x21,
+		.sr07			= 0xA0,
+		.sr07_1bpp		= 0xA0,
+		.sr07_1bpp_mux		= 0xA6,
+		.sr07_8bpp		= 0xA1,
+		.sr07_8bpp_mux		= 0xA7,
 		.sr1f			= 0
 	},
 	[BT_ALPINE] = {
@@ -213,8 +215,8 @@ static const struct cirrusfb_board_info_rec {
 		.init_sr1f		= true,
 		.scrn_start_bit19	= true,
 		.sr07			= 0xA0,
-		.sr07_1bpp		= 0xA1,
-		.sr07_1bpp_mux		= 0xA7,
+		.sr07_1bpp		= 0xA0,
+		.sr07_1bpp_mux		= 0xA6,
 		.sr07_8bpp		= 0xA1,
 		.sr07_8bpp_mux		= 0xA7,
 		.sr1f			= 0x1C
@@ -821,7 +823,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	/* formula: VClk = (OSC * N) / (D * (1+P)) */
 	/* Example: VClk = (14.31818 * 91) / (23 * (1+1)) = 28.325 MHz */
 
-	if (cinfo->btype == BT_ALPINE) {
+	if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_PICASSO4) {
 		/* if freq is close to mclk or mclk/2 select mclk
 		 * as clock source
 		 */
@@ -1044,9 +1046,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			/* ### INCOMPLETE!! */
 			vga_wseq(regbase, CL_SEQRF, 0xb8);
 #endif
-/*	  		vga_wseq(regbase, CL_SEQR1F, 0x1c); */
-			break;
-
 		case BT_ALPINE:
 			/* We already set SRF and SR1F */
 			break;
@@ -1106,10 +1105,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			break;
 
 		case BT_PICASSO4:
-			vga_wseq(regbase, CL_SEQR7, 0x27);
-/*			vga_wseq(regbase, CL_SEQR1F, 0x1c);  */
-			break;
-
 		case BT_ALPINE:
 			vga_wseq(regbase, CL_SEQR7, 0xa7);
 			break;
@@ -1177,10 +1172,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			break;
 
 		case BT_PICASSO4:
-			vga_wseq(regbase, CL_SEQR7, 0x25);
-/*			vga_wseq(regbase, CL_SEQR1F, 0x1c);  */
-			break;
-
 		case BT_ALPINE:
 			vga_wseq(regbase, CL_SEQR7, 0xa9);
 			break;
@@ -2678,7 +2669,7 @@ static void cirrusfb_set_blitter(u8 __iomem *regbase,
 	vga_wgfx(regbase, CL_GR32, 0x0d);	/* BLT ROP */
 
 	/* and finally: GO! */
-	vga_wgfx(regbase, CL_GR31, 0x82);	/* BLT Start/status */
+	vga_wgfx(regbase, CL_GR31, 0x02);	/* BLT Start/status */
 }
 
 /*******************************************************************
-- 
cgit v1.2.3


From 7cade31cabec33c396b1dfd9c2842e793c2648ef Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:13 -0700
Subject: cirrusfb: use 24bpp instead of 32bpp

The 32bpp is supported only on the latest Cirrus Logic chips.  Use the
24bpp which is supported at least since Alpine chips (GD543x).

Change 32bpp mode setting to 24bpp mode.  Change acceleration as well.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 79 +++++++++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 35 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 6603273f4ce..65a1831ddd0 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -95,10 +95,10 @@
 /* board types */
 enum cirrus_board {
 	BT_NONE = 0,
-	BT_SD64,
-	BT_PICCOLO,
-	BT_PICASSO,
-	BT_SPECTRUM,
+	BT_SD64,	/* GD5434 */
+	BT_PICCOLO,	/* GD5426 */
+	BT_PICASSO,	/* GD5426 or GD5428 */
+	BT_SPECTRUM,	/* GD5426 or GD5428 */
 	BT_PICASSO4,	/* GD5446 */
 	BT_ALPINE,	/* GD543x/4x */
 	BT_GD5480,
@@ -488,7 +488,7 @@ static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var,
 	 * the VCLK is double the pixel clock. */
 	switch (var->bits_per_pixel) {
 	case 16:
-	case 32:
+	case 24:
 		if (var->xres <= 800)
 			/* Xbh has this type of clock for 32-bit */
 			freq /= 2;
@@ -535,11 +535,11 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 		var->blue.length = 5;
 		break;
 
-	case 32:
+	case 24:
 		if (isPReP) {
-			var->red.offset = 8;
-			var->green.offset = 16;
-			var->blue.offset = 24;
+			var->red.offset = 0;
+			var->green.offset = 8;
+			var->blue.offset = 16;
 		} else {
 			var->red.offset = 16;
 			var->green.offset = 8;
@@ -670,7 +670,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		break;
 
 	case 16:
-	case 32:
+	case 24:
 		info->fix.line_length = var->xres_virtual *
 					var->bits_per_pixel >> 3;
 		info->fix.visual = FB_VISUAL_TRUECOLOR;
@@ -813,6 +813,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	vga_wcrt(regbase, CL_CRT1A, tmp);
 
 	freq = PICOS2KHZ(var->pixclock);
+	if (cinfo->btype == BT_ALPINE && var->bits_per_pixel == 24)
+		freq *= 3;
+
 	bestclock(freq, &nom, &den, &div);
 
 	dev_dbg(info->device, "VCLK freq: %ld kHz  nom: %d  den: %d  div: %d\n",
@@ -1140,16 +1143,16 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 	/******************************************************
 	 *
-	 * 32 bpp
+	 * 24 bpp
 	 *
 	 */
 
-	else if (var->bits_per_pixel == 32) {
-		dev_dbg(info->device, "preparing for 32 bit deep display\n");
+	else if (var->bits_per_pixel == 24) {
+		dev_dbg(info->device, "preparing for 24 bit deep display\n");
 		switch (cinfo->btype) {
 		case BT_SD64:
 			/* Extended Sequencer Mode: 256c col. mode */
-			vga_wseq(regbase, CL_SEQR7, 0xf9);
+			vga_wseq(regbase, CL_SEQR7, 0xf5);
 			/* MCLK select */
 			vga_wseq(regbase, CL_SEQR1F, 0x1e);
 			break;
@@ -1173,11 +1176,11 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 		case BT_PICASSO4:
 		case BT_ALPINE:
-			vga_wseq(regbase, CL_SEQR7, 0xa9);
+			vga_wseq(regbase, CL_SEQR7, 0xa5);
 			break;
 
 		case BT_GD5480:
-			vga_wseq(regbase, CL_SEQR7, 0x19);
+			vga_wseq(regbase, CL_SEQR7, 0x15);
 			/* We already set SRF and SR1F */
 			break;
 
@@ -1185,8 +1188,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		case BT_LAGUNAB:
 			vga_wseq(regbase, CL_SEQR7,
 				vga_rseq(regbase, CL_SEQR7) & ~0x01);
-			control |= 0x6000;
-			format |= 0x3400;
+			control |= 0x4000;
+			format |= 0x2400;
 			threshold |= 0x20;
 			break;
 
@@ -1385,9 +1388,6 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 	if (info->var.bits_per_pixel == 1)
 		vga_wattr(cinfo->regbase, CL_AR33, xpix);
 
-	if (!is_laguna(cinfo))
-		cirrusfb_WaitBLT(cinfo->regbase);
-
 	return 0;
 }
 
@@ -1492,22 +1492,18 @@ static void init_vgachip(struct fb_info *info)
 		/* disable flickerfixer */
 		vga_wcrt(cinfo->regbase, CL_CRT51, 0x00);
 		mdelay(100);
-		/* from Klaus' NetBSD driver: */
-		vga_wgfx(cinfo->regbase, CL_GR2F, 0x00);
-		/* put blitter into 542x compat */
-		vga_wgfx(cinfo->regbase, CL_GR33, 0x00);
 		/* mode */
 		vga_wgfx(cinfo->regbase, CL_GR31, 0x00);
-		break;
-
-	case BT_GD5480:
+	case BT_GD5480:  /* fall through */
 		/* from Klaus' NetBSD driver: */
 		vga_wgfx(cinfo->regbase, CL_GR2F, 0x00);
+	case BT_ALPINE:  /* fall through */
+		/* put blitter into 542x compat */
+		vga_wgfx(cinfo->regbase, CL_GR33, 0x00);
 		break;
 
 	case BT_LAGUNA:
 	case BT_LAGUNAB:
-	case BT_ALPINE:
 		/* Nothing to do to reset the board. */
 		break;
 
@@ -1831,10 +1827,13 @@ static void cirrusfb_imageblit(struct fb_info *info,
 			       const struct fb_image *image)
 {
 	struct cirrusfb_info *cinfo = info->par;
+	unsigned char op = (info->var.bits_per_pixel == 24) ? 0xc : 0x4;
 
 	if (info->state != FBINFO_STATE_RUNNING)
 		return;
-	if (info->flags & FBINFO_HWACCEL_DISABLED)
+	/* Alpine acceleration does not work at 24bpp ?!? */
+	if (info->flags & FBINFO_HWACCEL_DISABLED || image->depth != 1 ||
+	    (cinfo->btype == BT_ALPINE && op == 0xc))
 		cfb_imageblit(info, image);
 	else {
 		unsigned size = ((image->width + 7) >> 3) * image->height;
@@ -1848,15 +1847,22 @@ static void cirrusfb_imageblit(struct fb_info *info,
 			fg = ((u32 *)(info->pseudo_palette))[image->fg_color];
 			bg = ((u32 *)(info->pseudo_palette))[image->bg_color];
 		}
-		cirrusfb_WaitBLT(cinfo->regbase);
-		/* byte rounded scanlines */
-		vga_wgfx(cinfo->regbase, CL_GR33, 0x00);
+		if (info->var.bits_per_pixel == 24) {
+			/* clear background first */
+			cirrusfb_RectFill(cinfo->regbase,
+					  info->var.bits_per_pixel,
+					  (image->dx * m) / 8, image->dy,
+					  (image->width * m) / 8,
+					  image->height,
+					  bg, bg,
+					  info->fix.line_length, 0x40);
+		}
 		cirrusfb_RectFill(cinfo->regbase,
 				  info->var.bits_per_pixel,
 				  (image->dx * m) / 8, image->dy,
 				  (image->width * m) / 8, image->height,
 				  fg, bg,
-				  info->fix.line_length, 0x04);
+				  info->fix.line_length, op);
 		memcpy(info->screen_base, image->data, size);
 	}
 }
@@ -2743,9 +2749,12 @@ static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel,
 		vga_wgfx(regbase, CL_GR11, fg_color >> 8);
 		op = 0x90;
 	}
-	if (bits_per_pixel == 32) {
+	if (bits_per_pixel >= 24) {
 		vga_wgfx(regbase, CL_GR12, bg_color >> 16);
 		vga_wgfx(regbase, CL_GR13, fg_color >> 16);
+		op = 0xa0;
+	}
+	if (bits_per_pixel == 32) {
 		vga_wgfx(regbase, CL_GR14, bg_color >> 24);
 		vga_wgfx(regbase, CL_GR15, fg_color >> 24);
 		op = 0xb0;
-- 
cgit v1.2.3


From dd14f71cc62dd07b588cc6de935155e6fd3911c9 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:14 -0700
Subject: cirrusfb: fix clock doubling

Cirrus' Alpine and Picasso4 chips uses DAC clock doubling to achieve full
range of pixclock frequencies.

[akpm@linux-foundation.org: fix spelling, use usual comment layout]
Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 65a1831ddd0..15e2e6bfcbf 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -470,10 +470,25 @@ static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var,
 	/* If the frequency is greater than we can support, we might be able
 	 * to use multiplexing for the video mode */
 	if (freq > maxclock) {
+		dev_err(info->device,
+			"Frequency greater than maxclock (%ld kHz)\n",
+			maxclock);
+		return -EINVAL;
+	}
+	/*
+	 * Additional constraint: 8bpp uses DAC clock doubling to allow maximum
+	 * pixel clock
+	 */
+	if (var->bits_per_pixel == 8) {
 		switch (cinfo->btype) {
 		case BT_ALPINE:
+		case BT_PICASSO4:
+			if (freq > 85500)
+				cinfo->multiplexing = 1;
+			break;
 		case BT_GD5480:
-			cinfo->multiplexing = 1;
+			if (freq > 135100)
+				cinfo->multiplexing = 1;
 			break;
 
 		default:
@@ -815,6 +830,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	freq = PICOS2KHZ(var->pixclock);
 	if (cinfo->btype == BT_ALPINE && var->bits_per_pixel == 24)
 		freq *= 3;
+	if (cinfo->multiplexing)
+		freq /= 2;
 
 	bestclock(freq, &nom, &den, &div);
 
-- 
cgit v1.2.3


From 614c0dc93284404be2a4d5750c79bb95f2b6c980 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:15 -0700
Subject: cirrusfb: add accelerator constant

Add an accelerator constant so almost all Cirrus are recognized as
accelerators by the fbset command.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 11 +++++++++--
 include/linux/fb.h       |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 15e2e6bfcbf..e9a2661669e 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -519,6 +519,7 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 	int yres;
 	/* memory size in pixels */
 	unsigned pixels = info->screen_size * 8 / var->bits_per_pixel;
+	struct cirrusfb_info *cinfo = info->par;
 
 	switch (var->bits_per_pixel) {
 	case 1:
@@ -627,6 +628,9 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var,
 	if (cirrusfb_check_pixclock(var, info))
 		return -EINVAL;
 
+	if (!is_laguna(cinfo))
+		var->accel_flags = FB_ACCELF_TEXT;
+
 	return 0;
 }
 
@@ -2029,8 +2033,12 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info)
 		    | FBINFO_HWACCEL_FILLRECT
 		    | FBINFO_HWACCEL_IMAGEBLIT
 		    | FBINFO_HWACCEL_COPYAREA;
-	if (noaccel || is_laguna(cinfo))
+	if (noaccel || is_laguna(cinfo)) {
 		info->flags |= FBINFO_HWACCEL_DISABLED;
+		info->fix.accel = FB_ACCEL_NONE;
+	} else
+		info->fix.accel = FB_ACCEL_CIRRUS_ALPINE;
+
 	info->fbops = &cirrusfb_ops;
 
 	if (cinfo->btype == BT_GD5480) {
@@ -2056,7 +2064,6 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info)
 
 	/* FIXME: map region at 0xB8000 if available, fill in here */
 	info->fix.mmio_len   = 0;
-	info->fix.accel = FB_ACCEL_NONE;
 
 	fb_alloc_cmap(&info->cmap, 256, 0);
 
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 31527e17076..fe7d0d7907a 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -123,6 +123,7 @@ struct dentry;
 #define FB_ACCEL_TRIDENT_3DIMAGE 51	/* Trident 3DImage		*/
 #define FB_ACCEL_TRIDENT_BLADE3D 52	/* Trident Blade3D		*/
 #define FB_ACCEL_TRIDENT_BLADEXP 53	/* Trident BladeXP		*/
+#define FB_ACCEL_CIRRUS_ALPINE   53	/* Cirrus Logic 543x/544x/5480	*/
 #define FB_ACCEL_NEOMAGIC_NM2070 90	/* NeoMagic NM2070              */
 #define FB_ACCEL_NEOMAGIC_NM2090 91	/* NeoMagic NM2090              */
 #define FB_ACCEL_NEOMAGIC_NM2093 92	/* NeoMagic NM2093              */
-- 
cgit v1.2.3


From 8f19e15b8ad23e28add5760ed049be2359f39fe8 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:15 -0700
Subject: cirrusfb: set MCLK in one place

A memory clock (MCLK) is set at various places.  Move the setting into one
place.

Set the MCLK only for Zorro cards as the x86 cards should be initialized by
BIOS.

Improve handling of the GD5434 (SD64).

Kill one annoying debug output "virtual offset: ...".

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 70 +++++++++++-------------------------------------
 1 file changed, 16 insertions(+), 54 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index e9a2661669e..62032743608 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -146,7 +146,7 @@ static const struct cirrusfb_board_info_rec {
 		.sr07			= 0xF0,
 		.sr07_1bpp		= 0xF0,
 		.sr07_8bpp		= 0xF1,
-		.sr1f			= 0x20
+		.sr1f			= 0x1E
 	},
 	[BT_PICCOLO] = {
 		.name			= "CL Piccolo",
@@ -482,6 +482,7 @@ static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var,
 	if (var->bits_per_pixel == 8) {
 		switch (cinfo->btype) {
 		case BT_ALPINE:
+		case BT_SD64:
 		case BT_PICASSO4:
 			if (freq > 85500)
 				cinfo->multiplexing = 1;
@@ -492,10 +493,7 @@ static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var,
 			break;
 
 		default:
-			dev_err(info->device,
-				"Frequency greater than maxclock (%ld kHz)\n",
-				maxclock);
-			return -EINVAL;
+			break;
 		}
 	}
 #if 0
@@ -847,7 +845,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	/* formula: VClk = (OSC * N) / (D * (1+P)) */
 	/* Example: VClk = (14.31818 * 91) / (23 * (1+1)) = 28.325 MHz */
 
-	if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_PICASSO4) {
+	if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_PICASSO4 ||
+	    cinfo->btype == BT_SD64) {
 		/* if freq is close to mclk or mclk/2 select mclk
 		 * as clock source
 		 */
@@ -966,30 +965,19 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 		/* Extended Sequencer Mode */
 		switch (cinfo->btype) {
-		case BT_SD64:
-			/* setting the SEQRF on SD64 is not necessary
-			 * (only during init)
-			 */
-			/*  MCLK select */
-			vga_wseq(regbase, CL_SEQR1F, 0x1a);
-			break;
 
 		case BT_PICCOLO:
 		case BT_SPECTRUM:
-			/* ### ueberall 0x22? */
-			/* ##vorher 1c MCLK select */
-			vga_wseq(regbase, CL_SEQR1F, 0x22);
 			/* evtl d0 bei 1 bit? avoid FIFO underruns..? */
 			vga_wseq(regbase, CL_SEQRF, 0xb0);
 			break;
 
 		case BT_PICASSO:
-			/* ##vorher 22 MCLK select */
-			vga_wseq(regbase, CL_SEQR1F, 0x22);
 			/* ## vorher d0 avoid FIFO underruns..? */
 			vga_wseq(regbase, CL_SEQRF, 0xd0);
 			break;
 
+		case BT_SD64:
 		case BT_PICASSO4:
 		case BT_ALPINE:
 		case BT_GD5480:
@@ -1051,16 +1039,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		}
 
 		switch (cinfo->btype) {
-		case BT_SD64:
-			/* MCLK select */
-			vga_wseq(regbase, CL_SEQR1F, 0x1d);
-			break;
-
 		case BT_PICCOLO:
 		case BT_PICASSO:
 		case BT_SPECTRUM:
-			/* ### vorher 1c MCLK select */
-			vga_wseq(regbase, CL_SEQR1F, 0x22);
 			/* Fast Page-Mode writes */
 			vga_wseq(regbase, CL_SEQRF, 0xb0);
 			break;
@@ -1074,6 +1055,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			/* We already set SRF and SR1F */
 			break;
 
+		case BT_SD64:
 		case BT_GD5480:
 		case BT_LAGUNA:
 		case BT_LAGUNAB:
@@ -1104,32 +1086,23 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	else if (var->bits_per_pixel == 16) {
 		dev_dbg(info->device, "preparing for 16 bit deep display\n");
 		switch (cinfo->btype) {
-		case BT_SD64:
-			/* Extended Sequencer Mode: 256c col. mode */
-			vga_wseq(regbase, CL_SEQR7, 0xf7);
-			/* MCLK select */
-			vga_wseq(regbase, CL_SEQR1F, 0x1e);
-			break;
-
 		case BT_PICCOLO:
 		case BT_SPECTRUM:
 			vga_wseq(regbase, CL_SEQR7, 0x87);
 			/* Fast Page-Mode writes */
 			vga_wseq(regbase, CL_SEQRF, 0xb0);
-			/* MCLK select */
-			vga_wseq(regbase, CL_SEQR1F, 0x22);
 			break;
 
 		case BT_PICASSO:
 			vga_wseq(regbase, CL_SEQR7, 0x27);
 			/* Fast Page-Mode writes */
 			vga_wseq(regbase, CL_SEQRF, 0xb0);
-			/* MCLK select */
-			vga_wseq(regbase, CL_SEQR1F, 0x22);
 			break;
 
+		case BT_SD64:
 		case BT_PICASSO4:
 		case BT_ALPINE:
+			/* Extended Sequencer Mode: 256c col. mode */
 			vga_wseq(regbase, CL_SEQR7, 0xa7);
 			break;
 
@@ -1171,32 +1144,23 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	else if (var->bits_per_pixel == 24) {
 		dev_dbg(info->device, "preparing for 24 bit deep display\n");
 		switch (cinfo->btype) {
-		case BT_SD64:
-			/* Extended Sequencer Mode: 256c col. mode */
-			vga_wseq(regbase, CL_SEQR7, 0xf5);
-			/* MCLK select */
-			vga_wseq(regbase, CL_SEQR1F, 0x1e);
-			break;
-
 		case BT_PICCOLO:
 		case BT_SPECTRUM:
 			vga_wseq(regbase, CL_SEQR7, 0x85);
 			/* Fast Page-Mode writes */
 			vga_wseq(regbase, CL_SEQRF, 0xb0);
-			/* MCLK select */
-			vga_wseq(regbase, CL_SEQR1F, 0x22);
 			break;
 
 		case BT_PICASSO:
 			vga_wseq(regbase, CL_SEQR7, 0x25);
 			/* Fast Page-Mode writes */
 			vga_wseq(regbase, CL_SEQRF, 0xb0);
-			/* MCLK select */
-			vga_wseq(regbase, CL_SEQR1F, 0x22);
 			break;
 
+		case BT_SD64:
 		case BT_PICASSO4:
 		case BT_ALPINE:
+			/* Extended Sequencer Mode: 256c col. mode */
 			vga_wseq(regbase, CL_SEQR7, 0xa5);
 			break;
 
@@ -1353,9 +1317,6 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var,
 	unsigned char tmp, xpix;
 	struct cirrusfb_info *cinfo = info->par;
 
-	dev_dbg(info->device,
-		"virtual offset: (%d,%d)\n", var->xoffset, var->yoffset);
-
 	/* no range checks for xoffset and yoffset,   */
 	/* as fb_pan_display has already done this */
 	if (var->vmode & FB_VMODE_YWRAP)
@@ -1607,10 +1568,6 @@ static void init_vgachip(struct fb_info *info)
 		vga_wseq(cinfo->regbase, CL_SEQR18, 0x02);
 	}
 
-	/* MCLK select etc. */
-	if (bi->init_sr1f)
-		vga_wseq(cinfo->regbase, CL_SEQR1F, bi->sr1f);
-
 	/* Screen A preset row scan: none */
 	vga_wcrt(cinfo->regbase, VGA_CRTC_PRESET_ROW, 0x00);
 	/* Text cursor start: disable text cursor */
@@ -2346,6 +2303,11 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z,
 
 	zorro_set_drvdata(z, info);
 
+	/* MCLK select etc. */
+	if (cirrusfb_board_info[btype].init_sr1f)
+		vga_wseq(cinfo->regbase, CL_SEQR1F,
+			 cirrusfb_board_info[btype].sr1f);
+
 	ret = cirrusfb_register(info);
 	if (!ret)
 		return 0;
-- 
cgit v1.2.3


From df3aafd57d590d6f3d95310fc3430f3a536d1e59 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:16 -0700
Subject: cirrusfb: GD5434 (aka SD64) support fixed

Fix handling of the Cirrus Logic GD5434 chip.  Distinguish this chip from the
GD5430.  It allows detecting memory size for both models correctly.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 72 +++++++++++++++++++++++++-----------------------
 1 file changed, 37 insertions(+), 35 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 62032743608..a364e1b0dcb 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -145,7 +145,9 @@ static const struct cirrusfb_board_info_rec {
 		.scrn_start_bit19	= true,
 		.sr07			= 0xF0,
 		.sr07_1bpp		= 0xF0,
+		.sr07_1bpp_mux		= 0xF6,
 		.sr07_8bpp		= 0xF1,
+		.sr07_8bpp_mux		= 0xF7,
 		.sr1f			= 0x1E
 	},
 	[BT_PICCOLO] = {
@@ -262,8 +264,8 @@ static const struct cirrusfb_board_info_rec {
 
 static struct pci_device_id cirrusfb_pci_table[] = {
 	CHIP(PCI_DEVICE_ID_CIRRUS_5436, BT_ALPINE),
-	CHIP(PCI_DEVICE_ID_CIRRUS_5434_8, BT_ALPINE),
-	CHIP(PCI_DEVICE_ID_CIRRUS_5434_4, BT_ALPINE),
+	CHIP(PCI_DEVICE_ID_CIRRUS_5434_8, BT_SD64),
+	CHIP(PCI_DEVICE_ID_CIRRUS_5434_4, BT_SD64),
 	CHIP(PCI_DEVICE_ID_CIRRUS_5430, BT_ALPINE), /* GD-5440 is same id */
 	CHIP(PCI_DEVICE_ID_CIRRUS_7543, BT_ALPINE),
 	CHIP(PCI_DEVICE_ID_CIRRUS_7548, BT_ALPINE),
@@ -341,6 +343,7 @@ struct cirrusfb_info {
 	unsigned char SFR;	/* Shadow of special function register */
 
 	int multiplexing;
+	int doubleVCLK;
 	int blank_mode;
 	u32 pseudo_palette[16];
 
@@ -496,18 +499,15 @@ static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var,
 			break;
 		}
 	}
-#if 0
-	/* TODO: If we have a 1MB 5434, we need to put ourselves in a mode where
+
+	/* If we have a 1MB 5434, we need to put ourselves in a mode where
 	 * the VCLK is double the pixel clock. */
-	switch (var->bits_per_pixel) {
-	case 16:
-	case 24:
-		if (var->xres <= 800)
-			/* Xbh has this type of clock for 32-bit */
-			freq /= 2;
-		break;
+	cinfo->doubleVCLK = 0;
+	if (cinfo->btype == BT_SD64 && info->fix.smem_len <= MB_ &&
+	    var->bits_per_pixel == 16) {
+		cinfo->doubleVCLK = 1;
 	}
-#endif
+
 	return 0;
 }
 
@@ -830,10 +830,13 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	vga_wcrt(regbase, CL_CRT1A, tmp);
 
 	freq = PICOS2KHZ(var->pixclock);
-	if (cinfo->btype == BT_ALPINE && var->bits_per_pixel == 24)
-		freq *= 3;
+	if (var->bits_per_pixel == 24)
+		if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_SD64)
+			freq *= 3;
 	if (cinfo->multiplexing)
 		freq /= 2;
+	if (cinfo->doubleVCLK)
+		freq *= 2;
 
 	bestclock(freq, &nom, &den, &div);
 
@@ -851,10 +854,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		 * as clock source
 		 */
 		int divMCLK = cirrusfb_check_mclk(info, freq);
-		if (divMCLK)  {
+		if (divMCLK)
 			nom = 0;
-			cirrusfb_set_mclk_as_source(info, divMCLK);
-		}
+		cirrusfb_set_mclk_as_source(info, divMCLK);
 	}
 	if (is_laguna(cinfo)) {
 		long pcifc = fb_readl(cinfo->laguna_mmio + 0x3fc);
@@ -885,14 +887,13 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		    (cinfo->btype == BT_GD5480))
 			tmp |= 0x80;
 
-		dev_dbg(info->device, "CL_SEQR1B: %d\n", (int) tmp);
 		/* Laguna chipset has reversed clock registers */
 		if (is_laguna(cinfo)) {
 			vga_wseq(regbase, CL_SEQRE, tmp);
 			vga_wseq(regbase, CL_SEQR1E, nom);
 		} else {
-			vga_wseq(regbase, CL_SEQRB, nom);
-			vga_wseq(regbase, CL_SEQR1B, tmp);
+			vga_wseq(regbase, CL_SEQRE, nom);
+			vga_wseq(regbase, CL_SEQR1E, tmp);
 		}
 	}
 
@@ -911,15 +912,13 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 	else
 		vga_wcrt(regbase, VGA_CRTC_REGS, 0x00);	/* interlace control */
 
-	/* adjust horizontal/vertical sync type (low/high) */
+	/* adjust horizontal/vertical sync type (low/high), use VCLK3 */
 	/* enable display memory & CRTC I/O address for color mode */
-	tmp = 0x03;
+	tmp = 0x03 | 0xc;
 	if (var->sync & FB_SYNC_HOR_HIGH_ACT)
 		tmp |= 0x40;
 	if (var->sync & FB_SYNC_VERT_HIGH_ACT)
 		tmp |= 0x80;
-	if (is_laguna(cinfo))
-		tmp |= 0xc;
 	WGen(cinfo, VGA_MIS_W, tmp);
 
 	/* text cursor on and start line */
@@ -1052,9 +1051,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 			vga_wseq(regbase, CL_SEQRF, 0xb8);
 #endif
 		case BT_ALPINE:
-			/* We already set SRF and SR1F */
-			break;
-
 		case BT_SD64:
 		case BT_GD5480:
 		case BT_LAGUNA:
@@ -1103,7 +1099,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		case BT_PICASSO4:
 		case BT_ALPINE:
 			/* Extended Sequencer Mode: 256c col. mode */
-			vga_wseq(regbase, CL_SEQR7, 0xa7);
+			vga_wseq(regbase, CL_SEQR7,
+					cinfo->doubleVCLK ? 0xa3 : 0xa7);
 			break;
 
 		case BT_GD5480:
@@ -1128,7 +1125,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		/* mode register: 256 color mode */
 		vga_wgfx(regbase, VGA_GFX_MODE, 64);
 #ifdef CONFIG_PCI
-		WHDR(cinfo, 0xc1);	/* Copy Xbh */
+		WHDR(cinfo, cinfo->doubleVCLK ? 0xe1 : 0xc1);
 #elif defined(CONFIG_ZORRO)
 		/* FIXME: CONFIG_PCI and CONFIG_ZORRO may be defined both */
 		WHDR(cinfo, 0xa0);	/* hidden dac reg: nothing special */
@@ -1529,7 +1526,9 @@ static void init_vgachip(struct fb_info *info)
 		case BT_LAGUNAB:
 			break;
 		case BT_SD64:
+#ifdef CONFIG_ZORRO
 			vga_wseq(cinfo->regbase, CL_SEQRF, 0xb8);
+#endif
 			break;
 		default:
 			vga_wseq(cinfo->regbase, CL_SEQR16, 0x0f);
@@ -1604,7 +1603,8 @@ static void init_vgachip(struct fb_info *info)
 	/* Bit Mask: no mask at all */
 	vga_wgfx(cinfo->regbase, VGA_GFX_BIT_MASK, 0xff);
 
-	if (cinfo->btype == BT_ALPINE || is_laguna(cinfo))
+	if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_SD64 ||
+	    is_laguna(cinfo))
 		/* (5434 can't have bit 3 set for bitblt) */
 		vga_wgfx(cinfo->regbase, CL_GRB, 0x20);
 	else
@@ -1809,9 +1809,11 @@ static void cirrusfb_imageblit(struct fb_info *info,
 
 	if (info->state != FBINFO_STATE_RUNNING)
 		return;
-	/* Alpine acceleration does not work at 24bpp ?!? */
-	if (info->flags & FBINFO_HWACCEL_DISABLED || image->depth != 1 ||
-	    (cinfo->btype == BT_ALPINE && op == 0xc))
+	/* Alpine/SD64 does not work at 24bpp ??? */
+	if (info->flags & FBINFO_HWACCEL_DISABLED || image->depth != 1)
+		cfb_imageblit(info, image);
+	else if ((cinfo->btype == BT_ALPINE || cinfo->btype == BT_SD64) &&
+		  op == 0xc)
 		cfb_imageblit(info, image);
 	else {
 		unsigned size = ((image->width + 7) >> 3) * image->height;
@@ -1895,7 +1897,7 @@ static unsigned int __devinit cirrusfb_get_memsize(struct fb_info *info,
 		/* If DRAM bank switching is enabled, there must be
 		 * twice as much memory installed. (4MB on the 5434)
 		 */
-		if (SRF & 0x80)
+		if (cinfo->btype != BT_ALPINE && (SRF & 0x80) != 0)
 			mem *= 2;
 	}
 
@@ -2553,7 +2555,7 @@ static void WClut(struct cirrusfb_info *cinfo, unsigned char regnum, unsigned ch
 
 	if (cinfo->btype == BT_PICASSO || cinfo->btype == BT_PICASSO4 ||
 	    cinfo->btype == BT_ALPINE || cinfo->btype == BT_GD5480 ||
-	    is_laguna(cinfo)) {
+	    cinfo->btype == BT_SD64 || is_laguna(cinfo)) {
 		/* but DAC data register IS, at least for Picasso II */
 		if (cinfo->btype == BT_PICASSO)
 			data += 0xfff;
-- 
cgit v1.2.3


From 4242a23c9e6b8e2462bb49bf78b76bfdf32158b5 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:17 -0700
Subject: cirrusfb: fix threshold register mask for Laguna chips

Fix threshold register mask for Laguna chips otherwise some 8bpp modes are
garbled after selecting a 24bpp mode.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index a364e1b0dcb..9bb811d5672 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -875,7 +875,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 		threshold = fb_readw(cinfo->laguna_mmio + 0xea);
 		control &= ~0x6800;
 		format = 0;
-		threshold &= 0xffe0 & 0x3fbf;
+		threshold &= 0xffc0 & 0x3fbf;
 	}
 	if (nom) {
 		tmp = den << 1;
-- 
cgit v1.2.3


From 8636a9240cc93efa6b36f4cfe6253e0574f832c6 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:17 -0700
Subject: cirrusfb: fix interlaced modes

Fix calculations of timings for interlaced modes.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/cirrusfb.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 9bb811d5672..d42e385f091 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -701,45 +701,52 @@ static int cirrusfb_set_par_foo(struct fb_info *info)
 
 	hsyncstart = var->xres + var->right_margin;
 	hsyncend = hsyncstart + var->hsync_len;
-	htotal = (hsyncend + var->left_margin) / 8 - 5;
-	hdispend = var->xres / 8 - 1;
-	hsyncstart = hsyncstart / 8 + 1;
-	hsyncend = hsyncend / 8 + 1;
+	htotal = (hsyncend + var->left_margin) / 8;
+	hdispend = var->xres / 8;
+	hsyncstart = hsyncstart / 8;
+	hsyncend = hsyncend / 8;
 
-	yres = var->yres;
-	vsyncstart = yres + var->lower_margin;
+	vdispend = var->yres;
+	vsyncstart = vdispend + var->lower_margin;
 	vsyncend = vsyncstart + var->vsync_len;
 	vtotal = vsyncend + var->upper_margin;
-	vdispend = yres - 1;
 
 	if (var->vmode & FB_VMODE_DOUBLE) {
-		yres *= 2;
+		vdispend *= 2;
 		vsyncstart *= 2;
 		vsyncend *= 2;
 		vtotal *= 2;
 	} else if (var->vmode & FB_VMODE_INTERLACED) {
-		yres = (yres + 1) / 2;
+		vdispend = (vdispend + 1) / 2;
 		vsyncstart = (vsyncstart + 1) / 2;
 		vsyncend = (vsyncend + 1) / 2;
 		vtotal = (vtotal + 1) / 2;
 	}
-
-	vtotal -= 2;
-	vsyncstart -= 1;
-	vsyncend -= 1;
-
+	yres = vdispend;
 	if (yres >= 1024) {
 		vtotal /= 2;
 		vsyncstart /= 2;
 		vsyncend /= 2;
 		vdispend /= 2;
 	}
+
+	vdispend -= 1;
+	vsyncstart -= 1;
+	vsyncend -= 1;
+	vtotal -= 2;
+
 	if (cinfo->multiplexing) {
 		htotal /= 2;
 		hsyncstart /= 2;
 		hsyncend /= 2;
 		hdispend /= 2;
 	}
+
+	htotal -= 5;
+	hdispend -= 1;
+	hsyncstart += 1;
+	hsyncend += 1;
+
 	/* unlock register VGA_CRTC_H_TOTAL..CRT7 */
 	vga_wcrt(regbase, VGA_CRTC_V_SYNC_END, 0x20);	/* previously: 0x00) */
 
-- 
cgit v1.2.3


From 66c1ca019078220dc1bf968f2bb18421100ef147 Mon Sep 17 00:00:00 2001
From: Andrea Righi <righi.andrea@gmail.com>
Date: Tue, 31 Mar 2009 15:25:18 -0700
Subject: fbmem: fix fb_info->lock and mm->mmap_sem circular locking dependency

Fix a circular locking dependency in the frame buffer console driver
pushing down the mutex fb_info->lock.

Circular locking dependecies occur calling the blocking
fb_notifier_call_chain() with fb_info->lock held.  Notifier callbacks can
try to acquire mm->mmap_sem, while fb_mmap() acquires the locks in the
reverse order mm->mmap_sem => fb_info->lock.

Tested-by: Andrey Borzenkov <arvidjaar@mail.ru>
Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/backlight/backlight.c |  3 ++
 drivers/video/backlight/lcd.c       |  3 ++
 drivers/video/console/fbcon.c       | 73 ++++++++++++++++++++++++++++++++-----
 drivers/video/fbmem.c               | 11 +-----
 4 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 157057c79ca..dd37cbcaf8c 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -35,6 +35,8 @@ static int fb_notifier_callback(struct notifier_block *self,
 		return 0;
 
 	bd = container_of(self, struct backlight_device, fb_notif);
+	if (!lock_fb_info(evdata->info))
+		return -ENODEV;
 	mutex_lock(&bd->ops_lock);
 	if (bd->ops)
 		if (!bd->ops->check_fb ||
@@ -47,6 +49,7 @@ static int fb_notifier_callback(struct notifier_block *self,
 			backlight_update_status(bd);
 		}
 	mutex_unlock(&bd->ops_lock);
+	unlock_fb_info(evdata->info);
 	return 0;
 }
 
diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index b6449470106..0bb13df0fa8 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -40,6 +40,8 @@ static int fb_notifier_callback(struct notifier_block *self,
 	if (!ld->ops)
 		return 0;
 
+	if (!lock_fb_info(evdata->info))
+		return -ENODEV;
 	mutex_lock(&ld->ops_lock);
 	if (!ld->ops->check_fb || ld->ops->check_fb(ld, evdata->info)) {
 		if (event == FB_EVENT_BLANK) {
@@ -51,6 +53,7 @@ static int fb_notifier_callback(struct notifier_block *self,
 		}
 	}
 	mutex_unlock(&ld->ops_lock);
+	unlock_fb_info(evdata->info);
 	return 0;
 }
 
diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index 1657b9608b0..2cd500a304f 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -2954,8 +2954,11 @@ static int fbcon_fb_unbind(int idx)
 
 static int fbcon_fb_unregistered(struct fb_info *info)
 {
-	int i, idx = info->node;
+	int i, idx;
 
+	if (!lock_fb_info(info))
+		return -ENODEV;
+	idx = info->node;
 	for (i = first_fb_vc; i <= last_fb_vc; i++) {
 		if (con2fb_map[i] == idx)
 			con2fb_map[i] = -1;
@@ -2979,13 +2982,14 @@ static int fbcon_fb_unregistered(struct fb_info *info)
 		}
 	}
 
-	if (!num_registered_fb)
-		unregister_con_driver(&fb_con);
-
-
 	if (primary_device == idx)
 		primary_device = -1;
 
+	unlock_fb_info(info);
+
+	if (!num_registered_fb)
+		unregister_con_driver(&fb_con);
+
 	return 0;
 }
 
@@ -3021,9 +3025,13 @@ static inline void fbcon_select_primary(struct fb_info *info)
 
 static int fbcon_fb_registered(struct fb_info *info)
 {
-	int ret = 0, i, idx = info->node;
+	int ret = 0, i, idx;
 
+	if (!lock_fb_info(info))
+		return -ENODEV;
+	idx = info->node;
 	fbcon_select_primary(info);
+	unlock_fb_info(info);
 
 	if (info_idx == -1) {
 		for (i = first_fb_vc; i <= last_fb_vc; i++) {
@@ -3124,7 +3132,7 @@ static void fbcon_get_requirement(struct fb_info *info,
 	}
 }
 
-static int fbcon_event_notify(struct notifier_block *self, 
+static int fbcon_event_notify(struct notifier_block *self,
 			      unsigned long action, void *data)
 {
 	struct fb_event *event = data;
@@ -3132,7 +3140,7 @@ static int fbcon_event_notify(struct notifier_block *self,
 	struct fb_videomode *mode;
 	struct fb_con2fbmap *con2fb;
 	struct fb_blit_caps *caps;
-	int ret = 0;
+	int idx, ret = 0;
 
 	/*
 	 * ignore all events except driver registration and deregistration
@@ -3144,23 +3152,54 @@ static int fbcon_event_notify(struct notifier_block *self,
 
 	switch(action) {
 	case FB_EVENT_SUSPEND:
+		if (!lock_fb_info(info)) {
+			ret = -ENODEV;
+			goto done;
+		}
 		fbcon_suspended(info);
+		unlock_fb_info(info);
 		break;
 	case FB_EVENT_RESUME:
+		if (!lock_fb_info(info)) {
+			ret = -ENODEV;
+			goto done;
+		}
 		fbcon_resumed(info);
+		unlock_fb_info(info);
 		break;
 	case FB_EVENT_MODE_CHANGE:
+		if (!lock_fb_info(info)) {
+			ret = -ENODEV;
+			goto done;
+		}
 		fbcon_modechanged(info);
+		unlock_fb_info(info);
 		break;
 	case FB_EVENT_MODE_CHANGE_ALL:
+		if (!lock_fb_info(info)) {
+			ret = -ENODEV;
+			goto done;
+		}
 		fbcon_set_all_vcs(info);
+		unlock_fb_info(info);
 		break;
 	case FB_EVENT_MODE_DELETE:
 		mode = event->data;
+		if (!lock_fb_info(info)) {
+			ret = -ENODEV;
+			goto done;
+		}
 		ret = fbcon_mode_deleted(info, mode);
+		unlock_fb_info(info);
 		break;
 	case FB_EVENT_FB_UNBIND:
-		ret = fbcon_fb_unbind(info->node);
+		if (!lock_fb_info(info)) {
+			ret = -ENODEV;
+			goto done;
+		}
+		idx = info->node;
+		unlock_fb_info(info);
+		ret = fbcon_fb_unbind(idx);
 		break;
 	case FB_EVENT_FB_REGISTERED:
 		ret = fbcon_fb_registered(info);
@@ -3178,17 +3217,31 @@ static int fbcon_event_notify(struct notifier_block *self,
 		con2fb->framebuffer = con2fb_map[con2fb->console - 1];
 		break;
 	case FB_EVENT_BLANK:
+		if (!lock_fb_info(info)) {
+			ret = -ENODEV;
+			goto done;
+		}
 		fbcon_fb_blanked(info, *(int *)event->data);
+		unlock_fb_info(info);
 		break;
 	case FB_EVENT_NEW_MODELIST:
+		if (!lock_fb_info(info)) {
+			ret = -ENODEV;
+			goto done;
+		}
 		fbcon_new_modelist(info);
+		unlock_fb_info(info);
 		break;
 	case FB_EVENT_GET_REQ:
 		caps = event->data;
+		if (!lock_fb_info(info)) {
+			ret = -ENODEV;
+			goto done;
+		}
 		fbcon_get_requirement(info, caps);
+		unlock_fb_info(info);
 		break;
 	}
-
 done:
 	return ret;
 }
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index cfd9dce1ce0..b64f061dd44 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1086,13 +1086,8 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 			return -EINVAL;
 		con2fb.framebuffer = -1;
 		event.data = &con2fb;
-
-		if (!lock_fb_info(info))
-			return -ENODEV;
 		event.info = info;
 		fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP, &event);
-		unlock_fb_info(info);
-
 		ret = copy_to_user(argp, &con2fb, sizeof(con2fb)) ? -EFAULT : 0;
 		break;
 	case FBIOPUT_CON2FBMAP:
@@ -1109,12 +1104,8 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 			break;
 		}
 		event.data = &con2fb;
-		if (!lock_fb_info(info))
-			return -ENODEV;
 		event.info = info;
-		ret = fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP,
-					      &event);
-		unlock_fb_info(info);
+		ret = fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP, &event);
 		break;
 	case FBIOBLANK:
 		if (!lock_fb_info(info))
-- 
cgit v1.2.3


From 6a7f2829b5f8be124e168265f176dbbbea8861a0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 31 Mar 2009 15:25:19 -0700
Subject: fbdev: uninline lock_fb_info()

Before:

   text    data     bss     dec     hex filename
   3648    2910      32    6590    19be drivers/video/backlight/backlight.o
   3226    2812      32    6070    17b6 drivers/video/backlight/lcd.o
  30990   16688    8480   56158    db5e drivers/video/console/fbcon.o
  15488    8400      24   23912    5d68 drivers/video/fbmem.o

After:

   text    data     bss     dec     hex filename
   3537    2870      32    6439    1927 drivers/video/backlight/backlight.o
   3131    2772      32    5935    172f drivers/video/backlight/lcd.o
  30876   16648    8480   56004    dac4 drivers/video/console/fbcon.o
  15506    8400      24   23930    5d7a drivers/video/fbmem.o

Cc: Andrea Righi <righi.andrea@gmail.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/fbmem.c | 11 +++++++++++
 include/linux/fb.h    | 10 +---------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index b64f061dd44..2ac32e6b595 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -46,6 +46,17 @@
 struct fb_info *registered_fb[FB_MAX] __read_mostly;
 int num_registered_fb __read_mostly;
 
+int lock_fb_info(struct fb_info *info)
+{
+	mutex_lock(&info->lock);
+	if (!info->fbops) {
+		mutex_unlock(&info->lock);
+		return 0;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(lock_fb_info);
+
 /*
  * Helpers
  */
diff --git a/include/linux/fb.h b/include/linux/fb.h
index fe7d0d7907a..f563c501393 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -961,15 +961,7 @@ extern struct fb_info *registered_fb[FB_MAX];
 extern int num_registered_fb;
 extern struct class *fb_class;
 
-static inline int lock_fb_info(struct fb_info *info)
-{
-	mutex_lock(&info->lock);
-	if (!info->fbops) {
-		mutex_unlock(&info->lock);
-		return 0;
-	}
-	return 1;
-}
+extern int lock_fb_info(struct fb_info *info);
 
 static inline void unlock_fb_info(struct fb_info *info)
 {
-- 
cgit v1.2.3


From d4bc4e8af0a4a34c713f8c1a33a78cedffe8e0b7 Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:20 -0700
Subject: drivers/video/sgivwfb.c: fix memory leaks in removal path

We were leaking both the cmap memory and the info struct memory.

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/sgivwfb.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/video/sgivwfb.c b/drivers/video/sgivwfb.c
index f5252c2552f..bba53714a7b 100644
--- a/drivers/video/sgivwfb.c
+++ b/drivers/video/sgivwfb.c
@@ -837,6 +837,8 @@ static int sgivwfb_remove(struct platform_device *dev)
 		iounmap(par->regs);
 		iounmap(info->screen_base);
 		release_mem_region(DBE_REG_PHYS, DBE_REG_SIZE);
+		fb_dealloc_cmap(&info->cmap);
+		framebuffer_release(info);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 895d72279da7f24f266f9583c239e7b22230127c Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:21 -0700
Subject: tdfxfb: fix memory leaks in removal path

We were leaking the cmap memory.

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/tdfxfb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/video/tdfxfb.c b/drivers/video/tdfxfb.c
index 14bd3f3680b..ee64771fbe3 100644
--- a/drivers/video/tdfxfb.c
+++ b/drivers/video/tdfxfb.c
@@ -1393,6 +1393,7 @@ static void __devexit tdfxfb_remove(struct pci_dev *pdev)
 	release_mem_region(pci_resource_start(pdev, 0),
 			   pci_resource_len(pdev, 0));
 	pci_set_drvdata(pdev, NULL);
+	fb_dealloc_cmap(&info->cmap);
 	framebuffer_release(info);
 }
 
-- 
cgit v1.2.3


From 07b39b49b402355a7172c113102a8b68aafb17dd Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:22 -0700
Subject: tridentfb: fix memory leaks in removal path

We were leaking the cmap memory.

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/tridentfb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/video/tridentfb.c b/drivers/video/tridentfb.c
index 479b2e79ad6..c5369ba9485 100644
--- a/drivers/video/tridentfb.c
+++ b/drivers/video/tridentfb.c
@@ -1563,6 +1563,7 @@ static void __devexit trident_pci_remove(struct pci_dev *dev)
 	release_mem_region(tridentfb_fix.mmio_start, tridentfb_fix.mmio_len);
 	pci_set_drvdata(dev, NULL);
 	kfree(info->pixmap.addr);
+	fb_dealloc_cmap(&info->cmap);
 	framebuffer_release(info);
 }
 
-- 
cgit v1.2.3


From 5e266e2e0e19532c1b8e2e2bff1eb6ccf42e478a Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:22 -0700
Subject: vfb: fix memory leaks in removal path

We were leaking the cmap memory.

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/vfb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/video/vfb.c b/drivers/video/vfb.c
index 93fe08d6c78..cc919ae4657 100644
--- a/drivers/video/vfb.c
+++ b/drivers/video/vfb.c
@@ -543,6 +543,7 @@ static int vfb_remove(struct platform_device *dev)
 	if (info) {
 		unregister_framebuffer(info);
 		rvfree(videomemory, videomemorysize);
+		fb_dealloc_cmap(&info->cmap);
 		framebuffer_release(info);
 	}
 	return 0;
-- 
cgit v1.2.3


From 0fd853118dd821de59106c5b9a0a2a6f488bc4b5 Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:23 -0700
Subject: skeletonfb: check fb_alloc_cmap return value and handle failure
 properly

Bad example code, no cookie!

Signed-off-by: Andres Salomon <dilinger@debian.org>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/skeletonfb.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/video/skeletonfb.c b/drivers/video/skeletonfb.c
index df5336561d1..a439159204a 100644
--- a/drivers/video/skeletonfb.c
+++ b/drivers/video/skeletonfb.c
@@ -795,8 +795,9 @@ static int __devinit xxxfb_probe(struct pci_dev *dev,
     if (!retval || retval == 4)
 	return -EINVAL;			
 
-    /* This has to been done !!! */	
-    fb_alloc_cmap(&info->cmap, cmap_len, 0);
+    /* This has to be done! */
+    if (fb_alloc_cmap(&info->cmap, cmap_len, 0))
+	return -ENOMEM;
 	
     /* 
      * The following is done in the case of having hardware with a static 
@@ -820,8 +821,10 @@ static int __devinit xxxfb_probe(struct pci_dev *dev,
      */
     /* xxxfb_set_par(info); */
 
-    if (register_framebuffer(info) < 0)
+    if (register_framebuffer(info) < 0) {
+	fb_dealloc_cmap(&info->cmap);
 	return -EINVAL;
+    }
     printk(KERN_INFO "fb%d: %s frame buffer device\n", info->node,
 	   info->fix.id);
     pci_set_drvdata(dev, info); /* or platform_set_drvdata(pdev, info) */
-- 
cgit v1.2.3


From 0a5d924e5954e81a905907512f8c7a1cbf81d700 Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:24 -0700
Subject: sm501fb: check fb_alloc_cmap return value and handle failure properly

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@poczta.fm>
Cc: Ben Dooks <ben-linux@fluff.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/sm501fb.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/video/sm501fb.c b/drivers/video/sm501fb.c
index dcd98793d56..eb5d73a0670 100644
--- a/drivers/video/sm501fb.c
+++ b/drivers/video/sm501fb.c
@@ -1525,7 +1525,10 @@ static int sm501fb_init_fb(struct fb_info *fb,
 	}
 
 	/* initialise and set the palette */
-	fb_alloc_cmap(&fb->cmap, NR_PALETTE, 0);
+	if (fb_alloc_cmap(&fb->cmap, NR_PALETTE, 0)) {
+		dev_err(info->dev, "failed to allocate cmap memory\n");
+		return -ENOMEM;
+	}
 	fb_set_cmap(&fb->cmap, fb);
 
 	ret = (fb->fbops->fb_check_var)(&fb->var, fb);
-- 
cgit v1.2.3


From c23124277e58998703278c26c53b159cea0f9643 Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:25 -0700
Subject: sstfb: check fb_alloc_cmap return value and handle failure properly

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/sstfb.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/video/sstfb.c b/drivers/video/sstfb.c
index 5b11a00f49b..609d0a521ca 100644
--- a/drivers/video/sstfb.c
+++ b/drivers/video/sstfb.c
@@ -1421,13 +1421,16 @@ static int __devinit sstfb_probe(struct pci_dev *pdev,
 		goto fail;
 	}
 	
-	fb_alloc_cmap(&info->cmap, 256, 0);
+	if (fb_alloc_cmap(&info->cmap, 256, 0)) {
+		printk(KERN_ERR "sstfb: can't alloc cmap memory.\n");
+		goto fail;
+	}
 
 	/* register fb */
 	info->device = &pdev->dev;
 	if (register_framebuffer(info) < 0) {
 		printk(KERN_ERR "sstfb: can't register framebuffer.\n");
-		goto fail;
+		goto fail_register;
 	}
 
 	sstfb_clear_screen(info);
@@ -1441,8 +1444,9 @@ static int __devinit sstfb_probe(struct pci_dev *pdev,
 
 	return 0;
 
-fail:
+fail_register:
 	fb_dealloc_cmap(&info->cmap);
+fail:
 	iounmap(info->screen_base);
 fail_fb_remap:
 	iounmap(par->mmio_vbase);
-- 
cgit v1.2.3


From 175b39fb7e145e1aa06f6369c1fbea16873dee9e Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:26 -0700
Subject: stifb: check fb_alloc_cmap return value and handle failure properly

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/stifb.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/video/stifb.c b/drivers/video/stifb.c
index 16648140241..eabaad765ae 100644
--- a/drivers/video/stifb.c
+++ b/drivers/video/stifb.c
@@ -1262,24 +1262,25 @@ static int __init stifb_init_fb(struct sti_struct *sti, int bpp_pref)
 	info->flags = FBINFO_DEFAULT;
 	info->pseudo_palette = &fb->pseudo_palette;
 
-	/* This has to been done !!! */
-	fb_alloc_cmap(&info->cmap, NR_PALETTE, 0);
+	/* This has to be done !!! */
+	if (fb_alloc_cmap(&info->cmap, NR_PALETTE, 0))
+		goto out_err1;
 	stifb_init_display(fb);
 
 	if (!request_mem_region(fix->smem_start, fix->smem_len, "stifb fb")) {
 		printk(KERN_ERR "stifb: cannot reserve fb region 0x%04lx-0x%04lx\n",
 				fix->smem_start, fix->smem_start+fix->smem_len);
-		goto out_err1;
+		goto out_err2;
 	}
 		
 	if (!request_mem_region(fix->mmio_start, fix->mmio_len, "stifb mmio")) {
 		printk(KERN_ERR "stifb: cannot reserve sti mmio region 0x%04lx-0x%04lx\n",
 				fix->mmio_start, fix->mmio_start+fix->mmio_len);
-		goto out_err2;
+		goto out_err3;
 	}
 
 	if (register_framebuffer(&fb->info) < 0)
-		goto out_err3;
+		goto out_err4;
 
 	sti->info = info; /* save for unregister_framebuffer() */
 
@@ -1297,13 +1298,14 @@ static int __init stifb_init_fb(struct sti_struct *sti, int bpp_pref)
 	return 0;
 
 
-out_err3:
+out_err4:
 	release_mem_region(fix->mmio_start, fix->mmio_len);
-out_err2:
+out_err3:
 	release_mem_region(fix->smem_start, fix->smem_len);
+out_err2:
+	fb_dealloc_cmap(&info->cmap);
 out_err1:
 	iounmap(info->screen_base);
-	fb_dealloc_cmap(&info->cmap);
 out_err0:
 	kfree(fb);
 	return -ENXIO;
-- 
cgit v1.2.3


From ccb121e6958eca5f58938e56523fc589fed36fa8 Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:26 -0700
Subject: valkyriefb: check fb_alloc_cmap return value and handle failure
 properly

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/valkyriefb.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/video/valkyriefb.c b/drivers/video/valkyriefb.c
index 7b0cef9ca8f..4bb9a0b1895 100644
--- a/drivers/video/valkyriefb.c
+++ b/drivers/video/valkyriefb.c
@@ -119,7 +119,7 @@ static void set_valkyrie_clock(unsigned char *params);
 static int valkyrie_var_to_par(struct fb_var_screeninfo *var,
 	struct fb_par_valkyrie *par, const struct fb_info *fb_info);
 
-static void valkyrie_init_info(struct fb_info *info, struct fb_info_valkyrie *p);
+static int valkyrie_init_info(struct fb_info *info, struct fb_info_valkyrie *p);
 static void valkyrie_par_to_fix(struct fb_par_valkyrie *par, struct fb_fix_screeninfo *fix);
 static void valkyrie_init_fix(struct fb_fix_screeninfo *fix, struct fb_info_valkyrie *p);
 
@@ -381,18 +381,22 @@ int __init valkyriefb_init(void)
 
 	valkyrie_choose_mode(p);
 	mac_vmode_to_var(default_vmode, default_cmode, &p->info.var);
-	valkyrie_init_info(&p->info, p);
+	err = valkyrie_init_info(&p->info, p);
+	if (err < 0)
+		goto out_free;
 	valkyrie_init_fix(&p->info.fix, p);
 	if (valkyriefb_set_par(&p->info))
 		/* "can't happen" */
 		printk(KERN_ERR "valkyriefb: can't set default video mode\n");
 
 	if ((err = register_framebuffer(&p->info)) != 0)
-		goto out_free;
+		goto out_cmap_free;
 
 	printk(KERN_INFO "fb%d: valkyrie frame buffer device\n", p->info.node);
 	return 0;
 
+ out_cmap_free:
+	fb_dealloc_cmap(&p->info.cmap);
  out_free:
 	if (p->frame_buffer)
 		iounmap(p->frame_buffer);
@@ -538,14 +542,15 @@ static void valkyrie_par_to_fix(struct fb_par_valkyrie *par,
 		/* ywrapstep, xpanstep, ypanstep */
 }
 
-static void __init valkyrie_init_info(struct fb_info *info, struct fb_info_valkyrie *p)
+static int __init valkyrie_init_info(struct fb_info *info,
+		struct fb_info_valkyrie *p)
 {
 	info->fbops = &valkyriefb_ops;
 	info->screen_base = p->frame_buffer + 0x1000;
 	info->flags = FBINFO_DEFAULT;
 	info->pseudo_palette = p->pseudo_palette;
-	fb_alloc_cmap(&info->cmap, 256, 0);
 	info->par = &p->par;
+	return fb_alloc_cmap(&info->cmap, 256, 0);
 }
 
 
-- 
cgit v1.2.3


From cc880a715782fe31116284d90e0b5bfb1411535b Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:27 -0700
Subject: sunxvr500: fix cmap memory leaks

- fix cmap leak in removal path
 - fix cmap leak when register_framebuffer fails

Signed-off-by: Andres Salomon <dilinger@debian.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/sunxvr500.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/video/sunxvr500.c b/drivers/video/sunxvr500.c
index c2ba51b7ea1..18b950706ca 100644
--- a/drivers/video/sunxvr500.c
+++ b/drivers/video/sunxvr500.c
@@ -349,11 +349,14 @@ static int __devinit e3d_pci_register(struct pci_dev *pdev,
 	if (err < 0) {
 		printk(KERN_ERR "e3d: Could not register framebuffer %s\n",
 		       pci_name(pdev));
-		goto err_unmap_fb;
+		goto err_free_cmap;
 	}
 
 	return 0;
 
+err_free_cmap:
+	fb_dealloc_cmap(&info->cmap);
+
 err_unmap_fb:
 	iounmap(ep->fb_base);
 
@@ -389,6 +392,7 @@ static void __devexit e3d_pci_unregister(struct pci_dev *pdev)
 	pci_release_region(pdev, 0);
 	pci_release_region(pdev, 1);
 
+	fb_dealloc_cmap(&info->cmap);
         framebuffer_release(info);
 
 	pci_disable_device(pdev);
-- 
cgit v1.2.3


From 327fc8752a3c08fc7dc7d382883e65aad2f03bde Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:28 -0700
Subject: tgafb: fix cmap memory leak

Fix cmap leak when register_framebuffer fails.

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/tgafb.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/video/tgafb.c b/drivers/video/tgafb.c
index 680642c089c..a86046ff60a 100644
--- a/drivers/video/tgafb.c
+++ b/drivers/video/tgafb.c
@@ -1663,7 +1663,7 @@ tgafb_register(struct device *dev)
 	if (register_framebuffer(info) < 0) {
 		printk(KERN_ERR "tgafb: Could not register framebuffer\n");
 		ret = -EINVAL;
-		goto err1;
+		goto err2;
 	}
 
 	if (tga_bus_pci) {
@@ -1682,6 +1682,8 @@ tgafb_register(struct device *dev)
 
 	return 0;
 
+ err2:
+	fb_dealloc_cmap(&info->cmap);
  err1:
 	if (mem_base)
 		iounmap(mem_base);
-- 
cgit v1.2.3


From e98d9b407c248ba1419bed0823488d3cc71a2c31 Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:28 -0700
Subject: 68328fb: fix cmap memory leaks

- fix cmap leak in removal path

- fix cmap leak when register_framebuffer fails

- check return value of fb_alloc_cmap

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/68328fb.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/video/68328fb.c b/drivers/video/68328fb.c
index 7f907fb23b8..0b17824b0eb 100644
--- a/drivers/video/68328fb.c
+++ b/drivers/video/68328fb.c
@@ -471,9 +471,11 @@ int __init mc68x328fb_init(void)
 	fb_info.pseudo_palette = &mc68x328fb_pseudo_palette;
 	fb_info.flags = FBINFO_DEFAULT | FBINFO_HWACCEL_YPAN;
 
-	fb_alloc_cmap(&fb_info.cmap, 256, 0);
+	if (fb_alloc_cmap(&fb_info.cmap, 256, 0))
+		return -ENOMEM;
 
 	if (register_framebuffer(&fb_info) < 0) {
+		fb_dealloc_cmap(&fb_info.cmap);
 		return -EINVAL;
 	}
 
@@ -494,6 +496,7 @@ module_init(mc68x328fb_init);
 static void __exit mc68x328fb_cleanup(void)
 {
 	unregister_framebuffer(&fb_info);
+	fb_dealloc_cmap(&fb_info.cmap);
 }
 
 module_exit(mc68x328fb_cleanup);
-- 
cgit v1.2.3


From 909baf0092545e5c2082b045303e7a4b1d2a0522 Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:29 -0700
Subject: amba-clcd: fix cmap memory leaks

- fix cmap leak in removal path

- fix cmap leak when register_framebuffer fails

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/amba-clcd.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/video/amba-clcd.c b/drivers/video/amba-clcd.c
index 4e046fed138..61050ab1412 100644
--- a/drivers/video/amba-clcd.c
+++ b/drivers/video/amba-clcd.c
@@ -408,7 +408,9 @@ static int clcdfb_register(struct clcd_fb *fb)
 	/*
 	 * Allocate colourmap.
 	 */
-	fb_alloc_cmap(&fb->fb.cmap, 256, 0);
+	ret = fb_alloc_cmap(&fb->fb.cmap, 256, 0);
+	if (ret)
+		goto unmap;
 
 	/*
 	 * Ensure interrupts are disabled.
@@ -426,6 +428,8 @@ static int clcdfb_register(struct clcd_fb *fb)
 
 	printk(KERN_ERR "CLCD: cannot register framebuffer (%d)\n", ret);
 
+	fb_dealloc_cmap(&fb->fb.cmap);
+ unmap:
 	iounmap(fb->regs);
  free_clk:
 	clk_put(fb->clk);
@@ -485,6 +489,8 @@ static int clcdfb_remove(struct amba_device *dev)
 
 	clcdfb_disable(fb);
 	unregister_framebuffer(&fb->fb);
+	if (fb->fb.cmap.len)
+		fb_dealloc_cmap(&fb->fb.cmap);
 	iounmap(fb->regs);
 	clk_put(fb->clk);
 
-- 
cgit v1.2.3


From eb8972b4407f81b07ea6fc71fd91f9fc7a35a81e Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 31 Mar 2009 15:25:30 -0700
Subject: amifb: check fb_alloc_cmap return value and handle failure properly

Signed-off-by: Andres Salomon <dilinger@debian.org>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/amifb.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/video/amifb.c b/drivers/video/amifb.c
index 100f2366146..82bedd7f778 100644
--- a/drivers/video/amifb.c
+++ b/drivers/video/amifb.c
@@ -2437,7 +2437,9 @@ default_chipset:
 		goto amifb_error;
 	}
 
-	fb_alloc_cmap(&fb_info.cmap, 1<<fb_info.var.bits_per_pixel, 0);
+	err = fb_alloc_cmap(&fb_info.cmap, 1<<fb_info.var.bits_per_pixel, 0);
+	if (err)
+		goto amifb_error;
 
 	if (register_framebuffer(&fb_info) < 0) {
 		err = -EINVAL;
@@ -2456,7 +2458,8 @@ amifb_error:
 
 static void amifb_deinit(void)
 {
-	fb_dealloc_cmap(&fb_info.cmap);
+	if (fb_info.cmap.len)
+		fb_dealloc_cmap(&fb_info.cmap);
 	chipfree();
 	if (videomemory)
 		iounmap((void*)videomemory);
-- 
cgit v1.2.3


From 91ad1203535da95bb13072bdb59e1dc3ca76ec5d Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 31 Mar 2009 15:25:30 -0700
Subject: fbdev: newport: newport_*wait() return 0 on timeout

With a postfix decrement t reaches -1 on timeout which results in a
return of 0.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/video/newport.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/video/newport.h b/include/video/newport.h
index 1f5ebeaa818..001b935e71c 100644
--- a/include/video/newport.h
+++ b/include/video/newport.h
@@ -453,7 +453,7 @@ static __inline__ int newport_wait(struct newport_regs *regs)
 {
 	int t = BUSY_TIMEOUT;
 
-	while (t--)
+	while (--t)
 		if (!(regs->cset.status & NPORT_STAT_GBUSY))
 			break;
 	return !t;
@@ -463,7 +463,7 @@ static __inline__ int newport_bfwait(struct newport_regs *regs)
 {
 	int t = BUSY_TIMEOUT;
 
-	while (t--)
+	while (--t)
 		if(!(regs->cset.status & NPORT_STAT_BBUSY))
 			break;
 	return !t;
-- 
cgit v1.2.3


From afbb9d8d5266b4121cb503b4e097f8e65286a077 Mon Sep 17 00:00:00 2001
From: Kristoffer Ericson <kristoffer.ericson@gmail.com>
Date: Tue, 31 Mar 2009 15:25:31 -0700
Subject: fbdev: update s1d13xxxfb to differ between revisions and production
 ids

The s1d13xxx chip provides two values of identification value: the
Production id (e.g 13506/13505/13806..) and a revision number 0,1,2,3).
Together these can help us to differentiate between similiar setups.

This patch adds the proper way of grabbing both those values and save them
for future reference (in order to decide what functions a card supports,
e.g acceleration).

We also move away from the concept of all s1d13xxx = s1d13806 when we
really support alot more.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: simplify s1d13xxxfb_probe()]
Signed-off-by: Kristoffer Ericson <kristoffer.ericson@gmail.com
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/s1d13xxxfb.c | 48 ++++++++++++++++++++++++++++++++++------------
 include/video/s1d13xxxfb.h | 16 ++++++++++------
 2 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/video/s1d13xxxfb.c b/drivers/video/s1d13xxxfb.c
index a7b01d2724b..0726aecf3b7 100644
--- a/drivers/video/s1d13xxxfb.c
+++ b/drivers/video/s1d13xxxfb.c
@@ -50,9 +50,22 @@
 #define dbg(fmt, args...) do { } while (0)
 #endif
 
-static const int __devinitconst s1d13xxxfb_revisions[] = {
-	S1D13506_CHIP_REV,	/* Rev.4 on HP Jornada 7xx S1D13506 */
-	S1D13806_CHIP_REV,	/* Rev.7 on .. */
+/*
+ * List of card production ids
+ */
+static const int s1d13xxxfb_prod_ids[] = {
+	S1D13505_PROD_ID,
+	S1D13506_PROD_ID,
+	S1D13806_PROD_ID,
+};
+
+/*
+ * List of card strings
+ */
+static const char *s1d13xxxfb_prod_names[] = {
+	"S1D13505",
+	"S1D13506",
+	"S1D13806",
 };
 
 /*
@@ -377,7 +390,6 @@ s1d13xxxfb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info)
 	return 0;
 }
 
-
 /* framebuffer information structures */
 
 static struct fb_ops s1d13xxxfb_fbops = {
@@ -544,7 +556,7 @@ s1d13xxxfb_probe(struct platform_device *pdev)
 	struct s1d13xxxfb_pdata *pdata = NULL;
 	int ret = 0;
 	int i;
-	u8 revision;
+	u8 revision, prod_id;
 
 	dbg("probe called: device is %p\n", pdev);
 
@@ -613,19 +625,31 @@ s1d13xxxfb_probe(struct platform_device *pdev)
 		goto bail;
 	}
 
-	revision = s1d13xxxfb_readreg(default_par, S1DREG_REV_CODE) >> 2;
-
+	/* production id is top 6 bits */
+	prod_id = s1d13xxxfb_readreg(default_par, S1DREG_REV_CODE) >> 2;
+	/* revision id is lower 2 bits */
+	revision = s1d13xxxfb_readreg(default_par, S1DREG_REV_CODE) & 0x3;
 	ret = -ENODEV;
 
-	for (i = 0; i < ARRAY_SIZE(s1d13xxxfb_revisions); i++) {
-		if (revision == s1d13xxxfb_revisions[i])
+	for (i = 0; i < ARRAY_SIZE(s1d13xxxfb_prod_ids); i++) {
+		if (prod_id == s1d13xxxfb_prod_ids[i]) {
+			/* looks like we got it in our list */
+			default_par->prod_id = prod_id;
+			default_par->revision = revision;
 			ret = 0;
+			break;
+		}
 	}
 
-	if (!ret)
+	if (!ret) {
+		printk(KERN_INFO PFX "chip production id %i = %s\n",
+			prod_id, s1d13xxxfb_prod_names[i]);
 		printk(KERN_INFO PFX "chip revision %i\n", revision);
-	else {
-		printk(KERN_INFO PFX "unknown chip revision %i\n", revision);
+	} else {
+		printk(KERN_INFO PFX
+			"unknown chip production id %i, revision %i\n",
+			prod_id, revision);
+		printk(KERN_INFO PFX "please contant maintainer\n");
 		goto bail;
 	}
 
diff --git a/include/video/s1d13xxxfb.h b/include/video/s1d13xxxfb.h
index fe41b840794..c3b2a2aa714 100644
--- a/include/video/s1d13xxxfb.h
+++ b/include/video/s1d13xxxfb.h
@@ -14,13 +14,16 @@
 #define	S1D13XXXFB_H
 
 #define S1D_PALETTE_SIZE		256
-#define S1D13506_CHIP_REV		4	/* expected chip revision number for s1d13506 */
-#define S1D13806_CHIP_REV		7	/* expected chip revision number for s1d13806 */
-#define S1D_FBID			"S1D13806"
-#define S1D_DEVICENAME			"s1d13806fb"
+#define S1D_FBID			"S1D13xxx"
+#define S1D_DEVICENAME			"s1d13xxxfb"
+
+/* S1DREG_REV_CODE register = prod_id (6 bits) + revision (2 bits) */
+#define S1D13505_PROD_ID		0x3	/* 000011 */
+#define S1D13506_PROD_ID		0x4	/* 000100 */
+#define S1D13806_PROD_ID		0x7	/* 000111 */
 
 /* register definitions (tested on s1d13896) */
-#define S1DREG_REV_CODE			0x0000	/* Revision Code Register */
+#define S1DREG_REV_CODE			0x0000	/* Prod + Rev Code Register */
 #define S1DREG_MISC			0x0001	/* Miscellaneous Register */
 #define S1DREG_GPIO_CNF0		0x0004	/* General IO Pins Configuration Register 0 */
 #define S1DREG_GPIO_CNF1		0x0005	/* General IO Pins Configuration Register 1 */
@@ -141,10 +144,11 @@ struct s1d13xxxfb_regval {
 	u8	value;
 };
 
-
 struct s1d13xxxfb_par {
 	void __iomem	*regs;
 	unsigned char	display;
+	unsigned char	prod_id;
+	unsigned char	revision;
 
 	unsigned int	pseudo_palette[16];
 #ifdef CONFIG_PM
-- 
cgit v1.2.3


From ba78289343226773b27dc25e7d1e739d0162b9e8 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 31 Mar 2009 15:25:32 -0700
Subject: drivers/video/omap/hwa742.c: div reaches max_clk_div

With for(div = 0; div < max_clk_div; div++) { ... } div reaches max_clk_div.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Joe Perches <joe@perches.com>
Acked-by: Trilok Soni <soni.trilok@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/omap/hwa742.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/video/omap/hwa742.c b/drivers/video/omap/hwa742.c
index f24df0b54e1..8aa6e47202b 100644
--- a/drivers/video/omap/hwa742.c
+++ b/drivers/video/omap/hwa742.c
@@ -742,7 +742,7 @@ static int calc_extif_timings(unsigned long sysclk, int *extif_mem_div)
 		if (calc_reg_timing(sysclk, div) == 0)
 			break;
 	}
-	if (div > max_clk_div)
+	if (div >= max_clk_div)
 		goto err;
 
 	*extif_mem_div = div;
@@ -752,7 +752,7 @@ static int calc_extif_timings(unsigned long sysclk, int *extif_mem_div)
 			break;
 	}
 
-	if (div > max_clk_div)
+	if (div >= max_clk_div)
 		goto err;
 
 	return 0;
-- 
cgit v1.2.3


From 032220ba310204be9cb2ddbbf848020fadc63ce6 Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@debian.org>
Date: Tue, 31 Mar 2009 15:25:33 -0700
Subject: asiliantfb: fix cmap memory leaks

- fix cmap leak in removal path
 - fix cmap leak when register_framebuffer fails
 - check return value of fb_alloc_cmap
 - don't continue with driver setup if register_framebuffer fails

[krzysztof.h1@wp.pl: spotted missing iounmap]
[randy.dunlap@oracle.com: move data declaration before any code]
Signed-off-by: Andres Salomon <dilinger@debian.org>
Cc: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/asiliantfb.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/video/asiliantfb.c b/drivers/video/asiliantfb.c
index 1fd22f460b0..1a1f946d8fe 100644
--- a/drivers/video/asiliantfb.c
+++ b/drivers/video/asiliantfb.c
@@ -505,19 +505,27 @@ static struct fb_var_screeninfo asiliantfb_var __devinitdata = {
 	.vsync_len 	= 2,
 };
 
-static void __devinit init_asiliant(struct fb_info *p, unsigned long addr)
+static int __devinit init_asiliant(struct fb_info *p, unsigned long addr)
 {
+	int err;
+
 	p->fix			= asiliantfb_fix;
 	p->fix.smem_start	= addr;
 	p->var			= asiliantfb_var;
 	p->fbops		= &asiliantfb_ops;
 	p->flags		= FBINFO_DEFAULT;
 
-	fb_alloc_cmap(&p->cmap, 256, 0);
+	err = fb_alloc_cmap(&p->cmap, 256, 0);
+	if (err) {
+		printk(KERN_ERR "C&T 69000 fb failed to alloc cmap memory\n");
+		return err;
+	}
 
-	if (register_framebuffer(p) < 0) {
+	err = register_framebuffer(p);
+	if (err < 0) {
 		printk(KERN_ERR "C&T 69000 framebuffer failed to register\n");
-		return;
+		fb_dealloc_cmap(&p->cmap);
+		return err;
 	}
 
 	printk(KERN_INFO "fb%d: Asiliant 69000 frame buffer (%dK RAM detected)\n",
@@ -532,6 +540,7 @@ asiliantfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent)
 {
 	unsigned long addr, size;
 	struct fb_info *p;
+	int err;
 
 	if ((dp->resource[0].flags & IORESOURCE_MEM) == 0)
 		return -ENODEV;
@@ -560,7 +569,13 @@ asiliantfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent)
 	pci_write_config_dword(dp, 4, 0x02800083);
 	writeb(3, p->screen_base + 0x400784);
 
-	init_asiliant(p, addr);
+	err = init_asiliant(p, addr);
+	if (err) {
+		iounmap(p->screen_base);
+		release_mem_region(addr, size);
+		framebuffer_release(p);
+		return err;
+	}
 
 	pci_set_drvdata(dp, p);
 	return 0;
@@ -571,6 +586,7 @@ static void __devexit asiliantfb_remove(struct pci_dev *dp)
 	struct fb_info *p = pci_get_drvdata(dp);
 
 	unregister_framebuffer(p);
+	fb_dealloc_cmap(&p->cmap);
 	iounmap(p->screen_base);
 	release_mem_region(pci_resource_start(dp, 0), pci_resource_len(dp, 0));
 	pci_set_drvdata(dp, NULL);
-- 
cgit v1.2.3


From b935257b1f98291ec1c8cbf7dbccbe0b20665bf6 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 31 Mar 2009 15:25:34 -0700
Subject: arkfb: fix misplaced parentheses

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Ondrej Zajicek <santiago@crfreenet.org>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/arkfb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/video/arkfb.c b/drivers/video/arkfb.c
index 314d18694b6..d583bea608f 100644
--- a/drivers/video/arkfb.c
+++ b/drivers/video/arkfb.c
@@ -470,7 +470,7 @@ static void ark_dac_read_regs(void *data, u8 *code, int count)
 
 	while (count != 0)
 	{
-		vga_wseq(NULL, 0x1C, regval | (code[0] & 4) ? 0x80 : 0);
+		vga_wseq(NULL, 0x1C, regval | (code[0] & 4 ? 0x80 : 0));
 		code[1] = vga_r(NULL, dac_regs[code[0] & 3]);
 		count--;
 		code += 2;
@@ -485,7 +485,7 @@ static void ark_dac_write_regs(void *data, u8 *code, int count)
 
 	while (count != 0)
 	{
-		vga_wseq(NULL, 0x1C, regval | (code[0] & 4) ? 0x80 : 0);
+		vga_wseq(NULL, 0x1C, regval | (code[0] & 4 ? 0x80 : 0));
 		vga_w(NULL, dac_regs[code[0] & 3], code[1]);
 		count--;
 		code += 2;
-- 
cgit v1.2.3


From 1cc9fb6dbf915e5c7e7e59bb7fab10572ddbb349 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 31 Mar 2009 15:25:35 -0700
Subject: uvesafb: bitwise OR has higher precedence than ?:

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Michal Januszewski <michalj@gmail.com>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/uvesafb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/video/uvesafb.c b/drivers/video/uvesafb.c
index 398fd25e081..c288270e42f 100644
--- a/drivers/video/uvesafb.c
+++ b/drivers/video/uvesafb.c
@@ -1552,7 +1552,7 @@ static void __devinit uvesafb_init_info(struct fb_info *info,
 	}
 
 	info->flags = FBINFO_FLAG_DEFAULT |
-			(par->ypan) ? FBINFO_HWACCEL_YPAN : 0;
+			(par->ypan ? FBINFO_HWACCEL_YPAN : 0);
 
 	if (!par->ypan)
 		info->fbops->fb_pan_display = NULL;
-- 
cgit v1.2.3


From b83734ec0975e1f53420b7a2d454612fc905a9d0 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 31 Mar 2009 15:25:36 -0700
Subject: vesafb: bitwise OR has higher precedence than ?:

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Michal Januszewski <michalj@gmail.com>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/vesafb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c
index e16322d157d..d6856f43d24 100644
--- a/drivers/video/vesafb.c
+++ b/drivers/video/vesafb.c
@@ -438,7 +438,7 @@ static int __init vesafb_probe(struct platform_device *dev)
 	info->var = vesafb_defined;
 	info->fix = vesafb_fix;
 	info->flags = FBINFO_FLAG_DEFAULT |
-		(ypan) ? FBINFO_HWACCEL_YPAN : 0;
+		(ypan ? FBINFO_HWACCEL_YPAN : 0);
 
 	if (!ypan)
 		info->fbops->fb_pan_display = NULL;
-- 
cgit v1.2.3


From 2bd8c47597b2522795f5eb2e61c22dcfec5dfa6a Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 31 Mar 2009 15:25:36 -0700
Subject: viafb: returns 0 two too early

Otherwise this will already return 0 if iteration MAXLOOP-2 occurs in the
first loop.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Joseph Chan <josephchan@via.com.tw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/via/accel.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/video/via/accel.c b/drivers/video/via/accel.c
index 632523ff1fb..45c54bfe99b 100644
--- a/drivers/video/via/accel.c
+++ b/drivers/video/via/accel.c
@@ -267,13 +267,17 @@ int viafb_wait_engine_idle(void)
 	int loop = 0;
 
 	while (!(readl(viaparinfo->io_virt + VIA_REG_STATUS) &
-			VIA_VR_QUEUE_BUSY) && (loop++ < MAXLOOP))
+			VIA_VR_QUEUE_BUSY) && (loop < MAXLOOP)) {
+		loop++;
 		cpu_relax();
+	}
 
 	while ((readl(viaparinfo->io_virt + VIA_REG_STATUS) &
 		    (VIA_CMD_RGTR_BUSY | VIA_2D_ENG_BUSY | VIA_3D_ENG_BUSY)) &&
-		    (loop++ < MAXLOOP))
+		    (loop < MAXLOOP)) {
+		loop++;
 		cpu_relax();
+	}
 
 	return loop >= MAXLOOP;
 }
-- 
cgit v1.2.3


From 4c8714310afbaabd94ac30db1e499a90e4a69c4e Mon Sep 17 00:00:00 2001
From: Herton Ronaldo Krzesinski <herton@mandriva.com.br>
Date: Tue, 31 Mar 2009 15:25:37 -0700
Subject: n411: add missing Makefile entry

There is no entry for n411.c to be built, include one in Makefile.

Signed-off-by: Herton Ronaldo Krzesinski <herton@mandriva.com.br>
Cc: Jaya Kumar <jayakumar.lkml@gmail.com>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/video/Makefile b/drivers/video/Makefile
index bb265eca7d5..77cfa807d16 100644
--- a/drivers/video/Makefile
+++ b/drivers/video/Makefile
@@ -76,6 +76,7 @@ obj-$(CONFIG_FB_ATARI)            += atafb.o c2p_iplan2.o atafb_mfb.o \
                                      atafb_iplan2p2.o atafb_iplan2p4.o atafb_iplan2p8.o
 obj-$(CONFIG_FB_MAC)              += macfb.o
 obj-$(CONFIG_FB_HECUBA)           += hecubafb.o
+obj-$(CONFIG_FB_N411)             += n411.o
 obj-$(CONFIG_FB_HGA)              += hgafb.o
 obj-$(CONFIG_FB_XVR500)           += sunxvr500.o
 obj-$(CONFIG_FB_XVR2500)          += sunxvr2500.o
-- 
cgit v1.2.3


From ec549a0fdc32171b26677f1ef0b5309faa743362 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Tue, 31 Mar 2009 15:25:39 -0700
Subject: fb: add s3c-fb driver for newer Samsung SoC framebuffer devices

Add support for the newer Samsung devices, such as found in the S3C2443,
S3C6400 or S3C6410 series SoC.

It currently does not support all the alpha- or chroma-key options but it
will support more exporting more than one framebuffer ready for adding
overlay and blending functions.

Signed-off-by: Ben Dooks <ben-linux@fluff.org>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/Kconfig  |   24 ++
 drivers/video/Makefile |    1 +
 drivers/video/s3c-fb.c | 1036 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1061 insertions(+)
 create mode 100644 drivers/video/s3c-fb.c

diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index 41c27a44bd8..c19f6feb4e5 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -1920,6 +1920,30 @@ config FB_TMIO_ACCELL
 	depends on FB_TMIO
 	default y
 
+config FB_S3C
+	tristate "Samsung S3C framebuffer support"
+	depends on FB && ARCH_S3C64XX
+	select FB_CFB_FILLRECT
+	select FB_CFB_COPYAREA
+	select FB_CFB_IMAGEBLIT
+	---help---
+	  Frame buffer driver for the built-in FB controller in the Samsung
+	  SoC line from the S3C2443 onwards, including the S3C2416, S3C2450,
+	  and the S3C64XX series such as the S3C6400 and S3C6410.
+
+	  These chips all have the same basic framebuffer design with the
+	  actual capabilities depending on the chip. For instance the S3C6400
+	  and S3C6410 support 4 hardware windows whereas the S3C24XX series
+	  currently only have two.
+
+	  Currently the support is only for the S3C6400 and S3C6410 SoCs.
+
+config FB_S3C_DEBUG_REGWRITE
+       bool "Debug register writes"
+       depends on FB_S3C
+       ---help---
+         Show all register writes via printk(KERN_DEBUG)
+
 config FB_S3C2410
 	tristate "S3C2410 LCD framebuffer support"
 	depends on FB && ARCH_S3C2410
diff --git a/drivers/video/Makefile b/drivers/video/Makefile
index 77cfa807d16..0dbd6c68d76 100644
--- a/drivers/video/Makefile
+++ b/drivers/video/Makefile
@@ -111,6 +111,7 @@ obj-$(CONFIG_FB_BROADSHEET)       += broadsheetfb.o
 obj-$(CONFIG_FB_S1D13XXX)	  += s1d13xxxfb.o
 obj-$(CONFIG_FB_SH7760)		  += sh7760fb.o
 obj-$(CONFIG_FB_IMX)              += imxfb.o
+obj-$(CONFIG_FB_S3C)		  += s3c-fb.o
 obj-$(CONFIG_FB_S3C2410)	  += s3c2410fb.o
 obj-$(CONFIG_FB_FSL_DIU)	  += fsl-diu-fb.o
 obj-$(CONFIG_FB_COBALT)           += cobalt_lcdfb.o
diff --git a/drivers/video/s3c-fb.c b/drivers/video/s3c-fb.c
new file mode 100644
index 00000000000..5e9c6302433
--- /dev/null
+++ b/drivers/video/s3c-fb.c
@@ -0,0 +1,1036 @@
+/* linux/drivers/video/s3c-fb.c
+ *
+ * Copyright 2008 Openmoko Inc.
+ * Copyright 2008 Simtec Electronics
+ *      Ben Dooks <ben@simtec.co.uk>
+ *      http://armlinux.simtec.co.uk/
+ *
+ * Samsung SoC Framebuffer driver
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/dma-mapping.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/clk.h>
+#include <linux/fb.h>
+#include <linux/io.h>
+
+#include <mach/map.h>
+#include <mach/regs-fb.h>
+#include <plat/fb.h>
+
+/* This driver will export a number of framebuffer interfaces depending
+ * on the configuration passed in via the platform data. Each fb instance
+ * maps to a hardware window. Currently there is no support for runtime
+ * setting of the alpha-blending functions that each window has, so only
+ * window 0 is actually useful.
+ *
+ * Window 0 is treated specially, it is used for the basis of the LCD
+ * output timings and as the control for the output power-down state.
+*/
+
+/* note, some of the functions that get called are derived from including
+ * <mach/regs-fb.h> as they are specific to the architecture that the code
+ * is being built for.
+*/
+
+#ifdef CONFIG_FB_S3C_DEBUG_REGWRITE
+#undef writel
+#define writel(v, r) do { \
+	printk(KERN_DEBUG "%s: %08x => %p\n", __func__, (unsigned int)v, r); \
+	__raw_writel(v, r); } while(0)
+#endif /* FB_S3C_DEBUG_REGWRITE */
+
+struct s3c_fb;
+
+/**
+ * struct s3c_fb_win - per window private data for each framebuffer.
+ * @windata: The platform data supplied for the window configuration.
+ * @parent: The hardware that this window is part of.
+ * @fbinfo: Pointer pack to the framebuffer info for this window.
+ * @palette_buffer: Buffer/cache to hold palette entries.
+ * @pseudo_palette: For use in TRUECOLOUR modes for entries 0..15/
+ * @index: The window number of this window.
+ * @palette: The bitfields for changing r/g/b into a hardware palette entry.
+ */
+struct s3c_fb_win {
+	struct s3c_fb_pd_win	*windata;
+	struct s3c_fb		*parent;
+	struct fb_info		*fbinfo;
+	struct s3c_fb_palette	 palette;
+
+	u32			*palette_buffer;
+	u32			 pseudo_palette[16];
+	unsigned int		 index;
+};
+
+/**
+ * struct s3c_fb - overall hardware state of the hardware
+ * @dev: The device that we bound to, for printing, etc.
+ * @regs_res: The resource we claimed for the IO registers.
+ * @bus_clk: The clk (hclk) feeding our interface and possibly pixclk.
+ * @regs: The mapped hardware registers.
+ * @enabled: A bitmask of enabled hardware windows.
+ * @pdata: The platform configuration data passed with the device.
+ * @windows: The hardware windows that have been claimed.
+ */
+struct s3c_fb {
+	struct device		*dev;
+	struct resource		*regs_res;
+	struct clk		*bus_clk;
+	void __iomem		*regs;
+
+	unsigned char		 enabled;
+
+	struct s3c_fb_platdata	*pdata;
+	struct s3c_fb_win	*windows[S3C_FB_MAX_WIN];
+};
+
+/**
+ * s3c_fb_win_has_palette() - determine if a mode has a palette
+ * @win: The window number being queried.
+ * @bpp: The number of bits per pixel to test.
+ *
+ * Work out if the given window supports palletised data at the specified bpp.
+ */
+static int s3c_fb_win_has_palette(unsigned int win, unsigned int bpp)
+{
+	return s3c_fb_win_pal_size(win) <= (1 << bpp);
+}
+
+/**
+ * s3c_fb_check_var() - framebuffer layer request to verify a given mode.
+ * @var: The screen information to verify.
+ * @info: The framebuffer device.
+ *
+ * Framebuffer layer call to verify the given information and allow us to
+ * update various information depending on the hardware capabilities.
+ */
+static int s3c_fb_check_var(struct fb_var_screeninfo *var,
+			    struct fb_info *info)
+{
+	struct s3c_fb_win *win = info->par;
+	struct s3c_fb_pd_win *windata = win->windata;
+	struct s3c_fb *sfb = win->parent;
+
+	dev_dbg(sfb->dev, "checking parameters\n");
+
+	var->xres_virtual = max((unsigned int)windata->virtual_x, var->xres);
+	var->yres_virtual = max((unsigned int)windata->virtual_y, var->yres);
+
+	if (!s3c_fb_validate_win_bpp(win->index, var->bits_per_pixel)) {
+		dev_dbg(sfb->dev, "win %d: unsupported bpp %d\n",
+			win->index, var->bits_per_pixel);
+		return -EINVAL;
+	}
+
+	/* always ensure these are zero, for drop through cases below */
+	var->transp.offset = 0;
+	var->transp.length = 0;
+
+	switch (var->bits_per_pixel) {
+	case 1:
+	case 2:
+	case 4:
+	case 8:
+		if (!s3c_fb_win_has_palette(win->index, var->bits_per_pixel)) {
+			/* non palletised, A:1,R:2,G:3,B:2 mode */
+			var->red.offset		= 4;
+			var->green.offset	= 2;
+			var->blue.offset	= 0;
+			var->red.length		= 5;
+			var->green.length	= 3;
+			var->blue.length	= 2;
+			var->transp.offset	= 7;
+			var->transp.length	= 1;
+		} else {
+			var->red.offset	= 0;
+			var->red.length	= var->bits_per_pixel;
+			var->green	= var->red;
+			var->blue	= var->red;
+		}
+		break;
+
+	case 19:
+		/* 666 with one bit alpha/transparency */
+		var->transp.offset	= 18;
+		var->transp.length	= 1;
+	case 18:
+		var->bits_per_pixel	= 32;
+
+		/* 666 format */
+		var->red.offset		= 12;
+		var->green.offset	= 6;
+		var->blue.offset	= 0;
+		var->red.length		= 6;
+		var->green.length	= 6;
+		var->blue.length	= 6;
+		break;
+
+	case 16:
+		/* 16 bpp, 565 format */
+		var->red.offset		= 11;
+		var->green.offset	= 5;
+		var->blue.offset	= 0;
+		var->red.length		= 5;
+		var->green.length	= 6;
+		var->blue.length	= 5;
+		break;
+
+	case 28:
+	case 25:
+		var->transp.length	= var->bits_per_pixel - 24;
+		var->transp.offset	= 24;
+		/* drop through */
+	case 24:
+		/* our 24bpp is unpacked, so 32bpp */
+		var->bits_per_pixel	= 32;
+	case 32:
+		var->red.offset		= 16;
+		var->red.length		= 8;
+		var->green.offset	= 8;
+		var->green.length	= 8;
+		var->blue.offset	= 0;
+		var->blue.length	= 8;
+		break;
+
+	default:
+		dev_err(sfb->dev, "invalid bpp\n");
+	}
+
+	dev_dbg(sfb->dev, "%s: verified parameters\n", __func__);
+	return 0;
+}
+
+/**
+ * s3c_fb_calc_pixclk() - calculate the divider to create the pixel clock.
+ * @sfb: The hardware state.
+ * @pixclock: The pixel clock wanted, in picoseconds.
+ *
+ * Given the specified pixel clock, work out the necessary divider to get
+ * close to the output frequency.
+ */
+static int s3c_fb_calc_pixclk(struct s3c_fb *sfb, unsigned int pixclk)
+{
+	unsigned long clk = clk_get_rate(sfb->bus_clk);
+	unsigned long long tmp;
+	unsigned int result;
+
+	tmp = (unsigned long long)clk;
+	tmp *= pixclk;
+
+	do_div(tmp, 1000000000UL);
+	result = (unsigned int)tmp / 1000;
+
+	dev_dbg(sfb->dev, "pixclk=%u, clk=%lu, div=%d (%lu)\n",
+		pixclk, clk, result, clk / result);
+
+	return result;
+}
+
+/**
+ * s3c_fb_align_word() - align pixel count to word boundary
+ * @bpp: The number of bits per pixel
+ * @pix: The value to be aligned.
+ *
+ * Align the given pixel count so that it will start on an 32bit word
+ * boundary.
+ */
+static int s3c_fb_align_word(unsigned int bpp, unsigned int pix)
+{
+	int pix_per_word;
+
+	if (bpp > 16)
+		return pix;
+
+	pix_per_word = (8 * 32) / bpp;
+	return ALIGN(pix, pix_per_word);
+}
+
+/**
+ * s3c_fb_set_par() - framebuffer request to set new framebuffer state.
+ * @info: The framebuffer to change.
+ *
+ * Framebuffer layer request to set a new mode for the specified framebuffer
+ */
+static int s3c_fb_set_par(struct fb_info *info)
+{
+	struct fb_var_screeninfo *var = &info->var;
+	struct s3c_fb_win *win = info->par;
+	struct s3c_fb *sfb = win->parent;
+	void __iomem *regs = sfb->regs;
+	int win_no = win->index;
+	u32 data;
+	u32 pagewidth;
+	int clkdiv;
+
+	dev_dbg(sfb->dev, "setting framebuffer parameters\n");
+
+	switch (var->bits_per_pixel) {
+	case 32:
+	case 24:
+	case 16:
+	case 12:
+		info->fix.visual = FB_VISUAL_TRUECOLOR;
+		break;
+	case 8:
+		if (s3c_fb_win_has_palette(win_no, 8))
+			info->fix.visual = FB_VISUAL_PSEUDOCOLOR;
+		else
+			info->fix.visual = FB_VISUAL_TRUECOLOR;
+		break;
+	case 1:
+		info->fix.visual = FB_VISUAL_MONO01;
+		break;
+	default:
+		info->fix.visual = FB_VISUAL_PSEUDOCOLOR;
+		break;
+	}
+
+	info->fix.line_length = (var->xres_virtual * var->bits_per_pixel) / 8;
+
+	/* disable the window whilst we update it */
+	writel(0, regs + WINCON(win_no));
+
+	/* use window 0 as the basis for the lcd output timings */
+
+	if (win_no == 0) {
+		clkdiv = s3c_fb_calc_pixclk(sfb, var->pixclock);
+
+		data = sfb->pdata->vidcon0;
+		data &= ~(VIDCON0_CLKVAL_F_MASK | VIDCON0_CLKDIR);
+
+		if (clkdiv > 1)
+			data |= VIDCON0_CLKVAL_F(clkdiv-1) | VIDCON0_CLKDIR;
+		else
+			data &= ~VIDCON0_CLKDIR;	/* 1:1 clock */
+
+		/* write the timing data to the panel */
+
+		data |= VIDCON0_ENVID | VIDCON0_ENVID_F;
+		writel(data, regs + VIDCON0);
+
+		data = VIDTCON0_VBPD(var->upper_margin - 1) |
+		       VIDTCON0_VFPD(var->lower_margin - 1) |
+		       VIDTCON0_VSPW(var->vsync_len - 1);
+
+		writel(data, regs + VIDTCON0);
+
+		data = VIDTCON1_HBPD(var->left_margin - 1) |
+		       VIDTCON1_HFPD(var->right_margin - 1) |
+		       VIDTCON1_HSPW(var->hsync_len - 1);
+
+		writel(data, regs + VIDTCON1);
+
+		data = VIDTCON2_LINEVAL(var->yres - 1) |
+		       VIDTCON2_HOZVAL(var->xres - 1);
+		writel(data, regs + VIDTCON2);
+	}
+
+	/* write the buffer address */
+
+	writel(info->fix.smem_start, regs + VIDW_BUF_START(win_no));
+
+	data = info->fix.smem_start + info->fix.line_length * var->yres;
+	writel(data, regs + VIDW_BUF_END(win_no));
+
+	pagewidth = (var->xres * var->bits_per_pixel) >> 3;
+	data = VIDW_BUF_SIZE_OFFSET(info->fix.line_length - pagewidth) |
+	       VIDW_BUF_SIZE_PAGEWIDTH(pagewidth);
+	writel(data, regs + VIDW_BUF_SIZE(win_no));
+
+	/* write 'OSD' registers to control position of framebuffer */
+
+	data = VIDOSDxA_TOPLEFT_X(0) | VIDOSDxA_TOPLEFT_Y(0);
+	writel(data, regs + VIDOSD_A(win_no));
+
+	data = VIDOSDxB_BOTRIGHT_X(s3c_fb_align_word(var->bits_per_pixel,
+						     var->xres - 1)) |
+	       VIDOSDxB_BOTRIGHT_Y(var->yres - 1);
+
+	writel(data, regs + VIDOSD_B(win_no));
+
+	data = var->xres * var->yres;
+	if (s3c_fb_has_osd_d(win_no)) {
+		writel(data, regs + VIDOSD_D(win_no));
+		writel(0, regs + VIDOSD_C(win_no));
+	} else
+		writel(data, regs + VIDOSD_C(win_no));
+
+	data = WINCONx_ENWIN;
+
+	/* note, since we have to round up the bits-per-pixel, we end up
+	 * relying on the bitfield information for r/g/b/a to work out
+	 * exactly which mode of operation is intended. */
+
+	switch (var->bits_per_pixel) {
+	case 1:
+		data |= WINCON0_BPPMODE_1BPP;
+		data |= WINCONx_BITSWP;
+		data |= WINCONx_BURSTLEN_4WORD;
+		break;
+	case 2:
+		data |= WINCON0_BPPMODE_2BPP;
+		data |= WINCONx_BITSWP;
+		data |= WINCONx_BURSTLEN_8WORD;
+		break;
+	case 4:
+		data |= WINCON0_BPPMODE_4BPP;
+		data |= WINCONx_BITSWP;
+		data |= WINCONx_BURSTLEN_8WORD;
+		break;
+	case 8:
+		if (var->transp.length != 0)
+			data |= WINCON1_BPPMODE_8BPP_1232;
+		else
+			data |= WINCON0_BPPMODE_8BPP_PALETTE;
+		data |= WINCONx_BURSTLEN_8WORD;
+		data |= WINCONx_BYTSWP;
+		break;
+	case 16:
+		if (var->transp.length != 0)
+			data |= WINCON1_BPPMODE_16BPP_A1555;
+		else
+			data |= WINCON0_BPPMODE_16BPP_565;
+		data |= WINCONx_HAWSWP;
+		data |= WINCONx_BURSTLEN_16WORD;
+		break;
+	case 24:
+	case 32:
+		if (var->red.length == 6) {
+			if (var->transp.length != 0)
+				data |= WINCON1_BPPMODE_19BPP_A1666;
+			else
+				data |= WINCON1_BPPMODE_18BPP_666;
+		} else if (var->transp.length != 0)
+			data |= WINCON1_BPPMODE_25BPP_A1888;
+		else
+			data |= WINCON0_BPPMODE_24BPP_888;
+
+		data |= WINCONx_BURSTLEN_16WORD;
+		break;
+	}
+
+	writel(data, regs + WINCON(win_no));
+	writel(0x0, regs + WINxMAP(win_no));
+
+	return 0;
+}
+
+/**
+ * s3c_fb_update_palette() - set or schedule a palette update.
+ * @sfb: The hardware information.
+ * @win: The window being updated.
+ * @reg: The palette index being changed.
+ * @value: The computed palette value.
+ *
+ * Change the value of a palette register, either by directly writing to
+ * the palette (this requires the palette RAM to be disconnected from the
+ * hardware whilst this is in progress) or schedule the update for later.
+ *
+ * At the moment, since we have no VSYNC interrupt support, we simply set
+ * the palette entry directly.
+ */
+static void s3c_fb_update_palette(struct s3c_fb *sfb,
+				  struct s3c_fb_win *win,
+				  unsigned int reg,
+				  u32 value)
+{
+	void __iomem *palreg;
+	u32 palcon;
+
+	palreg = sfb->regs + s3c_fb_pal_reg(win->index, reg);
+
+	dev_dbg(sfb->dev, "%s: win %d, reg %d (%p): %08x\n",
+		__func__, win->index, reg, palreg, value);
+
+	win->palette_buffer[reg] = value;
+
+	palcon = readl(sfb->regs + WPALCON);
+	writel(palcon | WPALCON_PAL_UPDATE, sfb->regs + WPALCON);
+
+	if (s3c_fb_pal_is16(win->index))
+		writew(value, palreg);
+	else
+		writel(value, palreg);
+
+	writel(palcon, sfb->regs + WPALCON);
+}
+
+static inline unsigned int chan_to_field(unsigned int chan,
+					 struct fb_bitfield *bf)
+{
+	chan &= 0xffff;
+	chan >>= 16 - bf->length;
+	return chan << bf->offset;
+}
+
+/**
+ * s3c_fb_setcolreg() - framebuffer layer request to change palette.
+ * @regno: The palette index to change.
+ * @red: The red field for the palette data.
+ * @green: The green field for the palette data.
+ * @blue: The blue field for the palette data.
+ * @trans: The transparency (alpha) field for the palette data.
+ * @info: The framebuffer being changed.
+ */
+static int s3c_fb_setcolreg(unsigned regno,
+			    unsigned red, unsigned green, unsigned blue,
+			    unsigned transp, struct fb_info *info)
+{
+	struct s3c_fb_win *win = info->par;
+	struct s3c_fb *sfb = win->parent;
+	unsigned int val;
+
+	dev_dbg(sfb->dev, "%s: win %d: %d => rgb=%d/%d/%d\n",
+		__func__, win->index, regno, red, green, blue);
+
+	switch (info->fix.visual) {
+	case FB_VISUAL_TRUECOLOR:
+		/* true-colour, use pseudo-palette */
+
+		if (regno < 16) {
+			u32 *pal = info->pseudo_palette;
+
+			val  = chan_to_field(red,   &info->var.red);
+			val |= chan_to_field(green, &info->var.green);
+			val |= chan_to_field(blue,  &info->var.blue);
+
+			pal[regno] = val;
+		}
+		break;
+
+	case FB_VISUAL_PSEUDOCOLOR:
+		if (regno < s3c_fb_win_pal_size(win->index)) {
+			val  = chan_to_field(red, &win->palette.r);
+			val |= chan_to_field(green, &win->palette.g);
+			val |= chan_to_field(blue, &win->palette.b);
+
+			s3c_fb_update_palette(sfb, win, regno, val);
+		}
+
+		break;
+
+	default:
+		return 1;	/* unknown type */
+	}
+
+	return 0;
+}
+
+/**
+ * s3c_fb_enable() - Set the state of the main LCD output
+ * @sfb: The main framebuffer state.
+ * @enable: The state to set.
+ */
+static void s3c_fb_enable(struct s3c_fb *sfb, int enable)
+{
+	u32 vidcon0 = readl(sfb->regs + VIDCON0);
+
+	if (enable)
+		vidcon0 |= VIDCON0_ENVID | VIDCON0_ENVID_F;
+	else {
+		/* see the note in the framebuffer datasheet about
+		 * why you cannot take both of these bits down at the
+		 * same time. */
+
+		if (!(vidcon0 & VIDCON0_ENVID))
+			return;
+
+		vidcon0 |= VIDCON0_ENVID;
+		vidcon0 &= ~VIDCON0_ENVID_F;
+	}
+
+	writel(vidcon0, sfb->regs + VIDCON0);
+}
+
+/**
+ * s3c_fb_blank() - blank or unblank the given window
+ * @blank_mode: The blank state from FB_BLANK_*
+ * @info: The framebuffer to blank.
+ *
+ * Framebuffer layer request to change the power state.
+ */
+static int s3c_fb_blank(int blank_mode, struct fb_info *info)
+{
+	struct s3c_fb_win *win = info->par;
+	struct s3c_fb *sfb = win->parent;
+	unsigned int index = win->index;
+	u32 wincon;
+
+	dev_dbg(sfb->dev, "blank mode %d\n", blank_mode);
+
+	wincon = readl(sfb->regs + WINCON(index));
+
+	switch (blank_mode) {
+	case FB_BLANK_POWERDOWN:
+		wincon &= ~WINCONx_ENWIN;
+		sfb->enabled &= ~(1 << index);
+		/* fall through to FB_BLANK_NORMAL */
+
+	case FB_BLANK_NORMAL:
+		/* disable the DMA and display 0x0 (black) */
+		writel(WINxMAP_MAP | WINxMAP_MAP_COLOUR(0x0),
+		       sfb->regs + WINxMAP(index));
+		break;
+
+	case FB_BLANK_UNBLANK:
+		writel(0x0, sfb->regs + WINxMAP(index));
+		wincon |= WINCONx_ENWIN;
+		sfb->enabled |= (1 << index);
+		break;
+
+	case FB_BLANK_VSYNC_SUSPEND:
+	case FB_BLANK_HSYNC_SUSPEND:
+	default:
+		return 1;
+	}
+
+	writel(wincon, sfb->regs + WINCON(index));
+
+	/* Check the enabled state to see if we need to be running the
+	 * main LCD interface, as if there are no active windows then
+	 * it is highly likely that we also do not need to output
+	 * anything.
+	 */
+
+	/* We could do something like the following code, but the current
+	 * system of using framebuffer events means that we cannot make
+	 * the distinction between just window 0 being inactive and all
+	 * the windows being down.
+	 *
+	 * s3c_fb_enable(sfb, sfb->enabled ? 1 : 0);
+	*/
+
+	/* we're stuck with this until we can do something about overriding
+	 * the power control using the blanking event for a single fb.
+	 */
+	if (index == 0)
+		s3c_fb_enable(sfb, blank_mode != FB_BLANK_POWERDOWN ? 1 : 0);
+
+	return 0;
+}
+
+static struct fb_ops s3c_fb_ops = {
+	.owner		= THIS_MODULE,
+	.fb_check_var	= s3c_fb_check_var,
+	.fb_set_par	= s3c_fb_set_par,
+	.fb_blank	= s3c_fb_blank,
+	.fb_setcolreg	= s3c_fb_setcolreg,
+	.fb_fillrect	= cfb_fillrect,
+	.fb_copyarea	= cfb_copyarea,
+	.fb_imageblit	= cfb_imageblit,
+};
+
+/**
+ * s3c_fb_alloc_memory() - allocate display memory for framebuffer window
+ * @sfb: The base resources for the hardware.
+ * @win: The window to initialise memory for.
+ *
+ * Allocate memory for the given framebuffer.
+ */
+static int __devinit s3c_fb_alloc_memory(struct s3c_fb *sfb,
+					 struct s3c_fb_win *win)
+{
+	struct s3c_fb_pd_win *windata = win->windata;
+	unsigned int real_size, virt_size, size;
+	struct fb_info *fbi = win->fbinfo;
+	dma_addr_t map_dma;
+
+	dev_dbg(sfb->dev, "allocating memory for display\n");
+
+	real_size = windata->win_mode.xres * windata->win_mode.yres;
+	virt_size = windata->virtual_x * windata->virtual_y;
+
+	dev_dbg(sfb->dev, "real_size=%u (%u.%u), virt_size=%u (%u.%u)\n",
+		real_size, windata->win_mode.xres, windata->win_mode.yres,
+		virt_size, windata->virtual_x, windata->virtual_y);
+
+	size = (real_size > virt_size) ? real_size : virt_size;
+	size *= (windata->max_bpp > 16) ? 32 : windata->max_bpp;
+	size /= 8;
+
+	fbi->fix.smem_len = size;
+	size = PAGE_ALIGN(size);
+
+	dev_dbg(sfb->dev, "want %u bytes for window\n", size);
+
+	fbi->screen_base = dma_alloc_writecombine(sfb->dev, size,
+						  &map_dma, GFP_KERNEL);
+	if (!fbi->screen_base)
+		return -ENOMEM;
+
+	dev_dbg(sfb->dev, "mapped %x to %p\n",
+		(unsigned int)map_dma, fbi->screen_base);
+
+	memset(fbi->screen_base, 0x0, size);
+	fbi->fix.smem_start = map_dma;
+
+	return 0;
+}
+
+/**
+ * s3c_fb_free_memory() - free the display memory for the given window
+ * @sfb: The base resources for the hardware.
+ * @win: The window to free the display memory for.
+ *
+ * Free the display memory allocated by s3c_fb_alloc_memory().
+ */
+static void s3c_fb_free_memory(struct s3c_fb *sfb, struct s3c_fb_win *win)
+{
+	struct fb_info *fbi = win->fbinfo;
+
+	dma_free_writecombine(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len),
+			      fbi->screen_base, fbi->fix.smem_start);
+}
+
+/**
+ * s3c_fb_release_win() - release resources for a framebuffer window.
+ * @win: The window to cleanup the resources for.
+ *
+ * Release the resources that where claimed for the hardware window,
+ * such as the framebuffer instance and any memory claimed for it.
+ */
+static void s3c_fb_release_win(struct s3c_fb *sfb, struct s3c_fb_win *win)
+{
+	fb_dealloc_cmap(&win->fbinfo->cmap);
+	unregister_framebuffer(win->fbinfo);
+	s3c_fb_free_memory(sfb, win);
+}
+
+/**
+ * s3c_fb_probe_win() - register an hardware window
+ * @sfb: The base resources for the hardware
+ * @res: Pointer to where to place the resultant window.
+ *
+ * Allocate and do the basic initialisation for one of the hardware's graphics
+ * windows.
+ */
+static int __devinit s3c_fb_probe_win(struct s3c_fb *sfb, unsigned int win_no,
+				      struct s3c_fb_win **res)
+{
+	struct fb_var_screeninfo *var;
+	struct fb_videomode *initmode;
+	struct s3c_fb_pd_win *windata;
+	struct s3c_fb_win *win;
+	struct fb_info *fbinfo;
+	int palette_size;
+	int ret;
+
+	dev_dbg(sfb->dev, "probing window %d\n", win_no);
+
+	palette_size = s3c_fb_win_pal_size(win_no);
+
+	fbinfo = framebuffer_alloc(sizeof(struct s3c_fb_win) +
+				   palette_size * sizeof(u32), sfb->dev);
+	if (!fbinfo) {
+		dev_err(sfb->dev, "failed to allocate framebuffer\n");
+		return -ENOENT;
+	}
+
+	windata = sfb->pdata->win[win_no];
+	initmode = &windata->win_mode;
+
+	WARN_ON(windata->max_bpp == 0);
+	WARN_ON(windata->win_mode.xres == 0);
+	WARN_ON(windata->win_mode.yres == 0);
+
+	win = fbinfo->par;
+	var = &fbinfo->var;
+	win->fbinfo = fbinfo;
+	win->parent = sfb;
+	win->windata = windata;
+	win->index = win_no;
+	win->palette_buffer = (u32 *)(win + 1);
+
+	ret = s3c_fb_alloc_memory(sfb, win);
+	if (ret) {
+		dev_err(sfb->dev, "failed to allocate display memory\n");
+		goto err_framebuffer;
+	}
+
+	/* setup the r/b/g positions for the window's palette */
+	s3c_fb_init_palette(win_no, &win->palette);
+
+	/* setup the initial video mode from the window */
+	fb_videomode_to_var(&fbinfo->var, initmode);
+
+	fbinfo->fix.type	= FB_TYPE_PACKED_PIXELS;
+	fbinfo->fix.accel	= FB_ACCEL_NONE;
+	fbinfo->var.activate	= FB_ACTIVATE_NOW;
+	fbinfo->var.vmode	= FB_VMODE_NONINTERLACED;
+	fbinfo->var.bits_per_pixel = windata->default_bpp;
+	fbinfo->fbops		= &s3c_fb_ops;
+	fbinfo->flags		= FBINFO_FLAG_DEFAULT;
+	fbinfo->pseudo_palette  = &win->pseudo_palette;
+
+	/* prepare to actually start the framebuffer */
+
+	ret = s3c_fb_check_var(&fbinfo->var, fbinfo);
+	if (ret < 0) {
+		dev_err(sfb->dev, "check_var failed on initial video params\n");
+		goto err_alloc_mem;
+	}
+
+	/* create initial colour map */
+
+	ret = fb_alloc_cmap(&fbinfo->cmap, s3c_fb_win_pal_size(win_no), 1);
+	if (ret == 0)
+		fb_set_cmap(&fbinfo->cmap, fbinfo);
+	else
+		dev_err(sfb->dev, "failed to allocate fb cmap\n");
+
+	s3c_fb_set_par(fbinfo);
+
+	dev_dbg(sfb->dev, "about to register framebuffer\n");
+
+	/* run the check_var and set_par on our configuration. */
+
+	ret = register_framebuffer(fbinfo);
+	if (ret < 0) {
+		dev_err(sfb->dev, "failed to register framebuffer\n");
+		goto err_alloc_mem;
+	}
+
+	*res = win;
+	dev_info(sfb->dev, "window %d: fb %s\n", win_no, fbinfo->fix.id);
+
+	return 0;
+
+err_alloc_mem:
+	s3c_fb_free_memory(sfb, win);
+
+err_framebuffer:
+	unregister_framebuffer(fbinfo);
+	return ret;
+}
+
+/**
+ * s3c_fb_clear_win() - clear hardware window registers.
+ * @sfb: The base resources for the hardware.
+ * @win: The window to process.
+ *
+ * Reset the specific window registers to a known state.
+ */
+static void s3c_fb_clear_win(struct s3c_fb *sfb, int win)
+{
+	void __iomem *regs = sfb->regs;
+
+	writel(0, regs + WINCON(win));
+	writel(0xffffff, regs + WxKEYCONy(win, 0));
+	writel(0xffffff, regs + WxKEYCONy(win, 1));
+
+	writel(0, regs + VIDOSD_A(win));
+	writel(0, regs + VIDOSD_B(win));
+	writel(0, regs + VIDOSD_C(win));
+}
+
+static int __devinit s3c_fb_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct s3c_fb_platdata *pd;
+	struct s3c_fb *sfb;
+	struct resource *res;
+	int win;
+	int ret = 0;
+
+	pd = pdev->dev.platform_data;
+	if (!pd) {
+		dev_err(dev, "no platform data specified\n");
+		return -EINVAL;
+	}
+
+	sfb = kzalloc(sizeof(struct s3c_fb), GFP_KERNEL);
+	if (!sfb) {
+		dev_err(dev, "no memory for framebuffers\n");
+		return -ENOMEM;
+	}
+
+	sfb->dev = dev;
+	sfb->pdata = pd;
+
+	sfb->bus_clk = clk_get(dev, "lcd");
+	if (IS_ERR(sfb->bus_clk)) {
+		dev_err(dev, "failed to get bus clock\n");
+		goto err_sfb;
+	}
+
+	clk_enable(sfb->bus_clk);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(dev, "failed to find registers\n");
+		ret = -ENOENT;
+		goto err_clk;
+	}
+
+	sfb->regs_res = request_mem_region(res->start, resource_size(res),
+					   dev_name(dev));
+	if (!sfb->regs_res) {
+		dev_err(dev, "failed to claim register region\n");
+		ret = -ENOENT;
+		goto err_clk;
+	}
+
+	sfb->regs = ioremap(res->start, resource_size(res));
+	if (!sfb->regs) {
+		dev_err(dev, "failed to map registers\n");
+		ret = -ENXIO;
+		goto err_req_region;
+	}
+
+	dev_dbg(dev, "got resources (regs %p), probing windows\n", sfb->regs);
+
+	/* setup gpio and output polarity controls */
+
+	pd->setup_gpio();
+
+	writel(pd->vidcon1, sfb->regs + VIDCON1);
+
+	/* zero all windows before we do anything */
+
+	for (win = 0; win < S3C_FB_MAX_WIN; win++)
+		s3c_fb_clear_win(sfb, win);
+
+	/* we have the register setup, start allocating framebuffers */
+
+	for (win = 0; win < S3C_FB_MAX_WIN; win++) {
+		if (!pd->win[win])
+			continue;
+
+		ret = s3c_fb_probe_win(sfb, win, &sfb->windows[win]);
+		if (ret < 0) {
+			dev_err(dev, "failed to create window %d\n", win);
+			for (; win >= 0; win--)
+				s3c_fb_release_win(sfb, sfb->windows[win]);
+			goto err_ioremap;
+		}
+	}
+
+	platform_set_drvdata(pdev, sfb);
+
+	return 0;
+
+err_ioremap:
+	iounmap(sfb->regs);
+
+err_req_region:
+	release_resource(sfb->regs_res);
+	kfree(sfb->regs_res);
+
+err_clk:
+	clk_disable(sfb->bus_clk);
+	clk_put(sfb->bus_clk);
+
+err_sfb:
+	kfree(sfb);
+	return ret;
+}
+
+/**
+ * s3c_fb_remove() - Cleanup on module finalisation
+ * @pdev: The platform device we are bound to.
+ *
+ * Shutdown and then release all the resources that the driver allocated
+ * on initialisation.
+ */
+static int __devexit s3c_fb_remove(struct platform_device *pdev)
+{
+	struct s3c_fb *sfb = platform_get_drvdata(pdev);
+	int win;
+
+	for (win = 0; win <= S3C_FB_MAX_WIN; win++)
+		s3c_fb_release_win(sfb, sfb->windows[win]);
+
+	iounmap(sfb->regs);
+
+	clk_disable(sfb->bus_clk);
+	clk_put(sfb->bus_clk);
+
+	release_resource(sfb->regs_res);
+	kfree(sfb->regs_res);
+
+	kfree(sfb);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int s3c_fb_suspend(struct platform_device *pdev, pm_message_t state)
+{
+	struct s3c_fb *sfb = platform_get_drvdata(pdev);
+	struct s3c_fb_win *win;
+	int win_no;
+
+	for (win_no = S3C_FB_MAX_WIN; win_no >= 0; win_no--) {
+		win = sfb->windows[win_no];
+		if (!win)
+			continue;
+
+		/* use the blank function to push into power-down */
+		s3c_fb_blank(FB_BLANK_POWERDOWN, win->fbinfo);
+	}
+
+	clk_disable(sfb->bus_clk);
+	return 0;
+}
+
+static int s3c_fb_resume(struct platform_device *pdev)
+{
+	struct s3c_fb *sfb = platform_get_drvdata(pdev);
+	struct s3c_fb_win *win;
+	int win_no;
+
+	clk_enable(sfb->bus_clk);
+
+	for (win_no = 0; win_no < S3C_FB_MAX_WIN; win_no++) {
+		win = sfb->windows[win_no];
+		if (!win)
+			continue;
+
+		dev_dbg(&pdev->dev, "resuming window %d\n", win_no);
+		s3c_fb_set_par(win->fbinfo);
+	}
+
+	return 0;
+}
+#else
+#define s3c_fb_suspend NULL
+#define s3c_fb_resume  NULL
+#endif
+
+static struct platform_driver s3c_fb_driver = {
+	.probe		= s3c_fb_probe,
+	.remove		= s3c_fb_remove,
+	.suspend	= s3c_fb_suspend,
+	.resume		= s3c_fb_resume,
+	.driver		= {
+		.name	= "s3c-fb",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init s3c_fb_init(void)
+{
+	return platform_driver_register(&s3c_fb_driver);
+}
+
+static void __exit s3c_fb_cleanup(void)
+{
+	platform_driver_unregister(&s3c_fb_driver);
+}
+
+module_init(s3c_fb_init);
+module_exit(s3c_fb_cleanup);
+
+MODULE_AUTHOR("Ben Dooks <ben@simtec.co.uk>");
+MODULE_DESCRIPTION("Samsung S3C SoC Framebuffer driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:s3c-fb");
-- 
cgit v1.2.3


From ddb53d48da5b0e691f35e703ac29118747f86c99 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:40 -0700
Subject: fbdev: remove cyblafb driver

A tridentfb driver has all the functionality of the cyblafb driver without
the bugs of the latter.

Changes to the tridentfb driver:

- FBINFO_READS_FAST added to the tridentfb.  The cyblafb used a blitter
  for scrolling which is faster than color expansion on Cyberblade
  chipsets.  The blitter is slower on a discrete Blade3D core.  Use the
  blitter for scrolling in the tridentfb only for integrated Blade3D
  cores.  Now, scrolling speed is about equal for the tridentfb and the
  cyblafb.

- a copyright notice addition is done on request of Jani Monoses (the
  first author of the tridentfb).

Tested on AGP Blade3D card and PCChips
M787CLR motherboard: VIA C3 cpu +
VT8601 north  bridge (aka Cyberblade/i1).

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Cc: "Jani Monoses" <jani@ubuntu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/fb/00-INDEX              |    2 -
 Documentation/fb/cyblafb/bugs          |   13 -
 Documentation/fb/cyblafb/credits       |    7 -
 Documentation/fb/cyblafb/documentation |   17 -
 Documentation/fb/cyblafb/fb.modes      |  154 ---
 Documentation/fb/cyblafb/performance   |   79 --
 Documentation/fb/cyblafb/todo          |   31 -
 Documentation/fb/cyblafb/usage         |  217 ----
 Documentation/fb/cyblafb/whatsnew      |   29 -
 Documentation/fb/cyblafb/whycyblafb    |   85 --
 drivers/video/Kconfig                  |   28 +-
 drivers/video/cyblafb.c                | 1683 --------------------------------
 drivers/video/tridentfb.c              |    6 +-
 13 files changed, 7 insertions(+), 2344 deletions(-)
 delete mode 100644 Documentation/fb/cyblafb/bugs
 delete mode 100644 Documentation/fb/cyblafb/credits
 delete mode 100644 Documentation/fb/cyblafb/documentation
 delete mode 100644 Documentation/fb/cyblafb/fb.modes
 delete mode 100644 Documentation/fb/cyblafb/performance
 delete mode 100644 Documentation/fb/cyblafb/todo
 delete mode 100644 Documentation/fb/cyblafb/usage
 delete mode 100644 Documentation/fb/cyblafb/whatsnew
 delete mode 100644 Documentation/fb/cyblafb/whycyblafb
 delete mode 100644 drivers/video/cyblafb.c

diff --git a/Documentation/fb/00-INDEX b/Documentation/fb/00-INDEX
index caabbd395e6..a618fd99c9f 100644
--- a/Documentation/fb/00-INDEX
+++ b/Documentation/fb/00-INDEX
@@ -11,8 +11,6 @@ aty128fb.txt
 	- info on the ATI Rage128 frame buffer driver.
 cirrusfb.txt
 	- info on the driver for Cirrus Logic chipsets.
-cyblafb/
-	- directory with documentation files related to the cyblafb driver.
 deferred_io.txt
 	- an introduction to deferred IO.
 fbcon.txt
diff --git a/Documentation/fb/cyblafb/bugs b/Documentation/fb/cyblafb/bugs
deleted file mode 100644
index 9443a6d72cd..00000000000
--- a/Documentation/fb/cyblafb/bugs
+++ /dev/null
@@ -1,13 +0,0 @@
-Bugs
-====
-
-I currently don't know of any bug. Please do send reports to:
- - linux-fbdev-devel@lists.sourceforge.net
- - Knut_Petersen@t-online.de.
-
-
-Untested features
-=================
-
-All LCD stuff is untested. If it worked in tridentfb, it should work in
-cyblafb. Please test and report the results to Knut_Petersen@t-online.de.
diff --git a/Documentation/fb/cyblafb/credits b/Documentation/fb/cyblafb/credits
deleted file mode 100644
index 0eb3b443dc2..00000000000
--- a/Documentation/fb/cyblafb/credits
+++ /dev/null
@@ -1,7 +0,0 @@
-Thanks to
-=========
-   * 	Alan Hourihane, for writing the X trident driver
-   *	Jani Monoses, for writing the tridentfb driver
-   *	Antonino A. Daplas, for review of the first published
-	version of cyblafb and some code
-   *	Jochen Hein, for testing and a helpfull bug report
diff --git a/Documentation/fb/cyblafb/documentation b/Documentation/fb/cyblafb/documentation
deleted file mode 100644
index bb1aac04842..00000000000
--- a/Documentation/fb/cyblafb/documentation
+++ /dev/null
@@ -1,17 +0,0 @@
-Available Documentation
-=======================
-
-Apollo PLE 133 Chipset VT8601A North Bridge Datasheet, Rev. 1.82, October 22,
-2001, available from VIA:
-
-	http://www.viavpsd.com/product/6/15/DS8601A182.pdf
-
-The datasheet is incomplete, some registers that need to be programmed are not
-explained at all and important bits are listed as "reserved". But you really
-need the datasheet to understand the code.  "p. xxx" comments refer to page
-numbers of this document.
-
-XFree/XOrg drivers are available and of good quality, looking at the code
-there is a good idea if the datasheet does not provide enough information
-or if the datasheet seems to be wrong.
-
diff --git a/Documentation/fb/cyblafb/fb.modes b/Documentation/fb/cyblafb/fb.modes
deleted file mode 100644
index fe0e5223ba8..00000000000
--- a/Documentation/fb/cyblafb/fb.modes
+++ /dev/null
@@ -1,154 +0,0 @@
-#
-#   Sample fb.modes file
-#
-#	Provides an incomplete list of working modes for
-#	the cyberblade/i1 graphics core.
-#
-#	The value 4294967256 is used instead of -40. Of course, -40 is not
-#	a really reasonable value, but chip design does not always follow
-#	logic. Believe me, it's ok, and it's the way the BIOS does it.
-#
-#	fbset requires 4294967256 in fb.modes and -40 as an argument to
-#	the -t parameter. That's also not too reasonable, and it might change
-#	in the future or might even be differt for your current version.
-#
-
-mode "640x480-50"
-    geometry 640 480 2048 4096 8
-    timings 47619 4294967256 24 17 0 216 3
-endmode
-
-mode "640x480-60"
-    geometry 640 480 2048 4096 8
-    timings 39682 4294967256 24 17 0 216 3
-endmode
-
-mode "640x480-70"
-    geometry 640 480 2048 4096 8
-    timings 34013 4294967256 24 17 0 216 3
-endmode
-
-mode "640x480-72"
-    geometry 640 480 2048 4096 8
-    timings 33068 4294967256 24 17 0 216 3
-endmode
-
-mode "640x480-75"
-    geometry 640 480 2048 4096 8
-    timings 31746 4294967256 24 17 0 216 3
-endmode
-
-mode "640x480-80"
-    geometry 640 480 2048 4096 8
-    timings 29761 4294967256 24 17 0 216 3
-endmode
-
-mode "640x480-85"
-    geometry 640 480 2048 4096 8
-    timings 28011 4294967256 24 17 0 216 3
-endmode
-
-mode "800x600-50"
-    geometry 800 600 2048 4096 8
-    timings 30303 96 24 14 0 136 11
-endmode
-
-mode "800x600-60"
-    geometry 800 600 2048 4096 8
-    timings 25252 96 24 14 0 136 11
-endmode
-
-mode "800x600-70"
-    geometry 800 600 2048 4096 8
-    timings 21645 96 24 14 0 136 11
-endmode
-
-mode "800x600-72"
-    geometry 800 600 2048 4096 8
-    timings 21043 96 24 14 0 136 11
-endmode
-
-mode "800x600-75"
-    geometry 800 600 2048 4096 8
-    timings 20202 96 24 14 0 136 11
-endmode
-
-mode "800x600-80"
-    geometry 800 600 2048 4096 8
-    timings 18939 96 24 14 0 136 11
-endmode
-
-mode "800x600-85"
-    geometry 800 600 2048 4096 8
-    timings 17825 96 24 14 0 136 11
-endmode
-
-mode "1024x768-50"
-    geometry 1024 768 2048 4096 8
-    timings 19054 144 24 29 0 120 3
-endmode
-
-mode "1024x768-60"
-    geometry 1024 768 2048 4096 8
-    timings 15880 144 24 29 0 120 3
-endmode
-
-mode "1024x768-70"
-    geometry 1024 768 2048 4096 8
-    timings 13610 144 24 29 0 120 3
-endmode
-
-mode "1024x768-72"
-    geometry 1024 768 2048 4096 8
-    timings 13232 144 24 29 0 120 3
-endmode
-
-mode "1024x768-75"
-    geometry 1024 768 2048 4096 8
-    timings 12703 144 24 29 0 120 3
-endmode
-
-mode "1024x768-80"
-    geometry 1024 768 2048 4096 8
-    timings 11910 144 24 29 0 120 3
-endmode
-
-mode "1024x768-85"
-    geometry 1024 768 2048 4096 8
-    timings 11209 144 24 29 0 120 3
-endmode
-
-mode "1280x1024-50"
-    geometry 1280 1024 2048 4096 8
-    timings 11114 232 16 39 0 160 3
-endmode
-
-mode "1280x1024-60"
-    geometry 1280 1024 2048 4096 8
-    timings 9262 232 16 39 0 160 3
-endmode
-
-mode "1280x1024-70"
-    geometry 1280 1024 2048 4096 8
-    timings 7939 232 16 39 0 160 3
-endmode
-
-mode "1280x1024-72"
-    geometry 1280 1024 2048 4096 8
-    timings 7719 232 16 39 0 160 3
-endmode
-
-mode "1280x1024-75"
-    geometry 1280 1024 2048 4096 8
-    timings 7410 232 16 39 0 160 3
-endmode
-
-mode "1280x1024-80"
-    geometry 1280 1024 2048 4096 8
-    timings 6946 232 16 39 0 160 3
-endmode
-
-mode "1280x1024-85"
-    geometry 1280 1024 2048 4096 8
-    timings 6538 232 16 39 0 160 3
-endmode
diff --git a/Documentation/fb/cyblafb/performance b/Documentation/fb/cyblafb/performance
deleted file mode 100644
index 8d15d5dfc6b..00000000000
--- a/Documentation/fb/cyblafb/performance
+++ /dev/null
@@ -1,79 +0,0 @@
-Speed
-=====
-
-CyBlaFB is much faster than tridentfb and vesafb. Compare the performance data
-for mode 1280x1024-[8,16,32]@61 Hz.
-
-Test 1: Cat a file with 2000 lines of 0 characters.
-Test 2: Cat a file with 2000 lines of 80 characters.
-Test 3: Cat a file with 2000 lines of 160 characters.
-
-All values show system time use in seconds, kernel 2.6.12 was used for
-the measurements. 2.6.13 is a bit slower, 2.6.14 hopefully will include a
-patch that speeds up kernel bitblitting a lot ( > 20%).
-
-+-----------+-----------------------------------------------------+
-|	    |			not accelerated 		  |
-| TRIDENTFB +-----------------+-----------------+-----------------+
-| of 2.6.12 |	   8 bpp      |     16 bpp	|     32 bpp	  |
-|	    | noypan |	 ypan | noypan |   ypan | noypan |   ypan |
-+-----------+--------+--------+--------+--------+--------+--------+
-|    Test 1 |	4.31 |	 4.33 |   6.05 |  12.81 |  ----  |  ----  |
-|    Test 2 |  67.94 |	 5.44 | 123.16 |  14.79 |  ----  |  ----  |
-|    Test 3 | 131.36 |	 6.55 | 240.12 |  16.76 |  ----  |  ----  |
-+-----------+--------+--------+--------+--------+--------+--------+
-|  Comments |		      | 		| completely bro- |
-|	    |		      | 		| ken, monitor	  |
-|	    |		      | 		| switches off	  |
-+-----------+-----------------+-----------------+-----------------+
-
-
-+-----------+-----------------------------------------------------+
-|	    |			  accelerated			  |
-| TRIDENTFB +-----------------+-----------------+-----------------+
-| of 2.6.12 |	   8 bpp      |     16 bpp	|     32 bpp	  |
-|	    | noypan |	 ypan | noypan |   ypan | noypan |   ypan |
-+-----------+--------+--------+--------+--------+--------+--------+
-|    Test 1 |  ----  |	----  |  20.62 |   1.22 |  ----  |  ----  |
-|    Test 2 |  ----  |	----  |  22.61 |   3.19 |  ----  |  ----  |
-|    Test 3 |  ----  |	----  |  24.59 |   5.16 |  ----  |  ----  |
-+-----------+--------+--------+--------+--------+--------+--------+
-|  Comments | broken, writing | broken, ok only | completely bro- |
-|	    | to wrong places | if bgcolor is	| ken, monitor	  |
-|	    | on screen + bug | black, bug in	| switches off	  |
-|	    | in fillrect()   | fillrect()	|		  |
-+-----------+-----------------+-----------------+-----------------+
-
-
-+-----------+-----------------------------------------------------+
-|	    |			not accelerated 		  |
-|   VESAFB  +-----------------+-----------------+-----------------+
-| of 2.6.12 |	   8 bpp      |     16 bpp	|     32 bpp	  |
-|	    | noypan |	 ypan | noypan |   ypan | noypan |   ypan |
-+-----------+--------+--------+--------+--------+--------+--------+
-|    Test 1 |	4.26 |	 3.76 |   5.99 |   7.23 |  ----  |  ----  |
-|    Test 2 |  65.65 |	 4.89 | 120.88 |   9.08 |  ----  |  ----  |
-|    Test 3 | 126.91 |	 5.94 | 235.77 |  11.03 |  ----  |  ----  |
-+-----------+--------+--------+--------+--------+--------+--------+
-|  Comments | vga=0x307       | vga=0x31a	| vga=0x31b not   |
-|	    | fh=80kHz	      | fh=80kHz	| supported by	  |
-|	    | fv=75kHz	      | fv=75kHz	| video BIOS and  |
-|	    |		      | 		| hardware	  |
-+-----------+-----------------+-----------------+-----------------+
-
-
-+-----------+-----------------------------------------------------+
-|	    |			  accelerated			  |
-|  CYBLAFB  +-----------------+-----------------+-----------------+
-|	    |	   8 bpp      |     16 bpp	|     32 bpp	  |
-|	    | noypan |	 ypan | noypan |   ypan | noypan |   ypan |
-+-----------+--------+--------+--------+--------+--------+--------+
-|    Test 1 |	8.02 |	 0.23 |  19.04 |   0.61 |  57.12 |   2.74 |
-|    Test 2 |	8.38 |	 0.55 |  19.39 |   0.92 |  57.54 |   3.13 |
-|    Test 3 |	8.73 |	 0.86 |  19.74 |   1.24 |  57.95 |   3.51 |
-+-----------+--------+--------+--------+--------+--------+--------+
-|  Comments |		      | 		|		  |
-|	    |		      | 		|		  |
-|	    |		      | 		|		  |
-|	    |		      | 		|		  |
-+-----------+-----------------+-----------------+-----------------+
diff --git a/Documentation/fb/cyblafb/todo b/Documentation/fb/cyblafb/todo
deleted file mode 100644
index c5f6d0eae54..00000000000
--- a/Documentation/fb/cyblafb/todo
+++ /dev/null
@@ -1,31 +0,0 @@
-TODO / Missing features
-=======================
-
-Verify LCD stuff		"stretch" and "center" options are
-				completely untested ... this code needs to be
-				verified. As I don't have access to such
-				hardware, please contact me if you are
-				willing run some tests.
-
-Interlaced video modes		The reason that interleaved
-				modes are disabled is that I do not know
-				the meaning of the vertical interlace
-				parameter. Also the datasheet mentions a
-				bit d8 of a horizontal interlace parameter,
-				but nowhere the lower 8 bits. Please help
-				if you can.
-
-low-res double scan modes	Who needs it?
-
-accelerated color blitting	Who needs it? The console driver does use color
-				blitting for nothing but drawing the penguine,
-				everything else is done using color expanding
-				blitting of 1bpp character bitmaps.
-
-ioctls				Who needs it?
-
-TV-out				Will be done later. Use "vga= " at boot time
-				to set a suitable video mode.
-
-???				Feel free to contact me if you have any
-				feature requests
diff --git a/Documentation/fb/cyblafb/usage b/Documentation/fb/cyblafb/usage
deleted file mode 100644
index a39bb3d402a..00000000000
--- a/Documentation/fb/cyblafb/usage
+++ /dev/null
@@ -1,217 +0,0 @@
-CyBlaFB is a framebuffer driver for the Cyberblade/i1 graphics core integrated
-into the VIA Apollo PLE133 (aka vt8601) south bridge. It is developed and
-tested using a VIA EPIA 5000 board.
-
-Cyblafb - compiled into the kernel or as a module?
-==================================================
-
-You might compile cyblafb either as a module or compile it permanently into the
-kernel.
-
-Unless you have a real reason to do so you should not compile both vesafb and
-cyblafb permanently into the kernel. It's possible and it helps during the
-developement cycle, but it's useless and will at least block some otherwise
-usefull memory for ordinary users.
-
-Selecting Modes
-===============
-
-	Startup Mode
-	============
-
-	First of all, you might use the "vga=???" boot parameter as it is
-	documented in vesafb.txt and svga.txt. Cyblafb will detect the video
-	mode selected and will use the geometry and timings found by
-	inspecting the hardware registers.
-
-		video=cyblafb vga=0x317
-
-	Alternatively you might use a combination of the mode, ref and bpp
-	parameters. If you compiled the driver into the kernel, add something
-	like this to the kernel command line:
-
-		video=cyblafb:1280x1024,bpp=16,ref=50 ...
-
-	If you compiled the driver as a module, the same mode would be
-	selected by the following command:
-
-		modprobe cyblafb mode=1280x1024 bpp=16 ref=50 ...
-
-	None of the modes possible to select as startup modes are affected by
-	the problems described at the end of the next subsection.
-
-	For all startup modes cyblafb chooses a virtual x resolution of 2048,
-	the only exception is mode 1280x1024 in combination with 32 bpp. This
-	allows ywrap scrolling for all those modes if rotation is 0 or 2, and
-	also fast scrolling if rotation is 1 or 3. The default virtual y reso-
-	lution is 4096 for bpp == 8, 2048 for bpp==16 and 1024 for bpp == 32,
-	again with the only exception of 1280x1024 at 32 bpp.
-
-	Please do set your video memory size to 8 Mb in the Bios setup. Other
-	values will work, but performace is decreased for a lot of modes.
-
-	Mode changes using fbset
-	========================
-
-	You might use fbset to change the video mode, see "man fbset". Cyblafb
-	generally does assume that you know what you are doing. But it does
-	some checks, especially those that are needed to prevent you from
-	damaging your hardware.
-
-		- only 8, 16, 24 and 32 bpp video modes are accepted
-		- interlaced video modes are not accepted
-		- double scan video modes are not accepted
-		- if a flat panel is found, cyblafb does not allow you
-		  to program a resolution higher than the physical
-		  resolution of the flat panel monitor
-		- cyblafb does not allow vclk to exceed 230 MHz. As 32 bpp
-		  and (currently) 24 bit modes use a doubled vclk internally,
-		  the dotclock limit as seen by fbset is 115 MHz for those
-		  modes and 230 MHz for 8 and 16 bpp modes.
-		- cyblafb will allow you to select very high resolutions as
-		  long as the hardware can be programmed to these modes. The
-		  documented limit 1600x1200 is not enforced, but don't expect
-		  perfect signal quality.
-
-	Any request that violates the rules given above will be either changed
-	to something the hardware supports or an error value will be returned.
-
-	If you program a virtual y resolution higher than the hardware limit,
-	cyblafb will silently decrease that value to the highest possible
-	value. The same is true for a virtual x resolution that is not
-	supported by the hardware. Cyblafb tries to adapt vyres first because
-	vxres decides if ywrap scrolling is possible or not.
-
-	Attempts to disable acceleration are ignored, I believe that this is
-	safe.
-
-	Some video modes that should work do not work as expected. If you use
-	the standard fb.modes, fbset 640x480-60 will program that mode, but
-	you will see a vertical area, about two characters wide, with only
-	much darker characters than the other characters on the screen.
-	Cyblafb does allow that mode to be set, as it does not violate the
-	official specifications. It would need a lot of code to reliably sort
-	out all invalid modes, playing around with the margin values will
-	give a valid mode quickly. And if cyblafb would detect such an invalid
-	mode, should it silently alter the requested values or should it
-	report an error? Both options have some pros and cons. As stated
-	above, none of the startup modes are affected, and if you set
-	verbosity to 1 or higher, cyblafb will print the fbset command that
-	would be needed to program that mode using fbset.
-
-
-Other Parameters
-================
-
-
-crt		don't autodetect, assume monitor connected to
-		standard VGA connector
-
-fp		don't autodetect, assume flat panel display
-		connected to flat panel monitor interface
-
-nativex 	inform driver about native x resolution of
-		flat panel monitor connected to special
-		interface (should be autodetected)
-
-stretch 	stretch image to adapt low resolution modes to
-		higer resolutions of flat panel monitors
-		connected to special interface
-
-center		center image to adapt low resolution modes to
-		higer resolutions of flat panel monitors
-		connected to special interface
-
-memsize 	use if autodetected memsize is wrong ...
-		should never be necessary
-
-nopcirr 	disable PCI read retry
-nopciwr 	disable PCI write retry
-nopcirb 	disable PCI read bursts
-nopciwb 	disable PCI write bursts
-
-bpp		bpp for specified modes
-		valid values: 8 || 16 || 24 || 32
-
-ref		refresh rate for specified mode
-		valid values: 50 <= ref <= 85
-
-mode		640x480 or 800x600 or 1024x768 or 1280x1024
-		if not specified, the startup mode will be detected
-		and used, so you might also use the vga=??? parameter
-		described in vesafb.txt. If you do not specify a mode,
-		bpp and ref parameters are ignored.
-
-verbosity	0 is the default, increase to at least 2 for every
-		bug report!
-
-Development hints
-=================
-
-It's much faster do compile a module and to load the new version after
-unloading the old module than to compile a new kernel and to reboot. So if you
-try to work on cyblafb, it might be a good idea to use cyblafb as a module.
-In real life, fast often means dangerous, and that's also the case here. If
-you introduce a serious bug when cyblafb is compiled into the kernel, the
-kernel will lock or oops with a high probability before the file system is
-mounted, and the danger for your data is low. If you load a broken own version
-of cyblafb on a running system, the danger for the integrity of the file
-system is much higher as you might need a hard reset afterwards. Decide
-yourself.
-
-Module unloading, the vfb method
-================================
-
-If you want to unload/reload cyblafb using the virtual framebuffer, you need
-to enable vfb support in the kernel first. After that, load the modules as
-shown below:
-
-	modprobe vfb vfb_enable=1
-	modprobe fbcon
-	modprobe cyblafb
-	fbset -fb /dev/fb1 1280x1024-60 -vyres 2662
-	con2fb /dev/fb1 /dev/tty1
-	...
-
-If you now made some changes to cyblafb and want to reload it, you might do it
-as show below:
-
-	con2fb /dev/fb0 /dev/tty1
-	...
-	rmmod cyblafb
-	modprobe cyblafb
-	con2fb /dev/fb1 /dev/tty1
-	...
-
-Of course, you might choose another mode, and most certainly you also want to
-map some other /dev/tty* to the real framebuffer device. You might also choose
-to compile fbcon as a kernel module or place it permanently in the kernel.
-
-I do not know of any way to unload fbcon, and fbcon will prevent the
-framebuffer device loaded first from unloading. [If there is a way, then
-please add a description here!]
-
-Module unloading, the vesafb method
-===================================
-
-Configure the kernel:
-
-	<*> Support for frame buffer devices
-	[*]   VESA VGA graphics support
-	<M>   Cyberblade/i1 support
-
-Add e.g. "video=vesafb:ypan vga=0x307" to the kernel parameters. The ypan
-parameter is important, choose any vga parameter you like as long as it is
-a graphics mode.
-
-After booting, load cyblafb without any mode and bpp parameter and assign
-cyblafb to individual ttys using con2fb, e.g.:
-
-	modprobe cyblafb
-	con2fb /dev/fb1 /dev/tty1
-
-Unloading cyblafb works without problems after you assign vesafb to all
-ttys again, e.g.:
-
-	con2fb /dev/fb0 /dev/tty1
-	rmmod cyblafb
diff --git a/Documentation/fb/cyblafb/whatsnew b/Documentation/fb/cyblafb/whatsnew
deleted file mode 100644
index 76c07a26e04..00000000000
--- a/Documentation/fb/cyblafb/whatsnew
+++ /dev/null
@@ -1,29 +0,0 @@
-0.62
-====
-
-      - the vesafb parameter has been removed as I decided to allow the
-      	feature without any special parameter.
-
-      - Cyblafb does not use the vga style of panning any longer, now the
-      	"right view" register in the graphics engine IO space is used. Without
-	that change it was impossible to use all available memory, and without
-	access to all available memory it is impossible to ywrap.
-
-      - The imageblit function now uses hardware acceleration for all font
-        widths. Hardware blitting across pixel column 2048 is broken in the
-	cyberblade/i1 graphics core, but we work around that hardware bug.
-
-      - modes with vxres != xres are supported now.
-
-      - ywrap scrolling is supported now and the default. This is a big
-        performance gain.
-
-      - default video modes use vyres > yres and vxres > xres to allow
-        almost optimal scrolling speed for normal and rotated screens
-
-      - some features mainly usefull for debugging the upper layers of the
-        framebuffer system have been added, have a look at the code
-
-      - fixed: Oops after unloading cyblafb when reading /proc/io*
-
-      - we work around some bugs of the higher framebuffer layers.
diff --git a/Documentation/fb/cyblafb/whycyblafb b/Documentation/fb/cyblafb/whycyblafb
deleted file mode 100644
index a123bc11e69..00000000000
--- a/Documentation/fb/cyblafb/whycyblafb
+++ /dev/null
@@ -1,85 +0,0 @@
-I tried the following framebuffer drivers:
-
-	- TRIDENTFB is full of bugs. Acceleration is broken for Blade3D
-	  graphics cores like the cyberblade/i1. It claims to support a great
-	  number of devices, but documentation for most of these devices is
-	  unfortunately not available. There is _no_ reason to use tridentfb
-	  for cyberblade/i1 + CRT users. VESAFB is faster, and the one
-	  advantage, mode switching, is broken in tridentfb.
-
-	- VESAFB is used by many distributions as a standard. Vesafb does
-	  not support mode switching. VESAFB is a bit faster than the working
-	  configurations of TRIDENTFB, but it is still too slow, even if you
-	  use ypan.
-
-	- EPIAFB (you'll find it on sourceforge) supports the Cyberblade/i1
-	  graphics core, but it still has serious bugs and developement seems
-	  to have stopped. This is the one driver with TV-out support. If you
-	  do need this feature, try epiafb.
-
-None of these drivers was a real option for me.
-
-I believe that is unreasonable to change code that announces to support 20
-devices if I only have more or less sufficient documentation for exactly one
-of these. The risk of breaking device foo while fixing device bar is too high.
-
-So I decided to start CyBlaFB as a stripped down tridentfb.
-
-All code specific to other Trident chips has been removed. After that there
-were a lot of cosmetic changes to increase the readability of the code. All
-register names were changed to those mnemonics used in the datasheet. Function
-and macro names were changed if they hindered easy understanding of the code.
-
-After that I debugged the code and implemented some new features. I'll try to
-give a little summary of the main changes:
-
-	- calculation of vertical and horizontal timings was fixed
-
-	- video signal quality has been improved dramatically
-
-	- acceleration:
-
-		- fillrect and copyarea were fixed and reenabled
-
-		- color expanding imageblit was newly implemented, color
-		  imageblit (only used to draw the penguine) still uses the
-		  generic code.
-
-		- init of the acceleration engine was improved and moved to a
-		  place where it really works ...
-
-		- sync function has a timeout now and tries to reset and
-		  reinit the accel engine if necessary
-
-		- fewer slow copyarea calls when doing ypan scrolling by using
-		  undocumented bit d21 of screen start address stored in
-		  CR2B[5]. BIOS does use it also, so this should be safe.
-
-	- cyblafb rejects any attempt to set modes that would cause vclk
-	  values above reasonable 230 MHz. 32bit modes use a clock
-	  multiplicator of 2, so fbset does show the correct values for
-	  pixclock but not for vclk in this case. The fbset limit is 115 MHz
-	  for 32 bpp modes.
-
-	- cyblafb rejects modes known to be broken or unimplemented (all
-	  interlaced modes, all doublescan modes for now)
-
-	- cyblafb now works independant of the video mode in effect at startup
-	  time (tridentfb does not init all needed registers to reasonable
-	  values)
-
-	- switching between video modes does work reliably now
-
-	- the first video mode now is the one selected on startup using the
-	  vga=???? mechanism or any of
-		- 640x480, 800x600, 1024x768, 1280x1024
-		- 8, 16, 24 or 32 bpp
-		- refresh between 50 Hz and 85 Hz, 1 Hz steps (1280x1024-32
-		  is limited to 63Hz)
-
-	- pci retry and pci burst mode are settable (try to disable if you
-	  experience latency problems)
-
-	- built as a module cyblafb might be unloaded and reloaded using
-	  the vfb module and con2vt or might be used together with vesafb
-
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index c19f6feb4e5..db7a4f42eda 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -1597,30 +1597,6 @@ config FB_VT8623
 	  Driver for CastleRock integrated graphics core in the
 	  VIA VT8623 [Apollo CLE266] chipset.
 
-config FB_CYBLA
-	tristate "Cyberblade/i1 support"
-	depends on FB && PCI && X86_32 && !64BIT
-	select FB_CFB_IMAGEBLIT
-	---help---
-	  This driver is supposed to support the Trident Cyberblade/i1
-	  graphics core integrated in the VIA VT8601A North Bridge,
-	  also known as VIA Apollo PLE133.
-
-	  Status:
-	   - Developed, tested and working on EPIA 5000 and EPIA 800.
-	   - Does work reliable on all systems with CRT/LCD connected to
-	     normal VGA ports.
-	   - Should work on systems that do use the internal LCD port, but
-	     this is absolutely not tested.
-
-	  Character imageblit, copyarea and rectangle fill are hw accelerated,
-	  ypan scrolling is used by default.
-
-	  Please do read <file:Documentation/fb/cyblafb/*>.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called cyblafb.
-
 config FB_TRIDENT
 	tristate "Trident support"
 	depends on FB && PCI
@@ -1633,8 +1609,8 @@ config FB_TRIDENT
 	  and Blade XP.
 	  There are also integrated versions of these chips called CyberXXXX,
 	  CyberImage or CyberBlade. These chips are mostly found in laptops
-	  but also on some motherboards. For more information, read
-	  <file:Documentation/fb/tridentfb.txt>
+	  but also on some motherboards including early VIA EPIA motherboards.
+	  For more information, read <file:Documentation/fb/tridentfb.txt>
 
 	  Say Y if you have such a graphics board.
 
diff --git a/drivers/video/cyblafb.c b/drivers/video/cyblafb.c
deleted file mode 100644
index 9704b73135f..00000000000
--- a/drivers/video/cyblafb.c
+++ /dev/null
@@ -1,1683 +0,0 @@
-/*
- * Frame buffer driver for Trident Cyberblade/i1 graphics core
- *
- * Copyright 2005 Knut Petersen <Knut_Petersen@t-online.de>
- *
- * CREDITS:
- *	tridentfb.c by Jani Monoses
- *	see files above for further credits
- *
- */
-
-#define CYBLAFB_DEBUG 0
-#define CYBLAFB_KD_GRAPHICS_QUIRK 1
-
-#define CYBLAFB_PIXMAPSIZE 8192
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fb.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <asm/types.h>
-#include <video/cyblafb.h>
-
-#define VERSION "0.62"
-
-struct cyblafb_par {
-	u32 pseudo_pal[16];
-	struct fb_ops ops;
-};
-
-static struct fb_fix_screeninfo cyblafb_fix __devinitdata = {
-	.id = "CyBla",
-	.type = FB_TYPE_PACKED_PIXELS,
-	.xpanstep = 1,
-	.ypanstep = 1,
-	.ywrapstep = 1,
-	.visual = FB_VISUAL_PSEUDOCOLOR,
-	.accel = FB_ACCEL_NONE,
-};
-
-static char *mode __devinitdata = NULL;
-static int bpp __devinitdata = 8;
-static int ref __devinitdata = 75;
-static int fp __devinitdata;
-static int crt __devinitdata;
-static int memsize __devinitdata;
-
-static int basestride;
-static int vesafb;
-static int nativex;
-static int center;
-static int stretch;
-static int pciwb = 1;
-static int pcirb = 1;
-static int pciwr = 1;
-static int pcirr = 1;
-static int disabled;
-static int verbosity;
-static int displaytype;
-
-static void __iomem *io_virt;	// iospace virtual memory address
-
-module_param(mode, charp, 0);
-module_param(bpp, int, 0);
-module_param(ref, int, 0);
-module_param(fp, int, 0);
-module_param(crt, int, 0);
-module_param(nativex, int, 0);
-module_param(center, int, 0);
-module_param(stretch, int, 0);
-module_param(pciwb, int, 0);
-module_param(pcirb, int, 0);
-module_param(pciwr, int, 0);
-module_param(pcirr, int, 0);
-module_param(memsize, int, 0);
-module_param(verbosity, int, 0);
-
-//=========================================
-//
-// Well, we have to fix the upper layers.
-// Until this has been done, we work around
-// the bugs.
-//
-//=========================================
-
-#if (CYBLAFB_KD_GRAPHICS_QUIRK && CYBLAFB_DEBUG)
-	if (disabled) { \
-		printk("********\n");\
-		dump_stack();\
-		return val;\
-	}
-
-#elif CYBLAFB_KD_GRAPHICS_QUIRK
-#define KD_GRAPHICS_RETURN(val)\
-	if (disabled) {\
-		return val;\
-	}
-#else
-#define KD_GRAPHICS_RETURN(val)
-#endif
-
-//=========================================
-//
-// Port access macros for memory mapped io
-//
-//=========================================
-
-#define out8(r, v) writeb(v, io_virt + r)
-#define out32(r, v) writel(v, io_virt + r)
-#define in8(r) readb(io_virt + r)
-#define in32(r) readl(io_virt + r)
-
-//======================================
-//
-// Hardware access inline functions
-//
-//======================================
-
-static inline u8 read3X4(u32 reg)
-{
-	out8(0x3D4, reg);
-	return in8(0x3D5);
-}
-
-static inline u8 read3C4(u32 reg)
-{
-	out8(0x3C4, reg);
-	return in8(0x3C5);
-}
-
-static inline u8 read3CE(u32 reg)
-{
-	out8(0x3CE, reg);
-	return in8(0x3CF);
-}
-
-static inline void write3X4(u32 reg, u8 val)
-{
-	out8(0x3D4, reg);
-	out8(0x3D5, val);
-}
-
-static inline void write3C4(u32 reg, u8 val)
-{
-	out8(0x3C4, reg);
-	out8(0x3C5, val);
-}
-
-static inline void write3CE(u32 reg, u8 val)
-{
-	out8(0x3CE, reg);
-	out8(0x3CF, val);
-}
-
-static inline void write3C0(u32 reg, u8 val)
-{
-	in8(0x3DA);		// read to reset index
-	out8(0x3C0, reg);
-	out8(0x3C0, val);
-}
-
-//=================================================
-//
-// Enable memory mapped io and unprotect registers
-//
-//=================================================
-
-static void enable_mmio(void)
-{
-	u8 tmp;
-
-	outb(0x0B, 0x3C4);
-	inb(0x3C5);		// Set NEW mode
-	outb(SR0E, 0x3C4);	// write enable a lot of extended ports
-	outb(0x80, 0x3C5);
-
-	outb(SR11, 0x3C4);	// write enable those extended ports that
-	outb(0x87, 0x3C5);	// are not affected by SR0E_New
-
-	outb(CR1E, 0x3d4);	// clear write protect bit for port 0x3c2
-	tmp = inb(0x3d5) & 0xBF;
-	outb(CR1E, 0x3d4);
-	outb(tmp, 0x3d5);
-
-	outb(CR39, 0x3D4);
-	outb(inb(0x3D5) | 0x01, 0x3D5); // Enable mmio
-}
-
-//=================================================
-//
-// Set pixel clock VCLK1
-// - multipliers set elswhere
-// - freq in units of 0.01 MHz
-//
-// Hardware bug: SR18 >= 250 is broken for the
-//		 cyberblade/i1
-//
-//=================================================
-
-static void set_vclk(struct cyblafb_par *par, int freq)
-{
-	u32 m, n, k;
-	int f, fi, d, di;
-	u8 lo = 0, hi = 0;
-
-	d = 2000;
-	k = freq >= 10000 ? 0 : freq >= 5000 ? 1 : freq >= 2500 ? 2 : 3;
-	for (m = 0; m < 64; m++)
-		for (n = 0; n < 250; n++) {
-			fi = (int)(((5864727 * (n + 8)) /
-				    ((m + 2) * (1 << k))) >> 12);
-			if ((di = abs(fi - freq)) < d) {
-				d = di;
-				f = fi;
-				lo = (u8) n;
-				hi = (u8) ((k << 6) | m);
-			}
-		}
-	write3C4(SR19, hi);
-	write3C4(SR18, lo);
-	if (verbosity > 0)
-		output("pixclock = %d.%02d MHz, k/m/n %x %x %x\n",
-		       freq / 100, freq % 100, (hi & 0xc0) >> 6, hi & 0x3f, lo);
-}
-
-//================================================
-//
-// Cyberblade specific Graphics Engine (GE) setup
-//
-//================================================
-
-static void cyblafb_setup_GE(int pitch, int bpp)
-{
-	KD_GRAPHICS_RETURN();
-
-	switch (bpp) {
-	case 8:
-		basestride = ((pitch >> 3) << 20) | (0 << 29);
-		break;
-	case 15:
-		basestride = ((pitch >> 3) << 20) | (5 << 29);
-		break;
-	case 16:
-		basestride = ((pitch >> 3) << 20) | (1 << 29);
-		break;
-	case 24:
-	case 32:
-		basestride = ((pitch >> 3) << 20) | (2 << 29);
-		break;
-	}
-
-	write3X4(CR36, 0x90);	// reset GE
-	write3X4(CR36, 0x80);	// enable GE
-	out32(GE24, 1 << 7);	// reset all GE pointers by toggling
-	out32(GE24, 0); 	//   d7 of GE24
-	write3X4(CR2D, 0x00);	// GE Timinigs, no delays
-	out32(GE6C, 0); 	// Pattern and Style, p 129, ok
-}
-
-//=====================================================================
-//
-// Cyberblade specific syncing
-//
-//   A timeout might be caused by disabled mmio.
-//   Cause:
-//     - bit CR39 & 1 == 0 upon return, X trident driver bug
-//     - kdm bug (KD_GRAPHICS not set on first switch)
-//     - kernel design flaw (it believes in the correctness
-//	 of kdm/X
-//   First we try to sync ignoring that problem, as most of the
-//   time that will succeed immediately and the enable_mmio()
-//   would only degrade performance.
-//
-//=====================================================================
-
-static int cyblafb_sync(struct fb_info *info)
-{
-	u32 status, i = 100000;
-
-	KD_GRAPHICS_RETURN(0);
-
-	while (((status = in32(GE20)) & 0xFe800000) && i != 0)
-		i--;
-
-	if (i == 0) {
-		enable_mmio();
-		i = 1000000;
-		while (((status = in32(GE20)) & 0xFA800000) && i != 0)
-			i--;
-		if (i == 0) {
-			output("GE Timeout, status: %x\n", status);
-			if (status & 0x80000000)
-				output("Bresenham Engine : Busy\n");
-			if (status & 0x40000000)
-				output("Setup Engine     : Busy\n");
-			if (status & 0x20000000)
-				output("SP / DPE         : Busy\n");
-			if (status & 0x10000000)
-				output("Memory Interface : Busy\n");
-			if (status & 0x08000000)
-				output("Com Lst Proc     : Busy\n");
-			if (status & 0x04000000)
-				output("Block Write      : Busy\n");
-			if (status & 0x02000000)
-				output("Command Buffer   : Full\n");
-			if (status & 0x01000000)
-				output("RESERVED         : Busy\n");
-			if (status & 0x00800000)
-				output("PCI Write Buffer : Busy\n");
-			cyblafb_setup_GE(info->var.xres,
-					 info->var.bits_per_pixel);
-		}
-	}
-
-	return 0;
-}
-
-//==============================
-//
-// Cyberblade specific fillrect
-//
-//==============================
-
-static void cyblafb_fillrect(struct fb_info *info, const struct fb_fillrect *fr)
-{
-	u32 bpp = info->var.bits_per_pixel, col, desty, height;
-
-	KD_GRAPHICS_RETURN();
-
-	switch (bpp) {
-	default:
-	case 8:
-		col = fr->color;
-		col |= col << 8;
-		col |= col << 16;
-		break;
-	case 16:
-		col = ((u32 *) (info->pseudo_palette))[fr->color];
-		col |= col << 16;
-		break;
-	case 32:
-		col = ((u32 *) (info->pseudo_palette))[fr->color];
-		break;
-	}
-
-	desty = fr->dy;
-	height = fr->height;
-	while (height) {
-		out32(GEB8, basestride | ((desty * info->var.xres_virtual *
-					   bpp) >> 6));
-		out32(GE60, col);
-		out32(GE48, fr->rop ? 0x66 : ROP_S);
-		out32(GE44, 0x20000000 | 1 << 19 | 1 << 4 | 2 << 2);
-		out32(GE08, point(fr->dx, 0));
-		out32(GE0C, point(fr->dx + fr->width - 1,
-				  height > 4096 ? 4095 : height - 1));
-		if (likely(height <= 4096))
-			return;
-		desty += 4096;
-		height -= 4096;
-	}
-}
-
-//================================================
-//
-// Cyberblade specific copyarea
-//
-// This function silently assumes that it never
-// will be called with width or height exceeding
-// 4096.
-//
-//================================================
-
-static void cyblafb_copyarea(struct fb_info *info, const struct fb_copyarea *ca)
-{
-	u32 s1, s2, d1, d2, direction;
-
-	KD_GRAPHICS_RETURN();
-
-	s1 = point(ca->sx, 0);
-	s2 = point(ca->sx + ca->width - 1, ca->height - 1);
-	d1 = point(ca->dx, 0);
-	d2 = point(ca->dx + ca->width - 1, ca->height - 1);
-
-	if ((ca->sy > ca->dy) || ((ca->sy == ca->dy) && (ca->sx > ca->dx)))
-		direction = 0;
-	else
-		direction = 2;
-
-	out32(GEB8, basestride | ((ca->dy * info->var.xres_virtual *
-				   info->var.bits_per_pixel) >> 6));
-	out32(GEC8, basestride | ((ca->sy * info->var.xres_virtual *
-				   info->var.bits_per_pixel) >> 6));
-	out32(GE44, 0xa0000000 | 1 << 19 | 1 << 2 | direction);
-	out32(GE00, direction ? s2 : s1);
-	out32(GE04, direction ? s1 : s2);
-	out32(GE08, direction ? d2 : d1);
-	out32(GE0C, direction ? d1 : d2);
-}
-
-//=======================================================================
-//
-// Cyberblade specific imageblit
-//
-// Accelerated for the most usual case, blitting 1 - bit deep
-// character images. Everything else is passed to the generic imageblit
-// unless it is so insane that it is better to printk an alert.
-//
-// Hardware bug: _Never_ blit across pixel column 2048, that will lock
-// the system. We split those blit requests into three blitting
-// operations.
-//
-//=======================================================================
-
-static void cyblafb_imageblit(struct fb_info *info,
-			      const struct fb_image *image)
-{
-	u32 fgcol, bgcol;
-	u32 *pd = (u32 *) image->data;
-	u32 bpp = info->var.bits_per_pixel;
-
-	KD_GRAPHICS_RETURN();
-
-	// Used only for drawing the penguine (image->depth > 1)
-	if (image->depth != 1) {
-		cfb_imageblit(info, image);
-		return;
-	}
-	// That should never happen, but it would be fatal
-	if (image->width == 0 || image->height == 0) {
-		output("imageblit: width/height 0 detected\n");
-		return;
-	}
-
-	if (info->fix.visual == FB_VISUAL_TRUECOLOR ||
-	    info->fix.visual == FB_VISUAL_DIRECTCOLOR) {
-		fgcol = ((u32 *) (info->pseudo_palette))[image->fg_color];
-		bgcol = ((u32 *) (info->pseudo_palette))[image->bg_color];
-	} else {
-		fgcol = image->fg_color;
-		bgcol = image->bg_color;
-	}
-
-	switch (bpp) {
-	case 8:
-		fgcol |= fgcol << 8;
-		bgcol |= bgcol << 8;
-	case 16:
-		fgcol |= fgcol << 16;
-		bgcol |= bgcol << 16;
-	default:
-		break;
-	}
-
-	out32(GEB8, basestride | ((image->dy * info->var.xres_virtual *
-				   bpp) >> 6));
-	out32(GE60, fgcol);
-	out32(GE64, bgcol);
-
-	if (!(image->dx < 2048 && (image->dx + image->width - 1) >= 2048)) {
-		u32 dds = ((image->width + 31) >> 5) * image->height;
-		out32(GE44, 0xa0000000 | 1 << 20 | 1 << 19);
-		out32(GE08, point(image->dx, 0));
-		out32(GE0C, point(image->dx + image->width - 1,
-				  image->height - 1));
-		while (dds--)
-			out32(GE9C, *pd++);
-	} else {
-		int i, j;
-		u32 ddstotal = (image->width + 31) >> 5;
-		u32 ddsleft = (2048 - image->dx + 31) >> 5;
-		u32 skipleft = ddstotal - ddsleft;
-
-		out32(GE44, 0xa0000000 | 1 << 20 | 1 << 19);
-		out32(GE08, point(image->dx, 0));
-		out32(GE0C, point(2048 - 1, image->height - 1));
-		for (i = 0; i < image->height; i++) {
-			for (j = 0; j < ddsleft; j++)
-				out32(GE9C, *pd++);
-			pd += skipleft;
-		}
-
-		if (image->dx % 32) {
-			out32(GE44, 0xa0000000 | 1 << 20 | 1 << 19);
-			out32(GE08, point(2048, 0));
-			if (image->width > ddsleft << 5)
-				out32(GE0C, point(image->dx + (ddsleft << 5) -
-						  1, image->height - 1));
-			else
-				out32(GE0C, point(image->dx + image->width - 1,
-						  image->height - 1));
-			pd = ((u32 *) image->data) + ddstotal - skipleft - 1;
-			for (i = 0; i < image->height; i++) {
-				out32(GE9C, swab32(swab32(*pd) << ((32 -
-					    (image->dx & 31)) & 31)));
-				pd += ddstotal;
-			}
-		}
-
-		if (skipleft) {
-			out32(GE44, 0xa0000000 | 1 << 20 | 1 << 19);
-			out32(GE08, point(image->dx + (ddsleft << 5), 0));
-			out32(GE0C, point(image->dx + image->width - 1,
-					  image->height - 1));
-			pd = (u32 *) image->data;
-			for (i = 0; i < image->height; i++) {
-				pd += ddsleft;
-				for (j = 0; j < skipleft; j++)
-					out32(GE9C, *pd++);
-			}
-		}
-	}
-}
-
-//==========================================================
-//
-// Check if video mode is acceptable. We change var->??? if
-// video mode is slightly off or return error otherwise.
-// info->??? must not be changed!
-//
-//==========================================================
-
-static int cyblafb_check_var(struct fb_var_screeninfo *var,
-			     struct fb_info *info)
-{
-	int bpp = var->bits_per_pixel;
-
-	//
-	// we try to support 8, 16, 24 and 32 bpp modes,
-	// default to 8
-	//
-	// there is a 24 bpp mode, but for now we change requests to 32 bpp
-	// (This is what tridentfb does ... will be changed in the future)
-	//
-	//
-	if (bpp % 8 != 0 || bpp < 8 || bpp > 32)
-		bpp = 8;
-	if (bpp == 24)
-		bpp = var->bits_per_pixel = 32;
-
-	//
-	// interlaced modes are broken, fail if one is requested
-	//
-	if (var->vmode & FB_VMODE_INTERLACED)
-		return -EINVAL;
-
-	//
-	// fail if requested resolution is higher than physical
-	// flatpanel resolution
-	//
-	if ((displaytype == DISPLAY_FP) && nativex && var->xres > nativex)
-		return -EINVAL;
-
-	//
-	// we do not allow vclk to exceed 230 MHz. If the requested
-	// vclk is too high, we default to 200 MHz
-	//
-	if ((bpp == 32 ? 200000000 : 100000000) / var->pixclock > 23000)
-		var->pixclock = (bpp == 32 ? 200000000 : 100000000) / 20000;
-
-	//
-	// enforce (h|v)sync_len limits
-	//
-	var->hsync_len &= ~7;
-	if(var->hsync_len > 248)
-		var->hsync_len = 248;
-
-	var->vsync_len &= 15;
-
-	//
-	// Enforce horizontal and vertical hardware limits.
-	// 1600x1200 is mentioned as a maximum, but higher resolutions could
-	// work with slow refresh, small margins and short sync.
-	//
-	var->xres &= ~7;
-
-	if (((var->xres + var->left_margin + var->right_margin +
-			var->hsync_len) > (bpp == 32 ? 2040 : 4088)) ||
-			((var->yres + var->upper_margin + var->lower_margin +
-			var->vsync_len) > 2047))
-		return -EINVAL;
-
-	if ((var->xres > 1600) || (var->yres > 1200))
-		output("Mode %dx%d exceeds documented limits.\n",
-					   var->xres, var->yres);
-	//
-	// try to be smart about (x|y)res_virtual problems.
-	//
-	if (var->xres > var->xres_virtual)
-		var->xres_virtual = var->xres;
-	if (var->yres > var->yres_virtual)
-		var->yres_virtual = var->yres;
-
-	if (bpp == 8 || bpp == 16) {
-		if (var->xres_virtual > 4088)
-			var->xres_virtual = 4088;
-	} else {
-		if (var->xres_virtual > 2040)
-			var->xres_virtual = 2040;
-	}
-	var->xres_virtual &= ~7;
-	while (var->xres_virtual * var->yres_virtual * bpp / 8 >
-	       info->fix.smem_len) {
-		if (var->yres_virtual > var->yres)
-			var->yres_virtual--;
-		else if (var->xres_virtual > var->xres)
-			var->xres_virtual -= 8;
-		else
-			return -EINVAL;
-	}
-
-	switch (bpp) {
-	case 8:
-		var->red.offset = 0;
-		var->green.offset = 0;
-		var->blue.offset = 0;
-		var->red.length = 6;
-		var->green.length = 6;
-		var->blue.length = 6;
-		break;
-	case 16:
-		var->red.offset = 11;
-		var->green.offset = 5;
-		var->blue.offset = 0;
-		var->red.length = 5;
-		var->green.length = 6;
-		var->blue.length = 5;
-		break;
-	case 32:
-		var->red.offset = 16;
-		var->green.offset = 8;
-		var->blue.offset = 0;
-		var->red.length = 8;
-		var->green.length = 8;
-		var->blue.length = 8;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-//=====================================================================
-//
-// Pan the display
-//
-// The datasheets defines crt start address to be 20 bits wide and
-// to be programmed to CR0C, CR0D, CR1E and CR27. Actually there is
-// CR2B[5] as an undocumented extension bit. Epia BIOS 2.07 does use
-// it, so it is also safe to be used here. BTW: datasheet CR0E on page
-// 90 really is CR1E, the real CRE is documented on page 72.
-//
-// BUT:
-//
-// As of internal version 0.60 we do not use vga panning any longer.
-// Vga panning did not allow us the use of all available video memory
-// and thus prevented ywrap scrolling. We do use the "right view"
-// register now.
-//
-//
-//=====================================================================
-
-static int cyblafb_pan_display(struct fb_var_screeninfo *var,
-			       struct fb_info *info)
-{
-	KD_GRAPHICS_RETURN(0);
-
-	info->var.xoffset = var->xoffset;
-	info->var.yoffset = var->yoffset;
-	out32(GE10, 0x80000000 | ((var->xoffset + (var->yoffset *
-		    var->xres_virtual)) * var->bits_per_pixel / 32));
-	return 0;
-}
-
-//============================================
-//
-// This will really help in case of a bug ...
-// dump most gaphics core registers.
-//
-//============================================
-
-static void regdump(struct cyblafb_par *par)
-{
-	int i;
-
-	if (verbosity < 2)
-		return;
-
-	printk("\n");
-	for (i = 0; i <= 0xff; i++) {
-		outb(i, 0x3d4);
-		printk("CR%02x=%02x ", i, inb(0x3d5));
-		if (i % 16 == 15)
-			printk("\n");
-	}
-
-	outb(0x30, 0x3ce);
-	outb(inb(0x3cf) | 0x40, 0x3cf);
-	for (i = 0; i <= 0x1f; i++) {
-		if (i == 0 || (i > 2 && i < 8) || i == 0x10 || i == 0x11
-		    || i == 0x16) {
-			outb(i, 0x3d4);
-			printk("CR%02x=%02x ", i, inb(0x3d5));
-		} else
-			printk("------- ");
-		if (i % 16 == 15)
-			printk("\n");
-	}
-	outb(0x30, 0x3ce);
-	outb(inb(0x3cf) & 0xbf, 0x3cf);
-
-	printk("\n");
-	for (i = 0; i <= 0x7f; i++) {
-		outb(i, 0x3ce);
-		printk("GR%02x=%02x ", i, inb(0x3cf));
-		if (i % 16 == 15)
-			printk("\n");
-	}
-
-	printk("\n");
-	for (i = 0; i <= 0xff; i++) {
-		outb(i, 0x3c4);
-		printk("SR%02x=%02x ", i, inb(0x3c5));
-		if (i % 16 == 15)
-			printk("\n");
-	}
-
-	printk("\n");
-	for (i = 0; i <= 0x1F; i++) {
-		inb(0x3da);	// next access is index!
-		outb(i, 0x3c0);
-		printk("AR%02x=%02x ", i, inb(0x3c1));
-		if (i % 16 == 15)
-			printk("\n");
-	}
-	printk("\n");
-
-	inb(0x3DA);		// reset internal flag to 3c0 index
-	outb(0x20, 0x3C0);	// enable attr
-
-	return;
-}
-
-//=======================================================================
-//
-// Save State
-//
-// This function is called while a switch to KD_TEXT is in progress,
-// before any of the other functions are called.
-//
-//=======================================================================
-
-static void cyblafb_save_state(struct fb_info *info)
-{
-	struct cyblafb_par *par = info->par;
-	if (verbosity > 0)
-		output("Switching to KD_TEXT\n");
-	disabled = 0;
-	regdump(par);
-	enable_mmio();
-	return;
-}
-
-//=======================================================================
-//
-// Restore State
-//
-// This function is called while a switch to KD_GRAPHICS is in progress,
-// We have to turn on vga style panning registers again because the
-// trident driver of X does not know about GE10.
-//
-//=======================================================================
-
-static void cyblafb_restore_state(struct fb_info *info)
-{
-	if (verbosity > 0)
-		output("Switching to KD_GRAPHICS\n");
-	out32(GE10, 0);
-	disabled = 1;
-	return;
-}
-
-//======================================
-//
-// Set hardware to requested video mode
-//
-//======================================
-
-static int cyblafb_set_par(struct fb_info *info)
-{
-	struct cyblafb_par *par = info->par;
-	u32 htotal, hdispend, hsyncstart, hsyncend, hblankstart,
-	    hblankend, preendfetch, vtotal, vdispend, vsyncstart,
-	    vsyncend, vblankstart, vblankend;
-	struct fb_var_screeninfo *var = &info->var;
-	int bpp = var->bits_per_pixel;
-	int i;
-
-	KD_GRAPHICS_RETURN(0);
-
-	if (verbosity > 0)
-		output("Switching to new mode: "
-		       "fbset -g %d %d %d %d %d -t %d %d %d %d %d %d %d\n",
-		       var->xres, var->yres, var->xres_virtual,
-		       var->yres_virtual, var->bits_per_pixel, var->pixclock,
-		       var->left_margin, var->right_margin, var->upper_margin,
-		       var->lower_margin, var->hsync_len, var->vsync_len);
-
-	htotal = (var->xres + var->left_margin + var->right_margin +
-		  var->hsync_len) / 8 - 5;
-	hdispend = var->xres / 8 - 1;
-	hsyncstart = (var->xres + var->right_margin) / 8;
-	hsyncend = var->hsync_len / 8;
-	hblankstart = hdispend + 1;
-	hblankend = htotal + 3; // should be htotal + 5, bios does it this way
-	preendfetch = ((var->xres >> 3) + 1) * ((bpp + 1) >> 3);
-
-	vtotal = var->yres + var->upper_margin + var->lower_margin +
-							var->vsync_len - 2;
-	vdispend = var->yres - 1;
-	vsyncstart = var->yres + var->lower_margin;
-	vblankstart = var->yres;
-	vblankend = vtotal; // should be vtotal + 2, but bios does it this way
-	vsyncend = var->vsync_len;
-
-	enable_mmio();		// necessary! ... check X ...
-
-	write3X4(CR11, read3X4(CR11) & 0x7F);	// unlock cr00 .. cr07
-
-	write3CE(GR30, 8);
-
-	if ((displaytype == DISPLAY_FP) && var->xres < nativex) {
-
-		// stretch or center ?
-
-		out8(0x3C2, 0xEB);
-
-		write3CE(GR30, read3CE(GR30) | 0x81);	// shadow mode on
-
-		if (center) {
-			write3CE(GR52, (read3CE(GR52) & 0x7C) | 0x80);
-			write3CE(GR53, (read3CE(GR53) & 0x7C) | 0x80);
-		} else if (stretch) {
-			write3CE(GR5D, 0);
-			write3CE(GR52, (read3CE(GR52) & 0x7C) | 1);
-			write3CE(GR53, (read3CE(GR53) & 0x7C) | 1);
-		}
-
-	} else {
-		out8(0x3C2, 0x2B);
-		write3CE(GR30, 8);
-	}
-
-	//
-	// Setup CRxx regs
-	//
-
-	write3X4(CR00, htotal & 0xFF);
-	write3X4(CR01, hdispend & 0xFF);
-	write3X4(CR02, hblankstart & 0xFF);
-	write3X4(CR03, hblankend & 0x1F);
-	write3X4(CR04, hsyncstart & 0xFF);
-	write3X4(CR05, (hsyncend & 0x1F) | ((hblankend & 0x20) << 2));
-	write3X4(CR06, vtotal & 0xFF);
-	write3X4(CR07, (vtotal & 0x100) >> 8 |
-		       (vdispend & 0x100) >> 7 |
-		       (vsyncstart & 0x100) >> 6 |
-		       (vblankstart & 0x100) >> 5 |
-		       0x10 |
-		       (vtotal & 0x200) >> 4 |
-		       (vdispend & 0x200) >> 3 | (vsyncstart & 0x200) >> 2);
-	write3X4(CR08, 0);
-	write3X4(CR09, (vblankstart & 0x200) >> 4 | 0x40 |	// FIX !!!
-		       ((info->var.vmode & FB_VMODE_DOUBLE) ? 0x80 : 0));
-	write3X4(CR0A, 0);	// Init to some reasonable default
-	write3X4(CR0B, 0);	// Init to some reasonable default
-	write3X4(CR0C, 0);	// Offset 0
-	write3X4(CR0D, 0);	// Offset 0
-	write3X4(CR0E, 0);	// Init to some reasonable default
-	write3X4(CR0F, 0);	// Init to some reasonable default
-	write3X4(CR10, vsyncstart & 0xFF);
-	write3X4(CR11, (vsyncend & 0x0F));
-	write3X4(CR12, vdispend & 0xFF);
-	write3X4(CR13, ((info->var.xres_virtual * bpp) / (4 * 16)) & 0xFF);
-	write3X4(CR14, 0x40);	// double word mode
-	write3X4(CR15, vblankstart & 0xFF);
-	write3X4(CR16, vblankend & 0xFF);
-	write3X4(CR17, 0xE3);
-	write3X4(CR18, 0xFF);
-	//	 CR19: needed for interlaced modes ... ignore it for now
-	write3X4(CR1A, 0x07);	// Arbitration Control Counter 1
-	write3X4(CR1B, 0x07);	// Arbitration Control Counter 2
-	write3X4(CR1C, 0x07);	// Arbitration Control Counter 3
-	write3X4(CR1D, 0x00);	// Don't know, doesn't hurt ; -)
-	write3X4(CR1E, (info->var.vmode & FB_VMODE_INTERLACED) ? 0x84 : 0x80);
-	//	 CR1F: do not set, contains BIOS info about memsize
-	write3X4(CR20, 0x20);	// enabe wr buf, disable 16bit planar mode
-	write3X4(CR21, 0x20);	// enable linear memory access
-	//	 CR22: RO cpu latch readback
-	//	 CR23: ???
-	//	 CR24: RO AR flag state
-	//	 CR25: RAMDAC rw timing, pclk buffer tristate control ????
-	//	 CR26: ???
-	write3X4(CR27, (vdispend & 0x400) >> 6 |
-		       (vsyncstart & 0x400) >> 5 |
-		       (vblankstart & 0x400) >> 4 |
-		       (vtotal & 0x400) >> 3 |
-		       0x8);
-	//	 CR28: ???
-	write3X4(CR29, (read3X4(CR29) & 0xCF) | ((((info->var.xres_virtual *
-			bpp) / (4 * 16)) & 0x300) >> 4));
-	write3X4(CR2A, read3X4(CR2A) | 0x40);
-	write3X4(CR2B, (htotal & 0x100) >> 8 |
-		       (hdispend & 0x100) >> 7 |
-		       // (0x00 & 0x100) >> 6 |   hinterlace para bit 8 ???
-		       (hsyncstart & 0x100) >> 5 |
-		       (hblankstart & 0x100) >> 4);
-	//	 CR2C: ???
-	//	 CR2D: initialized in cyblafb_setup_GE()
-	write3X4(CR2F, 0x92);	// conservative, better signal quality
-	//	 CR30: reserved
-	//	 CR31: reserved
-	//	 CR32: reserved
-	//	 CR33: reserved
-	//	 CR34: disabled in CR36
-	//	 CR35: disabled in CR36
-	//	 CR36: initialized in cyblafb_setup_GE
-	//	 CR37: i2c, ignore for now
-	write3X4(CR38, (bpp == 8) ? 0x00 :	//
-		       (bpp == 16) ? 0x05 :	// highcolor
-		       (bpp == 24) ? 0x29 :	// packed 24bit truecolor
-		       (bpp == 32) ? 0x09 : 0); // truecolor, 16 bit pixelbus
-	write3X4(CR39, 0x01 |	// MMIO enable
-		       (pcirb ? 0x02 : 0) |	// pci read burst enable
-		       (pciwb ? 0x04 : 0));	// pci write burst enable
-	write3X4(CR55, 0x1F | // pci clocks * 2 for STOP# during 1st data phase
-		       (pcirr ? 0x40 : 0) |	// pci read retry enable
-		       (pciwr ? 0x80 : 0));	// pci write retry enable
-	write3X4(CR56, preendfetch >> 8 < 2 ? (preendfetch >> 8 & 0x01) | 2
-					    : 0);
-	write3X4(CR57, preendfetch >> 8 < 2 ? preendfetch & 0xff : 0);
-	write3X4(CR58, 0x82);	// Bios does this .... don't know more
-	//
-	// Setup SRxx regs
-	//
-	write3C4(SR00, 3);
-	write3C4(SR01, 1);	//set char clock 8 dots wide
-	write3C4(SR02, 0x0F);	//enable 4 maps needed in chain4 mode
-	write3C4(SR03, 0);	//no character map select
-	write3C4(SR04, 0x0E);	//memory mode: ext mem, even, chain4
-
-	out8(0x3C4, 0x0b);
-	in8(0x3C5);		// Set NEW mode
-	write3C4(SR0D, 0x00);	// test ... check
-
-	set_vclk(par, (bpp == 32 ? 200000000 : 100000000)
-					/ info->var.pixclock);	//SR18, SR19
-
-	//
-	// Setup GRxx regs
-	//
-	write3CE(GR00, 0x00);	// test ... check
-	write3CE(GR01, 0x00);	// test ... check
-	write3CE(GR02, 0x00);	// test ... check
-	write3CE(GR03, 0x00);	// test ... check
-	write3CE(GR04, 0x00);	// test ... check
-	write3CE(GR05, 0x40);	// no CGA compat, allow 256 col
-	write3CE(GR06, 0x05);	// graphics mode
-	write3CE(GR07, 0x0F);	// planes?
-	write3CE(GR08, 0xFF);	// test ... check
-	write3CE(GR0F, (bpp == 32) ? 0x1A : 0x12); // vclk / 2 if 32bpp, chain4
-	write3CE(GR20, 0xC0);	// test ... check
-	write3CE(GR2F, 0xA0);	// PCLK = VCLK, no skew,
-
-	//
-	// Setup ARxx regs
-	//
-	for (i = 0; i < 0x10; i++)	// set AR00 .. AR0f
-		write3C0(i, i);
-	write3C0(AR10, 0x41);	// graphics mode and support 256 color modes
-	write3C0(AR12, 0x0F);	// planes
-	write3C0(AR13, 0);	// horizontal pel panning
-	in8(0x3DA);		// reset internal flag to 3c0 index
-	out8(0x3C0, 0x20);	// enable attr
-
-	//
-	// Setup hidden RAMDAC command register
-	//
-	in8(0x3C8);		// these reads are
-	in8(0x3C6);		// necessary to
-	in8(0x3C6);		// unmask the RAMDAC
-	in8(0x3C6);		// command reg, otherwise
-	in8(0x3C6);		// we would write the pixelmask reg!
-	out8(0x3C6, (bpp == 8) ? 0x00 : // 256 colors
-	     (bpp == 15) ? 0x10 :	//
-	     (bpp == 16) ? 0x30 :	// hicolor
-	     (bpp == 24) ? 0xD0 :	// truecolor
-	     (bpp == 32) ? 0xD0 : 0);	// truecolor
-	in8(0x3C8);
-
-	//
-	// GR31 is not mentioned in the datasheet
-	//
-	if (displaytype == DISPLAY_FP)
-		write3CE(GR31, (read3CE(GR31) & 0x8F) |
-			 ((info->var.yres > 1024) ? 0x50 :
-			  (info->var.yres > 768) ? 0x30 :
-			  (info->var.yres > 600) ? 0x20 :
-			  (info->var.yres > 480) ? 0x10 : 0));
-
-	info->fix.visual = (bpp == 8) ? FB_VISUAL_PSEUDOCOLOR
-				      : FB_VISUAL_TRUECOLOR;
-	info->fix.line_length = info->var.xres_virtual * (bpp >> 3);
-	info->cmap.len = (bpp == 8) ? 256 : 16;
-
-	//
-	// init acceleration engine
-	//
-	cyblafb_setup_GE(info->var.xres_virtual, info->var.bits_per_pixel);
-
-	//
-	// Set/clear flags to allow proper scroll mode selection.
-	//
-	if (var->xres == var->xres_virtual)
-		info->flags &= ~FBINFO_HWACCEL_XPAN;
-	else
-		info->flags |= FBINFO_HWACCEL_XPAN;
-
-	if (var->yres == var->yres_virtual)
-		info->flags &= ~FBINFO_HWACCEL_YPAN;
-	else
-		info->flags |= FBINFO_HWACCEL_YPAN;
-
-	if (info->fix.smem_len !=
-	    var->xres_virtual * var->yres_virtual * bpp / 8)
-		info->flags &= ~FBINFO_HWACCEL_YWRAP;
-	else
-		info->flags |= FBINFO_HWACCEL_YWRAP;
-
-	regdump(par);
-
-	return 0;
-}
-
-//========================
-//
-// Set one color register
-//
-//========================
-
-static int cyblafb_setcolreg(unsigned regno, unsigned red, unsigned green,
-			     unsigned blue, unsigned transp,
-			     struct fb_info *info)
-{
-	int bpp = info->var.bits_per_pixel;
-
-	KD_GRAPHICS_RETURN(0);
-
-	if (regno >= info->cmap.len)
-		return 1;
-
-	if (bpp == 8) {
-		out8(0x3C6, 0xFF);
-		out8(0x3C8, regno);
-		out8(0x3C9, red >> 10);
-		out8(0x3C9, green >> 10);
-		out8(0x3C9, blue >> 10);
-
-	} else if (regno < 16) {
-		if (bpp == 16)	// RGB 565
-			((u32 *) info->pseudo_palette)[regno] =
-				(red & 0xF800) |
-				((green & 0xFC00) >> 5) |
-				((blue & 0xF800) >> 11);
-		else if (bpp == 32)	// ARGB 8888
-			((u32 *) info->pseudo_palette)[regno] =
-				((transp & 0xFF00) << 16) |
-				((red & 0xFF00) << 8) |
-				((green & 0xFF00)) | ((blue & 0xFF00) >> 8);
-	}
-
-	return 0;
-}
-
-//==========================================================
-//
-// Try blanking the screen. For flat panels it does nothing
-//
-//==========================================================
-
-static int cyblafb_blank(int blank_mode, struct fb_info *info)
-{
-	unsigned char PMCont, DPMSCont;
-
-	KD_GRAPHICS_RETURN(0);
-
-	if (displaytype == DISPLAY_FP)
-		return 0;
-
-	out8(0x83C8, 0x04);	// DPMS Control
-	PMCont = in8(0x83C6) & 0xFC;
-
-	DPMSCont = read3CE(GR23) & 0xFC;
-
-	switch (blank_mode) {
-	case FB_BLANK_UNBLANK:	// Screen: On, HSync: On, VSync: On
-	case FB_BLANK_NORMAL:	// Screen: Off, HSync: On, VSync: On
-		PMCont |= 0x03;
-		DPMSCont |= 0x00;
-		break;
-	case FB_BLANK_HSYNC_SUSPEND:	// Screen: Off, HSync: Off, VSync: On
-		PMCont |= 0x02;
-		DPMSCont |= 0x01;
-		break;
-	case FB_BLANK_VSYNC_SUSPEND:	// Screen: Off, HSync: On, VSync: Off
-		PMCont |= 0x02;
-		DPMSCont |= 0x02;
-		break;
-	case FB_BLANK_POWERDOWN:	// Screen: Off, HSync: Off, VSync: Off
-		PMCont |= 0x00;
-		DPMSCont |= 0x03;
-		break;
-	}
-
-	write3CE(GR23, DPMSCont);
-	out8(0x83C8, 4);
-	out8(0x83C6, PMCont);
-	//
-	// let fbcon do a softblank for us
-	//
-	return (blank_mode == FB_BLANK_NORMAL) ? 1 : 0;
-}
-
-static struct fb_ops cyblafb_ops __devinitdata = {
-	.owner = THIS_MODULE,
-	.fb_setcolreg = cyblafb_setcolreg,
-	.fb_pan_display = cyblafb_pan_display,
-	.fb_blank = cyblafb_blank,
-	.fb_check_var = cyblafb_check_var,
-	.fb_set_par = cyblafb_set_par,
-	.fb_fillrect = cyblafb_fillrect,
-	.fb_copyarea = cyblafb_copyarea,
-	.fb_imageblit = cyblafb_imageblit,
-	.fb_sync = cyblafb_sync,
-	.fb_restore_state = cyblafb_restore_state,
-	.fb_save_state = cyblafb_save_state,
-};
-
-//==========================================================================
-//
-// getstartupmode() decides about the inital video mode
-//
-// There is no reason to use modedb, a lot of video modes there would
-// need altered timings to display correctly. So I decided that it is much
-// better to provide a limited optimized set of modes plus the option of
-// using the mode in effect at startup time (might be selected using the
-// vga=??? parameter). After that the user might use fbset to select any
-// mode he likes, check_var will not try to alter geometry parameters as
-// it would be necessary otherwise.
-//
-//==========================================================================
-
-static int __devinit getstartupmode(struct fb_info *info)
-{
-	u32 htotal, hdispend, hsyncstart, hsyncend, hblankstart, hblankend,
-	    vtotal, vdispend, vsyncstart, vsyncend, vblankstart, vblankend,
-	    cr00, cr01, cr02, cr03, cr04, cr05, cr2b,
-	    cr06, cr07, cr09, cr10, cr11, cr12, cr15, cr16, cr27,
-	    cr38, sr0d, sr18, sr19, gr0f, fi, pxclkdiv, vclkdiv, tmp, i;
-
-	struct modus {
-		int xres; int vxres; int yres; int vyres;
-		int bpp; int pxclk;
-		int left_margin; int right_margin;
-		int upper_margin; int lower_margin;
-		int hsync_len; int vsync_len;
-	} modedb[5] = {
-		{
-		0, 2048, 0, 4096, 0, 0, 0, 0, 0, 0, 0, 0}, {
-		640, 2048, 480, 4096, 0, 0, -40, 24, 17, 0, 216, 3}, {
-		800, 2048, 600, 4096, 0, 0, 96, 24, 14, 0, 136, 11}, {
-		1024, 2048, 768, 4096, 0, 0, 144, 24, 29, 0, 120, 3}, {
-		1280, 2048, 1024, 4096, 0, 0, 232, 16, 39, 0, 160, 3}
-	};
-
-	outb(0x00, 0x3d4); cr00 = inb(0x3d5);
-	outb(0x01, 0x3d4); cr01 = inb(0x3d5);
-	outb(0x02, 0x3d4); cr02 = inb(0x3d5);
-	outb(0x03, 0x3d4); cr03 = inb(0x3d5);
-	outb(0x04, 0x3d4); cr04 = inb(0x3d5);
-	outb(0x05, 0x3d4); cr05 = inb(0x3d5);
-	outb(0x06, 0x3d4); cr06 = inb(0x3d5);
-	outb(0x07, 0x3d4); cr07 = inb(0x3d5);
-	outb(0x09, 0x3d4); cr09 = inb(0x3d5);
-	outb(0x10, 0x3d4); cr10 = inb(0x3d5);
-	outb(0x11, 0x3d4); cr11 = inb(0x3d5);
-	outb(0x12, 0x3d4); cr12 = inb(0x3d5);
-	outb(0x15, 0x3d4); cr15 = inb(0x3d5);
-	outb(0x16, 0x3d4); cr16 = inb(0x3d5);
-	outb(0x27, 0x3d4); cr27 = inb(0x3d5);
-	outb(0x2b, 0x3d4); cr2b = inb(0x3d5);
-	outb(0x38, 0x3d4); cr38 = inb(0x3d5);
-
-	outb(0x0b, 0x3c4);
-	inb(0x3c5);
-
-	outb(0x0d, 0x3c4); sr0d = inb(0x3c5);
-	outb(0x18, 0x3c4); sr18 = inb(0x3c5);
-	outb(0x19, 0x3c4); sr19 = inb(0x3c5);
-	outb(0x0f, 0x3ce); gr0f = inb(0x3cf);
-
-	htotal = cr00 | (cr2b & 0x01) << 8;
-	hdispend = cr01 | (cr2b & 0x02) << 7;
-	hblankstart = cr02 | (cr2b & 0x10) << 4;
-	hblankend = (cr03 & 0x1f) | (cr05 & 0x80) >> 2;
-	hsyncstart = cr04 | (cr2b & 0x08) << 5;
-	hsyncend = cr05 & 0x1f;
-
-	modedb[0].xres = hblankstart * 8;
-	modedb[0].hsync_len = hsyncend * 8;
-	modedb[0].right_margin = hsyncstart * 8 - modedb[0].xres;
-	modedb[0].left_margin = (htotal + 5) * 8 - modedb[0].xres -
-	    modedb[0].right_margin - modedb[0].hsync_len;
-
-	vtotal = cr06 | (cr07 & 0x01) << 8 | (cr07 & 0x20) << 4
-	    | (cr27 & 0x80) << 3;
-	vdispend = cr12 | (cr07 & 0x02) << 7 | (cr07 & 0x40) << 3
-	    | (cr27 & 0x10) << 6;
-	vsyncstart = cr10 | (cr07 & 0x04) << 6 | (cr07 & 0x80) << 2
-	    | (cr27 & 0x20) << 5;
-	vsyncend = cr11 & 0x0f;
-	vblankstart = cr15 | (cr07 & 0x08) << 5 | (cr09 & 0x20) << 4
-	    | (cr27 & 0x40) << 4;
-	vblankend = cr16;
-
-	modedb[0].yres = vdispend + 1;
-	modedb[0].vsync_len = vsyncend;
-	modedb[0].lower_margin = vsyncstart - modedb[0].yres;
-	modedb[0].upper_margin = vtotal - modedb[0].yres -
-	    modedb[0].lower_margin - modedb[0].vsync_len + 2;
-
-	tmp = cr38 & 0x3c;
-	modedb[0].bpp = tmp == 0 ? 8 : tmp == 4 ? 16 : tmp == 28 ? 24 :
-	    tmp == 8 ? 32 : 8;
-
-	fi = ((5864727 * (sr18 + 8)) /
-	      (((sr19 & 0x3f) + 2) * (1 << ((sr19 & 0xc0) >> 6)))) >> 12;
-	pxclkdiv = ((gr0f & 0x08) >> 3 | (gr0f & 0x40) >> 5) + 1;
-	tmp = sr0d & 0x06;
-	vclkdiv = tmp == 0 ? 2 : tmp == 2 ? 4 : tmp == 4 ? 8 : 3; // * 2 !
-	modedb[0].pxclk = ((100000000 * pxclkdiv * vclkdiv) >> 1) / fi;
-
-	if (verbosity > 0)
-		output("detected startup mode: "
-		       "fbset -g %d %d %d ??? %d -t %d %d %d %d %d %d %d\n",
-		       modedb[0].xres, modedb[0].yres, modedb[0].xres,
-		       modedb[0].bpp, modedb[0].pxclk, modedb[0].left_margin,
-		       modedb[0].right_margin, modedb[0].upper_margin,
-		       modedb[0].lower_margin, modedb[0].hsync_len,
-		       modedb[0].vsync_len);
-
-	//
-	// We use this goto target in case of a failed check_var. No, I really
-	// do not want to do it in another way!
-	//
-
-      tryagain:
-
-	i = (mode == NULL) ? 0 :
-	    !strncmp(mode, "640x480", 7) ? 1 :
-	    !strncmp(mode, "800x600", 7) ? 2 :
-	    !strncmp(mode, "1024x768", 8) ? 3 :
-	    !strncmp(mode, "1280x1024", 9) ? 4 : 0;
-
-	ref = (ref < 50) ? 50 : (ref > 85) ? 85 : ref;
-
-	if (i == 0) {
-		info->var.pixclock = modedb[i].pxclk;
-		info->var.bits_per_pixel = modedb[i].bpp;
-	} else {
-		info->var.pixclock = (100000000 /
-				      ((modedb[i].left_margin +
-					modedb[i].xres +
-					modedb[i].right_margin +
-					modedb[i].hsync_len) *
-				       (modedb[i].upper_margin +
-					modedb[i].yres +
-					modedb[i].lower_margin +
-					modedb[i].vsync_len) * ref / 10000));
-		info->var.bits_per_pixel = bpp;
-	}
-
-	info->var.left_margin = modedb[i].left_margin;
-	info->var.right_margin = modedb[i].right_margin;
-	info->var.xres = modedb[i].xres;
-	if (!(modedb[i].yres == 1280 && modedb[i].bpp == 32))
-		info->var.xres_virtual = modedb[i].vxres;
-	else
-		info->var.xres_virtual = modedb[i].xres;
-	info->var.xoffset = 0;
-	info->var.hsync_len = modedb[i].hsync_len;
-	info->var.upper_margin = modedb[i].upper_margin;
-	info->var.yres = modedb[i].yres;
-	info->var.yres_virtual = modedb[i].vyres;
-	info->var.yoffset = 0;
-	info->var.lower_margin = modedb[i].lower_margin;
-	info->var.vsync_len = modedb[i].vsync_len;
-	info->var.sync = 0;
-	info->var.vmode = FB_VMODE_NONINTERLACED;
-
-	if (cyblafb_check_var(&info->var, info)) {
-		// 640x480 - 8@75 should really never fail. One case would
-		// be fp == 1 and nativex < 640 ... give up then
-		if (i == 1 && bpp == 8 && ref == 75) {
-			output("Can't find a valid mode :-(\n");
-			return -EINVAL;
-		}
-		// Our detected mode is unlikely to fail. If it does,
-		// try 640x480 - 8@75 ...
-		if (i == 0) {
-			mode = "640x480";
-			bpp = 8;
-			ref = 75;
-			output("Detected mode failed check_var! "
-			       "Trying 640x480 - 8@75\n");
-			goto tryagain;
-		}
-		// A specified video mode failed for some reason.
-		// Try the startup mode first
-		output("Specified mode '%s' failed check! "
-		       "Falling back to startup mode.\n", mode);
-		mode = NULL;
-		goto tryagain;
-	}
-
-	return 0;
-}
-
-//========================================================
-//
-// Detect activated memory size. Undefined values require
-// memsize parameter.
-//
-//========================================================
-
-static unsigned int __devinit get_memsize(void)
-{
-	unsigned char tmp;
-	unsigned int k;
-
-	if (memsize)
-		k = memsize * Kb;
-	else {
-		tmp = read3X4(CR1F) & 0x0F;
-		switch (tmp) {
-		case 0x03:
-			k = 1 * 1024 * 1024;
-			break;
-		case 0x07:
-			k = 2 * 1024 * 1024;
-			break;
-		case 0x0F:
-			k = 4 * 1024 * 1024;
-			break;
-		case 0x04:
-			k = 8 * 1024 * 1024;
-			break;
-		default:
-			k = 1 * 1024 * 1024;
-			output("Unknown memory size code %x in CR1F."
-			       " We default to 1 Mb for now, please"
-			       " do provide a memsize parameter!\n", tmp);
-		}
-	}
-
-	if (verbosity > 0)
-		output("framebuffer size = %d Kb\n", k / Kb);
-	return k;
-}
-
-//=========================================================
-//
-// Detect if a flat panel monitor connected to the special
-// interface is active. Override is possible by fp and crt
-// parameters.
-//
-//=========================================================
-
-static unsigned int __devinit get_displaytype(void)
-{
-	if (fp)
-		return DISPLAY_FP;
-	if (crt)
-		return DISPLAY_CRT;
-	return (read3CE(GR33) & 0x10) ? DISPLAY_FP : DISPLAY_CRT;
-}
-
-//=====================================
-//
-// Get native resolution of flat panel
-//
-//=====================================
-
-static int __devinit get_nativex(void)
-{
-	int x, y, tmp;
-
-	if (nativex)
-		return nativex;
-
-	tmp = (read3CE(GR52) >> 4) & 3;
-
-	switch (tmp) {
-	case 0: x = 1280; y = 1024;
-		break;
-	case 2: x = 1024; y = 768;
-		break;
-	case 3: x = 800;  y = 600;
-		break;
-	case 4: x = 1400; y = 1050;
-		break;
-	case 1:
-	default:
-		x = 640; y = 480;
-		break;
-	}
-
-	if (verbosity > 0)
-		output("%dx%d flat panel found\n", x, y);
-	return x;
-}
-
-static int __devinit cybla_pci_probe(struct pci_dev *dev,
-				     const struct pci_device_id *id)
-{
-	struct fb_info *info;
-	struct cyblafb_par *par;
-
-	info = framebuffer_alloc(sizeof(struct cyblafb_par), &dev->dev);
-	if (!info)
-		goto errout_alloc_info;
-
-	info->pixmap.addr = kzalloc(CYBLAFB_PIXMAPSIZE, GFP_KERNEL);
-	if (!info->pixmap.addr) {
-		output("allocation of pixmap buffer failed!\n");
-		goto errout_alloc_pixmap;
-	}
-	info->pixmap.size = CYBLAFB_PIXMAPSIZE - 4;
-	info->pixmap.buf_align = 4;
-	info->pixmap.access_align = 32;
-	info->pixmap.flags = FB_PIXMAP_SYSTEM;
-	info->pixmap.scan_align = 4;
-
-	par = info->par;
-	par->ops = cyblafb_ops;
-
-	info->fix = cyblafb_fix;
-	info->fbops = &par->ops;
-	info->fix = cyblafb_fix;
-
-	if (pci_enable_device(dev)) {
-		output("could not enable device!\n");
-		goto errout_enable;
-	}
-	// might already be requested by vga console or vesafb,
-	// so we do care about success
-	if (!request_region(0x3c0, 0x20, "cyblafb")) {
-		output("region 0x3c0/0x20 already reserved\n");
-		vesafb |= 1;
-
-	}
-	//
-	// Graphics Engine Registers
-	//
-	if (!request_region(GEBase, 0x100, "cyblafb")) {
-		output("region %#x/0x100 already reserved\n", GEBase);
-		vesafb |= 2;
-	}
-
-	regdump(par);
-
-	enable_mmio();
-
-	// setup MMIO region
-	info->fix.mmio_start = pci_resource_start(dev, 1);
-	info->fix.mmio_len = 0x20000;
-
-	if (!request_mem_region(info->fix.mmio_start,
-				info->fix.mmio_len, "cyblafb")) {
-		output("request_mem_region failed for mmio region!\n");
-		goto errout_mmio_reqmem;
-	}
-
-	io_virt = ioremap_nocache(info->fix.mmio_start, info->fix.mmio_len);
-
-	if (!io_virt) {
-		output("ioremap failed for mmio region\n");
-		goto errout_mmio_remap;
-	}
-	// setup framebuffer memory ... might already be requested
-	// by vesafb. Not to fail in case of an unsuccessful request
-	// is useful if both are loaded.
-	info->fix.smem_start = pci_resource_start(dev, 0);
-	info->fix.smem_len = get_memsize();
-
-	if (!request_mem_region(info->fix.smem_start,
-				info->fix.smem_len, "cyblafb")) {
-		output("region %#lx/%#x already reserved\n",
-		       info->fix.smem_start, info->fix.smem_len);
-		vesafb |= 4;
-	}
-
-	info->screen_base = ioremap_nocache(info->fix.smem_start,
-					    info->fix.smem_len);
-
-	if (!info->screen_base) {
-		output("ioremap failed for smem region\n");
-		goto errout_smem_remap;
-	}
-
-	displaytype = get_displaytype();
-
-	if (displaytype == DISPLAY_FP)
-		nativex = get_nativex();
-
-	info->flags = FBINFO_DEFAULT
-		    | FBINFO_HWACCEL_COPYAREA
-		    | FBINFO_HWACCEL_FILLRECT
-		    | FBINFO_HWACCEL_IMAGEBLIT
-		    | FBINFO_READS_FAST
-//		    | FBINFO_PARTIAL_PAN_OK
-		    | FBINFO_MISC_ALWAYS_SETPAR;
-
-	info->pseudo_palette = par->pseudo_pal;
-
-	if (getstartupmode(info))
-		goto errout_findmode;
-
-	fb_alloc_cmap(&info->cmap, 256, 0);
-
-	if (register_framebuffer(info)) {
-		output("Could not register CyBla framebuffer\n");
-		goto errout_register;
-	}
-
-	pci_set_drvdata(dev, info);
-
-	//
-	// normal exit and error paths
-	//
-
-	return 0;
-
-      errout_register:
-      errout_findmode:
-	iounmap(info->screen_base);
-      errout_smem_remap:
-	if (!(vesafb & 4))
-		release_mem_region(info->fix.smem_start, info->fix.smem_len);
-	iounmap(io_virt);
-      errout_mmio_remap:
-	release_mem_region(info->fix.mmio_start, info->fix.mmio_len);
-      errout_mmio_reqmem:
-	if (!(vesafb & 1))
-		release_region(0x3c0, 32);
-      errout_enable:
-	kfree(info->pixmap.addr);
-      errout_alloc_pixmap:
-	framebuffer_release(info);
-      errout_alloc_info:
-	output("CyblaFB version %s aborting init.\n", VERSION);
-	return -ENODEV;
-}
-
-static void __devexit cybla_pci_remove(struct pci_dev *dev)
-{
-	struct fb_info *info = pci_get_drvdata(dev);
-
-	unregister_framebuffer(info);
-	iounmap(io_virt);
-	iounmap(info->screen_base);
-	if (!(vesafb & 4))
-		release_mem_region(info->fix.smem_start, info->fix.smem_len);
-	release_mem_region(info->fix.mmio_start, info->fix.mmio_len);
-	fb_dealloc_cmap(&info->cmap);
-	if (!(vesafb & 2))
-		release_region(GEBase, 0x100);
-	if (!(vesafb & 1))
-		release_region(0x3c0, 32);
-	kfree(info->pixmap.addr);
-	framebuffer_release(info);
-	output("CyblaFB version %s normal exit.\n", VERSION);
-}
-
-//
-// List of boards that we are trying to support
-//
-static struct pci_device_id cybla_devices[] = {
-	{PCI_VENDOR_ID_TRIDENT, CYBERBLADEi1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
-	{0,}
-};
-
-MODULE_DEVICE_TABLE(pci, cybla_devices);
-
-static struct pci_driver cyblafb_pci_driver = {
-	.name = "cyblafb",
-	.id_table = cybla_devices,
-	.probe = cybla_pci_probe,
-	.remove = __devexit_p(cybla_pci_remove)
-};
-
-//=============================================================
-//
-// kernel command line example:
-//
-//	video=cyblafb:1280x1024, bpp=16, ref=50 ...
-//
-// modprobe command line example:
-//
-//	modprobe cyblafb mode=1280x1024 bpp=16 ref=50 ...
-//
-//=============================================================
-
-static int __devinit cyblafb_init(void)
-{
-#ifndef MODULE
-	char *options = NULL;
-	char *opt;
-
-	if (fb_get_options("cyblafb", &options))
-		return -ENODEV;
-
-	if (options && *options)
-		while ((opt = strsep(&options, ",")) != NULL) {
-			if (!*opt)
-				continue;
-			else if (!strncmp(opt, "bpp=", 4))
-				bpp = simple_strtoul(opt + 4, NULL, 0);
-			else if (!strncmp(opt, "ref=", 4))
-				ref = simple_strtoul(opt + 4, NULL, 0);
-			else if (!strncmp(opt, "fp", 2))
-				displaytype = DISPLAY_FP;
-			else if (!strncmp(opt, "crt", 3))
-				displaytype = DISPLAY_CRT;
-			else if (!strncmp(opt, "nativex=", 8))
-				nativex = simple_strtoul(opt + 8, NULL, 0);
-			else if (!strncmp(opt, "center", 6))
-				center = 1;
-			else if (!strncmp(opt, "stretch", 7))
-				stretch = 1;
-			else if (!strncmp(opt, "pciwb=", 6))
-				pciwb = simple_strtoul(opt + 6, NULL, 0);
-			else if (!strncmp(opt, "pcirb=", 6))
-				pcirb = simple_strtoul(opt + 6, NULL, 0);
-			else if (!strncmp(opt, "pciwr=", 6))
-				pciwr = simple_strtoul(opt + 6, NULL, 0);
-			else if (!strncmp(opt, "pcirr=", 6))
-				pcirr = simple_strtoul(opt + 6, NULL, 0);
-			else if (!strncmp(opt, "memsize=", 8))
-				memsize = simple_strtoul(opt + 8, NULL, 0);
-			else if (!strncmp(opt, "verbosity=", 10))
-				verbosity = simple_strtoul(opt + 10, NULL, 0);
-			else
-				mode = opt;
-		}
-#endif
-	output("CyblaFB version %s initializing\n", VERSION);
-	return pci_register_driver(&cyblafb_pci_driver);
-}
-
-static void __exit cyblafb_exit(void)
-{
-	pci_unregister_driver(&cyblafb_pci_driver);
-}
-
-module_init(cyblafb_init);
-module_exit(cyblafb_exit);
-
-MODULE_AUTHOR("Knut Petersen <knut_petersen@t-online.de>");
-MODULE_DESCRIPTION("Framebuffer driver for Cyberblade/i1 graphics core");
-MODULE_LICENSE("GPL");
diff --git a/drivers/video/tridentfb.c b/drivers/video/tridentfb.c
index c5369ba9485..bc692a15e2a 100644
--- a/drivers/video/tridentfb.c
+++ b/drivers/video/tridentfb.c
@@ -2,7 +2,7 @@
  * Frame buffer driver for Trident TGUI, Blade and Image series
  *
  * Copyright 2001, 2002 - Jani Monoses   <jani@iv.ro>
- *
+ * Copyright 2009 Krzysztof Helt <krzysztof.h1@wp.pl>
  *
  * CREDITS:(in order of appearance)
  *	skeletonfb.c by Geert Uytterhoeven and other fb code in drivers/video
@@ -1490,6 +1490,9 @@ static int __devinit trident_pci_probe(struct pci_dev *dev,
 	} else
 		info->flags |= FBINFO_HWACCEL_DISABLED;
 
+	if (is_blade(chip_id) && chip_id != BLADE3D)
+		info->flags |= FBINFO_READS_FAST;
+
 	info->pixmap.addr = kmalloc(4096, GFP_KERNEL);
 	if (!info->pixmap.addr) {
 		err = -ENOMEM;
@@ -1664,4 +1667,5 @@ module_exit(tridentfb_exit);
 MODULE_AUTHOR("Jani Monoses <jani@iv.ro>");
 MODULE_DESCRIPTION("Framebuffer driver for Trident cards");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("cyblafb");
 
-- 
cgit v1.2.3


From 23b736545473ed853b685cbc03883aa6ff3f0e0d Mon Sep 17 00:00:00 2001
From: Michal Januszewski <spock@gentoo.org>
Date: Tue, 31 Mar 2009 15:25:41 -0700
Subject: uvesafb: fix selecting mode with the vbemode option

If the vbemode option is used, uvesafb calls fb_get_mode() without first
setting the resolution in info->var.  This results in a division by zero
in fb_get_mode(), as evidenced e.g.  in [1].  Fix this by ensuring the
info->var structure is populated before fb_get_mode() is called.

[1] http://bugzilla.kernel.org/show_bug.cgi?id=11661#c37

Signed-off-by: Michal Januszewski <spock@gentoo.org>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/uvesafb.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/video/uvesafb.c b/drivers/video/uvesafb.c
index c288270e42f..0b370aebdbf 100644
--- a/drivers/video/uvesafb.c
+++ b/drivers/video/uvesafb.c
@@ -850,14 +850,16 @@ static int __devinit uvesafb_vbe_init_mode(struct fb_info *info)
 	if (vbemode) {
 		for (i = 0; i < par->vbe_modes_cnt; i++) {
 			if (par->vbe_modes[i].mode_id == vbemode) {
+				modeid = i;
+				uvesafb_setup_var(&info->var, info,
+						&par->vbe_modes[modeid]);
 				fb_get_mode(FB_VSYNCTIMINGS | FB_IGNOREMON, 60,
-							&info->var, info);
+						&info->var, info);
 				/*
 				 * With pixclock set to 0, the default BIOS
 				 * timings will be used in set_par().
 				 */
 				info->var.pixclock = 0;
-				modeid = i;
 				goto gotmode;
 			}
 		}
@@ -904,8 +906,11 @@ static int __devinit uvesafb_vbe_init_mode(struct fb_info *info)
 			fb_videomode_to_var(&info->var, mode);
 		} else {
 			modeid = par->vbe_modes[0].mode_id;
+			uvesafb_setup_var(&info->var, info,
+					&par->vbe_modes[modeid]);
 			fb_get_mode(FB_VSYNCTIMINGS | FB_IGNOREMON, 60,
-				    &info->var, info);
+					&info->var, info);
+
 			goto gotmode;
 		}
 	}
@@ -917,9 +922,9 @@ static int __devinit uvesafb_vbe_init_mode(struct fb_info *info)
 	if (modeid == -1)
 		return -EINVAL;
 
-gotmode:
 	uvesafb_setup_var(&info->var, info, &par->vbe_modes[modeid]);
 
+gotmode:
 	/*
 	 * If we are not VBE3.0+ compliant, we're done -- the BIOS will
 	 * ignore our timings anyway.
-- 
cgit v1.2.3


From c958557eb6e5f0fa15aeda3b6f952051b4179394 Mon Sep 17 00:00:00 2001
From: Felipe Contreras <felipe.contreras@nokia.com>
Date: Tue, 31 Mar 2009 15:25:42 -0700
Subject: omapfb: fix argument of blank operation

The blank operation should receive FB_BLANK_POWERDOWN, not VESA_POWERDOWN.

Signed-off-by: Felipe Contreras <felipe.contreras@nokia.com>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Andrea Righi <righi.andrea@gmail.com>
Acked-by: Trilok Soni <soni.trilok@gmail.com>
Cc: Imre Deak <imre.deak@solidboot.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/omap/omapfb_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/video/omap/omapfb_main.c b/drivers/video/omap/omapfb_main.c
index 1a49519dafa..060d72fe57c 100644
--- a/drivers/video/omap/omapfb_main.c
+++ b/drivers/video/omap/omapfb_main.c
@@ -338,7 +338,7 @@ static int omapfb_blank(int blank, struct fb_info *fbi)
 
 	omapfb_rqueue_lock(fbdev);
 	switch (blank) {
-	case VESA_NO_BLANKING:
+	case FB_BLANK_UNBLANK:
 		if (fbdev->state == OMAPFB_SUSPENDED) {
 			if (fbdev->ctrl->resume)
 				fbdev->ctrl->resume();
@@ -349,7 +349,7 @@ static int omapfb_blank(int blank, struct fb_info *fbi)
 				do_update = 1;
 		}
 		break;
-	case VESA_POWERDOWN:
+	case FB_BLANK_POWERDOWN:
 		if (fbdev->state == OMAPFB_ACTIVE) {
 			fbdev->panel->disable(fbdev->panel);
 			if (fbdev->ctrl->suspend)
@@ -1818,7 +1818,7 @@ static int omapfb_suspend(struct platform_device *pdev, pm_message_t mesg)
 {
 	struct omapfb_device *fbdev = platform_get_drvdata(pdev);
 
-	omapfb_blank(VESA_POWERDOWN, fbdev->fb_info[0]);
+	omapfb_blank(FB_BLANK_POWERDOWN, fbdev->fb_info[0]);
 
 	return 0;
 }
@@ -1828,7 +1828,7 @@ static int omapfb_resume(struct platform_device *pdev)
 {
 	struct omapfb_device *fbdev = platform_get_drvdata(pdev);
 
-	omapfb_blank(VESA_NO_BLANKING, fbdev->fb_info[0]);
+	omapfb_blank(FB_BLANK_UNBLANK, fbdev->fb_info[0]);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 18b41f1cd537168a886c43237297692ba8d0a143 Mon Sep 17 00:00:00 2001
From: Wolfgang Kroener <lkml@azog.de>
Date: Tue, 31 Mar 2009 15:25:44 -0700
Subject: radeonfb: suspend/resume for ATI Mobility Radeon RV350

Add suspend/resume for the Acer Travelmate 290D/292LMi with the following
graphic-chip:

01:00.0 VGA compatible controller [0300]: ATI Technologies Inc RV350
[Mobility Radeon 9600 M10] [1002:4e50] (prog-if 00 [VGA controller])
	Subsystem: Acer Incorporated [ALI] TravelMate 290 [1025:005a]
	Flags: bus master, 66MHz, medium devsel, latency 128, IRQ 10
	Memory at a8000000 (32-bit, prefetchable) [size=128M]
	I/O ports at c100 [size=256]
	Memory at e0010000 (32-bit, non-prefetchable) [size=64K]
	[virtual] Expansion ROM at a0000000 [disabled] [size=128K]
	Capabilities: [58] AGP version 2.0
	Capabilities: [50] Power Management version 2
	Kernel driver in use: radeonfb
	Kernel modules: radeonfb

Signed-off-by: Wolfgang Kroener <lkml@azog.de>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/aty/radeon_pm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/video/aty/radeon_pm.c b/drivers/video/aty/radeon_pm.c
index 1de0c003246..97a1f095f32 100644
--- a/drivers/video/aty/radeon_pm.c
+++ b/drivers/video/aty/radeon_pm.c
@@ -89,6 +89,9 @@ static struct radeon_device_id radeon_workaround_list[] = {
 	BUGFIX("Acer Aspire 2010",
 	       PCI_VENDOR_ID_AI, 0x0061,
 	       radeon_pm_off, radeon_reinitialize_M10),
+	BUGFIX("Acer Travelmate 290D/292LMi",
+	       PCI_VENDOR_ID_AI, 0x005a,
+	       radeon_pm_off, radeon_reinitialize_M10),
 	{ .ident = NULL }
 };
 
-- 
cgit v1.2.3


From 98da329581e3e6a08eba418ba6da64c05bacd029 Mon Sep 17 00:00:00 2001
From: Alessio Igor Bogani <abogani@texware.it>
Date: Tue, 31 Mar 2009 15:25:46 -0700
Subject: nvidiafb: remove open_lock mutex

Remove mutex from the nvidiafb_open/nvidiafb_release functions as these
operations are mutexed at fb layer.

Signed-off-by: Alessio Igor Bogani <abogani@texware.it>
Cc: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/nvidia/nv_type.h | 2 --
 drivers/video/nvidia/nvidia.c  | 7 -------
 2 files changed, 9 deletions(-)

diff --git a/drivers/video/nvidia/nv_type.h b/drivers/video/nvidia/nv_type.h
index f132aab8c5d..c03f7f55c76 100644
--- a/drivers/video/nvidia/nv_type.h
+++ b/drivers/video/nvidia/nv_type.h
@@ -5,7 +5,6 @@
 #include <linux/types.h>
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
-#include <linux/mutex.h>
 #include <video/vga.h>
 
 #define NV_ARCH_04  0x04
@@ -99,7 +98,6 @@ struct nvidia_par {
 	RIVA_HW_STATE initial_state;
 	RIVA_HW_STATE *CurrentState;
 	struct vgastate vgastate;
-	struct mutex open_lock;
 	u32 pseudo_palette[16];
 	struct pci_dev *pci_dev;
 	u32 Architecture;
diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c
index 9dbb5a5a267..efe10ff86d6 100644
--- a/drivers/video/nvidia/nvidia.c
+++ b/drivers/video/nvidia/nvidia.c
@@ -1004,15 +1004,12 @@ static int nvidiafb_open(struct fb_info *info, int user)
 {
 	struct nvidia_par *par = info->par;
 
-	mutex_lock(&par->open_lock);
-
 	if (!par->open_count) {
 		save_vga_x86(par);
 		nvidia_save_vga(par, &par->initial_state);
 	}
 
 	par->open_count++;
-	mutex_unlock(&par->open_lock);
 	return 0;
 }
 
@@ -1021,8 +1018,6 @@ static int nvidiafb_release(struct fb_info *info, int user)
 	struct nvidia_par *par = info->par;
 	int err = 0;
 
-	mutex_lock(&par->open_lock);
-
 	if (!par->open_count) {
 		err = -EINVAL;
 		goto done;
@@ -1035,7 +1030,6 @@ static int nvidiafb_release(struct fb_info *info, int user)
 
 	par->open_count--;
 done:
-	mutex_unlock(&par->open_lock);
 	return err;
 }
 
@@ -1300,7 +1294,6 @@ static int __devinit nvidiafb_probe(struct pci_dev *pd,
 
 	par = info->par;
 	par->pci_dev = pd;
-	mutex_init(&par->open_lock);
 	info->pixmap.addr = kzalloc(8 * 1024, GFP_KERNEL);
 
 	if (info->pixmap.addr == NULL)
-- 
cgit v1.2.3


From 84d9077b3391e6966813e380e63b57f53e58e120 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:47 -0700
Subject: fb: hide hardware cursor in graphics mode (Mach64)

A hardware cursor is left enabled in the fb_set_par() which is called when a
new console is created.  This is inconsistent with software cursor's
behaviour.

Also, this makes a hardware cursor always visible in the Xfbdev (Xorg kdrive)
server.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Cc: Risto Suominen <risto.suominen@gmail.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/aty/mach64_accel.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/video/aty/mach64_accel.c b/drivers/video/aty/mach64_accel.c
index a8f60c33863..0cc9724e61a 100644
--- a/drivers/video/aty/mach64_accel.c
+++ b/drivers/video/aty/mach64_accel.c
@@ -39,7 +39,8 @@ void aty_reset_engine(const struct atyfb_par *par)
 {
 	/* reset engine */
 	aty_st_le32(GEN_TEST_CNTL,
-		aty_ld_le32(GEN_TEST_CNTL, par) & ~GUI_ENGINE_ENABLE, par);
+		aty_ld_le32(GEN_TEST_CNTL, par) &
+		~(GUI_ENGINE_ENABLE | HWCURSOR_ENABLE), par);
 	/* enable engine */
 	aty_st_le32(GEN_TEST_CNTL,
 		aty_ld_le32(GEN_TEST_CNTL, par) | GUI_ENGINE_ENABLE, par);
-- 
cgit v1.2.3


From 2f682fae611df642acfb99b5f2fd665f001cd253 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:47 -0700
Subject: atyfb: speed up Mach64 cursor

Save one fifo entry on cursor enabling and disabling.

Save another fifo entry for FB_CUR_SETPOS operation by removing redundant one.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/aty/mach64_cursor.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/video/aty/mach64_cursor.c b/drivers/video/aty/mach64_cursor.c
index faf95da8fcb..04c710804bb 100644
--- a/drivers/video/aty/mach64_cursor.c
+++ b/drivers/video/aty/mach64_cursor.c
@@ -77,9 +77,13 @@ static int atyfb_cursor(struct fb_info *info, struct fb_cursor *cursor)
 	if (par->asleep)
 		return -EPERM;
 
-	/* Hide cursor */
 	wait_for_fifo(1, par);
-	aty_st_le32(GEN_TEST_CNTL, aty_ld_le32(GEN_TEST_CNTL, par) & ~HWCURSOR_ENABLE, par);
+	if (cursor->enable)
+		aty_st_le32(GEN_TEST_CNTL, aty_ld_le32(GEN_TEST_CNTL, par)
+			    | HWCURSOR_ENABLE, par);
+	else
+		aty_st_le32(GEN_TEST_CNTL, aty_ld_le32(GEN_TEST_CNTL, par)
+				& ~HWCURSOR_ENABLE, par);
 
 	/* set position */
 	if (cursor->set & FB_CUR_SETPOS) {
@@ -109,7 +113,7 @@ static int atyfb_cursor(struct fb_info *info, struct fb_cursor *cursor)
 			y<<=1;
 			h<<=1;
 		}
-		wait_for_fifo(4, par);
+		wait_for_fifo(3, par);
 		aty_st_le32(CUR_OFFSET, (info->fix.smem_len >> 3) + (yoff << 1), par);
 		aty_st_le32(CUR_HORZ_VERT_OFF,
 			    ((u32) (64 - h + yoff) << 16) | xoff, par);
@@ -177,11 +181,6 @@ static int atyfb_cursor(struct fb_info *info, struct fb_cursor *cursor)
 	    }
 	}
 
-	if (cursor->enable) {
-		wait_for_fifo(1, par);
-		aty_st_le32(GEN_TEST_CNTL, aty_ld_le32(GEN_TEST_CNTL, par)
-			    | HWCURSOR_ENABLE, par);
-	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From 04645fc337eef283887d3b1204017f1860ed2ff7 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 31 Mar 2009 15:25:48 -0700
Subject: tridentfb: delete acceleration Kconfig option

Remove Kconfig option for tridentfb acceleration.  The acceleration can be
switched off with modules "noaccel" parameter.

The acceleration for Trident chips was fixed in the 2.6.27 kernel.

Also, add CyberXXX and CyberBlade names to Kconfig option's name.  It should
make easier to find the tridentfb choice for cyblafb driver's users.  The
cyblafb driver has been replaced by the tridentfb driver.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/Kconfig     |  9 +--------
 drivers/video/tridentfb.c | 12 ------------
 2 files changed, 1 insertion(+), 20 deletions(-)

diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index db7a4f42eda..ffe2f2796e2 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -1598,7 +1598,7 @@ config FB_VT8623
 	  VIA VT8623 [Apollo CLE266] chipset.
 
 config FB_TRIDENT
-	tristate "Trident support"
+	tristate "Trident/CyberXXX/CyberBlade support"
 	depends on FB && PCI
 	select FB_CFB_FILLRECT
 	select FB_CFB_COPYAREA
@@ -1617,13 +1617,6 @@ config FB_TRIDENT
 	  To compile this driver as a module, choose M here: the
 	  module will be called tridentfb.
 
-config FB_TRIDENT_ACCEL
-	bool "Trident Acceleration functions (EXPERIMENTAL)"
-	depends on FB_TRIDENT && EXPERIMENTAL
-	---help---
-	This will compile the Trident frame buffer device with
-	acceleration functions.
-
 config FB_ARK
 	tristate "ARK 2000PV support"
 	depends on FB && PCI
diff --git a/drivers/video/tridentfb.c b/drivers/video/tridentfb.c
index bc692a15e2a..03a9c35e9f5 100644
--- a/drivers/video/tridentfb.c
+++ b/drivers/video/tridentfb.c
@@ -490,7 +490,6 @@ static void tgui_copy_rect(struct tridentfb_par *par,
 /*
  * Accel functions called by the upper layers
  */
-#ifdef CONFIG_FB_TRIDENT_ACCEL
 static void tridentfb_fillrect(struct fb_info *info,
 			       const struct fb_fillrect *fr)
 {
@@ -565,11 +564,6 @@ static int tridentfb_sync(struct fb_info *info)
 		par->wait_engine(par);
 	return 0;
 }
-#else
-#define tridentfb_fillrect cfb_fillrect
-#define tridentfb_copyarea cfb_copyarea
-#define tridentfb_imageblit cfb_imageblit
-#endif /* CONFIG_FB_TRIDENT_ACCEL */
 
 /*
  * Hardware access functions
@@ -1333,9 +1327,7 @@ static struct fb_ops tridentfb_ops = {
 	.fb_fillrect = tridentfb_fillrect,
 	.fb_copyarea = tridentfb_copyarea,
 	.fb_imageblit = tridentfb_imageblit,
-#ifdef CONFIG_FB_TRIDENT_ACCEL
 	.fb_sync = tridentfb_sync,
-#endif
 };
 
 static int __devinit trident_pci_probe(struct pci_dev *dev,
@@ -1359,10 +1351,6 @@ static int __devinit trident_pci_probe(struct pci_dev *dev,
 
 	chip_id = id->device;
 
-#ifndef CONFIG_FB_TRIDENT_ACCEL
-	noaccel = 1;
-#endif
-
 	/* If PCI id is 0x9660 then further detect chip type */
 
 	if (chip_id == TGUI9660) {
-- 
cgit v1.2.3


From e14a685dfabf3ceeb366f1db1a22471b8f98a08b Mon Sep 17 00:00:00 2001
From: Brian Maly <bmaly@redhat.com>
Date: Tue, 31 Mar 2009 15:25:50 -0700
Subject: efifb: dmi set video type

The current logic for dmi matching in efifb does not allow efifb to load
on all hardware that we can dmi match for.

For a real world example, boot with elilo (3.7 or 3.8 vanilla) and on a
Apple (MacBook) and EFI framebuffer driver will not load (you will have no
video).  This specific hardware is efi v1.10, so we have UGA and not GOP.
Without special bootloader magic (i.e.  extra elilo patches for UGA
graphics detection) no screen info will be passed to the kernel and as a
result efifb will not load.

This patch allows the dmi match to happen by moving it to earlier in
efifb_init, and sets the video type (in set_system) so that efifb can load
when we have a valid dmi match and already know the specifics of the
hardware.

Without this patch the efifb driver will fail to load in the event screen
info is not found and passed in by the bootloader, being that we will
never get to look for a dmi match.  A primary reason for matching with dmi
is because not all bootloaders detect the video info properly.  The
solution is that in the event of a dmi match, we should set
screen_info.orig_video_isVGA.  Most bootloaders fail to set screen info on
Apple hardware, and this is a big problem for people who use Apple
hardware.

Tested on a MacBook SantaRosa with elilo-3.8 (vanilla) and resolves the
issue, the dmi match now works, EFI framebuffer now loads and video works.

Signed-off-by: Brian Maly <bmaly@redhat.com>
Acked-by: Huang Ying <ying.huang@intel.com>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Cc: Chandramouli Narayanan <mouli@linux.intel.com>
Acked-by: Peter Jones <pjones@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/efifb.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/video/efifb.c b/drivers/video/efifb.c
index daf9b81878a..0c5b9a9fd56 100644
--- a/drivers/video/efifb.c
+++ b/drivers/video/efifb.c
@@ -129,6 +129,8 @@ static int set_system(const struct dmi_system_id *id)
 		screen_info.lfb_width = info->width;
 	if (screen_info.lfb_height == 0)
 		screen_info.lfb_height = info->height;
+	if (screen_info.orig_video_isVGA == 0)
+		screen_info.orig_video_isVGA = VIDEO_TYPE_EFI;
 
 	return 0;
 }
@@ -374,9 +376,10 @@ static int __init efifb_init(void)
 	int ret;
 	char *option = NULL;
 
+	dmi_check_system(dmi_system_table);
+
 	if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI)
 		return -ENODEV;
-	dmi_check_system(dmi_system_table);
 
 	if (fb_get_options("efifb", &option))
 		return -ENODEV;
-- 
cgit v1.2.3


From ebf7649a4c6d37ce24c143001125cf29cc0bcf6a Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 1 Apr 2009 09:26:12 -0700
Subject: [IA64] Fix typo/thinko in arch/ia64/sn/kernel/sn2/sn2_smp.c

sn2_ptc_init() has what looks like a cut-n-paste error. Fix it.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/sn2/sn2_smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index 3c2f242d90c..1176506b2ba 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -554,7 +554,7 @@ static int __init sn2_ptc_init(void)
 
 	proc_sn2_ptc = proc_create(PTC_BASENAME, 0444,
 				   NULL, &proc_sn2_ptc_operations);
-	if (!&proc_sn2_ptc_operations) {
+	if (!proc_sn2_ptc) {
 		printk(KERN_ERR "unable to create %s proc entry", PTC_BASENAME);
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From 80a03e29164c76b70e6dbb1d10515820cc24487a Mon Sep 17 00:00:00 2001
From: Stoyan Gaydarov <stoyboyker@gmail.com>
Date: Tue, 10 Mar 2009 00:10:30 -0500
Subject: [IA64] BUG to BUG_ON changes

Replace:

	if (test)
		BUG();

with
	BUG_ON(test);

Signed-off-by: Stoyan Gaydarov <stoyboyker@gmail.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/module.c           |  3 +--
 arch/ia64/kernel/setup.c            |  3 +--
 arch/ia64/mm/init.c                 |  3 +--
 arch/ia64/sn/kernel/io_common.c     | 15 +++++----------
 arch/ia64/sn/kernel/io_init.c       | 12 ++++--------
 arch/ia64/sn/kernel/setup.c         |  3 +--
 arch/ia64/sn/kernel/sn2/sn_hwperf.c |  6 ++----
 arch/ia64/sn/pci/pcibr/pcibr_dma.c  |  4 +---
 8 files changed, 16 insertions(+), 33 deletions(-)

diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
index 34fe4259a14..da3b0cf495a 100644
--- a/arch/ia64/kernel/module.c
+++ b/arch/ia64/kernel/module.c
@@ -533,8 +533,7 @@ get_ltoff (struct module *mod, uint64_t value, int *okp)
 			goto found;
 
 	/* Not enough GOT entries? */
-	if (e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size))
-		BUG();
+	BUG_ON(e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size));
 
 	e->val = value;
 	++mod->arch.next_got_entry;
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 833b3ef9277..714066aeda7 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -1018,8 +1018,7 @@ cpu_init (void)
 					| IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
 	atomic_inc(&init_mm.mm_count);
 	current->active_mm = &init_mm;
-	if (current->mm)
-		BUG();
+	BUG_ON(current->mm);
 
 	ia64_mmu_init(ia64_imva(cpu_data));
 	ia64_mca_cpu_init(ia64_imva(cpu_data));
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 8503d534794..c0f3bee6904 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -636,8 +636,7 @@ mem_init (void)
 #endif
 
 #ifdef CONFIG_FLATMEM
-	if (!mem_map)
-		BUG();
+	BUG_ON(!mem_map);
 	max_mapnr = max_low_pfn;
 #endif
 
diff --git a/arch/ia64/sn/kernel/io_common.c b/arch/ia64/sn/kernel/io_common.c
index 0d4ffa4da1d..57f280dd9de 100644
--- a/arch/ia64/sn/kernel/io_common.c
+++ b/arch/ia64/sn/kernel/io_common.c
@@ -135,8 +135,7 @@ static s64 sn_device_fixup_war(u64 nasid, u64 widget, int device,
 	}
 
 	war_list = kzalloc(DEV_PER_WIDGET * sizeof(*war_list), GFP_KERNEL);
-	if (!war_list)
-		BUG();
+	BUG_ON(!war_list);
 
 	SAL_CALL_NOLOCK(isrv, SN_SAL_IOIF_GET_WIDGET_DMAFLUSH_LIST,
 			nasid, widget, __pa(war_list), 0, 0, 0 ,0);
@@ -180,23 +179,20 @@ sn_common_hubdev_init(struct hubdev_info *hubdev)
 		sizeof(struct sn_flush_device_kernel *);
 	hubdev->hdi_flush_nasid_list.widget_p =
 		kzalloc(size, GFP_KERNEL);
-	if (!hubdev->hdi_flush_nasid_list.widget_p)
-		BUG();
+	BUG_ON(!hubdev->hdi_flush_nasid_list.widget_p);
 
 	for (widget = 0; widget <= HUB_WIDGET_ID_MAX; widget++) {
 		size = DEV_PER_WIDGET *
 			sizeof(struct sn_flush_device_kernel);
 		sn_flush_device_kernel = kzalloc(size, GFP_KERNEL);
-		if (!sn_flush_device_kernel)
-			BUG();
+		BUG_ON(!sn_flush_device_kernel);
 
 		dev_entry = sn_flush_device_kernel;
 		for (device = 0; device < DEV_PER_WIDGET;
 		     device++, dev_entry++) {
 			size = sizeof(struct sn_flush_device_common);
 			dev_entry->common = kzalloc(size, GFP_KERNEL);
-			if (!dev_entry->common)
-				BUG();
+			BUG_ON(!dev_entry->common);
 			if (sn_prom_feature_available(PRF_DEVICE_FLUSH_LIST))
 				status = sal_get_device_dmaflush_list(
 					     hubdev->hdi_nasid, widget, device,
@@ -326,8 +322,7 @@ sn_common_bus_fixup(struct pci_bus *bus,
 	 */
 	controller->platform_data = kzalloc(sizeof(struct sn_platform_data),
 					    GFP_KERNEL);
-	if (controller->platform_data == NULL)
-		BUG();
+	BUG_ON(controller->platform_data == NULL);
 	sn_platform_data =
 			(struct sn_platform_data *) controller->platform_data;
 	sn_platform_data->provider_soft = provider_soft;
diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index e2eb2da60f9..ee774c366a0 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -128,8 +128,7 @@ sn_legacy_pci_window_fixup(struct pci_controller *controller,
 {
 		controller->window = kcalloc(2, sizeof(struct pci_window),
 					     GFP_KERNEL);
-		if (controller->window == NULL)
-			BUG();
+		BUG_ON(controller->window == NULL);
 		controller->window[0].offset = legacy_io;
 		controller->window[0].resource.name = "legacy_io";
 		controller->window[0].resource.flags = IORESOURCE_IO;
@@ -168,8 +167,7 @@ sn_pci_window_fixup(struct pci_dev *dev, unsigned int count,
 	idx = controller->windows;
 	new_count = controller->windows + count;
 	new_window = kcalloc(new_count, sizeof(struct pci_window), GFP_KERNEL);
-	if (new_window == NULL)
-		BUG();
+	BUG_ON(new_window == NULL);
 	if (controller->window) {
 		memcpy(new_window, controller->window,
 		       sizeof(struct pci_window) * controller->windows);
@@ -222,8 +220,7 @@ sn_io_slot_fixup(struct pci_dev *dev)
 		(u64) __pa(pcidev_info),
 		(u64) __pa(sn_irq_info));
 
-	if (status)
-		BUG(); /* Cannot get platform pci device information */
+	BUG_ON(status); /* Cannot get platform pci device information */
 
 
 	/* Copy over PIO Mapped Addresses */
@@ -307,8 +304,7 @@ sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus)
 	prom_bussoft_ptr = __va(prom_bussoft_ptr);
 
 	controller = kzalloc(sizeof(*controller), GFP_KERNEL);
-	if (!controller)
-		BUG();
+	BUG_ON(!controller);
 	controller->segment = segment;
 
 	/*
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 12097776afc..e456f062f24 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -732,8 +732,7 @@ void __init build_cnode_tables(void)
 		kl_config_hdr_t *klgraph_header;
 		nasid = cnodeid_to_nasid(node);
 		klgraph_header = ia64_sn_get_klconfig_addr(nasid);
-		if (klgraph_header == NULL)
-			BUG();
+		BUG_ON(klgraph_header == NULL);
 		brd = NODE_OFFSET_TO_LBOARD(nasid, klgraph_header->ch_board_info);
 		while (brd) {
 			if (board_needs_cnode(brd->brd_type) && physical_node_map[brd->brd_nasid] < 0) {
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 45f3c239042..9e6491cf72b 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -275,8 +275,7 @@ static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objb
 
 	/* get it's interconnect topology */
 	sz = op->ports * sizeof(struct sn_hwperf_port_info);
-	if (sz > sizeof(ptdata))
-		BUG();
+	BUG_ON(sz > sizeof(ptdata));
 	e = ia64_sn_hwperf_op(sn_hwperf_master_nasid,
 			      SN_HWPERF_ENUM_PORTS, nodeobj->id, sz,
 			      (u64)&ptdata, 0, 0, NULL);
@@ -310,8 +309,7 @@ static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objb
 	if (router && (!found_cpu || !found_mem)) {
 		/* search for a node connected to the same router */
 		sz = router->ports * sizeof(struct sn_hwperf_port_info);
-		if (sz > sizeof(ptdata))
-			BUG();
+		BUG_ON(sz > sizeof(ptdata));
 		e = ia64_sn_hwperf_op(sn_hwperf_master_nasid,
 				      SN_HWPERF_ENUM_PORTS, router->id, sz,
 				      (u64)&ptdata, 0, 0, NULL);
diff --git a/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
index 060df4aa991..c659ad5613a 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_dma.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
@@ -256,9 +256,7 @@ void sn_dma_flush(u64 addr)
 
 	hubinfo = (NODEPDA(nasid_to_cnodeid(nasid)))->pdinfo;
 
-	if (!hubinfo) {
-		BUG();
-	}
+	BUG_ON(!hubinfo);
 
 	flush_nasid_list = &hubinfo->hdi_flush_nasid_list;
 	if (flush_nasid_list->widget_p == NULL)
-- 
cgit v1.2.3


From ad5b365c1266b0c9e8e254a3c1cc4ef66bf33cba Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 28 Mar 2009 19:55:20 +0000
Subject: NSM: Fix unaligned accesses in nsm_init_private()

This fixes unaligned accesses in nsm_init_private() when
creating nlm_reboot keys.

Signed-off-by: Mans Rullgard <mans@mansr.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 5e2c4d5ac82..6d5d4a4169e 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -16,6 +16,8 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
 
+#include <asm/unaligned.h>
+
 #define NLMDBG_FACILITY		NLMDBG_MONITOR
 #define NSM_PROGRAM		100024
 #define NSM_VERSION		1
@@ -274,10 +276,12 @@ static void nsm_init_private(struct nsm_handle *nsm)
 {
 	u64 *p = (u64 *)&nsm->sm_priv.data;
 	struct timespec ts;
+	s64 ns;
 
 	ktime_get_ts(&ts);
-	*p++ = timespec_to_ns(&ts);
-	*p = (unsigned long)nsm;
+	ns = timespec_to_ns(&ts);
+	put_unaligned(ns, p);
+	put_unaligned((unsigned long)nsm, p + 1);
 }
 
 static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
-- 
cgit v1.2.3


From c69da774b28e01e062e0a3aba7509f2dcfd2a11a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 30 Mar 2009 18:59:17 -0400
Subject: SUNRPC: Ensure IPV6_V6ONLY is set on the socket before binding to a
 port

Also ensure that we use the protocol family instead of the address
family when calling sock_create_kern().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/svcsock.c | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index ac6cd65220c..9d504234af4 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1110,7 +1110,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	struct svc_sock	*svsk;
 	struct sock	*inet;
 	int		pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
-	int		val;
 
 	dprintk("svc: svc_setup_socket %p\n", sock);
 	if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1143,16 +1142,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	else
 		svc_tcp_init(svsk, serv);
 
-	/*
-	 * If this is a PF_INET6 listener, we want to avoid
-	 * getting requests from IPv4 remotes.  Those should
-	 * be shunted to a PF_INET listener via rpcbind.
-	 */
-	val = 1;
-	if (inet->sk_family == PF_INET6)
-		kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
-					(char *)&val, sizeof(val));
-
 	dprintk("svc: svc_setup_socket created %p (inet %p)\n",
 				svsk, svsk->sk_sk);
 
@@ -1220,6 +1209,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
 	struct sockaddr_storage addr;
 	struct sockaddr *newsin = (struct sockaddr *)&addr;
 	int		newlen;
+	int		family;
+	int		val;
 	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
 
 	dprintk("svc: svc_create_socket(%s, %d, %s)\n",
@@ -1231,14 +1222,35 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
 				"sockets supported\n");
 		return ERR_PTR(-EINVAL);
 	}
+
 	type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
+	switch (sin->sa_family) {
+	case AF_INET6:
+		family = PF_INET6;
+		break;
+	case AF_INET:
+		family = PF_INET;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
 
-	error = sock_create_kern(sin->sa_family, type, protocol, &sock);
+	error = sock_create_kern(family, type, protocol, &sock);
 	if (error < 0)
 		return ERR_PTR(error);
 
 	svc_reclassify_socket(sock);
 
+	/*
+	 * If this is an PF_INET6 listener, we want to avoid
+	 * getting requests from IPv4 remotes.  Those should
+	 * be shunted to a PF_INET listener via rpcbind.
+	 */
+	val = 1;
+	if (family == PF_INET6)
+		kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
+					(char *)&val, sizeof(val));
+
 	if (type == SOCK_STREAM)
 		sock->sk->sk_reuse = 1;		/* allow address reuse */
 	error = kernel_bind(sock, sin, len);
-- 
cgit v1.2.3


From cd670599b7b00d9263f6f11a05c0edeb9cbedaf3 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 1 Apr 2009 11:35:00 -0700
Subject: x86, setup: guard against pre-ACPI 3 e820 code not updating %ecx

Impact: BIOS bug safety

For pre-ACPI 3 BIOSes, pre-initialize the end of the e820 buffer just
in case the BIOS returns an unchanged %ecx but without actually
touching the ACPI 3 extended flags field.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/memory.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index d5d2360763d..5054c2ddd1a 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -31,6 +31,12 @@ static int detect_memory_e820(void)
 	struct e820entry *desc = boot_params.e820_map;
 	static struct e820_ext_entry buf; /* static so it is zeroed */
 
+	/*
+	 * Set this here so that if the BIOS doesn't change this field
+	 * but still doesn't change %ecx, we're still okay...
+	 */
+	buf.ext_flags = 1;
+
 	do {
 		size = sizeof buf;
 
-- 
cgit v1.2.3


From 972dd435fb2c5da173a4c8dea44ccb8748e36d35 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 1 Apr 2009 12:46:15 -0700
Subject: qeth: properly delete empty files.

Commit 64ef8957986f6a04f61e7c95fa6ffeb3a86a6661 ("qeth: remove EDDP")
removed the qeth_core_offl.[hc] files, but ended up doing so by just
patching them to zero size, rather than removing them properly.

Actually remove the files.

Reported-by: Andrew Price <andy@andrewprice.me.uk>
Cc: Frank Blaschka <frank.blaschka@de.ibm.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/s390/net/qeth_core_offl.c | 0
 drivers/s390/net/qeth_core_offl.h | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 drivers/s390/net/qeth_core_offl.c
 delete mode 100644 drivers/s390/net/qeth_core_offl.h

diff --git a/drivers/s390/net/qeth_core_offl.c b/drivers/s390/net/qeth_core_offl.c
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/drivers/s390/net/qeth_core_offl.h b/drivers/s390/net/qeth_core_offl.h
deleted file mode 100644
index e69de29bb2d..00000000000
-- 
cgit v1.2.3


From 833bb3046b6cb320e775ea2160ddca87d53260d5 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 2 Apr 2009 01:30:04 +0400
Subject: serial: fixup /proc/tty/driver/serial after proc_fops conversion

"struct tty_driver *" lies in m->private not in v which is
SEQ_TOKEN_START which is 1 which is enough to trigger NULL dereference
next line:

	BUG: unable to handle kernel NULL pointer dereference at 000000ad
	IP: [<c040d689>] uart_proc_show+0xe/0x2b0

Noticed by Linus.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/serial_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index bf3c0e32a33..b0bb29d804a 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -1765,7 +1765,7 @@ static void uart_line_info(struct seq_file *m, struct uart_driver *drv, int i)
 
 static int uart_proc_show(struct seq_file *m, void *v)
 {
-	struct tty_driver *ttydrv = v;
+	struct tty_driver *ttydrv = m->private;
 	struct uart_driver *drv = ttydrv->driver_state;
 	int i;
 
-- 
cgit v1.2.3