60 files changed, 995 insertions, 18 deletions
diff --git a/include/asm-alpha/atomic.h b/include/asm-alpha/atomic.h
index cb03bbe92cd..fc77f741308 100644
--- a/include/asm-alpha/atomic.h
+++ b/include/asm-alpha/atomic.h
@@ -176,6 +176,7 @@ static __inline__ long atomic64_sub_return(long i, atomic64_t * v)
 }
 
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 #define atomic_add_unless(v, a, u)				\
 ({								\
diff --git a/include/asm-alpha/mutex.h b/include/asm-alpha/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-alpha/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-arm/atomic.h b/include/asm-arm/atomic.h
index f72b63309bc..3d7283d8440 100644
--- a/include/asm-arm/atomic.h
+++ b/include/asm-arm/atomic.h
@@ -175,6 +175,8 @@ static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
 
 #endif /* __LINUX_ARM_ARCH__ */
 
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+
 static inline int atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
diff --git a/include/asm-arm/mutex.h b/include/asm-arm/mutex.h
new file mode 100644
index 00000000000..6caa59f1f59
--- /dev/null
+++ b/include/asm-arm/mutex.h
@@ -0,0 +1,128 @@
+/*
+ * include/asm-arm/mutex.h
+ *
+ * ARM optimized mutex locking primitives
+ *
+ * Please look into asm-generic/mutex-xchg.h for a formal definition.
+ */
+#ifndef _ASM_MUTEX_H
+#define _ASM_MUTEX_H
+
+#if __LINUX_ARM_ARCH__ < 6
+/* On pre-ARMv6 hardware the swp based implementation is the most efficient. */
+# include <asm-generic/mutex-xchg.h>
+#else
+
+/*
+ * Attempting to lock a mutex on ARMv6+ can be done with a bastardized
+ * atomic decrement (it is not a reliable atomic decrement but it satisfies
+ * the defined semantics for our purpose, while being smaller and faster
+ * than a real atomic decrement or atomic swap.  The idea is to attempt
+ * decrementing the lock value only once.  If once decremented it isn't zero,
+ * or if its store-back fails due to a dispute on the exclusive store, we
+ * simply bail out immediately through the slow path where the lock will be
+ * reattempted until it succeeds.
+ */
+#define __mutex_fastpath_lock(count, fail_fn)				\
+do {									\
+	int __ex_flag, __res;						\
+									\
+	typecheck(atomic_t *, count);					\
+	typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);		\
+									\
+	__asm__ (							\
+		"ldrex	%0, [%2]	\n"				\
+		"sub	%0, %0, #1	\n"				\
+		"strex	%1, %0, [%2]	\n"				\
+									\
+		: "=&r" (__res), "=&r" (__ex_flag)			\
+		: "r" (&(count)->counter)				\
+		: "cc","memory" );					\
+									\
+	if (unlikely(__res || __ex_flag))				\
+		fail_fn(count);						\
+} while (0)
+
+#define __mutex_fastpath_lock_retval(count, fail_fn)			\
+({									\
+	int __ex_flag, __res;						\
+									\
+	typecheck(atomic_t *, count);					\
+	typecheck_fn(fastcall int (*)(atomic_t *), fail_fn);		\
+									\
+	__asm__ (							\
+		"ldrex	%0, [%2]	\n"				\
+		"sub	%0, %0, #1	\n"				\
+		"strex	%1, %0, [%2]	\n"				\
+									\
+		: "=&r" (__res), "=&r" (__ex_flag)			\
+		: "r" (&(count)->counter)				\
+		: "cc","memory" );					\
+									\
+	__res |= __ex_flag;						\
+	if (unlikely(__res != 0))					\
+		__res = fail_fn(count);					\
+	__res;								\
+})
+
+/*
+ * Same trick is used for the unlock fast path. However the original value,
+ * rather than the result, is used to test for success in order to have
+ * better generated assembly.
+ */
+#define __mutex_fastpath_unlock(count, fail_fn)				\
+do {									\
+	int __ex_flag, __res, __orig;					\
+									\
+	typecheck(atomic_t *, count);					\
+	typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);		\
+									\
+	__asm__ (							\
+		"ldrex	%0, [%3]	\n"				\
+		"add	%1, %0, #1	\n"				\
+		"strex	%2, %1, [%3]	\n"				\
+									\
+		: "=&r" (__orig), "=&r" (__res), "=&r" (__ex_flag)	\
+		: "r" (&(count)->counter)				\
+		: "cc","memory" );					\
+									\
+	if (unlikely(__orig || __ex_flag))				\
+		fail_fn(count);						\
+} while (0)
+
+/*
+ * If the unlock was done on a contended lock, or if the unlock simply fails
+ * then the mutex remains locked.
+ */
+#define __mutex_slowpath_needs_to_unlock()	1
+
+/*
+ * For __mutex_fastpath_trylock we use another construct which could be
+ * described as a "single value cmpxchg".
+ *
+ * This provides the needed trylock semantics like cmpxchg would, but it is
+ * lighter and less generic than a true cmpxchg implementation.
+ */
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	int __ex_flag, __res, __orig;
+
+	__asm__ (
+
+		"1: ldrex	%0, [%3]	\n"
+		"subs		%1, %0, #1	\n"
+		"strexeq	%2, %1, [%3]	\n"
+		"movlt		%0, #0		\n"
+		"cmpeq		%2, #0		\n"
+		"bgt		1b		\n"
+
+		: "=&r" (__orig), "=&r" (__res), "=&r" (__ex_flag)
+		: "r" (&count->counter)
+		: "cc", "memory" );
+
+	return __orig;
+}
+
+#endif
+#endif
diff --git a/include/asm-arm26/atomic.h b/include/asm-arm26/atomic.h
index 3074b0e7634..1552c865399 100644
--- a/include/asm-arm26/atomic.h
+++ b/include/asm-arm26/atomic.h
@@ -76,6 +76,8 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+
 static inline int atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int ret;
diff --git a/include/asm-cris/atomic.h b/include/asm-cris/atomic.h
index 2df2c7aa19b..0b51a87e553 100644
--- a/include/asm-cris/atomic.h
+++ b/include/asm-cris/atomic.h
@@ -136,6 +136,8 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+
 static inline int atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int ret;
diff --git a/include/asm-cris/mutex.h b/include/asm-cris/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-cris/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-frv/atomic.h b/include/asm-frv/atomic.h
index 9c9e9499cfd..a59f684b4f3 100644
--- a/include/asm-frv/atomic.h
+++ b/include/asm-frv/atomic.h
@@ -328,6 +328,7 @@ extern uint32_t __cmpxchg_32(uint32_t *v, uint32_t test, uint32_t new);
 #endif
 
 #define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), old, new))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 #define atomic_add_unless(v, a, u)				\
 ({								\
diff --git a/include/asm-frv/mutex.h b/include/asm-frv/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-frv/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-generic/mutex-dec.h b/include/asm-generic/mutex-dec.h
new file mode 100644
index 00000000000..74b18cda169
--- /dev/null
+++ b/include/asm-generic/mutex-dec.h
@@ -0,0 +1,110 @@
+/*
+ * asm-generic/mutex-dec.h
+ *
+ * Generic implementation of the mutex fastpath, based on atomic
+ * decrement/increment.
+ */
+#ifndef _ASM_GENERIC_MUTEX_DEC_H
+#define _ASM_GENERIC_MUTEX_DEC_H
+
+/**
+ *  __mutex_fastpath_lock - try to take the lock by moving the count
+ *                          from 1 to a 0 value
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function MUST leave the value lower than
+ * 1 even when the "1" assertion wasn't true.
+ */
+#define __mutex_fastpath_lock(count, fail_fn)				\
+do {									\
+	if (unlikely(atomic_dec_return(count) < 0))			\
+		fail_fn(count);						\
+	else								\
+		smp_mb();						\
+} while (0)
+
+/**
+ *  __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ *                                 from 1 to a 0 value
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns.
+ */
+static inline int
+__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (unlikely(atomic_dec_return(count) < 0))
+		return fail_fn(count);
+	else {
+		smp_mb();
+		return 0;
+	}
+}
+
+/**
+ *  __mutex_fastpath_unlock - try to promote the count from 0 to 1
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 0
+ *
+ * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value to
+ * 1, or to set it to a value lower than 1.
+ *
+ * If the implementation sets it to a value of lower than 1, then the
+ * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs
+ * to return 0 otherwise.
+ */
+#define __mutex_fastpath_unlock(count, fail_fn)				\
+do {									\
+	smp_mb();							\
+	if (unlikely(atomic_inc_return(count) <= 0))			\
+		fail_fn(count);						\
+} while (0)
+
+#define __mutex_slowpath_needs_to_unlock()		1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: fallback function
+ *
+ * Change the count from 1 to a value lower than 1, and return 0 (failure)
+ * if it wasn't 1 originally, or return 1 (success) otherwise. This function
+ * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
+ * Additionally, if the value was < 0 originally, this function must not leave
+ * it to 0 on failure.
+ *
+ * If the architecture has no effective trylock variant, it should call the
+ * <fail_fn> spinlock-based trylock variant unconditionally.
+ */
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	/*
+	 * We have two variants here. The cmpxchg based one is the best one
+	 * because it never induce a false contention state.  It is included
+	 * here because architectures using the inc/dec algorithms over the
+	 * xchg ones are much more likely to support cmpxchg natively.
+	 *
+	 * If not we fall back to the spinlock based variant - that is
+	 * just as efficient (and simpler) as a 'destructive' probing of
+	 * the mutex state would be.
+	 */
+#ifdef __HAVE_ARCH_CMPXCHG
+	if (likely(atomic_cmpxchg(count, 1, 0)) == 1) {
+		smp_mb();
+		return 1;
+	}
+	return 0;
+#else
+	return fail_fn(count);
+#endif
+}
+
+#endif
diff --git a/include/asm-generic/mutex-null.h b/include/asm-generic/mutex-null.h
new file mode 100644
index 00000000000..5cf8b7ce0c4
--- /dev/null
+++ b/include/asm-generic/mutex-null.h
@@ -0,0 +1,24 @@
+/*
+ * asm-generic/mutex-null.h
+ *
+ * Generic implementation of the mutex fastpath, based on NOP :-)
+ *
+ * This is used by the mutex-debugging infrastructure, but it can also
+ * be used by architectures that (for whatever reason) want to use the
+ * spinlock based slowpath.
+ */
+#ifndef _ASM_GENERIC_MUTEX_NULL_H
+#define _ASM_GENERIC_MUTEX_NULL_H
+
+/* extra parameter only needed for mutex debugging: */
+#ifndef __IP__
+# define __IP__
+#endif
+
+#define __mutex_fastpath_lock(count, fail_fn)	      fail_fn(count __RET_IP__)
+#define __mutex_fastpath_lock_retval(count, fail_fn)  fail_fn(count __RET_IP__)
+#define __mutex_fastpath_unlock(count, fail_fn)       fail_fn(count __RET_IP__)
+#define __mutex_fastpath_trylock(count, fail_fn)      fail_fn(count)
+#define __mutex_slowpath_needs_to_unlock()	      1
+
+#endif
diff --git a/include/asm-generic/mutex-xchg.h b/include/asm-generic/mutex-xchg.h
new file mode 100644
index 00000000000..1d24f47e6c4
--- /dev/null
+++ b/include/asm-generic/mutex-xchg.h
@@ -0,0 +1,117 @@
+/*
+ * asm-generic/mutex-xchg.h
+ *
+ * Generic implementation of the mutex fastpath, based on xchg().
+ *
+ * NOTE: An xchg based implementation is less optimal than an atomic
+ *       decrement/increment based implementation. If your architecture
+ *       has a reasonable atomic dec/inc then you should probably use
+ *	 asm-generic/mutex-dec.h instead, or you could open-code an
+ *	 optimized version in asm/mutex.h.
+ */
+#ifndef _ASM_GENERIC_MUTEX_XCHG_H
+#define _ASM_GENERIC_MUTEX_XCHG_H
+
+/**
+ *  __mutex_fastpath_lock - try to take the lock by moving the count
+ *                          from 1 to a 0 value
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if it
+ * wasn't 1 originally. This function MUST leave the value lower than 1
+ * even when the "1" assertion wasn't true.
+ */
+#define __mutex_fastpath_lock(count, fail_fn)				\
+do {									\
+	if (unlikely(atomic_xchg(count, 0) != 1))			\
+		fail_fn(count);						\
+	else								\
+		smp_mb();						\
+} while (0)
+
+
+/**
+ *  __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ *                                 from 1 to a 0 value
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if it
+ * wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns
+ */
+static inline int
+__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (unlikely(atomic_xchg(count, 0) != 1))
+		return fail_fn(count);
+	else {
+		smp_mb();
+		return 0;
+	}
+}
+
+/**
+ *  __mutex_fastpath_unlock - try to promote the mutex from 0 to 1
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 0
+ *
+ * try to promote the mutex from 0 to 1. if it wasn't 0, call <function>
+ * In the failure case, this function is allowed to either set the value to
+ * 1, or to set it to a value lower than one.
+ * If the implementation sets it to a value of lower than one, the
+ * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs
+ * to return 0 otherwise.
+ */
+#define __mutex_fastpath_unlock(count, fail_fn)				\
+do {									\
+	smp_mb();							\
+	if (unlikely(atomic_xchg(count, 1) != 0))			\
+		fail_fn(count);						\
+} while (0)
+
+#define __mutex_slowpath_needs_to_unlock()		0
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: spinlock based trylock implementation
+ *
+ * Change the count from 1 to a value lower than 1, and return 0 (failure)
+ * if it wasn't 1 originally, or return 1 (success) otherwise. This function
+ * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
+ * Additionally, if the value was < 0 originally, this function must not leave
+ * it to 0 on failure.
+ *
+ * If the architecture has no effective trylock variant, it should call the
+ * <fail_fn> spinlock-based trylock variant unconditionally.
+ */
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	int prev = atomic_xchg(count, 0);
+
+	if (unlikely(prev < 0)) {
+		/*
+		 * The lock was marked contended so we must restore that
+		 * state. If while doing so we get back a prev value of 1
+		 * then we just own it.
+		 *
+		 * [ In the rare case of the mutex going to 1, to 0, to -1
+		 *   and then back to 0 in this few-instructions window,
+		 *   this has the potential to trigger the slowpath for the
+		 *   owner's unlock path needlessly, but that's not a problem
+		 *   in practice. ]
+		 */
+		prev = atomic_xchg(count, prev);
+		if (prev < 0)
+			prev = 0;
+	}
+	smp_mb();
+
+	return prev;
+}
+
+#endif
diff --git a/include/asm-h8300/atomic.h b/include/asm-h8300/atomic.h
index d891541e89c..21f54428c86 100644
--- a/include/asm-h8300/atomic.h
+++ b/include/asm-h8300/atomic.h
@@ -95,6 +95,8 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+
 static inline int atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int ret;
diff --git a/include/asm-h8300/mutex.h b/include/asm-h8300/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-h8300/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-i386/atomic.h b/include/asm-i386/atomic.h
index 7a5472d7709..de649d3aa2d 100644
--- a/include/asm-i386/atomic.h
+++ b/include/asm-i386/atomic.h
@@ -216,6 +216,7 @@ static __inline__ int atomic_sub_return(int i, atomic_t *v)
 }
 
 #define atomic_cmpxchg(v, old, new) ((int)cmpxchg(&((v)->counter), old, new))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 /**
  * atomic_add_unless - add unless the number is a given value
diff --git a/include/asm-i386/mutex.h b/include/asm-i386/mutex.h
new file mode 100644
index 00000000000..4e5e3de1b9a
--- /dev/null
+++ b/include/asm-i386/mutex.h
@@ -0,0 +1,124 @@
+/*
+ * Assembly implementation of the mutex fastpath, based on atomic
+ * decrement/increment.
+ *
+ * started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#ifndef _ASM_MUTEX_H
+#define _ASM_MUTEX_H
+
+/**
+ *  __mutex_fastpath_lock - try to take the lock by moving the count
+ *                          from 1 to a 0 value
+ *  @count: pointer of type atomic_t
+ *  @fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fn> if it
+ * wasn't 1 originally. This function MUST leave the value lower than 1
+ * even when the "1" assertion wasn't true.
+ */
+#define __mutex_fastpath_lock(count, fail_fn)				\
+do {									\
+	unsigned int dummy;						\
+									\
+	typecheck(atomic_t *, count);					\
+	typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);		\
+									\
+	__asm__ __volatile__(						\
+		LOCK	"   decl (%%eax)	\n"			\
+			"   js "#fail_fn"	\n"			\
+									\
+		:"=a" (dummy)						\
+		: "a" (count)						\
+		: "memory", "ecx", "edx");				\
+} while (0)
+
+
+/**
+ *  __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ *                                 from 1 to a 0 value
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if it
+ * wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns
+ */
+static inline int
+__mutex_fastpath_lock_retval(atomic_t *count,
+			     int fastcall (*fail_fn)(atomic_t *))
+{
+	if (unlikely(atomic_dec_return(count) < 0))
+		return fail_fn(count);
+	else
+		return 0;
+}
+
+/**
+ *  __mutex_fastpath_unlock - try to promote the mutex from 0 to 1
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 0
+ *
+ * try to promote the mutex from 0 to 1. if it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value
+ * to 1, or to set it to a value lower than 1.
+ *
+ * If the implementation sets it to a value of lower than 1, the
+ * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs
+ * to return 0 otherwise.
+ */
+#define __mutex_fastpath_unlock(count, fail_fn)				\
+do {									\
+	unsigned int dummy;						\
+									\
+	typecheck(atomic_t *, count);					\
+	typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);		\
+									\
+	__asm__ __volatile__(						\
+		LOCK	"   incl (%%eax)	\n"			\
+			"   jle "#fail_fn"	\n"			\
+									\
+		:"=a" (dummy)						\
+		: "a" (count)						\
+		: "memory", "ecx", "edx");				\
+} while (0)
+
+#define __mutex_slowpath_needs_to_unlock()	1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: fallback function
+ *
+ * Change the count from 1 to a value lower than 1, and return 0 (failure)
+ * if it wasn't 1 originally, or return 1 (success) otherwise. This function
+ * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
+ * Additionally, if the value was < 0 originally, this function must not leave
+ * it to 0 on failure.
+ */
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	/*
+	 * We have two variants here. The cmpxchg based one is the best one
+	 * because it never induce a false contention state.  It is included
+	 * here because architectures using the inc/dec algorithms over the
+	 * xchg ones are much more likely to support cmpxchg natively.
+	 *
+	 * If not we fall back to the spinlock based variant - that is
+	 * just as efficient (and simpler) as a 'destructive' probing of
+	 * the mutex state would be.
+	 */
+#ifdef __HAVE_ARCH_CMPXCHG
+	if (likely(atomic_cmpxchg(count, 1, 0)) == 1)
+		return 1;
+	return 0;
+#else
+	return fail_fn(count);
+#endif
+}
+
+#endif
diff --git a/include/asm-ia64/atomic.h b/include/asm-ia64/atomic.h
index 15cf7984c48..d3e0dfa99e1 100644
--- a/include/asm-ia64/atomic.h
+++ b/include/asm-ia64/atomic.h
@@ -89,6 +89,7 @@ ia64_atomic64_sub (__s64 i, atomic64_t *v)
 }
 
 #define atomic_cmpxchg(v, old, new) ((int)cmpxchg(&((v)->counter), old, new))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 #define atomic_add_unless(v, a, u)				\
 ({								\
diff --git a/include/asm-ia64/mutex.h b/include/asm-ia64/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-ia64/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-m32r/atomic.h b/include/asm-m32r/atomic.h
index 70761278b6c..3122fe106f0 100644
--- a/include/asm-m32r/atomic.h
+++ b/include/asm-m32r/atomic.h
@@ -243,6 +243,7 @@ static __inline__ int atomic_dec_return(atomic_t *v)
 #define atomic_add_negative(i,v) (atomic_add_return((i), (v)) < 0)
 
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 /**
  * atomic_add_unless - add unless the number is a given value
diff --git a/include/asm-m32r/mutex.h b/include/asm-m32r/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-m32r/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-m68k/atomic.h b/include/asm-m68k/atomic.h
index b8a4e75d679..a4a84d5c65d 100644
--- a/include/asm-m68k/atomic.h
+++ b/include/asm-m68k/atomic.h
@@ -140,6 +140,7 @@ static inline void atomic_set_mask(unsigned long mask, unsigned long *v)
 }
 
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 #define atomic_add_unless(v, a, u)				\
 ({								\
diff --git a/include/asm-m68k/mutex.h b/include/asm-m68k/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-m68k/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-m68knommu/atomic.h b/include/asm-m68knommu/atomic.h
index 1702dbe9318..6c4e4b63e45 100644
--- a/include/asm-m68knommu/atomic.h
+++ b/include/asm-m68knommu/atomic.h
@@ -129,6 +129,7 @@ static inline int atomic_sub_return(int i, atomic_t * v)
 }
 
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 #define atomic_add_unless(v, a, u)				\
 ({								\
diff --git a/include/asm-m68knommu/mutex.h b/include/asm-m68knommu/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-m68knommu/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-mips/atomic.h b/include/asm-mips/atomic.h
index 92256e43a93..94a95872d72 100644
--- a/include/asm-mips/atomic.h
+++ b/include/asm-mips/atomic.h
@@ -289,6 +289,7 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v)
 }
 
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 /**
  * atomic_add_unless - add unless the number is a given value
diff --git a/include/asm-mips/mutex.h b/include/asm-mips/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-mips/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-parisc/atomic.h b/include/asm-parisc/atomic.h
index 64ebd086c40..2ca56d34aaa 100644
--- a/include/asm-parisc/atomic.h
+++ b/include/asm-parisc/atomic.h
@@ -165,6 +165,7 @@ static __inline__ int atomic_read(const atomic_t *v)
 
 /* exported interface */
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 /**
  * atomic_add_unless - add unless the number is a given value
diff --git a/include/asm-parisc/mutex.h b/include/asm-parisc/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-parisc/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-powerpc/atomic.h b/include/asm-powerpc/atomic.h
index ae395a0632a..248f9aec959 100644
--- a/include/asm-powerpc/atomic.h
+++ b/include/asm-powerpc/atomic.h
@@ -165,6 +165,7 @@ static __inline__ int atomic_dec_return(atomic_t *v)
 }
 
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 /**
  * atomic_add_unless - add unless the number is a given value
diff --git a/include/asm-powerpc/mutex.h b/include/asm-powerpc/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-powerpc/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-s390/atomic.h b/include/asm-s390/atomic.h
index d82aedf616f..be6fefe223d 100644
--- a/include/asm-s390/atomic.h
+++ b/include/asm-s390/atomic.h
@@ -75,6 +75,8 @@ static __inline__ void atomic_set_mask(unsigned long mask, atomic_t * v)
 	       __CS_LOOP(v, mask, "or");
 }
 
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+
 static __inline__ int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
 	__asm__ __volatile__("  cs   %0,%3,0(%2)\n"
diff --git a/include/asm-s390/mutex.h b/include/asm-s390/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-s390/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-sh/atomic.h b/include/asm-sh/atomic.h
index 618d8e0de34..fb627de217f 100644
--- a/include/asm-sh/atomic.h
+++ b/include/asm-sh/atomic.h
@@ -101,6 +101,8 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+
 static inline int atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int ret;
diff --git a/include/asm-sh/mutex.h b/include/asm-sh/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-sh/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-sh64/atomic.h b/include/asm-sh64/atomic.h
index f3ce5c0df13..28f2ea9b567 100644
--- a/include/asm-sh64/atomic.h
+++ b/include/asm-sh64/atomic.h
@@ -113,6 +113,8 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+
 static inline int atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int ret;
diff --git a/include/asm-sh64/mutex.h b/include/asm-sh64/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-sh64/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-sparc/atomic.h b/include/asm-sparc/atomic.h
index accb4967e9d..e1033170bd3 100644
--- a/include/asm-sparc/atomic.h
+++ b/include/asm-sparc/atomic.h
@@ -20,6 +20,7 @@ typedef struct { volatile int counter; } atomic_t;
 
 extern int __atomic_add_return(int, atomic_t *);
 extern int atomic_cmpxchg(atomic_t *, int, int);
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 extern int atomic_add_unless(atomic_t *, int, int);
 extern void atomic_set(atomic_t *, int);
 
diff --git a/include/asm-sparc/mutex.h b/include/asm-sparc/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-sparc/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-sparc64/atomic.h b/include/asm-sparc64/atomic.h
index 11f5aa5d108..25256bdc8aa 100644
--- a/include/asm-sparc64/atomic.h
+++ b/include/asm-sparc64/atomic.h
@@ -72,6 +72,7 @@ extern int atomic64_sub_ret(int, atomic64_t *);
 #define atomic64_add_negative(i, v) (atomic64_add_ret(i, v) < 0)
 
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 #define atomic_add_unless(v, a, u)				\
 ({								\
diff --git a/include/asm-sparc64/mutex.h b/include/asm-sparc64/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-sparc64/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-um/mutex.h b/include/asm-um/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-um/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-v850/atomic.h b/include/asm-v850/atomic.h
index f5b9ab6f4e7..166df00457e 100644
--- a/include/asm-v850/atomic.h
+++ b/include/asm-v850/atomic.h
@@ -104,6 +104,8 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+
 static inline int atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int ret;
diff --git a/include/asm-v850/mutex.h b/include/asm-v850/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-v850/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-x86_64/atomic.h b/include/asm-x86_64/atomic.h
index 72eb071488c..6b540237a2f 100644
--- a/include/asm-x86_64/atomic.h
+++ b/include/asm-x86_64/atomic.h
@@ -389,6 +389,7 @@ static __inline__ long atomic64_sub_return(long i, atomic64_t *v)
 #define atomic64_dec_return(v)  (atomic64_sub_return(1,v))
 
 #define atomic_cmpxchg(v, old, new) ((int)cmpxchg(&((v)->counter), old, new))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 /**
  * atomic_add_unless - add unless the number is a given value
diff --git a/include/asm-x86_64/mutex.h b/include/asm-x86_64/mutex.h
new file mode 100644
index 00000000000..818abfd262d
--- /dev/null
+++ b/include/asm-x86_64/mutex.h
@@ -0,0 +1,113 @@
+/*
+ * Assembly implementation of the mutex fastpath, based on atomic
+ * decrement/increment.
+ *
+ * started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#ifndef _ASM_MUTEX_H
+#define _ASM_MUTEX_H
+
+/**
+ * __mutex_fastpath_lock - decrement and call function if negative
+ * @v: pointer of type atomic_t
+ * @fail_fn: function to call if the result is negative
+ *
+ * Atomically decrements @v and calls <fail_fn> if the result is negative.
+ */
+#define __mutex_fastpath_lock(v, fail_fn)				\
+do {									\
+	unsigned long dummy;						\
+									\
+	typecheck(atomic_t *, v);					\
+	typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);		\
+									\
+	__asm__ __volatile__(						\
+		LOCK	"   decl (%%rdi)	\n"			\
+			"   js 2f		\n"			\
+			"1:			\n"			\
+									\
+		LOCK_SECTION_START("")					\
+			"2: call "#fail_fn"	\n"			\
+			"   jmp 1b		\n"			\
+		LOCK_SECTION_END					\
+									\
+		:"=D" (dummy)						\
+		: "D" (v)						\
+		: "rax", "rsi", "rdx", "rcx",				\
+		  "r8", "r9", "r10", "r11", "memory");			\
+} while (0)
+
+/**
+ *  __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ *                                 from 1 to a 0 value
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns
+ */
+static inline int
+__mutex_fastpath_lock_retval(atomic_t *count,
+			     int fastcall (*fail_fn)(atomic_t *))
+{
+	if (unlikely(atomic_dec_return(count) < 0))
+		return fail_fn(count);
+	else
+		return 0;
+}
+
+/**
+ * __mutex_fastpath_unlock - increment and call function if nonpositive
+ * @v: pointer of type atomic_t
+ * @fail_fn: function to call if the result is nonpositive
+ *
+ * Atomically increments @v and calls <fail_fn> if the result is nonpositive.
+ */
+#define __mutex_fastpath_unlock(v, fail_fn)				\
+do {									\
+	unsigned long dummy;						\
+									\
+	typecheck(atomic_t *, v);					\
+	typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);		\
+									\
+	__asm__ __volatile__(						\
+		LOCK	"   incl (%%rdi)	\n"			\
+			"   jle 2f		\n"			\
+			"1:			\n"			\
+									\
+		LOCK_SECTION_START("")					\
+			"2: call "#fail_fn"	\n"			\
+			"   jmp 1b		\n"			\
+		LOCK_SECTION_END					\
+									\
+		:"=D" (dummy)						\
+		: "D" (v)						\
+		: "rax", "rsi", "rdx", "rcx",				\
+		  "r8", "r9", "r10", "r11", "memory");			\
+} while (0)
+
+#define __mutex_slowpath_needs_to_unlock()	1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: fallback function
+ *
+ * Change the count from 1 to 0 and return 1 (success), or return 0 (failure)
+ * if it wasn't 1 originally. [the fallback function is never used on
+ * x86_64, because all x86_64 CPUs have a CMPXCHG instruction.]
+ */
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (likely(atomic_cmpxchg(count, 1, 0)) == 1)
+		return 1;
+	else
+		return 0;
+}
+
+#endif
diff --git a/include/asm-xtensa/atomic.h b/include/asm-xtensa/atomic.h
index e2ce06b101a..fe105a12392 100644
--- a/include/asm-xtensa/atomic.h
+++ b/include/asm-xtensa/atomic.h
@@ -224,6 +224,7 @@ static inline int atomic_sub_return(int i, atomic_t * v)
 #define atomic_add_negative(i,v) (atomic_add_return((i),(v)) < 0)
 
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 /**
  * atomic_add_unless - add unless the number is a given value
diff --git a/include/asm-xtensa/mutex.h b/include/asm-xtensa/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-xtensa/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index 2914f7b0715..e71dd98dbca 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -87,7 +87,7 @@ struct ext3_inode_info {
 #ifdef CONFIG_EXT3_FS_XATTR
 	/*
 	 * Extended attributes can be read independently of the main file
-	 * data. Taking i_sem even when reading would cause contention
+	 * data. Taking i_mutex even when reading would cause contention
 	 * between readers of EAs and writers of regular file data, so
 	 * instead we synchronize on xattr_sem when reading or changing
 	 * EAs.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4c82219b0fa..92ae3e2067b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -219,6 +219,7 @@ extern int dir_notify_enable;
 #include <linux/prio_tree.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/mutex.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -484,7 +485,7 @@ struct inode {
 	unsigned long		i_blocks;
 	unsigned short          i_bytes;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
-	struct semaphore	i_sem;
+	struct mutex		i_mutex;
 	struct rw_semaphore	i_alloc_sem;
 	struct inode_operations	*i_op;
 	struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
@@ -820,7 +821,7 @@ struct super_block {
 	unsigned long		s_magic;
 	struct dentry		*s_root;
 	struct rw_semaphore	s_umount;
-	struct semaphore	s_lock;
+	struct mutex		s_lock;
 	int			s_count;
 	int			s_syncing;
 	int			s_need_sync_fs;
@@ -892,13 +893,13 @@ static inline int has_fs_excl(void)
 static inline void lock_super(struct super_block * sb)
 {
 	get_fs_excl();
-	down(&sb->s_lock);
+	mutex_lock(&sb->s_lock);
 }
 
 static inline void unlock_super(struct super_block * sb)
 {
 	put_fs_excl();
-	up(&sb->s_lock);
+	mutex_unlock(&sb->s_lock);
 }
 
 /*
@@ -1191,7 +1192,7 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc);
  *    directory.  The name should be stored in the @name (with the
  *    understanding that it is already pointing to a a %NAME_MAX+1 sized
  *    buffer.   get_name() should return %0 on success, a negative error code
- *    or error.  @get_name will be called without @parent->i_sem held.
+ *    or error.  @get_name will be called without @parent->i_mutex held.
  *
  * get_parent:
  *    @get_parent should find the parent directory for the given @child which
@@ -1213,7 +1214,7 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc);
  *    nfsd_find_fh_dentry() in either the @obj or @parent parameters.
  *
  * Locking rules:
- *    get_parent is called with child->d_inode->i_sem down
+ *    get_parent is called with child->d_inode->i_mutex down
  *    get_name is not (which is possibly inconsistent)
  */
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index ef8d0cbb832..9a8c05dbe4f 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -18,6 +18,7 @@
 #include <linux/bio.h>
 #include <linux/device.h>
 #include <linux/pci.h>
+#include <linux/completion.h>
 #include <asm/byteorder.h>
 #include <asm/system.h>
 #include <asm/io.h>
@@ -638,7 +639,7 @@ typedef struct ide_drive_s {
 	int		crc_count;	/* crc counter to reduce drive speed */
 	struct list_head list;
 	struct device	gendev;
-	struct semaphore gendev_rel_sem;	/* to deal with device release() */
+	struct completion gendev_rel_comp;	/* to deal with device release() */
 } ide_drive_t;
 
 #define to_ide_device(dev)container_of(dev, ide_drive_t, gendev)
@@ -794,7 +795,7 @@ typedef struct hwif_s {
 	unsigned	sg_mapped  : 1;	/* sg_table and sg_nents are ready */
 
 	struct device	gendev;
-	struct semaphore gendev_rel_sem; /* To deal with device release() */
+	struct completion gendev_rel_comp; /* To deal with device release() */
 
 	void		*hwif_data;	/* extra hwif data */
 
diff --git a/include/linux/jffs2_fs_i.h b/include/linux/jffs2_fs_i.h
index ef85ab56302..ad565bf9dcc 100644
--- a/include/linux/jffs2_fs_i.h
+++ b/include/linux/jffs2_fs_i.h
@@ -8,11 +8,11 @@
 #include <asm/semaphore.h>
 
 struct jffs2_inode_info {
-	/* We need an internal semaphore similar to inode->i_sem.
+	/* We need an internal mutex similar to inode->i_mutex.
 	   Unfortunately, we can't used the existing one, because
 	   either the GC would deadlock, or we'd have to release it
 	   before letting GC proceed. Or we'd have to put ugliness
-	   into the GC code so it didn't attempt to obtain the i_sem
+	   into the GC code so it didn't attempt to obtain the i_mutex
 	   for the inode(s) which are already locked */
 	struct semaphore sem;
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index ca7ff8fdd09..d0e6ca3b00e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -286,6 +286,15 @@ extern void dump_stack(void);
 	1; \
 })
 
+/*
+ * Check at compile time that 'function' is a certain type, or is a pointer
+ * to that type (needs to use typedef for the function type.)
+ */
+#define typecheck_fn(type,function) \
+({	typeof(type) __tmp = function; \
+	(void)__tmp; \
+})
+
 #endif /* __KERNEL__ */
 
 #define SI_LOAD_SHIFT	16
diff --git a/include/linux/loop.h b/include/linux/loop.h
index 40f63c9879d..f96506782eb 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -58,9 +58,9 @@ struct loop_device {
 	struct bio 		*lo_bio;
 	struct bio		*lo_biotail;
 	int			lo_state;
-	struct semaphore	lo_sem;
+	struct completion	lo_done;
+	struct completion	lo_bh_done;
 	struct semaphore	lo_ctl_mutex;
-	struct semaphore	lo_bh_mutex;
 	int			lo_pending;
 
 	request_queue_t		*lo_queue;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index df80e63903b..3f1fafc0245 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -13,6 +13,7 @@
 #include <linux/rbtree.h>
 #include <linux/prio_tree.h>
 #include <linux/fs.h>
+#include <linux/mutex.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1024,6 +1025,9 @@ static inline void vm_stat_account(struct mm_struct *mm,
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
 {
+	if (!PageHighMem(page) && !enable)
+		mutex_debug_check_no_locks_freed(page_address(page),
+						 page_address(page + numpages));
 }
 #endif
 
diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h
new file mode 100644
index 00000000000..0ccd8f983b5
--- /dev/null
+++ b/include/linux/mutex-debug.h
@@ -0,0 +1,21 @@
+#ifndef __LINUX_MUTEX_DEBUG_H
+#define __LINUX_MUTEX_DEBUG_H
+
+/*
+ * Mutexes - debugging helpers:
+ */
+
+#define __DEBUG_MUTEX_INITIALIZER(lockname) \
+	, .held_list = LIST_HEAD_INIT(lockname.held_list), \
+	  .name = #lockname , .magic = &lockname
+
+#define mutex_init(sem)		__mutex_init(sem, __FUNCTION__)
+
+extern void FASTCALL(mutex_destroy(struct mutex *lock));
+
+extern void mutex_debug_show_all_locks(void);
+extern void mutex_debug_show_held_locks(struct task_struct *filter);
+extern void mutex_debug_check_no_locks_held(struct task_struct *task);
+extern void mutex_debug_check_no_locks_freed(const void *from, const void *to);
+
+#endif
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
new file mode 100644
index 00000000000..9bce0fee68d
--- /dev/null
+++ b/include/linux/mutex.h
@@ -0,0 +1,119 @@
+/*
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains the main data structure and API definitions.
+ */
+#ifndef __LINUX_MUTEX_H
+#define __LINUX_MUTEX_H
+
+#include <linux/list.h>
+#include <linux/spinlock_types.h>
+
+#include <asm/atomic.h>
+
+/*
+ * Simple, straightforward mutexes with strict semantics:
+ *
+ * - only one task can hold the mutex at a time
+ * - only the owner can unlock the mutex
+ * - multiple unlocks are not permitted
+ * - recursive locking is not permitted
+ * - a mutex object must be initialized via the API
+ * - a mutex object must not be initialized via memset or copying
+ * - task may not exit with mutex held
+ * - memory areas where held locks reside must not be freed
+ * - held mutexes must not be reinitialized
+ * - mutexes may not be used in irq contexts
+ *
+ * These semantics are fully enforced when DEBUG_MUTEXES is
+ * enabled. Furthermore, besides enforcing the above rules, the mutex
+ * debugging code also implements a number of additional features
+ * that make lock debugging easier and faster:
+ *
+ * - uses symbolic names of mutexes, whenever they are printed in debug output
+ * - point-of-acquire tracking, symbolic lookup of function names
+ * - list of all locks held in the system, printout of them
+ * - owner tracking
+ * - detects self-recursing locks and prints out all relevant info
+ * - detects multi-task circular deadlocks and prints out all affected
+ *   locks and tasks (and only those tasks)
+ */
+struct mutex {
+	/* 1: unlocked, 0: locked, negative: locked, possible waiters */
+	atomic_t		count;
+	spinlock_t		wait_lock;
+	struct list_head	wait_list;
+#ifdef CONFIG_DEBUG_MUTEXES
+	struct thread_info	*owner;
+	struct list_head	held_list;
+	unsigned long		acquire_ip;
+	const char 		*name;
+	void			*magic;
+#endif
+};
+
+/*
+ * This is the control structure for tasks blocked on mutex,
+ * which resides on the blocked task's kernel stack:
+ */
+struct mutex_waiter {
+	struct list_head	list;
+	struct task_struct	*task;
+#ifdef CONFIG_DEBUG_MUTEXES
+	struct mutex		*lock;
+	void			*magic;
+#endif
+};
+
+#ifdef CONFIG_DEBUG_MUTEXES
+# include <linux/mutex-debug.h>
+#else
+# define __DEBUG_MUTEX_INITIALIZER(lockname)
+# define mutex_init(mutex)			__mutex_init(mutex, NULL)
+# define mutex_destroy(mutex)				do { } while (0)
+# define mutex_debug_show_all_locks()			do { } while (0)
+# define mutex_debug_show_held_locks(p)			do { } while (0)
+# define mutex_debug_check_no_locks_held(task)		do { } while (0)
+# define mutex_debug_check_no_locks_freed(from, to)	do { } while (0)
+#endif
+
+#define __MUTEX_INITIALIZER(lockname) \
+		{ .count = ATOMIC_INIT(1) \
+		, .wait_lock = SPIN_LOCK_UNLOCKED \
+		, .wait_list = LIST_HEAD_INIT(lockname.wait_list) \
+		__DEBUG_MUTEX_INITIALIZER(lockname) }
+
+#define DEFINE_MUTEX(mutexname) \
+	struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
+
+extern void fastcall __mutex_init(struct mutex *lock, const char *name);
+
+/***
+ * mutex_is_locked - is the mutex locked
+ * @lock: the mutex to be queried
+ *
+ * Returns 1 if the mutex is locked, 0 if unlocked.
+ */
+static inline int fastcall mutex_is_locked(struct mutex *lock)
+{
+	return atomic_read(&lock->count) != 1;
+}
+
+/*
+ * See kernel/mutex.c for detailed documentation of these APIs.
+ * Also see Documentation/mutex-design.txt.
+ */
+extern void fastcall mutex_lock(struct mutex *lock);
+extern int fastcall mutex_lock_interruptible(struct mutex *lock);
+/*
+ * NOTE: mutex_trylock() follows the spin_trylock() convention,
+ *       not the down_trylock() convention!
+ */
+extern int fastcall mutex_trylock(struct mutex *lock);
+extern void fastcall mutex_unlock(struct mutex *lock);
+
+#endif
diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h
index bb842ea4103..0798b7781a6 100644
--- a/include/linux/nfsd/nfsfh.h
+++ b/include/linux/nfsd/nfsfh.h
@@ -294,7 +294,7 @@ fill_post_wcc(struct svc_fh *fhp)
 /*
  * Lock a file handle/inode
  * NOTE: both fh_lock and fh_unlock are done "by hand" in
- * vfs.c:nfsd_rename as it needs to grab 2 i_sem's at once
+ * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once
  * so, any changes here should be reflected there.
  */
 static inline void
@@ -317,7 +317,7 @@ fh_lock(struct svc_fh *fhp)
 	}
 
 	inode = dentry->d_inode;
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	fill_pre_wcc(fhp);
 	fhp->fh_locked = 1;
 }
@@ -333,7 +333,7 @@ fh_unlock(struct svc_fh *fhp)
 
 	if (fhp->fh_locked) {
 		fill_post_wcc(fhp);
-		up(&fhp->fh_dentry->d_inode->i_sem);
+		mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
 		fhp->fh_locked = 0;
 	}
 }
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 1767073df26..b12e59c7575 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -37,7 +37,7 @@ struct pipe_inode_info {
    memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
 #define PIPE_SIZE		PAGE_SIZE
 
-#define PIPE_SEM(inode)		(&(inode).i_sem)
+#define PIPE_MUTEX(inode)	(&(inode).i_mutex)
 #define PIPE_WAIT(inode)	(&(inode).i_pipe->wait)
 #define PIPE_READERS(inode)	((inode).i_pipe->readers)
 #define PIPE_WRITERS(inode)	((inode).i_pipe->writers)
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 001ab82df05..e276c5ba2bb 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1857,7 +1857,7 @@ void padd_item(char *item, int total_length, int length);
 #define GET_BLOCK_CREATE 1	/* add anything you need to find block */
 #define GET_BLOCK_NO_HOLE 2	/* return -ENOENT for file holes */
 #define GET_BLOCK_READ_DIRECT 4	/* read the tail if indirect item not found */
-#define GET_BLOCK_NO_ISEM     8	/* i_sem is not held, don't preallocate */
+#define GET_BLOCK_NO_IMUX     8	/* i_mutex is not held, don't preallocate */
 #define GET_BLOCK_NO_DANGLE   16	/* don't leave any transactions running */
 
 int restart_transaction(struct reiserfs_transaction_handle *th,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78eb92ae4d9..85b53f87c70 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -817,6 +817,11 @@ struct task_struct {
 /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
 	spinlock_t proc_lock;
 
+#ifdef CONFIG_DEBUG_MUTEXES
+	/* mutex deadlock detection */
+	struct mutex_waiter *blocked_on;
+#endif
+
 /* journalling filesystem info */
 	void *journal_info;