From 6fd92b63d0626a8fe7eb8e2e50d19ecaa18cb412 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Sun, 9 Mar 2008 21:01:04 +0100
Subject: x86: change x86 to use generic find_next_bit

The versions with inline assembly are in fact slower on the machines I
tested them on (in userspace) (Athlon XP 2800+, p4-like Xeon 2.8GHz, AMD
Opteron 270). The i386-version needed a fix similar to 06024f21 to avoid
crashing the benchmark.

Benchmark using: gcc -fomit-frame-pointer -Os. For each bitmap size
1...512, for each possible bitmap with one bit set, for each possible
offset: find the position of the first bit starting at offset. If you
follow ;). Times include setup of the bitmap and checking of the
results.

		Athlon		Xeon		Opteron 32/64bit
x86-specific:	0m3.692s	0m2.820s	0m3.196s / 0m2.480s
generic:	0m2.622s	0m1.662s	0m2.100s / 0m1.572s

If the bitmap size is not a multiple of BITS_PER_LONG, and no set
(cleared) bit is found, find_next_bit (find_next_zero_bit) returns a
value outside of the range [0, size]. The generic version always returns
exactly size. The generic version also uses unsigned long everywhere,
while the x86 versions use a mishmash of int, unsigned (int), long and
unsigned long.

Using the generic version does give a slightly bigger kernel, though.

defconfig:	   text    data     bss     dec     hex filename
x86-specific:	4738555  481232  626688 5846475  5935cb vmlinux (32 bit)
generic:	4738621  481232  626688 5846541  59360d vmlinux (32 bit)
x86-specific:	5392395  846568  724424 6963387  6a40bb vmlinux (64 bit)
generic:	5392458  846568  724424 6963450  6a40fa vmlinux (64 bit)

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig         |  3 +++
 arch/x86/lib/Makefile    |  2 +-
 arch/x86/lib/bitops_32.c | 70 ------------------------------------------------
 arch/x86/lib/bitops_64.c | 68 ----------------------------------------------
 4 files changed, 4 insertions(+), 139 deletions(-)
 delete mode 100644 arch/x86/lib/bitops_32.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2fadf794483..5639de47ed4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -77,6 +77,9 @@ config GENERIC_BUG
 	def_bool y
 	depends on BUG
 
+config GENERIC_FIND_NEXT_BIT
+	def_bool y
+
 config GENERIC_HWEIGHT
 	def_bool y
 
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 25df1c1989f..436093299bd 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -11,7 +11,7 @@ lib-y += memcpy_$(BITS).o
 ifeq ($(CONFIG_X86_32),y)
         lib-y += checksum_32.o
         lib-y += strstr_32.o
-        lib-y += bitops_32.o semaphore_32.o string_32.o
+        lib-y += semaphore_32.o string_32.o
 
         lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
 else
diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c
deleted file mode 100644
index b6544045985..00000000000
--- a/arch/x86/lib/bitops_32.c
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <linux/bitops.h>
-#include <linux/module.h>
-
-/**
- * find_next_bit - find the next set bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-int find_next_bit(const unsigned long *addr, int size, int offset)
-{
-	const unsigned long *p = addr + (offset >> 5);
-	int set = 0, bit = offset & 31, res;
-
-	if (bit) {
-		/*
-		 * Look for nonzero in the first 32 bits:
-		 */
-		__asm__("bsfl %1,%0\n\t"
-			"jne 1f\n\t"
-			"movl $32, %0\n"
-			"1:"
-			: "=r" (set)
-			: "r" (*p >> bit));
-		if (set < (32 - bit))
-			return set + offset;
-		set = 32 - bit;
-		p++;
-	}
-	/*
-	 * No set bit yet, search remaining full words for a bit
-	 */
-	res = find_first_bit (p, size - 32 * (p - addr));
-	return (offset + set + res);
-}
-EXPORT_SYMBOL(find_next_bit);
-
-/**
- * find_next_zero_bit - find the first zero bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-int find_next_zero_bit(const unsigned long *addr, int size, int offset)
-{
-	const unsigned long *p = addr + (offset >> 5);
-	int set = 0, bit = offset & 31, res;
-
-	if (bit) {
-		/*
-		 * Look for zero in the first 32 bits.
-		 */
-		__asm__("bsfl %1,%0\n\t"
-			"jne 1f\n\t"
-			"movl $32, %0\n"
-			"1:"
-			: "=r" (set)
-			: "r" (~(*p >> bit)));
-		if (set < (32 - bit))
-			return set + offset;
-		set = 32 - bit;
-		p++;
-	}
-	/*
-	 * No zero yet, search remaining full bytes for a zero
-	 */
-	res = find_first_zero_bit(p, size - 32 * (p - addr));
-	return (offset + set + res);
-}
-EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
index 0e8f491e6cc..0eeb704d251 100644
--- a/arch/x86/lib/bitops_64.c
+++ b/arch/x86/lib/bitops_64.c
@@ -1,9 +1,7 @@
 #include <linux/bitops.h>
 
 #undef find_first_zero_bit
-#undef find_next_zero_bit
 #undef find_first_bit
-#undef find_next_bit
 
 static inline long
 __find_first_zero_bit(const unsigned long * addr, unsigned long size)
@@ -57,39 +55,6 @@ long find_first_zero_bit(const unsigned long * addr, unsigned long size)
 	return __find_first_zero_bit (addr, size);
 }
 
-/**
- * find_next_zero_bit - find the next zero bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-long find_next_zero_bit (const unsigned long * addr, long size, long offset)
-{
-	const unsigned long * p = addr + (offset >> 6);
-	unsigned long set = 0;
-	unsigned long res, bit = offset&63;
-
-	if (bit) {
-		/*
-		 * Look for zero in first word
-		 */
-		asm("bsfq %1,%0\n\t"
-		    "cmoveq %2,%0"
-		    : "=r" (set)
-		    : "r" (~(*p >> bit)), "r"(64L));
-		if (set < (64 - bit))
-			return set + offset;
-		set = 64 - bit;
-		p++;
-	}
-	/*
-	 * No zero yet, search remaining full words for a zero
-	 */
-	res = __find_first_zero_bit (p, size - 64 * (p - addr));
-
-	return (offset + set + res);
-}
-
 static inline long
 __find_first_bit(const unsigned long * addr, unsigned long size)
 {
@@ -136,40 +101,7 @@ long find_first_bit(const unsigned long * addr, unsigned long size)
 	return __find_first_bit(addr,size);
 }
 
-/**
- * find_next_bit - find the first set bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-long find_next_bit(const unsigned long * addr, long size, long offset)
-{
-	const unsigned long * p = addr + (offset >> 6);
-	unsigned long set = 0, bit = offset & 63, res;
-
-	if (bit) {
-		/*
-		 * Look for nonzero in the first 64 bits:
-		 */
-		asm("bsfq %1,%0\n\t"
-		    "cmoveq %2,%0\n\t"
-		    : "=r" (set)
-		    : "r" (*p >> bit), "r" (64L));
-		if (set < (64 - bit))
-			return set + offset;
-		set = 64 - bit;
-		p++;
-	}
-	/*
-	 * No set bit yet, search remaining full words for a bit
-	 */
-	res = __find_first_bit (p, size - 64 * (p - addr));
-	return (offset + set + res);
-}
-
 #include <linux/module.h>
 
-EXPORT_SYMBOL(find_next_bit);
 EXPORT_SYMBOL(find_first_bit);
 EXPORT_SYMBOL(find_first_zero_bit);
-EXPORT_SYMBOL(find_next_zero_bit);
-- 
cgit v1.2.3


From 12d9c8420b9daa1da3d9e090640fb24bcd0deba2 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Sat, 15 Mar 2008 13:04:42 +0100
Subject: x86: merge the simple bitops and move them to bitops.h

Some of those can be written in such a way that the same
inline assembly can be used to generate both 32 bit and
64 bit code.

For ffs and fls, x86_64 unconditionally used the cmov
instruction and i386 unconditionally used a conditional
branch over a mov instruction. In the current patch I
chose to select the version based on the availability
of the cmov instruction instead. A small detail here is
that x86_64 did not previously set CONFIG_X86_CMOV=y.

Improved comments for ffs, ffz, fls and variations.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.cpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 4da3cdb9c1b..cf3ff2c5cef 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -398,7 +398,7 @@ config X86_TSC
 # generates cmov.
 config X86_CMOV
 	def_bool y
-	depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
+	depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64)
 
 config X86_MINIMUM_CPU_FAMILY
 	int
-- 
cgit v1.2.3


From 77b9bd9c49442407804c37bcc82021a35277f83c Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Tue, 1 Apr 2008 11:46:19 +0200
Subject: x86: generic versions of find_first_(zero_)bit, convert i386

Generic versions of __find_first_bit and __find_first_zero_bit
are introduced as simplified versions of __find_next_bit and
__find_next_zero_bit. Their compilation and use are guarded by
a new config variable GENERIC_FIND_FIRST_BIT.

The generic versions of find_first_bit and find_first_zero_bit
are implemented in terms of the newly introduced __find_first_bit
and __find_first_zero_bit.

This patch does not remove the i386-specific implementation,
but it does switch i386 to use the generic functions by setting
GENERIC_FIND_FIRST_BIT=y for X86_32.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5639de47ed4..1a69b68ff6c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -77,6 +77,9 @@ config GENERIC_BUG
 	def_bool y
 	depends on BUG
 
+config GENERIC_FIND_FIRST_BIT
+	def_bool X86_32
+
 config GENERIC_FIND_NEXT_BIT
 	def_bool y
 
-- 
cgit v1.2.3


From 2aba6925fdb96428d1129a61b1233597a03a387b Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Tue, 1 Apr 2008 17:41:26 +0200
Subject: x86: switch 64-bit to generic find_first_bit

Switch x86_64 to generic find_first_bit. The x86_64-specific
implementation is not removed.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig         | 2 +-
 arch/x86/lib/bitops_64.c | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1a69b68ff6c..700447738e7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -78,7 +78,7 @@ config GENERIC_BUG
 	depends on BUG
 
 config GENERIC_FIND_FIRST_BIT
-	def_bool X86_32
+	def_bool y
 
 config GENERIC_FIND_NEXT_BIT
 	def_bool y
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
index 0eeb704d251..568467d390c 100644
--- a/arch/x86/lib/bitops_64.c
+++ b/arch/x86/lib/bitops_64.c
@@ -1,3 +1,4 @@
+#ifndef CONFIG_GENERIC_FIND_FIRST_BIT
 #include <linux/bitops.h>
 
 #undef find_first_zero_bit
@@ -105,3 +106,4 @@ long find_first_bit(const unsigned long * addr, unsigned long size)
 
 EXPORT_SYMBOL(find_first_bit);
 EXPORT_SYMBOL(find_first_zero_bit);
+#endif
-- 
cgit v1.2.3


From 5245698f665c4b7a533dcc47a5afdf33095d436a Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Tue, 1 Apr 2008 17:47:57 +0200
Subject: x86, UML: remove x86-specific implementations of find_first_bit

x86 has been switched to the generic versions of find_first_bit
and find_first_zero_bit, but the original versions were retained.
This patch just removes the now unused x86-specific versions.

also update UML.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/lib/Makefile    |   1 -
 arch/x86/lib/bitops_64.c | 109 -----------------------------------------------
 2 files changed, 110 deletions(-)
 delete mode 100644 arch/x86/lib/bitops_64.c

(limited to 'arch/x86')

diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 436093299bd..76f60f52a88 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -21,7 +21,6 @@ else
 
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
         lib-y += thunk_64.o clear_page_64.o copy_page_64.o
-        lib-y += bitops_64.o
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
 endif
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
deleted file mode 100644
index 568467d390c..00000000000
--- a/arch/x86/lib/bitops_64.c
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef CONFIG_GENERIC_FIND_FIRST_BIT
-#include <linux/bitops.h>
-
-#undef find_first_zero_bit
-#undef find_first_bit
-
-static inline long
-__find_first_zero_bit(const unsigned long * addr, unsigned long size)
-{
-	long d0, d1, d2;
-	long res;
-
-	/*
-	 * We must test the size in words, not in bits, because
-	 * otherwise incoming sizes in the range -63..-1 will not run
-	 * any scasq instructions, and then the flags used by the je
-	 * instruction will have whatever random value was in place
-	 * before.  Nobody should call us like that, but
-	 * find_next_zero_bit() does when offset and size are at the
-	 * same word and it fails to find a zero itself.
-	 */
-	size += 63;
-	size >>= 6;
-	if (!size)
-		return 0;
-	asm volatile(
-		"  repe; scasq\n"
-		"  je 1f\n"
-		"  xorq -8(%%rdi),%%rax\n"
-		"  subq $8,%%rdi\n"
-		"  bsfq %%rax,%%rdx\n"
-		"1:  subq %[addr],%%rdi\n"
-		"  shlq $3,%%rdi\n"
-		"  addq %%rdi,%%rdx"
-		:"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
-		:"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
-		 [addr] "S" (addr) : "memory");
-	/*
-	 * Any register would do for [addr] above, but GCC tends to
-	 * prefer rbx over rsi, even though rsi is readily available
-	 * and doesn't have to be saved.
-	 */
-	return res;
-}
-
-/**
- * find_first_zero_bit - find the first zero bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first zero bit, not the number of the byte
- * containing a bit.
- */
-long find_first_zero_bit(const unsigned long * addr, unsigned long size)
-{
-	return __find_first_zero_bit (addr, size);
-}
-
-static inline long
-__find_first_bit(const unsigned long * addr, unsigned long size)
-{
-	long d0, d1;
-	long res;
-
-	/*
-	 * We must test the size in words, not in bits, because
-	 * otherwise incoming sizes in the range -63..-1 will not run
-	 * any scasq instructions, and then the flags used by the jz
-	 * instruction will have whatever random value was in place
-	 * before.  Nobody should call us like that, but
-	 * find_next_bit() does when offset and size are at the same
-	 * word and it fails to find a one itself.
-	 */
-	size += 63;
-	size >>= 6;
-	if (!size)
-		return 0;
-	asm volatile(
-		"   repe; scasq\n"
-		"   jz 1f\n"
-		"   subq $8,%%rdi\n"
-		"   bsfq (%%rdi),%%rax\n"
-		"1: subq %[addr],%%rdi\n"
-		"   shlq $3,%%rdi\n"
-		"   addq %%rdi,%%rax"
-		:"=a" (res), "=&c" (d0), "=&D" (d1)
-		:"0" (0ULL), "1" (size), "2" (addr),
-		 [addr] "r" (addr) : "memory");
-	return res;
-}
-
-/**
- * find_first_bit - find the first set bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first set bit, not the number of the byte
- * containing a bit.
- */
-long find_first_bit(const unsigned long * addr, unsigned long size)
-{
-	return __find_first_bit(addr,size);
-}
-
-#include <linux/module.h>
-
-EXPORT_SYMBOL(find_first_bit);
-EXPORT_SYMBOL(find_first_zero_bit);
-#endif
-- 
cgit v1.2.3


From 19870def587554c4055df3e74a21508e3647fb7e Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Fri, 25 Apr 2008 13:12:53 +0200
Subject: x86, bitops: select the generic bitmap search functions

Introduce GENERIC_FIND_FIRST_BIT and GENERIC_FIND_NEXT_BIT in
lib/Kconfig, defaulting to off. An arch that wants to use the
generic implementation now only has to use a select statement
to include them.

I added an always-y option (X86_CPU) to arch/x86/Kconfig.cpu
and used that to select the generic search functions. This
way ARCH=um SUBARCH=i386 automatically picks up the change
too, and arch/um/Kconfig.i386 can therefore be simplified a
bit. ARCH=um SUBARCH=x86_64 does things differently, but
still compiles fine. It seems that a "def_bool y" always
wins over a "def_bool n"?

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig     | 6 ------
 arch/x86/Kconfig.cpu | 5 +++++
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 700447738e7..2fadf794483 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -77,12 +77,6 @@ config GENERIC_BUG
 	def_bool y
 	depends on BUG
 
-config GENERIC_FIND_FIRST_BIT
-	def_bool y
-
-config GENERIC_FIND_NEXT_BIT
-	def_bool y
-
 config GENERIC_HWEIGHT
 	def_bool y
 
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index cf3ff2c5cef..7ef18b01f0b 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -278,6 +278,11 @@ config GENERIC_CPU
 
 endchoice
 
+config X86_CPU
+	def_bool y
+	select GENERIC_FIND_FIRST_BIT
+	select GENERIC_FIND_NEXT_BIT
+
 config X86_GENERIC
 	bool "Generic x86 support"
 	depends on X86_32
-- 
cgit v1.2.3