From 67182ae1c42206e516f7efb292b745e826497b24 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Sun, 10 Aug 2008 18:35:38 -0700
Subject: rcu, debug: detect stalled grace periods

This is a diagnostic patch for Classic RCU.

The approach is to record a timestamp at the beginning of the grace
period (in rcu_start_batch()), then have rcu_check_callbacks() complain
if:

1.	it is running on a CPU that has been holding up grace periods for
	a long time (say one second).  This will identify the culprit,
	assuming that the culprit has not disabled hardware irqs,
	instruction execution, or some such.

2.	it is running on a CPU that is not holding up grace periods, but
	grace periods have been held up for an even longer time (say two
	seconds).

It is enabled via the default-off CONFIG_RCU_CPU_STALL config option.

Rather than exponential backoff, it backs off to once per 30 seconds.
My feeling upon thinking it over was that if you have stalled RCU grace
periods for that long, a few extra printk() messages are probably the
least of your worries...

Signed-off-by: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Yinghai Lu
Cc: David Witbrodt
Signed-off-by: Ingo Molnar
---
 lib/Kconfig.debug | 13 +++++++++++++
 1 file changed, 13 insertions(+)
(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e1d4764435e..2fb6d90bf1e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE
 	  Say N here if you want the RCU torture tests to start only
 	  after being manually enabled via /proc.
 
+config RCU_CPU_STALL
+	bool "Check for stalled CPUs delaying RCU grace periods"
+	depends on CLASSIC_RCU
+	default n
+	help
+	  This option causes RCU to printk information on which
+	  CPUs are delaying the current grace period, but only when
+	  the grace period extends for excessive time periods.
+
+	  Say Y if you want RCU to perform such checks.
+
+	  Say N if you are unsure.
+
 config KPROBES_SANITY_TEST
 	bool "Kprobes sanity tests"
 	depends on DEBUG_KERNEL
-- cgit v1.2.3

From 1f7c14c62ce63805f9574664a6c6de3633d4a354 Mon Sep 17 00:00:00 2001
From: Mingming Cao
Date: Thu, 9 Oct 2008 12:50:59 -0400
Subject: percpu counter: clean up percpu_counter_sum_and_set()

percpu_counter_sum_and_set() and percpu_counter_sum() are the same,
except that the former updates the global counter after accounting.
Since we are taking fbc->lock to calculate the precise value of the
counter in percpu_counter_sum() anyway, it should simply set fbc->count
too, as percpu_counter_sum_and_set() does.  This patch merges the two
interfaces into one.

Signed-off-by: Mingming Cao
Acked-by: Peter Zijlstra
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: "Theodore Ts'o"
---
 lib/percpu_counter.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)
(limited to 'lib')

diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 4a8ba4bf5f6..a8663890a88 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
 
 /*
  * Add up all the per-cpu counts, return the result.
  * This is a more accurate but much slower version of
  * percpu_counter_read_positive()
  */
-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
+s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
 	s64 ret;
 	int cpu;
@@ -62,11 +62,9 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
-		if (set)
-			*pcount = 0;
+		*pcount = 0;
 	}
-	if (set)
-		fbc->count = ret;
+	fbc->count = ret;
 	spin_unlock(&fbc->lock);
 
 	return ret;
-- cgit v1.2.3

From 68e91d61346db4359464d06617500141cfd1442a Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Mon, 8 Sep 2008 18:10:14 +0900
Subject: swiotlb: remove GFP_DMA hack in swiotlb_alloc_coherent

The callers are supposed to set up the gfp flags appropriately.

Signed-off-by: FUJITA Tomonori
Acked-by: Joerg Roedel
Signed-off-by: Ingo Molnar
---
 lib/swiotlb.c | 7 -------
 1 file changed, 7 deletions(-)
(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 977edbdbc1d..3066ffe1f9e 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -467,13 +467,6 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	void *ret;
 	int order = get_order(size);
 
-	/*
-	 * XXX fix me: the DMA API should pass us an explicit DMA mask
-	 * instead, or use ZONE_DMA32 (ia64 overloads ZONE_DMA to be a ~32
-	 * bit range instead of a 16MB one).
-	 */
-	flags |= GFP_DMA;
-
 	ret = (void *)__get_free_pages(flags, order);
 	if (ret && address_needs_mapping(hwdev, virt_to_bus(ret))) {
 		/*
-- cgit v1.2.3

From 9dfda12b8b769e224ca4efbc35ace4524b9c017b Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Mon, 8 Sep 2008 18:53:48 +0900
Subject: swiotlb: use map_single instead of swiotlb_map_single in
 swiotlb_alloc_coherent

We always need swiotlb memory here, so the address_needs_mapping() and
swiotlb_force tests are irrelevant.  map_single() should be used here
instead of swiotlb_map_single().

Signed-off-by: FUJITA Tomonori
Signed-off-by: Ingo Molnar
---
 lib/swiotlb.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)
(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 3066ffe1f9e..2fb485d0e7e 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -483,12 +483,9 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		 * swiotlb_map_single(), which will grab memory from
 		 * the lowest available address range.
 		 */
-		dma_addr_t handle;
-		handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE);
-		if (swiotlb_dma_mapping_error(hwdev, handle))
+		ret = map_single(hwdev, NULL, size, DMA_FROM_DEVICE);
+		if (!ret)
 			return NULL;
-
-		ret = bus_to_virt(handle);
 	}
 
 	memset(ret, 0, size);
-- cgit v1.2.3

From 21f6c4de4c25c4bdd88c75bc97a78e7fbeebac4d Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Mon, 8 Sep 2008 18:53:49 +0900
Subject: swiotlb: use unmap_single instead of swiotlb_unmap_single in
 swiotlb_free_coherent

We don't need any of the checks in swiotlb_unmap_single() here;
unmap_single() is appropriate.
Signed-off-by: FUJITA Tomonori
Signed-off-by: Ingo Molnar
---
 lib/swiotlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 2fb485d0e7e..bf61c73a341 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -513,7 +513,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
 		free_pages((unsigned long) vaddr, get_order(size));
 	else
 		/* DMA_TO_DEVICE to avoid memcpy in unmap_single */
-		swiotlb_unmap_single (hwdev, dma_handle, size, DMA_TO_DEVICE);
+		unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
 }
 
 static void
-- cgit v1.2.3

From 640aebfe014554ced9c38d2564e38862e488d0eb Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Mon, 8 Sep 2008 18:53:50 +0900
Subject: swiotlb: add is_swiotlb_buffer helper function

This adds an is_swiotlb_buffer() helper function to check whether an
address belongs to the swiotlb buffer.

Signed-off-by: FUJITA Tomonori
Signed-off-by: Ingo Molnar
---
 lib/swiotlb.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)
(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index bf61c73a341..b5f5d113304 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -283,6 +283,11 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr)
 	return (addr & ~mask) != 0;
 }
 
+static int is_swiotlb_buffer(char *addr)
+{
+	return addr >= io_tlb_start && addr < io_tlb_end;
+}
+
 /*
  * Allocates bounce buffer and returns its kernel virtual address.
  */
@@ -508,8 +513,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
 	       dma_addr_t dma_handle)
 {
 	WARN_ON(irqs_disabled());
-	if (!(vaddr >= (void *)io_tlb_start
-	    && vaddr < (void *)io_tlb_end))
+	if (!is_swiotlb_buffer(vaddr))
 		free_pages((unsigned long) vaddr, get_order(size));
 	else
 		/* DMA_TO_DEVICE to avoid memcpy in unmap_single */
@@ -602,7 +606,7 @@ swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
 	char *dma_addr = bus_to_virt(dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
-	if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
+	if (is_swiotlb_buffer(dma_addr))
 		unmap_single(hwdev, dma_addr, size, dir);
 	else if (dir == DMA_FROM_DEVICE)
 		dma_mark_clean(dma_addr, size);
@@ -632,7 +636,7 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
 	char *dma_addr = bus_to_virt(dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
-	if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
+	if (is_swiotlb_buffer(dma_addr))
 		sync_single(hwdev, dma_addr, size, dir, target);
 	else if (dir == DMA_FROM_DEVICE)
 		dma_mark_clean(dma_addr, size);
@@ -663,7 +667,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
 	char *dma_addr = bus_to_virt(dev_addr) + offset;
 
 	BUG_ON(dir == DMA_NONE);
-	if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
+	if (is_swiotlb_buffer(dma_addr))
 		sync_single(hwdev, dma_addr, size, dir, target);
 	else if (dir == DMA_FROM_DEVICE)
 		dma_mark_clean(dma_addr, size);
-- cgit v1.2.3

From 2797982ed93c10d5585ee1842ab298cb11326ff5 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Wed, 10 Sep 2008 01:06:49 +0900
Subject: swiotlb: convert swiotlb to use is_buffer_dma_capable helper
 function

Signed-off-by: FUJITA Tomonori
Acked-by: Joerg Roedel
Signed-off-by: Ingo Molnar
---
 lib/swiotlb.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)
(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index b5f5d113304..240a67c2c97 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -274,13 +274,13 @@ cleanup1:
 }
 
 static int
-address_needs_mapping(struct device *hwdev, dma_addr_t addr)
+address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size)
 {
 	dma_addr_t mask = 0xffffffff;
 	/* If the device has a mask, use it, otherwise default to 32 bits */
 	if (hwdev && hwdev->dma_mask)
 		mask = *hwdev->dma_mask;
-	return (addr & ~mask) != 0;
+	return !is_buffer_dma_capable(mask, addr, size);
 }
 
 static int is_swiotlb_buffer(char *addr)
@@ -473,7 +473,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	int order = get_order(size);
 
 	ret = (void *)__get_free_pages(flags, order);
-	if (ret && address_needs_mapping(hwdev, virt_to_bus(ret))) {
+	if (ret && address_needs_mapping(hwdev, virt_to_bus(ret), size)) {
 		/*
 		 * The allocated memory isn't reachable by the device.
 		 * Fall back on swiotlb_map_single().
@@ -497,7 +497,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	dev_addr = virt_to_bus(ret);
 
 	/* Confirm address can be DMA'd by device */
-	if (address_needs_mapping(hwdev, dev_addr)) {
+	if (address_needs_mapping(hwdev, dev_addr, size)) {
 		printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
 		       (unsigned long long)*hwdev->dma_mask,
 		       (unsigned long long)dev_addr);
@@ -561,7 +561,7 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 	 * we can safely return the device addr and not worry about bounce
 	 * buffering it.
 	 */
-	if (!address_needs_mapping(hwdev, dev_addr) && !swiotlb_force)
+	if (!address_needs_mapping(hwdev, dev_addr, size) && !swiotlb_force)
 		return dev_addr;
 
 	/*
@@ -578,7 +578,7 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 	/*
 	 * Ensure that the address returned is DMA'ble
 	 */
-	if (address_needs_mapping(hwdev, dev_addr))
+	if (address_needs_mapping(hwdev, dev_addr, size))
 		panic("map_single: bounce buffer is not DMA'ble");
 
 	return dev_addr;
@@ -721,7 +721,8 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 	for_each_sg(sgl, sg, nelems, i) {
 		addr = SG_ENT_VIRT_ADDRESS(sg);
 		dev_addr = virt_to_bus(addr);
-		if (swiotlb_force || address_needs_mapping(hwdev, dev_addr)) {
+		if (swiotlb_force ||
+		    address_needs_mapping(hwdev, dev_addr, sg->length)) {
 			void *map = map_single(hwdev, addr, sg->length, dir);
 			if (!map) {
 				/* Don't panic here, we expect map_sg users
-- cgit v1.2.3

From 07a2c01a0c2a0cb4581a67d50d4f17cb4d2457c4 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Fri, 19 Sep 2008 02:02:05 +0900
Subject: convert swiotlb to use dma_get_mask

swiotlb can use dma_get_mask() instead of the homegrown function.

Signed-off-by: FUJITA Tomonori
Cc: tony.luck@intel.com
Signed-off-by: Ingo Molnar
---
 lib/swiotlb.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)
(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 240a67c2c97..f8eebd48914 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -276,11 +276,7 @@ cleanup1:
 static int
 address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size)
 {
-	dma_addr_t mask = 0xffffffff;
-	/* If the device has a mask, use it, otherwise default to 32 bits */
-	if (hwdev && hwdev->dma_mask)
-		mask = *hwdev->dma_mask;
-	return !is_buffer_dma_capable(mask, addr, size);
+	return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
 }
 
 static int is_swiotlb_buffer(char *addr)
-- cgit v1.2.3

From d26dbc5cf94b0a28acc947285c3b54814a73cb2e Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Mon, 22 Sep 2008 22:35:07 +0900
Subject: iommu: export iommu_area_reserve helper function

x86 has set_bit_string(), which does exactly what set_bit_area() in
lib/iommu-helper.c does.
This patch exports set_bit_area() in lib/iommu-helper.c as
iommu_area_reserve(), and converts GART, Calgary, and AMD IOMMU to
use it.

Signed-off-by: FUJITA Tomonori
Acked-by: Joerg Roedel
Signed-off-by: Ingo Molnar
---
 lib/iommu-helper.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
(limited to 'lib')

diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c
index a3b8d4c3f77..5d90074dca7 100644
--- a/lib/iommu-helper.c
+++ b/lib/iommu-helper.c
@@ -30,8 +30,7 @@ again:
 	return index;
 }
 
-static inline void set_bit_area(unsigned long *map, unsigned long i,
-				int len)
+void iommu_area_reserve(unsigned long *map, unsigned long i, int len)
 {
 	unsigned long end = i + len;
 	while (i < end) {
@@ -64,7 +63,7 @@ again:
 			start = index + 1;
 			goto again;
 		}
-		set_bit_area(map, index, nr);
+		iommu_area_reserve(map, index, nr);
 	}
 	return index;
 }
-- cgit v1.2.3

From 2133b5d7ff531bc15a923db4a6a50bf96c561be9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 2 Oct 2008 16:06:39 -0700
Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU

This patch adds stalled-CPU detection to Classic RCU.  This capability
is enabled by a new config variable, CONFIG_RCU_CPU_STALL_DETECTOR,
which is disabled by default.  This is a debugging feature to detect
infinite loops in kernel code, not something that non-kernel-hackers
would be expected to care about.

This feature can detect looping CPUs in !PREEMPT builds and looping
CPUs with preemption disabled in PREEMPT builds.  This is essentially
a port of this functionality from the treercu patch, replacing the
stall-debug patch that is already in tip/core/rcu (commit 67182ae1c4).

The changes from the patch in tip/core/rcu include making the config
variable name match that in treercu, changing from seconds to jiffies
to avoid spurious warnings, and printing a boot message when this
feature is enabled.

Signed-off-by: Paul E. McKenney
Signed-off-by: Ingo Molnar
---
 lib/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ccede1aeab3..9fee969dd60 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,7 +597,7 @@ config RCU_TORTURE_TEST_RUNNABLE
 	  Say N here if you want the RCU torture tests to start only
 	  after being manually enabled via /proc.
 
-config RCU_CPU_STALL
+config RCU_CPU_STALL_DETECTOR
 	bool "Check for stalled CPUs delaying RCU grace periods"
 	depends on CLASSIC_RCU
 	default n
-- cgit v1.2.3

From 3c9f3681d0b4af09c1cbf04f92fdfb72bd81ad7b Mon Sep 17 00:00:00 2001
From: James Bottomley
Date: Sun, 31 Aug 2008 10:13:54 -0500
Subject: [SCSI] lib: add generic helper to print sizes rounded to the
 correct SI range

This patch adds the ability to print sizes in either SI (powers of
10^3) or binary (powers of 2^10) units.  It rounds up to three
significant figures and can be used for either memory or storage
capacities.

Oh, and I'm fully aware that 64 bits is only 16EiB ... the Zetta and
Yotta units are added for future-proofing against the day we have
128-bit computers ...
[fujita.tomonori@lab.ntt.co.jp: fix missed unsigned long long cast]
Signed-off-by: James Bottomley
---
 lib/Makefile         |  3 ++-
 lib/string_helpers.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 lib/string_helpers.c
(limited to 'lib')

diff --git a/lib/Makefile b/lib/Makefile
index 3b1f94bbe9d..44001af76a7 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -19,7 +19,8 @@ lib-$(CONFIG_SMP) += cpumask.o
 lib-y	+= kobject.o kref.o klist.o
 
 obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
-	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o
+	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
+	 string_helpers.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
new file mode 100644
index 00000000000..8347925030f
--- /dev/null
+++ b/lib/string_helpers.c
@@ -0,0 +1,64 @@
+/*
+ * Helpers for formatting and printing strings
+ *
+ * Copyright 31 August 2008 James Bottomley
+ */
+#include <linux/kernel.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+#include <linux/string_helpers.h>
+
+/**
+ * string_get_size - get the size in the specified units
+ * @size:	The size to be converted
+ * @units:	units to use (powers of 1000 or 1024)
+ * @buf:	buffer to format to
+ * @len:	length of buffer
+ *
+ * This function returns a string formatted to 3 significant figures
+ * giving the size in the required units.  Returns 0 on success or
+ * error on failure.  @buf is always zero terminated.
+ *
+ */
+int string_get_size(u64 size, const enum string_size_units units,
+		    char *buf, int len)
+{
+	const char *units_10[] = { "B", "KB", "MB", "GB", "TB", "PB",
+				   "EB", "ZB", "YB", NULL};
+	const char *units_2[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB",
+				 "EiB", "ZiB", "YiB", NULL };
+	const char **units_str[] = {
+		[STRING_UNITS_10] = units_10,
+		[STRING_UNITS_2] = units_2,
+	};
+	const int divisor[] = {
+		[STRING_UNITS_10] = 1000,
+		[STRING_UNITS_2] = 1024,
+	};
+	int i, j;
+	u64 remainder = 0, sf_cap;
+	char tmp[8];
+
+	tmp[0] = '\0';
+
+	for (i = 0; size > divisor[units] && units_str[units][i]; i++)
+		remainder = do_div(size, divisor[units]);
+
+	sf_cap = size;
+	for (j = 0; sf_cap*10 < 1000; j++)
+		sf_cap *= 10;
+
+	if (j) {
+		remainder *= 1000;
+		do_div(remainder, divisor[units]);
+		snprintf(tmp, sizeof(tmp), ".%03lld",
+			 (unsigned long long)remainder);
+		tmp[j+1] = '\0';
+	}
+
+	snprintf(buf, len, "%lld%s%s", (unsigned long long)size,
+		 tmp, units_str[units][i]);
+
+	return 0;
+}
+EXPORT_SYMBOL(string_get_size);
-- cgit v1.2.3

From a1ed5b0cffe4b16a93a6a3390e8cee0fbef94f86 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 25 Aug 2008 19:50:16 +0200
Subject: klist: don't iterate over deleted entries

A klist entry is kept on the list till all its current iterations are
finished; however, a new iteration after deletion also iterates over
deleted entries as long as their reference count stays above zero.
This causes problems for cases where there are users which iterate
over the list while synchronized against list manipulations and
naturally expect already deleted entries to not show up during
iteration.

This patch implements a dead flag which gets set on deletion, so that
iteration can skip already deleted entries.  The dead flag piggybacks
on the lowest bit of knode->n_klist and is visible only to the klist
implementation proper.
While at it, drop klist_iter->i_head, as it's redundant and doesn't
offer anything semantically or performance-wise, since
klist_iter->i_klist is dereferenced on every iteration anyway.

Signed-off-by: Tejun Heo
Cc: Greg Kroah-Hartman
Cc: Alan Stern
Cc: Jens Axboe
Signed-off-by: Jens Axboe
---
 lib/klist.c | 96 ++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 70 insertions(+), 26 deletions(-)
(limited to 'lib')

diff --git a/lib/klist.c b/lib/klist.c
index cca37f96faa..bbdd3015c2c 100644
--- a/lib/klist.c
+++ b/lib/klist.c
@@ -37,6 +37,37 @@
 #include <linux/klist.h>
 #include <linux/module.h>
 
+/*
+ * Use the lowest bit of n_klist to mark deleted nodes and exclude
+ * dead ones from iteration.
+ */
+#define KNODE_DEAD		1LU
+#define KNODE_KLIST_MASK	~KNODE_DEAD
+
+static struct klist *knode_klist(struct klist_node *knode)
+{
+	return (struct klist *)
+		((unsigned long)knode->n_klist & KNODE_KLIST_MASK);
+}
+
+static bool knode_dead(struct klist_node *knode)
+{
+	return (unsigned long)knode->n_klist & KNODE_DEAD;
+}
+
+static void knode_set_klist(struct klist_node *knode, struct klist *klist)
+{
+	knode->n_klist = klist;
+	/* no knode deserves to start its life dead */
+	WARN_ON(knode_dead(knode));
+}
+
+static void knode_kill(struct klist_node *knode)
+{
+	/* and no knode should die twice ever either, see we're very humane */
+	WARN_ON(knode_dead(knode));
+	*(unsigned long *)&knode->n_klist |= KNODE_DEAD;
+}
 
 /**
  * klist_init - Initialize a klist structure.
@@ -79,7 +110,7 @@ static void klist_node_init(struct klist *k, struct klist_node *n)
 	INIT_LIST_HEAD(&n->n_node);
 	init_completion(&n->n_removed);
 	kref_init(&n->n_ref);
-	n->n_klist = k;
+	knode_set_klist(n, k);
 	if (k->get)
 		k->get(n);
 }
@@ -115,7 +146,7 @@ EXPORT_SYMBOL_GPL(klist_add_tail);
  */
 void klist_add_after(struct klist_node *n, struct klist_node *pos)
 {
-	struct klist *k = pos->n_klist;
+	struct klist *k = knode_klist(pos);
 
 	klist_node_init(k, n);
 	spin_lock(&k->k_lock);
@@ -131,7 +162,7 @@ EXPORT_SYMBOL_GPL(klist_add_after);
  */
 void klist_add_before(struct klist_node *n, struct klist_node *pos)
 {
-	struct klist *k = pos->n_klist;
+	struct klist *k = knode_klist(pos);
 
 	klist_node_init(k, n);
 	spin_lock(&k->k_lock);
@@ -144,9 +175,10 @@ static void klist_release(struct kref *kref)
 {
 	struct klist_node *n = container_of(kref, struct klist_node, n_ref);
 
+	WARN_ON(!knode_dead(n));
 	list_del(&n->n_node);
 	complete(&n->n_removed);
-	n->n_klist = NULL;
+	knode_set_klist(n, NULL);
 }
 
 static int klist_dec_and_del(struct klist_node *n)
 {
 	return kref_put(&n->n_ref, klist_release);
 }
 
-/**
- * klist_del - Decrement the reference count of node and try to remove.
- * @n: node we're deleting.
- */
-void klist_del(struct klist_node *n)
+static void klist_put(struct klist_node *n, bool kill)
 {
-	struct klist *k = n->n_klist;
+	struct klist *k = knode_klist(n);
 	void (*put)(struct klist_node *) = k->put;
 
 	spin_lock(&k->k_lock);
+	if (kill)
+		knode_kill(n);
 	if (!klist_dec_and_del(n))
 		put = NULL;
 	spin_unlock(&k->k_lock);
 	if (put)
 		put(n);
 }
+
+/**
+ * klist_del - Decrement the reference count of node and try to remove.
+ * @n: node we're deleting.
+ */
+void klist_del(struct klist_node *n)
+{
+	klist_put(n, true);
+}
 EXPORT_SYMBOL_GPL(klist_del);
 
 /**
@@ -206,7 +245,6 @@ void klist_iter_init_node(struct klist *k, struct klist_iter *i,
 			  struct klist_node *n)
 {
 	i->i_klist = k;
-	i->i_head = &k->k_list;
 	i->i_cur = n;
 	if (n)
 		kref_get(&n->n_ref);
@@ -237,7 +275,7 @@ EXPORT_SYMBOL_GPL(klist_iter_init);
 void klist_iter_exit(struct klist_iter *i)
 {
 	if (i->i_cur) {
-		klist_del(i->i_cur);
+		klist_put(i->i_cur, false);
 		i->i_cur = NULL;
 	}
 }
@@ -258,27 +296,33 @@ static struct klist_node *to_klist_node(struct list_head *n)
  */
 struct klist_node *klist_next(struct klist_iter *i)
 {
-	struct list_head *next;
-	struct klist_node *lnode = i->i_cur;
-	struct klist_node *knode = NULL;
 	void (*put)(struct klist_node *) = i->i_klist->put;
+	struct klist_node *last = i->i_cur;
+	struct klist_node *next;
 
 	spin_lock(&i->i_klist->k_lock);
-	if (lnode) {
-		next = lnode->n_node.next;
-		if (!klist_dec_and_del(lnode))
+
+	if (last) {
+		next = to_klist_node(last->n_node.next);
+		if (!klist_dec_and_del(last))
 			put = NULL;
 	} else
-		next = i->i_head->next;
+		next = to_klist_node(i->i_klist->k_list.next);
 
-	if (next != i->i_head) {
-		knode = to_klist_node(next);
-		kref_get(&knode->n_ref);
+	i->i_cur = NULL;
+	while (next != to_klist_node(&i->i_klist->k_list)) {
+		if (likely(!knode_dead(next))) {
+			kref_get(&next->n_ref);
+			i->i_cur = next;
+			break;
+		}
+		next = to_klist_node(next->n_node.next);
 	}
-	i->i_cur = knode;
+
 	spin_unlock(&i->i_klist->k_lock);
-	if (put && lnode)
-		put(lnode);
-	return knode;
+
+	if (put && last)
+		put(last);
+	return i->i_cur;
 }
 EXPORT_SYMBOL_GPL(klist_next);
-- cgit v1.2.3

From 870d6656126add8e383645732b03df2b7ccd4f94 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 25 Aug 2008 19:47:25 +0900
Subject: block: implement CONFIG_DEBUG_BLOCK_EXT_DEVT

Extended devt introduces non-contiguous device numbers.  This patch
implements a debug option which forces most devt allocations to be
from the extended area and spreads them out.  This is enabled by
default if DEBUG_KERNEL is set, and achieves the following:

1. It detects code paths in the kernel or userland which expect
   predetermined consecutive device numbers.

2. When something goes wrong, it avoids corruption, as adding to the
   minor number of an earlier partition won't lead to a wrong but
   valid device.

Signed-off-by: Tejun Heo
Signed-off-by: Jens Axboe
---
 lib/Kconfig.debug | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0b504814e37..5a536f703a8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -624,6 +624,22 @@ config BACKTRACE_SELF_TEST
 
 	  Say N if you are unsure.
 
+config DEBUG_BLOCK_EXT_DEVT
+	bool "Force extended block device numbers and spread them"
+	depends on DEBUG_KERNEL
+	depends on BLOCK
+	default y
+	help
+	  Conventionally, block device numbers are allocated from a
+	  predetermined contiguous area.  However, the extended block
+	  area may introduce non-contiguous block device numbers.  This
+	  option forces most block device numbers to be allocated from
+	  the extended space and spreads them to discover kernel or
+	  userland code paths which assume predetermined contiguous
+	  device number allocation.
+
+	  Say N if you are unsure.
+
 config LKDTM
 	tristate "Linux Kernel Dump Test Tool Module"
 	depends on DEBUG_KERNEL
-- cgit v1.2.3

From 759f8ca3048f7438aa3129268d7252552505d662 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 29 Aug 2008 09:06:29 +0200
Subject: Change default value of CONFIG_DEBUG_BLOCK_EXT_DEVT to 'n'

It's a debug option that you would explicitly enable to test this
feature; we should default it to 'n' to prevent accidental surprises
for now.

Signed-off-by: Jens Axboe
---
 lib/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5a536f703a8..4378d5e923c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -628,7 +628,7 @@ config DEBUG_BLOCK_EXT_DEVT
 	bool "Force extended block device numbers and spread them"
 	depends on DEBUG_KERNEL
 	depends on BLOCK
-	default y
+	default n
 	help
 	  Conventionally, block device numbers are allocated from a
 	  predetermined contiguous area.  However, the extended block area
-- cgit v1.2.3

From 55dc7db70a73a3809a2334063c9b5b0d8ccebdaa Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 1 Sep 2008 13:44:35 +0200
Subject: init: DEBUG_BLOCK_EXT_DEVT requires explicit root= param

DEBUG_BLOCK_EXT_DEVT shuffles SCSI and IDE device numbers, so a root
device number set using rdev becomes meaningless.  Root devices should
be explicitly specified using textual names.  Warn about it if root
can't be found and DEBUG_BLOCK_EXT_DEVT is enabled.  Also, add a
warning to the help text.

Signed-off-by: Tejun Heo
Cc: Bartlomiej Zolnierkiewicz
Signed-off-by: Jens Axboe
---
 lib/Kconfig.debug | 6 ++++++
 1 file changed, 6 insertions(+)
(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4378d5e923c..c556896abe5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -638,6 +638,12 @@ config DEBUG_BLOCK_EXT_DEVT
 	  userland code paths which assume predetermined contiguous
 	  device number allocation.
 
+	  Note that turning on this debug option shuffles all the
+	  device numbers for all IDE and SCSI devices including libata
+	  ones, so a root partition specified using a device number
+	  directly (via rdev or root=MAJ:MIN) won't work anymore.
+	  Textual device names (root=/dev/sdXn) will continue to work.
+
 	  Say N if you are unsure.
 
 config LKDTM
-- cgit v1.2.3

From 581d4e28d9195aa8b2231383dbabc288988d615e Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 14 Sep 2008 05:56:33 -0700
Subject: block: add fault injection mechanism for faking request timeouts

Only works for the generic request timer handling.  Allows one to
sporadically ignore request completions, thus exercising the timeout
handling.

Signed-off-by: Jens Axboe
---
 lib/Kconfig.debug | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)
(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c556896abe5..7d7a31d0dde 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -683,10 +683,21 @@ config FAIL_PAGE_ALLOC
 
 config FAIL_MAKE_REQUEST
 	bool "Fault-injection capability for disk IO"
-	depends on FAULT_INJECTION
+	depends on FAULT_INJECTION && BLOCK
 	help
 	  Provide fault-injection capability for disk IO.
 
+config FAIL_IO_TIMEOUT
+	bool "Fault-injection capability for faking disk interrupts"
+	depends on FAULT_INJECTION && BLOCK
+	help
+	  Provide fault-injection capability on end IO handling.  This
+	  will make the block layer "forget" an interrupt as configured,
+	  thus exercising the error handling.
+
+	  Only works with drivers that use the generic timeout handling;
+	  for others it won't do anything.
+
 config FAULT_INJECTION_DEBUG_FS
 	bool "Debugfs entries for fault-injection capabilities"
 	depends on FAULT_INJECTION && SYSFS && DEBUG_FS
-- cgit v1.2.3
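
Editor's note (not part of the patch series above): the three-significant-figure
rounding in the string_get_size() helper added by commit 3c9f3681 is easiest to
see by running the logic in isolation.  Below is a minimal standalone userspace
sketch that loosely mirrors the kernel helper; the enum name is kept, but the
plain C division (instead of the kernel's do_div()), the >= loop bound, and the
sample values in main() are illustrative assumptions, not the kernel code.

#include <stdio.h>
#include <stdint.h>

enum string_size_units { STRING_UNITS_10, STRING_UNITS_2 };

/* Loose userspace mirror of lib/string_helpers.c:string_get_size() */
static int string_get_size(uint64_t size, enum string_size_units units,
			   char *buf, int len)
{
	static const char *units_10[] = { "B", "KB", "MB", "GB", "TB",
					  "PB", "EB", "ZB", "YB", NULL };
	static const char *units_2[] = { "B", "KiB", "MiB", "GiB", "TiB",
					 "PiB", "EiB", "ZiB", "YiB", NULL };
	const char **units_str = (units == STRING_UNITS_10) ? units_10 : units_2;
	const unsigned int divisor = (units == STRING_UNITS_10) ? 1000 : 1024;
	uint64_t remainder = 0, sf_cap;
	char tmp[8] = "";
	int i, j;

	/* divide down until the value fits below one step of the unit */
	for (i = 0; size >= divisor && units_str[i + 1]; i++) {
		remainder = size % divisor;
		size /= divisor;
	}

	/* how many fractional digits keep three significant figures? */
	sf_cap = size;
	for (j = 0; sf_cap * 10 < 1000; j++)
		sf_cap *= 10;

	if (j) {
		/* scale the remainder to three decimal digits, keep j */
		remainder = remainder * 1000 / divisor;
		snprintf(tmp, sizeof(tmp), ".%03llu",
			 (unsigned long long)remainder);
		tmp[j + 1] = '\0';
	}

	snprintf(buf, len, "%llu%s%s", (unsigned long long)size,
		 tmp, units_str[i]);
	return 0;
}

int main(void)
{
	char buf[32];

	string_get_size(1536, STRING_UNITS_2, buf, sizeof(buf));
	printf("%s\n", buf);		/* prints "1.50KiB" */
	string_get_size(3000000000ULL, STRING_UNITS_10, buf, sizeof(buf));
	printf("%s\n", buf);		/* prints "3.00GB" */
	return 0;
}

The kernel version uses do_div() because direct 64-bit division isn't
available on all 32-bit architectures; plain C division is fine in userspace.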