Merge branch 'for-2.6.25' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc

* 'for-2.6.25' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc: (454 commits) [POWERPC] Cell IOMMU fixed mapping support [POWERPC] Split out the ioid fetching/checking logic [POWERPC] Add support to cell_iommu_setup_page_tables() for multiple windows [POWERPC] Split out the IOMMU logic from cell_dma_dev_setup() [POWERPC] Split cell_iommu_setup_hardware() into two parts [POWERPC] Split out the logic that allocates struct iommus [POWERPC] Allocate the hash table under 1G on cell [POWERPC] Add set_dma_ops() to match get_dma_ops() [POWERPC] 83xx: Clean up / convert mpc83xx board DTS files to v1 format. [POWERPC] 85xx: Only invalidate TLB0 and TLB1 [POWERPC] 83xx: Fix typo in mpc837x compatible entries [POWERPC] 85xx: convert sbc85* boards to use machine_device_initcall [POWERPC] 83xx: rework platform Kconfig [POWERPC] 85xx: rework platform Kconfig [POWERPC] 86xx: Remove unused IRQ defines [POWERPC] QE: Explicitly set address-cells and size cells for muram [POWERPC] Convert StorCenter DTS file to /dts-v1/ format. [POWERPC] 86xx: Convert all 86xx DTS files to /dts-v1/ format. [PPC] Remove 85xx from arch/ppc [PPC] Remove 83xx from arch/ppc ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2008-01-31 13:37:27 +1100
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-01-31 13:37:27 +1100
commit: 8af03e782cae1e0a0f530ddd22301cdd12cf9dc0 (patch)
tree: c4af13a38bd3cc1a811a37f2358491f171052070 /arch/powerpc/platforms/cell
parent: 6232665040f9a23fafd9d94d4ae8d5a2dc850f65 (diff)
parent: 99e139126ab2e84be67969650f92eb37c12ab5cd (diff)
24 files changed, 1530 insertions, 692 deletions
diff --git a/arch/powerpc/platforms/cell/Makefile b/arch/powerpc/platforms/cell/Makefile
index 39d695cb969..c89964c6fb1 100644
--- a/arch/powerpc/platforms/cell/Makefile
+++ b/arch/powerpc/platforms/cell/Makefile
@@ -20,7 +20,7 @@ spu-manage-$(CONFIG_PPC_CELL_NATIVE)	+= spu_manage.o
 
 obj-$(CONFIG_SPU_BASE)			+= spu_callbacks.o spu_base.o \
 					   spu_notify.o \
-					   spu_syscalls.o \
+					   spu_syscalls.o spu_fault.o \
 					   $(spu-priv1-y) \
 					   $(spu-manage-y) \
 					   spufs/
diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.c b/arch/powerpc/platforms/cell/cbe_cpufreq.c
index 13d5a87f13b..ec7c8f45a21 100644
--- a/arch/powerpc/platforms/cell/cbe_cpufreq.c
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq.c
@@ -21,8 +21,9 @@
  */
 
 #include <linux/cpufreq.h>
+#include <linux/of_platform.h>
+
 #include <asm/machdep.h>
-#include <asm/of_platform.h>
 #include <asm/prom.h>
 #include <asm/cell-regs.h>
 #include "cbe_cpufreq.h"
diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c b/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c
index 6a2c1b0a9a9..69288f65314 100644
--- a/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c
@@ -23,7 +23,8 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/timer.h>
-#include <asm/of_platform.h>
+#include <linux/of_platform.h>
+
 #include <asm/processor.h>
 #include <asm/prom.h>
 #include <asm/pmi.h>
diff --git a/arch/powerpc/platforms/cell/cbe_regs.c b/arch/powerpc/platforms/cell/cbe_regs.c
index 16a9b07e7b0..dbc338f187a 100644
--- a/arch/powerpc/platforms/cell/cbe_regs.c
+++ b/arch/powerpc/platforms/cell/cbe_regs.c
@@ -9,13 +9,13 @@
 #include <linux/percpu.h>
 #include <linux/types.h>
 #include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/of_platform.h>
 
 #include <asm/io.h>
 #include <asm/pgtable.h>
 #include <asm/prom.h>
 #include <asm/ptrace.h>
-#include <asm/of_device.h>
-#include <asm/of_platform.h>
 #include <asm/cell-regs.h>
 
 /*
@@ -256,6 +256,7 @@ void __init cbe_regs_init(void)
 			printk(KERN_ERR "cbe_regs: More BE chips than supported"
 			       "!\n");
 			cbe_regs_map_count--;
+			of_node_put(cpu);
 			return;
 		}
 		map->cpu_node = cpu;
diff --git a/arch/powerpc/platforms/cell/io-workarounds.c b/arch/powerpc/platforms/cell/io-workarounds.c
index 9d7c2ef940a..979d4b67efb 100644
--- a/arch/powerpc/platforms/cell/io-workarounds.c
+++ b/arch/powerpc/platforms/cell/io-workarounds.c
@@ -238,7 +238,7 @@ static void __init spider_pci_setup_chip(struct spider_pci_bus *bus)
 static void __init spider_pci_add_one(struct pci_controller *phb)
 {
 	struct spider_pci_bus *bus = &spider_pci_busses[spider_pci_count];
-	struct device_node *np = phb->arch_data;
+	struct device_node *np = phb->dn;
 	struct resource rsrc;
 	void __iomem *regs;
 
@@ -309,15 +309,12 @@ static int __init spider_pci_workaround_init(void)
 {
 	struct pci_controller *phb;
 
-	if (!machine_is(cell))
-		return 0;
-
 	/* Find spider bridges. We assume they have been all probed
 	 * in setup_arch(). If that was to change, we would need to
 	 * update this code to cope with dynamically added busses
 	 */
 	list_for_each_entry(phb, &hose_list, list_node) {
-		struct device_node *np = phb->arch_data;
+		struct device_node *np = phb->dn;
 		const char *model = of_get_property(np, "model", NULL);
 
 		/* If no model property or name isn't exactly "pci", skip */
@@ -343,4 +340,4 @@ static int __init spider_pci_workaround_init(void)
 
 	return 0;
 }
-arch_initcall(spider_pci_workaround_init);
+machine_arch_initcall(cell, spider_pci_workaround_init);
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index faabc3fdc13..df330666ccc 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -1,7 +1,7 @@
 /*
  * IOMMU implementation for Cell Broadband Processor Architecture
  *
- * (C) Copyright IBM Corporation 2006
+ * (C) Copyright IBM Corporation 2006-2008
  *
  * Author: Jeremy Kerr <jk@ozlabs.org>
  *
@@ -26,14 +26,15 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
+#include <linux/of_platform.h>
 
 #include <asm/prom.h>
 #include <asm/iommu.h>
 #include <asm/machdep.h>
 #include <asm/pci-bridge.h>
 #include <asm/udbg.h>
-#include <asm/of_platform.h>
 #include <asm/lmb.h>
+#include <asm/firmware.h>
 #include <asm/cell-regs.h>
 
 #include "interrupt.h"
@@ -305,29 +306,28 @@ static int cell_iommu_find_ioc(int nid, unsigned long *base)
 	return -ENODEV;
 }
 
-static void cell_iommu_setup_hardware(struct cbe_iommu *iommu, unsigned long size)
+static void cell_iommu_setup_page_tables(struct cbe_iommu *iommu,
+				unsigned long dbase, unsigned long dsize,
+				unsigned long fbase, unsigned long fsize)
 {
 	struct page *page;
-	int ret, i;
-	unsigned long reg, segments, pages_per_segment, ptab_size, n_pte_pages;
-	unsigned long xlate_base;
-	unsigned int virq;
-
-	if (cell_iommu_find_ioc(iommu->nid, &xlate_base))
-		panic("%s: missing IOC register mappings for node %d\n",
-		      __FUNCTION__, iommu->nid);
+	int i;
+	unsigned long reg, segments, pages_per_segment, ptab_size, stab_size,
+		      n_pte_pages, base;
 
-	iommu->xlate_regs = ioremap(xlate_base, IOC_Reg_Size);
-	iommu->cmd_regs = iommu->xlate_regs + IOC_IOCmd_Offset;
+	base = dbase;
+	if (fsize != 0)
+		base = min(fbase, dbase);
 
-	segments = size >> IO_SEGMENT_SHIFT;
+	segments = max(dbase + dsize, fbase + fsize) >> IO_SEGMENT_SHIFT;
 	pages_per_segment = 1ull << IO_PAGENO_BITS;
 
 	pr_debug("%s: iommu[%d]: segments: %lu, pages per segment: %lu\n",
 			__FUNCTION__, iommu->nid, segments, pages_per_segment);
 
 	/* set up the segment table */
-	page = alloc_pages_node(iommu->nid, GFP_KERNEL, 0);
+	stab_size = segments * sizeof(unsigned long);
+	page = alloc_pages_node(iommu->nid, GFP_KERNEL, get_order(stab_size));
 	BUG_ON(!page);
 	iommu->stab = page_address(page);
 	clear_page(iommu->stab);
@@ -371,11 +371,25 @@ static void cell_iommu_setup_hardware(struct cbe_iommu *iommu, unsigned long siz
 	}
 
 	pr_debug("Setting up IOMMU stab:\n");
-	for (i = 0; i * (1ul << IO_SEGMENT_SHIFT) < size; i++) {
+	for (i = base >> IO_SEGMENT_SHIFT; i < segments; i++) {
 		iommu->stab[i] = reg |
 			(__pa(iommu->ptab) + n_pte_pages * IOMMU_PAGE_SIZE * i);
 		pr_debug("\t[%d] 0x%016lx\n", i, iommu->stab[i]);
 	}
+}
+
+static void cell_iommu_enable_hardware(struct cbe_iommu *iommu)
+{
+	int ret;
+	unsigned long reg, xlate_base;
+	unsigned int virq;
+
+	if (cell_iommu_find_ioc(iommu->nid, &xlate_base))
+		panic("%s: missing IOC register mappings for node %d\n",
+		      __FUNCTION__, iommu->nid);
+
+	iommu->xlate_regs = ioremap(xlate_base, IOC_Reg_Size);
+	iommu->cmd_regs = iommu->xlate_regs + IOC_IOCmd_Offset;
 
 	/* ensure that the STEs have updated */
 	mb();
@@ -405,6 +419,13 @@ static void cell_iommu_setup_hardware(struct cbe_iommu *iommu, unsigned long siz
 	out_be64(iommu->cmd_regs + IOC_IOCmd_Cfg, reg);
 }
 
+static void cell_iommu_setup_hardware(struct cbe_iommu *iommu,
+	unsigned long base, unsigned long size)
+{
+	cell_iommu_setup_page_tables(iommu, base, size, 0, 0);
+	cell_iommu_enable_hardware(iommu);
+}
+
 #if 0/* Unused for now */
 static struct iommu_window *find_window(struct cbe_iommu *iommu,
 		unsigned long offset, unsigned long size)
@@ -422,25 +443,36 @@ static struct iommu_window *find_window(struct cbe_iommu *iommu,
 }
 #endif
 
+static inline u32 cell_iommu_get_ioid(struct device_node *np)
+{
+	const u32 *ioid;
+
+	ioid = of_get_property(np, "ioid", NULL);
+	if (ioid == NULL) {
+		printk(KERN_WARNING "iommu: missing ioid for %s using 0\n",
+		       np->full_name);
+		return 0;
+	}
+
+	return *ioid;
+}
+
 static struct iommu_window * __init
 cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 			unsigned long offset, unsigned long size,
 			unsigned long pte_offset)
 {
 	struct iommu_window *window;
-	const unsigned int *ioid;
+	u32 ioid;
 
-	ioid = of_get_property(np, "ioid", NULL);
-	if (ioid == NULL)
-		printk(KERN_WARNING "iommu: missing ioid for %s using 0\n",
-		       np->full_name);
+	ioid = cell_iommu_get_ioid(np);
 
 	window = kmalloc_node(sizeof(*window), GFP_KERNEL, iommu->nid);
 	BUG_ON(window == NULL);
 
 	window->offset = offset;
 	window->size = size;
-	window->ioid = ioid ? *ioid : 0;
+	window->ioid = ioid;
 	window->iommu = iommu;
 	window->pte_offset = pte_offset;
 
@@ -489,16 +521,17 @@ static struct cbe_iommu *cell_iommu_for_node(int nid)
 	return NULL;
 }
 
-static void cell_dma_dev_setup(struct device *dev)
+static unsigned long cell_dma_direct_offset;
+
+static unsigned long dma_iommu_fixed_base;
+struct dma_mapping_ops dma_iommu_fixed_ops;
+
+static void cell_dma_dev_setup_iommu(struct device *dev)
 {
 	struct iommu_window *window;
 	struct cbe_iommu *iommu;
 	struct dev_archdata *archdata = &dev->archdata;
 
-	/* If we run without iommu, no need to do anything */
-	if (get_pci_dma_ops() == &dma_direct_ops)
-		return;
-
 	/* Current implementation uses the first window available in that
 	 * node's iommu. We -might- do something smarter later though it may
 	 * never be necessary
@@ -515,6 +548,23 @@ static void cell_dma_dev_setup(struct device *dev)
 	archdata->dma_data = &window->table;
 }
 
+static void cell_dma_dev_setup_static(struct device *dev);
+
+static void cell_dma_dev_setup(struct device *dev)
+{
+	struct dev_archdata *archdata = &dev->archdata;
+
+	/* Order is important here, these are not mutually exclusive */
+	if (get_dma_ops(dev) == &dma_iommu_fixed_ops)
+		cell_dma_dev_setup_static(dev);
+	else if (get_pci_dma_ops() == &dma_iommu_ops)
+		cell_dma_dev_setup_iommu(dev);
+	else if (get_pci_dma_ops() == &dma_direct_ops)
+		archdata->dma_data = (void *)cell_dma_direct_offset;
+	else
+		BUG();
+}
+
 static void cell_pci_dma_dev_setup(struct pci_dev *dev)
 {
 	cell_dma_dev_setup(&dev->dev);
@@ -560,10 +610,9 @@ static int __init cell_iommu_get_window(struct device_node *np,
 	return 0;
 }
 
-static void __init cell_iommu_init_one(struct device_node *np, unsigned long offset)
+static struct cbe_iommu * __init cell_iommu_alloc(struct device_node *np)
 {
 	struct cbe_iommu *iommu;
-	unsigned long base, size;
 	int nid, i;
 
 	/* Get node ID */
@@ -571,7 +620,7 @@ static void __init cell_iommu_init_one(struct device_node *np, unsigned long off
 	if (nid < 0) {
 		printk(KERN_ERR "iommu: failed to get node for %s\n",
 		       np->full_name);
-		return;
+		return NULL;
 	}
 	pr_debug("iommu: setting up iommu for node %d (%s)\n",
 		 nid, np->full_name);
@@ -587,7 +636,7 @@ static void __init cell_iommu_init_one(struct device_node *np, unsigned long off
 	if (cbe_nr_iommus >= NR_IOMMUS) {
 		printk(KERN_ERR "iommu: too many IOMMUs detected ! (%s)\n",
 		       np->full_name);
-		return;
+		return NULL;
 	}
 
 	/* Init base fields */
@@ -598,6 +647,19 @@ static void __init cell_iommu_init_one(struct device_node *np, unsigned long off
 	snprintf(iommu->name, sizeof(iommu->name), "iommu%d", i);
 	INIT_LIST_HEAD(&iommu->windows);
 
+	return iommu;
+}
+
+static void __init cell_iommu_init_one(struct device_node *np,
+				       unsigned long offset)
+{
+	struct cbe_iommu *iommu;
+	unsigned long base, size;
+
+	iommu = cell_iommu_alloc(np);
+	if (!iommu)
+		return;
+
 	/* Obtain a window for it */
 	cell_iommu_get_window(np, &base, &size);
 
@@ -605,7 +667,7 @@ static void __init cell_iommu_init_one(struct device_node *np, unsigned long off
 		 base, base + size - 1);
 
 	/* Initialize the hardware */
-	cell_iommu_setup_hardware(iommu, size);
+	cell_iommu_setup_hardware(iommu, base, size);
 
 	/* Setup the iommu_table */
 	cell_iommu_setup_window(iommu, np, base, size,
@@ -653,7 +715,7 @@ static int __init cell_iommu_init_disabled(void)
 
 	/* If we have no Axon, we set up the spider DMA magic offset */
 	if (of_find_node_by_name(NULL, "axon") == NULL)
-		dma_direct_offset = SPIDER_DMA_OFFSET;
+		cell_dma_direct_offset = SPIDER_DMA_OFFSET;
 
 	/* Now we need to check to see where the memory is mapped
 	 * in PCI space. We assume that all busses use the same dma
@@ -687,20 +749,274 @@ static int __init cell_iommu_init_disabled(void)
 		return -ENODEV;
 	}
 
-	dma_direct_offset += base;
+	cell_dma_direct_offset += base;
+
+	if (cell_dma_direct_offset != 0)
+		ppc_md.pci_dma_dev_setup = cell_pci_dma_dev_setup;
 
 	printk("iommu: disabled, direct DMA offset is 0x%lx\n",
-	       dma_direct_offset);
+	       cell_dma_direct_offset);
 
 	return 0;
 }
 
-static int __init cell_iommu_init(void)
+/*
+ *  Fixed IOMMU mapping support
+ *
+ *  This code adds support for setting up a fixed IOMMU mapping on certain
+ *  cell machines. For 64-bit devices this avoids the performance overhead of
+ *  mapping and unmapping pages at runtime. 32-bit devices are unable to use
+ *  the fixed mapping.
+ *
+ *  The fixed mapping is established at boot, and maps all of physical memory
+ *  1:1 into device space at some offset. On machines with < 30 GB of memory
+ *  we setup the fixed mapping immediately above the normal IOMMU window.
+ *
+ *  For example a machine with 4GB of memory would end up with the normal
+ *  IOMMU window from 0-2GB and the fixed mapping window from 2GB to 6GB. In
+ *  this case a 64-bit device wishing to DMA to 1GB would be told to DMA to
+ *  3GB, plus any offset required by firmware. The firmware offset is encoded
+ *  in the "dma-ranges" property.
+ *
+ *  On machines with 30GB or more of memory, we are unable to place the fixed
+ *  mapping above the normal IOMMU window as we would run out of address space.
+ *  Instead we move the normal IOMMU window to coincide with the hash page
+ *  table, this region does not need to be part of the fixed mapping as no
+ *  device should ever be DMA'ing to it. We then setup the fixed mapping
+ *  from 0 to 32GB.
+ */
+
+static u64 cell_iommu_get_fixed_address(struct device *dev)
 {
+	u64 cpu_addr, size, best_size, pci_addr = OF_BAD_ADDR;
+	struct device_node *tmp, *np;
+	const u32 *ranges = NULL;
+	int i, len, best;
+
+	np = dev->archdata.of_node;
+	of_node_get(np);
+	ranges = of_get_property(np, "dma-ranges", &len);
+	while (!ranges && np) {
+		tmp = of_get_parent(np);
+		of_node_put(np);
+		np = tmp;
+		ranges = of_get_property(np, "dma-ranges", &len);
+	}
+
+	if (!ranges) {
+		dev_dbg(dev, "iommu: no dma-ranges found\n");
+		goto out;
+	}
+
+	len /= sizeof(u32);
+
+	/* dma-ranges format:
+	 * 1 cell:  pci space
+	 * 2 cells: pci address
+	 * 2 cells: parent address
+	 * 2 cells: size
+	 */
+	for (i = 0, best = -1, best_size = 0; i < len; i += 7) {
+		cpu_addr = of_translate_dma_address(np, ranges +i + 3);
+		size = of_read_number(ranges + i + 5, 2);
+
+		if (cpu_addr == 0 && size > best_size) {
+			best = i;
+			best_size = size;
+		}
+	}
+
+	if (best >= 0)
+		pci_addr = of_read_number(ranges + best + 1, 2);
+	else
+		dev_dbg(dev, "iommu: no suitable range found!\n");
+
+out:
+	of_node_put(np);
+
+	return pci_addr;
+}
+
+static int dma_set_mask_and_switch(struct device *dev, u64 dma_mask)
+{
+	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
+		return -EIO;
+
+	if (dma_mask == DMA_BIT_MASK(64)) {
+		if (cell_iommu_get_fixed_address(dev) == OF_BAD_ADDR)
+			dev_dbg(dev, "iommu: 64-bit OK, but bad addr\n");
+		else {
+			dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
+			set_dma_ops(dev, &dma_iommu_fixed_ops);
+			cell_dma_dev_setup(dev);
+		}
+	} else {
+		dev_dbg(dev, "iommu: not 64-bit, using default ops\n");
+		set_dma_ops(dev, get_pci_dma_ops());
+	}
+
+	*dev->dma_mask = dma_mask;
+
+	return 0;
+}
+
+static void cell_dma_dev_setup_static(struct device *dev)
+{
+	struct dev_archdata *archdata = &dev->archdata;
+	u64 addr;
+
+	addr = cell_iommu_get_fixed_address(dev) + dma_iommu_fixed_base;
+	archdata->dma_data = (void *)addr;
+
+	dev_dbg(dev, "iommu: fixed addr = %lx\n", addr);
+}
+
+static void cell_iommu_setup_fixed_ptab(struct cbe_iommu *iommu,
+	struct device_node *np, unsigned long dbase, unsigned long dsize,
+	unsigned long fbase, unsigned long fsize)
+{
+	unsigned long base_pte, uaddr, *io_pte;
+	int i;
+
+	dma_iommu_fixed_base = fbase;
+
+	/* convert from bytes into page table indices */
+	dbase = dbase >> IOMMU_PAGE_SHIFT;
+	dsize = dsize >> IOMMU_PAGE_SHIFT;
+	fbase = fbase >> IOMMU_PAGE_SHIFT;
+	fsize = fsize >> IOMMU_PAGE_SHIFT;
+
+	pr_debug("iommu: mapping 0x%lx pages from 0x%lx\n", fsize, fbase);
+
+	io_pte = iommu->ptab;
+	base_pte = IOPTE_PP_W | IOPTE_PP_R | IOPTE_M | IOPTE_SO_RW
+		    | (cell_iommu_get_ioid(np) & IOPTE_IOID_Mask);
+
+	uaddr = 0;
+	for (i = fbase; i < fbase + fsize; i++, uaddr += IOMMU_PAGE_SIZE) {
+		/* Don't touch the dynamic region */
+		if (i >= dbase && i < (dbase + dsize)) {
+			pr_debug("iommu: static/dynamic overlap, skipping\n");
+			continue;
+		}
+		io_pte[i] = base_pte | (__pa(uaddr) & IOPTE_RPN_Mask);
+	}
+
+	mb();
+}
+
+static int __init cell_iommu_fixed_mapping_init(void)
+{
+	unsigned long dbase, dsize, fbase, fsize, hbase, hend;
+	struct cbe_iommu *iommu;
 	struct device_node *np;
 
-	if (!machine_is(cell))
-		return -ENODEV;
+	/* The fixed mapping is only supported on axon machines */
+	np = of_find_node_by_name(NULL, "axon");
+	if (!np) {
+		pr_debug("iommu: fixed mapping disabled, no axons found\n");
+		return -1;
+	}
+
+	/* The default setup is to have the fixed mapping sit after the
+	 * dynamic region, so find the top of the largest IOMMU window
+	 * on any axon, then add the size of RAM and that's our max value.
+	 * If that is > 32GB we have to do other shennanigans.
+	 */
+	fbase = 0;
+	for_each_node_by_name(np, "axon") {
+		cell_iommu_get_window(np, &dbase, &dsize);
+		fbase = max(fbase, dbase + dsize);
+	}
+
+	fbase = _ALIGN_UP(fbase, 1 << IO_SEGMENT_SHIFT);
+	fsize = lmb_phys_mem_size();
+
+	if ((fbase + fsize) <= 0x800000000)
+		hbase = 0; /* use the device tree window */
+	else {
+		/* If we're over 32 GB we need to cheat. We can't map all of
+		 * RAM with the fixed mapping, and also fit the dynamic
+		 * region. So try to place the dynamic region where the hash
+		 * table sits, drivers never need to DMA to it, we don't
+		 * need a fixed mapping for that area.
+		 */
+		if (!htab_address) {
+			pr_debug("iommu: htab is NULL, on LPAR? Huh?\n");
+			return -1;
+		}
+		hbase = __pa(htab_address);
+		hend  = hbase + htab_size_bytes;
+
+		/* The window must start and end on a segment boundary */
+		if ((hbase != _ALIGN_UP(hbase, 1 << IO_SEGMENT_SHIFT)) ||
+		    (hend != _ALIGN_UP(hend, 1 << IO_SEGMENT_SHIFT))) {
+			pr_debug("iommu: hash window not segment aligned\n");
+			return -1;
+		}
+
+		/* Check the hash window fits inside the real DMA window */
+		for_each_node_by_name(np, "axon") {
+			cell_iommu_get_window(np, &dbase, &dsize);
+
+			if (hbase < dbase || (hend > (dbase + dsize))) {
+				pr_debug("iommu: hash window doesn't fit in"
+					 "real DMA window\n");
+				return -1;
+			}
+		}
+
+		fbase = 0;
+	}
+
+	/* Setup the dynamic regions */
+	for_each_node_by_name(np, "axon") {
+		iommu = cell_iommu_alloc(np);
+		BUG_ON(!iommu);
+
+		if (hbase == 0)
+			cell_iommu_get_window(np, &dbase, &dsize);
+		else {
+			dbase = hbase;
+			dsize = htab_size_bytes;
+		}
+
+		pr_debug("iommu: setting up %d, dynamic window %lx-%lx " \
+			 "fixed window %lx-%lx\n", iommu->nid, dbase,
+			 dbase + dsize, fbase, fbase + fsize);
+
+		cell_iommu_setup_page_tables(iommu, dbase, dsize, fbase, fsize);
+		cell_iommu_setup_fixed_ptab(iommu, np, dbase, dsize,
+					     fbase, fsize);
+		cell_iommu_enable_hardware(iommu);
+		cell_iommu_setup_window(iommu, np, dbase, dsize, 0);
+	}
+
+	dma_iommu_fixed_ops = dma_direct_ops;
+	dma_iommu_fixed_ops.set_dma_mask = dma_set_mask_and_switch;
+
+	dma_iommu_ops.set_dma_mask = dma_set_mask_and_switch;
+	set_pci_dma_ops(&dma_iommu_ops);
+
+	printk(KERN_DEBUG "IOMMU fixed mapping established.\n");
+
+	return 0;
+}
+
+static int iommu_fixed_disabled;
+
+static int __init setup_iommu_fixed(char *str)
+{
+	if (strcmp(str, "off") == 0)
+		iommu_fixed_disabled = 1;
+
+	return 1;
+}
+__setup("iommu_fixed=", setup_iommu_fixed);
+
+static int __init cell_iommu_init(void)
+{
+	struct device_node *np;
 
 	/* If IOMMU is disabled or we have little enough RAM to not need
 	 * to enable it, we setup a direct mapping.
@@ -717,6 +1033,9 @@ static int __init cell_iommu_init(void)
 	ppc_md.tce_build = tce_build_cell;
 	ppc_md.tce_free = tce_free_cell;
 
+	if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0)
+		goto bail;
+
 	/* Create an iommu for each /axon node.  */
 	for_each_node_by_name(np, "axon") {
 		if (np->parent == NULL || np->parent->parent != NULL)
@@ -744,5 +1063,6 @@ static int __init cell_iommu_init(void)
 
 	return 0;
 }
-arch_initcall(cell_iommu_init);
+machine_arch_initcall(cell, cell_iommu_init);
+machine_arch_initcall(celleb_native, cell_iommu_init);
 
diff --git a/arch/powerpc/platforms/cell/pmu.c b/arch/powerpc/platforms/cell/pmu.c
index 1ed30367888..69ed0d7f164 100644
--- a/arch/powerpc/platforms/cell/pmu.c
+++ b/arch/powerpc/platforms/cell/pmu.c
@@ -213,7 +213,7 @@ u32 cbe_read_pm(u32 cpu, enum pm_reg_name reg)
 		break;
 
 	case pm_interval:
-		READ_SHADOW_REG(val, pm_interval);
+		READ_MMIO_UPPER32(val, pm_interval);
 		break;
 
 	case pm_start_stop:
@@ -381,9 +381,6 @@ static int __init cbe_init_pm_irq(void)
 	unsigned int irq;
 	int rc, node;
 
-	if (!machine_is(cell))
-		return 0;
-
 	for_each_node(node) {
 		irq = irq_create_mapping(NULL, IIC_IRQ_IOEX_PMI |
 					       (node << IIC_IRQ_NODE_SHIFT));
@@ -404,7 +401,7 @@ static int __init cbe_init_pm_irq(void)
 
 	return 0;
 }
-arch_initcall(cbe_init_pm_irq);
+machine_arch_initcall(cell, cbe_init_pm_irq);
 
 void cbe_sync_irq(int node)
 {
diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c
index 98e7ef8e6fc..e6534b519c9 100644
--- a/arch/powerpc/platforms/cell/setup.c
+++ b/arch/powerpc/platforms/cell/setup.c
@@ -30,6 +30,7 @@
 #include <linux/console.h>
 #include <linux/mutex.h>
 #include <linux/memory_hotplug.h>
+#include <linux/of_platform.h>
 
 #include <asm/mmu.h>
 #include <asm/processor.h>
@@ -51,7 +52,6 @@
 #include <asm/spu_priv1.h>
 #include <asm/udbg.h>
 #include <asm/mpic.h>
-#include <asm/of_platform.h>
 #include <asm/cell-regs.h>
 
 #include "interrupt.h"
@@ -85,9 +85,6 @@ static int __init cell_publish_devices(void)
 {
 	int node;
 
-	if (!machine_is(cell))
-		return 0;
-
 	/* Publish OF platform devices for southbridge IOs */
 	of_platform_bus_probe(NULL, NULL, NULL);
 
@@ -101,7 +98,7 @@ static int __init cell_publish_devices(void)
 	}
 	return 0;
 }
-device_initcall(cell_publish_devices);
+machine_device_initcall(cell, cell_publish_devices);
 
 static void cell_mpic_cascade(unsigned int irq, struct irq_desc *desc)
 {
diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c
index e4438456c86..efb3964457b 100644
--- a/arch/powerpc/platforms/cell/smp.c
+++ b/arch/powerpc/platforms/cell/smp.c
@@ -42,6 +42,7 @@
 #include <asm/firmware.h>
 #include <asm/system.h>
 #include <asm/rtas.h>
+#include <asm/cputhreads.h>
 
 #include "interrupt.h"
 #include <asm/udbg.h>
@@ -182,7 +183,7 @@ static int smp_cell_cpu_bootable(unsigned int nr)
 	 */
 	if (system_state < SYSTEM_RUNNING &&
 	    cpu_has_feature(CPU_FTR_SMT) &&
-	    !smt_enabled_at_boot && nr % 2 != 0)
+	    !smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
 		return 0;
 
 	return 1;
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index a0886220364..e45cfa84911 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -34,6 +34,7 @@
 #include <linux/linux_logo.h>
 #include <asm/spu.h>
 #include <asm/spu_priv1.h>
+#include <asm/spu_csa.h>
 #include <asm/xmon.h>
 #include <asm/prom.h>
 
@@ -47,6 +48,13 @@ struct cbe_spu_info cbe_spu_info[MAX_NUMNODES];
 EXPORT_SYMBOL_GPL(cbe_spu_info);
 
 /*
+ * The spufs fault-handling code needs to call force_sig_info to raise signals
+ * on DMA errors. Export it here to avoid general kernel-wide access to this
+ * function
+ */
+EXPORT_SYMBOL_GPL(force_sig_info);
+
+/*
  * Protects cbe_spu_info and spu->number.
  */
 static DEFINE_SPINLOCK(spu_lock);
@@ -66,6 +74,10 @@ static LIST_HEAD(spu_full_list);
 static DEFINE_SPINLOCK(spu_full_list_lock);
 static DEFINE_MUTEX(spu_full_list_mutex);
 
+struct spu_slb {
+	u64 esid, vsid;
+};
+
 void spu_invalidate_slbs(struct spu *spu)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
@@ -114,40 +126,36 @@ void spu_associate_mm(struct spu *spu, struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(spu_associate_mm);
 
-static int __spu_trap_invalid_dma(struct spu *spu)
+int spu_64k_pages_available(void)
 {
-	pr_debug("%s\n", __FUNCTION__);
-	spu->dma_callback(spu, SPE_EVENT_INVALID_DMA);
-	return 0;
+	return mmu_psize_defs[MMU_PAGE_64K].shift != 0;
 }
+EXPORT_SYMBOL_GPL(spu_64k_pages_available);
 
-static int __spu_trap_dma_align(struct spu *spu)
+static void spu_restart_dma(struct spu *spu)
 {
-	pr_debug("%s\n", __FUNCTION__);
-	spu->dma_callback(spu, SPE_EVENT_DMA_ALIGNMENT);
-	return 0;
-}
+	struct spu_priv2 __iomem *priv2 = spu->priv2;
 
-static int __spu_trap_error(struct spu *spu)
-{
-	pr_debug("%s\n", __FUNCTION__);
-	spu->dma_callback(spu, SPE_EVENT_SPE_ERROR);
-	return 0;
+	if (!test_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags))
+		out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESTART_DMA_COMMAND);
 }
 
-static void spu_restart_dma(struct spu *spu)
+static inline void spu_load_slb(struct spu *spu, int slbe, struct spu_slb *slb)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
 
-	if (!test_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags))
-		out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESTART_DMA_COMMAND);
+	pr_debug("%s: adding SLB[%d] 0x%016lx 0x%016lx\n",
+			__func__, slbe, slb->vsid, slb->esid);
+
+	out_be64(&priv2->slb_index_W, slbe);
+	out_be64(&priv2->slb_vsid_RW, slb->vsid);
+	out_be64(&priv2->slb_esid_RW, slb->esid);
 }
 
 static int __spu_trap_data_seg(struct spu *spu, unsigned long ea)
 {
-	struct spu_priv2 __iomem *priv2 = spu->priv2;
 	struct mm_struct *mm = spu->mm;
-	u64 esid, vsid, llp;
+	struct spu_slb slb;
 	int psize;
 
 	pr_debug("%s\n", __FUNCTION__);
@@ -159,7 +167,7 @@ static int __spu_trap_data_seg(struct spu *spu, unsigned long ea)
 		printk("%s: invalid access during switch!\n", __func__);
 		return 1;
 	}
-	esid = (ea & ESID_MASK) | SLB_ESID_V;
+	slb.esid = (ea & ESID_MASK) | SLB_ESID_V;
 
 	switch(REGION_ID(ea)) {
 	case USER_REGION_ID:
@@ -168,21 +176,21 @@ static int __spu_trap_data_seg(struct spu *spu, unsigned long ea)
 #else
 		psize = mm->context.user_psize;
 #endif
-		vsid = (get_vsid(mm->context.id, ea, MMU_SEGSIZE_256M) << SLB_VSID_SHIFT) |
-				SLB_VSID_USER;
+		slb.vsid = (get_vsid(mm->context.id, ea, MMU_SEGSIZE_256M)
+				<< SLB_VSID_SHIFT) | SLB_VSID_USER;
 		break;
 	case VMALLOC_REGION_ID:
 		if (ea < VMALLOC_END)
 			psize = mmu_vmalloc_psize;
 		else
 			psize = mmu_io_psize;
-		vsid = (get_kernel_vsid(ea, MMU_SEGSIZE_256M) << SLB_VSID_SHIFT) |
-			SLB_VSID_KERNEL;
+		slb.vsid = (get_kernel_vsid(ea, MMU_SEGSIZE_256M)
+				<< SLB_VSID_SHIFT) | SLB_VSID_KERNEL;
 		break;
 	case KERNEL_REGION_ID:
 		psize = mmu_linear_psize;
-		vsid = (get_kernel_vsid(ea, MMU_SEGSIZE_256M) << SLB_VSID_SHIFT) |
-			SLB_VSID_KERNEL;
+		slb.vsid = (get_kernel_vsid(ea, MMU_SEGSIZE_256M)
+				<< SLB_VSID_SHIFT) | SLB_VSID_KERNEL;
 		break;
 	default:
 		/* Future: support kernel segments so that drivers
@@ -191,11 +199,9 @@ static int __spu_trap_data_seg(struct spu *spu, unsigned long ea)
 		pr_debug("invalid region access at %016lx\n", ea);
 		return 1;
 	}
-	llp = mmu_psize_defs[psize].sllp;
+	slb.vsid |= mmu_psize_defs[psize].sllp;
 
-	out_be64(&priv2->slb_index_W, spu->slb_replace);
-	out_be64(&priv2->slb_vsid_RW, vsid | llp);
-	out_be64(&priv2->slb_esid_RW, esid);
+	spu_load_slb(spu, spu->slb_replace, &slb);
 
 	spu->slb_replace++;
 	if (spu->slb_replace >= 8)
@@ -225,13 +231,83 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr)
 		return 1;
 	}
 
+	spu->class_0_pending = 0;
 	spu->dar = ea;
 	spu->dsisr = dsisr;
-	mb();
+
 	spu->stop_callback(spu);
+
+	return 0;
+}
+
+static void __spu_kernel_slb(void *addr, struct spu_slb *slb)
+{
+	unsigned long ea = (unsigned long)addr;
+	u64 llp;
+
+	if (REGION_ID(ea) == KERNEL_REGION_ID)
+		llp = mmu_psize_defs[mmu_linear_psize].sllp;
+	else
+		llp = mmu_psize_defs[mmu_virtual_psize].sllp;
+
+	slb->vsid = (get_kernel_vsid(ea, MMU_SEGSIZE_256M) << SLB_VSID_SHIFT) |
+		SLB_VSID_KERNEL | llp;
+	slb->esid = (ea & ESID_MASK) | SLB_ESID_V;
+}
+
+/**
+ * Given an array of @nr_slbs SLB entries, @slbs, return non-zero if the
+ * address @new_addr is present.
+ */
+static inline int __slb_present(struct spu_slb *slbs, int nr_slbs,
+		void *new_addr)
+{
+	unsigned long ea = (unsigned long)new_addr;
+	int i;
+
+	for (i = 0; i < nr_slbs; i++)
+		if (!((slbs[i].esid ^ ea) & ESID_MASK))
+			return 1;
+
 	return 0;
 }
 
+/**
+ * Setup the SPU kernel SLBs, in preparation for a context save/restore. We
+ * need to map both the context save area, and the save/restore code.
+ *
+ * Because the lscsa and code may cross segment boundaires, we check to see
+ * if mappings are required for the start and end of each range. We currently
+ * assume that the mappings are smaller that one segment - if not, something
+ * is seriously wrong.
+ */
+void spu_setup_kernel_slbs(struct spu *spu, struct spu_lscsa *lscsa,
+		void *code, int code_size)
+{
+	struct spu_slb slbs[4];
+	int i, nr_slbs = 0;
+	/* start and end addresses of both mappings */
+	void *addrs[] = {
+		lscsa, (void *)lscsa + sizeof(*lscsa) - 1,
+		code, code + code_size - 1
+	};
+
+	/* check the set of addresses, and create a new entry in the slbs array
+	 * if there isn't already a SLB for that address */
+	for (i = 0; i < ARRAY_SIZE(addrs); i++) {
+		if (__slb_present(slbs, nr_slbs, addrs[i]))
+			continue;
+
+		__spu_kernel_slb(addrs[i], &slbs[nr_slbs]);
+		nr_slbs++;
+	}
+
+	/* Add the set of SLBs */
+	for (i = 0; i < nr_slbs; i++)
+		spu_load_slb(spu, i, &slbs[i]);
+}
+EXPORT_SYMBOL_GPL(spu_setup_kernel_slbs);
+
 static irqreturn_t
 spu_irq_class_0(int irq, void *data)
 {
@@ -240,12 +316,13 @@ spu_irq_class_0(int irq, void *data)
 
 	spu = data;
 
+	spin_lock(&spu->register_lock);
 	mask = spu_int_mask_get(spu, 0);
-	stat = spu_int_stat_get(spu, 0);
-	stat &= mask;
+	stat = spu_int_stat_get(spu, 0) & mask;
 
-	spin_lock(&spu->register_lock);
 	spu->class_0_pending |= stat;
+	spu->dsisr = spu_mfc_dsisr_get(spu);
+	spu->dar = spu_mfc_dar_get(spu);
 	spin_unlock(&spu->register_lock);
 
 	spu->stop_callback(spu);
@@ -255,31 +332,6 @@ spu_irq_class_0(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-int
-spu_irq_class_0_bottom(struct spu *spu)
-{
-	unsigned long flags;
-	unsigned long stat;
-
-	spin_lock_irqsave(&spu->register_lock, flags);
-	stat = spu->class_0_pending;
-	spu->class_0_pending = 0;
-
-	if (stat & 1) /* invalid DMA alignment */
-		__spu_trap_dma_align(spu);
-
-	if (stat & 2) /* invalid MFC DMA */
-		__spu_trap_invalid_dma(spu);
-
-	if (stat & 4) /* error on SPU */
-		__spu_trap_error(spu);
-
-	spin_unlock_irqrestore(&spu->register_lock, flags);
-
-	return (stat & 0x7) ? -EIO : 0;
-}
-EXPORT_SYMBOL_GPL(spu_irq_class_0_bottom);
-
 static irqreturn_t
 spu_irq_class_1(int irq, void *data)
 {
@@ -294,24 +346,23 @@ spu_irq_class_1(int irq, void *data)
 	stat  = spu_int_stat_get(spu, 1) & mask;
 	dar   = spu_mfc_dar_get(spu);
 	dsisr = spu_mfc_dsisr_get(spu);
-	if (stat & 2) /* mapping fault */
+	if (stat & CLASS1_STORAGE_FAULT_INTR)
 		spu_mfc_dsisr_set(spu, 0ul);
 	spu_int_stat_clear(spu, 1, stat);
 	spin_unlock(&spu->register_lock);
 	pr_debug("%s: %lx %lx %lx %lx\n", __FUNCTION__, mask, stat,
 			dar, dsisr);
 
-	if (stat & 1) /* segment fault */
+	if (stat & CLASS1_SEGMENT_FAULT_INTR)
 		__spu_trap_data_seg(spu, dar);
 
-	if (stat & 2) { /* mapping fault */
+	if (stat & CLASS1_STORAGE_FAULT_INTR)
 		__spu_trap_data_map(spu, dar, dsisr);
-	}
 
-	if (stat & 4) /* ls compare & suspend on get */
+	if (stat & CLASS1_LS_COMPARE_SUSPEND_ON_GET_INTR)
 		;
 
-	if (stat & 8) /* ls compare & suspend on put */
+	if (stat & CLASS1_LS_COMPARE_SUSPEND_ON_PUT_INTR)
 		;
 
 	return stat ? IRQ_HANDLED : IRQ_NONE;
@@ -323,6 +374,8 @@ spu_irq_class_2(int irq, void *data)
 	struct spu *spu;
 	unsigned long stat;
 	unsigned long mask;
+	const int mailbox_intrs =
+		CLASS2_MAILBOX_THRESHOLD_INTR | CLASS2_MAILBOX_INTR;
 
 	spu = data;
 	spin_lock(&spu->register_lock);
@@ -330,31 +383,30 @@ spu_irq_class_2(int irq, void *data)
 	mask = spu_int_mask_get(spu, 2);
 	/* ignore interrupts we're not waiting for */
 	stat &= mask;
-	/*
-	 * mailbox interrupts (0x1 and 0x10) are level triggered.
-	 * mask them now before acknowledging.
-	 */
-	if (stat & 0x11)
-		spu_int_mask_and(spu, 2, ~(stat & 0x11));
+
+	/* mailbox interrupts are level triggered. mask them now before
+	 * acknowledging */
+	if (stat & mailbox_intrs)
+		spu_int_mask_and(spu, 2, ~(stat & mailbox_intrs));
 	/* acknowledge all interrupts before the callbacks */
 	spu_int_stat_clear(spu, 2, stat);
 	spin_unlock(&spu->register_lock);
 
 	pr_debug("class 2 interrupt %d, %lx, %lx\n", irq, stat, mask);
 
-	if (stat & 1)  /* PPC core mailbox */
+	if (stat & CLASS2_MAILBOX_INTR)
 		spu->ibox_callback(spu);
 
-	if (stat & 2) /* SPU stop-and-signal */
+	if (stat & CLASS2_SPU_STOP_INTR)
 		spu->stop_callback(spu);
 
-	if (stat & 4) /* SPU halted */
+	if (stat & CLASS2_SPU_HALT_INTR)
 		spu->stop_callback(spu);
 
-	if (stat & 8) /* DMA tag group complete */
+	if (stat & CLASS2_SPU_DMA_TAG_GROUP_COMPLETE_INTR)
 		spu->mfc_callback(spu);
 
-	if (stat & 0x10) /* SPU mailbox threshold */
+	if (stat & CLASS2_MAILBOX_THRESHOLD_INTR)
 		spu->wbox_callback(spu);
 
 	spu->stats.class2_intr++;
@@ -479,13 +531,27 @@ EXPORT_SYMBOL_GPL(spu_add_sysdev_attr);
 int spu_add_sysdev_attr_group(struct attribute_group *attrs)
 {
 	struct spu *spu;
+	int rc = 0;
 
 	mutex_lock(&spu_full_list_mutex);
-	list_for_each_entry(spu, &spu_full_list, full_list)
-		sysfs_create_group(&spu->sysdev.kobj, attrs);
+	list_for_each_entry(spu, &spu_full_list, full_list) {
+		rc = sysfs_create_group(&spu->sysdev.kobj, attrs);
+
+		/* we're in trouble here, but try unwinding anyway */
+		if (rc) {
+			printk(KERN_ERR "%s: can't create sysfs group '%s'\n",
+					__func__, attrs->name);
+
+			list_for_each_entry_continue_reverse(spu,
+					&spu_full_list, full_list)
+				sysfs_remove_group(&spu->sysdev.kobj, attrs);
+			break;
+		}
+	}
+
 	mutex_unlock(&spu_full_list_mutex);
 
-	return 0;
+	return rc;
 }
 EXPORT_SYMBOL_GPL(spu_add_sysdev_attr_group);
 
diff --git a/arch/powerpc/platforms/cell/spu_fault.c b/arch/powerpc/platforms/cell/spu_fault.c
new file mode 100644
index 00000000000..c8b1cd42905
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spu_fault.c
@@ -0,0 +1,98 @@
+/*
+ * SPU mm fault handler
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2007
+ *
+ * Author: Arnd Bergmann <arndb@de.ibm.com>
+ * Author: Jeremy Kerr <jk@ozlabs.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <asm/spu.h>
+#include <asm/spu_csa.h>
+
+/*
+ * This ought to be kept in sync with the powerpc specific do_page_fault
+ * function. Currently, there are a few corner cases that we haven't had
+ * to handle fortunately.
+ */
+int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
+		unsigned long dsisr, unsigned *flt)
+{
+	struct vm_area_struct *vma;
+	unsigned long is_write;
+	int ret;
+
+#if 0
+	if (!IS_VALID_EA(ea)) {
+		return -EFAULT;
+	}
+#endif /* XXX */
+	if (mm == NULL) {
+		return -EFAULT;
+	}
+	if (mm->pgd == NULL) {
+		return -EFAULT;
+	}
+
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, ea);
+	if (!vma)
+		goto bad_area;
+	if (vma->vm_start <= ea)
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+	if (expand_stack(vma, ea))
+		goto bad_area;
+good_area:
+	is_write = dsisr & MFC_DSISR_ACCESS_PUT;
+	if (is_write) {
+		if (!(vma->vm_flags & VM_WRITE))
+			goto bad_area;
+	} else {
+		if (dsisr & MFC_DSISR_ACCESS_DENIED)
+			goto bad_area;
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+			goto bad_area;
+	}
+	ret = 0;
+	*flt = handle_mm_fault(mm, vma, ea, is_write);
+	if (unlikely(*flt & VM_FAULT_ERROR)) {
+		if (*flt & VM_FAULT_OOM) {
+			ret = -ENOMEM;
+			goto bad_area;
+		} else if (*flt & VM_FAULT_SIGBUS) {
+			ret = -EFAULT;
+			goto bad_area;
+		}
+		BUG();
+	}
+	if (*flt & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
+	up_read(&mm->mmap_sem);
+	return ret;
+
+bad_area:
+	up_read(&mm->mmap_sem);
+	return -EFAULT;
+}
+EXPORT_SYMBOL_GPL(spu_handle_mm_fault);
diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c
index 1b010707488..d351bdebf5f 100644
--- a/arch/powerpc/platforms/cell/spu_manage.c
+++ b/arch/powerpc/platforms/cell/spu_manage.c
@@ -35,6 +35,7 @@
 #include <asm/firmware.h>
 #include <asm/prom.h>
 
+#include "spufs/spufs.h"
 #include "interrupt.h"
 
 struct device_node *spu_devnode(struct spu *spu)
@@ -345,7 +346,7 @@ static int __init of_create_spu(struct spu *spu, void *data)
 		}
 		ret = spu_map_interrupts_old(spu, spe);
 		if (ret) {
-			printk(KERN_ERR "%s: could not map interrupts",
+			printk(KERN_ERR "%s: could not map interrupts\n",
 				spu->name);
 			goto out_unmap;
 		}
@@ -369,6 +370,16 @@ static int of_destroy_spu(struct spu *spu)
 	return 0;
 }
 
+static void enable_spu_by_master_run(struct spu_context *ctx)
+{
+	ctx->ops->master_start(ctx);
+}
+
+static void disable_spu_by_master_run(struct spu_context *ctx)
+{
+	ctx->ops->master_stop(ctx);
+}
+
 /* Hardcoded affinity idxs for qs20 */
 #define QS20_SPES_PER_BE 8
 static int qs20_reg_idxs[QS20_SPES_PER_BE] =   { 0, 2, 4, 6, 7, 5, 3, 1 };
@@ -411,10 +422,15 @@ static void init_affinity_qs20_harcoded(void)
 
 static int of_has_vicinity(void)
 {
-	struct spu* spu;
+	struct device_node *dn;
 
-	spu = list_first_entry(&cbe_spu_info[0].spus, struct spu, cbe_list);
-	return of_find_property(spu_devnode(spu), "vicinity", NULL) != NULL;
+	for_each_node_by_type(dn, "spe") {
+		if (of_find_property(dn, "vicinity", NULL))  {
+			of_node_put(dn);
+			return 1;
+		}
+	}
+	return 0;
 }
 
 static struct spu *devnode_spu(int cbe, struct device_node *dn)
@@ -525,7 +541,7 @@ static int __init init_affinity(void)
 		if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0"))
 			init_affinity_qs20_harcoded();
 		else
-			printk("No affinity configuration found");
+			printk("No affinity configuration found\n");
 	}
 
 	return 0;
@@ -535,5 +551,7 @@ const struct spu_management_ops spu_management_of_ops = {
 	.enumerate_spus = of_enumerate_spus,
 	.create_spu = of_create_spu,
 	.destroy_spu = of_destroy_spu,
+	.enable_spu = enable_spu_by_master_run,
+	.disable_spu = disable_spu_by_master_run,
 	.init_affinity = init_affinity,
 };
diff --git a/arch/powerpc/platforms/cell/spufs/Makefile b/arch/powerpc/platforms/cell/spufs/Makefile
index 328afcf8950..d3a349fb42e 100644
--- a/arch/powerpc/platforms/cell/spufs/Makefile
+++ b/arch/powerpc/platforms/cell/spufs/Makefile
@@ -1,8 +1,8 @@
-obj-y += switch.o fault.o lscsa_alloc.o
 
 obj-$(CONFIG_SPU_FS) += spufs.o
 spufs-y += inode.o file.o context.o syscalls.o coredump.o
 spufs-y += sched.o backing_ops.o hw_ops.o run.o gang.o
+spufs-y += switch.o fault.o lscsa_alloc.o
 
 # Rules to build switch.o with the help of SPU tool chain
 SPU_CROSS	:= spu-
diff --git a/arch/powerpc/platforms/cell/spufs/backing_ops.c b/arch/powerpc/platforms/cell/spufs/backing_ops.c
index ec01214e51e..50d98a154aa 100644
--- a/arch/powerpc/platforms/cell/spufs/backing_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/backing_ops.c
@@ -106,16 +106,20 @@ static unsigned int spu_backing_mbox_stat_poll(struct spu_context *ctx,
 		if (stat & 0xff0000)
 			ret |= POLLIN | POLLRDNORM;
 		else {
-			ctx->csa.priv1.int_stat_class0_RW &= ~0x1;
-			ctx->csa.priv1.int_mask_class2_RW |= 0x1;
+			ctx->csa.priv1.int_stat_class2_RW &=
+				~CLASS2_MAILBOX_INTR;
+			ctx->csa.priv1.int_mask_class2_RW |=
+				CLASS2_ENABLE_MAILBOX_INTR;
 		}
 	}
 	if (events & (POLLOUT | POLLWRNORM)) {
 		if (stat & 0x00ff00)
 			ret = POLLOUT | POLLWRNORM;
 		else {
-			ctx->csa.priv1.int_stat_class0_RW &= ~0x10;
-			ctx->csa.priv1.int_mask_class2_RW |= 0x10;
+			ctx->csa.priv1.int_stat_class2_RW &=
+				~CLASS2_MAILBOX_THRESHOLD_INTR;
+			ctx->csa.priv1.int_mask_class2_RW |=
+				CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR;
 		}
 	}
 	spin_unlock_irq(&ctx->csa.register_lock);
@@ -139,7 +143,7 @@ static int spu_backing_ibox_read(struct spu_context *ctx, u32 * data)
 		ret = 4;
 	} else {
 		/* make sure we get woken up by the interrupt */
-		ctx->csa.priv1.int_mask_class2_RW |= 0x1UL;
+		ctx->csa.priv1.int_mask_class2_RW |= CLASS2_ENABLE_MAILBOX_INTR;
 		ret = 0;
 	}
 	spin_unlock(&ctx->csa.register_lock);
@@ -169,7 +173,8 @@ static int spu_backing_wbox_write(struct spu_context *ctx, u32 data)
 	} else {
 		/* make sure we get woken up by the interrupt when space
 		   becomes available */
-		ctx->csa.priv1.int_mask_class2_RW |= 0x10;
+		ctx->csa.priv1.int_mask_class2_RW |=
+			CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR;
 		ret = 0;
 	}
 	spin_unlock(&ctx->csa.register_lock);
@@ -268,6 +273,11 @@ static char *spu_backing_get_ls(struct spu_context *ctx)
 	return ctx->csa.lscsa->ls;
 }
 
+static void spu_backing_privcntl_write(struct spu_context *ctx, u64 val)
+{
+	ctx->csa.priv2.spu_privcntl_RW = val;
+}
+
 static u32 spu_backing_runcntl_read(struct spu_context *ctx)
 {
 	return ctx->csa.prob.spu_runcntl_RW;
@@ -285,6 +295,11 @@ static void spu_backing_runcntl_write(struct spu_context *ctx, u32 val)
 	spin_unlock(&ctx->csa.register_lock);
 }
 
+static void spu_backing_runcntl_stop(struct spu_context *ctx)
+{
+	spu_backing_runcntl_write(ctx, SPU_RUNCNTL_STOP);
+}
+
 static void spu_backing_master_start(struct spu_context *ctx)
 {
 	struct spu_state *csa = &ctx->csa;
@@ -358,7 +373,7 @@ static int spu_backing_send_mfc_command(struct spu_context *ctx,
 
 static void spu_backing_restart_dma(struct spu_context *ctx)
 {
-	/* nothing to do here */
+	ctx->csa.priv2.mfc_control_RW |= MFC_CNTL_RESTART_DMA_COMMAND;
 }
 
 struct spu_context_ops spu_backing_ops = {
@@ -379,8 +394,10 @@ struct spu_context_ops spu_backing_ops = {
 	.npc_write = spu_backing_npc_write,
 	.status_read = spu_backing_status_read,
 	.get_ls = spu_backing_get_ls,
+	.privcntl_write = spu_backing_privcntl_write,
 	.runcntl_read = spu_backing_runcntl_read,
 	.runcntl_write = spu_backing_runcntl_write,
+	.runcntl_stop = spu_backing_runcntl_stop,
 	.master_start = spu_backing_master_start,
 	.master_stop = spu_backing_master_stop,
 	.set_mfc_query = spu_backing_set_mfc_query,
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index adf0a030d6f..133995ed5cc 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -52,6 +52,7 @@ struct spu_context *alloc_spu_context(struct spu_gang *gang)
 	init_waitqueue_head(&ctx->wbox_wq);
 	init_waitqueue_head(&ctx->stop_wq);
 	init_waitqueue_head(&ctx->mfc_wq);
+	init_waitqueue_head(&ctx->run_wq);
 	ctx->state = SPU_STATE_SAVED;
 	ctx->ops = &spu_backing_ops;
 	ctx->owner = get_task_mm(current);
@@ -105,7 +106,17 @@ int put_spu_context(struct spu_context *ctx)
 void spu_forget(struct spu_context *ctx)
 {
 	struct mm_struct *mm;
-	spu_acquire_saved(ctx);
+
+	/*
+	 * This is basically an open-coded spu_acquire_saved, except that
+	 * we don't acquire the state mutex interruptible.
+	 */
+	mutex_lock(&ctx->state_mutex);
+	if (ctx->state != SPU_STATE_SAVED) {
+		set_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags);
+		spu_deactivate(ctx);
+	}
+
 	mm = ctx->owner;
 	ctx->owner = NULL;
 	mmput(mm);
@@ -133,47 +144,23 @@ void spu_unmap_mappings(struct spu_context *ctx)
 }
 
 /**
- * spu_acquire_runnable - lock spu contex and make sure it is in runnable state
+ * spu_acquire_saved - lock spu contex and make sure it is in saved state
  * @ctx:	spu contex to lock
- *
- * Note:
- *	Returns 0 and with the context locked on success
- *	Returns negative error and with the context _unlocked_ on failure.
  */
-int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags)
+int spu_acquire_saved(struct spu_context *ctx)
 {
-	int ret = -EINVAL;
-
-	spu_acquire(ctx);
-	if (ctx->state == SPU_STATE_SAVED) {
-		/*
-		 * Context is about to be freed, so we can't acquire it anymore.
-		 */
-		if (!ctx->owner)
-			goto out_unlock;
-		ret = spu_activate(ctx, flags);
-		if (ret)
-			goto out_unlock;
-	}
+	int ret;
 
-	return 0;
-
- out_unlock:
-	spu_release(ctx);
-	return ret;
-}
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 
-/**
- * spu_acquire_saved - lock spu contex and make sure it is in saved state
- * @ctx:	spu contex to lock
- */
-void spu_acquire_saved(struct spu_context *ctx)
-{
-	spu_acquire(ctx);
 	if (ctx->state != SPU_STATE_SAVED) {
 		set_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags);
 		spu_deactivate(ctx);
 	}
+
+	return 0;
 }
 
 /**
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 80f62363e1c..0c6a96b82b2 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -148,7 +148,9 @@ int spufs_coredump_extra_notes_size(void)
 
 	fd = 0;
 	while ((ctx = coredump_next_context(&fd)) != NULL) {
-		spu_acquire_saved(ctx);
+		rc = spu_acquire_saved(ctx);
+		if (rc)
+			break;
 		rc = spufs_ctx_note_size(ctx, fd);
 		spu_release_saved(ctx);
 		if (rc < 0)
@@ -224,7 +226,9 @@ int spufs_coredump_extra_notes_write(struct file *file, loff_t *foffset)
 
 	fd = 0;
 	while ((ctx = coredump_next_context(&fd)) != NULL) {
-		spu_acquire_saved(ctx);
+		rc = spu_acquire_saved(ctx);
+		if (rc)
+			return rc;
 
 		for (j = 0; spufs_coredump_read[j].name != NULL; j++) {
 			rc = spufs_arch_write_note(ctx, j, file, fd, foffset);
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index 917eab4be48..eff4d291ba8 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -28,117 +28,71 @@
 
 #include "spufs.h"
 
-/*
- * This ought to be kept in sync with the powerpc specific do_page_fault
- * function. Currently, there are a few corner cases that we haven't had
- * to handle fortunately.
+/**
+ * Handle an SPE event, depending on context SPU_CREATE_EVENTS_ENABLED flag.
+ *
+ * If the context was created with events, we just set the return event.
+ * Otherwise, send an appropriate signal to the process.
  */
-static int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
-		unsigned long dsisr, unsigned *flt)
+static void spufs_handle_event(struct spu_context *ctx,
+				unsigned long ea, int type)
 {
-	struct vm_area_struct *vma;
-	unsigned long is_write;
-	int ret;
+	siginfo_t info;
 
-#if 0
-	if (!IS_VALID_EA(ea)) {
-		return -EFAULT;
-	}
-#endif /* XXX */
-	if (mm == NULL) {
-		return -EFAULT;
-	}
-	if (mm->pgd == NULL) {
-		return -EFAULT;
+	if (ctx->flags & SPU_CREATE_EVENTS_ENABLED) {
+		ctx->event_return |= type;
+		wake_up_all(&ctx->stop_wq);
+		return;
 	}
 
-	down_read(&mm->mmap_sem);
-	vma = find_vma(mm, ea);
-	if (!vma)
-		goto bad_area;
-	if (vma->vm_start <= ea)
-		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
-	if (expand_stack(vma, ea))
-		goto bad_area;
-good_area:
-	is_write = dsisr & MFC_DSISR_ACCESS_PUT;
-	if (is_write) {
-		if (!(vma->vm_flags & VM_WRITE))
-			goto bad_area;
-	} else {
-		if (dsisr & MFC_DSISR_ACCESS_DENIED)
-			goto bad_area;
-		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
-			goto bad_area;
+	memset(&info, 0, sizeof(info));
+
+	switch (type) {
+	case SPE_EVENT_INVALID_DMA:
+		info.si_signo = SIGBUS;
+		info.si_code = BUS_OBJERR;
+		break;
+	case SPE_EVENT_SPE_DATA_STORAGE:
+		info.si_signo = SIGSEGV;
+		info.si_addr = (void __user *)ea;
+		info.si_code = SEGV_ACCERR;
+		ctx->ops->restart_dma(ctx);
+		break;
+	case SPE_EVENT_DMA_ALIGNMENT:
+		info.si_signo = SIGBUS;
+		/* DAR isn't set for an alignment fault :( */
+		info.si_code = BUS_ADRALN;
+		break;
+	case SPE_EVENT_SPE_ERROR:
+		info.si_signo = SIGILL;
+		info.si_addr = (void __user *)(unsigned long)
+			ctx->ops->npc_read(ctx) - 4;
+		info.si_code = ILL_ILLOPC;
+		break;
 	}
-	ret = 0;
-	*flt = handle_mm_fault(mm, vma, ea, is_write);
-	if (unlikely(*flt & VM_FAULT_ERROR)) {
-		if (*flt & VM_FAULT_OOM) {
-			ret = -ENOMEM;
-			goto bad_area;
-		} else if (*flt & VM_FAULT_SIGBUS) {
-			ret = -EFAULT;
-			goto bad_area;
-		}
-		BUG();
-	}
-	if (*flt & VM_FAULT_MAJOR)
-		current->maj_flt++;
-	else
-		current->min_flt++;
-	up_read(&mm->mmap_sem);
-	return ret;
 
-bad_area:
-	up_read(&mm->mmap_sem);
-	return -EFAULT;
+	if (info.si_signo)
+		force_sig_info(info.si_signo, &info, current);
 }
 
-static void spufs_handle_dma_error(struct spu_context *ctx,
-				unsigned long ea, int type)
+int spufs_handle_class0(struct spu_context *ctx)
 {
-	if (ctx->flags & SPU_CREATE_EVENTS_ENABLED) {
-		ctx->event_return |= type;
-		wake_up_all(&ctx->stop_wq);
-	} else {
-		siginfo_t info;
-		memset(&info, 0, sizeof(info));
-
-		switch (type) {
-		case SPE_EVENT_INVALID_DMA:
-			info.si_signo = SIGBUS;
-			info.si_code = BUS_OBJERR;
-			break;
-		case SPE_EVENT_SPE_DATA_STORAGE:
-			info.si_signo = SIGBUS;
-			info.si_addr = (void __user *)ea;
-			info.si_code = BUS_ADRERR;
-			break;
-		case SPE_EVENT_DMA_ALIGNMENT:
-			info.si_signo = SIGBUS;
-			/* DAR isn't set for an alignment fault :( */
-			info.si_code = BUS_ADRALN;
-			break;
-		case SPE_EVENT_SPE_ERROR:
-			info.si_signo = SIGILL;
-			info.si_addr = (void __user *)(unsigned long)
-				ctx->ops->npc_read(ctx) - 4;
-			info.si_code = ILL_ILLOPC;
-			break;
-		}
-		if (info.si_signo)
-			force_sig_info(info.si_signo, &info, current);
-	}
-}
+	unsigned long stat = ctx->csa.class_0_pending & CLASS0_INTR_MASK;
 
-void spufs_dma_callback(struct spu *spu, int type)
-{
-	spufs_handle_dma_error(spu->ctx, spu->dar, type);
+	if (likely(!stat))
+		return 0;
+
+	if (stat & CLASS0_DMA_ALIGNMENT_INTR)
+		spufs_handle_event(ctx, ctx->csa.dar, SPE_EVENT_DMA_ALIGNMENT);
+
+	if (stat & CLASS0_INVALID_DMA_COMMAND_INTR)
+		spufs_handle_event(ctx, ctx->csa.dar, SPE_EVENT_INVALID_DMA);
+
+	if (stat & CLASS0_SPU_ERROR_INTR)
+		spufs_handle_event(ctx, ctx->csa.dar, SPE_EVENT_SPE_ERROR);
+
+	return -EIO;
 }
-EXPORT_SYMBOL_GPL(spufs_dma_callback);
 
 /*
  * bottom half handler for page faults, we can't do this from
@@ -154,7 +108,7 @@ int spufs_handle_class1(struct spu_context *ctx)
 	u64 ea, dsisr, access;
 	unsigned long flags;
 	unsigned flt = 0;
-	int ret;
+	int ret, ret2;
 
 	/*
 	 * dar and dsisr get passed from the registers
@@ -165,16 +119,8 @@ int spufs_handle_class1(struct spu_context *ctx)
 	 * in time, we can still expect to get the same fault
 	 * the immediately after the context restore.
 	 */
-	if (ctx->state == SPU_STATE_RUNNABLE) {
-		ea = ctx->spu->dar;
-		dsisr = ctx->spu->dsisr;
-		ctx->spu->dar= ctx->spu->dsisr = 0;
-	} else {
-		ea = ctx->csa.priv1.mfc_dar_RW;
-		dsisr = ctx->csa.priv1.mfc_dsisr_RW;
-		ctx->csa.priv1.mfc_dar_RW = 0;
-		ctx->csa.priv1.mfc_dsisr_RW = 0;
-	}
+	ea = ctx->csa.dar;
+	dsisr = ctx->csa.dsisr;
 
 	if (!(dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED)))
 		return 0;
@@ -201,7 +147,22 @@ int spufs_handle_class1(struct spu_context *ctx)
 	if (ret)
 		ret = spu_handle_mm_fault(current->mm, ea, dsisr, &flt);
 
-	spu_acquire(ctx);
+	/*
+	 * If spu_acquire fails due to a pending signal we just want to return
+	 * EINTR to userspace even if that means missing the dma restart or
+	 * updating the page fault statistics.
+	 */
+	ret2 = spu_acquire(ctx);
+	if (ret2)
+		goto out;
+
+	/*
+	 * Clear dsisr under ctxt lock after handling the fault, so that
+	 * time slicing will not preempt the context while the page fault
+	 * handler is running. Context switch code removes mappings.
+	 */
+	ctx->csa.dar = ctx->csa.dsisr = 0;
+
 	/*
 	 * If we handled the fault successfully and are in runnable
 	 * state, restart the DMA.
@@ -222,9 +183,9 @@ int spufs_handle_class1(struct spu_context *ctx)
 		if (ctx->spu)
 			ctx->ops->restart_dma(ctx);
 	} else
-		spufs_handle_dma_error(ctx, ea, SPE_EVENT_SPE_DATA_STORAGE);
+		spufs_handle_event(ctx, ea, SPE_EVENT_SPE_DATA_STORAGE);
 
+ out:
 	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(spufs_handle_class1);
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index d9e56a50379..3fcd06418b0 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -40,6 +40,120 @@
 
 #define SPUFS_MMAP_4K (PAGE_SIZE == 0x1000)
 
+/* Simple attribute files */
+struct spufs_attr {
+	int (*get)(void *, u64 *);
+	int (*set)(void *, u64);
+	char get_buf[24];       /* enough to store a u64 and "\n\0" */
+	char set_buf[24];
+	void *data;
+	const char *fmt;        /* format for read operation */
+	struct mutex mutex;     /* protects access to these buffers */
+};
+
+static int spufs_attr_open(struct inode *inode, struct file *file,
+		int (*get)(void *, u64 *), int (*set)(void *, u64),
+		const char *fmt)
+{
+	struct spufs_attr *attr;
+
+	attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	attr->get = get;
+	attr->set = set;
+	attr->data = inode->i_private;
+	attr->fmt = fmt;
+	mutex_init(&attr->mutex);
+	file->private_data = attr;
+
+	return nonseekable_open(inode, file);
+}
+
+static int spufs_attr_release(struct inode *inode, struct file *file)
+{
+       kfree(file->private_data);
+	return 0;
+}
+
+static ssize_t spufs_attr_read(struct file *file, char __user *buf,
+		size_t len, loff_t *ppos)
+{
+	struct spufs_attr *attr;
+	size_t size;
+	ssize_t ret;
+
+	attr = file->private_data;
+	if (!attr->get)
+		return -EACCES;
+
+	ret = mutex_lock_interruptible(&attr->mutex);
+	if (ret)
+		return ret;
+
+	if (*ppos) {		/* continued read */
+		size = strlen(attr->get_buf);
+	} else {		/* first read */
+		u64 val;
+		ret = attr->get(attr->data, &val);
+		if (ret)
+			goto out;
+
+		size = scnprintf(attr->get_buf, sizeof(attr->get_buf),
+				 attr->fmt, (unsigned long long)val);
+	}
+
+	ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size);
+out:
+	mutex_unlock(&attr->mutex);
+	return ret;
+}
+
+static ssize_t spufs_attr_write(struct file *file, const char __user *buf,
+		size_t len, loff_t *ppos)
+{
+	struct spufs_attr *attr;
+	u64 val;
+	size_t size;
+	ssize_t ret;
+
+	attr = file->private_data;
+	if (!attr->set)
+		return -EACCES;
+
+	ret = mutex_lock_interruptible(&attr->mutex);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
+	size = min(sizeof(attr->set_buf) - 1, len);
+	if (copy_from_user(attr->set_buf, buf, size))
+		goto out;
+
+	ret = len; /* claim we got the whole input */
+	attr->set_buf[size] = '\0';
+	val = simple_strtol(attr->set_buf, NULL, 0);
+	attr->set(attr->data, val);
+out:
+	mutex_unlock(&attr->mutex);
+	return ret;
+}
+
+#define DEFINE_SPUFS_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt)	\
+static int __fops ## _open(struct inode *inode, struct file *file)	\
+{									\
+	__simple_attr_check_format(__fmt, 0ull);			\
+	return spufs_attr_open(inode, file, __get, __set, __fmt);	\
+}									\
+static struct file_operations __fops = {				\
+	.owner	 = THIS_MODULE,						\
+	.open	 = __fops ## _open,					\
+	.release = spufs_attr_release,					\
+	.read	 = spufs_attr_read,					\
+	.write	 = spufs_attr_write,					\
+};
+
 
 static int
 spufs_mem_open(struct inode *inode, struct file *file)
@@ -84,9 +198,12 @@ spufs_mem_read(struct file *file, char __user *buffer,
 	struct spu_context *ctx = file->private_data;
 	ssize_t ret;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ret = __spufs_mem_read(ctx, buffer, size, pos);
 	spu_release(ctx);
+
 	return ret;
 }
 
@@ -106,7 +223,10 @@ spufs_mem_write(struct file *file, const char __user *buffer,
 	if (size > LS_SIZE - pos)
 		size = LS_SIZE - pos;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
+
 	local_store = ctx->ops->get_ls(ctx);
 	ret = copy_from_user(local_store + pos, buffer, size);
 	spu_release(ctx);
@@ -146,7 +266,8 @@ static unsigned long spufs_mem_mmap_nopfn(struct vm_area_struct *vma,
 	pr_debug("spufs_mem_mmap_nopfn address=0x%lx -> 0x%lx, offset=0x%lx\n",
 		 addr0, address, offset);
 
-	spu_acquire(ctx);
+	if (spu_acquire(ctx))
+		return NOPFN_REFAULT;
 
 	if (ctx->state == SPU_STATE_SAVED) {
 		vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
@@ -236,23 +357,32 @@ static unsigned long spufs_ps_nopfn(struct vm_area_struct *vma,
 {
 	struct spu_context *ctx = vma->vm_file->private_data;
 	unsigned long area, offset = address - vma->vm_start;
-	int ret;
 
 	offset += vma->vm_pgoff << PAGE_SHIFT;
 	if (offset >= ps_size)
 		return NOPFN_SIGBUS;
 
-	/* error here usually means a signal.. we might want to test
-	 * the error code more precisely though
+	/*
+	 * We have to wait for context to be loaded before we have
+	 * pages to hand out to the user, but we don't want to wait
+	 * with the mmap_sem held.
+	 * It is possible to drop the mmap_sem here, but then we need
+	 * to return NOPFN_REFAULT because the mappings may have
+	 * hanged.
 	 */
-	ret = spu_acquire_runnable(ctx, 0);
-	if (ret)
+	if (spu_acquire(ctx))
 		return NOPFN_REFAULT;
 
-	area = ctx->spu->problem_phys + ps_offs;
-	vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT);
-	spu_release(ctx);
+	if (ctx->state == SPU_STATE_SAVED) {
+		up_read(&current->mm->mmap_sem);
+		spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE);
+		down_read(&current->mm->mmap_sem);
+	} else {
+		area = ctx->spu->problem_phys + ps_offs;
+		vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT);
+	}
 
+	spu_release(ctx);
 	return NOPFN_REFAULT;
 }
 
@@ -286,25 +416,32 @@ static int spufs_cntl_mmap(struct file *file, struct vm_area_struct *vma)
 #define spufs_cntl_mmap NULL
 #endif /* !SPUFS_MMAP_4K */
 
-static u64 spufs_cntl_get(void *data)
+static int spufs_cntl_get(void *data, u64 *val)
 {
 	struct spu_context *ctx = data;
-	u64 val;
+	int ret;
 
-	spu_acquire(ctx);
-	val = ctx->ops->status_read(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
+	*val = ctx->ops->status_read(ctx);
 	spu_release(ctx);
 
-	return val;
+	return 0;
 }
 
-static void spufs_cntl_set(void *data, u64 val)
+static int spufs_cntl_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
+	int ret;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->runcntl_write(ctx, val);
 	spu_release(ctx);
+
+	return 0;
 }
 
 static int spufs_cntl_open(struct inode *inode, struct file *file)
@@ -317,7 +454,7 @@ static int spufs_cntl_open(struct inode *inode, struct file *file)
 	if (!i->i_openers++)
 		ctx->cntl = inode->i_mapping;
 	mutex_unlock(&ctx->mapping_lock);
-	return simple_attr_open(inode, file, spufs_cntl_get,
+	return spufs_attr_open(inode, file, spufs_cntl_get,
 					spufs_cntl_set, "0x%08lx");
 }
 
@@ -327,7 +464,7 @@ spufs_cntl_release(struct inode *inode, struct file *file)
 	struct spufs_inode_info *i = SPUFS_I(inode);
 	struct spu_context *ctx = i->i_ctx;
 
-	simple_attr_close(inode, file);
+	spufs_attr_release(inode, file);
 
 	mutex_lock(&ctx->mapping_lock);
 	if (!--i->i_openers)
@@ -339,8 +476,8 @@ spufs_cntl_release(struct inode *inode, struct file *file)
 static const struct file_operations spufs_cntl_fops = {
 	.open = spufs_cntl_open,
 	.release = spufs_cntl_release,
-	.read = simple_attr_read,
-	.write = simple_attr_write,
+	.read = spufs_attr_read,
+	.write = spufs_attr_write,
 	.mmap = spufs_cntl_mmap,
 };
 
@@ -368,7 +505,9 @@ spufs_regs_read(struct file *file, char __user *buffer,
 	int ret;
 	struct spu_context *ctx = file->private_data;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	ret = __spufs_regs_read(ctx, buffer, size, pos);
 	spu_release_saved(ctx);
 	return ret;
@@ -387,7 +526,9 @@ spufs_regs_write(struct file *file, const char __user *buffer,
 		return -EFBIG;
 	*pos += size;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 
 	ret = copy_from_user(lscsa->gprs + *pos - size,
 			     buffer, size) ? -EFAULT : size;
@@ -419,7 +560,9 @@ spufs_fpcr_read(struct file *file, char __user * buffer,
 	int ret;
 	struct spu_context *ctx = file->private_data;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	ret = __spufs_fpcr_read(ctx, buffer, size, pos);
 	spu_release_saved(ctx);
 	return ret;
@@ -436,10 +579,12 @@ spufs_fpcr_write(struct file *file, const char __user * buffer,
 	size = min_t(ssize_t, sizeof(lscsa->fpcr) - *pos, size);
 	if (size <= 0)
 		return -EFBIG;
-	*pos += size;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 
+	*pos += size;
 	ret = copy_from_user((char *)&lscsa->fpcr + *pos - size,
 			     buffer, size) ? -EFAULT : size;
 
@@ -486,7 +631,10 @@ static ssize_t spufs_mbox_read(struct file *file, char __user *buf,
 
 	udata = (void __user *)buf;
 
-	spu_acquire(ctx);
+	count = spu_acquire(ctx);
+	if (count)
+		return count;
+
 	for (count = 0; (count + 4) <= len; count += 4, udata++) {
 		int ret;
 		ret = ctx->ops->mbox_read(ctx, &mbox_data);
@@ -522,12 +670,15 @@ static ssize_t spufs_mbox_stat_read(struct file *file, char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx = file->private_data;
+	ssize_t ret;
 	u32 mbox_stat;
 
 	if (len < 4)
 		return -EINVAL;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 
 	mbox_stat = ctx->ops->mbox_stat_read(ctx) & 0xff;
 
@@ -562,6 +713,9 @@ void spufs_ibox_callback(struct spu *spu)
 {
 	struct spu_context *ctx = spu->ctx;
 
+	if (!ctx)
+		return;
+
 	wake_up_all(&ctx->ibox_wq);
 	kill_fasync(&ctx->ibox_fasync, SIGIO, POLLIN);
 }
@@ -593,7 +747,9 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf,
 
 	udata = (void __user *)buf;
 
-	spu_acquire(ctx);
+	count = spu_acquire(ctx);
+	if (count)
+		return count;
 
 	/* wait only for the first element */
 	count = 0;
@@ -639,7 +795,11 @@ static unsigned int spufs_ibox_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &ctx->ibox_wq, wait);
 
-	spu_acquire(ctx);
+	/*
+	 * For now keep this uninterruptible and also ignore the rule
+	 * that poll should not sleep.  Will be fixed later.
+	 */
+	mutex_lock(&ctx->state_mutex);
 	mask = ctx->ops->mbox_stat_poll(ctx, POLLIN | POLLRDNORM);
 	spu_release(ctx);
 
@@ -657,12 +817,15 @@ static ssize_t spufs_ibox_stat_read(struct file *file, char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx = file->private_data;
+	ssize_t ret;
 	u32 ibox_stat;
 
 	if (len < 4)
 		return -EINVAL;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ibox_stat = (ctx->ops->mbox_stat_read(ctx) >> 16) & 0xff;
 	spu_release(ctx);
 
@@ -698,6 +861,9 @@ void spufs_wbox_callback(struct spu *spu)
 {
 	struct spu_context *ctx = spu->ctx;
 
+	if (!ctx)
+		return;
+
 	wake_up_all(&ctx->wbox_wq);
 	kill_fasync(&ctx->wbox_fasync, SIGIO, POLLOUT);
 }
@@ -731,7 +897,9 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf,
 	if (__get_user(wbox_data, udata))
 		return -EFAULT;
 
-	spu_acquire(ctx);
+	count = spu_acquire(ctx);
+	if (count)
+		return count;
 
 	/*
 	 * make sure we can at least write one element, by waiting
@@ -772,7 +940,11 @@ static unsigned int spufs_wbox_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &ctx->wbox_wq, wait);
 
-	spu_acquire(ctx);
+	/*
+	 * For now keep this uninterruptible and also ignore the rule
+	 * that poll should not sleep.  Will be fixed later.
+	 */
+	mutex_lock(&ctx->state_mutex);
 	mask = ctx->ops->mbox_stat_poll(ctx, POLLOUT | POLLWRNORM);
 	spu_release(ctx);
 
@@ -790,12 +962,15 @@ static ssize_t spufs_wbox_stat_read(struct file *file, char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx = file->private_data;
+	ssize_t ret;
 	u32 wbox_stat;
 
 	if (len < 4)
 		return -EINVAL;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	wbox_stat = (ctx->ops->mbox_stat_read(ctx) >> 8) & 0xff;
 	spu_release(ctx);
 
@@ -866,7 +1041,9 @@ static ssize_t spufs_signal1_read(struct file *file, char __user *buf,
 	int ret;
 	struct spu_context *ctx = file->private_data;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	ret = __spufs_signal1_read(ctx, buf, len, pos);
 	spu_release_saved(ctx);
 
@@ -877,6 +1054,7 @@ static ssize_t spufs_signal1_write(struct file *file, const char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx;
+	ssize_t ret;
 	u32 data;
 
 	ctx = file->private_data;
@@ -887,7 +1065,9 @@ static ssize_t spufs_signal1_write(struct file *file, const char __user *buf,
 	if (copy_from_user(&data, buf, 4))
 		return -EFAULT;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->signal1_write(ctx, data);
 	spu_release(ctx);
 
@@ -997,7 +1177,9 @@ static ssize_t spufs_signal2_read(struct file *file, char __user *buf,
 	struct spu_context *ctx = file->private_data;
 	int ret;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	ret = __spufs_signal2_read(ctx, buf, len, pos);
 	spu_release_saved(ctx);
 
@@ -1008,6 +1190,7 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx;
+	ssize_t ret;
 	u32 data;
 
 	ctx = file->private_data;
@@ -1018,7 +1201,9 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf,
 	if (copy_from_user(&data, buf, 4))
 		return -EFAULT;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->signal2_write(ctx, data);
 	spu_release(ctx);
 
@@ -1086,33 +1271,42 @@ static const struct file_operations spufs_signal2_nosched_fops = {
 #define SPU_ATTR_ACQUIRE_SAVED	2
 
 #define DEFINE_SPUFS_ATTRIBUTE(__name, __get, __set, __fmt, __acquire)	\
-static u64 __##__get(void *data)					\
+static int __##__get(void *data, u64 *val)				\
 {									\
 	struct spu_context *ctx = data;					\
-	u64 ret;							\
+	int ret = 0;							\
 									\
 	if (__acquire == SPU_ATTR_ACQUIRE) {				\
-		spu_acquire(ctx);					\
-		ret = __get(ctx);					\
+		ret = spu_acquire(ctx);					\
+		if (ret)						\
+			return ret;					\
+		*val = __get(ctx);					\
 		spu_release(ctx);					\
 	} else if (__acquire == SPU_ATTR_ACQUIRE_SAVED)	{		\
-		spu_acquire_saved(ctx);					\
-		ret = __get(ctx);					\
+		ret = spu_acquire_saved(ctx);				\
+		if (ret)						\
+			return ret;					\
+		*val = __get(ctx);					\
 		spu_release_saved(ctx);					\
 	} else								\
-		ret = __get(ctx);					\
+		*val = __get(ctx);					\
 									\
-	return ret;							\
+	return 0;							\
 }									\
-DEFINE_SIMPLE_ATTRIBUTE(__name, __##__get, __set, __fmt);
+DEFINE_SPUFS_SIMPLE_ATTRIBUTE(__name, __##__get, __set, __fmt);
 
-static void spufs_signal1_type_set(void *data, u64 val)
+static int spufs_signal1_type_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
+	int ret;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->signal1_type_set(ctx, val);
 	spu_release(ctx);
+
+	return 0;
 }
 
 static u64 spufs_signal1_type_get(struct spu_context *ctx)
@@ -1123,13 +1317,18 @@ DEFINE_SPUFS_ATTRIBUTE(spufs_signal1_type, spufs_signal1_type_get,
 		       spufs_signal1_type_set, "%llu", SPU_ATTR_ACQUIRE);
 
 
-static void spufs_signal2_type_set(void *data, u64 val)
+static int spufs_signal2_type_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
+	int ret;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->signal2_type_set(ctx, val);
 	spu_release(ctx);
+
+	return 0;
 }
 
 static u64 spufs_signal2_type_get(struct spu_context *ctx)
@@ -1329,6 +1528,9 @@ void spufs_mfc_callback(struct spu *spu)
 {
 	struct spu_context *ctx = spu->ctx;
 
+	if (!ctx)
+		return;
+
 	wake_up_all(&ctx->mfc_wq);
 
 	pr_debug("%s %s\n", __FUNCTION__, spu->name);
@@ -1375,12 +1577,17 @@ static ssize_t spufs_mfc_read(struct file *file, char __user *buffer,
 	if (size != 4)
 		goto out;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
 	if (file->f_flags & O_NONBLOCK) {
 		status = ctx->ops->read_mfc_tagstatus(ctx);
 		if (!(status & ctx->tagwait))
 			ret = -EAGAIN;
 		else
+			/* XXX(hch): shouldn't we clear ret here? */
 			ctx->tagwait &= ~status;
 	} else {
 		ret = spufs_wait(ctx->mfc_wq,
@@ -1505,7 +1712,11 @@ static ssize_t spufs_mfc_write(struct file *file, const char __user *buffer,
 	if (ret)
 		goto out;
 
-	ret = spu_acquire_runnable(ctx, 0);
+	ret = spu_acquire(ctx);
+	if (ret)
+		goto out;
+
+	ret = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE);
 	if (ret)
 		goto out;
 
@@ -1539,7 +1750,11 @@ static unsigned int spufs_mfc_poll(struct file *file,poll_table *wait)
 
 	poll_wait(file, &ctx->mfc_wq, wait);
 
-	spu_acquire(ctx);
+	/*
+	 * For now keep this uninterruptible and also ignore the rule
+	 * that poll should not sleep.  Will be fixed later.
+	 */
+	mutex_lock(&ctx->state_mutex);
 	ctx->ops->set_mfc_query(ctx, ctx->tagwait, 2);
 	free_elements = ctx->ops->get_mfc_free_elements(ctx);
 	tagstatus = ctx->ops->read_mfc_tagstatus(ctx);
@@ -1562,7 +1777,9 @@ static int spufs_mfc_flush(struct file *file, fl_owner_t id)
 	struct spu_context *ctx = file->private_data;
 	int ret;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 #if 0
 /* this currently hangs */
 	ret = spufs_wait(ctx->mfc_wq,
@@ -1605,12 +1822,18 @@ static const struct file_operations spufs_mfc_fops = {
 	.mmap	 = spufs_mfc_mmap,
 };
 
-static void spufs_npc_set(void *data, u64 val)
+static int spufs_npc_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
-	spu_acquire(ctx);
+	int ret;
+
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->npc_write(ctx, val);
 	spu_release(ctx);
+
+	return 0;
 }
 
 static u64 spufs_npc_get(struct spu_context *ctx)
@@ -1620,13 +1843,19 @@ static u64 spufs_npc_get(struct spu_context *ctx)
 DEFINE_SPUFS_ATTRIBUTE(spufs_npc_ops, spufs_npc_get, spufs_npc_set,
 		       "0x%llx\n", SPU_ATTR_ACQUIRE);
 
-static void spufs_decr_set(void *data, u64 val)
+static int spufs_decr_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
-	spu_acquire_saved(ctx);
+	int ret;
+
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	lscsa->decr.slot[0] = (u32) val;
 	spu_release_saved(ctx);
+
+	return 0;
 }
 
 static u64 spufs_decr_get(struct spu_context *ctx)
@@ -1637,15 +1866,21 @@ static u64 spufs_decr_get(struct spu_context *ctx)
 DEFINE_SPUFS_ATTRIBUTE(spufs_decr_ops, spufs_decr_get, spufs_decr_set,
 		       "0x%llx\n", SPU_ATTR_ACQUIRE_SAVED);
 
-static void spufs_decr_status_set(void *data, u64 val)
+static int spufs_decr_status_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
-	spu_acquire_saved(ctx);
+	int ret;
+
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	if (val)
 		ctx->csa.priv2.mfc_control_RW |= MFC_CNTL_DECREMENTER_RUNNING;
 	else
 		ctx->csa.priv2.mfc_control_RW &= ~MFC_CNTL_DECREMENTER_RUNNING;
 	spu_release_saved(ctx);
+
+	return 0;
 }
 
 static u64 spufs_decr_status_get(struct spu_context *ctx)
@@ -1659,13 +1894,19 @@ DEFINE_SPUFS_ATTRIBUTE(spufs_decr_status_ops, spufs_decr_status_get,
 		       spufs_decr_status_set, "0x%llx\n",
 		       SPU_ATTR_ACQUIRE_SAVED);
 
-static void spufs_event_mask_set(void *data, u64 val)
+static int spufs_event_mask_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
-	spu_acquire_saved(ctx);
+	int ret;
+
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	lscsa->event_mask.slot[0] = (u32) val;
 	spu_release_saved(ctx);
+
+	return 0;
 }
 
 static u64 spufs_event_mask_get(struct spu_context *ctx)
@@ -1690,13 +1931,19 @@ static u64 spufs_event_status_get(struct spu_context *ctx)
 DEFINE_SPUFS_ATTRIBUTE(spufs_event_status_ops, spufs_event_status_get,
 		       NULL, "0x%llx\n", SPU_ATTR_ACQUIRE_SAVED)
 
-static void spufs_srr0_set(void *data, u64 val)
+static int spufs_srr0_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
-	spu_acquire_saved(ctx);
+	int ret;
+
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	lscsa->srr0.slot[0] = (u32) val;
 	spu_release_saved(ctx);
+
+	return 0;
 }
 
 static u64 spufs_srr0_get(struct spu_context *ctx)
@@ -1727,10 +1974,12 @@ static u64 spufs_object_id_get(struct spu_context *ctx)
 	return ctx->object_id;
 }
 
-static void spufs_object_id_set(void *data, u64 id)
+static int spufs_object_id_set(void *data, u64 id)
 {
 	struct spu_context *ctx = data;
 	ctx->object_id = id;
+
+	return 0;
 }
 
 DEFINE_SPUFS_ATTRIBUTE(spufs_object_id_ops, spufs_object_id_get,
@@ -1777,13 +2026,13 @@ static const struct file_operations spufs_caps_fops = {
 static ssize_t __spufs_mbox_info_read(struct spu_context *ctx,
 			char __user *buf, size_t len, loff_t *pos)
 {
-	u32 mbox_stat;
 	u32 data;
 
-	mbox_stat = ctx->csa.prob.mb_stat_R;
-	if (mbox_stat & 0x0000ff) {
-		data = ctx->csa.prob.pu_mb_R;
-	}
+	/* EOF if there's no entry in the mbox */
+	if (!(ctx->csa.prob.mb_stat_R & 0x0000ff))
+		return 0;
+
+	data = ctx->csa.prob.pu_mb_R;
 
 	return simple_read_from_buffer(buf, len, pos, &data, sizeof data);
 }
@@ -1797,7 +2046,9 @@ static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
 	if (!access_ok(VERIFY_WRITE, buf, len))
 		return -EFAULT;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_mbox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
@@ -1815,13 +2066,13 @@ static const struct file_operations spufs_mbox_info_fops = {
 static ssize_t __spufs_ibox_info_read(struct spu_context *ctx,
 				char __user *buf, size_t len, loff_t *pos)
 {
-	u32 ibox_stat;
 	u32 data;
 
-	ibox_stat = ctx->csa.prob.mb_stat_R;
-	if (ibox_stat & 0xff0000) {
-		data = ctx->csa.priv2.puint_mb_R;
-	}
+	/* EOF if there's no entry in the ibox */
+	if (!(ctx->csa.prob.mb_stat_R & 0xff0000))
+		return 0;
+
+	data = ctx->csa.priv2.puint_mb_R;
 
 	return simple_read_from_buffer(buf, len, pos, &data, sizeof data);
 }
@@ -1835,7 +2086,9 @@ static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf,
 	if (!access_ok(VERIFY_WRITE, buf, len))
 		return -EFAULT;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_ibox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
@@ -1876,7 +2129,9 @@ static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf,
 	if (!access_ok(VERIFY_WRITE, buf, len))
 		return -EFAULT;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_wbox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
@@ -1926,7 +2181,9 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf,
 	if (!access_ok(VERIFY_WRITE, buf, len))
 		return -EFAULT;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_dma_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
@@ -1977,7 +2234,9 @@ static ssize_t spufs_proxydma_info_read(struct file *file, char __user *buf,
 	struct spu_context *ctx = file->private_data;
 	int ret;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_proxydma_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
@@ -2066,8 +2325,12 @@ static unsigned long long spufs_class2_intrs(struct spu_context *ctx)
 static int spufs_show_stat(struct seq_file *s, void *private)
 {
 	struct spu_context *ctx = s->private;
+	int ret;
+
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 
-	spu_acquire(ctx);
 	seq_printf(s, "%s %llu %llu %llu %llu "
 		      "%llu %llu %llu %llu %llu %llu %llu %llu\n",
 		ctx_state_names[ctx->stats.util_state],
diff --git a/arch/powerpc/platforms/cell/spufs/hw_ops.c b/arch/powerpc/platforms/cell/spufs/hw_ops.c
index fc4ed1ffbd4..64f8540b832 100644
--- a/arch/powerpc/platforms/cell/spufs/hw_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/hw_ops.c
@@ -76,16 +76,18 @@ static unsigned int spu_hw_mbox_stat_poll(struct spu_context *ctx,
 		if (stat & 0xff0000)
 			ret |= POLLIN | POLLRDNORM;
 		else {
-			spu_int_stat_clear(spu, 2, 0x1);
-			spu_int_mask_or(spu, 2, 0x1);
+			spu_int_stat_clear(spu, 2, CLASS2_MAILBOX_INTR);
+			spu_int_mask_or(spu, 2, CLASS2_ENABLE_MAILBOX_INTR);
 		}
 	}
 	if (events & (POLLOUT | POLLWRNORM)) {
 		if (stat & 0x00ff00)
 			ret = POLLOUT | POLLWRNORM;
 		else {
-			spu_int_stat_clear(spu, 2, 0x10);
-			spu_int_mask_or(spu, 2, 0x10);
+			spu_int_stat_clear(spu, 2,
+					CLASS2_MAILBOX_THRESHOLD_INTR);
+			spu_int_mask_or(spu, 2,
+					CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR);
 		}
 	}
 	spin_unlock_irq(&spu->register_lock);
@@ -106,7 +108,7 @@ static int spu_hw_ibox_read(struct spu_context *ctx, u32 * data)
 		ret = 4;
 	} else {
 		/* make sure we get woken up by the interrupt */
-		spu_int_mask_or(spu, 2, 0x1);
+		spu_int_mask_or(spu, 2, CLASS2_ENABLE_MAILBOX_INTR);
 		ret = 0;
 	}
 	spin_unlock_irq(&spu->register_lock);
@@ -127,7 +129,7 @@ static int spu_hw_wbox_write(struct spu_context *ctx, u32 data)
 	} else {
 		/* make sure we get woken up by the interrupt when space
 		   becomes available */
-		spu_int_mask_or(spu, 2, 0x10);
+		spu_int_mask_or(spu, 2, CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR);
 		ret = 0;
 	}
 	spin_unlock_irq(&spu->register_lock);
@@ -206,6 +208,11 @@ static char *spu_hw_get_ls(struct spu_context *ctx)
 	return ctx->spu->local_store;
 }
 
+static void spu_hw_privcntl_write(struct spu_context *ctx, u64 val)
+{
+	out_be64(&ctx->spu->priv2->spu_privcntl_RW, val);
+}
+
 static u32 spu_hw_runcntl_read(struct spu_context *ctx)
 {
 	return in_be32(&ctx->spu->problem->spu_runcntl_RW);
@@ -215,11 +222,21 @@ static void spu_hw_runcntl_write(struct spu_context *ctx, u32 val)
 {
 	spin_lock_irq(&ctx->spu->register_lock);
 	if (val & SPU_RUNCNTL_ISOLATE)
-		out_be64(&ctx->spu->priv2->spu_privcntl_RW, 4LL);
+		spu_hw_privcntl_write(ctx,
+			SPU_PRIVCNT_LOAD_REQUEST_ENABLE_MASK);
 	out_be32(&ctx->spu->problem->spu_runcntl_RW, val);
 	spin_unlock_irq(&ctx->spu->register_lock);
 }
 
+static void spu_hw_runcntl_stop(struct spu_context *ctx)
+{
+	spin_lock_irq(&ctx->spu->register_lock);
+	out_be32(&ctx->spu->problem->spu_runcntl_RW, SPU_RUNCNTL_STOP);
+	while (in_be32(&ctx->spu->problem->spu_status_R) & SPU_STATUS_RUNNING)
+		cpu_relax();
+	spin_unlock_irq(&ctx->spu->register_lock);
+}
+
 static void spu_hw_master_start(struct spu_context *ctx)
 {
 	struct spu *spu = ctx->spu;
@@ -319,8 +336,10 @@ struct spu_context_ops spu_hw_ops = {
 	.npc_write = spu_hw_npc_write,
 	.status_read = spu_hw_status_read,
 	.get_ls = spu_hw_get_ls,
+	.privcntl_write = spu_hw_privcntl_write,
 	.runcntl_read = spu_hw_runcntl_read,
 	.runcntl_write = spu_hw_runcntl_write,
+	.runcntl_stop = spu_hw_runcntl_stop,
 	.master_start = spu_hw_master_start,
 	.master_stop = spu_hw_master_stop,
 	.set_mfc_query = spu_hw_set_mfc_query,
diff --git a/arch/powerpc/platforms/cell/spufs/lscsa_alloc.c b/arch/powerpc/platforms/cell/spufs/lscsa_alloc.c
index f4b3c052dab..0e9f325c9ff 100644
--- a/arch/powerpc/platforms/cell/spufs/lscsa_alloc.c
+++ b/arch/powerpc/platforms/cell/spufs/lscsa_alloc.c
@@ -28,6 +28,8 @@
 #include <asm/spu_csa.h>
 #include <asm/mmu.h>
 
+#include "spufs.h"
+
 static int spu_alloc_lscsa_std(struct spu_state *csa)
 {
 	struct spu_lscsa *lscsa;
@@ -73,7 +75,7 @@ int spu_alloc_lscsa(struct spu_state *csa)
 	int		i, j, n_4k;
 
 	/* Check availability of 64K pages */
-	if (mmu_psize_defs[MMU_PAGE_64K].shift == 0)
+	if (!spu_64k_pages_available())
 		goto fail;
 
 	csa->use_big_pages = 1;
diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index 1ce5e22ea5f..c01a09da1e5 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -15,24 +15,55 @@ void spufs_stop_callback(struct spu *spu)
 {
 	struct spu_context *ctx = spu->ctx;
 
-	wake_up_all(&ctx->stop_wq);
+	/*
+	 * It should be impossible to preempt a context while an exception
+	 * is being processed, since the context switch code is specially
+	 * coded to deal with interrupts ... But, just in case, sanity check
+	 * the context pointer.  It is OK to return doing nothing since
+	 * the exception will be regenerated when the context is resumed.
+	 */
+	if (ctx) {
+		/* Copy exception arguments into module specific structure */
+		ctx->csa.class_0_pending = spu->class_0_pending;
+		ctx->csa.dsisr = spu->dsisr;
+		ctx->csa.dar = spu->dar;
+
+		/* ensure that the exception status has hit memory before a
+		 * thread waiting on the context's stop queue is woken */
+		smp_wmb();
+
+		wake_up_all(&ctx->stop_wq);
+	}
+
+	/* Clear callback arguments from spu structure */
+	spu->class_0_pending = 0;
+	spu->dsisr = 0;
+	spu->dar = 0;
 }
 
-static inline int spu_stopped(struct spu_context *ctx, u32 *stat)
+int spu_stopped(struct spu_context *ctx, u32 *stat)
 {
-	struct spu *spu;
-	u64 pte_fault;
+	u64 dsisr;
+	u32 stopped;
 
 	*stat = ctx->ops->status_read(ctx);
 
-	spu = ctx->spu;
-	if (ctx->state != SPU_STATE_RUNNABLE ||
-	    test_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags))
+	if (test_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags))
 		return 1;
-	pte_fault = spu->dsisr &
-	    (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED);
-	return (!(*stat & SPU_STATUS_RUNNING) || pte_fault || spu->class_0_pending) ?
-		1 : 0;
+
+	stopped = SPU_STATUS_INVALID_INSTR | SPU_STATUS_SINGLE_STEP |
+		SPU_STATUS_STOPPED_BY_HALT | SPU_STATUS_STOPPED_BY_STOP;
+	if (*stat & stopped)
+		return 1;
+
+	dsisr = ctx->csa.dsisr;
+	if (dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED))
+		return 1;
+
+	if (ctx->csa.class_0_pending)
+		return 1;
+
+	return 0;
 }
 
 static int spu_setup_isolated(struct spu_context *ctx)
@@ -128,34 +159,66 @@ out:
 
 static int spu_run_init(struct spu_context *ctx, u32 *npc)
 {
+	unsigned long runcntl = SPU_RUNCNTL_RUNNABLE;
+	int ret;
+
 	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 
-	if (ctx->flags & SPU_CREATE_ISOLATE) {
-		unsigned long runcntl;
+	/*
+	 * NOSCHED is synchronous scheduling with respect to the caller.
+	 * The caller waits for the context to be loaded.
+	 */
+	if (ctx->flags & SPU_CREATE_NOSCHED) {
+		if (ctx->state == SPU_STATE_SAVED) {
+			ret = spu_activate(ctx, 0);
+			if (ret)
+				return ret;
+		}
+	}
 
+	/*
+	 * Apply special setup as required.
+	 */
+	if (ctx->flags & SPU_CREATE_ISOLATE) {
 		if (!(ctx->ops->status_read(ctx) & SPU_STATUS_ISOLATED_STATE)) {
-			int ret = spu_setup_isolated(ctx);
+			ret = spu_setup_isolated(ctx);
 			if (ret)
 				return ret;
 		}
 
-		/* if userspace has set the runcntrl register (eg, to issue an
-		 * isolated exit), we need to re-set it here */
+		/*
+		 * If userspace has set the runcntrl register (eg, to
+		 * issue an isolated exit), we need to re-set it here
+		 */
 		runcntl = ctx->ops->runcntl_read(ctx) &
 			(SPU_RUNCNTL_RUNNABLE | SPU_RUNCNTL_ISOLATE);
 		if (runcntl == 0)
 			runcntl = SPU_RUNCNTL_RUNNABLE;
+	}
+
+	if (ctx->flags & SPU_CREATE_NOSCHED) {
+		spuctx_switch_state(ctx, SPU_UTIL_USER);
 		ctx->ops->runcntl_write(ctx, runcntl);
 	} else {
-		unsigned long mode = SPU_PRIVCNTL_MODE_NORMAL;
-		ctx->ops->npc_write(ctx, *npc);
+		unsigned long privcntl;
+
 		if (test_thread_flag(TIF_SINGLESTEP))
-			mode = SPU_PRIVCNTL_MODE_SINGLE_STEP;
-		out_be64(&ctx->spu->priv2->spu_privcntl_RW, mode);
-		ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE);
-	}
+			privcntl = SPU_PRIVCNTL_MODE_SINGLE_STEP;
+		else
+			privcntl = SPU_PRIVCNTL_MODE_NORMAL;
 
-	spuctx_switch_state(ctx, SPU_UTIL_USER);
+		ctx->ops->npc_write(ctx, *npc);
+		ctx->ops->privcntl_write(ctx, privcntl);
+		ctx->ops->runcntl_write(ctx, runcntl);
+
+		if (ctx->state == SPU_STATE_SAVED) {
+			ret = spu_activate(ctx, 0);
+			if (ret)
+				return ret;
+		} else {
+			spuctx_switch_state(ctx, SPU_UTIL_USER);
+		}
+	}
 
 	return 0;
 }
@@ -165,6 +228,8 @@ static int spu_run_fini(struct spu_context *ctx, u32 *npc,
 {
 	int ret = 0;
 
+	spu_del_from_rq(ctx);
+
 	*status = ctx->ops->status_read(ctx);
 	*npc = ctx->ops->npc_read(ctx);
 
@@ -177,26 +242,6 @@ static int spu_run_fini(struct spu_context *ctx, u32 *npc,
 	return ret;
 }
 
-static int spu_reacquire_runnable(struct spu_context *ctx, u32 *npc,
-				         u32 *status)
-{
-	int ret;
-
-	ret = spu_run_fini(ctx, npc, status);
-	if (ret)
-		return ret;
-
-	if (*status & (SPU_STATUS_STOPPED_BY_STOP | SPU_STATUS_STOPPED_BY_HALT))
-		return *status;
-
-	ret = spu_acquire_runnable(ctx, 0);
-	if (ret)
-		return ret;
-
-	spuctx_switch_state(ctx, SPU_UTIL_USER);
-	return 0;
-}
-
 /*
  * SPU syscall restarting is tricky because we violate the basic
  * assumption that the signal handler is running on the interrupted
@@ -247,7 +292,7 @@ static int spu_process_callback(struct spu_context *ctx)
 	u32 ls_pointer, npc;
 	void __iomem *ls;
 	long spu_ret;
-	int ret;
+	int ret, ret2;
 
 	/* get syscall block from local store */
 	npc = ctx->ops->npc_read(ctx) & ~3;
@@ -269,9 +314,11 @@ static int spu_process_callback(struct spu_context *ctx)
 		if (spu_ret <= -ERESTARTSYS) {
 			ret = spu_handle_restartsys(ctx, &spu_ret, &npc);
 		}
-		spu_acquire(ctx);
+		ret2 = spu_acquire(ctx);
 		if (ret == -ERESTARTSYS)
 			return ret;
+		if (ret2)
+			return -EINTR;
 	}
 
 	/* write result, jump over indirect pointer */
@@ -281,18 +328,6 @@ static int spu_process_callback(struct spu_context *ctx)
 	return ret;
 }
 
-static inline int spu_process_events(struct spu_context *ctx)
-{
-	struct spu *spu = ctx->spu;
-	int ret = 0;
-
-	if (spu->class_0_pending)
-		ret = spu_irq_class_0_bottom(spu);
-	if (!ret && signal_pending(current))
-		ret = -ERESTARTSYS;
-	return ret;
-}
-
 long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 {
 	int ret;
@@ -302,29 +337,14 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 	if (mutex_lock_interruptible(&ctx->run_mutex))
 		return -ERESTARTSYS;
 
-	ctx->ops->master_start(ctx);
+	spu_enable_spu(ctx);
 	ctx->event_return = 0;
 
-	spu_acquire(ctx);
-	if (ctx->state == SPU_STATE_SAVED) {
-		__spu_update_sched_info(ctx);
-		spu_set_timeslice(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		goto out_unlock;
 
-		ret = spu_activate(ctx, 0);
-		if (ret) {
-			spu_release(ctx);
-			goto out;
-		}
-	} else {
-		/*
-		 * We have to update the scheduling priority under active_mutex
-		 * to protect against find_victim().
-		 *
-		 * No need to update the timeslice ASAP, it will get updated
-		 * once the current one has expired.
-		 */
-		spu_update_sched_info(ctx);
-	}
+	spu_update_sched_info(ctx);
 
 	ret = spu_run_init(ctx, npc);
 	if (ret) {
@@ -358,14 +378,12 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 		if (ret)
 			break;
 
-		if (unlikely(ctx->state != SPU_STATE_RUNNABLE)) {
-			ret = spu_reacquire_runnable(ctx, npc, &status);
-			if (ret)
-				goto out2;
-			continue;
-		}
-		ret = spu_process_events(ctx);
+		ret = spufs_handle_class0(ctx);
+		if (ret)
+			break;
 
+		if (signal_pending(current))
+			ret = -ERESTARTSYS;
 	} while (!ret && !(status & (SPU_STATUS_STOPPED_BY_STOP |
 				      SPU_STATUS_STOPPED_BY_HALT |
 				       SPU_STATUS_SINGLE_STEP)));
@@ -376,11 +394,10 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 		ctx->stats.libassist++;
 
 
-	ctx->ops->master_stop(ctx);
+	spu_disable_spu(ctx);
 	ret = spu_run_fini(ctx, npc, &status);
 	spu_yield(ctx);
 
-out2:
 	if ((ret == 0) ||
 	    ((ret == -ERESTARTSYS) &&
 	     ((status & SPU_STATUS_STOPPED_BY_HALT) ||
@@ -401,6 +418,7 @@ out2:
 
 out:
 	*event = ctx->event_return;
+out_unlock:
 	mutex_unlock(&ctx->run_mutex);
 	return ret;
 }
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 9ad53e637ae..00d914232af 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -58,6 +58,7 @@ static unsigned long spu_avenrun[3];
 static struct spu_prio_array *spu_prio;
 static struct task_struct *spusched_task;
 static struct timer_list spusched_timer;
+static struct timer_list spuloadavg_timer;
 
 /*
  * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
@@ -105,15 +106,21 @@ void spu_set_timeslice(struct spu_context *ctx)
 void __spu_update_sched_info(struct spu_context *ctx)
 {
 	/*
-	 * 32-Bit assignment are atomic on powerpc, and we don't care about
-	 * memory ordering here because retriving the controlling thread is
-	 * per defintion racy.
+	 * assert that the context is not on the runqueue, so it is safe
+	 * to change its scheduling parameters.
+	 */
+	BUG_ON(!list_empty(&ctx->rq));
+
+	/*
+	 * 32-Bit assignments are atomic on powerpc, and we don't care about
+	 * memory ordering here because retrieving the controlling thread is
+	 * per definition racy.
 	 */
 	ctx->tid = current->pid;
 
 	/*
 	 * We do our own priority calculations, so we normally want
-	 * ->static_prio to start with. Unfortunately thies field
+	 * ->static_prio to start with. Unfortunately this field
 	 * contains junk for threads with a realtime scheduling
 	 * policy so we have to look at ->prio in this case.
 	 */
@@ -124,23 +131,32 @@ void __spu_update_sched_info(struct spu_context *ctx)
 	ctx->policy = current->policy;
 
 	/*
-	 * A lot of places that don't hold list_mutex poke into
-	 * cpus_allowed, including grab_runnable_context which
-	 * already holds the runq_lock.  So abuse runq_lock
-	 * to protect this field aswell.
+	 * TO DO: the context may be loaded, so we may need to activate
+	 * it again on a different node. But it shouldn't hurt anything
+	 * to update its parameters, because we know that the scheduler
+	 * is not actively looking at this field, since it is not on the
+	 * runqueue. The context will be rescheduled on the proper node
+	 * if it is timesliced or preempted.
 	 */
-	spin_lock(&spu_prio->runq_lock);
 	ctx->cpus_allowed = current->cpus_allowed;
-	spin_unlock(&spu_prio->runq_lock);
 }
 
 void spu_update_sched_info(struct spu_context *ctx)
 {
-	int node = ctx->spu->node;
+	int node;
 
-	mutex_lock(&cbe_spu_info[node].list_mutex);
-	__spu_update_sched_info(ctx);
-	mutex_unlock(&cbe_spu_info[node].list_mutex);
+	if (ctx->state == SPU_STATE_RUNNABLE) {
+		node = ctx->spu->node;
+
+		/*
+		 * Take list_mutex to sync with find_victim().
+		 */
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		__spu_update_sched_info(ctx);
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
+	} else {
+		__spu_update_sched_info(ctx);
+	}
 }
 
 static int __node_allowed(struct spu_context *ctx, int node)
@@ -174,7 +190,7 @@ void do_notify_spus_active(void)
 	 * Wake up the active spu_contexts.
 	 *
 	 * When the awakened processes see their "notify_active" flag is set,
-	 * they will call spu_switch_notify();
+	 * they will call spu_switch_notify().
 	 */
 	for_each_online_node(node) {
 		struct spu *spu;
@@ -221,7 +237,6 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 	spu->wbox_callback = spufs_wbox_callback;
 	spu->stop_callback = spufs_stop_callback;
 	spu->mfc_callback = spufs_mfc_callback;
-	spu->dma_callback = spufs_dma_callback;
 	mb();
 	spu_unmap_mappings(ctx);
 	spu_restore(&ctx->csa, spu);
@@ -409,7 +424,6 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 	spu->wbox_callback = NULL;
 	spu->stop_callback = NULL;
 	spu->mfc_callback = NULL;
-	spu->dma_callback = NULL;
 	spu_associate_mm(spu, NULL);
 	spu->pid = 0;
 	spu->tgid = 0;
@@ -454,6 +468,13 @@ static void __spu_add_to_rq(struct spu_context *ctx)
 	}
 }
 
+static void spu_add_to_rq(struct spu_context *ctx)
+{
+	spin_lock(&spu_prio->runq_lock);
+	__spu_add_to_rq(ctx);
+	spin_unlock(&spu_prio->runq_lock);
+}
+
 static void __spu_del_from_rq(struct spu_context *ctx)
 {
 	int prio = ctx->prio;
@@ -468,10 +489,24 @@ static void __spu_del_from_rq(struct spu_context *ctx)
 	}
 }
 
+void spu_del_from_rq(struct spu_context *ctx)
+{
+	spin_lock(&spu_prio->runq_lock);
+	__spu_del_from_rq(ctx);
+	spin_unlock(&spu_prio->runq_lock);
+}
+
 static void spu_prio_wait(struct spu_context *ctx)
 {
 	DEFINE_WAIT(wait);
 
+	/*
+	 * The caller must explicitly wait for a context to be loaded
+	 * if the nosched flag is set.  If NOSCHED is not set, the caller
+	 * queues the context and waits for an spu event or error.
+	 */
+	BUG_ON(!(ctx->flags & SPU_CREATE_NOSCHED));
+
 	spin_lock(&spu_prio->runq_lock);
 	prepare_to_wait_exclusive(&ctx->stop_wq, &wait, TASK_INTERRUPTIBLE);
 	if (!signal_pending(current)) {
@@ -555,7 +590,7 @@ static struct spu *find_victim(struct spu_context *ctx)
 	/*
 	 * Look for a possible preemption candidate on the local node first.
 	 * If there is no candidate look at the other nodes.  This isn't
-	 * exactly fair, but so far the whole spu schedule tries to keep
+	 * exactly fair, but so far the whole spu scheduler tries to keep
 	 * a strong node affinity.  We might want to fine-tune this in
 	 * the future.
 	 */
@@ -571,6 +606,7 @@ static struct spu *find_victim(struct spu_context *ctx)
 			struct spu_context *tmp = spu->ctx;
 
 			if (tmp && tmp->prio > ctx->prio &&
+			    !(tmp->flags & SPU_CREATE_NOSCHED) &&
 			    (!victim || tmp->prio > victim->prio))
 				victim = spu->ctx;
 		}
@@ -582,6 +618,10 @@ static struct spu *find_victim(struct spu_context *ctx)
 			 * higher priority contexts before lower priority
 			 * ones, so this is safe until we introduce
 			 * priority inheritance schemes.
+			 *
+			 * XXX if the highest priority context is locked,
+			 * this can loop a long time.  Might be better to
+			 * look at another context or give up after X retries.
 			 */
 			if (!mutex_trylock(&victim->state_mutex)) {
 				victim = NULL;
@@ -589,10 +629,10 @@ static struct spu *find_victim(struct spu_context *ctx)
 			}
 
 			spu = victim->spu;
-			if (!spu) {
+			if (!spu || victim->prio <= ctx->prio) {
 				/*
 				 * This race can happen because we've dropped
-				 * the active list mutex.  No a problem, just
+				 * the active list mutex.  Not a problem, just
 				 * restart the search.
 				 */
 				mutex_unlock(&victim->state_mutex);
@@ -607,13 +647,10 @@ static struct spu *find_victim(struct spu_context *ctx)
 
 			victim->stats.invol_ctx_switch++;
 			spu->stats.invol_ctx_switch++;
+			spu_add_to_rq(victim);
+
 			mutex_unlock(&victim->state_mutex);
-			/*
-			 * We need to break out of the wait loop in spu_run
-			 * manually to ensure this context gets put on the
-			 * runqueue again ASAP.
-			 */
-			wake_up(&victim->stop_wq);
+
 			return spu;
 		}
 	}
@@ -621,6 +658,50 @@ static struct spu *find_victim(struct spu_context *ctx)
 	return NULL;
 }
 
+static void __spu_schedule(struct spu *spu, struct spu_context *ctx)
+{
+	int node = spu->node;
+	int success = 0;
+
+	spu_set_timeslice(ctx);
+
+	mutex_lock(&cbe_spu_info[node].list_mutex);
+	if (spu->ctx == NULL) {
+		spu_bind_context(spu, ctx);
+		cbe_spu_info[node].nr_active++;
+		spu->alloc_state = SPU_USED;
+		success = 1;
+	}
+	mutex_unlock(&cbe_spu_info[node].list_mutex);
+
+	if (success)
+		wake_up_all(&ctx->run_wq);
+	else
+		spu_add_to_rq(ctx);
+}
+
+static void spu_schedule(struct spu *spu, struct spu_context *ctx)
+{
+	/* not a candidate for interruptible because it's called either
+	   from the scheduler thread or from spu_deactivate */
+	mutex_lock(&ctx->state_mutex);
+	__spu_schedule(spu, ctx);
+	spu_release(ctx);
+}
+
+static void spu_unschedule(struct spu *spu, struct spu_context *ctx)
+{
+	int node = spu->node;
+
+	mutex_lock(&cbe_spu_info[node].list_mutex);
+	cbe_spu_info[node].nr_active--;
+	spu->alloc_state = SPU_FREE;
+	spu_unbind_context(spu, ctx);
+	ctx->stats.invol_ctx_switch++;
+	spu->stats.invol_ctx_switch++;
+	mutex_unlock(&cbe_spu_info[node].list_mutex);
+}
+
 /**
  * spu_activate - find a free spu for a context and execute it
  * @ctx:	spu context to schedule
@@ -632,39 +713,47 @@ static struct spu *find_victim(struct spu_context *ctx)
  */
 int spu_activate(struct spu_context *ctx, unsigned long flags)
 {
-	do {
-		struct spu *spu;
+	struct spu *spu;
 
-		/*
-		 * If there are multiple threads waiting for a single context
-		 * only one actually binds the context while the others will
-		 * only be able to acquire the state_mutex once the context
-		 * already is in runnable state.
-		 */
-		if (ctx->spu)
-			return 0;
+	/*
+	 * If there are multiple threads waiting for a single context
+	 * only one actually binds the context while the others will
+	 * only be able to acquire the state_mutex once the context
+	 * already is in runnable state.
+	 */
+	if (ctx->spu)
+		return 0;
 
-		spu = spu_get_idle(ctx);
-		/*
-		 * If this is a realtime thread we try to get it running by
-		 * preempting a lower priority thread.
-		 */
-		if (!spu && rt_prio(ctx->prio))
-			spu = find_victim(ctx);
-		if (spu) {
-			int node = spu->node;
+spu_activate_top:
+	if (signal_pending(current))
+		return -ERESTARTSYS;
 
-			mutex_lock(&cbe_spu_info[node].list_mutex);
-			spu_bind_context(spu, ctx);
-			cbe_spu_info[node].nr_active++;
-			mutex_unlock(&cbe_spu_info[node].list_mutex);
-			return 0;
-		}
+	spu = spu_get_idle(ctx);
+	/*
+	 * If this is a realtime thread we try to get it running by
+	 * preempting a lower priority thread.
+	 */
+	if (!spu && rt_prio(ctx->prio))
+		spu = find_victim(ctx);
+	if (spu) {
+		unsigned long runcntl;
+
+		runcntl = ctx->ops->runcntl_read(ctx);
+		__spu_schedule(spu, ctx);
+		if (runcntl & SPU_RUNCNTL_RUNNABLE)
+			spuctx_switch_state(ctx, SPU_UTIL_USER);
 
+		return 0;
+	}
+
+	if (ctx->flags & SPU_CREATE_NOSCHED) {
 		spu_prio_wait(ctx);
-	} while (!signal_pending(current));
+		goto spu_activate_top;
+	}
 
-	return -ERESTARTSYS;
+	spu_add_to_rq(ctx);
+
+	return 0;
 }
 
 /**
@@ -706,21 +795,19 @@ static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
 	if (spu) {
 		new = grab_runnable_context(max_prio, spu->node);
 		if (new || force) {
-			int node = spu->node;
-
-			mutex_lock(&cbe_spu_info[node].list_mutex);
-			spu_unbind_context(spu, ctx);
-			spu->alloc_state = SPU_FREE;
-			cbe_spu_info[node].nr_active--;
-			mutex_unlock(&cbe_spu_info[node].list_mutex);
-
-			ctx->stats.vol_ctx_switch++;
-			spu->stats.vol_ctx_switch++;
-
-			if (new)
-				wake_up(&new->stop_wq);
+			spu_unschedule(spu, ctx);
+			if (new) {
+				if (new->flags & SPU_CREATE_NOSCHED)
+					wake_up(&new->stop_wq);
+				else {
+					spu_release(ctx);
+					spu_schedule(spu, new);
+					/* this one can't easily be made
+					   interruptible */
+					mutex_lock(&ctx->state_mutex);
+				}
+			}
 		}
-
 	}
 
 	return new != NULL;
@@ -757,43 +844,38 @@ void spu_yield(struct spu_context *ctx)
 
 static noinline void spusched_tick(struct spu_context *ctx)
 {
+	struct spu_context *new = NULL;
+	struct spu *spu = NULL;
+	u32 status;
+
+	if (spu_acquire(ctx))
+		BUG();	/* a kernel thread never has signals pending */
+
+	if (ctx->state != SPU_STATE_RUNNABLE)
+		goto out;
+	if (spu_stopped(ctx, &status))
+		goto out;
 	if (ctx->flags & SPU_CREATE_NOSCHED)
-		return;
+		goto out;
 	if (ctx->policy == SCHED_FIFO)
-		return;
+		goto out;
 
 	if (--ctx->time_slice)
-		return;
+		goto out;
 
-	/*
-	 * Unfortunately list_mutex ranks outside of state_mutex, so
-	 * we have to trylock here.  If we fail give the context another
-	 * tick and try again.
-	 */
-	if (mutex_trylock(&ctx->state_mutex)) {
-		struct spu *spu = ctx->spu;
-		struct spu_context *new;
-
-		new = grab_runnable_context(ctx->prio + 1, spu->node);
-		if (new) {
-			spu_unbind_context(spu, ctx);
-			ctx->stats.invol_ctx_switch++;
-			spu->stats.invol_ctx_switch++;
-			spu->alloc_state = SPU_FREE;
-			cbe_spu_info[spu->node].nr_active--;
-			wake_up(&new->stop_wq);
-			/*
-			 * We need to break out of the wait loop in
-			 * spu_run manually to ensure this context
-			 * gets put on the runqueue again ASAP.
-			 */
-			wake_up(&ctx->stop_wq);
-		}
-		spu_set_timeslice(ctx);
-		mutex_unlock(&ctx->state_mutex);
+	spu = ctx->spu;
+	new = grab_runnable_context(ctx->prio + 1, spu->node);
+	if (new) {
+		spu_unschedule(spu, ctx);
+		spu_add_to_rq(ctx);
 	} else {
 		ctx->time_slice++;
 	}
+out:
+	spu_release(ctx);
+
+	if (new)
+		spu_schedule(spu, new);
 }
 
 /**
@@ -817,35 +899,31 @@ static unsigned long count_active_contexts(void)
 }
 
 /**
- * spu_calc_load - given tick count, update the avenrun load estimates.
- * @tick:	tick count
+ * spu_calc_load - update the avenrun load estimates.
  *
  * No locking against reading these values from userspace, as for
  * the CPU loadavg code.
  */
-static void spu_calc_load(unsigned long ticks)
+static void spu_calc_load(void)
 {
 	unsigned long active_tasks; /* fixed-point */
-	static int count = LOAD_FREQ;
-
-	count -= ticks;
-
-	if (unlikely(count < 0)) {
-		active_tasks = count_active_contexts() * FIXED_1;
-		do {
-			CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks);
-			CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks);
-			CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks);
-			count += LOAD_FREQ;
-		} while (count < 0);
-	}
+
+	active_tasks = count_active_contexts() * FIXED_1;
+	CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks);
+	CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks);
+	CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks);
 }
 
 static void spusched_wake(unsigned long data)
 {
 	mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
 	wake_up_process(spusched_task);
-	spu_calc_load(SPUSCHED_TICK);
+}
+
+static void spuloadavg_wake(unsigned long data)
+{
+	mod_timer(&spuloadavg_timer, jiffies + LOAD_FREQ);
+	spu_calc_load();
 }
 
 static int spusched_thread(void *unused)
@@ -857,17 +935,58 @@ static int spusched_thread(void *unused)
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule();
 		for (node = 0; node < MAX_NUMNODES; node++) {
-			mutex_lock(&cbe_spu_info[node].list_mutex);
-			list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
-				if (spu->ctx)
-					spusched_tick(spu->ctx);
-			mutex_unlock(&cbe_spu_info[node].list_mutex);
+			struct mutex *mtx = &cbe_spu_info[node].list_mutex;
+
+			mutex_lock(mtx);
+			list_for_each_entry(spu, &cbe_spu_info[node].spus,
+					cbe_list) {
+				struct spu_context *ctx = spu->ctx;
+
+				if (ctx) {
+					mutex_unlock(mtx);
+					spusched_tick(ctx);
+					mutex_lock(mtx);
+				}
+			}
+			mutex_unlock(mtx);
 		}
 	}
 
 	return 0;
 }
 
+void spuctx_switch_state(struct spu_context *ctx,
+		enum spu_utilization_state new_state)
+{
+	unsigned long long curtime;
+	signed long long delta;
+	struct timespec ts;
+	struct spu *spu;
+	enum spu_utilization_state old_state;
+
+	ktime_get_ts(&ts);
+	curtime = timespec_to_ns(&ts);
+	delta = curtime - ctx->stats.tstamp;
+
+	WARN_ON(!mutex_is_locked(&ctx->state_mutex));
+	WARN_ON(delta < 0);
+
+	spu = ctx->spu;
+	old_state = ctx->stats.util_state;
+	ctx->stats.util_state = new_state;
+	ctx->stats.tstamp = curtime;
+
+	/*
+	 * Update the physical SPU utilization statistics.
+	 */
+	if (spu) {
+		ctx->stats.times[old_state] += delta;
+		spu->stats.times[old_state] += delta;
+		spu->stats.util_state = new_state;
+		spu->stats.tstamp = curtime;
+	}
+}
+
 #define LOAD_INT(x) ((x) >> FSHIFT)
 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 
@@ -881,7 +1000,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
 
 	/*
 	 * Note that last_pid doesn't really make much sense for the
-	 * SPU loadavg (it even seems very odd on the CPU side..),
+	 * SPU loadavg (it even seems very odd on the CPU side...),
 	 * but we include it here to have a 100% compatible interface.
 	 */
 	seq_printf(s, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
@@ -922,6 +1041,7 @@ int __init spu_sched_init(void)
 	spin_lock_init(&spu_prio->runq_lock);
 
 	setup_timer(&spusched_timer, spusched_wake, 0);
+	setup_timer(&spuloadavg_timer, spuloadavg_wake, 0);
 
 	spusched_task = kthread_run(spusched_thread, NULL, "spusched");
 	if (IS_ERR(spusched_task)) {
@@ -929,6 +1049,8 @@ int __init spu_sched_init(void)
 		goto out_free_spu_prio;
 	}
 
+	mod_timer(&spuloadavg_timer, 0);
+
 	entry = create_proc_entry("spu_loadavg", 0, NULL);
 	if (!entry)
 		goto out_stop_kthread;
@@ -954,6 +1076,7 @@ void spu_sched_exit(void)
 	remove_proc_entry("spu_loadavg", NULL);
 
 	del_timer_sync(&spusched_timer);
+	del_timer_sync(&spuloadavg_timer);
 	kthread_stop(spusched_task);
 
 	for (node = 0; node < MAX_NUMNODES; node++) {
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index ca47b991bda..0e114038ea6 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -71,6 +71,7 @@ struct spu_context {
 	wait_queue_head_t wbox_wq;
 	wait_queue_head_t stop_wq;
 	wait_queue_head_t mfc_wq;
+	wait_queue_head_t run_wq;
 	struct fasync_struct *ibox_fasync;
 	struct fasync_struct *wbox_fasync;
 	struct fasync_struct *mfc_fasync;
@@ -168,8 +169,10 @@ struct spu_context_ops {
 	void (*npc_write) (struct spu_context * ctx, u32 data);
 	 u32(*status_read) (struct spu_context * ctx);
 	char*(*get_ls) (struct spu_context * ctx);
+	void (*privcntl_write) (struct spu_context *ctx, u64 data);
 	 u32 (*runcntl_read) (struct spu_context * ctx);
 	void (*runcntl_write) (struct spu_context * ctx, u32 data);
+	void (*runcntl_stop) (struct spu_context * ctx);
 	void (*master_start) (struct spu_context * ctx);
 	void (*master_stop) (struct spu_context * ctx);
 	int (*set_mfc_query)(struct spu_context * ctx, u32 mask, u32 mode);
@@ -219,15 +222,16 @@ void spu_gang_add_ctx(struct spu_gang *gang, struct spu_context *ctx);
 
 /* fault handling */
 int spufs_handle_class1(struct spu_context *ctx);
+int spufs_handle_class0(struct spu_context *ctx);
 
 /* affinity */
 struct spu *affinity_check(struct spu_context *ctx);
 
 /* context management */
 extern atomic_t nr_spu_contexts;
-static inline void spu_acquire(struct spu_context *ctx)
+static inline int __must_check spu_acquire(struct spu_context *ctx)
 {
-	mutex_lock(&ctx->state_mutex);
+	return mutex_lock_interruptible(&ctx->state_mutex);
 }
 
 static inline void spu_release(struct spu_context *ctx)
@@ -242,10 +246,11 @@ int put_spu_context(struct spu_context *ctx);
 void spu_unmap_mappings(struct spu_context *ctx);
 
 void spu_forget(struct spu_context *ctx);
-int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags);
-void spu_acquire_saved(struct spu_context *ctx);
+int __must_check spu_acquire_saved(struct spu_context *ctx);
 void spu_release_saved(struct spu_context *ctx);
 
+int spu_stopped(struct spu_context *ctx, u32 * stat);
+void spu_del_from_rq(struct spu_context *ctx);
 int spu_activate(struct spu_context *ctx, unsigned long flags);
 void spu_deactivate(struct spu_context *ctx);
 void spu_yield(struct spu_context *ctx);
@@ -279,7 +284,9 @@ extern char *isolated_loader;
 		}							\
 		spu_release(ctx);					\
 		schedule();						\
-		spu_acquire(ctx);					\
+		__ret = spu_acquire(ctx);				\
+		if (__ret)						\
+			break;						\
 	}								\
 	finish_wait(&(wq), &__wait);					\
 	__ret;								\
@@ -306,41 +313,16 @@ struct spufs_coredump_reader {
 extern struct spufs_coredump_reader spufs_coredump_read[];
 extern int spufs_coredump_num_notes;
 
-/*
- * This function is a little bit too large for an inline, but
- * as fault.c is built into the kernel we can't move it out of
- * line.
- */
-static inline void spuctx_switch_state(struct spu_context *ctx,
-		enum spu_utilization_state new_state)
-{
-	unsigned long long curtime;
-	signed long long delta;
-	struct timespec ts;
-	struct spu *spu;
-	enum spu_utilization_state old_state;
-
-	ktime_get_ts(&ts);
-	curtime = timespec_to_ns(&ts);
-	delta = curtime - ctx->stats.tstamp;
-
-	WARN_ON(!mutex_is_locked(&ctx->state_mutex));
-	WARN_ON(delta < 0);
-
-	spu = ctx->spu;
-	old_state = ctx->stats.util_state;
-	ctx->stats.util_state = new_state;
-	ctx->stats.tstamp = curtime;
-
-	/*
-	 * Update the physical SPU utilization statistics.
-	 */
-	if (spu) {
-		ctx->stats.times[old_state] += delta;
-		spu->stats.times[old_state] += delta;
-		spu->stats.util_state = new_state;
-		spu->stats.tstamp = curtime;
-	}
-}
+extern int spu_init_csa(struct spu_state *csa);
+extern void spu_fini_csa(struct spu_state *csa);
+extern int spu_save(struct spu_state *prev, struct spu *spu);
+extern int spu_restore(struct spu_state *new, struct spu *spu);
+extern int spu_switch(struct spu_state *prev, struct spu_state *new,
+		      struct spu *spu);
+extern int spu_alloc_lscsa(struct spu_state *csa);
+extern void spu_free_lscsa(struct spu_state *csa);
+
+extern void spuctx_switch_state(struct spu_context *ctx,
+		enum spu_utilization_state new_state);
 
 #endif
diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index 3d64c81cc6e..6063c88c26d 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -48,6 +48,8 @@
 #include <asm/spu_csa.h>
 #include <asm/mmu_context.h>
 
+#include "spufs.h"
+
 #include "spu_save_dump.h"
 #include "spu_restore_dump.h"
 
@@ -691,35 +693,9 @@ static inline void resume_mfc_queue(struct spu_state *csa, struct spu *spu)
 	out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESUME_DMA_QUEUE);
 }
 
-static inline void get_kernel_slb(u64 ea, u64 slb[2])
+static inline void setup_mfc_slbs(struct spu_state *csa, struct spu *spu,
+		unsigned int *code, int code_size)
 {
-	u64 llp;
-
-	if (REGION_ID(ea) == KERNEL_REGION_ID)
-		llp = mmu_psize_defs[mmu_linear_psize].sllp;
-	else
-		llp = mmu_psize_defs[mmu_virtual_psize].sllp;
-	slb[0] = (get_kernel_vsid(ea, MMU_SEGSIZE_256M) << SLB_VSID_SHIFT) |
-		SLB_VSID_KERNEL | llp;
-	slb[1] = (ea & ESID_MASK) | SLB_ESID_V;
-}
-
-static inline void load_mfc_slb(struct spu *spu, u64 slb[2], int slbe)
-{
-	struct spu_priv2 __iomem *priv2 = spu->priv2;
-
-	out_be64(&priv2->slb_index_W, slbe);
-	eieio();
-	out_be64(&priv2->slb_vsid_RW, slb[0]);
-	out_be64(&priv2->slb_esid_RW, slb[1]);
-	eieio();
-}
-
-static inline void setup_mfc_slbs(struct spu_state *csa, struct spu *spu)
-{
-	u64 code_slb[2];
-	u64 lscsa_slb[2];
-
 	/* Save, Step 47:
 	 * Restore, Step 30.
 	 *     If MFC_SR1[R]=1, write 0 to SLB_Invalidate_All
@@ -735,11 +711,7 @@ static inline void setup_mfc_slbs(struct spu_state *csa, struct spu *spu)
 	 *     translation is desired by OS environment).
 	 */
 	spu_invalidate_slbs(spu);
-	get_kernel_slb((unsigned long)&spu_save_code[0], code_slb);
-	get_kernel_slb((unsigned long)csa->lscsa, lscsa_slb);
-	load_mfc_slb(spu, code_slb, 0);
-	if ((lscsa_slb[0] != code_slb[0]) || (lscsa_slb[1] != code_slb[1]))
-		load_mfc_slb(spu, lscsa_slb, 1);
+	spu_setup_kernel_slbs(spu, csa->lscsa, code, code_size);
 }
 
 static inline void set_switch_active(struct spu_state *csa, struct spu *spu)
@@ -768,9 +740,9 @@ static inline void enable_interrupts(struct spu_state *csa, struct spu *spu)
 	 *     (translation) interrupts.
 	 */
 	spin_lock_irq(&spu->register_lock);
-	spu_int_stat_clear(spu, 0, ~0ul);
-	spu_int_stat_clear(spu, 1, ~0ul);
-	spu_int_stat_clear(spu, 2, ~0ul);
+	spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+	spu_int_stat_clear(spu, 1, CLASS1_INTR_MASK);
+	spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
 	spu_int_mask_set(spu, 0, 0ul);
 	spu_int_mask_set(spu, 1, class1_mask);
 	spu_int_mask_set(spu, 2, 0ul);
@@ -927,8 +899,8 @@ static inline void wait_tag_complete(struct spu_state *csa, struct spu *spu)
 	POLL_WHILE_FALSE(in_be32(&prob->dma_tagstatus_R) & mask);
 
 	local_irq_save(flags);
-	spu_int_stat_clear(spu, 0, ~(0ul));
-	spu_int_stat_clear(spu, 2, ~(0ul));
+	spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+	spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
 	local_irq_restore(flags);
 }
 
@@ -946,8 +918,8 @@ static inline void wait_spu_stopped(struct spu_state *csa, struct spu *spu)
 	POLL_WHILE_TRUE(in_be32(&prob->spu_status_R) & SPU_STATUS_RUNNING);
 
 	local_irq_save(flags);
-	spu_int_stat_clear(spu, 0, ~(0ul));
-	spu_int_stat_clear(spu, 2, ~(0ul));
+	spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+	spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
 	local_irq_restore(flags);
 }
 
@@ -1423,9 +1395,9 @@ static inline void clear_interrupts(struct spu_state *csa, struct spu *spu)
 	spu_int_mask_set(spu, 0, 0ul);
 	spu_int_mask_set(spu, 1, 0ul);
 	spu_int_mask_set(spu, 2, 0ul);
-	spu_int_stat_clear(spu, 0, ~0ul);
-	spu_int_stat_clear(spu, 1, ~0ul);
-	spu_int_stat_clear(spu, 2, ~0ul);
+	spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+	spu_int_stat_clear(spu, 1, CLASS1_INTR_MASK);
+	spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
 	spin_unlock_irq(&spu->register_lock);
 }
 
@@ -1866,7 +1838,8 @@ static void save_lscsa(struct spu_state *prev, struct spu *spu)
 	 */
 
 	resume_mfc_queue(prev, spu);	/* Step 46. */
-	setup_mfc_slbs(prev, spu);	/* Step 47. */
+	/* Step 47. */
+	setup_mfc_slbs(prev, spu, spu_save_code, sizeof(spu_save_code));
 	set_switch_active(prev, spu);	/* Step 48. */
 	enable_interrupts(prev, spu);	/* Step 49. */
 	save_ls_16kb(prev, spu);	/* Step 50. */
@@ -1971,7 +1944,8 @@ static void restore_lscsa(struct spu_state *next, struct spu *spu)
 	setup_spu_status_part1(next, spu);	/* Step 27. */
 	setup_spu_status_part2(next, spu);	/* Step 28. */
 	restore_mfc_rag(next, spu);	        /* Step 29. */
-	setup_mfc_slbs(next, spu);	        /* Step 30. */
+	/* Step 30. */
+	setup_mfc_slbs(next, spu, spu_restore_code, sizeof(spu_restore_code));
 	set_spu_npc(next, spu);	                /* Step 31. */
 	set_signot1(next, spu);	                /* Step 32. */
 	set_signot2(next, spu);	                /* Step 33. */
@@ -2103,10 +2077,6 @@ int spu_save(struct spu_state *prev, struct spu *spu)
 	int rc;
 
 	acquire_spu_lock(spu);	        /* Step 1.     */
-	prev->dar = spu->dar;
-	prev->dsisr = spu->dsisr;
-	spu->dar = 0;
-	spu->dsisr = 0;
 	rc = __do_spu_save(prev, spu);	/* Steps 2-53. */
 	release_spu_lock(spu);
 	if (rc != 0 && rc != 2 && rc != 6) {
@@ -2133,9 +2103,6 @@ int spu_restore(struct spu_state *new, struct spu *spu)
 	acquire_spu_lock(spu);
 	harvest(NULL, spu);
 	spu->slb_replace = 0;
-	new->dar = 0;
-	new->dsisr = 0;
-	spu->class_0_pending = 0;
 	rc = __do_spu_restore(new, spu);
 	release_spu_lock(spu);
 	if (rc) {
@@ -2215,10 +2182,8 @@ int spu_init_csa(struct spu_state *csa)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(spu_init_csa);
 
 void spu_fini_csa(struct spu_state *csa)
 {
 	spu_free_lscsa(csa);
 }
-EXPORT_SYMBOL_GPL(spu_fini_csa);
author	Linus Torvalds <torvalds@linux-foundation.org>	2008-01-31 13:37:27 +1100
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-01-31 13:37:27 +1100
commit	8af03e782cae1e0a0f530ddd22301cdd12cf9dc0 (patch)
tree	c4af13a38bd3cc1a811a37f2358491f171052070 /arch/powerpc/platforms/cell
parent	6232665040f9a23fafd9d94d4ae8d5a2dc850f65 (diff)
parent	99e139126ab2e84be67969650f92eb37c12ab5cd (diff)