17 files changed, 1474 insertions, 157 deletions
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index b936a1dd0a5..25c273c761d 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -23,6 +23,9 @@
 struct cpu_spec* cur_cpu_spec = NULL;
 EXPORT_SYMBOL(cur_cpu_spec);
 
+/* The platform string corresponding to the real PVR */
+const char *powerpc_base_platform;
+
 /* NOTE:
  * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's
  * the responsibility of the appropriate CPU save/restore functions to
@@ -1652,6 +1655,14 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
 			} else
 				*t = *s;
 			*PTRRELOC(&cur_cpu_spec) = &the_cpu_spec;
+
+			/*
+			 * Set the base platform string once; assumes
+			 * we're called with real pvr first.
+			 */
+			if (*PTRRELOC(&powerpc_base_platform) == NULL)
+				*PTRRELOC(&powerpc_base_platform) = t->platform;
+
 #if defined(CONFIG_PPC64) || defined(CONFIG_BOOKE)
 			/* ppc64 and booke expect identify_cpu to also call
 			 * setup_cpu for that processor. I will consolidate
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index da52269aec1..81c8324a4a3 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -148,7 +148,7 @@ transfer_to_handler:
 	/* Check to see if the dbcr0 register is set up to debug.  Use the
 	   internal debug mode bit to do this. */
 	lwz	r12,THREAD_DBCR0(r12)
-	andis.	r12,r12,DBCR0_IDM@h
+	andis.	r12,r12,(DBCR0_IDM  | DBSR_DAC1R | DBSR_DAC1W)@h
 	beq+	3f
 	/* From user and task is ptraced - load up global dbcr0 */
 	li	r12,-1			/* clear all pending debug events */
@@ -292,7 +292,7 @@ syscall_exit_cont:
 	/* If the process has its own DBCR0 value, load it up.  The internal
 	   debug mode bit tells us that dbcr0 should be loaded. */
 	lwz	r0,THREAD+THREAD_DBCR0(r2)
-	andis.	r10,r0,DBCR0_IDM@h
+	andis.	r10,r0,(DBCR0_IDM  | DBSR_DAC1R | DBSR_DAC1W)@h
 	bnel-	load_dbcr0
 #endif
 #ifdef CONFIG_44x
@@ -720,7 +720,7 @@ restore_user:
 	/* Check whether this process has its own DBCR0 value.  The internal
 	   debug mode bit tells us that dbcr0 should be loaded. */
 	lwz	r0,THREAD+THREAD_DBCR0(r2)
-	andis.	r10,r0,DBCR0_IDM@h
+	andis.	r10,r0,(DBCR0_IDM  | DBSR_DAC1R | DBSR_DAC1W)@h
 	bnel-	load_dbcr0
 #endif
 
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index c3cf0e8f3ac..d308a9f70f1 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -60,7 +60,7 @@ void cpu_idle(void)
 
 	set_thread_flag(TIF_POLLING_NRFLAG);
 	while (1) {
-		tick_nohz_stop_sched_tick();
+		tick_nohz_stop_sched_tick(1);
 		while (!need_resched() && !cpu_should_die()) {
 			ppc64_runlatch_off();
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2385f68c175..550a19399bf 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -49,6 +49,8 @@ static int novmerge = 1;
 
 static int protect4gb = 1;
 
+static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
+
 static inline unsigned long iommu_num_pages(unsigned long vaddr,
 					    unsigned long slen)
 {
@@ -191,6 +193,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 {
 	unsigned long entry, flags;
 	dma_addr_t ret = DMA_ERROR_CODE;
+	int build_fail;
 
 	spin_lock_irqsave(&(tbl->it_lock), flags);
 
@@ -205,9 +208,21 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */
 
 	/* Put the TCEs in the HW table */
-	ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK,
-			 direction, attrs);
+	build_fail = ppc_md.tce_build(tbl, entry, npages,
+	                              (unsigned long)page & IOMMU_PAGE_MASK,
+	                              direction, attrs);
+
+	/* ppc_md.tce_build() only returns non-zero for transient errors.
+	 * Clean up the table bitmap in this case and return
+	 * DMA_ERROR_CODE. For all other errors the functionality is
+	 * not altered.
+	 */
+	if (unlikely(build_fail)) {
+		__iommu_free(tbl, ret, npages);
 
+		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+		return DMA_ERROR_CODE;
+	}
 
 	/* Flush/invalidate TLB caches if necessary */
 	if (ppc_md.tce_flush)
@@ -276,7 +291,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 	dma_addr_t dma_next = 0, dma_addr;
 	unsigned long flags;
 	struct scatterlist *s, *outs, *segstart;
-	int outcount, incount, i;
+	int outcount, incount, i, build_fail = 0;
 	unsigned int align;
 	unsigned long handle;
 	unsigned int max_seg_size;
@@ -337,8 +352,11 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			    npages, entry, dma_addr);
 
 		/* Insert into HW table */
-		ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK,
-				 direction, attrs);
+		build_fail = ppc_md.tce_build(tbl, entry, npages,
+		                              vaddr & IOMMU_PAGE_MASK,
+		                              direction, attrs);
+		if(unlikely(build_fail))
+			goto failure;
 
 		/* If we are in an open segment, try merging */
 		if (segstart != s) {
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 4ba2af12545..de79915452c 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -144,7 +144,6 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 	kcb->kprobe_saved_msr = regs->msr;
 }
 
-/* Called with kretprobe_lock held */
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				      struct pt_regs *regs)
 {
@@ -312,8 +311,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
 
 	INIT_HLIST_HEAD(&empty_rp);
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	head = kretprobe_inst_table_head(current);
+	kretprobe_hash_lock(current, &head, &flags);
 
 	/*
 	 * It is possible to have multiple instances associated with a given
@@ -352,7 +350,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	regs->nip = orig_ret_address;
 
 	reset_current_kprobe();
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	kretprobe_hash_unlock(current, &flags);
 	preempt_enable_no_resched();
 
 	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index 827a5726a03..9f856a0c3e3 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -34,8 +34,9 @@
 #include <asm/time.h>
 #include <asm/prom.h>
 #include <asm/vdso_datapage.h>
+#include <asm/vio.h>
 
-#define MODULE_VERS "1.7"
+#define MODULE_VERS "1.8"
 #define MODULE_NAME "lparcfg"
 
 /* #define LPARCFG_DEBUG */
@@ -129,32 +130,46 @@ static int iseries_lparcfg_data(struct seq_file *m, void *v)
 /*
  * Methods used to fetch LPAR data when running on a pSeries platform.
  */
-static void log_plpar_hcall_return(unsigned long rc, char *tag)
+/**
+ * h_get_mpp
+ * H_GET_MPP hcall returns info in 7 parms
+ */
+int h_get_mpp(struct hvcall_mpp_data *mpp_data)
 {
-	switch(rc) {
-	case 0:
-		return;
-	case H_HARDWARE:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Hardware fault\n", tag);
-		return;
-	case H_FUNCTION:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Function not allowed\n", tag);
-		return;
-	case H_AUTHORITY:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Not authorized to this function\n", tag);
-		return;
-	case H_PARAMETER:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Bad parameter(s)\n",tag);
-		return;
-	default:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Unexpected rc(0x%lx)\n", tag, rc);
-	}
+	int rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+	rc = plpar_hcall9(H_GET_MPP, retbuf);
+
+	mpp_data->entitled_mem = retbuf[0];
+	mpp_data->mapped_mem = retbuf[1];
+
+	mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+	mpp_data->pool_num = retbuf[2] & 0xffff;
+
+	mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
+	mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
+	mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff;
+
+	mpp_data->pool_size = retbuf[4];
+	mpp_data->loan_request = retbuf[5];
+	mpp_data->backing_mem = retbuf[6];
+
+	return rc;
 }
+EXPORT_SYMBOL(h_get_mpp);
+
+struct hvcall_ppp_data {
+	u64	entitlement;
+	u64	unallocated_entitlement;
+	u16	group_num;
+	u16	pool_num;
+	u8	capped;
+	u8	weight;
+	u8	unallocated_weight;
+	u16	active_procs_in_pool;
+	u16	active_system_procs;
+};
 
 /*
  * H_GET_PPP hcall returns info in 4 parms.
@@ -176,27 +191,30 @@ static void log_plpar_hcall_return(unsigned long rc, char *tag)
  *              XXXX - Active processors in Physical Processor Pool.
  *                  XXXX  - Processors active on platform.
  */
-static unsigned int h_get_ppp(unsigned long *entitled,
-			      unsigned long *unallocated,
-			      unsigned long *aggregation,
-			      unsigned long *resource)
+static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
 {
 	unsigned long rc;
 	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
 
 	rc = plpar_hcall(H_GET_PPP, retbuf);
 
-	*entitled = retbuf[0];
-	*unallocated = retbuf[1];
-	*aggregation = retbuf[2];
-	*resource = retbuf[3];
+	ppp_data->entitlement = retbuf[0];
+	ppp_data->unallocated_entitlement = retbuf[1];
+
+	ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+	ppp_data->pool_num = retbuf[2] & 0xffff;
 
-	log_plpar_hcall_return(rc, "H_GET_PPP");
+	ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01;
+	ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff;
+	ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff;
+	ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff;
+	ppp_data->active_system_procs = retbuf[3] & 0xffff;
 
 	return rc;
 }
 
-static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
+static unsigned h_pic(unsigned long *pool_idle_time,
+		      unsigned long *num_procs)
 {
 	unsigned long rc;
 	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
@@ -206,8 +224,87 @@ static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
 	*pool_idle_time = retbuf[0];
 	*num_procs = retbuf[1];
 
-	if (rc != H_AUTHORITY)
-		log_plpar_hcall_return(rc, "H_PIC");
+	return rc;
+}
+
+/*
+ * parse_ppp_data
+ * Parse out the data returned from h_get_ppp and h_pic
+ */
+static void parse_ppp_data(struct seq_file *m)
+{
+	struct hvcall_ppp_data ppp_data;
+	int rc;
+
+	rc = h_get_ppp(&ppp_data);
+	if (rc)
+		return;
+
+	seq_printf(m, "partition_entitled_capacity=%ld\n",
+	           ppp_data.entitlement);
+	seq_printf(m, "group=%d\n", ppp_data.group_num);
+	seq_printf(m, "system_active_processors=%d\n",
+	           ppp_data.active_system_procs);
+
+	/* pool related entries are apropriate for shared configs */
+	if (lppaca[0].shared_proc) {
+		unsigned long pool_idle_time, pool_procs;
+
+		seq_printf(m, "pool=%d\n", ppp_data.pool_num);
+
+		/* report pool_capacity in percentage */
+		seq_printf(m, "pool_capacity=%d\n",
+			   ppp_data.active_procs_in_pool * 100);
+
+		h_pic(&pool_idle_time, &pool_procs);
+		seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
+		seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
+	}
+
+	seq_printf(m, "unallocated_capacity_weight=%d\n",
+		   ppp_data.unallocated_weight);
+	seq_printf(m, "capacity_weight=%d\n", ppp_data.weight);
+	seq_printf(m, "capped=%d\n", ppp_data.capped);
+	seq_printf(m, "unallocated_capacity=%ld\n",
+		   ppp_data.unallocated_entitlement);
+}
+
+/**
+ * parse_mpp_data
+ * Parse out data returned from h_get_mpp
+ */
+static void parse_mpp_data(struct seq_file *m)
+{
+	struct hvcall_mpp_data mpp_data;
+	int rc;
+
+	rc = h_get_mpp(&mpp_data);
+	if (rc)
+		return;
+
+	seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem);
+
+	if (mpp_data.mapped_mem != -1)
+		seq_printf(m, "mapped_entitled_memory=%ld\n",
+		           mpp_data.mapped_mem);
+
+	seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num);
+	seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num);
+
+	seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight);
+	seq_printf(m, "unallocated_entitled_memory_weight=%d\n",
+	           mpp_data.unallocated_mem_weight);
+	seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n",
+	           mpp_data.unallocated_entitlement);
+
+	if (mpp_data.pool_size != -1)
+		seq_printf(m, "entitled_memory_pool_size=%ld bytes\n",
+		           mpp_data.pool_size);
+
+	seq_printf(m, "entitled_memory_loan_request=%ld\n",
+	           mpp_data.loan_request);
+
+	seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem);
 }
 
 #define SPLPAR_CHARACTERISTICS_TOKEN 20
@@ -313,6 +410,25 @@ static int lparcfg_count_active_processors(void)
 	return count;
 }
 
+static void pseries_cmo_data(struct seq_file *m)
+{
+	int cpu;
+	unsigned long cmo_faults = 0;
+	unsigned long cmo_fault_time = 0;
+
+	if (!firmware_has_feature(FW_FEATURE_CMO))
+		return;
+
+	for_each_possible_cpu(cpu) {
+		cmo_faults += lppaca[cpu].cmo_faults;
+		cmo_fault_time += lppaca[cpu].cmo_fault_time;
+	}
+
+	seq_printf(m, "cmo_faults=%lu\n", cmo_faults);
+	seq_printf(m, "cmo_fault_time_usec=%lu\n",
+		   cmo_fault_time / tb_ticks_per_usec);
+}
+
 static int pseries_lparcfg_data(struct seq_file *m, void *v)
 {
 	int partition_potential_processors;
@@ -334,60 +450,13 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 	partition_active_processors = lparcfg_count_active_processors();
 
 	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
-		unsigned long h_entitled, h_unallocated;
-		unsigned long h_aggregation, h_resource;
-		unsigned long pool_idle_time, pool_procs;
-		unsigned long purr;
-
-		h_get_ppp(&h_entitled, &h_unallocated, &h_aggregation,
-			  &h_resource);
-
-		seq_printf(m, "R4=0x%lx\n", h_entitled);
-		seq_printf(m, "R5=0x%lx\n", h_unallocated);
-		seq_printf(m, "R6=0x%lx\n", h_aggregation);
-		seq_printf(m, "R7=0x%lx\n", h_resource);
-
-		purr = get_purr();
-
 		/* this call handles the ibm,get-system-parameter contents */
 		parse_system_parameter_string(m);
+		parse_ppp_data(m);
+		parse_mpp_data(m);
+		pseries_cmo_data(m);
 
-		seq_printf(m, "partition_entitled_capacity=%ld\n", h_entitled);
-
-		seq_printf(m, "group=%ld\n", (h_aggregation >> 2 * 8) & 0xffff);
-
-		seq_printf(m, "system_active_processors=%ld\n",
-			   (h_resource >> 0 * 8) & 0xffff);
-
-		/* pool related entries are apropriate for shared configs */
-		if (lppaca[0].shared_proc) {
-
-			h_pic(&pool_idle_time, &pool_procs);
-
-			seq_printf(m, "pool=%ld\n",
-				   (h_aggregation >> 0 * 8) & 0xffff);
-
-			/* report pool_capacity in percentage */
-			seq_printf(m, "pool_capacity=%ld\n",
-				   ((h_resource >> 2 * 8) & 0xffff) * 100);
-
-			seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
-
-			seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
-		}
-
-		seq_printf(m, "unallocated_capacity_weight=%ld\n",
-			   (h_resource >> 4 * 8) & 0xFF);
-
-		seq_printf(m, "capacity_weight=%ld\n",
-			   (h_resource >> 5 * 8) & 0xFF);
-
-		seq_printf(m, "capped=%ld\n", (h_resource >> 6 * 8) & 0x01);
-
-		seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated);
-
-		seq_printf(m, "purr=%ld\n", purr);
-
+		seq_printf(m, "purr=%ld\n", get_purr());
 	} else {		/* non SPLPAR case */
 
 		seq_printf(m, "system_active_processors=%d\n",
@@ -414,6 +483,83 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 	return 0;
 }
 
+static ssize_t update_ppp(u64 *entitlement, u8 *weight)
+{
+	struct hvcall_ppp_data ppp_data;
+	u8 new_weight;
+	u64 new_entitled;
+	ssize_t retval;
+
+	/* Get our current parameters */
+	retval = h_get_ppp(&ppp_data);
+	if (retval)
+		return retval;
+
+	if (entitlement) {
+		new_weight = ppp_data.weight;
+		new_entitled = *entitlement;
+	} else if (weight) {
+		new_weight = *weight;
+		new_entitled = ppp_data.entitlement;
+	} else
+		return -EINVAL;
+
+	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+	         __FUNCTION__, ppp_data.entitlement, ppp_data.weight);
+
+	pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
+		 __FUNCTION__, new_entitled, new_weight);
+
+	retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight);
+	return retval;
+}
+
+/**
+ * update_mpp
+ *
+ * Update the memory entitlement and weight for the partition.  Caller must
+ * specify either a new entitlement or weight, not both, to be updated
+ * since the h_set_mpp call takes both entitlement and weight as parameters.
+ */
+static ssize_t update_mpp(u64 *entitlement, u8 *weight)
+{
+	struct hvcall_mpp_data mpp_data;
+	u64 new_entitled;
+	u8 new_weight;
+	ssize_t rc;
+
+	if (entitlement) {
+		/* Check with vio to ensure the new memory entitlement
+		 * can be handled.
+		 */
+		rc = vio_cmo_entitlement_update(*entitlement);
+		if (rc)
+			return rc;
+	}
+
+	rc = h_get_mpp(&mpp_data);
+	if (rc)
+		return rc;
+
+	if (entitlement) {
+		new_weight = mpp_data.mem_weight;
+		new_entitled = *entitlement;
+	} else if (weight) {
+		new_weight = *weight;
+		new_entitled = mpp_data.entitled_mem;
+	} else
+		return -EINVAL;
+
+	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+	         __FUNCTION__, mpp_data.entitled_mem, mpp_data.mem_weight);
+
+	pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
+	         __FUNCTION__, new_entitled, new_weight);
+
+	rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight);
+	return rc;
+}
+
 /*
  * Interface for changing system parameters (variable capacity weight
  * and entitled capacity).  Format of input is "param_name=value";
@@ -427,35 +573,27 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 			     size_t count, loff_t * off)
 {
-	char *kbuf;
+	int kbuf_sz = 64;
+	char kbuf[kbuf_sz];
 	char *tmp;
 	u64 new_entitled, *new_entitled_ptr = &new_entitled;
 	u8 new_weight, *new_weight_ptr = &new_weight;
-
-	unsigned long current_entitled;	/* parameters for h_get_ppp */
-	unsigned long dummy;
-	unsigned long resource;
-	u8 current_weight;
-
-	ssize_t retval = -ENOMEM;
+	ssize_t retval;
 
 	if (!firmware_has_feature(FW_FEATURE_SPLPAR) ||
 			firmware_has_feature(FW_FEATURE_ISERIES))
 		return -EINVAL;
 
-	kbuf = kmalloc(count, GFP_KERNEL);
-	if (!kbuf)
-		goto out;
+	if (count > kbuf_sz)
+		return -EINVAL;
 
-	retval = -EFAULT;
 	if (copy_from_user(kbuf, buf, count))
-		goto out;
+		return -EFAULT;
 
-	retval = -EINVAL;
 	kbuf[count - 1] = '\0';
 	tmp = strchr(kbuf, '=');
 	if (!tmp)
-		goto out;
+		return -EINVAL;
 
 	*tmp++ = '\0';
 
@@ -463,34 +601,32 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 		char *endp;
 		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
 		if (endp == tmp)
-			goto out;
-		new_weight_ptr = &current_weight;
+			return -EINVAL;
+
+		retval = update_ppp(new_entitled_ptr, NULL);
 	} else if (!strcmp(kbuf, "capacity_weight")) {
 		char *endp;
 		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
 		if (endp == tmp)
-			goto out;
-		new_entitled_ptr = &current_entitled;
-	} else
-		goto out;
-
-	/* Get our current parameters */
-	retval = h_get_ppp(&current_entitled, &dummy, &dummy, &resource);
-	if (retval) {
-		retval = -EIO;
-		goto out;
-	}
-
-	current_weight = (resource >> 5 * 8) & 0xFF;
+			return -EINVAL;
 
-	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
-		 __func__, current_entitled, current_weight);
+		retval = update_ppp(NULL, new_weight_ptr);
+	} else if (!strcmp(kbuf, "entitled_memory")) {
+		char *endp;
+		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			return -EINVAL;
 
-	pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
-		 __func__, *new_entitled_ptr, *new_weight_ptr);
+		retval = update_mpp(new_entitled_ptr, NULL);
+	} else if (!strcmp(kbuf, "entitled_memory_weight")) {
+		char *endp;
+		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			return -EINVAL;
 
-	retval = plpar_hcall_norets(H_SET_PPP, *new_entitled_ptr,
-				    *new_weight_ptr);
+		retval = update_mpp(NULL, new_weight_ptr);
+	} else
+		return -EINVAL;
 
 	if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
 		retval = count;
@@ -506,8 +642,6 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 		retval = -EIO;
 	}
 
-out:
-	kfree(kbuf);
 	return retval;
 }
 
diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c
index 29a0e039d43..aab76887a84 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -48,7 +48,7 @@ void machine_kexec_cleanup(struct kimage *image)
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	if (ppc_md.machine_kexec)
 		ppc_md.machine_kexec(image);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 219f3634115..db2497ccc11 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -47,6 +47,8 @@
 #ifdef CONFIG_PPC64
 #include <asm/firmware.h>
 #endif
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
 
 extern unsigned long _get_SP(void);
 
@@ -239,6 +241,35 @@ void discard_lazy_cpu_state(void)
 }
 #endif /* CONFIG_SMP */
 
+void do_dabr(struct pt_regs *regs, unsigned long address,
+		    unsigned long error_code)
+{
+	siginfo_t info;
+
+	if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
+			11, SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (debugger_dabr_match(regs))
+		return;
+
+	/* Clear the DAC and struct entries.  One shot trigger */
+#if (defined(CONFIG_44x) || defined(CONFIG_BOOKE))
+	mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R | DBSR_DAC1W
+							| DBCR0_IDM));
+#endif
+
+	/* Clear the DABR */
+	set_dabr(0);
+
+	/* Deliver the signal to userspace */
+	info.si_signo = SIGTRAP;
+	info.si_errno = 0;
+	info.si_code = TRAP_HWBKPT;
+	info.si_addr = (void __user *)address;
+	force_sig_info(SIGTRAP, &info, current);
+}
+
 static DEFINE_PER_CPU(unsigned long, current_dabr);
 
 int set_dabr(unsigned long dabr)
@@ -254,6 +285,11 @@ int set_dabr(unsigned long dabr)
 #if defined(CONFIG_PPC64) || defined(CONFIG_6xx)
 	mtspr(SPRN_DABR, dabr);
 #endif
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+	mtspr(SPRN_DAC1, dabr);
+#endif
+
 	return 0;
 }
 
@@ -337,6 +373,12 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr))
 		set_dabr(new->thread.dabr);
 
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+	/* If new thread DAC (HW breakpoint) is the same then leave it */
+	if (new->thread.dabr)
+		set_dabr(new->thread.dabr);
+#endif
+
 	new_thread = &new->thread;
 	old_thread = &current->thread;
 
@@ -525,6 +567,10 @@ void flush_thread(void)
 	if (current->thread.dabr) {
 		current->thread.dabr = 0;
 		set_dabr(0);
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+		current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W);
+#endif
 	}
 }
 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 1ea8c8d3ce8..c4ab2195b9c 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -643,6 +643,11 @@ static void __init early_cmdline_parse(void)
 #else
 #define OV5_MSI			0x00
 #endif /* CONFIG_PCI_MSI */
+#ifdef CONFIG_PPC_SMLPAR
+#define OV5_CMO			0x80	/* Cooperative Memory Overcommitment */
+#else
+#define OV5_CMO			0x00
+#endif
 
 /*
  * The architecture vector has an array of PVR mask/value pairs,
@@ -687,10 +692,12 @@ static unsigned char ibm_architecture_vec[] = {
 	0,				/* don't halt */
 
 	/* option vector 5: PAPR/OF options */
-	3 - 2,				/* length */
+	5 - 2,				/* length */
 	0,				/* don't ignore, don't halt */
 	OV5_LPAR | OV5_SPLPAR | OV5_LARGE_PAGES | OV5_DRCONF_MEMORY |
 	OV5_DONATE_DEDICATE_CPU | OV5_MSI,
+	0,
+	OV5_CMO,
 };
 
 /* Old method - ELF header with PT_NOTE sections */
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 8feb93e7890..a5d0e78779c 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -703,7 +703,7 @@ void user_enable_single_step(struct task_struct *task)
 
 	if (regs != NULL) {
 #if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
-		task->thread.dbcr0 = DBCR0_IDM | DBCR0_IC;
+		task->thread.dbcr0 |= DBCR0_IDM | DBCR0_IC;
 		regs->msr |= MSR_DE;
 #else
 		regs->msr |= MSR_SE;
@@ -716,9 +716,16 @@ void user_disable_single_step(struct task_struct *task)
 {
 	struct pt_regs *regs = task->thread.regs;
 
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+	/* If DAC then do not single step, skip */
+	if (task->thread.dabr)
+		return;
+#endif
+
 	if (regs != NULL) {
 #if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
-		task->thread.dbcr0 = 0;
+		task->thread.dbcr0 &= ~(DBCR0_IC | DBCR0_IDM);
 		regs->msr &= ~MSR_DE;
 #else
 		regs->msr &= ~MSR_SE;
@@ -727,22 +734,75 @@ void user_disable_single_step(struct task_struct *task)
 	clear_tsk_thread_flag(task, TIF_SINGLESTEP);
 }
 
-static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
+int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
 			       unsigned long data)
 {
-	/* We only support one DABR and no IABRS at the moment */
+	/* For ppc64 we support one DABR and no IABR's at the moment (ppc64).
+	 *  For embedded processors we support one DAC and no IAC's at the
+	 *  moment.
+	 */
 	if (addr > 0)
 		return -EINVAL;
 
-	/* The bottom 3 bits are flags */
 	if ((data & ~0x7UL) >= TASK_SIZE)
 		return -EIO;
 
-	/* Ensure translation is on */
+#ifdef CONFIG_PPC64
+
+	/* For processors using DABR (i.e. 970), the bottom 3 bits are flags.
+	 *  It was assumed, on previous implementations, that 3 bits were
+	 *  passed together with the data address, fitting the design of the
+	 *  DABR register, as follows:
+	 *
+	 *  bit 0: Read flag
+	 *  bit 1: Write flag
+	 *  bit 2: Breakpoint translation
+	 *
+	 *  Thus, we use them here as so.
+	 */
+
+	/* Ensure breakpoint translation bit is set */
 	if (data && !(data & DABR_TRANSLATION))
 		return -EIO;
 
+	/* Move contents to the DABR register */
 	task->thread.dabr = data;
+
+#endif
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+
+	/* As described above, it was assumed 3 bits were passed with the data
+	 *  address, but we will assume only the mode bits will be passed
+	 *  as to not cause alignment restrictions for DAC-based processors.
+	 */
+
+	/* DAC's hold the whole address without any mode flags */
+	task->thread.dabr = data & ~0x3UL;
+
+	if (task->thread.dabr == 0) {
+		task->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W | DBCR0_IDM);
+		task->thread.regs->msr &= ~MSR_DE;
+		return 0;
+	}
+
+	/* Read or Write bits must be set */
+
+	if (!(data & 0x3UL))
+		return -EINVAL;
+
+	/* Set the Internal Debugging flag (IDM bit 1) for the DBCR0
+	   register */
+	task->thread.dbcr0 = DBCR0_IDM;
+
+	/* Check for write and read flags and set DBCR0
+	   accordingly */
+	if (data & 0x1UL)
+		task->thread.dbcr0 |= DBSR_DAC1R;
+	if (data & 0x2UL)
+		task->thread.dbcr0 |= DBSR_DAC1W;
+
+	task->thread.regs->msr |= MSR_DE;
+#endif
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index 09ded5c424a..149cb112cd1 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -286,7 +286,7 @@ static ssize_t rtas_flash_read(struct file *file, char __user *buf,
 }
 
 /* constructor for flash_block_cache */
-void rtas_block_ctor(struct kmem_cache *cache, void *ptr)
+void rtas_block_ctor(void *ptr)
 {
 	memset(ptr, 0, RTAS_BLK_SIZE);
 }
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index ad55488939c..7aada783ec6 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -145,8 +145,12 @@ int do_signal(sigset_t *oldset, struct pt_regs *regs)
 	 * user space. The DABR will have been cleared if it
 	 * triggered inside the kernel.
 	 */
-	if (current->thread.dabr)
+	if (current->thread.dabr) {
 		set_dabr(current->thread.dabr);
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+		mtspr(SPRN_DBCR0, current->thread.dbcr0);
+#endif
+	}
 
 	if (is32) {
         	if (ka.sa.sa_flags & SA_SIGINFO)
diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c
index 8cee5710754..6fc6328dc62 100644
--- a/arch/powerpc/kernel/suspend.c
+++ b/arch/powerpc/kernel/suspend.c
@@ -7,6 +7,7 @@
  * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
  */
 
+#include <linux/mm.h>
 #include <asm/page.h>
 
 /* References to section boundaries */
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index aba0ba95f06..800e5e9a087 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -529,7 +529,8 @@ static void register_nodes(void)
 #endif
 
 /* Only valid if CPU is present. */
-static ssize_t show_physical_id(struct sys_device *dev, char *buf)
+static ssize_t show_physical_id(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 878fbddb6ae..81ccb8dd1a5 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1067,6 +1067,22 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status)
 		}
 
 		_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+	} else if (debug_status & (DBSR_DAC1R | DBSR_DAC1W)) {
+		regs->msr &= ~MSR_DE;
+
+		if (user_mode(regs)) {
+			current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W |
+								DBCR0_IDM);
+		} else {
+			/* Disable DAC interupts */
+			mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R |
+						DBSR_DAC1W | DBCR0_IDM));
+
+			/* Clear the DAC event */
+			mtspr(SPRN_DBSR, (DBSR_DAC1R | DBSR_DAC1W));
+		}
+		/* Setup and send the trap to the handler */
+		do_dabr(regs, mfspr(SPRN_DAC1), debug_status);
 	}
 }
 #endif /* CONFIG_4xx || CONFIG_BOOKE */
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index b77f8af7ddd..ade8aeaa2e7 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -1,11 +1,12 @@
 /*
  * IBM PowerPC Virtual I/O Infrastructure Support.
  *
- *    Copyright (c) 2003-2005 IBM Corp.
+ *    Copyright (c) 2003,2008 IBM Corp.
  *     Dave Engebretsen engebret@us.ibm.com
  *     Santiago Leon santil@us.ibm.com
  *     Hollis Blanchard <hollisb@us.ibm.com>
  *     Stephen Rothwell
+ *     Robert Jennings <rcjenn@us.ibm.com>
  *
  *      This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -46,6 +47,996 @@ static struct vio_dev vio_bus_device  = { /* fake "parent" device */
 	.dev.bus = &vio_bus_type,
 };
 
+#ifdef CONFIG_PPC_SMLPAR
+/**
+ * vio_cmo_pool - A pool of IO memory for CMO use
+ *
+ * @size: The size of the pool in bytes
+ * @free: The amount of free memory in the pool
+ */
+struct vio_cmo_pool {
+	size_t size;
+	size_t free;
+};
+
+/* How many ms to delay queued balance work */
+#define VIO_CMO_BALANCE_DELAY 100
+
+/* Portion out IO memory to CMO devices by this chunk size */
+#define VIO_CMO_BALANCE_CHUNK 131072
+
+/**
+ * vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement
+ *
+ * @vio_dev: struct vio_dev pointer
+ * @list: pointer to other devices on bus that are being tracked
+ */
+struct vio_cmo_dev_entry {
+	struct vio_dev *viodev;
+	struct list_head list;
+};
+
+/**
+ * vio_cmo - VIO bus accounting structure for CMO entitlement
+ *
+ * @lock: spinlock for entire structure
+ * @balance_q: work queue for balancing system entitlement
+ * @device_list: list of CMO-enabled devices requiring entitlement
+ * @entitled: total system entitlement in bytes
+ * @reserve: pool of memory from which devices reserve entitlement, incl. spare
+ * @excess: pool of excess entitlement not needed for device reserves or spare
+ * @spare: IO memory for device hotplug functionality
+ * @min: minimum necessary for system operation
+ * @desired: desired memory for system operation
+ * @curr: bytes currently allocated
+ * @high: high water mark for IO data usage
+ */
+struct vio_cmo {
+	spinlock_t lock;
+	struct delayed_work balance_q;
+	struct list_head device_list;
+	size_t entitled;
+	struct vio_cmo_pool reserve;
+	struct vio_cmo_pool excess;
+	size_t spare;
+	size_t min;
+	size_t desired;
+	size_t curr;
+	size_t high;
+} vio_cmo;
+
+/**
+ * vio_cmo_OF_devices - Count the number of OF devices that have DMA windows
+ */
+static int vio_cmo_num_OF_devs(void)
+{
+	struct device_node *node_vroot;
+	int count = 0;
+
+	/*
+	 * Count the number of vdevice entries with an
+	 * ibm,my-dma-window OF property
+	 */
+	node_vroot = of_find_node_by_name(NULL, "vdevice");
+	if (node_vroot) {
+		struct device_node *of_node;
+		struct property *prop;
+
+		for_each_child_of_node(node_vroot, of_node) {
+			prop = of_find_property(of_node, "ibm,my-dma-window",
+			                       NULL);
+			if (prop)
+				count++;
+		}
+	}
+	of_node_put(node_vroot);
+	return count;
+}
+
+/**
+ * vio_cmo_alloc - allocate IO memory for CMO-enable devices
+ *
+ * @viodev: VIO device requesting IO memory
+ * @size: size of allocation requested
+ *
+ * Allocations come from memory reserved for the devices and any excess
+ * IO memory available to all devices.  The spare pool used to service
+ * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be
+ * made available.
+ *
+ * Return codes:
+ *  0 for successful allocation and -ENOMEM for a failure
+ */
+static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size)
+{
+	unsigned long flags;
+	size_t reserve_free = 0;
+	size_t excess_free = 0;
+	int ret = -ENOMEM;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+
+	/* Determine the amount of free entitlement available in reserve */
+	if (viodev->cmo.entitled > viodev->cmo.allocated)
+		reserve_free = viodev->cmo.entitled - viodev->cmo.allocated;
+
+	/* If spare is not fulfilled, the excess pool can not be used. */
+	if (vio_cmo.spare >= VIO_CMO_MIN_ENT)
+		excess_free = vio_cmo.excess.free;
+
+	/* The request can be satisfied */
+	if ((reserve_free + excess_free) >= size) {
+		vio_cmo.curr += size;
+		if (vio_cmo.curr > vio_cmo.high)
+			vio_cmo.high = vio_cmo.curr;
+		viodev->cmo.allocated += size;
+		size -= min(reserve_free, size);
+		vio_cmo.excess.free -= size;
+		ret = 0;
+	}
+
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+	return ret;
+}
+
+/**
+ * vio_cmo_dealloc - deallocate IO memory from CMO-enable devices
+ * @viodev: VIO device freeing IO memory
+ * @size: size of deallocation
+ *
+ * IO memory is freed by the device back to the correct memory pools.
+ * The spare pool is replenished first from either memory pool, then
+ * the reserve pool is used to reduce device entitlement, the excess
+ * pool is used to increase the reserve pool toward the desired entitlement
+ * target, and then the remaining memory is returned to the pools.
+ *
+ */
+static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size)
+{
+	unsigned long flags;
+	size_t spare_needed = 0;
+	size_t excess_freed = 0;
+	size_t reserve_freed = size;
+	size_t tmp;
+	int balance = 0;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	vio_cmo.curr -= size;
+
+	/* Amount of memory freed from the excess pool */
+	if (viodev->cmo.allocated > viodev->cmo.entitled) {
+		excess_freed = min(reserve_freed, (viodev->cmo.allocated -
+		                                   viodev->cmo.entitled));
+		reserve_freed -= excess_freed;
+	}
+
+	/* Remove allocation from device */
+	viodev->cmo.allocated -= (reserve_freed + excess_freed);
+
+	/* Spare is a subset of the reserve pool, replenish it first. */
+	spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare;
+
+	/*
+	 * Replenish the spare in the reserve pool from the excess pool.
+	 * This moves entitlement into the reserve pool.
+	 */
+	if (spare_needed && excess_freed) {
+		tmp = min(excess_freed, spare_needed);
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.reserve.size += tmp;
+		vio_cmo.spare += tmp;
+		excess_freed -= tmp;
+		spare_needed -= tmp;
+		balance = 1;
+	}
+
+	/*
+	 * Replenish the spare in the reserve pool from the reserve pool.
+	 * This removes entitlement from the device down to VIO_CMO_MIN_ENT,
+	 * if needed, and gives it to the spare pool. The amount of used
+	 * memory in this pool does not change.
+	 */
+	if (spare_needed && reserve_freed) {
+		tmp = min(spare_needed, min(reserve_freed,
+		                            (viodev->cmo.entitled -
+		                             VIO_CMO_MIN_ENT)));
+
+		vio_cmo.spare += tmp;
+		viodev->cmo.entitled -= tmp;
+		reserve_freed -= tmp;
+		spare_needed -= tmp;
+		balance = 1;
+	}
+
+	/*
+	 * Increase the reserve pool until the desired allocation is met.
+	 * Move an allocation freed from the excess pool into the reserve
+	 * pool and schedule a balance operation.
+	 */
+	if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) {
+		tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size));
+
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.reserve.size += tmp;
+		excess_freed -= tmp;
+		balance = 1;
+	}
+
+	/* Return memory from the excess pool to that pool */
+	if (excess_freed)
+		vio_cmo.excess.free += excess_freed;
+
+	if (balance)
+		schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY);
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_entitlement_update - Manage system entitlement changes
+ *
+ * @new_entitlement: new system entitlement to attempt to accommodate
+ *
+ * Increases in entitlement will be used to fulfill the spare entitlement
+ * and the rest is given to the excess pool.  Decreases, if they are
+ * possible, come from the excess pool and from unused device entitlement
+ *
+ * Returns: 0 on success, -ENOMEM when change can not be made
+ */
+int vio_cmo_entitlement_update(size_t new_entitlement)
+{
+	struct vio_dev *viodev;
+	struct vio_cmo_dev_entry *dev_ent;
+	unsigned long flags;
+	size_t avail, delta, tmp;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+
+	/* Entitlement increases */
+	if (new_entitlement > vio_cmo.entitled) {
+		delta = new_entitlement - vio_cmo.entitled;
+
+		/* Fulfill spare allocation */
+		if (vio_cmo.spare < VIO_CMO_MIN_ENT) {
+			tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare));
+			vio_cmo.spare += tmp;
+			vio_cmo.reserve.size += tmp;
+			delta -= tmp;
+		}
+
+		/* Remaining new allocation goes to the excess pool */
+		vio_cmo.entitled += delta;
+		vio_cmo.excess.size += delta;
+		vio_cmo.excess.free += delta;
+
+		goto out;
+	}
+
+	/* Entitlement decreases */
+	delta = vio_cmo.entitled - new_entitlement;
+	avail = vio_cmo.excess.free;
+
+	/*
+	 * Need to check how much unused entitlement each device can
+	 * sacrifice to fulfill entitlement change.
+	 */
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+		if (avail >= delta)
+			break;
+
+		viodev = dev_ent->viodev;
+		if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+		    (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+				avail += viodev->cmo.entitled -
+				         max_t(size_t, viodev->cmo.allocated,
+				               VIO_CMO_MIN_ENT);
+	}
+
+	if (delta <= avail) {
+		vio_cmo.entitled -= delta;
+
+		/* Take entitlement from the excess pool first */
+		tmp = min(vio_cmo.excess.free, delta);
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.excess.free -= tmp;
+		delta -= tmp;
+
+		/*
+		 * Remove all but VIO_CMO_MIN_ENT bytes from devices
+		 * until entitlement change is served
+		 */
+		list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+			if (!delta)
+				break;
+
+			viodev = dev_ent->viodev;
+			tmp = 0;
+			if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+			    (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+				tmp = viodev->cmo.entitled -
+				      max_t(size_t, viodev->cmo.allocated,
+				            VIO_CMO_MIN_ENT);
+			viodev->cmo.entitled -= min(tmp, delta);
+			delta -= min(tmp, delta);
+		}
+	} else {
+		spin_unlock_irqrestore(&vio_cmo.lock, flags);
+		return -ENOMEM;
+	}
+
+out:
+	schedule_delayed_work(&vio_cmo.balance_q, 0);
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+	return 0;
+}
+
+/**
+ * vio_cmo_balance - Balance entitlement among devices
+ *
+ * @work: work queue structure for this operation
+ *
+ * Any system entitlement above the minimum needed for devices, or
+ * already allocated to devices, can be distributed to the devices.
+ * The list of devices is iterated through to recalculate the desired
+ * entitlement level and to determine how much entitlement above the
+ * minimum entitlement is allocated to devices.
+ *
+ * Small chunks of the available entitlement are given to devices until
+ * their requirements are fulfilled or there is no entitlement left to give.
+ * Upon completion sizes of the reserve and excess pools are calculated.
+ *
+ * The system minimum entitlement level is also recalculated here.
+ * Entitlement will be reserved for devices even after vio_bus_remove to
+ * accommodate reloading the driver.  The OF tree is walked to count the
+ * number of devices present and this will remove entitlement for devices
+ * that have actually left the system after having vio_bus_remove called.
+ */
+static void vio_cmo_balance(struct work_struct *work)
+{
+	struct vio_cmo *cmo;
+	struct vio_dev *viodev;
+	struct vio_cmo_dev_entry *dev_ent;
+	unsigned long flags;
+	size_t avail = 0, level, chunk, need;
+	int devcount = 0, fulfilled;
+
+	cmo = container_of(work, struct vio_cmo, balance_q.work);
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+
+	/* Calculate minimum entitlement and fulfill spare */
+	cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT;
+	BUG_ON(cmo->min > cmo->entitled);
+	cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min));
+	cmo->min += cmo->spare;
+	cmo->desired = cmo->min;
+
+	/*
+	 * Determine how much entitlement is available and reset device
+	 * entitlements
+	 */
+	avail = cmo->entitled - cmo->spare;
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+		viodev = dev_ent->viodev;
+		devcount++;
+		viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+		cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+		avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT);
+	}
+
+	/*
+	 * Having provided each device with the minimum entitlement, loop
+	 * over the devices portioning out the remaining entitlement
+	 * until there is nothing left.
+	 */
+	level = VIO_CMO_MIN_ENT;
+	while (avail) {
+		fulfilled = 0;
+		list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+			viodev = dev_ent->viodev;
+
+			if (viodev->cmo.desired <= level) {
+				fulfilled++;
+				continue;
+			}
+
+			/*
+			 * Give the device up to VIO_CMO_BALANCE_CHUNK
+			 * bytes of entitlement, but do not exceed the
+			 * desired level of entitlement for the device.
+			 */
+			chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK);
+			chunk = min(chunk, (viodev->cmo.desired -
+			                    viodev->cmo.entitled));
+			viodev->cmo.entitled += chunk;
+
+			/*
+			 * If the memory for this entitlement increase was
+			 * already allocated to the device it does not come
+			 * from the available pool being portioned out.
+			 */
+			need = max(viodev->cmo.allocated, viodev->cmo.entitled)-
+			       max(viodev->cmo.allocated, level);
+			avail -= need;
+
+		}
+		if (fulfilled == devcount)
+			break;
+		level += VIO_CMO_BALANCE_CHUNK;
+	}
+
+	/* Calculate new reserve and excess pool sizes */
+	cmo->reserve.size = cmo->min;
+	cmo->excess.free = 0;
+	cmo->excess.size = 0;
+	need = 0;
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+		viodev = dev_ent->viodev;
+		/* Calculated reserve size above the minimum entitlement */
+		if (viodev->cmo.entitled)
+			cmo->reserve.size += (viodev->cmo.entitled -
+			                      VIO_CMO_MIN_ENT);
+		/* Calculated used excess entitlement */
+		if (viodev->cmo.allocated > viodev->cmo.entitled)
+			need += viodev->cmo.allocated - viodev->cmo.entitled;
+	}
+	cmo->excess.size = cmo->entitled - cmo->reserve.size;
+	cmo->excess.free = cmo->excess.size - need;
+
+	cancel_delayed_work(container_of(work, struct delayed_work, work));
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
+                                          dma_addr_t *dma_handle, gfp_t flag)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	void *ret;
+
+	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+		atomic_inc(&viodev->cmo.allocs_failed);
+		return NULL;
+	}
+
+	ret = dma_iommu_ops.alloc_coherent(dev, size, dma_handle, flag);
+	if (unlikely(ret == NULL)) {
+		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+		atomic_inc(&viodev->cmo.allocs_failed);
+	}
+
+	return ret;
+}
+
+static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
+                                        void *vaddr, dma_addr_t dma_handle)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+
+	dma_iommu_ops.free_coherent(dev, size, vaddr, dma_handle);
+
+	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+}
+
+static dma_addr_t vio_dma_iommu_map_single(struct device *dev, void *vaddr,
+                                           size_t size,
+                                           enum dma_data_direction direction,
+                                           struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	dma_addr_t ret = DMA_ERROR_CODE;
+
+	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+		atomic_inc(&viodev->cmo.allocs_failed);
+		return ret;
+	}
+
+	ret = dma_iommu_ops.map_single(dev, vaddr, size, direction, attrs);
+	if (unlikely(dma_mapping_error(ret))) {
+		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+		atomic_inc(&viodev->cmo.allocs_failed);
+	}
+
+	return ret;
+}
+
+static void vio_dma_iommu_unmap_single(struct device *dev,
+		dma_addr_t dma_handle, size_t size,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+
+	dma_iommu_ops.unmap_single(dev, dma_handle, size, direction, attrs);
+
+	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+}
+
+static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
+                                int nelems, enum dma_data_direction direction,
+                                struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	struct scatterlist *sgl;
+	int ret, count = 0;
+	size_t alloc_size = 0;
+
+	for (sgl = sglist; count < nelems; count++, sgl++)
+		alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE);
+
+	if (vio_cmo_alloc(viodev, alloc_size)) {
+		atomic_inc(&viodev->cmo.allocs_failed);
+		return 0;
+	}
+
+	ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs);
+
+	if (unlikely(!ret)) {
+		vio_cmo_dealloc(viodev, alloc_size);
+		atomic_inc(&viodev->cmo.allocs_failed);
+	}
+
+	for (sgl = sglist, count = 0; count < ret; count++, sgl++)
+		alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+	if (alloc_size)
+		vio_cmo_dealloc(viodev, alloc_size);
+
+	return ret;
+}
+
+static void vio_dma_iommu_unmap_sg(struct device *dev,
+		struct scatterlist *sglist, int nelems,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	struct scatterlist *sgl;
+	size_t alloc_size = 0;
+	int count = 0;
+
+	for (sgl = sglist; count < nelems; count++, sgl++)
+		alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+
+	dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
+
+	vio_cmo_dealloc(viodev, alloc_size);
+}
+
+struct dma_mapping_ops vio_dma_mapping_ops = {
+	.alloc_coherent = vio_dma_iommu_alloc_coherent,
+	.free_coherent  = vio_dma_iommu_free_coherent,
+	.map_single     = vio_dma_iommu_map_single,
+	.unmap_single   = vio_dma_iommu_unmap_single,
+	.map_sg         = vio_dma_iommu_map_sg,
+	.unmap_sg       = vio_dma_iommu_unmap_sg,
+};
+
+/**
+ * vio_cmo_set_dev_desired - Set desired entitlement for a device
+ *
+ * @viodev: struct vio_dev for device to alter
+ * @new_desired: new desired entitlement level in bytes
+ *
+ * For use by devices to request a change to their entitlement at runtime or
+ * through sysfs.  The desired entitlement level is changed and a balancing
+ * of system resources is scheduled to run in the future.
+ */
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired)
+{
+	unsigned long flags;
+	struct vio_cmo_dev_entry *dev_ent;
+	int found = 0;
+
+	if (!firmware_has_feature(FW_FEATURE_CMO))
+		return;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	if (desired < VIO_CMO_MIN_ENT)
+		desired = VIO_CMO_MIN_ENT;
+
+	/*
+	 * Changes will not be made for devices not in the device list.
+	 * If it is not in the device list, then no driver is loaded
+	 * for the device and it can not receive entitlement.
+	 */
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+		if (viodev == dev_ent->viodev) {
+			found = 1;
+			break;
+		}
+	if (!found)
+		return;
+
+	/* Increase/decrease in desired device entitlement */
+	if (desired >= viodev->cmo.desired) {
+		/* Just bump the bus and device values prior to a balance*/
+		vio_cmo.desired += desired - viodev->cmo.desired;
+		viodev->cmo.desired = desired;
+	} else {
+		/* Decrease bus and device values for desired entitlement */
+		vio_cmo.desired -= viodev->cmo.desired - desired;
+		viodev->cmo.desired = desired;
+		/*
+		 * If less entitlement is desired than current entitlement, move
+		 * any reserve memory in the change region to the excess pool.
+		 */
+		if (viodev->cmo.entitled > desired) {
+			vio_cmo.reserve.size -= viodev->cmo.entitled - desired;
+			vio_cmo.excess.size += viodev->cmo.entitled - desired;
+			/*
+			 * If entitlement moving from the reserve pool to the
+			 * excess pool is currently unused, add to the excess
+			 * free counter.
+			 */
+			if (viodev->cmo.allocated < viodev->cmo.entitled)
+				vio_cmo.excess.free += viodev->cmo.entitled -
+				                       max(viodev->cmo.allocated, desired);
+			viodev->cmo.entitled = desired;
+		}
+	}
+	schedule_delayed_work(&vio_cmo.balance_q, 0);
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_bus_probe - Handle CMO specific bus probe activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Determine the devices IO memory entitlement needs, attempting
+ * to satisfy the system minimum entitlement at first and scheduling
+ * a balance operation to take care of the rest at a later time.
+ *
+ * Returns: 0 on success, -EINVAL when device doesn't support CMO, and
+ *          -ENOMEM when entitlement is not available for device or
+ *          device entry.
+ *
+ */
+static int vio_cmo_bus_probe(struct vio_dev *viodev)
+{
+	struct vio_cmo_dev_entry *dev_ent;
+	struct device *dev = &viodev->dev;
+	struct vio_driver *viodrv = to_vio_driver(dev->driver);
+	unsigned long flags;
+	size_t size;
+
+	/*
+	 * Check to see that device has a DMA window and configure
+	 * entitlement for the device.
+	 */
+	if (of_get_property(viodev->dev.archdata.of_node,
+	                    "ibm,my-dma-window", NULL)) {
+		/* Check that the driver is CMO enabled and get desired DMA */
+		if (!viodrv->get_desired_dma) {
+			dev_err(dev, "%s: device driver does not support CMO\n",
+			        __func__);
+			return -EINVAL;
+		}
+
+		viodev->cmo.desired = IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev));
+		if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
+			viodev->cmo.desired = VIO_CMO_MIN_ENT;
+		size = VIO_CMO_MIN_ENT;
+
+		dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry),
+		                  GFP_KERNEL);
+		if (!dev_ent)
+			return -ENOMEM;
+
+		dev_ent->viodev = viodev;
+		spin_lock_irqsave(&vio_cmo.lock, flags);
+		list_add(&dev_ent->list, &vio_cmo.device_list);
+	} else {
+		viodev->cmo.desired = 0;
+		size = 0;
+		spin_lock_irqsave(&vio_cmo.lock, flags);
+	}
+
+	/*
+	 * If the needs for vio_cmo.min have not changed since they
+	 * were last set, the number of devices in the OF tree has
+	 * been constant and the IO memory for this is already in
+	 * the reserve pool.
+	 */
+	if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) *
+	                    VIO_CMO_MIN_ENT)) {
+		/* Updated desired entitlement if device requires it */
+		if (size)
+			vio_cmo.desired += (viodev->cmo.desired -
+		                        VIO_CMO_MIN_ENT);
+	} else {
+		size_t tmp;
+
+		tmp = vio_cmo.spare + vio_cmo.excess.free;
+		if (tmp < size) {
+			dev_err(dev, "%s: insufficient free "
+			        "entitlement to add device. "
+			        "Need %lu, have %lu\n", __func__,
+				size, (vio_cmo.spare + tmp));
+			spin_unlock_irqrestore(&vio_cmo.lock, flags);
+			return -ENOMEM;
+		}
+
+		/* Use excess pool first to fulfill request */
+		tmp = min(size, vio_cmo.excess.free);
+		vio_cmo.excess.free -= tmp;
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.reserve.size += tmp;
+
+		/* Use spare if excess pool was insufficient */
+		vio_cmo.spare -= size - tmp;
+
+		/* Update bus accounting */
+		vio_cmo.min += size;
+		vio_cmo.desired += viodev->cmo.desired;
+	}
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+	return 0;
+}
+
+/**
+ * vio_cmo_bus_remove - Handle CMO specific bus removal activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Remove the device from the cmo device list.  The minimum entitlement
+ * will be reserved for the device as long as it is in the system.  The
+ * rest of the entitlement the device had been allocated will be returned
+ * to the system.
+ */
+static void vio_cmo_bus_remove(struct vio_dev *viodev)
+{
+	struct vio_cmo_dev_entry *dev_ent;
+	unsigned long flags;
+	size_t tmp;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	if (viodev->cmo.allocated) {
+		dev_err(&viodev->dev, "%s: device had %lu bytes of IO "
+		        "allocated after remove operation.\n",
+		        __func__, viodev->cmo.allocated);
+		BUG();
+	}
+
+	/*
+	 * Remove the device from the device list being maintained for
+	 * CMO enabled devices.
+	 */
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+		if (viodev == dev_ent->viodev) {
+			list_del(&dev_ent->list);
+			kfree(dev_ent);
+			break;
+		}
+
+	/*
+	 * Devices may not require any entitlement and they do not need
+	 * to be processed.  Otherwise, return the device's entitlement
+	 * back to the pools.
+	 */
+	if (viodev->cmo.entitled) {
+		/*
+		 * This device has not yet left the OF tree, it's
+		 * minimum entitlement remains in vio_cmo.min and
+		 * vio_cmo.desired
+		 */
+		vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+
+		/*
+		 * Save min allocation for device in reserve as long
+		 * as it exists in OF tree as determined by later
+		 * balance operation
+		 */
+		viodev->cmo.entitled -= VIO_CMO_MIN_ENT;
+
+		/* Replenish spare from freed reserve pool */
+		if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) {
+			tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT -
+			                                 vio_cmo.spare));
+			vio_cmo.spare += tmp;
+			viodev->cmo.entitled -= tmp;
+		}
+
+		/* Remaining reserve goes to excess pool */
+		vio_cmo.excess.size += viodev->cmo.entitled;
+		vio_cmo.excess.free += viodev->cmo.entitled;
+		vio_cmo.reserve.size -= viodev->cmo.entitled;
+
+		/*
+		 * Until the device is removed it will keep a
+		 * minimum entitlement; this will guarantee that
+		 * a module unload/load will result in a success.
+		 */
+		viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+		viodev->cmo.desired = VIO_CMO_MIN_ENT;
+		atomic_set(&viodev->cmo.allocs_failed, 0);
+	}
+
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev)
+{
+	vio_dma_mapping_ops.dma_supported = dma_iommu_ops.dma_supported;
+	viodev->dev.archdata.dma_ops = &vio_dma_mapping_ops;
+}
+
+/**
+ * vio_cmo_bus_init - CMO entitlement initialization at bus init time
+ *
+ * Set up the reserve and excess entitlement pools based on available
+ * system entitlement and the number of devices in the OF tree that
+ * require entitlement in the reserve pool.
+ */
+static void vio_cmo_bus_init(void)
+{
+	struct hvcall_mpp_data mpp_data;
+	int err;
+
+	memset(&vio_cmo, 0, sizeof(struct vio_cmo));
+	spin_lock_init(&vio_cmo.lock);
+	INIT_LIST_HEAD(&vio_cmo.device_list);
+	INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance);
+
+	/* Get current system entitlement */
+	err = h_get_mpp(&mpp_data);
+
+	/*
+	 * On failure, continue with entitlement set to 0, will panic()
+	 * later when spare is reserved.
+	 */
+	if (err != H_SUCCESS) {
+		printk(KERN_ERR "%s: unable to determine system IO "\
+		       "entitlement. (%d)\n", __func__, err);
+		vio_cmo.entitled = 0;
+	} else {
+		vio_cmo.entitled = mpp_data.entitled_mem;
+	}
+
+	/* Set reservation and check against entitlement */
+	vio_cmo.spare = VIO_CMO_MIN_ENT;
+	vio_cmo.reserve.size = vio_cmo.spare;
+	vio_cmo.reserve.size += (vio_cmo_num_OF_devs() *
+	                         VIO_CMO_MIN_ENT);
+	if (vio_cmo.reserve.size > vio_cmo.entitled) {
+		printk(KERN_ERR "%s: insufficient system entitlement\n",
+		       __func__);
+		panic("%s: Insufficient system entitlement", __func__);
+	}
+
+	/* Set the remaining accounting variables */
+	vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size;
+	vio_cmo.excess.free = vio_cmo.excess.size;
+	vio_cmo.min = vio_cmo.reserve.size;
+	vio_cmo.desired = vio_cmo.reserve.size;
+}
+
+/* sysfs device functions and data structures for CMO */
+
+#define viodev_cmo_rd_attr(name)                                        \
+static ssize_t viodev_cmo_##name##_show(struct device *dev,             \
+                                        struct device_attribute *attr,  \
+                                         char *buf)                     \
+{                                                                       \
+	return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.name);        \
+}
+
+static ssize_t viodev_cmo_allocs_failed_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed));
+}
+
+static ssize_t viodev_cmo_allocs_failed_reset(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	atomic_set(&viodev->cmo.allocs_failed, 0);
+	return count;
+}
+
+static ssize_t viodev_cmo_desired_set(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	size_t new_desired;
+	int ret;
+
+	ret = strict_strtoul(buf, 10, &new_desired);
+	if (ret)
+		return ret;
+
+	vio_cmo_set_dev_desired(viodev, new_desired);
+	return count;
+}
+
+viodev_cmo_rd_attr(desired);
+viodev_cmo_rd_attr(entitled);
+viodev_cmo_rd_attr(allocated);
+
+static ssize_t name_show(struct device *, struct device_attribute *, char *);
+static ssize_t devspec_show(struct device *, struct device_attribute *, char *);
+static struct device_attribute vio_cmo_dev_attrs[] = {
+	__ATTR_RO(name),
+	__ATTR_RO(devspec),
+	__ATTR(cmo_desired,       S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+	       viodev_cmo_desired_show, viodev_cmo_desired_set),
+	__ATTR(cmo_entitled,      S_IRUGO, viodev_cmo_entitled_show,      NULL),
+	__ATTR(cmo_allocated,     S_IRUGO, viodev_cmo_allocated_show,     NULL),
+	__ATTR(cmo_allocs_failed, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+	       viodev_cmo_allocs_failed_show, viodev_cmo_allocs_failed_reset),
+	__ATTR_NULL
+};
+
+/* sysfs bus functions and data structures for CMO */
+
+#define viobus_cmo_rd_attr(name)                                        \
+static ssize_t                                                          \
+viobus_cmo_##name##_show(struct bus_type *bt, char *buf)                \
+{                                                                       \
+	return sprintf(buf, "%lu\n", vio_cmo.name);                     \
+}
+
+#define viobus_cmo_pool_rd_attr(name, var)                              \
+static ssize_t                                                          \
+viobus_cmo_##name##_pool_show_##var(struct bus_type *bt, char *buf)     \
+{                                                                       \
+	return sprintf(buf, "%lu\n", vio_cmo.name.var);                 \
+}
+
+static ssize_t viobus_cmo_high_reset(struct bus_type *bt, const char *buf,
+                                     size_t count)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	vio_cmo.high = vio_cmo.curr;
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+
+	return count;
+}
+
+viobus_cmo_rd_attr(entitled);
+viobus_cmo_pool_rd_attr(reserve, size);
+viobus_cmo_pool_rd_attr(excess, size);
+viobus_cmo_pool_rd_attr(excess, free);
+viobus_cmo_rd_attr(spare);
+viobus_cmo_rd_attr(min);
+viobus_cmo_rd_attr(desired);
+viobus_cmo_rd_attr(curr);
+viobus_cmo_rd_attr(high);
+
+static struct bus_attribute vio_cmo_bus_attrs[] = {
+	__ATTR(cmo_entitled, S_IRUGO, viobus_cmo_entitled_show, NULL),
+	__ATTR(cmo_reserve_size, S_IRUGO, viobus_cmo_reserve_pool_show_size, NULL),
+	__ATTR(cmo_excess_size, S_IRUGO, viobus_cmo_excess_pool_show_size, NULL),
+	__ATTR(cmo_excess_free, S_IRUGO, viobus_cmo_excess_pool_show_free, NULL),
+	__ATTR(cmo_spare,   S_IRUGO, viobus_cmo_spare_show,   NULL),
+	__ATTR(cmo_min,     S_IRUGO, viobus_cmo_min_show,     NULL),
+	__ATTR(cmo_desired, S_IRUGO, viobus_cmo_desired_show, NULL),
+	__ATTR(cmo_curr,    S_IRUGO, viobus_cmo_curr_show,    NULL),
+	__ATTR(cmo_high,    S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+	       viobus_cmo_high_show, viobus_cmo_high_reset),
+	__ATTR_NULL
+};
+
+static void vio_cmo_sysfs_init(void)
+{
+	vio_bus_type.dev_attrs = vio_cmo_dev_attrs;
+	vio_bus_type.bus_attrs = vio_cmo_bus_attrs;
+}
+#else /* CONFIG_PPC_SMLPAR */
+/* Dummy functions for iSeries platform */
+int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
+static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; }
+static void vio_cmo_bus_remove(struct vio_dev *viodev) {}
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {}
+static void vio_cmo_bus_init() {}
+static void vio_cmo_sysfs_init() { }
+#endif /* CONFIG_PPC_SMLPAR */
+EXPORT_SYMBOL(vio_cmo_entitlement_update);
+EXPORT_SYMBOL(vio_cmo_set_dev_desired);
+
 static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 {
 	const unsigned char *dma_window;
@@ -114,8 +1105,17 @@ static int vio_bus_probe(struct device *dev)
 		return error;
 
 	id = vio_match_device(viodrv->id_table, viodev);
-	if (id)
+	if (id) {
+		memset(&viodev->cmo, 0, sizeof(viodev->cmo));
+		if (firmware_has_feature(FW_FEATURE_CMO)) {
+			error = vio_cmo_bus_probe(viodev);
+			if (error)
+				return error;
+		}
 		error = viodrv->probe(viodev, id);
+		if (error)
+			vio_cmo_bus_remove(viodev);
+	}
 
 	return error;
 }
@@ -125,12 +1125,23 @@ static int vio_bus_remove(struct device *dev)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
 	struct vio_driver *viodrv = to_vio_driver(dev->driver);
+	struct device *devptr;
+	int ret = 1;
+
+	/*
+	 * Hold a reference to the device after the remove function is called
+	 * to allow for CMO accounting cleanup for the device.
+	 */
+	devptr = get_device(dev);
 
 	if (viodrv->remove)
-		return viodrv->remove(viodev);
+		ret = viodrv->remove(viodev);
+
+	if (!ret && firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_bus_remove(viodev);
 
-	/* driver can't remove */
-	return 1;
+	put_device(devptr);
+	return ret;
 }
 
 /**
@@ -215,7 +1226,11 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
 			viodev->unit_address = *unit_address;
 	}
 	viodev->dev.archdata.of_node = of_node_get(of_node);
-	viodev->dev.archdata.dma_ops = &dma_iommu_ops;
+
+	if (firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_set_dma_ops(viodev);
+	else
+		viodev->dev.archdata.dma_ops = &dma_iommu_ops;
 	viodev->dev.archdata.dma_data = vio_build_iommu_table(viodev);
 	viodev->dev.archdata.numa_node = of_node_to_nid(of_node);
 
@@ -245,6 +1260,9 @@ static int __init vio_bus_init(void)
 	int err;
 	struct device_node *node_vroot;
 
+	if (firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_sysfs_init();
+
 	err = bus_register(&vio_bus_type);
 	if (err) {
 		printk(KERN_ERR "failed to register VIO bus\n");
@@ -262,6 +1280,9 @@ static int __init vio_bus_init(void)
 		return err;
 	}
 
+	if (firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_bus_init();
+
 	node_vroot = of_find_node_by_name(NULL, "vdevice");
 	if (node_vroot) {
 		struct device_node *of_node;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index a914411bced..4a8ce62fe11 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -85,7 +85,7 @@ SECTIONS
 
 	/* The dummy segment contents for the bug workaround mentioned above
 	   near PHDRS.  */
-	.dummy : {
+	.dummy : AT(ADDR(.dummy) - LOAD_OFFSET) {
 		LONG(0xf177)
 	} :kernel :dummy