From 2ef9481e666b4654159ac9f847e6963809e3c470 Mon Sep 17 00:00:00 2001
From: Jon Mason <jdmason@us.ibm.com>
Date: Mon, 23 Jan 2006 10:58:20 -0600
Subject: [PATCH] powerpc: trivial: modify comments to refer to new location of
 files

This patch removes all self references and fixes references to files
in the now defunct arch/ppc64 tree.  I think this accomplises
everything wanted, though there might be a few references I missed.

Signed-off-by: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 685c25175d9..3e376202dd4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -958,7 +958,7 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
 	 *
 	 * i386     no
 	 * x86_64   no
-	 * ppc64    yes (see arch/ppc64/kernel/misc.S)
+	 * ppc64    yes (see arch/powerpc/platforms/iseries/misc.S)
 	 *
 	 * This also happens with vm86 emulation in a non-nested manner
 	 * (entries without exits), so this case must be caught.
-- 
cgit v1.2.3


From 1fa44ecad2b86475e038aed81b0bf333fa484f8b Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@steeleye.com>
Date: Thu, 23 Feb 2006 12:43:43 -0600
Subject: [SCSI] add execute_in_process_context() API

We have several points in the SCSI stack (primarily for our device
functions) where we need to guarantee process context, but (given the
place where the last reference was released) we cannot guarantee this.

This API gets around the issue by executing the function directly if
the caller has process context, but scheduling a workqueue to execute
in process context if the caller doesn't have it.

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 kernel/workqueue.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b052e2c4c71..e9e464a9037 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -27,6 +27,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/kthread.h>
+#include <linux/hardirq.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -476,6 +477,34 @@ void cancel_rearming_delayed_work(struct work_struct *work)
 }
 EXPORT_SYMBOL(cancel_rearming_delayed_work);
 
+/**
+ * execute_in_process_context - reliably execute the routine with user context
+ * @fn:		the function to execute
+ * @data:	data to pass to the function
+ * @ew:		guaranteed storage for the execute work structure (must
+ *		be available when the work executes)
+ *
+ * Executes the function immediately if process context is available,
+ * otherwise schedules the function for delayed execution.
+ *
+ * Returns:	0 - function was executed
+ *		1 - function was scheduled for execution
+ */
+int execute_in_process_context(void (*fn)(void *data), void *data,
+			       struct execute_work *ew)
+{
+	if (!in_interrupt()) {
+		fn(data);
+		return 0;
+	}
+
+	INIT_WORK(&ew->work, fn, data);
+	schedule_work(&ew->work);
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(execute_in_process_context);
+
 int keventd_up(void)
 {
 	return keventd_wq != NULL;
-- 
cgit v1.2.3


From 51107301b629640f9ab76fe23bf385e187b9ac29 Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Wed, 15 Mar 2006 08:28:55 -0500
Subject: [PATCH] kobject: fix build error if CONFIG_SYSFS=n

Moving uevent_seqnum and uevent_helper to kobject_uevent.c
because they are used even if CONFIG_SYSFS=n
while kernel/ksysfs.c is built only if CONFIG_SYSFS=y,

Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d5eeae0fa5b..f2690ed7453 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -15,9 +15,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 
-u64 uevent_seqnum;
-char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
-
 #define KERNEL_ATTR_RO(_name) \
 static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
 
-- 
cgit v1.2.3


From 3fd6805f4dfb02bcfb5634972eabad0e790f119a Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 8 Feb 2006 21:16:45 +0100
Subject: [PATCH] Clean up module.c symbol searching logic

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/module.c | 73 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 41 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5aad477ddc7..2a892b20d68 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -135,6 +135,18 @@ extern const unsigned long __start___kcrctab_gpl[];
 #define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL)
 #endif
 
+/* lookup symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_symbol(const char *name,
+	const struct kernel_symbol *start,
+	const struct kernel_symbol *stop)
+{
+	const struct kernel_symbol *ks = start;
+	for (; ks < stop; ks++)
+		if (strcmp(ks->name, name) == 0)
+			return ks;
+	return NULL;
+}
+
 /* Find a symbol, return value, crc and module which owns it */
 static unsigned long __find_symbol(const char *name,
 				   struct module **owner,
@@ -142,39 +154,41 @@ static unsigned long __find_symbol(const char *name,
 				   int gplok)
 {
 	struct module *mod;
-	unsigned int i;
+	const struct kernel_symbol *ks;
 
 	/* Core kernel first. */ 
 	*owner = NULL;
-	for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) {
-		if (strcmp(__start___ksymtab[i].name, name) == 0) {
-			*crc = symversion(__start___kcrctab, i);
-			return __start___ksymtab[i].value;
-		}
+	ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
+	if (ks) {
+		*crc = symversion(__start___kcrctab, (ks - __start___ksymtab));
+		return ks->value;
 	}
 	if (gplok) {
-		for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++)
-			if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) {
-				*crc = symversion(__start___kcrctab_gpl, i);
-				return __start___ksymtab_gpl[i].value;
-			}
+		ks = lookup_symbol(name, __start___ksymtab_gpl,
+					 __stop___ksymtab_gpl);
+		if (ks) {
+			*crc = symversion(__start___kcrctab_gpl,
+					  (ks - __start___ksymtab_gpl));
+			return ks->value;
+		}
 	}
 
 	/* Now try modules. */ 
 	list_for_each_entry(mod, &modules, list) {
 		*owner = mod;
-		for (i = 0; i < mod->num_syms; i++)
-			if (strcmp(mod->syms[i].name, name) == 0) {
-				*crc = symversion(mod->crcs, i);
-				return mod->syms[i].value;
-			}
+		ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
+		if (ks) {
+			*crc = symversion(mod->crcs, (ks - mod->syms));
+			return ks->value;
+		}
 
 		if (gplok) {
-			for (i = 0; i < mod->num_gpl_syms; i++) {
-				if (strcmp(mod->gpl_syms[i].name, name) == 0) {
-					*crc = symversion(mod->gpl_crcs, i);
-					return mod->gpl_syms[i].value;
-				}
+			ks = lookup_symbol(name, mod->gpl_syms,
+					   mod->gpl_syms + mod->num_gpl_syms);
+			if (ks) {
+				*crc = symversion(mod->gpl_crcs,
+						  (ks - mod->gpl_syms));
+				return ks->value;
 			}
 		}
 	}
@@ -1444,18 +1458,13 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
 #ifdef CONFIG_KALLSYMS
 int is_exported(const char *name, const struct module *mod)
 {
-	unsigned int i;
-
-	if (!mod) {
-		for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++)
-			if (strcmp(__start___ksymtab[i].name, name) == 0)
-				return 1;
-		return 0;
-	}
-	for (i = 0; i < mod->num_syms; i++)
-		if (strcmp(mod->syms[i].name, name) == 0)
+	if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
+		return 1;
+	else
+		if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
 			return 1;
-	return 0;
+		else
+			return 0;
 }
 
 /* As per nm */
-- 
cgit v1.2.3


From 9f28bb7e1d0188a993403ab39b774785892805e1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Mar 2006 13:17:13 -0800
Subject: [PATCH] add EXPORT_SYMBOL_GPL_FUTURE()

This patch adds the ability to mark symbols that will be changed in the
future, so that kernel modules that don't include MODULE_LICENSE("GPL")
and use the symbols, will be flagged and printed out to the system log.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/module.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 2a892b20d68..5ca99fbe9f4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -126,8 +126,11 @@ extern const struct kernel_symbol __start___ksymtab[];
 extern const struct kernel_symbol __stop___ksymtab[];
 extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const struct kernel_symbol __start___ksymtab_gpl_future[];
+extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
 extern const unsigned long __start___kcrctab[];
 extern const unsigned long __start___kcrctab_gpl[];
+extern const unsigned long __start___kcrctab_gpl_future[];
 
 #ifndef CONFIG_MODVERSIONS
 #define symversion(base, idx) NULL
@@ -172,6 +175,22 @@ static unsigned long __find_symbol(const char *name,
 			return ks->value;
 		}
 	}
+	ks = lookup_symbol(name, __start___ksymtab_gpl_future,
+				 __stop___ksymtab_gpl_future);
+	if (ks) {
+		if (!gplok) {
+			printk(KERN_WARNING "Symbol %s is being used "
+			       "by a non-GPL module, which will not "
+			       "be allowed in the future\n", name);
+			printk(KERN_WARNING "Please see the file "
+			       "Documentation/feature-removal-schedule.txt "
+			       "in the kernel source tree for more "
+			       "details.\n");
+		}
+		*crc = symversion(__start___kcrctab_gpl_future,
+				  (ks - __start___ksymtab_gpl_future));
+		return ks->value;
+	}
 
 	/* Now try modules. */ 
 	list_for_each_entry(mod, &modules, list) {
@@ -191,6 +210,23 @@ static unsigned long __find_symbol(const char *name,
 				return ks->value;
 			}
 		}
+		ks = lookup_symbol(name, mod->gpl_future_syms,
+				   (mod->gpl_future_syms +
+				    mod->num_gpl_future_syms));
+		if (ks) {
+			if (!gplok) {
+				printk(KERN_WARNING "Symbol %s is being used "
+				       "by a non-GPL module, which will not "
+				       "be allowed in the future\n", name);
+				printk(KERN_WARNING "Please see the file "
+				       "Documentation/feature-removal-schedule.txt "
+				       "in the kernel source tree for more "
+				       "details.\n");
+			}
+			*crc = symversion(mod->gpl_future_crcs,
+					  (ks - mod->gpl_future_syms));
+			return ks->value;
+		}
 	}
 	DEBUGP("Failed to find symbol %s\n", name);
  	return 0;
@@ -1546,7 +1582,8 @@ static struct module *load_module(void __user *umod,
 	char *secstrings, *args, *modmagic, *strtab = NULL;
 	unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
 		exportindex, modindex, obsparmindex, infoindex, gplindex,
-		crcindex, gplcrcindex, versindex, pcpuindex;
+		crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
+		gplfuturecrcindex;
 	long arglen;
 	struct module *mod;
 	long err = 0;
@@ -1627,8 +1664,10 @@ static struct module *load_module(void __user *umod,
 	/* Optional sections */
 	exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
 	gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
+	gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
 	crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
 	gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
+	gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
 	setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
 	exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
 	obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
@@ -1784,10 +1823,16 @@ static struct module *load_module(void __user *umod,
 	mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
 	if (gplcrcindex)
 		mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
+	mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
+					sizeof(*mod->gpl_future_syms);
+	mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
+	if (gplfuturecrcindex)
+		mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
 
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !crcindex) || 
-	    (mod->num_gpl_syms && !gplcrcindex)) {
+	    (mod->num_gpl_syms && !gplcrcindex) ||
+	    (mod->num_gpl_future_syms && !gplfuturecrcindex)) {
 		printk(KERN_WARNING "%s: No versions for exported symbols."
 		       " Tainting kernel.\n", mod->name);
 		add_taint(TAINT_FORCED_MODULE);
-- 
cgit v1.2.3


From 01ca70dca5c64cb774a8ac2f50bddff21d60169f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Mar 2006 13:17:13 -0800
Subject: [PATCH] add EXPORT_SYMBOL_GPL_FUTURE() to RCU subsystem

As the RCU symbols are going to be changed to GPL in the near future,
lets warn users that this is going to happen.

Cc: Paul McKenney <paulmck@us.ibm.com>
Acked-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/rcupdate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 8cf15a569fc..fedf5e36975 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -609,7 +609,7 @@ module_param(qlowmark, int, 0);
 module_param(rsinterval, int, 0);
 #endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
-EXPORT_SYMBOL(call_rcu);  /* WARNING: GPL-only in April 2006. */
-EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL_FUTURE(call_rcu);	/* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh);	/* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL_GPL(synchronize_rcu);
-EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
-- 
cgit v1.2.3


From 03e88ae1b13dfdc8bbaa59b8198e1ca53aad12ac Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 16 Feb 2006 13:50:23 -0800
Subject: [PATCH] fix module sysfs files reference counting

The module files, refcnt, version, and srcversion did not properly
increment the owner's module reference count, allowing the modules to
be removed while the files were open, causing oopses.

This patch fixes this, and also fixes the problem that the version and
srcversion files were not showing up, unless CONFIG_MODULE_UNLOAD was
enabled, which is not correct.

Cc: Nathan Lynch <ntl@pobox.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/module.c | 77 +++++++++++++++++++++++----------------------------------
 kernel/params.c | 10 --------
 2 files changed, 31 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5ca99fbe9f4..77764f22f02 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -429,7 +429,6 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
 }
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_MODULE_UNLOAD
 #define MODINFO_ATTR(field)	\
 static void setup_modinfo_##field(struct module *mod, const char *s)  \
 {                                                                     \
@@ -461,12 +460,7 @@ static struct module_attribute modinfo_##field = {                    \
 MODINFO_ATTR(version);
 MODINFO_ATTR(srcversion);
 
-static struct module_attribute *modinfo_attrs[] = {
-	&modinfo_version,
-	&modinfo_srcversion,
-	NULL,
-};
-
+#ifdef CONFIG_MODULE_UNLOAD
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
@@ -781,6 +775,15 @@ static inline void module_unload_init(struct module *mod)
 }
 #endif /* CONFIG_MODULE_UNLOAD */
 
+static struct module_attribute *modinfo_attrs[] = {
+	&modinfo_version,
+	&modinfo_srcversion,
+#ifdef CONFIG_MODULE_UNLOAD
+	&refcnt,
+#endif
+	NULL,
+};
+
 #ifdef CONFIG_OBSOLETE_MODPARM
 /* Bounds checking done below */
 static int obsparm_copy_string(const char *val, struct kernel_param *kp)
@@ -1106,37 +1109,28 @@ static inline void remove_sect_attrs(struct module *mod)
 }
 #endif /* CONFIG_KALLSYMS */
 
-
-#ifdef CONFIG_MODULE_UNLOAD
-static inline int module_add_refcnt_attr(struct module *mod)
-{
-	return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
-}
-static void module_remove_refcnt_attr(struct module *mod)
-{
-	return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
-}
-#else
-static inline int module_add_refcnt_attr(struct module *mod)
-{
-	return 0;
-}
-static void module_remove_refcnt_attr(struct module *mod)
-{
-}
-#endif
-
-#ifdef CONFIG_MODULE_UNLOAD
 static int module_add_modinfo_attrs(struct module *mod)
 {
 	struct module_attribute *attr;
+	struct module_attribute *temp_attr;
 	int error = 0;
 	int i;
 
+	mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+					(ARRAY_SIZE(modinfo_attrs) + 1)),
+					GFP_KERNEL);
+	if (!mod->modinfo_attrs)
+		return -ENOMEM;
+
+	temp_attr = mod->modinfo_attrs;
 	for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
 		if (!attr->test ||
-		    (attr->test && attr->test(mod)))
-			error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr);
+		    (attr->test && attr->test(mod))) {
+			memcpy(temp_attr, attr, sizeof(*temp_attr));
+			temp_attr->attr.owner = mod;
+			error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
+			++temp_attr;
+		}
 	}
 	return error;
 }
@@ -1146,12 +1140,16 @@ static void module_remove_modinfo_attrs(struct module *mod)
 	struct module_attribute *attr;
 	int i;
 
-	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+	for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+		/* pick a field to test for end of list */
+		if (!attr->attr.name)
+			break;
 		sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
-		attr->free(mod);
+		if (attr->free)
+			attr->free(mod);
 	}
+	kfree(mod->modinfo_attrs);
 }
-#endif
 
 static int mod_sysfs_setup(struct module *mod,
 			   struct kernel_param *kparam,
@@ -1169,19 +1167,13 @@ static int mod_sysfs_setup(struct module *mod,
 	if (err)
 		goto out;
 
-	err = module_add_refcnt_attr(mod);
-	if (err)
-		goto out_unreg;
-
 	err = module_param_sysfs_setup(mod, kparam, num_params);
 	if (err)
 		goto out_unreg;
 
-#ifdef CONFIG_MODULE_UNLOAD
 	err = module_add_modinfo_attrs(mod);
 	if (err)
 		goto out_unreg;
-#endif
 
 	return 0;
 
@@ -1193,10 +1185,7 @@ out:
 
 static void mod_kobject_remove(struct module *mod)
 {
-#ifdef CONFIG_MODULE_UNLOAD
 	module_remove_modinfo_attrs(mod);
-#endif
-	module_remove_refcnt_attr(mod);
 	module_param_sysfs_remove(mod);
 
 	kobject_unregister(&mod->mkobj.kobj);
@@ -1474,7 +1463,6 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
 	return NULL;
 }
 
-#ifdef CONFIG_MODULE_UNLOAD
 static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
 			  unsigned int infoindex)
 {
@@ -1489,7 +1477,6 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
 						attr->attr.name));
 	}
 }
-#endif
 
 #ifdef CONFIG_KALLSYMS
 int is_exported(const char *name, const struct module *mod)
@@ -1803,10 +1790,8 @@ static struct module *load_module(void __user *umod,
 	if (strcmp(mod->name, "driverloader") == 0)
 		add_taint(TAINT_PROPRIETARY_MODULE);
 
-#ifdef CONFIG_MODULE_UNLOAD
 	/* Set up MODINFO_ATTR fields */
 	setup_modinfo(mod, sechdrs, infoindex);
-#endif
 
 	/* Fix up syms, so that st_value is a pointer to location. */
 	err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
diff --git a/kernel/params.c b/kernel/params.c
index c76ad25e6a2..a2915058231 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -638,13 +638,8 @@ static ssize_t module_attr_show(struct kobject *kobj,
 	if (!attribute->show)
 		return -EIO;
 
-	if (!try_module_get(mk->mod))
-		return -ENODEV;
-
 	ret = attribute->show(attribute, mk->mod, buf);
 
-	module_put(mk->mod);
-
 	return ret;
 }
 
@@ -662,13 +657,8 @@ static ssize_t module_attr_store(struct kobject *kobj,
 	if (!attribute->store)
 		return -EIO;
 
-	if (!try_module_get(mk->mod))
-		return -ENODEV;
-
 	ret = attribute->store(attribute, mk->mod, buf, len);
 
-	module_put(mk->mod);
-
 	return ret;
 }
 
-- 
cgit v1.2.3


From 9430d58e34ec3861e1ca72f8e49105b227aad327 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 22 Mar 2006 00:07:33 -0800
Subject: [PATCH] sched: remove sleep_avg multiplier

Remove the sleep_avg multiplier.  This multiplier was necessary back when
we had 10 seconds of dynamic range in sleep_avg, but now that we only have
one second, it causes that one second to be compressed down to 100ms in
some cases.  This is particularly noticeable when compiling a kernel in a
slow NFS mount, and I believe it to be a very likely candidate for other
recently reported network related interactivity problems.

In testing, I can detect no negative impact of this removal.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4d46e90f59c..6b6e0d70eb3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -706,12 +706,6 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
 				p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
 						DEF_TIMESLICE);
 		} else {
-			/*
-			 * The lower the sleep avg a task has the more
-			 * rapidly it will rise with sleep time.
-			 */
-			sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
-
 			/*
 			 * Tasks waking from uninterruptible sleep are
 			 * limited in their sleep_avg rise as they
-- 
cgit v1.2.3


From 06f9d4f94a075285d25253edbf57f2cda07d4ff3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 22 Mar 2006 00:07:40 -0800
Subject: [PATCH] unshare: Error if passed unsupported flags

A bare bones trivial patch to ensure we always get -EINVAL on the
unsupported cases for sys_unshare.  If this goes in before 2.6.16 it allows
us to forward compatible with future applications using sys_unshare.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: JANAK DESAI <janak@us.ibm.com>
Cc: <stable@kerenl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index b373322ca49..9bd7b65ee41 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1534,6 +1534,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 
 	check_unshare_flags(&unshare_flags);
 
+	/* Return -EINVAL for all unsupported flags */
+	err = -EINVAL;
+	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
+		goto bad_unshare_out;
+
 	if ((err = unshare_thread(unshare_flags)))
 		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
-- 
cgit v1.2.3


From 78eef01b0fae087c5fadbd85dd4fe2918c3a015f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:16 -0800
Subject: [PATCH] on_each_cpu(): disable local interrupts

When on_each_cpu() runs the callback on other CPUs, it runs with local
interrupts disabled.  So we should run the function with local interrupts
disabled on this CPU, too.

And do the same for UP, so the callback is run in the same environment on both
UP and SMP.  (strictly it should do preempt_disable() too, but I think
local_irq_disable is sufficiently equivalent).

Also uninlines on_each_cpu().  softirq.c was the most appropriate file I could
find, but it doesn't seem to justify creating a new file.

Oh, and fix up that comment over (under?) x86's smp_call_function().  It
drives me nuts.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/softirq.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index ad3295cdded..ec8fed42a86 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/kthread.h>
 #include <linux/rcupdate.h>
+#include <linux/smp.h>
 
 #include <asm/irq.h>
 /*
@@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void)
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
+
+#ifdef CONFIG_SMP
+/*
+ * Call a function on all processors
+ */
+int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
+{
+	int ret = 0;
+
+	preempt_disable();
+	ret = smp_call_function(func, info, retry, wait);
+	local_irq_disable();
+	func(info);
+	local_irq_enable();
+	preempt_enable();
+	return ret;
+}
+EXPORT_SYMBOL(on_each_cpu);
+#endif
-- 
cgit v1.2.3


From e9028b0ff2bad1954568604dc17725692c8524d6 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 23 Mar 2006 02:59:20 -0800
Subject: [PATCH] fix scheduler deadlock

We have noticed lockups during boot when stress testing kexec on ppc64.
Two cpus would deadlock in scheduler code trying to grab already taken
spinlocks.

The double_rq_lock code uses the address of the runqueue to order the
taking of multiple locks.  This address is a per cpu variable:

	if (rq1 < rq2) {
		spin_lock(&rq1->lock);
		spin_lock(&rq2->lock);
	} else {
		spin_lock(&rq2->lock);
		spin_lock(&rq1->lock);
	}

On the other hand, the code in wake_sleeping_dependent uses the cpu id
order to grab locks:

	for_each_cpu_mask(i, sibling_map)
		spin_lock(&cpu_rq(i)->lock);

This means we rely on the address of per cpu data increasing as cpu ids
increase.  While this will be true for the generic percpu implementation it
may not be true for arch specific implementations.

One way to solve this is to always take runqueues in cpu id order. To do
this we add a cpu variable to the runqueue and check it in the
double runqueue locking functions.

Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6b6e0d70eb3..a5bd60453ea 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -237,6 +237,7 @@ struct runqueue {
 
 	task_t *migration_thread;
 	struct list_head migration_queue;
+	int cpu;
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
@@ -1654,6 +1655,9 @@ unsigned long nr_iowait(void)
 /*
  * double_rq_lock - safely lock two runqueues
  *
+ * We must take them in cpu order to match code in
+ * dependent_sleeper and wake_dependent_sleeper.
+ *
  * Note this does not disable interrupts like task_rq_lock,
  * you need to do so manually before calling.
  */
@@ -1665,7 +1669,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
 		spin_lock(&rq1->lock);
 		__acquire(rq2->lock);	/* Fake it out ;) */
 	} else {
-		if (rq1 < rq2) {
+		if (rq1->cpu < rq2->cpu) {
 			spin_lock(&rq1->lock);
 			spin_lock(&rq2->lock);
 		} else {
@@ -1701,7 +1705,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
 	__acquires(this_rq->lock)
 {
 	if (unlikely(!spin_trylock(&busiest->lock))) {
-		if (busiest < this_rq) {
+		if (busiest->cpu < this_rq->cpu) {
 			spin_unlock(&this_rq->lock);
 			spin_lock(&busiest->lock);
 			spin_lock(&this_rq->lock);
@@ -6029,6 +6033,7 @@ void __init sched_init(void)
 		rq->push_cpu = 0;
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
+		rq->cpu = i;
 #endif
 		atomic_set(&rq->nr_iowait, 0);
 
-- 
cgit v1.2.3


From 2b322ce210aec74ae0d02938d3a01e29fe079469 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 23 Mar 2006 02:59:58 -0800
Subject: [PATCH] revert "swsusp: fix breakage with swap on lvm"

This was a temporary thing for 2.6.16.

Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 2d9d08f72f7..4e90905f0e8 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -153,11 +153,13 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 {
 	int i;
 
+	if (!swsusp_resume_device)
+		return -ENODEV;
 	spin_lock(&swap_lock);
 	for (i = 0; i < MAX_SWAPFILES; i++) {
 		if (!(swap_info[i].flags & SWP_WRITEOK))
 			continue;
-		if (!swsusp_resume_device || is_resume_device(swap_info + i)) {
+		if (is_resume_device(swap_info + i)) {
 			spin_unlock(&swap_lock);
 			root_swap = i;
 			return 0;
-- 
cgit v1.2.3


From f577eb30afdc68233f25d4d82b04102129262365 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 02:59:59 -0800
Subject: [PATCH] swsusp: low level interface

Introduce the low level interface that can be used for handling the
snapshot of the system memory by the in-kernel swap-writing/reading code of
swsusp and the userland interface code (to be introduced shortly).

Also change the way in which swsusp records the allocated swap pages and,
consequently, simplifies the in-kernel swap-writing/reading code (this is
necessary for the userland interface too).  To this end, it introduces two
helper functions in mm/swapfile.c, so that the swsusp code does not refer
directly to the swap internals.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c     |  12 +-
 kernel/power/power.h    |  26 +-
 kernel/power/snapshot.c | 326 +++++++++++++++++++++-
 kernel/power/swsusp.c   | 723 +++++++++++++++++-------------------------------
 4 files changed, 599 insertions(+), 488 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0b43847dc98..4eb464b7134 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -26,9 +26,9 @@ extern suspend_disk_method_t pm_disk_mode;
 
 extern int swsusp_shrink_memory(void);
 extern int swsusp_suspend(void);
-extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
+extern int swsusp_write(void);
 extern int swsusp_check(void);
-extern int swsusp_read(struct pbe **pblist_ptr);
+extern int swsusp_read(void);
 extern void swsusp_close(void);
 extern int swsusp_resume(void);
 
@@ -70,10 +70,6 @@ static void power_down(suspend_disk_method_t mode)
 	while(1);
 }
 
-
-static int in_suspend __nosavedata = 0;
-
-
 static inline void platform_finish(void)
 {
 	if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -145,7 +141,7 @@ int pm_suspend_disk(void)
 	if (in_suspend) {
 		device_resume();
 		pr_debug("PM: writing image.\n");
-		error = swsusp_write(pagedir_nosave, nr_copy_pages);
+		error = swsusp_write();
 		if (!error)
 			power_down(pm_disk_mode);
 		else {
@@ -216,7 +212,7 @@ static int software_resume(void)
 
 	pr_debug("PM: Reading swsusp image.\n");
 
-	if ((error = swsusp_read(&pagedir_nosave))) {
+	if ((error = swsusp_read())) {
 		swsusp_free();
 		goto Thaw;
 	}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 388dba68084..ea7132ed029 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -37,21 +37,31 @@ extern struct subsystem power_subsys;
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
-extern unsigned int nr_copy_pages;
 extern struct pbe *pagedir_nosave;
 
 /* Preferred image size in bytes (default 500 MB) */
 extern unsigned long image_size;
 
+extern int in_suspend;
+
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
 extern unsigned int count_data_pages(void);
-extern void free_pagedir(struct pbe *pblist);
-extern void release_eaten_pages(void);
-extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
 extern void swsusp_free(void);
-extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
-extern unsigned int snapshot_nr_pages(void);
-extern struct pbe *snapshot_pblist(void);
-extern void snapshot_pblist_set(struct pbe *pblist);
+
+struct snapshot_handle {
+	loff_t		offset;
+	unsigned int	page;
+	unsigned int	page_offset;
+	unsigned int	prev;
+	struct pbe	*pbe;
+	void		*buffer;
+	unsigned int	buf_offset;
+};
+
+#define data_of(handle)	((handle).buffer + (handle).buf_offset)
+
+extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
+extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
+int snapshot_image_loaded(struct snapshot_handle *handle);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 8d5a5986d62..cc349437fb7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,7 @@
  */
 
 
+#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/suspend.h>
@@ -34,7 +35,8 @@
 #include "power.h"
 
 struct pbe *pagedir_nosave;
-unsigned int nr_copy_pages;
+static unsigned int nr_copy_pages;
+static unsigned int nr_meta_pages;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void)
@@ -235,7 +237,7 @@ static void copy_data_pages(struct pbe *pblist)
  *	free_pagedir - free pages allocated with alloc_pagedir()
  */
 
-void free_pagedir(struct pbe *pblist)
+static void free_pagedir(struct pbe *pblist)
 {
 	struct pbe *pbe;
 
@@ -301,7 +303,7 @@ struct eaten_page {
 
 static struct eaten_page *eaten_pages = NULL;
 
-void release_eaten_pages(void)
+static void release_eaten_pages(void)
 {
 	struct eaten_page *p, *q;
 
@@ -376,7 +378,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
 	if (!nr_pages)
 		return NULL;
 
-	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
 	pblist = alloc_image_page(gfp_mask, safe_needed);
 	/* FIXME: rewrite this ugly loop */
 	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
@@ -414,6 +415,9 @@ void swsusp_free(void)
 				}
 			}
 	}
+	nr_copy_pages = 0;
+	nr_meta_pages = 0;
+	pagedir_nosave = NULL;
 }
 
 
@@ -437,7 +441,7 @@ static int enough_free_mem(unsigned int nr_pages)
 		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
-int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
+static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
 {
 	struct pbe *p;
 
@@ -504,7 +508,319 @@ asmlinkage int swsusp_save(void)
 	 */
 
 	nr_copy_pages = nr_pages;
+	nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
 	return 0;
 }
+
+static void init_header(struct swsusp_info *info)
+{
+	memset(info, 0, sizeof(struct swsusp_info));
+	info->version_code = LINUX_VERSION_CODE;
+	info->num_physpages = num_physpages;
+	memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
+	info->cpus = num_online_cpus();
+	info->image_pages = nr_copy_pages;
+	info->pages = nr_copy_pages + nr_meta_pages + 1;
+}
+
+/**
+ *	pack_orig_addresses - the .orig_address fields of the PBEs from the
+ *	list starting at @pbe are stored in the array @buf[] (1 page)
+ */
+
+static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
+{
+	int j;
+
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		buf[j] = pbe->orig_address;
+		pbe = pbe->next;
+	}
+	if (!pbe)
+		for (; j < PAGE_SIZE / sizeof(long); j++)
+			buf[j] = 0;
+	return pbe;
+}
+
+/**
+ *	snapshot_read_next - used for reading the system memory snapshot.
+ *
+ *	On the first call to it @handle should point to a zeroed
+ *	snapshot_handle structure.  The structure gets updated and a pointer
+ *	to it should be passed to this function every next time.
+ *
+ *	The @count parameter should contain the number of bytes the caller
+ *	wants to read from the snapshot.  It must not be zero.
+ *
+ *	On success the function returns a positive number.  Then, the caller
+ *	is allowed to read up to the returned number of bytes from the memory
+ *	location computed by the data_of() macro.  The number returned
+ *	may be smaller than @count, but this only happens if the read would
+ *	cross a page boundary otherwise.
+ *
+ *	The function returns 0 to indicate the end of data stream condition,
+ *	and a negative number is returned on error.  In such cases the
+ *	structure pointed to by @handle is not updated and should not be used
+ *	any more.
+ */
+
+int snapshot_read_next(struct snapshot_handle *handle, size_t count)
+{
+	static unsigned long *buffer;
+
+	if (handle->page > nr_meta_pages + nr_copy_pages)
+		return 0;
+	if (!buffer) {
+		/* This makes the buffer be freed by swsusp_free() */
+		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		if (!buffer)
+			return -ENOMEM;
+	}
+	if (!handle->offset) {
+		init_header((struct swsusp_info *)buffer);
+		handle->buffer = buffer;
+		handle->pbe = pagedir_nosave;
+	}
+	if (handle->prev < handle->page) {
+		if (handle->page <= nr_meta_pages) {
+			handle->pbe = pack_orig_addresses(buffer, handle->pbe);
+			if (!handle->pbe)
+				handle->pbe = pagedir_nosave;
+		} else {
+			handle->buffer = (void *)handle->pbe->address;
+			handle->pbe = handle->pbe->next;
+		}
+		handle->prev = handle->page;
+	}
+	handle->buf_offset = handle->page_offset;
+	if (handle->page_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->page_offset;
+		handle->page_offset = 0;
+		handle->page++;
+	} else {
+		handle->page_offset += count;
+	}
+	handle->offset += count;
+	return count;
+}
+
+/**
+ *	mark_unsafe_pages - mark the pages that cannot be used for storing
+ *	the image during resume, because they conflict with the pages that
+ *	had been used before suspend
+ */
+
+static int mark_unsafe_pages(struct pbe *pblist)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	struct pbe *p;
+
+	if (!pblist) /* a sanity check */
+		return -EINVAL;
+
+	/* Clear page flags */
+	for_each_zone (zone) {
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			if (pfn_valid(zone_pfn + zone->zone_start_pfn))
+				ClearPageNosaveFree(pfn_to_page(zone_pfn +
+					zone->zone_start_pfn));
+	}
+
+	/* Mark orig addresses */
+	for_each_pbe (p, pblist) {
+		if (virt_addr_valid(p->orig_address))
+			SetPageNosaveFree(virt_to_page(p->orig_address));
+		else
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
+{
+	/* We assume both lists contain the same number of elements */
+	while (src) {
+		dst->orig_address = src->orig_address;
+		dst = dst->next;
+		src = src->next;
+	}
+}
+
+static int check_header(struct swsusp_info *info)
+{
+	char *reason = NULL;
+
+	if (info->version_code != LINUX_VERSION_CODE)
+		reason = "kernel version";
+	if (info->num_physpages != num_physpages)
+		reason = "memory size";
+	if (strcmp(info->uts.sysname,system_utsname.sysname))
+		reason = "system type";
+	if (strcmp(info->uts.release,system_utsname.release))
+		reason = "kernel release";
+	if (strcmp(info->uts.version,system_utsname.version))
+		reason = "version";
+	if (strcmp(info->uts.machine,system_utsname.machine))
+		reason = "machine";
+	if (reason) {
+		printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
+		return -EPERM;
+	}
+	return 0;
+}
+
+/**
+ *	load header - check the image header and copy data from it
+ */
+
+static int load_header(struct snapshot_handle *handle,
+                              struct swsusp_info *info)
+{
+	int error;
+	struct pbe *pblist;
+
+	error = check_header(info);
+	if (!error) {
+		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
+		if (!pblist)
+			return -ENOMEM;
+		pagedir_nosave = pblist;
+		handle->pbe = pblist;
+		nr_copy_pages = info->image_pages;
+		nr_meta_pages = info->pages - info->image_pages - 1;
+	}
+	return error;
+}
+
+/**
+ *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
+ *	the PBEs in the list starting at @pbe
+ */
+
+static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
+                                                struct pbe *pbe)
+{
+	int j;
+
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		pbe->orig_address = buf[j];
+		pbe = pbe->next;
+	}
+	return pbe;
+}
+
+/**
+ *	create_image - use metadata contained in the PBE list
+ *	pointed to by pagedir_nosave to mark the pages that will
+ *	be overwritten in the process of restoring the system
+ *	memory state from the image and allocate memory for
+ *	the image avoiding these pages
+ */
+
+static int create_image(struct snapshot_handle *handle)
+{
+	int error = 0;
+	struct pbe *p, *pblist;
+
+	p = pagedir_nosave;
+	error = mark_unsafe_pages(p);
+	if (!error) {
+		pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
+		if (pblist)
+			copy_page_backup_list(pblist, p);
+		free_pagedir(p);
+		if (!pblist)
+			error = -ENOMEM;
+	}
+	if (!error)
+		error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
+	if (!error) {
+		release_eaten_pages();
+		pagedir_nosave = pblist;
+	} else {
+		pagedir_nosave = NULL;
+		handle->pbe = NULL;
+		nr_copy_pages = 0;
+		nr_meta_pages = 0;
+	}
+	return error;
+}
+
+/**
+ *	snapshot_write_next - used for writing the system memory snapshot.
+ *
+ *	On the first call to it @handle should point to a zeroed
+ *	snapshot_handle structure.  The structure gets updated and a pointer
+ *	to it should be passed to this function every next time.
+ *
+ *	The @count parameter should contain the number of bytes the caller
+ *	wants to write to the image.  It must not be zero.
+ *
+ *	On success the function returns a positive number.  Then, the caller
+ *	is allowed to write up to the returned number of bytes to the memory
+ *	location computed by the data_of() macro.  The number returned
+ *	may be smaller than @count, but this only happens if the write would
+ *	cross a page boundary otherwise.
+ *
+ *	The function returns 0 to indicate the "end of file" condition,
+ *	and a negative number is returned on error.  In such cases the
+ *	structure pointed to by @handle is not updated and should not be used
+ *	any more.
+ */
+
+int snapshot_write_next(struct snapshot_handle *handle, size_t count)
+{
+	static unsigned long *buffer;
+	int error = 0;
+
+	if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
+		return 0;
+	if (!buffer) {
+		/* This makes the buffer be freed by swsusp_free() */
+		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		if (!buffer)
+			return -ENOMEM;
+	}
+	if (!handle->offset)
+		handle->buffer = buffer;
+	if (handle->prev < handle->page) {
+		if (!handle->prev) {
+			error = load_header(handle, (struct swsusp_info *)buffer);
+			if (error)
+				return error;
+		} else if (handle->prev <= nr_meta_pages) {
+			handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
+			if (!handle->pbe) {
+				error = create_image(handle);
+				if (error)
+					return error;
+				handle->pbe = pagedir_nosave;
+				handle->buffer = (void *)handle->pbe->address;
+			}
+		} else {
+			handle->pbe = handle->pbe->next;
+			handle->buffer = (void *)handle->pbe->address;
+		}
+		handle->prev = handle->page;
+	}
+	handle->buf_offset = handle->page_offset;
+	if (handle->page_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->page_offset;
+		handle->page_offset = 0;
+		handle->page++;
+	} else {
+		handle->page_offset += count;
+	}
+	handle->offset += count;
+	return count;
+}
+
+int snapshot_image_loaded(struct snapshot_handle *handle)
+{
+	return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
+		handle->page <= nr_meta_pages + nr_copy_pages);
+}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 4e90905f0e8..457084f5001 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -77,6 +77,8 @@
  */
 unsigned long image_size = 500 * 1024 * 1024;
 
+int in_suspend __nosavedata = 0;
+
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void);
 int save_highmem(void);
@@ -98,8 +100,6 @@ static struct swsusp_header {
 	char	sig[10];
 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
 
-static struct swsusp_info swsusp_info;
-
 /*
  * Saving part...
  */
@@ -129,255 +129,261 @@ static int mark_swapfiles(swp_entry_t start)
 	return error;
 }
 
-/*
- * Check whether the swap device is the specified resume
- * device, irrespective of whether they are specified by
- * identical names.
- *
- * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
- * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
- * and they'll be considered the same device.  This is *necessary* for
- * devfs, since the resume code can only recognize the form /dev/hda4,
- * but the suspend code would see the long name.)
+/**
+ *	swsusp_swap_check - check if the resume device is a swap device
+ *	and get its index (if so)
  */
-static inline int is_resume_device(const struct swap_info_struct *swap_info)
-{
-	struct file *file = swap_info->swap_file;
-	struct inode *inode = file->f_dentry->d_inode;
-
-	return S_ISBLK(inode->i_mode) &&
-		swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
-}
 
 static int swsusp_swap_check(void) /* This is called before saving image */
 {
-	int i;
-
-	if (!swsusp_resume_device)
-		return -ENODEV;
-	spin_lock(&swap_lock);
-	for (i = 0; i < MAX_SWAPFILES; i++) {
-		if (!(swap_info[i].flags & SWP_WRITEOK))
-			continue;
-		if (is_resume_device(swap_info + i)) {
-			spin_unlock(&swap_lock);
-			root_swap = i;
-			return 0;
-		}
+	int res = swap_type_of(swsusp_resume_device);
+
+	if (res >= 0) {
+		root_swap = res;
+		return 0;
 	}
-	spin_unlock(&swap_lock);
-	return -ENODEV;
+	return res;
 }
 
 /**
- *	write_page - Write one page to a fresh swap location.
- *	@addr:	Address we're writing.
- *	@loc:	Place to store the entry we used.
+ *	The bitmap is used for tracing allocated swap pages
  *
- *	Allocate a new swap entry and 'sync' it. Note we discard -EIO
- *	errors. That is an artifact left over from swsusp. It did not
- *	check the return of rw_swap_page_sync() at all, since most pages
- *	written back to swap would return -EIO.
- *	This is a partial improvement, since we will at least return other
- *	errors, though we need to eventually fix the damn code.
+ *	The entire bitmap consists of a number of bitmap_page
+ *	structures linked with the help of the .next member.
+ *	Thus each page can be allocated individually, so we only
+ *	need to make 0-order memory allocations to create
+ *	the bitmap.
  */
-static int write_page(unsigned long addr, swp_entry_t *loc)
-{
-	swp_entry_t entry;
-	int error = -ENOSPC;
 
-	entry = get_swap_page_of_type(root_swap);
-	if (swp_offset(entry)) {
-		error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
-		if (!error || error == -EIO)
-			*loc = entry;
-	}
-	return error;
-}
+#define BITMAP_PAGE_SIZE	(PAGE_SIZE - sizeof(void *))
+#define BITMAP_PAGE_CHUNKS	(BITMAP_PAGE_SIZE / sizeof(long))
+#define BITS_PER_CHUNK		(sizeof(long) * 8)
+#define BITMAP_PAGE_BITS	(BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
+
+struct bitmap_page {
+	unsigned long		chunks[BITMAP_PAGE_CHUNKS];
+	struct bitmap_page	*next;
+};
 
 /**
- *	Swap map-handling functions
+ *	The following functions are used for tracing the allocated
+ *	swap pages, so that they can be freed in case of an error.
  *
- *	The swap map is a data structure used for keeping track of each page
- *	written to the swap.  It consists of many swap_map_page structures
- *	that contain each an array of MAP_PAGE_SIZE swap entries.
- *	These structures are linked together with the help of either the
- *	.next (in memory) or the .next_swap (in swap) member.
- *
- *	The swap map is created during suspend.  At that time we need to keep
- *	it in memory, because we have to free all of the allocated swap
- *	entries if an error occurs.  The memory needed is preallocated
- *	so that we know in advance if there's enough of it.
- *
- *	The first swap_map_page structure is filled with the swap entries that
- *	correspond to the first MAP_PAGE_SIZE data pages written to swap and
- *	so on.  After the all of the data pages have been written, the order
- *	of the swap_map_page structures in the map is reversed so that they
- *	can be read from swap in the original order.  This causes the data
- *	pages to be loaded in exactly the same order in which they have been
- *	saved.
- *
- *	During resume we only need to use one swap_map_page structure
- *	at a time, which means that we only need to use two memory pages for
- *	reading the image - one for reading the swap_map_page structures
- *	and the second for reading the data pages from swap.
+ *	The functions operate on a linked bitmap structure defined
+ *	above
  */
 
-#define MAP_PAGE_SIZE	((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
-			/ sizeof(swp_entry_t))
-
-struct swap_map_page {
-	swp_entry_t		entries[MAP_PAGE_SIZE];
-	swp_entry_t		next_swap;
-	struct swap_map_page	*next;
-};
-
-static inline void free_swap_map(struct swap_map_page *swap_map)
+static void free_bitmap(struct bitmap_page *bitmap)
 {
-	struct swap_map_page *swp;
+	struct bitmap_page *bp;
 
-	while (swap_map) {
-		swp = swap_map->next;
-		free_page((unsigned long)swap_map);
-		swap_map = swp;
+	while (bitmap) {
+		bp = bitmap->next;
+		free_page((unsigned long)bitmap);
+		bitmap = bp;
 	}
 }
 
-static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
+static struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
 {
-	struct swap_map_page *swap_map, *swp;
-	unsigned n = 0;
+	struct bitmap_page *bitmap, *bp;
+	unsigned int n;
 
-	if (!nr_pages)
+	if (!nr_bits)
 		return NULL;
 
-	pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
-	swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
-	swp = swap_map;
-	for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
-		swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
-		swp = swp->next;
-		if (!swp) {
-			free_swap_map(swap_map);
+	bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
+	bp = bitmap;
+	for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
+		bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
+		bp = bp->next;
+		if (!bp) {
+			free_bitmap(bitmap);
 			return NULL;
 		}
 	}
-	return swap_map;
+	return bitmap;
 }
 
-/**
- *	reverse_swap_map - reverse the order of pages in the swap map
- *	@swap_map
- */
-
-static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
+static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
 {
-	struct swap_map_page *prev, *next;
-
-	prev = NULL;
-	while (swap_map) {
-		next = swap_map->next;
-		swap_map->next = prev;
-		prev = swap_map;
-		swap_map = next;
+	unsigned int n;
+
+	n = BITMAP_PAGE_BITS;
+	while (bitmap && n <= bit) {
+		n += BITMAP_PAGE_BITS;
+		bitmap = bitmap->next;
 	}
-	return prev;
+	if (!bitmap)
+		return -EINVAL;
+	n -= BITMAP_PAGE_BITS;
+	bit -= n;
+	n = 0;
+	while (bit >= BITS_PER_CHUNK) {
+		bit -= BITS_PER_CHUNK;
+		n++;
+	}
+	bitmap->chunks[n] |= (1UL << bit);
+	return 0;
 }
 
-/**
- *	free_swap_map_entries - free the swap entries allocated to store
- *	the swap map @swap_map (this is only called in case of an error)
- */
-static inline void free_swap_map_entries(struct swap_map_page *swap_map)
+static unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
 {
-	while (swap_map) {
-		if (swap_map->next_swap.val)
-			swap_free(swap_map->next_swap);
-		swap_map = swap_map->next;
+	unsigned long offset;
+
+	offset = swp_offset(get_swap_page_of_type(swap));
+	if (offset) {
+		if (bitmap_set(bitmap, offset)) {
+			swap_free(swp_entry(swap, offset));
+			offset = 0;
+		}
 	}
+	return offset;
 }
 
-/**
- *	save_swap_map - save the swap map used for tracing the data pages
- *	stored in the swap
- */
-
-static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
+static void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
 {
-	swp_entry_t entry = (swp_entry_t){0};
-	int error;
+	unsigned int bit, n;
+	unsigned long test;
 
-	while (swap_map) {
-		swap_map->next_swap = entry;
-		if ((error = write_page((unsigned long)swap_map, &entry)))
-			return error;
-		swap_map = swap_map->next;
+	bit = 0;
+	while (bitmap) {
+		for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
+			for (test = 1UL; test; test <<= 1) {
+				if (bitmap->chunks[n] & test)
+					swap_free(swp_entry(swap, bit));
+				bit++;
+			}
+		bitmap = bitmap->next;
 	}
-	*start = entry;
-	return 0;
 }
 
 /**
- *	free_image_entries - free the swap entries allocated to store
- *	the image data pages (this is only called in case of an error)
+ *	write_page - Write one page to given swap location.
+ *	@buf:		Address we're writing.
+ *	@offset:	Offset of the swap page we're writing to.
  */
 
-static inline void free_image_entries(struct swap_map_page *swp)
+static int write_page(void *buf, unsigned long offset)
 {
-	unsigned k;
+	swp_entry_t entry;
+	int error = -ENOSPC;
 
-	while (swp) {
-		for (k = 0; k < MAP_PAGE_SIZE; k++)
-			if (swp->entries[k].val)
-				swap_free(swp->entries[k]);
-		swp = swp->next;
+	if (offset) {
+		entry = swp_entry(root_swap, offset);
+		error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
 	}
+	return error;
 }
 
+/*
+ *	The swap map is a data structure used for keeping track of each page
+ *	written to a swap partition.  It consists of many swap_map_page
+ *	structures that contain each an array of MAP_PAGE_SIZE swap entries.
+ *	These structures are stored on the swap and linked together with the
+ *	help of the .next_swap member.
+ *
+ *	The swap map is created during suspend.  The swap map pages are
+ *	allocated and populated one at a time, so we only need one memory
+ *	page to set up the entire structure.
+ *
+ *	During resume we also only need to use one swap_map_page structure
+ *	at a time.
+ */
+
+#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)
+
+struct swap_map_page {
+	unsigned long		entries[MAP_PAGE_ENTRIES];
+	unsigned long		next_swap;
+};
+
 /**
- *	The swap_map_handle structure is used for handling the swap map in
+ *	The swap_map_handle structure is used for handling swap in
  *	a file-alike way
  */
 
 struct swap_map_handle {
 	struct swap_map_page *cur;
+	unsigned long cur_swap;
+	struct bitmap_page *bitmap;
 	unsigned int k;
 };
 
-static inline void init_swap_map_handle(struct swap_map_handle *handle,
-                                        struct swap_map_page *map)
+static void release_swap_writer(struct swap_map_handle *handle)
 {
-	handle->cur = map;
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+	if (handle->bitmap)
+		free_bitmap(handle->bitmap);
+	handle->bitmap = NULL;
+}
+
+static int get_swap_writer(struct swap_map_handle *handle)
+{
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
+	if (!handle->cur)
+		return -ENOMEM;
+	handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
+	if (!handle->bitmap) {
+		release_swap_writer(handle);
+		return -ENOMEM;
+	}
+	handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
+	if (!handle->cur_swap) {
+		release_swap_writer(handle);
+		return -ENOSPC;
+	}
 	handle->k = 0;
+	return 0;
 }
 
-static inline int swap_map_write_page(struct swap_map_handle *handle,
-                                      unsigned long addr)
+static int swap_write_page(struct swap_map_handle *handle, void *buf)
 {
 	int error;
+	unsigned long offset;
 
-	error = write_page(addr, handle->cur->entries + handle->k);
+	if (!handle->cur)
+		return -EINVAL;
+	offset = alloc_swap_page(root_swap, handle->bitmap);
+	error = write_page(buf, offset);
 	if (error)
 		return error;
-	if (++handle->k >= MAP_PAGE_SIZE) {
-		handle->cur = handle->cur->next;
+	handle->cur->entries[handle->k++] = offset;
+	if (handle->k >= MAP_PAGE_ENTRIES) {
+		offset = alloc_swap_page(root_swap, handle->bitmap);
+		if (!offset)
+			return -ENOSPC;
+		handle->cur->next_swap = offset;
+		error = write_page(handle->cur, handle->cur_swap);
+		if (error)
+			return error;
+		memset(handle->cur, 0, PAGE_SIZE);
+		handle->cur_swap = offset;
 		handle->k = 0;
 	}
 	return 0;
 }
 
+static int flush_swap_writer(struct swap_map_handle *handle)
+{
+	if (handle->cur && handle->cur_swap)
+		return write_page(handle->cur, handle->cur_swap);
+	else
+		return -EINVAL;
+}
+
 /**
- *	save_image_data - save the data pages pointed to by the PBEs
- *	from the list @pblist using the swap map handle @handle
- *	(assume there are @nr_pages data pages to save)
+ *	save_image - save the suspend image data
  */
 
-static int save_image_data(struct pbe *pblist,
-                           struct swap_map_handle *handle,
-                           unsigned int nr_pages)
+static int save_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
 {
 	unsigned int m;
-	struct pbe *p;
+	int ret;
 	int error = 0;
 
 	printk("Saving image data pages (%u pages) ...     ", nr_pages);
@@ -385,98 +391,22 @@ static int save_image_data(struct pbe *pblist,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	for_each_pbe (p, pblist) {
-		error = swap_map_write_page(handle, p->address);
-		if (error)
-			break;
-		if (!(nr_pages % m))
-			printk("\b\b\b\b%3d%%", nr_pages / m);
-		nr_pages++;
-	}
+	do {
+		ret = snapshot_read_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_write_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
 	if (!error)
 		printk("\b\b\b\bdone\n");
 	return error;
 }
 
-static void dump_info(void)
-{
-	pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
-	pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
-	pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
-	pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
-	pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
-	pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
-	pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
-	pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
-	pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
-	pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
-	pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
-}
-
-static void init_header(unsigned int nr_pages)
-{
-	memset(&swsusp_info, 0, sizeof(swsusp_info));
-	swsusp_info.version_code = LINUX_VERSION_CODE;
-	swsusp_info.num_physpages = num_physpages;
-	memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
-
-	swsusp_info.cpus = num_online_cpus();
-	swsusp_info.image_pages = nr_pages;
-	swsusp_info.pages = nr_pages +
-		((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
-}
-
-/**
- *	pack_orig_addresses - the .orig_address fields of the PBEs from the
- *	list starting at @pbe are stored in the array @buf[] (1 page)
- */
-
-static inline struct pbe *pack_orig_addresses(unsigned long *buf,
-                                              struct pbe *pbe)
-{
-	int j;
-
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		buf[j] = pbe->orig_address;
-		pbe = pbe->next;
-	}
-	if (!pbe)
-		for (; j < PAGE_SIZE / sizeof(long); j++)
-			buf[j] = 0;
-	return pbe;
-}
-
-/**
- *	save_image_metadata - save the .orig_address fields of the PBEs
- *	from the list @pblist using the swap map handle @handle
- */
-
-static int save_image_metadata(struct pbe *pblist,
-                               struct swap_map_handle *handle)
-{
-	unsigned long *buf;
-	unsigned int n = 0;
-	struct pbe *p;
-	int error = 0;
-
-	printk("Saving image metadata ... ");
-	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
-	if (!buf)
-		return -ENOMEM;
-	p = pblist;
-	while (p) {
-		p = pack_orig_addresses(buf, p);
-		error = swap_map_write_page(handle, (unsigned long)buf);
-		if (error)
-			break;
-		n++;
-	}
-	free_page((unsigned long)buf);
-	if (!error)
-		printk("done (%u pages saved)\n", n);
-	return error;
-}
-
 /**
  *	enough_swap - Make sure we have enough swap to save the image.
  *
@@ -486,8 +416,7 @@ static int save_image_metadata(struct pbe *pblist,
 
 static int enough_swap(unsigned int nr_pages)
 {
-	unsigned int free_swap = swap_info[root_swap].pages -
-		swap_info[root_swap].inuse_pages;
+	unsigned int free_swap = count_swap_pages(root_swap, 1);
 
 	pr_debug("swsusp: free swap pages: %u\n", free_swap);
 	return free_swap > (nr_pages + PAGES_FOR_IO +
@@ -503,57 +432,44 @@ static int enough_swap(unsigned int nr_pages)
  *	correctly, we'll mark system clean, anyway.)
  */
 
-int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
+int swsusp_write(void)
 {
-	struct swap_map_page *swap_map;
 	struct swap_map_handle handle;
-	swp_entry_t start;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
+	unsigned long start;
 	int error;
 
 	if ((error = swsusp_swap_check())) {
 		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
 		return error;
 	}
-	if (!enough_swap(nr_pages)) {
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_read_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	if (!enough_swap(header->pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
 		return -ENOSPC;
 	}
-
-	init_header(nr_pages);
-	swap_map = alloc_swap_map(swsusp_info.pages);
-	if (!swap_map)
-		return -ENOMEM;
-	init_swap_map_handle(&handle, swap_map);
-
-	error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
-	if (!error)
-		error = save_image_metadata(pblist, &handle);
+	error = get_swap_writer(&handle);
+	if (!error) {
+		start = handle.cur_swap;
+		error = swap_write_page(&handle, header);
+	}
 	if (!error)
-		error = save_image_data(pblist, &handle, nr_pages);
-	if (error)
-		goto Free_image_entries;
-
-	swap_map = reverse_swap_map(swap_map);
-	error = save_swap_map(swap_map, &start);
-	if (error)
-		goto Free_map_entries;
-
-	dump_info();
-	printk( "S" );
-	error = mark_swapfiles(start);
-	printk( "|\n" );
+		error = save_image(&handle, &snapshot, header->pages - 1);
+	if (!error) {
+		flush_swap_writer(&handle);
+		printk("S");
+		error = mark_swapfiles(swp_entry(root_swap, start));
+		printk("|\n");
+	}
 	if (error)
-		goto Free_map_entries;
-
-Free_swap_map:
-	free_swap_map(swap_map);
+		free_all_swap_pages(root_swap, handle.bitmap);
+	release_swap_writer(&handle);
 	return error;
-
-Free_map_entries:
-	free_swap_map_entries(swap_map);
-Free_image_entries:
-	free_image_entries(swap_map);
-	goto Free_swap_map;
 }
 
 /**
@@ -663,45 +579,6 @@ int swsusp_resume(void)
 	return error;
 }
 
-/**
- *	mark_unsafe_pages - mark the pages that cannot be used for storing
- *	the image during resume, because they conflict with the pages that
- *	had been used before suspend
- */
-
-static void mark_unsafe_pages(struct pbe *pblist)
-{
-	struct zone *zone;
-	unsigned long zone_pfn;
-	struct pbe *p;
-
-	if (!pblist) /* a sanity check */
-		return;
-
-	/* Clear page flags */
-	for_each_zone (zone) {
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			if (pfn_valid(zone_pfn + zone->zone_start_pfn))
-				ClearPageNosaveFree(pfn_to_page(zone_pfn +
-					zone->zone_start_pfn));
-	}
-
-	/* Mark orig addresses */
-	for_each_pbe (p, pblist)
-		SetPageNosaveFree(virt_to_page(p->orig_address));
-
-}
-
-static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
-{
-	/* We assume both lists contain the same number of elements */
-	while (src) {
-		dst->orig_address = src->orig_address;
-		dst = dst->next;
-		src = src->next;
-	}
-}
-
 /*
  *	Using bio to read from swap.
  *	This code requires a bit more work than just using buffer heads
@@ -779,14 +656,14 @@ static int bio_write_page(pgoff_t page_off, void *page)
  *	in a file-alike way
  */
 
-static inline void release_swap_map_reader(struct swap_map_handle *handle)
+static void release_swap_reader(struct swap_map_handle *handle)
 {
 	if (handle->cur)
 		free_page((unsigned long)handle->cur);
 	handle->cur = NULL;
 }
 
-static inline int get_swap_map_reader(struct swap_map_handle *handle,
+static int get_swap_reader(struct swap_map_handle *handle,
                                       swp_entry_t start)
 {
 	int error;
@@ -798,149 +675,80 @@ static inline int get_swap_map_reader(struct swap_map_handle *handle,
 		return -ENOMEM;
 	error = bio_read_page(swp_offset(start), handle->cur);
 	if (error) {
-		release_swap_map_reader(handle);
+		release_swap_reader(handle);
 		return error;
 	}
 	handle->k = 0;
 	return 0;
 }
 
-static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
+static int swap_read_page(struct swap_map_handle *handle, void *buf)
 {
 	unsigned long offset;
 	int error;
 
 	if (!handle->cur)
 		return -EINVAL;
-	offset = swp_offset(handle->cur->entries[handle->k]);
+	offset = handle->cur->entries[handle->k];
 	if (!offset)
-		return -EINVAL;
+		return -EFAULT;
 	error = bio_read_page(offset, buf);
 	if (error)
 		return error;
-	if (++handle->k >= MAP_PAGE_SIZE) {
+	if (++handle->k >= MAP_PAGE_ENTRIES) {
 		handle->k = 0;
-		offset = swp_offset(handle->cur->next_swap);
+		offset = handle->cur->next_swap;
 		if (!offset)
-			release_swap_map_reader(handle);
+			release_swap_reader(handle);
 		else
 			error = bio_read_page(offset, handle->cur);
 	}
 	return error;
 }
 
-static int check_header(void)
-{
-	char *reason = NULL;
-
-	dump_info();
-	if (swsusp_info.version_code != LINUX_VERSION_CODE)
-		reason = "kernel version";
-	if (swsusp_info.num_physpages != num_physpages)
-		reason = "memory size";
-	if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
-		reason = "system type";
-	if (strcmp(swsusp_info.uts.release,system_utsname.release))
-		reason = "kernel release";
-	if (strcmp(swsusp_info.uts.version,system_utsname.version))
-		reason = "version";
-	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
-		reason = "machine";
-	if (reason) {
-		printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
-		return -EPERM;
-	}
-	return 0;
-}
-
 /**
- *	load_image_data - load the image data using the swap map handle
- *	@handle and store them using the page backup list @pblist
+ *	load_image - load the image using the swap map handle
+ *	@handle and the snapshot handle @snapshot
  *	(assume there are @nr_pages pages to load)
  */
 
-static int load_image_data(struct pbe *pblist,
-                           struct swap_map_handle *handle,
-                           unsigned int nr_pages)
+static int load_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
 {
-	int error;
 	unsigned int m;
-	struct pbe *p;
+	int ret;
+	int error = 0;
 
-	if (!pblist)
-		return -EINVAL;
 	printk("Loading image data pages (%u pages) ...     ", nr_pages);
 	m = nr_pages / 100;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	p = pblist;
-	while (p) {
-		error = swap_map_read_page(handle, (void *)p->address);
-		if (error)
-			break;
-		p = p->next;
-		if (!(nr_pages % m))
-			printk("\b\b\b\b%3d%%", nr_pages / m);
-		nr_pages++;
-	}
+	do {
+		ret = snapshot_write_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_read_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
 	if (!error)
 		printk("\b\b\b\bdone\n");
+	if (!snapshot_image_loaded(snapshot))
+		error = -ENODATA;
 	return error;
 }
 
-/**
- *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
- *	the PBEs in the list starting at @pbe
- */
-
-static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
-                                                struct pbe *pbe)
-{
-	int j;
-
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		pbe->orig_address = buf[j];
-		pbe = pbe->next;
-	}
-	return pbe;
-}
-
-/**
- *	load_image_metadata - load the image metadata using the swap map
- *	handle @handle and put them into the PBEs in the list @pblist
- */
-
-static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
-{
-	struct pbe *p;
-	unsigned long *buf;
-	unsigned int n = 0;
-	int error = 0;
-
-	printk("Loading image metadata ... ");
-	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
-	if (!buf)
-		return -ENOMEM;
-	p = pblist;
-	while (p) {
-		error = swap_map_read_page(handle, buf);
-		if (error)
-			break;
-		p = unpack_orig_addresses(buf, p);
-		n++;
-	}
-	free_page((unsigned long)buf);
-	if (!error)
-		printk("done (%u pages loaded)\n", n);
-	return error;
-}
-
-int swsusp_read(struct pbe **pblist_ptr)
+int swsusp_read(void)
 {
 	int error;
-	struct pbe *p, *pblist;
 	struct swap_map_handle handle;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
 	unsigned int nr_pages;
 
 	if (IS_ERR(resume_bdev)) {
@@ -948,38 +756,19 @@ int swsusp_read(struct pbe **pblist_ptr)
 		return PTR_ERR(resume_bdev);
 	}
 
-	error = get_swap_map_reader(&handle, swsusp_header.image);
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_write_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	error = get_swap_reader(&handle, swsusp_header.image);
 	if (!error)
-		error = swap_map_read_page(&handle, &swsusp_info);
-	if (!error)
-		error = check_header();
-	if (error)
-		return error;
-	nr_pages = swsusp_info.image_pages;
-	p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
-	if (!p)
-		return -ENOMEM;
-	error = load_image_metadata(p, &handle);
+		error = swap_read_page(&handle, header);
 	if (!error) {
-		mark_unsafe_pages(p);
-		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
-		if (pblist)
-			copy_page_backup_list(pblist, p);
-		free_pagedir(p);
-		if (!pblist)
-			error = -ENOMEM;
-
-		/* Allocate memory for the image and read the data from swap */
-		if (!error)
-			error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
-		if (!error) {
-			release_eaten_pages();
-			error = load_image_data(pblist, &handle, nr_pages);
-		}
-		if (!error)
-			*pblist_ptr = pblist;
+		nr_pages = header->image_pages;
+		error = load_image(&handle, &snapshot, nr_pages);
 	}
-	release_swap_map_reader(&handle);
+	release_swap_reader(&handle);
 
 	blkdev_put(resume_bdev);
 
-- 
cgit v1.2.3


From 61159a314bca6408320c3173c1282c64f5cdaa76 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:00 -0800
Subject: [PATCH] swsusp: separate swap-writing/reading code

Move the swap-writing/reading code of swsusp to a separate file.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/Makefile |   2 +-
 kernel/power/power.h  |  31 ++-
 kernel/power/swap.c   | 544 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/power/swsusp.c | 568 +-------------------------------------------------
 4 files changed, 581 insertions(+), 564 deletions(-)
 create mode 100644 kernel/power/swap.c

(limited to 'kernel')

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 04be7d0d96a..bb91a061530 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
 
 obj-y				:= main.o process.o console.o
 obj-$(CONFIG_PM_LEGACY)		+= pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o swap.o
 
 obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index ea7132ed029..089c84bed89 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -41,8 +41,8 @@ extern struct pbe *pagedir_nosave;
 
 /* Preferred image size in bytes (default 500 MB) */
 extern unsigned long image_size;
-
 extern int in_suspend;
+extern dev_t swsusp_resume_device;
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
@@ -65,3 +65,32 @@ struct snapshot_handle {
 extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
 int snapshot_image_loaded(struct snapshot_handle *handle);
+
+/**
+ *	The bitmap is used for tracing allocated swap pages
+ *
+ *	The entire bitmap consists of a number of bitmap_page
+ *	structures linked with the help of the .next member.
+ *	Thus each page can be allocated individually, so we only
+ *	need to make 0-order memory allocations to create
+ *	the bitmap.
+ */
+
+#define BITMAP_PAGE_SIZE	(PAGE_SIZE - sizeof(void *))
+#define BITMAP_PAGE_CHUNKS	(BITMAP_PAGE_SIZE / sizeof(long))
+#define BITS_PER_CHUNK		(sizeof(long) * 8)
+#define BITMAP_PAGE_BITS	(BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
+
+struct bitmap_page {
+	unsigned long		chunks[BITMAP_PAGE_CHUNKS];
+	struct bitmap_page	*next;
+};
+
+extern void free_bitmap(struct bitmap_page *bitmap);
+extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
+extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
+extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
+
+extern int swsusp_shrink_memory(void);
+extern int swsusp_suspend(void);
+extern int swsusp_resume(void);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
new file mode 100644
index 00000000000..9177f3f73a6
--- /dev/null
+++ b/kernel/power/swap.c
@@ -0,0 +1,544 @@
+/*
+ * linux/kernel/power/swap.c
+ *
+ * This file provides functions for reading the suspend image from
+ * and writing it to a swap partition.
+ *
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/smp_lock.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/genhd.h>
+#include <linux/device.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+
+#include "power.h"
+
+extern char resume_file[];
+
+#define SWSUSP_SIG	"S1SUSPEND"
+
+static struct swsusp_header {
+	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
+	swp_entry_t image;
+	char	orig_sig[10];
+	char	sig[10];
+} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
+
+/*
+ * Saving part...
+ */
+
+static unsigned short root_swap = 0xffff;
+
+static int mark_swapfiles(swp_entry_t start)
+{
+	int error;
+
+	rw_swap_page_sync(READ,
+			  swp_entry(root_swap, 0),
+			  virt_to_page((unsigned long)&swsusp_header));
+	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
+	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
+		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
+		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
+		swsusp_header.image = start;
+		error = rw_swap_page_sync(WRITE,
+					  swp_entry(root_swap, 0),
+					  virt_to_page((unsigned long)
+						       &swsusp_header));
+	} else {
+		pr_debug("swsusp: Partition is not swap space.\n");
+		error = -ENODEV;
+	}
+	return error;
+}
+
+/**
+ *	swsusp_swap_check - check if the resume device is a swap device
+ *	and get its index (if so)
+ */
+
+static int swsusp_swap_check(void) /* This is called before saving image */
+{
+	int res = swap_type_of(swsusp_resume_device);
+
+	if (res >= 0) {
+		root_swap = res;
+		return 0;
+	}
+	return res;
+}
+
+/**
+ *	write_page - Write one page to given swap location.
+ *	@buf:		Address we're writing.
+ *	@offset:	Offset of the swap page we're writing to.
+ */
+
+static int write_page(void *buf, unsigned long offset)
+{
+	swp_entry_t entry;
+	int error = -ENOSPC;
+
+	if (offset) {
+		entry = swp_entry(root_swap, offset);
+		error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
+	}
+	return error;
+}
+
+/*
+ *	The swap map is a data structure used for keeping track of each page
+ *	written to a swap partition.  It consists of many swap_map_page
+ *	structures that contain each an array of MAP_PAGE_SIZE swap entries.
+ *	These structures are stored on the swap and linked together with the
+ *	help of the .next_swap member.
+ *
+ *	The swap map is created during suspend.  The swap map pages are
+ *	allocated and populated one at a time, so we only need one memory
+ *	page to set up the entire structure.
+ *
+ *	During resume we also only need to use one swap_map_page structure
+ *	at a time.
+ */
+
+#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)
+
+struct swap_map_page {
+	unsigned long		entries[MAP_PAGE_ENTRIES];
+	unsigned long		next_swap;
+};
+
+/**
+ *	The swap_map_handle structure is used for handling swap in
+ *	a file-alike way
+ */
+
+struct swap_map_handle {
+	struct swap_map_page *cur;
+	unsigned long cur_swap;
+	struct bitmap_page *bitmap;
+	unsigned int k;
+};
+
+static void release_swap_writer(struct swap_map_handle *handle)
+{
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+	if (handle->bitmap)
+		free_bitmap(handle->bitmap);
+	handle->bitmap = NULL;
+}
+
+static int get_swap_writer(struct swap_map_handle *handle)
+{
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
+	if (!handle->cur)
+		return -ENOMEM;
+	handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
+	if (!handle->bitmap) {
+		release_swap_writer(handle);
+		return -ENOMEM;
+	}
+	handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
+	if (!handle->cur_swap) {
+		release_swap_writer(handle);
+		return -ENOSPC;
+	}
+	handle->k = 0;
+	return 0;
+}
+
+static int swap_write_page(struct swap_map_handle *handle, void *buf)
+{
+	int error;
+	unsigned long offset;
+
+	if (!handle->cur)
+		return -EINVAL;
+	offset = alloc_swap_page(root_swap, handle->bitmap);
+	error = write_page(buf, offset);
+	if (error)
+		return error;
+	handle->cur->entries[handle->k++] = offset;
+	if (handle->k >= MAP_PAGE_ENTRIES) {
+		offset = alloc_swap_page(root_swap, handle->bitmap);
+		if (!offset)
+			return -ENOSPC;
+		handle->cur->next_swap = offset;
+		error = write_page(handle->cur, handle->cur_swap);
+		if (error)
+			return error;
+		memset(handle->cur, 0, PAGE_SIZE);
+		handle->cur_swap = offset;
+		handle->k = 0;
+	}
+	return 0;
+}
+
+static int flush_swap_writer(struct swap_map_handle *handle)
+{
+	if (handle->cur && handle->cur_swap)
+		return write_page(handle->cur, handle->cur_swap);
+	else
+		return -EINVAL;
+}
+
+/**
+ *	save_image - save the suspend image data
+ */
+
+static int save_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
+{
+	unsigned int m;
+	int ret;
+	int error = 0;
+
+	printk("Saving image data pages (%u pages) ...     ", nr_pages);
+	m = nr_pages / 100;
+	if (!m)
+		m = 1;
+	nr_pages = 0;
+	do {
+		ret = snapshot_read_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_write_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
+	if (!error)
+		printk("\b\b\b\bdone\n");
+	return error;
+}
+
+/**
+ *	enough_swap - Make sure we have enough swap to save the image.
+ *
+ *	Returns TRUE or FALSE after checking the total amount of swap
+ *	space avaiable from the resume partition.
+ */
+
+static int enough_swap(unsigned int nr_pages)
+{
+	unsigned int free_swap = count_swap_pages(root_swap, 1);
+
+	pr_debug("swsusp: free swap pages: %u\n", free_swap);
+	return free_swap > (nr_pages + PAGES_FOR_IO +
+		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+}
+
+/**
+ *	swsusp_write - Write entire image and metadata.
+ *
+ *	It is important _NOT_ to umount filesystems at this point. We want
+ *	them synced (in case something goes wrong) but we DO not want to mark
+ *	filesystem clean: it is not. (And it does not matter, if we resume
+ *	correctly, we'll mark system clean, anyway.)
+ */
+
+int swsusp_write(void)
+{
+	struct swap_map_handle handle;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
+	unsigned long start;
+	int error;
+
+	if ((error = swsusp_swap_check())) {
+		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
+		return error;
+	}
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_read_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	if (!enough_swap(header->pages)) {
+		printk(KERN_ERR "swsusp: Not enough free swap\n");
+		return -ENOSPC;
+	}
+	error = get_swap_writer(&handle);
+	if (!error) {
+		start = handle.cur_swap;
+		error = swap_write_page(&handle, header);
+	}
+	if (!error)
+		error = save_image(&handle, &snapshot, header->pages - 1);
+	if (!error) {
+		flush_swap_writer(&handle);
+		printk("S");
+		error = mark_swapfiles(swp_entry(root_swap, start));
+		printk("|\n");
+	}
+	if (error)
+		free_all_swap_pages(root_swap, handle.bitmap);
+	release_swap_writer(&handle);
+	return error;
+}
+
+/*
+ *	Using bio to read from swap.
+ *	This code requires a bit more work than just using buffer heads
+ *	but, it is the recommended way for 2.5/2.6.
+ *	The following are to signal the beginning and end of I/O. Bios
+ *	finish asynchronously, while we want them to happen synchronously.
+ *	A simple atomic_t, and a wait loop take care of this problem.
+ */
+
+static atomic_t io_done = ATOMIC_INIT(0);
+
+static int end_io(struct bio *bio, unsigned int num, int err)
+{
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		panic("I/O error reading memory image");
+	atomic_set(&io_done, 0);
+	return 0;
+}
+
+static struct block_device *resume_bdev;
+
+/**
+ *	submit - submit BIO request.
+ *	@rw:	READ or WRITE.
+ *	@off	physical offset of page.
+ *	@page:	page we're reading or writing.
+ *
+ *	Straight from the textbook - allocate and initialize the bio.
+ *	If we're writing, make sure the page is marked as dirty.
+ *	Then submit it and wait.
+ */
+
+static int submit(int rw, pgoff_t page_off, void *page)
+{
+	int error = 0;
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_ATOMIC, 1);
+	if (!bio)
+		return -ENOMEM;
+	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
+	bio->bi_bdev = resume_bdev;
+	bio->bi_end_io = end_io;
+
+	if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
+		error = -EFAULT;
+		goto Done;
+	}
+
+	atomic_set(&io_done, 1);
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+	while (atomic_read(&io_done))
+		yield();
+	if (rw == READ)
+		bio_set_pages_dirty(bio);
+ Done:
+	bio_put(bio);
+	return error;
+}
+
+static int bio_read_page(pgoff_t page_off, void *page)
+{
+	return submit(READ, page_off, page);
+}
+
+static int bio_write_page(pgoff_t page_off, void *page)
+{
+	return submit(WRITE, page_off, page);
+}
+
+/**
+ *	The following functions allow us to read data using a swap map
+ *	in a file-alike way
+ */
+
+static void release_swap_reader(struct swap_map_handle *handle)
+{
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+}
+
+static int get_swap_reader(struct swap_map_handle *handle,
+                                      swp_entry_t start)
+{
+	int error;
+
+	if (!swp_offset(start))
+		return -EINVAL;
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+	if (!handle->cur)
+		return -ENOMEM;
+	error = bio_read_page(swp_offset(start), handle->cur);
+	if (error) {
+		release_swap_reader(handle);
+		return error;
+	}
+	handle->k = 0;
+	return 0;
+}
+
+static int swap_read_page(struct swap_map_handle *handle, void *buf)
+{
+	unsigned long offset;
+	int error;
+
+	if (!handle->cur)
+		return -EINVAL;
+	offset = handle->cur->entries[handle->k];
+	if (!offset)
+		return -EFAULT;
+	error = bio_read_page(offset, buf);
+	if (error)
+		return error;
+	if (++handle->k >= MAP_PAGE_ENTRIES) {
+		handle->k = 0;
+		offset = handle->cur->next_swap;
+		if (!offset)
+			release_swap_reader(handle);
+		else
+			error = bio_read_page(offset, handle->cur);
+	}
+	return error;
+}
+
+/**
+ *	load_image - load the image using the swap map handle
+ *	@handle and the snapshot handle @snapshot
+ *	(assume there are @nr_pages pages to load)
+ */
+
+static int load_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
+{
+	unsigned int m;
+	int ret;
+	int error = 0;
+
+	printk("Loading image data pages (%u pages) ...     ", nr_pages);
+	m = nr_pages / 100;
+	if (!m)
+		m = 1;
+	nr_pages = 0;
+	do {
+		ret = snapshot_write_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_read_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
+	if (!error)
+		printk("\b\b\b\bdone\n");
+	if (!snapshot_image_loaded(snapshot))
+		error = -ENODATA;
+	return error;
+}
+
+int swsusp_read(void)
+{
+	int error;
+	struct swap_map_handle handle;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
+
+	if (IS_ERR(resume_bdev)) {
+		pr_debug("swsusp: block device not initialised\n");
+		return PTR_ERR(resume_bdev);
+	}
+
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_write_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	error = get_swap_reader(&handle, swsusp_header.image);
+	if (!error)
+		error = swap_read_page(&handle, header);
+	if (!error)
+		error = load_image(&handle, &snapshot, header->pages - 1);
+	release_swap_reader(&handle);
+
+	blkdev_put(resume_bdev);
+
+	if (!error)
+		pr_debug("swsusp: Reading resume file was successful\n");
+	else
+		pr_debug("swsusp: Error %d resuming\n", error);
+	return error;
+}
+
+/**
+ *      swsusp_check - Check for swsusp signature in the resume device
+ */
+
+int swsusp_check(void)
+{
+	int error;
+
+	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
+	if (!IS_ERR(resume_bdev)) {
+		set_blocksize(resume_bdev, PAGE_SIZE);
+		memset(&swsusp_header, 0, sizeof(swsusp_header));
+		if ((error = bio_read_page(0, &swsusp_header)))
+			return error;
+		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
+			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
+			/* Reset swap signature now */
+			error = bio_write_page(0, &swsusp_header);
+		} else {
+			return -EINVAL;
+		}
+		if (error)
+			blkdev_put(resume_bdev);
+		else
+			pr_debug("swsusp: Signature found, resuming\n");
+	} else {
+		error = PTR_ERR(resume_bdev);
+	}
+
+	if (error)
+		pr_debug("swsusp: Error %d check for resume file\n", error);
+
+	return error;
+}
+
+/**
+ *	swsusp_close - close swap device.
+ */
+
+void swsusp_close(void)
+{
+	if (IS_ERR(resume_bdev)) {
+		pr_debug("swsusp: block device not initialised\n");
+		return;
+	}
+
+	blkdev_put(resume_bdev);
+}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 457084f5001..c4016cbbd3e 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,41 +31,24 @@
  * Fixed runaway init
  *
  * Rafael J. Wysocki <rjw@sisk.pl>
- * Added the swap map data structure and reworked the handling of swap
+ * Reworked the freeing of memory and the handling of swap
  *
  * More state savers are welcome. Especially for the scsi layer...
  *
  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
  */
 
-#include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/suspend.h>
-#include <linux/smp_lock.h>
-#include <linux/file.h>
-#include <linux/utsname.h>
-#include <linux/version.h>
-#include <linux/delay.h>
-#include <linux/bitops.h>
 #include <linux/spinlock.h>
-#include <linux/genhd.h>
 #include <linux/kernel.h>
 #include <linux/major.h>
 #include <linux/swap.h>
 #include <linux/pm.h>
-#include <linux/device.h>
-#include <linux/buffer_head.h>
 #include <linux/swapops.h>
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/highmem.h>
-#include <linux/bio.h>
-
-#include <asm/uaccess.h>
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
 
 #include "power.h"
 
@@ -89,91 +72,15 @@ static int restore_highmem(void) { return 0; }
 static unsigned int count_highmem_pages(void) { return 0; }
 #endif
 
-extern char resume_file[];
-
-#define SWSUSP_SIG	"S1SUSPEND"
-
-static struct swsusp_header {
-	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
-	swp_entry_t image;
-	char	orig_sig[10];
-	char	sig[10];
-} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
-
-/*
- * Saving part...
- */
-
-static unsigned short root_swap = 0xffff;
-
-static int mark_swapfiles(swp_entry_t start)
-{
-	int error;
-
-	rw_swap_page_sync(READ,
-			  swp_entry(root_swap, 0),
-			  virt_to_page((unsigned long)&swsusp_header));
-	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
-	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
-		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
-		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
-		swsusp_header.image = start;
-		error = rw_swap_page_sync(WRITE,
-					  swp_entry(root_swap, 0),
-					  virt_to_page((unsigned long)
-						       &swsusp_header));
-	} else {
-		pr_debug("swsusp: Partition is not swap space.\n");
-		error = -ENODEV;
-	}
-	return error;
-}
-
-/**
- *	swsusp_swap_check - check if the resume device is a swap device
- *	and get its index (if so)
- */
-
-static int swsusp_swap_check(void) /* This is called before saving image */
-{
-	int res = swap_type_of(swsusp_resume_device);
-
-	if (res >= 0) {
-		root_swap = res;
-		return 0;
-	}
-	return res;
-}
-
-/**
- *	The bitmap is used for tracing allocated swap pages
- *
- *	The entire bitmap consists of a number of bitmap_page
- *	structures linked with the help of the .next member.
- *	Thus each page can be allocated individually, so we only
- *	need to make 0-order memory allocations to create
- *	the bitmap.
- */
-
-#define BITMAP_PAGE_SIZE	(PAGE_SIZE - sizeof(void *))
-#define BITMAP_PAGE_CHUNKS	(BITMAP_PAGE_SIZE / sizeof(long))
-#define BITS_PER_CHUNK		(sizeof(long) * 8)
-#define BITMAP_PAGE_BITS	(BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
-
-struct bitmap_page {
-	unsigned long		chunks[BITMAP_PAGE_CHUNKS];
-	struct bitmap_page	*next;
-};
-
 /**
  *	The following functions are used for tracing the allocated
  *	swap pages, so that they can be freed in case of an error.
  *
  *	The functions operate on a linked bitmap structure defined
- *	above
+ *	in power.h
  */
 
-static void free_bitmap(struct bitmap_page *bitmap)
+void free_bitmap(struct bitmap_page *bitmap)
 {
 	struct bitmap_page *bp;
 
@@ -184,7 +91,7 @@ static void free_bitmap(struct bitmap_page *bitmap)
 	}
 }
 
-static struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
+struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
 {
 	struct bitmap_page *bitmap, *bp;
 	unsigned int n;
@@ -227,7 +134,7 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
 	return 0;
 }
 
-static unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
+unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
 {
 	unsigned long offset;
 
@@ -241,7 +148,7 @@ static unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
 	return offset;
 }
 
-static void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
+void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
 {
 	unsigned int bit, n;
 	unsigned long test;
@@ -258,220 +165,6 @@ static void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
 	}
 }
 
-/**
- *	write_page - Write one page to given swap location.
- *	@buf:		Address we're writing.
- *	@offset:	Offset of the swap page we're writing to.
- */
-
-static int write_page(void *buf, unsigned long offset)
-{
-	swp_entry_t entry;
-	int error = -ENOSPC;
-
-	if (offset) {
-		entry = swp_entry(root_swap, offset);
-		error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
-	}
-	return error;
-}
-
-/*
- *	The swap map is a data structure used for keeping track of each page
- *	written to a swap partition.  It consists of many swap_map_page
- *	structures that contain each an array of MAP_PAGE_SIZE swap entries.
- *	These structures are stored on the swap and linked together with the
- *	help of the .next_swap member.
- *
- *	The swap map is created during suspend.  The swap map pages are
- *	allocated and populated one at a time, so we only need one memory
- *	page to set up the entire structure.
- *
- *	During resume we also only need to use one swap_map_page structure
- *	at a time.
- */
-
-#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)
-
-struct swap_map_page {
-	unsigned long		entries[MAP_PAGE_ENTRIES];
-	unsigned long		next_swap;
-};
-
-/**
- *	The swap_map_handle structure is used for handling swap in
- *	a file-alike way
- */
-
-struct swap_map_handle {
-	struct swap_map_page *cur;
-	unsigned long cur_swap;
-	struct bitmap_page *bitmap;
-	unsigned int k;
-};
-
-static void release_swap_writer(struct swap_map_handle *handle)
-{
-	if (handle->cur)
-		free_page((unsigned long)handle->cur);
-	handle->cur = NULL;
-	if (handle->bitmap)
-		free_bitmap(handle->bitmap);
-	handle->bitmap = NULL;
-}
-
-static int get_swap_writer(struct swap_map_handle *handle)
-{
-	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
-	if (!handle->cur)
-		return -ENOMEM;
-	handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
-	if (!handle->bitmap) {
-		release_swap_writer(handle);
-		return -ENOMEM;
-	}
-	handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
-	if (!handle->cur_swap) {
-		release_swap_writer(handle);
-		return -ENOSPC;
-	}
-	handle->k = 0;
-	return 0;
-}
-
-static int swap_write_page(struct swap_map_handle *handle, void *buf)
-{
-	int error;
-	unsigned long offset;
-
-	if (!handle->cur)
-		return -EINVAL;
-	offset = alloc_swap_page(root_swap, handle->bitmap);
-	error = write_page(buf, offset);
-	if (error)
-		return error;
-	handle->cur->entries[handle->k++] = offset;
-	if (handle->k >= MAP_PAGE_ENTRIES) {
-		offset = alloc_swap_page(root_swap, handle->bitmap);
-		if (!offset)
-			return -ENOSPC;
-		handle->cur->next_swap = offset;
-		error = write_page(handle->cur, handle->cur_swap);
-		if (error)
-			return error;
-		memset(handle->cur, 0, PAGE_SIZE);
-		handle->cur_swap = offset;
-		handle->k = 0;
-	}
-	return 0;
-}
-
-static int flush_swap_writer(struct swap_map_handle *handle)
-{
-	if (handle->cur && handle->cur_swap)
-		return write_page(handle->cur, handle->cur_swap);
-	else
-		return -EINVAL;
-}
-
-/**
- *	save_image - save the suspend image data
- */
-
-static int save_image(struct swap_map_handle *handle,
-                      struct snapshot_handle *snapshot,
-                      unsigned int nr_pages)
-{
-	unsigned int m;
-	int ret;
-	int error = 0;
-
-	printk("Saving image data pages (%u pages) ...     ", nr_pages);
-	m = nr_pages / 100;
-	if (!m)
-		m = 1;
-	nr_pages = 0;
-	do {
-		ret = snapshot_read_next(snapshot, PAGE_SIZE);
-		if (ret > 0) {
-			error = swap_write_page(handle, data_of(*snapshot));
-			if (error)
-				break;
-			if (!(nr_pages % m))
-				printk("\b\b\b\b%3d%%", nr_pages / m);
-			nr_pages++;
-		}
-	} while (ret > 0);
-	if (!error)
-		printk("\b\b\b\bdone\n");
-	return error;
-}
-
-/**
- *	enough_swap - Make sure we have enough swap to save the image.
- *
- *	Returns TRUE or FALSE after checking the total amount of swap
- *	space avaiable from the resume partition.
- */
-
-static int enough_swap(unsigned int nr_pages)
-{
-	unsigned int free_swap = count_swap_pages(root_swap, 1);
-
-	pr_debug("swsusp: free swap pages: %u\n", free_swap);
-	return free_swap > (nr_pages + PAGES_FOR_IO +
-		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
-}
-
-/**
- *	swsusp_write - Write entire image and metadata.
- *
- *	It is important _NOT_ to umount filesystems at this point. We want
- *	them synced (in case something goes wrong) but we DO not want to mark
- *	filesystem clean: it is not. (And it does not matter, if we resume
- *	correctly, we'll mark system clean, anyway.)
- */
-
-int swsusp_write(void)
-{
-	struct swap_map_handle handle;
-	struct snapshot_handle snapshot;
-	struct swsusp_info *header;
-	unsigned long start;
-	int error;
-
-	if ((error = swsusp_swap_check())) {
-		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
-		return error;
-	}
-	memset(&snapshot, 0, sizeof(struct snapshot_handle));
-	error = snapshot_read_next(&snapshot, PAGE_SIZE);
-	if (error < PAGE_SIZE)
-		return error < 0 ? error : -EFAULT;
-	header = (struct swsusp_info *)data_of(snapshot);
-	if (!enough_swap(header->pages)) {
-		printk(KERN_ERR "swsusp: Not enough free swap\n");
-		return -ENOSPC;
-	}
-	error = get_swap_writer(&handle);
-	if (!error) {
-		start = handle.cur_swap;
-		error = swap_write_page(&handle, header);
-	}
-	if (!error)
-		error = save_image(&handle, &snapshot, header->pages - 1);
-	if (!error) {
-		flush_swap_writer(&handle);
-		printk("S");
-		error = mark_swapfiles(swp_entry(root_swap, start));
-		printk("|\n");
-	}
-	if (error)
-		free_all_swap_pages(root_swap, handle.bitmap);
-	release_swap_writer(&handle);
-	return error;
-}
-
 /**
  *	swsusp_shrink_memory -  Try to free as much memory as needed
  *
@@ -578,252 +271,3 @@ int swsusp_resume(void)
 	local_irq_enable();
 	return error;
 }
-
-/*
- *	Using bio to read from swap.
- *	This code requires a bit more work than just using buffer heads
- *	but, it is the recommended way for 2.5/2.6.
- *	The following are to signal the beginning and end of I/O. Bios
- *	finish asynchronously, while we want them to happen synchronously.
- *	A simple atomic_t, and a wait loop take care of this problem.
- */
-
-static atomic_t io_done = ATOMIC_INIT(0);
-
-static int end_io(struct bio *bio, unsigned int num, int err)
-{
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-		panic("I/O error reading memory image");
-	atomic_set(&io_done, 0);
-	return 0;
-}
-
-static struct block_device *resume_bdev;
-
-/**
- *	submit - submit BIO request.
- *	@rw:	READ or WRITE.
- *	@off	physical offset of page.
- *	@page:	page we're reading or writing.
- *
- *	Straight from the textbook - allocate and initialize the bio.
- *	If we're writing, make sure the page is marked as dirty.
- *	Then submit it and wait.
- */
-
-static int submit(int rw, pgoff_t page_off, void *page)
-{
-	int error = 0;
-	struct bio *bio;
-
-	bio = bio_alloc(GFP_ATOMIC, 1);
-	if (!bio)
-		return -ENOMEM;
-	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
-	bio->bi_bdev = resume_bdev;
-	bio->bi_end_io = end_io;
-
-	if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
-		error = -EFAULT;
-		goto Done;
-	}
-
-
-	atomic_set(&io_done, 1);
-	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
-	while (atomic_read(&io_done))
-		yield();
-	if (rw == READ)
-		bio_set_pages_dirty(bio);
- Done:
-	bio_put(bio);
-	return error;
-}
-
-static int bio_read_page(pgoff_t page_off, void *page)
-{
-	return submit(READ, page_off, page);
-}
-
-static int bio_write_page(pgoff_t page_off, void *page)
-{
-	return submit(WRITE, page_off, page);
-}
-
-/**
- *	The following functions allow us to read data using a swap map
- *	in a file-alike way
- */
-
-static void release_swap_reader(struct swap_map_handle *handle)
-{
-	if (handle->cur)
-		free_page((unsigned long)handle->cur);
-	handle->cur = NULL;
-}
-
-static int get_swap_reader(struct swap_map_handle *handle,
-                                      swp_entry_t start)
-{
-	int error;
-
-	if (!swp_offset(start))
-		return -EINVAL;
-	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
-	if (!handle->cur)
-		return -ENOMEM;
-	error = bio_read_page(swp_offset(start), handle->cur);
-	if (error) {
-		release_swap_reader(handle);
-		return error;
-	}
-	handle->k = 0;
-	return 0;
-}
-
-static int swap_read_page(struct swap_map_handle *handle, void *buf)
-{
-	unsigned long offset;
-	int error;
-
-	if (!handle->cur)
-		return -EINVAL;
-	offset = handle->cur->entries[handle->k];
-	if (!offset)
-		return -EFAULT;
-	error = bio_read_page(offset, buf);
-	if (error)
-		return error;
-	if (++handle->k >= MAP_PAGE_ENTRIES) {
-		handle->k = 0;
-		offset = handle->cur->next_swap;
-		if (!offset)
-			release_swap_reader(handle);
-		else
-			error = bio_read_page(offset, handle->cur);
-	}
-	return error;
-}
-
-/**
- *	load_image - load the image using the swap map handle
- *	@handle and the snapshot handle @snapshot
- *	(assume there are @nr_pages pages to load)
- */
-
-static int load_image(struct swap_map_handle *handle,
-                      struct snapshot_handle *snapshot,
-                      unsigned int nr_pages)
-{
-	unsigned int m;
-	int ret;
-	int error = 0;
-
-	printk("Loading image data pages (%u pages) ...     ", nr_pages);
-	m = nr_pages / 100;
-	if (!m)
-		m = 1;
-	nr_pages = 0;
-	do {
-		ret = snapshot_write_next(snapshot, PAGE_SIZE);
-		if (ret > 0) {
-			error = swap_read_page(handle, data_of(*snapshot));
-			if (error)
-				break;
-			if (!(nr_pages % m))
-				printk("\b\b\b\b%3d%%", nr_pages / m);
-			nr_pages++;
-		}
-	} while (ret > 0);
-	if (!error)
-		printk("\b\b\b\bdone\n");
-	if (!snapshot_image_loaded(snapshot))
-		error = -ENODATA;
-	return error;
-}
-
-int swsusp_read(void)
-{
-	int error;
-	struct swap_map_handle handle;
-	struct snapshot_handle snapshot;
-	struct swsusp_info *header;
-	unsigned int nr_pages;
-
-	if (IS_ERR(resume_bdev)) {
-		pr_debug("swsusp: block device not initialised\n");
-		return PTR_ERR(resume_bdev);
-	}
-
-	memset(&snapshot, 0, sizeof(struct snapshot_handle));
-	error = snapshot_write_next(&snapshot, PAGE_SIZE);
-	if (error < PAGE_SIZE)
-		return error < 0 ? error : -EFAULT;
-	header = (struct swsusp_info *)data_of(snapshot);
-	error = get_swap_reader(&handle, swsusp_header.image);
-	if (!error)
-		error = swap_read_page(&handle, header);
-	if (!error) {
-		nr_pages = header->image_pages;
-		error = load_image(&handle, &snapshot, nr_pages);
-	}
-	release_swap_reader(&handle);
-
-	blkdev_put(resume_bdev);
-
-	if (!error)
-		pr_debug("swsusp: Reading resume file was successful\n");
-	else
-		pr_debug("swsusp: Error %d resuming\n", error);
-	return error;
-}
-
-/**
- *      swsusp_check - Check for swsusp signature in the resume device
- */
-
-int swsusp_check(void)
-{
-	int error;
-
-	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
-	if (!IS_ERR(resume_bdev)) {
-		set_blocksize(resume_bdev, PAGE_SIZE);
-		memset(&swsusp_header, 0, sizeof(swsusp_header));
-		if ((error = bio_read_page(0, &swsusp_header)))
-			return error;
-		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
-			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
-			/* Reset swap signature now */
-			error = bio_write_page(0, &swsusp_header);
-		} else {
-			return -EINVAL;
-		}
-		if (error)
-			blkdev_put(resume_bdev);
-		else
-			pr_debug("swsusp: Signature found, resuming\n");
-	} else {
-		error = PTR_ERR(resume_bdev);
-	}
-
-	if (error)
-		pr_debug("swsusp: Error %d check for resume file\n", error);
-
-	return error;
-}
-
-/**
- *	swsusp_close - close swap device.
- */
-
-void swsusp_close(void)
-{
-	if (IS_ERR(resume_bdev)) {
-		pr_debug("swsusp: block device not initialised\n");
-		return;
-	}
-
-	blkdev_put(resume_bdev);
-}
-- 
cgit v1.2.3


From 74c7e2efbe37378026f00ad9e7253796d7b2fc99 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Thu, 23 Mar 2006 03:00:01 -0800
Subject: [PATCH] kernel/power: move externs to header files

Move externs from C source files to header files.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c  | 11 -----------
 kernel/power/power.h |  6 +++++-
 2 files changed, 5 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4eb464b7134..4bd68f482f2 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,17 +22,6 @@
 #include "power.h"
 
 
-extern suspend_disk_method_t pm_disk_mode;
-
-extern int swsusp_shrink_memory(void);
-extern int swsusp_suspend(void);
-extern int swsusp_write(void);
-extern int swsusp_check(void);
-extern int swsusp_read(void);
-extern void swsusp_close(void);
-extern int swsusp_resume(void);
-
-
 static int noresume = 0;
 char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 089c84bed89..5d1abffbb9c 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -48,7 +48,6 @@ extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
 extern unsigned int count_data_pages(void);
-extern void swsusp_free(void);
 
 struct snapshot_handle {
 	loff_t		offset;
@@ -91,6 +90,11 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
 extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
 extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
 
+extern int swsusp_check(void);
 extern int swsusp_shrink_memory(void);
+extern void swsusp_free(void);
 extern int swsusp_suspend(void);
 extern int swsusp_resume(void);
+extern int swsusp_read(void);
+extern int swsusp_write(void);
+extern void swsusp_close(void);
-- 
cgit v1.2.3


From 543cc27d09643640cbc34189c03a40beb8227aef Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Thu, 23 Mar 2006 03:00:02 -0800
Subject: [PATCH] swsusp: documentation updates

Update suspend-to-RAM documentation with new machines, and makes message
when processes can't be stopped little clearer.  (In one case, waiting
longer actually did help).

From: "Rafael J. Wysocki" <rjw@sisk.pl>

  Warn in the documentation that data may be lost if there are some
  filesystems mounted from USB devices before suspend.

  [Thanks to Alan Stern for providing the answer to the question in the
  Q:-A: part.]

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index 28de118f7a0..02a1b3a9fa9 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -83,7 +83,7 @@ int freeze_processes(void)
 		yield();			/* Yield is okay here */
 		if (todo && time_after(jiffies, start_time + TIMEOUT)) {
 			printk( "\n" );
-			printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
+			printk(KERN_ERR " stopping tasks timed out (%d tasks remaining)\n", todo );
 			break;
 		}
 	} while(todo);
-- 
cgit v1.2.3


From 6e1819d615f24ce0726a7d0bd3dd0152d7b21654 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:03 -0800
Subject: [PATCH] swsusp: userland interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces a user space interface for swsusp.

The interface is based on a special character device, called the snapshot
device, that allows user space processes to perform suspend and resume-related
operations with the help of some ioctls and the read()/write() functions.
 Additionally it allows these processes to allocate free swap pages from a
selected swap partition, called the resume partition, so that they know which
sectors of the resume partition are available to them.

The interface uses the same low-level system memory snapshot-handling
functions that are used by the built-it swap-writing/reading code of swsusp.

The interface documentation is included in the patch.

The patch assumes that the major and minor numbers of the snapshot device will
be 10 (ie.  misc device) and 231, the registration of which has already been
requested.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/Makefile   |   2 +-
 kernel/power/power.h    |  14 +++
 kernel/power/snapshot.c |   9 +-
 kernel/power/user.c     | 301 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 321 insertions(+), 5 deletions(-)
 create mode 100644 kernel/power/user.c

(limited to 'kernel')

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index bb91a061530..8d0af3d37a4 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
 
 obj-y				:= main.o process.o console.o
 obj-$(CONFIG_PM_LEGACY)		+= pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o swap.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o swap.o user.o
 
 obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 5d1abffbb9c..42c431c8bdd 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -8,6 +8,7 @@ struct swsusp_info {
 	int			cpus;
 	unsigned long		image_pages;
 	unsigned long		pages;
+	unsigned long		size;
 } __attribute__((aligned(PAGE_SIZE)));
 
 
@@ -65,6 +66,19 @@ extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
 int snapshot_image_loaded(struct snapshot_handle *handle);
 
+#define SNAPSHOT_IOC_MAGIC	'3'
+#define SNAPSHOT_FREEZE			_IO(SNAPSHOT_IOC_MAGIC, 1)
+#define SNAPSHOT_UNFREEZE		_IO(SNAPSHOT_IOC_MAGIC, 2)
+#define SNAPSHOT_ATOMIC_SNAPSHOT	_IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
+#define SNAPSHOT_ATOMIC_RESTORE		_IO(SNAPSHOT_IOC_MAGIC, 4)
+#define SNAPSHOT_FREE			_IO(SNAPSHOT_IOC_MAGIC, 5)
+#define SNAPSHOT_SET_IMAGE_SIZE		_IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
+#define SNAPSHOT_AVAIL_SWAP		_IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
+#define SNAPSHOT_GET_SWAP_PAGE		_IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
+#define SNAPSHOT_FREE_SWAP_PAGES	_IO(SNAPSHOT_IOC_MAGIC, 9)
+#define SNAPSHOT_SET_SWAP_FILE		_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
+#define SNAPSHOT_IOC_MAXNR	10
+
 /**
  *	The bitmap is used for tracing allocated swap pages
  *
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cc349437fb7..0036955357e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -37,6 +37,7 @@
 struct pbe *pagedir_nosave;
 static unsigned int nr_copy_pages;
 static unsigned int nr_meta_pages;
+static unsigned long *buffer;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void)
@@ -389,7 +390,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
 		free_pagedir(pblist);
 		pblist = NULL;
         } else
-        	create_pbe_list(pblist, nr_pages);
+		create_pbe_list(pblist, nr_pages);
 	return pblist;
 }
 
@@ -418,6 +419,7 @@ void swsusp_free(void)
 	nr_copy_pages = 0;
 	nr_meta_pages = 0;
 	pagedir_nosave = NULL;
+	buffer = NULL;
 }
 
 
@@ -523,6 +525,8 @@ static void init_header(struct swsusp_info *info)
 	info->cpus = num_online_cpus();
 	info->image_pages = nr_copy_pages;
 	info->pages = nr_copy_pages + nr_meta_pages + 1;
+	info->size = info->pages;
+	info->size <<= PAGE_SHIFT;
 }
 
 /**
@@ -568,8 +572,6 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
 
 int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 {
-	static unsigned long *buffer;
-
 	if (handle->page > nr_meta_pages + nr_copy_pages)
 		return 0;
 	if (!buffer) {
@@ -774,7 +776,6 @@ static int create_image(struct snapshot_handle *handle)
 
 int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 {
-	static unsigned long *buffer;
 	int error = 0;
 
 	if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 00000000000..8cabc405ca1
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,301 @@
+/*
+ * linux/kernel/power/user.c
+ *
+ * This file provides the user space interface for software suspend/resume.
+ *
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+
+#include "power.h"
+
+#define SNAPSHOT_MINOR	231
+
+static struct snapshot_data {
+	struct snapshot_handle handle;
+	int swap;
+	struct bitmap_page *bitmap;
+	int mode;
+	char frozen;
+	char ready;
+} snapshot_state;
+
+static atomic_t device_available = ATOMIC_INIT(1);
+
+static int snapshot_open(struct inode *inode, struct file *filp)
+{
+	struct snapshot_data *data;
+
+	if (!atomic_add_unless(&device_available, -1, 0))
+		return -EBUSY;
+
+	if ((filp->f_flags & O_ACCMODE) == O_RDWR)
+		return -ENOSYS;
+
+	nonseekable_open(inode, filp);
+	data = &snapshot_state;
+	filp->private_data = data;
+	memset(&data->handle, 0, sizeof(struct snapshot_handle));
+	if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+		data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
+		data->mode = O_RDONLY;
+	} else {
+		data->swap = -1;
+		data->mode = O_WRONLY;
+	}
+	data->bitmap = NULL;
+	data->frozen = 0;
+	data->ready = 0;
+
+	return 0;
+}
+
+static int snapshot_release(struct inode *inode, struct file *filp)
+{
+	struct snapshot_data *data;
+
+	swsusp_free();
+	data = filp->private_data;
+	free_all_swap_pages(data->swap, data->bitmap);
+	free_bitmap(data->bitmap);
+	if (data->frozen) {
+		down(&pm_sem);
+		thaw_processes();
+		enable_nonboot_cpus();
+		up(&pm_sem);
+	}
+	atomic_inc(&device_available);
+	return 0;
+}
+
+static ssize_t snapshot_read(struct file *filp, char __user *buf,
+                             size_t count, loff_t *offp)
+{
+	struct snapshot_data *data;
+	ssize_t res;
+
+	data = filp->private_data;
+	res = snapshot_read_next(&data->handle, count);
+	if (res > 0) {
+		if (copy_to_user(buf, data_of(data->handle), res))
+			res = -EFAULT;
+		else
+			*offp = data->handle.offset;
+	}
+	return res;
+}
+
+static ssize_t snapshot_write(struct file *filp, const char __user *buf,
+                              size_t count, loff_t *offp)
+{
+	struct snapshot_data *data;
+	ssize_t res;
+
+	data = filp->private_data;
+	res = snapshot_write_next(&data->handle, count);
+	if (res > 0) {
+		if (copy_from_user(data_of(data->handle), buf, res))
+			res = -EFAULT;
+		else
+			*offp = data->handle.offset;
+	}
+	return res;
+}
+
+static int snapshot_ioctl(struct inode *inode, struct file *filp,
+                          unsigned int cmd, unsigned long arg)
+{
+	int error = 0;
+	struct snapshot_data *data;
+	loff_t offset, avail;
+
+	if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
+		return -ENOTTY;
+	if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
+		return -ENOTTY;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	data = filp->private_data;
+
+	switch (cmd) {
+
+	case SNAPSHOT_FREEZE:
+		if (data->frozen)
+			break;
+		sys_sync();
+		down(&pm_sem);
+		pm_prepare_console();
+		disable_nonboot_cpus();
+		if (freeze_processes()) {
+			thaw_processes();
+			enable_nonboot_cpus();
+			pm_restore_console();
+			error = -EBUSY;
+		}
+		up(&pm_sem);
+		if (!error)
+			data->frozen = 1;
+		break;
+
+	case SNAPSHOT_UNFREEZE:
+		if (!data->frozen)
+			break;
+		down(&pm_sem);
+		thaw_processes();
+		enable_nonboot_cpus();
+		pm_restore_console();
+		up(&pm_sem);
+		data->frozen = 0;
+		break;
+
+	case SNAPSHOT_ATOMIC_SNAPSHOT:
+		if (data->mode != O_RDONLY || !data->frozen  || data->ready) {
+			error = -EPERM;
+			break;
+		}
+		down(&pm_sem);
+		/* Free memory before shutting down devices. */
+		error = swsusp_shrink_memory();
+		if (!error) {
+			error = device_suspend(PMSG_FREEZE);
+			if (!error) {
+				in_suspend = 1;
+				error = swsusp_suspend();
+				device_resume();
+			}
+		}
+		up(&pm_sem);
+		if (!error)
+			error = put_user(in_suspend, (unsigned int __user *)arg);
+		if (!error)
+			data->ready = 1;
+		break;
+
+	case SNAPSHOT_ATOMIC_RESTORE:
+		if (data->mode != O_WRONLY || !data->frozen ||
+		    !snapshot_image_loaded(&data->handle)) {
+			error = -EPERM;
+			break;
+		}
+		down(&pm_sem);
+		pm_prepare_console();
+		error = device_suspend(PMSG_FREEZE);
+		if (!error) {
+			error = swsusp_resume();
+			device_resume();
+		}
+		pm_restore_console();
+		up(&pm_sem);
+		break;
+
+	case SNAPSHOT_FREE:
+		swsusp_free();
+		memset(&data->handle, 0, sizeof(struct snapshot_handle));
+		data->ready = 0;
+		break;
+
+	case SNAPSHOT_SET_IMAGE_SIZE:
+		image_size = arg;
+		break;
+
+	case SNAPSHOT_AVAIL_SWAP:
+		avail = count_swap_pages(data->swap, 1);
+		avail <<= PAGE_SHIFT;
+		error = put_user(avail, (loff_t __user *)arg);
+		break;
+
+	case SNAPSHOT_GET_SWAP_PAGE:
+		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+			error = -ENODEV;
+			break;
+		}
+		if (!data->bitmap) {
+			data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
+			if (!data->bitmap) {
+				error = -ENOMEM;
+				break;
+			}
+		}
+		offset = alloc_swap_page(data->swap, data->bitmap);
+		if (offset) {
+			offset <<= PAGE_SHIFT;
+			error = put_user(offset, (loff_t __user *)arg);
+		} else {
+			error = -ENOSPC;
+		}
+		break;
+
+	case SNAPSHOT_FREE_SWAP_PAGES:
+		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+			error = -ENODEV;
+			break;
+		}
+		free_all_swap_pages(data->swap, data->bitmap);
+		free_bitmap(data->bitmap);
+		data->bitmap = NULL;
+		break;
+
+	case SNAPSHOT_SET_SWAP_FILE:
+		if (!data->bitmap) {
+			/*
+			 * User space encodes device types as two-byte values,
+			 * so we need to recode them
+			 */
+			if (old_decode_dev(arg)) {
+				data->swap = swap_type_of(old_decode_dev(arg));
+				if (data->swap < 0)
+					error = -ENODEV;
+			} else {
+				data->swap = -1;
+				error = -EINVAL;
+			}
+		} else {
+			error = -EPERM;
+		}
+		break;
+
+	default:
+		error = -ENOTTY;
+
+	}
+
+	return error;
+}
+
+static struct file_operations snapshot_fops = {
+	.open = snapshot_open,
+	.release = snapshot_release,
+	.read = snapshot_read,
+	.write = snapshot_write,
+	.llseek = no_llseek,
+	.ioctl = snapshot_ioctl,
+};
+
+static struct miscdevice snapshot_device = {
+	.minor = SNAPSHOT_MINOR,
+	.name = "snapshot",
+	.fops = &snapshot_fops,
+};
+
+static int __init snapshot_device_init(void)
+{
+	return misc_register(&snapshot_device);
+};
+
+device_initcall(snapshot_device_init);
-- 
cgit v1.2.3


From 02aaeb9b952f30b1ad6284d5d45be02030f679db Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:04 -0800
Subject: [PATCH] swsusp: freeze user space processes first

Allow swsusp to freeze processes successfully under heavy load by freezing
userspace processes before kernel threads.

[Thanks to Nigel Cunningham <nigel@suspend2.net> for suggesting the
way to go.]

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c    |  1 -
 kernel/power/process.c | 61 +++++++++++++++++++++++++++++++++++++-------------
 kernel/power/user.c    |  1 -
 3 files changed, 46 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4bd68f482f2..81d4d982f3f 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -72,7 +72,6 @@ static int prepare_processes(void)
 	int error;
 
 	pm_prepare_console();
-	sys_sync();
 	disable_nonboot_cpus();
 
 	if (freeze_processes()) {
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 02a1b3a9fa9..8ac7c35fad7 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -12,11 +12,12 @@
 #include <linux/interrupt.h>
 #include <linux/suspend.h>
 #include <linux/module.h>
+#include <linux/syscalls.h>
 
 /* 
  * Timeout for stopping processes
  */
-#define TIMEOUT	(6 * HZ)
+#define TIMEOUT	(20 * HZ)
 
 
 static inline int freezeable(struct task_struct * p)
@@ -54,38 +55,62 @@ void refrigerator(void)
 	current->state = save;
 }
 
+static inline void freeze_process(struct task_struct *p)
+{
+	unsigned long flags;
+
+	if (!freezing(p)) {
+		freeze(p);
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		signal_wake_up(p, 0);
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	}
+}
+
 /* 0 = success, else # of processes that we failed to stop */
 int freeze_processes(void)
 {
-	int todo;
+	int todo, nr_user, user_frozen;
 	unsigned long start_time;
 	struct task_struct *g, *p;
 	unsigned long flags;
 
 	printk( "Stopping tasks: " );
 	start_time = jiffies;
+	user_frozen = 0;
 	do {
-		todo = 0;
+		nr_user = todo = 0;
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
 			if (!freezeable(p))
 				continue;
 			if (frozen(p))
 				continue;
-
-			freeze(p);
-			spin_lock_irqsave(&p->sighand->siglock, flags);
-			signal_wake_up(p, 0);
-			spin_unlock_irqrestore(&p->sighand->siglock, flags);
-			todo++;
+			if (p->mm && !(p->flags & PF_BORROWED_MM)) {
+				/* The task is a user-space one.
+				 * Freeze it unless there's a vfork completion
+				 * pending
+				 */
+				if (!p->vfork_done)
+					freeze_process(p);
+				nr_user++;
+			} else {
+				/* Freeze only if the user space is frozen */
+				if (user_frozen)
+					freeze_process(p);
+				todo++;
+			}
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
+		todo += nr_user;
+		if (!user_frozen && !nr_user) {
+			sys_sync();
+			start_time = jiffies;
+		}
+		user_frozen = !nr_user;
 		yield();			/* Yield is okay here */
-		if (todo && time_after(jiffies, start_time + TIMEOUT)) {
-			printk( "\n" );
-			printk(KERN_ERR " stopping tasks timed out (%d tasks remaining)\n", todo );
+		if (todo && time_after(jiffies, start_time + TIMEOUT))
 			break;
-		}
 	} while(todo);
 
 	/* This does not unfreeze processes that are already frozen
@@ -94,8 +119,14 @@ int freeze_processes(void)
 	 * but it cleans up leftover PF_FREEZE requests.
 	 */
 	if (todo) {
+		printk( "\n" );
+		printk(KERN_ERR " stopping tasks timed out "
+			"after %d seconds (%d tasks remaining):\n",
+			TIMEOUT / HZ, todo);
 		read_lock(&tasklist_lock);
-		do_each_thread(g, p)
+		do_each_thread(g, p) {
+			if (freezeable(p) && !frozen(p))
+				printk(KERN_ERR "  %s\n", p->comm);
 			if (freezing(p)) {
 				pr_debug("  clean up: %s\n", p->comm);
 				p->flags &= ~PF_FREEZE;
@@ -103,7 +134,7 @@ int freeze_processes(void)
 				recalc_sigpending_tsk(p);
 				spin_unlock_irqrestore(&p->sighand->siglock, flags);
 			}
-		while_each_thread(g, p);
+		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 		return todo;
 	}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 8cabc405ca1..a97406b86ef 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -138,7 +138,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 	case SNAPSHOT_FREEZE:
 		if (data->frozen)
 			break;
-		sys_sync();
 		down(&pm_sem);
 		pm_prepare_console();
 		disable_nonboot_cpus();
-- 
cgit v1.2.3


From ce6ed29f3136bc4b3644ecf4091d6390d444f628 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Thu, 23 Mar 2006 03:00:05 -0800
Subject: [PATCH] suspend: make progress printing prettier

Combination of printk/pr_debug led to <7> in the middle of the line, and we
printed way too many dots.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0036955357e..1b46c2da5a5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -83,7 +83,7 @@ static int save_highmem_zone(struct zone *zone)
 		void *kaddr;
 		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
 
-		if (!(pfn%1000))
+		if (!(pfn%10000))
 			printk(".");
 		if (!pfn_valid(pfn))
 			continue;
@@ -122,13 +122,14 @@ int save_highmem(void)
 	struct zone *zone;
 	int res = 0;
 
-	pr_debug("swsusp: Saving Highmem\n");
+	pr_debug("swsusp: Saving Highmem");
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			res = save_highmem_zone(zone);
 		if (res)
 			return res;
 	}
+	printk("\n");
 	return 0;
 }
 
-- 
cgit v1.2.3


From fc558a7496bfab3d29a68953b07a95883fdcfbb1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:05 -0800
Subject: [PATCH] swsusp: finally solve mysqld problem

This patch from Pavel moves userland freeze signals handling into more logical
place.  It now hits even with mysqld running.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index ea154104a00..dfb09ba5c01 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1922,6 +1922,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
 	sigset_t *mask = &current->blocked;
 	int signr = 0;
 
+	try_to_freeze();
+
 relock:
 	spin_lock_irq(&current->sighand->siglock);
 	for (;;) {
@@ -2307,7 +2309,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
 
 			timeout = schedule_timeout_interruptible(timeout);
 
-			try_to_freeze();
 			spin_lock_irq(&current->sighand->siglock);
 			sig = dequeue_signal(current, &these, &info);
 			current->blocked = current->real_blocked;
-- 
cgit v1.2.3


From e4e4d665560c75afb6060cb43bb6738777648ca1 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Thu, 23 Mar 2006 03:00:06 -0800
Subject: [PATCH] swsusp: drain high mem pages

Highmem could be in pcp list as well.

Signed-off-by: Shaohua Li<shaohua.li@intel.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1b46c2da5a5..c5863d02c89 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -123,6 +123,7 @@ int save_highmem(void)
 	int res = 0;
 
 	pr_debug("swsusp: Saving Highmem");
+	drain_local_pages();
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			res = save_highmem_zone(zone);
-- 
cgit v1.2.3


From 94c188d32996beac00426740974310e32f162c14 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:08 -0800
Subject: [PATCH] swsusp: let userland tools switch console on suspend

Remove the console-switching code from the suspend part of the swsusp userland
interface and let the userland tools switch the console.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/user.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index a97406b86ef..bbd4842104a 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -139,12 +139,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		if (data->frozen)
 			break;
 		down(&pm_sem);
-		pm_prepare_console();
 		disable_nonboot_cpus();
 		if (freeze_processes()) {
 			thaw_processes();
 			enable_nonboot_cpus();
-			pm_restore_console();
 			error = -EBUSY;
 		}
 		up(&pm_sem);
@@ -158,7 +156,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		down(&pm_sem);
 		thaw_processes();
 		enable_nonboot_cpus();
-		pm_restore_console();
 		up(&pm_sem);
 		data->frozen = 0;
 		break;
-- 
cgit v1.2.3


From 9b238205ba5d79a8a242d7a5ddb82b89e4dc4e48 Mon Sep 17 00:00:00 2001
From: Luca Tettamanti <kronos.it@gmail.com>
Date: Thu, 23 Mar 2006 03:00:09 -0800
Subject: [PATCH] swsusp: add s2ram ioctl to userland interface

Add the SNAPSHOT_S2RAM ioctl to the snapshot device.

This ioctl allows a userland application to make the system (previously frozen
with the SNAPSHOT_FREE ioctl) enter the S3 state without freezing processes
and disabling nonboot CPUs for the second time.

This will allow us to implement the suspend-to-disk-and-RAM (STDR)
functionality in the userland suspend tools.

Signed-off-by: Luca Tettamanti <kronos.it@gmail.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/main.c  |  2 +-
 kernel/power/power.h |  4 +++-
 kernel/power/user.c  | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9cb235cba4a..ee371f50cca 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state)
 }
 
 
-static int suspend_enter(suspend_state_t state)
+int suspend_enter(suspend_state_t state)
 {
 	int error = 0;
 	unsigned long flags;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 42c431c8bdd..f06f12f2176 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -77,7 +77,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle);
 #define SNAPSHOT_GET_SWAP_PAGE		_IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
 #define SNAPSHOT_FREE_SWAP_PAGES	_IO(SNAPSHOT_IOC_MAGIC, 9)
 #define SNAPSHOT_SET_SWAP_FILE		_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
-#define SNAPSHOT_IOC_MAXNR	10
+#define SNAPSHOT_S2RAM			_IO(SNAPSHOT_IOC_MAGIC, 11)
+#define SNAPSHOT_IOC_MAXNR	11
 
 /**
  *	The bitmap is used for tracing allocated swap pages
@@ -112,3 +113,4 @@ extern int swsusp_resume(void);
 extern int swsusp_read(void);
 extern int swsusp_write(void);
 extern void swsusp_close(void);
+extern int suspend_enter(suspend_state_t state);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bbd4842104a..3f1539fbe48 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -266,6 +266,42 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		}
 		break;
 
+	case SNAPSHOT_S2RAM:
+		if (!data->frozen) {
+			error = -EPERM;
+			break;
+		}
+
+		if (down_trylock(&pm_sem)) {
+			error = -EBUSY;
+			break;
+		}
+
+		if (pm_ops->prepare) {
+			error = pm_ops->prepare(PM_SUSPEND_MEM);
+			if (error)
+				goto OutS3;
+		}
+
+		/* Put devices to sleep */
+		error = device_suspend(PMSG_SUSPEND);
+		if (error) {
+			printk(KERN_ERR "Failed to suspend some devices.\n");
+		} else {
+			/* Enter S3, system is already frozen */
+			suspend_enter(PM_SUSPEND_MEM);
+
+			/* Wake up devices */
+			device_resume();
+		}
+
+		if (pm_ops->finish)
+			pm_ops->finish(PM_SUSPEND_MEM);
+
+OutS3:
+		up(&pm_sem);
+		break;
+
 	default:
 		error = -ENOTTY;
 
-- 
cgit v1.2.3


From 0c9e63fd38a2fb2181668a0cdd622a3c23cfd567 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 23 Mar 2006 03:00:12 -0800
Subject: [PATCH] Shrinks sizeof(files_struct) and better layout

1) Reduce the size of (struct fdtable) to exactly 64 bytes on 32bits
   platforms, lowering kmalloc() allocated space by 50%.

2) Reduce the size of (files_struct), using a special 32 bits (or
   64bits) embedded_fd_set, instead of a 1024 bits fd_set for the
   close_on_exec_init and open_fds_init fields.  This save some ram (248
   bytes per task) as most tasks dont open more than 32 files.  D-Cache
   footprint for such tasks is also reduced to the minimum.

3) Reduce size of allocated fdset.  Currently two full pages are
   allocated, that is 32768 bits on x86 for example, and way too much.  The
   minimum is now L1_CACHE_BYTES.

UP and SMP should benefit from this patch, because most tasks will touch
only one cache line when open()/close() stdin/stdout/stderr (0/1/2),
(next_fd, close_on_exec_init, open_fds_init, fd_array[0 ..  2] being in the
same cache line)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd7b65ee41..c79ae0b19a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -607,12 +607,12 @@ static struct files_struct *alloc_files(void)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	newf->next_fd = 0;
 	fdt = &newf->fdtab;
-	fdt->next_fd = 0;
 	fdt->max_fds = NR_OPEN_DEFAULT;
-	fdt->max_fdset = __FD_SETSIZE;
-	fdt->close_on_exec = &newf->close_on_exec_init;
-	fdt->open_fds = &newf->open_fds_init;
+	fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
+	fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
+	fdt->open_fds = (fd_set *)&newf->open_fds_init;
 	fdt->fd = &newf->fd_array[0];
 	INIT_RCU_HEAD(&fdt->rcu);
 	fdt->free_files = NULL;
-- 
cgit v1.2.3


From 2dd0ebcd2ab7b18a50c0810ddb45a84316e4ee2e Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Thu, 23 Mar 2006 03:00:13 -0800
Subject: [PATCH] Avoid taking global tasklist_lock for single threadedprocess
 at getrusage()

Avoid taking the global tasklist_lock when possible, if a process is single
threaded during getrusage().  Any avoidance of tasklist_lock is good for
NUMA boxes (and possibly for large SMPs).  Thanks to Oleg Nesterov for
review and suggestions.

Signed-off-by: Nippun Goel <nippung@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index f91218a5463..4941b9b14b9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1677,9 +1677,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
  * a lot simpler!  (Which we're not doing right now because we're not
  * measuring them yet).
  *
- * This expects to be called with tasklist_lock read-locked or better,
- * and the siglock not locked.  It may momentarily take the siglock.
- *
  * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
  * races with threads incrementing their own counters.  But since word
  * reads are atomic, we either get new values or old values and we don't
@@ -1687,6 +1684,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
  * the c* fields from p->signal from races with exit.c updating those
  * fields when reaping, so a sample either gets all the additions of a
  * given child after it's reaped, or none so this sample is before reaping.
+ *
+ * tasklist_lock locking optimisation:
+ * If we are current and single threaded, we do not need to take the tasklist
+ * lock or the siglock.  No one else can take our signal_struct away,
+ * no one else can reap the children to update signal->c* counters, and
+ * no one else can race with the signal-> fields.
+ * If we do not take the tasklist_lock, the signal-> fields could be read
+ * out of order while another thread was just exiting. So we place a
+ * read memory barrier when we avoid the lock.  On the writer side,
+ * write memory barrier is implied in  __exit_signal as __exit_signal releases
+ * the siglock spinlock after updating the signal-> fields.
+ *
+ * We don't really need the siglock when we access the non c* fields
+ * of the signal_struct (for RUSAGE_SELF) even in multithreaded
+ * case, since we take the tasklist lock for read and the non c* signal->
+ * fields are updated only in __exit_signal, which is called with
+ * tasklist_lock taken for write, hence these two threads cannot execute
+ * concurrently.
+ *
  */
 
 static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
@@ -1694,13 +1710,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	struct task_struct *t;
 	unsigned long flags;
 	cputime_t utime, stime;
+	int need_lock = 0;
 
 	memset((char *) r, 0, sizeof *r);
+	utime = stime = cputime_zero;
 
-	if (unlikely(!p->signal))
-		return;
+	if (p != current || !thread_group_empty(p))
+		need_lock = 1;
 
-	utime = stime = cputime_zero;
+	if (need_lock) {
+		read_lock(&tasklist_lock);
+		if (unlikely(!p->signal)) {
+			read_unlock(&tasklist_lock);
+			return;
+		}
+	} else
+		/* See locking comments above */
+		smp_rmb();
 
 	switch (who) {
 		case RUSAGE_BOTH:
@@ -1740,6 +1766,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			BUG();
 	}
 
+	if (need_lock)
+		read_unlock(&tasklist_lock);
 	cputime_to_timeval(utime, &r->ru_utime);
 	cputime_to_timeval(stime, &r->ru_stime);
 }
@@ -1747,9 +1775,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 {
 	struct rusage r;
-	read_lock(&tasklist_lock);
 	k_getrusage(p, who, &r);
-	read_unlock(&tasklist_lock);
 	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
 }
 
-- 
cgit v1.2.3


From 3d3f26a7baaa921a0e790b4c72d20f0de91a5d65 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:18 -0800
Subject: [PATCH] kernel/cpuset.c, mutex conversion

convert cpuset.c's callback_sem and manage_sem to mutexes.
Build and boot tested by Ingo.
Build, boot, unit and stress tested by pj.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 212 +++++++++++++++++++++++++++-----------------------------
 1 file changed, 103 insertions(+), 109 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 12815d3f1a0..c86ee051b73 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -53,7 +53,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 #define CPUSET_SUPER_MAGIC		0x27e0eb
 
@@ -168,63 +168,57 @@ static struct vfsmount *cpuset_mount;
 static struct super_block *cpuset_sb;
 
 /*
- * We have two global cpuset semaphores below.  They can nest.
- * It is ok to first take manage_sem, then nest callback_sem.  We also
+ * We have two global cpuset mutexes below.  They can nest.
+ * It is ok to first take manage_mutex, then nest callback_mutex.  We also
  * require taking task_lock() when dereferencing a tasks cpuset pointer.
  * See "The task_lock() exception", at the end of this comment.
  *
- * A task must hold both semaphores to modify cpusets.  If a task
- * holds manage_sem, then it blocks others wanting that semaphore,
- * ensuring that it is the only task able to also acquire callback_sem
+ * A task must hold both mutexes to modify cpusets.  If a task
+ * holds manage_mutex, then it blocks others wanting that mutex,
+ * ensuring that it is the only task able to also acquire callback_mutex
  * and be able to modify cpusets.  It can perform various checks on
  * the cpuset structure first, knowing nothing will change.  It can
- * also allocate memory while just holding manage_sem.  While it is
+ * also allocate memory while just holding manage_mutex.  While it is
  * performing these checks, various callback routines can briefly
- * acquire callback_sem to query cpusets.  Once it is ready to make
- * the changes, it takes callback_sem, blocking everyone else.
+ * acquire callback_mutex to query cpusets.  Once it is ready to make
+ * the changes, it takes callback_mutex, blocking everyone else.
  *
  * Calls to the kernel memory allocator can not be made while holding
- * callback_sem, as that would risk double tripping on callback_sem
+ * callback_mutex, as that would risk double tripping on callback_mutex
  * from one of the callbacks into the cpuset code from within
  * __alloc_pages().
  *
- * If a task is only holding callback_sem, then it has read-only
+ * If a task is only holding callback_mutex, then it has read-only
  * access to cpusets.
  *
  * The task_struct fields mems_allowed and mems_generation may only
  * be accessed in the context of that task, so require no locks.
  *
  * Any task can increment and decrement the count field without lock.
- * So in general, code holding manage_sem or callback_sem can't rely
+ * So in general, code holding manage_mutex or callback_mutex can't rely
  * on the count field not changing.  However, if the count goes to
- * zero, then only attach_task(), which holds both semaphores, can
+ * zero, then only attach_task(), which holds both mutexes, can
  * increment it again.  Because a count of zero means that no tasks
  * are currently attached, therefore there is no way a task attached
  * to that cpuset can fork (the other way to increment the count).
- * So code holding manage_sem or callback_sem can safely assume that
+ * So code holding manage_mutex or callback_mutex can safely assume that
  * if the count is zero, it will stay zero.  Similarly, if a task
- * holds manage_sem or callback_sem on a cpuset with zero count, it
+ * holds manage_mutex or callback_mutex on a cpuset with zero count, it
  * knows that the cpuset won't be removed, as cpuset_rmdir() needs
- * both of those semaphores.
- *
- * A possible optimization to improve parallelism would be to make
- * callback_sem a R/W semaphore (rwsem), allowing the callback routines
- * to proceed in parallel, with read access, until the holder of
- * manage_sem needed to take this rwsem for exclusive write access
- * and modify some cpusets.
+ * both of those mutexes.
  *
  * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds manage_sem across the entire operation,
+ * the cpuset hierarchy holds manage_mutex across the entire operation,
  * single threading all such cpuset modifications across the system.
  *
- * The cpuset_common_file_read() handlers only hold callback_sem across
+ * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
  *
  * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
- * (usually) take either semaphore.  These are the two most performance
+ * (usually) take either mutex.  These are the two most performance
  * critical pieces of code here.  The exception occurs on cpuset_exit(),
- * when a task in a notify_on_release cpuset exits.  Then manage_sem
+ * when a task in a notify_on_release cpuset exits.  Then manage_mutex
  * is taken, and if the cpuset count is zero, a usermode call made
  * to /sbin/cpuset_release_agent with the name of the cpuset (path
  * relative to the root of cpuset file system) as the argument.
@@ -242,9 +236,9 @@ static struct super_block *cpuset_sb;
  *
  * The need for this exception arises from the action of attach_task(),
  * which overwrites one tasks cpuset pointer with another.  It does
- * so using both semaphores, however there are several performance
+ * so using both mutexes, however there are several performance
  * critical places that need to reference task->cpuset without the
- * expense of grabbing a system global semaphore.  Therefore except as
+ * expense of grabbing a system global mutex.  Therefore except as
  * noted below, when dereferencing or, as in attach_task(), modifying
  * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
  * (task->alloc_lock) already in the task_struct routinely used for
@@ -256,8 +250,8 @@ static struct super_block *cpuset_sb;
  * the routine cpuset_update_task_memory_state().
  */
 
-static DECLARE_MUTEX(manage_sem);
-static DECLARE_MUTEX(callback_sem);
+static DEFINE_MUTEX(manage_mutex);
+static DEFINE_MUTEX(callback_mutex);
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -432,7 +426,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 }
 
 /*
- * Call with manage_sem held.  Writes path of cpuset into buf.
+ * Call with manage_mutex held.  Writes path of cpuset into buf.
  * Returns 0 on success, -errno on error.
  */
 
@@ -484,11 +478,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
  * status of the /sbin/cpuset_release_agent task, so no sense holding
  * our caller up for that.
  *
- * When we had only one cpuset semaphore, we had to call this
+ * When we had only one cpuset mutex, we had to call this
  * without holding it, to avoid deadlock when call_usermodehelper()
  * allocated memory.  With two locks, we could now call this while
- * holding manage_sem, but we still don't, so as to minimize
- * the time manage_sem is held.
+ * holding manage_mutex, but we still don't, so as to minimize
+ * the time manage_mutex is held.
  */
 
 static void cpuset_release_agent(const char *pathbuf)
@@ -520,15 +514,15 @@ static void cpuset_release_agent(const char *pathbuf)
  * cs is notify_on_release() and now both the user count is zero and
  * the list of children is empty, prepare cpuset path in a kmalloc'd
  * buffer, to be returned via ppathbuf, so that the caller can invoke
- * cpuset_release_agent() with it later on, once manage_sem is dropped.
- * Call here with manage_sem held.
+ * cpuset_release_agent() with it later on, once manage_mutex is dropped.
+ * Call here with manage_mutex held.
  *
  * This check_for_release() routine is responsible for kmalloc'ing
  * pathbuf.  The above cpuset_release_agent() is responsible for
  * kfree'ing pathbuf.  The caller of these routines is responsible
  * for providing a pathbuf pointer, initialized to NULL, then
- * calling check_for_release() with manage_sem held and the address
- * of the pathbuf pointer, then dropping manage_sem, then calling
+ * calling check_for_release() with manage_mutex held and the address
+ * of the pathbuf pointer, then dropping manage_mutex, then calling
  * cpuset_release_agent() with pathbuf, as set by check_for_release().
  */
 
@@ -559,7 +553,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_map.
  *
- * Call with callback_sem held.
+ * Call with callback_mutex held.
  */
 
 static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -583,7 +577,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
  * One way or another, we guarantee to return some non-empty subset
  * of node_online_map.
  *
- * Call with callback_sem held.
+ * Call with callback_mutex held.
  */
 
 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -608,12 +602,12 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * current->cpuset if a task has its memory placement changed.
  * Do not call this routine if in_interrupt().
  *
- * Call without callback_sem or task_lock() held.  May be called
- * with or without manage_sem held.  Doesn't need task_lock to guard
+ * Call without callback_mutex or task_lock() held.  May be called
+ * with or without manage_mutex held.  Doesn't need task_lock to guard
  * against another task changing a non-NULL cpuset pointer to NULL,
  * as that is only done by a task on itself, and if the current task
  * is here, it is not simultaneously in the exit code NULL'ing its
- * cpuset pointer.  This routine also might acquire callback_sem and
+ * cpuset pointer.  This routine also might acquire callback_mutex and
  * current->mm->mmap_sem during call.
  *
  * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -658,13 +652,13 @@ void cpuset_update_task_memory_state(void)
 	}
 
 	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
-		down(&callback_sem);
+		mutex_lock(&callback_mutex);
 		task_lock(tsk);
 		cs = tsk->cpuset;	/* Maybe changed when task not locked */
 		guarantee_online_mems(cs, &tsk->mems_allowed);
 		tsk->cpuset_mems_generation = cs->mems_generation;
 		task_unlock(tsk);
-		up(&callback_sem);
+		mutex_unlock(&callback_mutex);
 		mpol_rebind_task(tsk, &tsk->mems_allowed);
 	}
 }
@@ -674,7 +668,7 @@ void cpuset_update_task_memory_state(void)
  *
  * One cpuset is a subset of another if all its allowed CPUs and
  * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set.  Call holding manage_sem.
+ * are only set if the other's are set.  Call holding manage_mutex.
  */
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -692,7 +686,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  * If we replaced the flag and mask values of the current cpuset
  * (cur) with those values in the trial cpuset (trial), would
  * our various subset and exclusive rules still be valid?  Presumes
- * manage_sem held.
+ * manage_mutex held.
  *
  * 'cur' is the address of an actual, in-use cpuset.  Operations
  * such as list traversal that depend on the actual address of the
@@ -746,7 +740,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  *    exclusive child cpusets
  * Build these two partitions by calling partition_sched_domains
  *
- * Call with manage_sem held.  May nest a call to the
+ * Call with manage_mutex held.  May nest a call to the
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
  */
 
@@ -792,7 +786,7 @@ static void update_cpu_domains(struct cpuset *cur)
 }
 
 /*
- * Call with manage_sem held.  May take callback_sem during call.
+ * Call with manage_mutex held.  May take callback_mutex during call.
  */
 
 static int update_cpumask(struct cpuset *cs, char *buf)
@@ -811,9 +805,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (retval < 0)
 		return retval;
 	cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	cs->cpus_allowed = trialcs.cpus_allowed;
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 	if (is_cpu_exclusive(cs) && !cpus_unchanged)
 		update_cpu_domains(cs);
 	return 0;
@@ -827,7 +821,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
  * the cpuset is marked 'memory_migrate', migrate the tasks
  * pages to the new memory.
  *
- * Call with manage_sem held.  May take callback_sem during call.
+ * Call with manage_mutex held.  May take callback_mutex during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
  * their mempolicies to the cpusets new mems_allowed.
@@ -862,11 +856,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	if (retval < 0)
 		goto done;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs.mems_allowed;
 	atomic_inc(&cpuset_mems_generation);
 	cs->mems_generation = atomic_read(&cpuset_mems_generation);
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	set_cpuset_being_rebound(cs);		/* causes mpol_copy() rebind */
 
@@ -922,7 +916,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	 * tasklist_lock.  Forks can happen again now - the mpol_copy()
 	 * cpuset_being_rebound check will catch such forks, and rebind
 	 * their vma mempolicies too.  Because we still hold the global
-	 * cpuset manage_sem, we know that no other rebind effort will
+	 * cpuset manage_mutex, we know that no other rebind effort will
 	 * be contending for the global variable cpuset_being_rebound.
 	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
 	 * is idempotent.  Also migrate pages in each mm to new nodes.
@@ -948,7 +942,7 @@ done:
 }
 
 /*
- * Call with manage_sem held.
+ * Call with manage_mutex held.
  */
 
 static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -967,7 +961,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
  * cs:	the cpuset to update
  * buf:	the buffer where we read the 0 or 1
  *
- * Call with manage_sem held.
+ * Call with manage_mutex held.
  */
 
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -989,12 +983,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 		return err;
 	cpu_exclusive_changed =
 		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	if (turning_on)
 		set_bit(bit, &cs->flags);
 	else
 		clear_bit(bit, &cs->flags);
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	if (cpu_exclusive_changed)
                 update_cpu_domains(cs);
@@ -1104,7 +1098,7 @@ static int fmeter_getrate(struct fmeter *fmp)
  * writing the path of the old cpuset in 'ppathbuf' if it needs to be
  * notified on release.
  *
- * Call holding manage_sem.  May take callback_sem and task_lock of
+ * Call holding manage_mutex.  May take callback_mutex and task_lock of
  * the task 'pid' during call.
  */
 
@@ -1144,13 +1138,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 		get_task_struct(tsk);
 	}
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
 	if (!oldcs) {
 		task_unlock(tsk);
-		up(&callback_sem);
+		mutex_unlock(&callback_mutex);
 		put_task_struct(tsk);
 		return -ESRCH;
 	}
@@ -1164,7 +1158,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	from = oldcs->mems_allowed;
 	to = cs->mems_allowed;
 
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	mm = get_task_mm(tsk);
 	if (mm) {
@@ -1221,7 +1215,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	}
 	buffer[nbytes] = 0;	/* nul-terminate */
 
-	down(&manage_sem);
+	mutex_lock(&manage_mutex);
 
 	if (is_removed(cs)) {
 		retval = -ENODEV;
@@ -1264,7 +1258,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	if (retval == 0)
 		retval = nbytes;
 out2:
-	up(&manage_sem);
+	mutex_unlock(&manage_mutex);
 	cpuset_release_agent(pathbuf);
 out1:
 	kfree(buffer);
@@ -1304,9 +1298,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
 	cpumask_t mask;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	mask = cs->cpus_allowed;
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -1315,9 +1309,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
 	nodemask_t mask;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	mask = cs->mems_allowed;
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -1598,7 +1592,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
  * Handle an open on 'tasks' file.  Prepare a buffer listing the
  * process id's of tasks currently attached to the cpuset being opened.
  *
- * Does not require any specific cpuset semaphores, and does not take any.
+ * Does not require any specific cpuset mutexes, and does not take any.
  */
 static int cpuset_tasks_open(struct inode *unused, struct file *file)
 {
@@ -1754,7 +1748,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
  *	name:		name of the new cpuset. Will be strcpy'ed.
  *	mode:		mode to set on new inode
  *
- *	Must be called with the semaphore on the parent inode held
+ *	Must be called with the mutex on the parent inode held
  */
 
 static long cpuset_create(struct cpuset *parent, const char *name, int mode)
@@ -1766,7 +1760,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	if (!cs)
 		return -ENOMEM;
 
-	down(&manage_sem);
+	mutex_lock(&manage_mutex);
 	cpuset_update_task_memory_state();
 	cs->flags = 0;
 	if (notify_on_release(parent))
@@ -1782,28 +1776,28 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 
 	cs->parent = parent;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	list_add(&cs->sibling, &cs->parent->children);
 	number_of_cpusets++;
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	err = cpuset_create_dir(cs, name, mode);
 	if (err < 0)
 		goto err;
 
 	/*
-	 * Release manage_sem before cpuset_populate_dir() because it
+	 * Release manage_mutex before cpuset_populate_dir() because it
 	 * will down() this new directory's i_mutex and if we race with
 	 * another mkdir, we might deadlock.
 	 */
-	up(&manage_sem);
+	mutex_unlock(&manage_mutex);
 
 	err = cpuset_populate_dir(cs->dentry);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 	return 0;
 err:
 	list_del(&cs->sibling);
-	up(&manage_sem);
+	mutex_unlock(&manage_mutex);
 	kfree(cs);
 	return err;
 }
@@ -1825,18 +1819,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	/* the vfs holds both inode->i_mutex already */
 
-	down(&manage_sem);
+	mutex_lock(&manage_mutex);
 	cpuset_update_task_memory_state();
 	if (atomic_read(&cs->count) > 0) {
-		up(&manage_sem);
+		mutex_unlock(&manage_mutex);
 		return -EBUSY;
 	}
 	if (!list_empty(&cs->children)) {
-		up(&manage_sem);
+		mutex_unlock(&manage_mutex);
 		return -EBUSY;
 	}
 	parent = cs->parent;
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	set_bit(CS_REMOVED, &cs->flags);
 	if (is_cpu_exclusive(cs))
 		update_cpu_domains(cs);
@@ -1848,10 +1842,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	cpuset_d_remove_dir(d);
 	dput(d);
 	number_of_cpusets--;
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 	if (list_empty(&parent->children))
 		check_for_release(parent, &pathbuf);
-	up(&manage_sem);
+	mutex_unlock(&manage_mutex);
 	cpuset_release_agent(pathbuf);
 	return 0;
 }
@@ -1960,19 +1954,19 @@ void cpuset_fork(struct task_struct *child)
  * Description: Detach cpuset from @tsk and release it.
  *
  * Note that cpusets marked notify_on_release force every task in
- * them to take the global manage_sem semaphore when exiting.
+ * them to take the global manage_mutex mutex when exiting.
  * This could impact scaling on very large systems.  Be reluctant to
  * use notify_on_release cpusets where very high task exit scaling
  * is required on large systems.
  *
  * Don't even think about derefencing 'cs' after the cpuset use count
- * goes to zero, except inside a critical section guarded by manage_sem
- * or callback_sem.   Otherwise a zero cpuset use count is a license to
+ * goes to zero, except inside a critical section guarded by manage_mutex
+ * or callback_mutex.   Otherwise a zero cpuset use count is a license to
  * any other task to nuke the cpuset immediately, via cpuset_rmdir().
  *
- * This routine has to take manage_sem, not callback_sem, because
- * it is holding that semaphore while calling check_for_release(),
- * which calls kmalloc(), so can't be called holding callback__sem().
+ * This routine has to take manage_mutex, not callback_mutex, because
+ * it is holding that mutex while calling check_for_release(),
+ * which calls kmalloc(), so can't be called holding callback_mutex().
  *
  * We don't need to task_lock() this reference to tsk->cpuset,
  * because tsk is already marked PF_EXITING, so attach_task() won't
@@ -2022,10 +2016,10 @@ void cpuset_exit(struct task_struct *tsk)
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
 
-		down(&manage_sem);
+		mutex_lock(&manage_mutex);
 		if (atomic_dec_and_test(&cs->count))
 			check_for_release(cs, &pathbuf);
-		up(&manage_sem);
+		mutex_unlock(&manage_mutex);
 		cpuset_release_agent(pathbuf);
 	} else {
 		atomic_dec(&cs->count);
@@ -2046,11 +2040,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
 {
 	cpumask_t mask;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	task_lock(tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
 	task_unlock(tsk);
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	return mask;
 }
@@ -2074,11 +2068,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
 	nodemask_t mask;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	task_lock(tsk);
 	guarantee_online_mems(tsk->cpuset, &mask);
 	task_unlock(tsk);
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	return mask;
 }
@@ -2104,7 +2098,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 
 /*
  * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
- * ancestor to the specified cpuset.  Call holding callback_sem.
+ * ancestor to the specified cpuset.  Call holding callback_mutex.
  * If no ancestor is mem_exclusive (an unusual configuration), then
  * returns the root cpuset.
  */
@@ -2131,12 +2125,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * GFP_KERNEL allocations are not so marked, so can escape to the
  * nearest mem_exclusive ancestor cpuset.
  *
- * Scanning up parent cpusets requires callback_sem.  The __alloc_pages()
+ * Scanning up parent cpusets requires callback_mutex.  The __alloc_pages()
  * routine only calls here with __GFP_HARDWALL bit _not_ set if
  * it's a GFP_KERNEL allocation, and all nodes in the current tasks
  * mems_allowed came up empty on the first pass over the zonelist.
  * So only GFP_KERNEL allocations, if all nodes in the cpuset are
- * short of memory, might require taking the callback_sem semaphore.
+ * short of memory, might require taking the callback_mutex mutex.
  *
  * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
  * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -2171,31 +2165,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 		return 1;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 
 	task_lock(current);
 	cs = nearest_exclusive_ancestor(current->cpuset);
 	task_unlock(current);
 
 	allowed = node_isset(node, cs->mems_allowed);
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 	return allowed;
 }
 
 /**
  * cpuset_lock - lock out any changes to cpuset structures
  *
- * The out of memory (oom) code needs to lock down cpusets
+ * The out of memory (oom) code needs to mutex_lock cpusets
  * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset.  Expose callback_sem via this
+ * task in an overlapping cpuset.  Expose callback_mutex via this
  * cpuset_lock() routine, so the oom code can lock it, before
  * locking the task list.  The tasklist_lock is a spinlock, so
- * must be taken inside callback_sem.
+ * must be taken inside callback_mutex.
  */
 
 void cpuset_lock(void)
 {
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 }
 
 /**
@@ -2206,7 +2200,7 @@ void cpuset_lock(void)
 
 void cpuset_unlock(void)
 {
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 }
 
 /**
@@ -2218,7 +2212,7 @@ void cpuset_unlock(void)
  * determine if task @p's memory usage might impact the memory
  * available to the current task.
  *
- * Call while holding callback_sem.
+ * Call while holding callback_mutex.
  **/
 
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2289,7 +2283,7 @@ void __cpuset_memory_pressure_bump(void)
  *  - Used for /proc/<pid>/cpuset.
  *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
  *    doesn't really matter if tsk->cpuset changes after we read it,
- *    and we take manage_sem, keeping attach_task() from changing it
+ *    and we take manage_mutex, keeping attach_task() from changing it
  *    anyway.
  */
 
@@ -2305,7 +2299,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 		return -ENOMEM;
 
 	tsk = m->private;
-	down(&manage_sem);
+	mutex_lock(&manage_mutex);
 	cs = tsk->cpuset;
 	if (!cs) {
 		retval = -EINVAL;
@@ -2318,7 +2312,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
 out:
-	up(&manage_sem);
+	mutex_unlock(&manage_mutex);
 	kfree(buf);
 	return retval;
 }
-- 
cgit v1.2.3


From 9331b3157c835353dd28efcd80d23563ad226aee Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:19 -0800
Subject: [PATCH] convert kernel/rcupdate.c:rcu_barrier_sema to mutex

Convert kernel/rcupdate's rcu_barrier_sema to mutex.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcupdate.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index fedf5e36975..af8a2a57e17 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -47,6 +47,7 @@
 #include <linux/notifier.h>
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
+#include <linux/mutex.h>
 
 /* Definition for rcupdate control block. */
 struct rcu_ctrlblk rcu_ctrlblk = {
@@ -75,7 +76,7 @@ static int rsinterval = 1000;
 #endif
 
 static atomic_t rcu_barrier_cpu_count;
-static struct semaphore rcu_barrier_sema;
+static DEFINE_MUTEX(rcu_barrier_mutex);
 static struct completion rcu_barrier_completion;
 
 #ifdef CONFIG_SMP
@@ -207,13 +208,13 @@ static void rcu_barrier_func(void *notused)
 void rcu_barrier(void)
 {
 	BUG_ON(in_interrupt());
-	/* Take cpucontrol semaphore to protect against CPU hotplug */
-	down(&rcu_barrier_sema);
+	/* Take cpucontrol mutex to protect against CPU hotplug */
+	mutex_lock(&rcu_barrier_mutex);
 	init_completion(&rcu_barrier_completion);
 	atomic_set(&rcu_barrier_cpu_count, 0);
 	on_each_cpu(rcu_barrier_func, NULL, 0, 1);
 	wait_for_completion(&rcu_barrier_completion);
-	up(&rcu_barrier_sema);
+	mutex_unlock(&rcu_barrier_mutex);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
@@ -549,7 +550,6 @@ static struct notifier_block __devinitdata rcu_nb = {
  */
 void __init rcu_init(void)
 {
-	sema_init(&rcu_barrier_sema, 1);
 	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
 	/* Register notifier for non-boot CPUs */
-- 
cgit v1.2.3


From 97d1f15b7ef52c1e9c28dc48b454024bb53a5fd2 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Thu, 23 Mar 2006 03:00:24 -0800
Subject: [PATCH] sem2mutex: kernel/

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kthread.c      |  7 ++++---
 kernel/module.c       | 15 ++++++++-------
 kernel/posix-timers.c |  1 +
 kernel/power/pm.c     | 21 +++++++++++----------
 kernel/profile.c      | 11 ++++++-----
 5 files changed, 30 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index e75950a1092..6a5373868a9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,6 +12,7 @@
 #include <linux/unistd.h>
 #include <linux/file.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <asm/semaphore.h>
 
 /*
@@ -41,7 +42,7 @@ struct kthread_stop_info
 
 /* Thread stopping is done by setthing this var: lock serializes
  * multiple kthread_stop calls. */
-static DECLARE_MUTEX(kthread_stop_lock);
+static DEFINE_MUTEX(kthread_stop_lock);
 static struct kthread_stop_info kthread_stop_info;
 
 int kthread_should_stop(void)
@@ -173,7 +174,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 {
 	int ret;
 
-	down(&kthread_stop_lock);
+	mutex_lock(&kthread_stop_lock);
 
 	/* It could exit after stop_info.k set, but before wake_up_process. */
 	get_task_struct(k);
@@ -194,7 +195,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 	wait_for_completion(&kthread_stop_info.done);
 	kthread_stop_info.k = NULL;
 	ret = kthread_stop_info.err;
-	up(&kthread_stop_lock);
+	mutex_unlock(&kthread_stop_lock);
 
 	return ret;
 }
diff --git a/kernel/module.c b/kernel/module.c
index 77764f22f02..de6312da6bb 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -39,6 +39,7 @@
 #include <linux/device.h>
 #include <linux/string.h>
 #include <linux/sched.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 #include <asm/cacheflush.h>
@@ -63,15 +64,15 @@ static DEFINE_SPINLOCK(modlist_lock);
 static DECLARE_MUTEX(module_mutex);
 static LIST_HEAD(modules);
 
-static DECLARE_MUTEX(notify_mutex);
+static DEFINE_MUTEX(notify_mutex);
 static struct notifier_block * module_notify_list;
 
 int register_module_notifier(struct notifier_block * nb)
 {
 	int err;
-	down(&notify_mutex);
+	mutex_lock(&notify_mutex);
 	err = notifier_chain_register(&module_notify_list, nb);
-	up(&notify_mutex);
+	mutex_unlock(&notify_mutex);
 	return err;
 }
 EXPORT_SYMBOL(register_module_notifier);
@@ -79,9 +80,9 @@ EXPORT_SYMBOL(register_module_notifier);
 int unregister_module_notifier(struct notifier_block * nb)
 {
 	int err;
-	down(&notify_mutex);
+	mutex_lock(&notify_mutex);
 	err = notifier_chain_unregister(&module_notify_list, nb);
-	up(&notify_mutex);
+	mutex_unlock(&notify_mutex);
 	return err;
 }
 EXPORT_SYMBOL(unregister_module_notifier);
@@ -1989,9 +1990,9 @@ sys_init_module(void __user *umod,
 	/* Drop lock so they can recurse */
 	up(&module_mutex);
 
-	down(&notify_mutex);
+	mutex_lock(&notify_mutex);
 	notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
-	up(&notify_mutex);
+	mutex_unlock(&notify_mutex);
 
 	/* Start the module */
 	if (mod->init != NULL)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index fa895fc2ecf..9944379360b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -35,6 +35,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/mutex.h>
 
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 33c508e857d..0f6908cce1d 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -25,6 +25,7 @@
 #include <linux/pm.h>
 #include <linux/pm_legacy.h>
 #include <linux/interrupt.h>
+#include <linux/mutex.h>
 
 int pm_active;
 
@@ -40,7 +41,7 @@ int pm_active;
  *	until a resume but that will be fine.
  */
  
-static DECLARE_MUTEX(pm_devs_lock);
+static DEFINE_MUTEX(pm_devs_lock);
 static LIST_HEAD(pm_devs);
 
 /**
@@ -67,9 +68,9 @@ struct pm_dev *pm_register(pm_dev_t type,
 		dev->id = id;
 		dev->callback = callback;
 
-		down(&pm_devs_lock);
+		mutex_lock(&pm_devs_lock);
 		list_add(&dev->entry, &pm_devs);
-		up(&pm_devs_lock);
+		mutex_unlock(&pm_devs_lock);
 	}
 	return dev;
 }
@@ -85,9 +86,9 @@ struct pm_dev *pm_register(pm_dev_t type,
 void pm_unregister(struct pm_dev *dev)
 {
 	if (dev) {
-		down(&pm_devs_lock);
+		mutex_lock(&pm_devs_lock);
 		list_del(&dev->entry);
-		up(&pm_devs_lock);
+		mutex_unlock(&pm_devs_lock);
 
 		kfree(dev);
 	}
@@ -118,7 +119,7 @@ void pm_unregister_all(pm_callback callback)
 	if (!callback)
 		return;
 
-	down(&pm_devs_lock);
+	mutex_lock(&pm_devs_lock);
 	entry = pm_devs.next;
 	while (entry != &pm_devs) {
 		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -126,7 +127,7 @@ void pm_unregister_all(pm_callback callback)
 		if (dev->callback == callback)
 			__pm_unregister(dev);
 	}
-	up(&pm_devs_lock);
+	mutex_unlock(&pm_devs_lock);
 }
 
 /**
@@ -234,7 +235,7 @@ int pm_send_all(pm_request_t rqst, void *data)
 {
 	struct list_head *entry;
 	
-	down(&pm_devs_lock);
+	mutex_lock(&pm_devs_lock);
 	entry = pm_devs.next;
 	while (entry != &pm_devs) {
 		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -246,13 +247,13 @@ int pm_send_all(pm_request_t rqst, void *data)
 				 */
 				if (rqst == PM_SUSPEND)
 					pm_undo_all(dev);
-				up(&pm_devs_lock);
+				mutex_unlock(&pm_devs_lock);
 				return status;
 			}
 		}
 		entry = entry->next;
 	}
-	up(&pm_devs_lock);
+	mutex_unlock(&pm_devs_lock);
 	return 0;
 }
 
diff --git a/kernel/profile.c b/kernel/profile.c
index f89248e6d70..ad81f799a9b 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -23,6 +23,7 @@
 #include <linux/cpu.h>
 #include <linux/profile.h>
 #include <linux/highmem.h>
+#include <linux/mutex.h>
 #include <asm/sections.h>
 #include <asm/semaphore.h>
 
@@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
 static DEFINE_PER_CPU(int, cpu_profile_flip);
-static DECLARE_MUTEX(profile_flip_mutex);
+static DEFINE_MUTEX(profile_flip_mutex);
 #endif /* CONFIG_SMP */
 
 static int __init profile_setup(char * str)
@@ -243,7 +244,7 @@ static void profile_flip_buffers(void)
 {
 	int i, j, cpu;
 
-	down(&profile_flip_mutex);
+	mutex_lock(&profile_flip_mutex);
 	j = per_cpu(cpu_profile_flip, get_cpu());
 	put_cpu();
 	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -259,14 +260,14 @@ static void profile_flip_buffers(void)
 			hits[i].hits = hits[i].pc = 0;
 		}
 	}
-	up(&profile_flip_mutex);
+	mutex_unlock(&profile_flip_mutex);
 }
 
 static void profile_discard_flip_buffers(void)
 {
 	int i, cpu;
 
-	down(&profile_flip_mutex);
+	mutex_lock(&profile_flip_mutex);
 	i = per_cpu(cpu_profile_flip, get_cpu());
 	put_cpu();
 	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -274,7 +275,7 @@ static void profile_discard_flip_buffers(void)
 		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
 		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
 	}
-	up(&profile_flip_mutex);
+	mutex_unlock(&profile_flip_mutex);
 }
 
 void profile_hit(int type, void *__pc)
-- 
cgit v1.2.3


From 70522e121a521aa09bd0f4e62e1aa68708b798e1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:31 -0800
Subject: [PATCH] sem2mutex: tty

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 4 ++--
 kernel/sys.c  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index d1e8d500a7e..8037405e136 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -345,9 +345,9 @@ void daemonize(const char *name, ...)
 	exit_mm(current);
 
 	set_special_pids(1, 1);
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	current->signal->tty = NULL;
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 
 	/* Block and flush all signals */
 	sigfillset(&blocked);
diff --git a/kernel/sys.c b/kernel/sys.c
index 4941b9b14b9..c0fcad9f826 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1227,7 +1227,7 @@ asmlinkage long sys_setsid(void)
 	struct pid *pid;
 	int err = -EPERM;
 
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	write_lock_irq(&tasklist_lock);
 
 	pid = find_pid(PIDTYPE_PGID, group_leader->pid);
@@ -1241,7 +1241,7 @@ asmlinkage long sys_setsid(void)
 	err = process_group(group_leader);
 out:
 	write_unlock_irq(&tasklist_lock);
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 	return err;
 }
 
-- 
cgit v1.2.3


From 7a7d1cf95408863a657035701606b13644c9f55e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:35 -0800
Subject: [PATCH] sem2mutex: kprobes

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fef1af8a73c..1fb9f753ef6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
-DECLARE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
+DEFINE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
 DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 
@@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 	}
 
 	p->nmissed = 0;
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	old_p = get_kprobe(p->addr);
 	if (old_p) {
 		ret = register_aggr_kprobe(old_p, p);
@@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
   	arch_arm_kprobe(p);
 
 out:
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 
 	if (ret && probed_mod)
 		module_put(probed_mod);
@@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p)
 	struct kprobe *old_p, *list_p;
 	int cleanup_p;
 
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	old_p = get_kprobe(p->addr);
 	if (unlikely(!old_p)) {
-		up(&kprobe_mutex);
+		mutex_unlock(&kprobe_mutex);
 		return;
 	}
 	if (p != old_p) {
@@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p)
 			if (list_p == p)
 			/* kprobe p is a valid probe */
 				goto valid_p;
-		up(&kprobe_mutex);
+		mutex_unlock(&kprobe_mutex);
 		return;
 	}
 valid_p:
@@ -523,7 +523,7 @@ valid_p:
 		cleanup_p = 0;
 	}
 
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 
 	synchronize_sched();
 	if (p->mod_refcounted &&
-- 
cgit v1.2.3


From 6389a385114ae358693f213266de6468ea116c77 Mon Sep 17 00:00:00 2001
From: Ashutosh Naik <ashutosh.naik@gmail.com>
Date: Thu, 23 Mar 2006 03:00:46 -0800
Subject: [PATCH] kernel/module.c Semaphore to Mutex Conversion for
 module_mutex

This patch converts the module_mutex semaphore to a mutex.

Signed-off-by: Ashutosh Naik <ashutosh.naik@gmail.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index de6312da6bb..fb404299082 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -61,7 +61,7 @@
 static DEFINE_SPINLOCK(modlist_lock);
 
 /* List of modules, protected by module_mutex AND modlist_lock */
-static DECLARE_MUTEX(module_mutex);
+static DEFINE_MUTEX(module_mutex);
 static LIST_HEAD(modules);
 
 static DEFINE_MUTEX(notify_mutex);
@@ -602,7 +602,7 @@ static void free_module(struct module *mod);
 static void wait_for_zero_refcount(struct module *mod)
 {
 	/* Since we might sleep for some time, drop the semaphore first */
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 	for (;;) {
 		DEBUGP("Looking at refcount...\n");
 		set_current_state(TASK_UNINTERRUPTIBLE);
@@ -611,7 +611,7 @@ static void wait_for_zero_refcount(struct module *mod)
 		schedule();
 	}
 	current->state = TASK_RUNNING;
-	down(&module_mutex);
+	mutex_lock(&module_mutex);
 }
 
 asmlinkage long
@@ -628,7 +628,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 		return -EFAULT;
 	name[MODULE_NAME_LEN-1] = '\0';
 
-	if (down_interruptible(&module_mutex) != 0)
+	if (mutex_lock_interruptible(&module_mutex) != 0)
 		return -EINTR;
 
 	mod = find_module(name);
@@ -677,14 +677,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 
 	/* Final destruction now noone is using it. */
 	if (mod->exit != NULL) {
-		up(&module_mutex);
+		mutex_unlock(&module_mutex);
 		mod->exit();
-		down(&module_mutex);
+		mutex_lock(&module_mutex);
 	}
 	free_module(mod);
 
  out:
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 	return ret;
 }
 
@@ -1973,13 +1973,13 @@ sys_init_module(void __user *umod,
 		return -EPERM;
 
 	/* Only one module load at a time, please */
-	if (down_interruptible(&module_mutex) != 0)
+	if (mutex_lock_interruptible(&module_mutex) != 0)
 		return -EINTR;
 
 	/* Do all the hard work */
 	mod = load_module(umod, len, uargs);
 	if (IS_ERR(mod)) {
-		up(&module_mutex);
+		mutex_unlock(&module_mutex);
 		return PTR_ERR(mod);
 	}
 
@@ -1988,7 +1988,7 @@ sys_init_module(void __user *umod,
 	stop_machine_run(__link_module, mod, NR_CPUS);
 
 	/* Drop lock so they can recurse */
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 
 	mutex_lock(&notify_mutex);
 	notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
@@ -2007,15 +2007,15 @@ sys_init_module(void __user *umod,
 			       mod->name);
 		else {
 			module_put(mod);
-			down(&module_mutex);
+			mutex_lock(&module_mutex);
 			free_module(mod);
-			up(&module_mutex);
+			mutex_unlock(&module_mutex);
 		}
 		return ret;
 	}
 
 	/* Now it's a first class citizen! */
-	down(&module_mutex);
+	mutex_lock(&module_mutex);
 	mod->state = MODULE_STATE_LIVE;
 	/* Drop initial reference. */
 	module_put(mod);
@@ -2023,7 +2023,7 @@ sys_init_module(void __user *umod,
 	mod->module_init = NULL;
 	mod->init_size = 0;
 	mod->init_text_size = 0;
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 
 	return 0;
 }
@@ -2113,7 +2113,7 @@ struct module *module_get_kallsym(unsigned int symnum,
 {
 	struct module *mod;
 
-	down(&module_mutex);
+	mutex_lock(&module_mutex);
 	list_for_each_entry(mod, &modules, list) {
 		if (symnum < mod->num_symtab) {
 			*value = mod->symtab[symnum].st_value;
@@ -2121,12 +2121,12 @@ struct module *module_get_kallsym(unsigned int symnum,
 			strncpy(namebuf,
 				mod->strtab + mod->symtab[symnum].st_name,
 				127);
-			up(&module_mutex);
+			mutex_unlock(&module_mutex);
 			return mod;
 		}
 		symnum -= mod->num_symtab;
 	}
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 	return NULL;
 }
 
@@ -2169,7 +2169,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	struct list_head *i;
 	loff_t n = 0;
 
-	down(&module_mutex);
+	mutex_lock(&module_mutex);
 	list_for_each(i, &modules) {
 		if (n++ == *pos)
 			break;
@@ -2190,7 +2190,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos)
 
 static void m_stop(struct seq_file *m, void *p)
 {
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 }
 
 static int m_show(struct seq_file *m, void *p)
-- 
cgit v1.2.3


From a26fd335b481e0bd14f4e7d1f5e7bb1138b1731f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 23 Mar 2006 03:00:49 -0800
Subject: [PATCH] sigprocmask: kill unneeded temp var

Cleanup, remove unneeded double copying of current->blocked.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index dfb09ba5c01..75f7341b0c3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2101,10 +2101,11 @@ long do_no_restart_syscall(struct restart_block *param)
 int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
 {
 	int error;
-	sigset_t old_block;
 
 	spin_lock_irq(&current->sighand->siglock);
-	old_block = current->blocked;
+	if (oldset)
+		*oldset = current->blocked;
+
 	error = 0;
 	switch (how) {
 	case SIG_BLOCK:
@@ -2121,8 +2122,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
 	}
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
-	if (oldset)
-		*oldset = old_block;
+
 	return error;
 }
 
-- 
cgit v1.2.3


From 91368d73e4b60d577ad171e5bd315b564265fcdb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:54 -0800
Subject: [PATCH] make bug messages more consistent

Consolidate all kernel bug printouts to begin with the "BUG: " string.
Makes it easier to find them in large bootup logs.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a5bd60453ea..7ffaabd64f8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2873,7 +2873,7 @@ asmlinkage void __sched schedule(void)
 	 */
 	if (likely(!current->exit_state)) {
 		if (unlikely(in_atomic())) {
-			printk(KERN_ERR "scheduling while atomic: "
+			printk(KERN_ERR "BUG: scheduling while atomic: "
 				"%s/0x%08x/%d\n",
 				current->comm, preempt_count(), current->pid);
 			dump_stack();
@@ -6074,7 +6074,7 @@ void __might_sleep(char *file, int line)
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
-		printk(KERN_ERR "Debug: sleeping function called from invalid"
+		printk(KERN_ERR "BUG: sleeping function called from invalid"
 				" context at %s:%d\n", file, line);
 		printk("in_atomic():%d, irqs_disabled():%d\n",
 			in_atomic(), irqs_disabled());
-- 
cgit v1.2.3


From dd287796d608fcdc3fe5e8fdb5bf762a8f1bc32a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 23 Mar 2006 03:00:57 -0800
Subject: [PATCH] pause_on_oops command line option

Attempt to fix the problem wherein people's oops reports scroll off the screen
due to repeated oopsing or to oopses on other CPUs.

If this happens the user can reboot with the `pause_on_oops=<seconds>' option.
It will allow the first oopsing CPU to print an oops record just a single
time.  Second oopsing attempts, or oopses on other CPUs will cause those CPUs
to enter a tight loop until the specified number of seconds have elapsed.

The patch implements the infrastructure generically in the expectation that
architectures other than x86 will find it useful.

Cc: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/panic.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 96 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 126dc43f1c7..acd95adddb9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,10 +20,13 @@
 #include <linux/nmi.h>
 #include <linux/kexec.h>
 
-int panic_timeout;
 int panic_on_oops;
 int tainted;
+static int pause_on_oops;
+static int pause_on_oops_flag;
+static DEFINE_SPINLOCK(pause_on_oops_lock);
 
+int panic_timeout;
 EXPORT_SYMBOL(panic_timeout);
 
 struct notifier_block *panic_notifier_list;
@@ -174,3 +177,95 @@ void add_taint(unsigned flag)
 	tainted |= flag;
 }
 EXPORT_SYMBOL(add_taint);
+
+static int __init pause_on_oops_setup(char *str)
+{
+	pause_on_oops = simple_strtoul(str, NULL, 0);
+	return 1;
+}
+__setup("pause_on_oops=", pause_on_oops_setup);
+
+static void spin_msec(int msecs)
+{
+	int i;
+
+	for (i = 0; i < msecs; i++) {
+		touch_nmi_watchdog();
+		mdelay(1);
+	}
+}
+
+/*
+ * It just happens that oops_enter() and oops_exit() are identically
+ * implemented...
+ */
+static void do_oops_enter_exit(void)
+{
+	unsigned long flags;
+	static int spin_counter;
+
+	if (!pause_on_oops)
+		return;
+
+	spin_lock_irqsave(&pause_on_oops_lock, flags);
+	if (pause_on_oops_flag == 0) {
+		/* This CPU may now print the oops message */
+		pause_on_oops_flag = 1;
+	} else {
+		/* We need to stall this CPU */
+		if (!spin_counter) {
+			/* This CPU gets to do the counting */
+			spin_counter = pause_on_oops;
+			do {
+				spin_unlock(&pause_on_oops_lock);
+				spin_msec(MSEC_PER_SEC);
+				spin_lock(&pause_on_oops_lock);
+			} while (--spin_counter);
+			pause_on_oops_flag = 0;
+		} else {
+			/* This CPU waits for a different one */
+			while (spin_counter) {
+				spin_unlock(&pause_on_oops_lock);
+				spin_msec(1);
+				spin_lock(&pause_on_oops_lock);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&pause_on_oops_lock, flags);
+}
+
+/*
+ * Return true if the calling CPU is allowed to print oops-related info.  This
+ * is a bit racy..
+ */
+int oops_may_print(void)
+{
+	return pause_on_oops_flag == 0;
+}
+
+/*
+ * Called when the architecture enters its oops handler, before it prints
+ * anything.  If this is the first CPU to oops, and it's oopsing the first time
+ * then let it proceed.
+ *
+ * This is all enabled by the pause_on_oops kernel boot option.  We do all this
+ * to ensure that oopses don't scroll off the screen.  It has the side-effect
+ * of preventing later-oopsing CPUs from mucking up the display, too.
+ *
+ * It turns out that the CPU which is allowed to print ends up pausing for the
+ * right duration, whereas all the other CPUs pause for twice as long: once in
+ * oops_enter(), once in oops_exit().
+ */
+void oops_enter(void)
+{
+	do_oops_enter_exit();
+}
+
+/*
+ * Called when the architecture exits its oops handler, after printing
+ * everything.
+ */
+void oops_exit(void)
+{
+	do_oops_enter_exit();
+}
-- 
cgit v1.2.3


From ee25e96fcd78837c9f192aa655ce12a88bfd63d4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 23 Mar 2006 03:00:58 -0800
Subject: [PATCH] BUILD_LOCK_OPS: cleanup preempt_disable() usage

This patch changes the code from:

	preempt_disable();
	for (;;) {
		...
		preempt_disable();
	}
to:
	for (;;) {
		preempt_disable();
		...
	}

which seems more clean to me and saves a couple of bytes for
each function.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/spinlock.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0375fcd5921..d1b810782bc 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock);
 #define BUILD_LOCK_OPS(op, locktype)					\
 void __lockfunc _##op##_lock(locktype##_t *lock)			\
 {									\
-	preempt_disable();						\
 	for (;;) {							\
+		preempt_disable();					\
 		if (likely(_raw_##op##_trylock(lock)))			\
 			break;						\
 		preempt_enable();					\
+									\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
 		while (!op##_can_lock(lock) && (lock)->break_lock)	\
 			cpu_relax();					\
-		preempt_disable();					\
 	}								\
 	(lock)->break_lock = 0;						\
 }									\
@@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock)	\
 {									\
 	unsigned long flags;						\
 									\
-	preempt_disable();						\
 	for (;;) {							\
+		preempt_disable();					\
 		local_irq_save(flags);					\
 		if (likely(_raw_##op##_trylock(lock)))			\
 			break;						\
 		local_irq_restore(flags);				\
-									\
 		preempt_enable();					\
+									\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
 		while (!op##_can_lock(lock) && (lock)->break_lock)	\
 			cpu_relax();					\
-		preempt_disable();					\
 	}								\
 	(lock)->break_lock = 0;						\
 	return flags;							\
-- 
cgit v1.2.3


From 2178426d26661ed6e18a8d6ea0bc05c98d73600d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 23 Mar 2006 03:01:00 -0800
Subject: [PATCH] kernel/rcupdate.c: make two structs static

This patch makes two needlessly global structs static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcupdate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index af8a2a57e17..6df1559b1c0 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -50,13 +50,13 @@
 #include <linux/mutex.h>
 
 /* Definition for rcupdate control block. */
-struct rcu_ctrlblk rcu_ctrlblk = {
+static struct rcu_ctrlblk rcu_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
 	.lock = SPIN_LOCK_UNLOCKED,
 	.cpumask = CPU_MASK_NONE,
 };
-struct rcu_ctrlblk rcu_bh_ctrlblk = {
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
 	.lock = SPIN_LOCK_UNLOCKED,
-- 
cgit v1.2.3


From b86ff981a8252d83d6a7719ae09f3a05307e3592 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 23 Mar 2006 19:56:55 +0100
Subject: [PATCH] relay: migrate from relayfs to a generic relay API

Original patch from Paul Mundt, sysfs parts removed by me since they
were broken.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 kernel/Makefile |   1 +
 kernel/relay.c  | 919 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 920 insertions(+)
 create mode 100644 kernel/relay.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 4ae0fbde815..aebd7a78984 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_RELAY) += relay.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/relay.c b/kernel/relay.c
new file mode 100644
index 00000000000..9358e8eb847
--- /dev/null
+++ b/kernel/relay.c
@@ -0,0 +1,919 @@
+/*
+ * Public API and common code for kernel->userspace relay file support.
+ *
+ * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * Moved to kernel/relay.c by Paul Mundt, 2006.
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/relay.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+
+/*
+ * close() vm_op implementation for relay file mapping.
+ */
+static void relay_file_mmap_close(struct vm_area_struct *vma)
+{
+	struct rchan_buf *buf = vma->vm_private_data;
+	buf->chan->cb->buf_unmapped(buf, vma->vm_file);
+}
+
+/*
+ * nopage() vm_op implementation for relay file mapping.
+ */
+static struct page *relay_buf_nopage(struct vm_area_struct *vma,
+				     unsigned long address,
+				     int *type)
+{
+	struct page *page;
+	struct rchan_buf *buf = vma->vm_private_data;
+	unsigned long offset = address - vma->vm_start;
+
+	if (address > vma->vm_end)
+		return NOPAGE_SIGBUS; /* Disallow mremap */
+	if (!buf)
+		return NOPAGE_OOM;
+
+	page = vmalloc_to_page(buf->start + offset);
+	if (!page)
+		return NOPAGE_OOM;
+	get_page(page);
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
+	return page;
+}
+
+/*
+ * vm_ops for relay file mappings.
+ */
+static struct vm_operations_struct relay_file_mmap_ops = {
+	.nopage = relay_buf_nopage,
+	.close = relay_file_mmap_close,
+};
+
+/**
+ *	relay_mmap_buf: - mmap channel buffer to process address space
+ *	@buf: relay channel buffer
+ *	@vma: vm_area_struct describing memory to be mapped
+ *
+ *	Returns 0 if ok, negative on error
+ *
+ *	Caller should already have grabbed mmap_sem.
+ */
+int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+{
+	unsigned long length = vma->vm_end - vma->vm_start;
+	struct file *filp = vma->vm_file;
+
+	if (!buf)
+		return -EBADF;
+
+	if (length != (unsigned long)buf->chan->alloc_size)
+		return -EINVAL;
+
+	vma->vm_ops = &relay_file_mmap_ops;
+	vma->vm_private_data = buf;
+	buf->chan->cb->buf_mapped(buf, filp);
+
+	return 0;
+}
+
+/**
+ *	relay_alloc_buf - allocate a channel buffer
+ *	@buf: the buffer struct
+ *	@size: total size of the buffer
+ *
+ *	Returns a pointer to the resulting buffer, NULL if unsuccessful
+ */
+static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
+{
+	void *mem;
+	unsigned int i, j, n_pages;
+
+	size = PAGE_ALIGN(size);
+	n_pages = size >> PAGE_SHIFT;
+
+	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!buf->page_array)
+		return NULL;
+
+	for (i = 0; i < n_pages; i++) {
+		buf->page_array[i] = alloc_page(GFP_KERNEL);
+		if (unlikely(!buf->page_array[i]))
+			goto depopulate;
+	}
+	mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
+	if (!mem)
+		goto depopulate;
+
+	memset(mem, 0, size);
+	buf->page_count = n_pages;
+	return mem;
+
+depopulate:
+	for (j = 0; j < i; j++)
+		__free_page(buf->page_array[j]);
+	kfree(buf->page_array);
+	return NULL;
+}
+
+/**
+ *	relay_create_buf - allocate and initialize a channel buffer
+ *	@alloc_size: size of the buffer to allocate
+ *	@n_subbufs: number of sub-buffers in the channel
+ *
+ *	Returns channel buffer if successful, NULL otherwise
+ */
+struct rchan_buf *relay_create_buf(struct rchan *chan)
+{
+	struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
+	if (!buf->padding)
+		goto free_buf;
+
+	buf->start = relay_alloc_buf(buf, chan->alloc_size);
+	if (!buf->start)
+		goto free_buf;
+
+	buf->chan = chan;
+	kref_get(&buf->chan->kref);
+	return buf;
+
+free_buf:
+	kfree(buf->padding);
+	kfree(buf);
+	return NULL;
+}
+
+/**
+ *	relay_destroy_channel - free the channel struct
+ *
+ *	Should only be called from kref_put().
+ */
+void relay_destroy_channel(struct kref *kref)
+{
+	struct rchan *chan = container_of(kref, struct rchan, kref);
+	kfree(chan);
+}
+
+/**
+ *	relay_destroy_buf - destroy an rchan_buf struct and associated buffer
+ *	@buf: the buffer struct
+ */
+void relay_destroy_buf(struct rchan_buf *buf)
+{
+	struct rchan *chan = buf->chan;
+	unsigned int i;
+
+	if (likely(buf->start)) {
+		vunmap(buf->start);
+		for (i = 0; i < buf->page_count; i++)
+			__free_page(buf->page_array[i]);
+		kfree(buf->page_array);
+	}
+	kfree(buf->padding);
+	kfree(buf);
+	kref_put(&chan->kref, relay_destroy_channel);
+}
+
+/**
+ *	relay_remove_buf - remove a channel buffer
+ *
+ *	Removes the file from the fileystem, which also frees the
+ *	rchan_buf_struct and the channel buffer.  Should only be called from
+ *	kref_put().
+ */
+void relay_remove_buf(struct kref *kref)
+{
+	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
+	buf->chan->cb->remove_buf_file(buf->dentry);
+	relay_destroy_buf(buf);
+}
+
+/**
+ *	relay_buf_empty - boolean, is the channel buffer empty?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is empty, 0 otherwise.
+ */
+int relay_buf_empty(struct rchan_buf *buf)
+{
+	return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
+}
+EXPORT_SYMBOL_GPL(relay_buf_empty);
+
+/**
+ *	relay_buf_full - boolean, is the channel buffer full?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is full, 0 otherwise.
+ */
+int relay_buf_full(struct rchan_buf *buf)
+{
+	size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
+	return (ready >= buf->chan->n_subbufs) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(relay_buf_full);
+
+/*
+ * High-level relay kernel API and associated functions.
+ */
+
+/*
+ * rchan_callback implementations defining default channel behavior.  Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+
+/*
+ * subbuf_start() default callback.  Does nothing.
+ */
+static int subbuf_start_default_callback (struct rchan_buf *buf,
+					  void *subbuf,
+					  void *prev_subbuf,
+					  size_t prev_padding)
+{
+	if (relay_buf_full(buf))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * buf_mapped() default callback.  Does nothing.
+ */
+static void buf_mapped_default_callback(struct rchan_buf *buf,
+					struct file *filp)
+{
+}
+
+/*
+ * buf_unmapped() default callback.  Does nothing.
+ */
+static void buf_unmapped_default_callback(struct rchan_buf *buf,
+					  struct file *filp)
+{
+}
+
+/*
+ * create_buf_file_create() default callback.  Does nothing.
+ */
+static struct dentry *create_buf_file_default_callback(const char *filename,
+						       struct dentry *parent,
+						       int mode,
+						       struct rchan_buf *buf,
+						       int *is_global)
+{
+	return NULL;
+}
+
+/*
+ * remove_buf_file() default callback.  Does nothing.
+ */
+static int remove_buf_file_default_callback(struct dentry *dentry)
+{
+	return -EINVAL;
+}
+
+/* relay channel default callbacks */
+static struct rchan_callbacks default_channel_callbacks = {
+	.subbuf_start = subbuf_start_default_callback,
+	.buf_mapped = buf_mapped_default_callback,
+	.buf_unmapped = buf_unmapped_default_callback,
+	.create_buf_file = create_buf_file_default_callback,
+	.remove_buf_file = remove_buf_file_default_callback,
+};
+
+/**
+ *	wakeup_readers - wake up readers waiting on a channel
+ *	@private: the channel buffer
+ *
+ *	This is the work function used to defer reader waking.  The
+ *	reason waking is deferred is that calling directly from write
+ *	causes problems if you're writing from say the scheduler.
+ */
+static void wakeup_readers(void *private)
+{
+	struct rchan_buf *buf = private;
+	wake_up_interruptible(&buf->read_wait);
+}
+
+/**
+ *	__relay_reset - reset a channel buffer
+ *	@buf: the channel buffer
+ *	@init: 1 if this is a first-time initialization
+ *
+ *	See relay_reset for description of effect.
+ */
+static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
+{
+	size_t i;
+
+	if (init) {
+		init_waitqueue_head(&buf->read_wait);
+		kref_init(&buf->kref);
+		INIT_WORK(&buf->wake_readers, NULL, NULL);
+	} else {
+		cancel_delayed_work(&buf->wake_readers);
+		flush_scheduled_work();
+	}
+
+	buf->subbufs_produced = 0;
+	buf->subbufs_consumed = 0;
+	buf->bytes_consumed = 0;
+	buf->finalized = 0;
+	buf->data = buf->start;
+	buf->offset = 0;
+
+	for (i = 0; i < buf->chan->n_subbufs; i++)
+		buf->padding[i] = 0;
+
+	buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+}
+
+/**
+ *	relay_reset - reset the channel
+ *	@chan: the channel
+ *
+ *	This has the effect of erasing all data from all channel buffers
+ *	and restarting the channel in its initial state.  The buffers
+ *	are not freed, so any mappings are still in effect.
+ *
+ *	NOTE: Care should be taken that the channel isn't actually
+ *	being used by anything when this call is made.
+ */
+void relay_reset(struct rchan *chan)
+{
+	unsigned int i;
+	struct rchan_buf *prev = NULL;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
+		__relay_reset(chan->buf[i], 0);
+		prev = chan->buf[i];
+	}
+}
+EXPORT_SYMBOL_GPL(relay_reset);
+
+/**
+ *	relay_open_buf - create a new relay channel buffer
+ *
+ *	Internal - used by relay_open().
+ */
+static struct rchan_buf *relay_open_buf(struct rchan *chan,
+					const char *filename,
+					struct dentry *parent,
+					int *is_global)
+{
+	struct rchan_buf *buf;
+	struct dentry *dentry;
+
+	if (*is_global)
+		return chan->buf[0];
+
+	buf = relay_create_buf(chan);
+	if (!buf)
+		return NULL;
+
+	/* Create file in fs */
+	dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
+					   buf, is_global);
+	if (!dentry) {
+		relay_destroy_buf(buf);
+		return NULL;
+	}
+
+	buf->dentry = dentry;
+	__relay_reset(buf, 1);
+
+	return buf;
+}
+
+/**
+ *	relay_close_buf - close a channel buffer
+ *	@buf: channel buffer
+ *
+ *	Marks the buffer finalized and restores the default callbacks.
+ *	The channel buffer and channel buffer data structure are then freed
+ *	automatically when the last reference is given up.
+ */
+static inline void relay_close_buf(struct rchan_buf *buf)
+{
+	buf->finalized = 1;
+	cancel_delayed_work(&buf->wake_readers);
+	flush_scheduled_work();
+	kref_put(&buf->kref, relay_remove_buf);
+}
+
+static inline void setup_callbacks(struct rchan *chan,
+				   struct rchan_callbacks *cb)
+{
+	if (!cb) {
+		chan->cb = &default_channel_callbacks;
+		return;
+	}
+
+	if (!cb->subbuf_start)
+		cb->subbuf_start = subbuf_start_default_callback;
+	if (!cb->buf_mapped)
+		cb->buf_mapped = buf_mapped_default_callback;
+	if (!cb->buf_unmapped)
+		cb->buf_unmapped = buf_unmapped_default_callback;
+	if (!cb->create_buf_file)
+		cb->create_buf_file = create_buf_file_default_callback;
+	if (!cb->remove_buf_file)
+		cb->remove_buf_file = remove_buf_file_default_callback;
+	chan->cb = cb;
+}
+
+/**
+ *	relay_open - create a new relay channel
+ *	@base_filename: base name of files to create
+ *	@parent: dentry of parent directory, NULL for root directory
+ *	@subbuf_size: size of sub-buffers
+ *	@n_subbufs: number of sub-buffers
+ *	@cb: client callback functions
+ *
+ *	Returns channel pointer if successful, NULL otherwise.
+ *
+ *	Creates a channel buffer for each cpu using the sizes and
+ *	attributes specified.  The created channel buffer files
+ *	will be named base_filename0...base_filenameN-1.  File
+ *	permissions will be S_IRUSR.
+ */
+struct rchan *relay_open(const char *base_filename,
+			 struct dentry *parent,
+			 size_t subbuf_size,
+			 size_t n_subbufs,
+			 struct rchan_callbacks *cb)
+{
+	unsigned int i;
+	struct rchan *chan;
+	char *tmpname;
+	int is_global = 0;
+
+	if (!base_filename)
+		return NULL;
+
+	if (!(subbuf_size && n_subbufs))
+		return NULL;
+
+	chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
+	if (!chan)
+		return NULL;
+
+	chan->version = RELAYFS_CHANNEL_VERSION;
+	chan->n_subbufs = n_subbufs;
+	chan->subbuf_size = subbuf_size;
+	chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
+	setup_callbacks(chan, cb);
+	kref_init(&chan->kref);
+
+	tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!tmpname)
+		goto free_chan;
+
+	for_each_online_cpu(i) {
+		sprintf(tmpname, "%s%d", base_filename, i);
+		chan->buf[i] = relay_open_buf(chan, tmpname, parent,
+					      &is_global);
+		if (!chan->buf[i])
+			goto free_bufs;
+
+		chan->buf[i]->cpu = i;
+	}
+
+	kfree(tmpname);
+	return chan;
+
+free_bufs:
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i])
+			break;
+		relay_close_buf(chan->buf[i]);
+		if (is_global)
+			break;
+	}
+	kfree(tmpname);
+
+free_chan:
+	kref_put(&chan->kref, relay_destroy_channel);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(relay_open);
+
+/**
+ *	relay_switch_subbuf - switch to a new sub-buffer
+ *	@buf: channel buffer
+ *	@length: size of current event
+ *
+ *	Returns either the length passed in or 0 if full.
+ *
+ *	Performs sub-buffer-switch tasks such as invoking callbacks,
+ *	updating padding counts, waking up readers, etc.
+ */
+size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
+{
+	void *old, *new;
+	size_t old_subbuf, new_subbuf;
+
+	if (unlikely(length > buf->chan->subbuf_size))
+		goto toobig;
+
+	if (buf->offset != buf->chan->subbuf_size + 1) {
+		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+		buf->padding[old_subbuf] = buf->prev_padding;
+		buf->subbufs_produced++;
+		if (waitqueue_active(&buf->read_wait)) {
+			PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
+			schedule_delayed_work(&buf->wake_readers, 1);
+		}
+	}
+
+	old = buf->data;
+	new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+	new = buf->start + new_subbuf * buf->chan->subbuf_size;
+	buf->offset = 0;
+	if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+		buf->offset = buf->chan->subbuf_size + 1;
+		return 0;
+	}
+	buf->data = new;
+	buf->padding[new_subbuf] = 0;
+
+	if (unlikely(length + buf->offset > buf->chan->subbuf_size))
+		goto toobig;
+
+	return length;
+
+toobig:
+	buf->chan->last_toobig = length;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(relay_switch_subbuf);
+
+/**
+ *	relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
+ *	@chan: the channel
+ *	@cpu: the cpu associated with the channel buffer to update
+ *	@subbufs_consumed: number of sub-buffers to add to current buf's count
+ *
+ *	Adds to the channel buffer's consumed sub-buffer count.
+ *	subbufs_consumed should be the number of sub-buffers newly consumed,
+ *	not the total consumed.
+ *
+ *	NOTE: kernel clients don't need to call this function if the channel
+ *	mode is 'overwrite'.
+ */
+void relay_subbufs_consumed(struct rchan *chan,
+			    unsigned int cpu,
+			    size_t subbufs_consumed)
+{
+	struct rchan_buf *buf;
+
+	if (!chan)
+		return;
+
+	if (cpu >= NR_CPUS || !chan->buf[cpu])
+		return;
+
+	buf = chan->buf[cpu];
+	buf->subbufs_consumed += subbufs_consumed;
+	if (buf->subbufs_consumed > buf->subbufs_produced)
+		buf->subbufs_consumed = buf->subbufs_produced;
+}
+EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
+
+/**
+ *	relay_close - close the channel
+ *	@chan: the channel
+ *
+ *	Closes all channel buffers and frees the channel.
+ */
+void relay_close(struct rchan *chan)
+{
+	unsigned int i;
+	struct rchan_buf *prev = NULL;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
+		relay_close_buf(chan->buf[i]);
+		prev = chan->buf[i];
+	}
+
+	if (chan->last_toobig)
+		printk(KERN_WARNING "relay: one or more items not logged "
+		       "[item size (%Zd) > sub-buffer size (%Zd)]\n",
+		       chan->last_toobig, chan->subbuf_size);
+
+	kref_put(&chan->kref, relay_destroy_channel);
+}
+EXPORT_SYMBOL_GPL(relay_close);
+
+/**
+ *	relay_flush - close the channel
+ *	@chan: the channel
+ *
+ *	Flushes all channel buffers i.e. forces buffer switch.
+ */
+void relay_flush(struct rchan *chan)
+{
+	unsigned int i;
+	struct rchan_buf *prev = NULL;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
+		relay_switch_subbuf(chan->buf[i], 0);
+		prev = chan->buf[i];
+	}
+}
+EXPORT_SYMBOL_GPL(relay_flush);
+
+/**
+ *	relay_file_open - open file op for relay files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Increments the channel buffer refcount.
+ */
+static int relay_file_open(struct inode *inode, struct file *filp)
+{
+	struct rchan_buf *buf = inode->u.generic_ip;
+	kref_get(&buf->kref);
+	filp->private_data = buf;
+
+	return 0;
+}
+
+/**
+ *	relay_file_mmap - mmap file op for relay files
+ *	@filp: the file
+ *	@vma: the vma describing what to map
+ *
+ *	Calls upon relay_mmap_buf to map the file into user space.
+ */
+static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct rchan_buf *buf = filp->private_data;
+	return relay_mmap_buf(buf, vma);
+}
+
+/**
+ *	relay_file_poll - poll file op for relay files
+ *	@filp: the file
+ *	@wait: poll table
+ *
+ *	Poll implemention.
+ */
+static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
+{
+	unsigned int mask = 0;
+	struct rchan_buf *buf = filp->private_data;
+
+	if (buf->finalized)
+		return POLLERR;
+
+	if (filp->f_mode & FMODE_READ) {
+		poll_wait(filp, &buf->read_wait, wait);
+		if (!relay_buf_empty(buf))
+			mask |= POLLIN | POLLRDNORM;
+	}
+
+	return mask;
+}
+
+/**
+ *	relay_file_release - release file op for relay files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Decrements the channel refcount, as the filesystem is
+ *	no longer using it.
+ */
+static int relay_file_release(struct inode *inode, struct file *filp)
+{
+	struct rchan_buf *buf = filp->private_data;
+	kref_put(&buf->kref, relay_remove_buf);
+
+	return 0;
+}
+
+/**
+ *	relay_file_read_consume - update the consumed count for the buffer
+ */
+static void relay_file_read_consume(struct rchan_buf *buf,
+				    size_t read_pos,
+				    size_t bytes_consumed)
+{
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+	size_t read_subbuf;
+
+	if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
+		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+		buf->bytes_consumed = 0;
+	}
+
+	buf->bytes_consumed += bytes_consumed;
+	read_subbuf = read_pos / buf->chan->subbuf_size;
+	if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
+		if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
+		    (buf->offset == subbuf_size))
+			return;
+		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+		buf->bytes_consumed = 0;
+	}
+}
+
+/**
+ *	relay_file_read_avail - boolean, are there unconsumed bytes available?
+ */
+static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
+{
+	size_t bytes_produced, bytes_consumed, write_offset;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+	size_t produced = buf->subbufs_produced % n_subbufs;
+	size_t consumed = buf->subbufs_consumed % n_subbufs;
+
+	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+
+	if (consumed > produced) {
+		if ((produced > n_subbufs) &&
+		    (produced + n_subbufs - consumed <= n_subbufs))
+			produced += n_subbufs;
+	} else if (consumed == produced) {
+		if (buf->offset > subbuf_size) {
+			produced += n_subbufs;
+			if (buf->subbufs_produced == buf->subbufs_consumed)
+				consumed += n_subbufs;
+		}
+	}
+
+	if (buf->offset > subbuf_size)
+		bytes_produced = (produced - 1) * subbuf_size + write_offset;
+	else
+		bytes_produced = produced * subbuf_size + write_offset;
+	bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
+
+	if (bytes_produced == bytes_consumed)
+		return 0;
+
+	relay_file_read_consume(buf, read_pos, 0);
+
+	return 1;
+}
+
+/**
+ *	relay_file_read_subbuf_avail - return bytes available in sub-buffer
+ */
+static size_t relay_file_read_subbuf_avail(size_t read_pos,
+					   struct rchan_buf *buf)
+{
+	size_t padding, avail = 0;
+	size_t read_subbuf, read_offset, write_subbuf, write_offset;
+	size_t subbuf_size = buf->chan->subbuf_size;
+
+	write_subbuf = (buf->data - buf->start) / subbuf_size;
+	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+	read_subbuf = read_pos / subbuf_size;
+	read_offset = read_pos % subbuf_size;
+	padding = buf->padding[read_subbuf];
+
+	if (read_subbuf == write_subbuf) {
+		if (read_offset + padding < write_offset)
+			avail = write_offset - (read_offset + padding);
+	} else
+		avail = (subbuf_size - padding) - read_offset;
+
+	return avail;
+}
+
+/**
+ *	relay_file_read_start_pos - find the first available byte to read
+ *
+ *	If the read_pos is in the middle of padding, return the
+ *	position of the first actually available byte, otherwise
+ *	return the original value.
+ */
+static size_t relay_file_read_start_pos(size_t read_pos,
+					struct rchan_buf *buf)
+{
+	size_t read_subbuf, padding, padding_start, padding_end;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+
+	read_subbuf = read_pos / subbuf_size;
+	padding = buf->padding[read_subbuf];
+	padding_start = (read_subbuf + 1) * subbuf_size - padding;
+	padding_end = (read_subbuf + 1) * subbuf_size;
+	if (read_pos >= padding_start && read_pos < padding_end) {
+		read_subbuf = (read_subbuf + 1) % n_subbufs;
+		read_pos = read_subbuf * subbuf_size;
+	}
+
+	return read_pos;
+}
+
+/**
+ *	relay_file_read_end_pos - return the new read position
+ */
+static size_t relay_file_read_end_pos(struct rchan_buf *buf,
+				      size_t read_pos,
+				      size_t count)
+{
+	size_t read_subbuf, padding, end_pos;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+
+	read_subbuf = read_pos / subbuf_size;
+	padding = buf->padding[read_subbuf];
+	if (read_pos % subbuf_size + count + padding == subbuf_size)
+		end_pos = (read_subbuf + 1) * subbuf_size;
+	else
+		end_pos = read_pos + count;
+	if (end_pos >= subbuf_size * n_subbufs)
+		end_pos = 0;
+
+	return end_pos;
+}
+
+/**
+ *	relay_file_read - read file op for relay files
+ *	@filp: the file
+ *	@buffer: the userspace buffer
+ *	@count: number of bytes to read
+ *	@ppos: position to read from
+ *
+ *	Reads count bytes or the number of bytes available in the
+ *	current sub-buffer being read, whichever is smaller.
+ */
+static ssize_t relay_file_read(struct file *filp,
+			       char __user *buffer,
+			       size_t count,
+			       loff_t *ppos)
+{
+	struct rchan_buf *buf = filp->private_data;
+	struct inode *inode = filp->f_dentry->d_inode;
+	size_t read_start, avail;
+	ssize_t ret = 0;
+	void *from;
+
+	mutex_lock(&inode->i_mutex);
+	if(!relay_file_read_avail(buf, *ppos))
+		goto out;
+
+	read_start = relay_file_read_start_pos(*ppos, buf);
+	avail = relay_file_read_subbuf_avail(read_start, buf);
+	if (!avail)
+		goto out;
+
+	from = buf->start + read_start;
+	ret = count = min(count, avail);
+	if (copy_to_user(buffer, from, count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	relay_file_read_consume(buf, read_start, count);
+	*ppos = relay_file_read_end_pos(buf, read_start, count);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+struct file_operations relay_file_operations = {
+	.open		= relay_file_open,
+	.poll		= relay_file_poll,
+	.mmap		= relay_file_mmap,
+	.read		= relay_file_read,
+	.llseek		= no_llseek,
+	.release	= relay_file_release,
+};
+EXPORT_SYMBOL_GPL(relay_file_operations);
-- 
cgit v1.2.3


From 221415d76231d9012871e6e6abcbad906c46626a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 23 Mar 2006 19:57:55 +0100
Subject: [PATCH] relay: add sendfile() support

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 kernel/relay.c | 144 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 115 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 9358e8eb847..fefe2b2a727 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -95,15 +95,16 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
  *	@buf: the buffer struct
  *	@size: total size of the buffer
  *
- *	Returns a pointer to the resulting buffer, NULL if unsuccessful
+ *	Returns a pointer to the resulting buffer, NULL if unsuccessful. The
+ *	passed in size will get page aligned, if it isn't already.
  */
-static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
+static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
 {
 	void *mem;
 	unsigned int i, j, n_pages;
 
-	size = PAGE_ALIGN(size);
-	n_pages = size >> PAGE_SHIFT;
+	*size = PAGE_ALIGN(*size);
+	n_pages = *size >> PAGE_SHIFT;
 
 	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
 	if (!buf->page_array)
@@ -118,7 +119,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
 	if (!mem)
 		goto depopulate;
 
-	memset(mem, 0, size);
+	memset(mem, 0, *size);
 	buf->page_count = n_pages;
 	return mem;
 
@@ -146,7 +147,7 @@ struct rchan_buf *relay_create_buf(struct rchan *chan)
 	if (!buf->padding)
 		goto free_buf;
 
-	buf->start = relay_alloc_buf(buf, chan->alloc_size);
+	buf->start = relay_alloc_buf(buf, &chan->alloc_size);
 	if (!buf->start)
 		goto free_buf;
 
@@ -543,6 +544,9 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
 		buf->padding[old_subbuf] = buf->prev_padding;
 		buf->subbufs_produced++;
+		buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
+			buf->padding[old_subbuf];
+		smp_mb();
 		if (waitqueue_active(&buf->read_wait)) {
 			PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
 			schedule_delayed_work(&buf->wake_readers, 1);
@@ -757,37 +761,33 @@ static void relay_file_read_consume(struct rchan_buf *buf,
  */
 static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
 {
-	size_t bytes_produced, bytes_consumed, write_offset;
 	size_t subbuf_size = buf->chan->subbuf_size;
 	size_t n_subbufs = buf->chan->n_subbufs;
-	size_t produced = buf->subbufs_produced % n_subbufs;
-	size_t consumed = buf->subbufs_consumed % n_subbufs;
+	size_t produced = buf->subbufs_produced;
+	size_t consumed = buf->subbufs_consumed;
 
-	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+	relay_file_read_consume(buf, read_pos, 0);
 
-	if (consumed > produced) {
-		if ((produced > n_subbufs) &&
-		    (produced + n_subbufs - consumed <= n_subbufs))
-			produced += n_subbufs;
-	} else if (consumed == produced) {
-		if (buf->offset > subbuf_size) {
-			produced += n_subbufs;
-			if (buf->subbufs_produced == buf->subbufs_consumed)
-				consumed += n_subbufs;
-		}
+	if (unlikely(buf->offset > subbuf_size)) {
+		if (produced == consumed)
+			return 0;
+		return 1;
 	}
 
-	if (buf->offset > subbuf_size)
-		bytes_produced = (produced - 1) * subbuf_size + write_offset;
-	else
-		bytes_produced = produced * subbuf_size + write_offset;
-	bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
-
-	if (bytes_produced == bytes_consumed)
+	if (unlikely(produced - consumed >= n_subbufs)) {
+		consumed = (produced / n_subbufs) * n_subbufs;
+		buf->subbufs_consumed = consumed;
+	}
+	
+	produced = (produced % n_subbufs) * subbuf_size + buf->offset;
+	consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
+
+	if (consumed > produced)
+		produced += n_subbufs * subbuf_size;
+	
+	if (consumed == produced)
 		return 0;
 
-	relay_file_read_consume(buf, read_pos, 0);
-
 	return 1;
 }
 
@@ -908,6 +908,91 @@ out:
 	return ret;
 }
 
+static ssize_t relay_file_sendsubbuf(struct file *filp, loff_t *ppos,
+				     size_t count, read_actor_t actor,
+				     void *target)
+{
+	struct rchan_buf *buf = filp->private_data;
+	read_descriptor_t desc;
+	size_t read_start, avail;
+	unsigned long pidx, poff;
+	unsigned int subbuf_pages;
+	ssize_t ret = 0;
+
+	if (!relay_file_read_avail(buf, *ppos))
+		return 0;
+
+	read_start = relay_file_read_start_pos(*ppos, buf);
+	avail = relay_file_read_subbuf_avail(read_start, buf);
+	if (!avail)
+		return 0;
+
+	count = min(count, avail);
+
+	desc.written = 0;
+	desc.count = count;
+	desc.arg.data = target;
+	desc.error = 0;
+
+	subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
+	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
+	poff = read_start & ~PAGE_MASK;
+	while (count) {
+		struct page *p = buf->page_array[pidx];
+		unsigned int len;
+
+		len = PAGE_SIZE - poff;
+		if (len > count)
+			len = count;
+
+		len = actor(&desc, p, poff, len);
+
+		if (desc.error) {
+			if (!ret)
+				ret = desc.error;
+			break;
+		}
+
+		count -= len;
+		ret += len;
+		poff = 0;
+		pidx = (pidx + 1) % subbuf_pages;
+	}
+
+	if (ret > 0) {
+		relay_file_read_consume(buf, read_start, ret);
+		*ppos = relay_file_read_end_pos(buf, read_start, ret);
+	}
+
+	return ret;
+}
+
+static ssize_t relay_file_sendfile(struct file *filp, loff_t *ppos,
+				   size_t count, read_actor_t actor,
+				   void *target)
+{
+	ssize_t sent = 0, ret = 0;
+
+	if (!count)
+		return 0;
+
+	mutex_lock(&filp->f_dentry->d_inode->i_mutex);
+
+	do {
+		ret = relay_file_sendsubbuf(filp, ppos, count, actor, target);
+		if (ret < 0) {
+			if (!sent)
+				sent = ret;
+			break;
+		}
+		count -= ret;
+		sent += ret;
+	} while (count && ret);
+
+	mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
+	return sent;
+}
+
 struct file_operations relay_file_operations = {
 	.open		= relay_file_open,
 	.poll		= relay_file_poll,
@@ -915,5 +1000,6 @@ struct file_operations relay_file_operations = {
 	.read		= relay_file_read,
 	.llseek		= no_llseek,
 	.release	= relay_file_release,
+	.sendfile       = relay_file_sendfile,
 };
 EXPORT_SYMBOL_GPL(relay_file_operations);
-- 
cgit v1.2.3


From 6dac40a7ce2483a47b54af07afebeb84131c7228 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Thu, 23 Mar 2006 19:58:45 +0100
Subject: [PATCH] relay: consolidate sendfile() and read() code

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 kernel/relay.c | 175 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 91 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index fefe2b2a727..33345e73485 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -866,131 +866,138 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
 }
 
 /**
- *	relay_file_read - read file op for relay files
- *	@filp: the file
- *	@buffer: the userspace buffer
- *	@count: number of bytes to read
- *	@ppos: position to read from
- *
- *	Reads count bytes or the number of bytes available in the
- *	current sub-buffer being read, whichever is smaller.
+ *	subbuf_read_actor - read up to one subbuf's worth of data
  */
-static ssize_t relay_file_read(struct file *filp,
-			       char __user *buffer,
-			       size_t count,
-			       loff_t *ppos)
+static int subbuf_read_actor(size_t read_start,
+			     struct rchan_buf *buf,
+			     size_t avail,
+			     read_descriptor_t *desc,
+			     read_actor_t actor)
 {
-	struct rchan_buf *buf = filp->private_data;
-	struct inode *inode = filp->f_dentry->d_inode;
-	size_t read_start, avail;
-	ssize_t ret = 0;
 	void *from;
-
-	mutex_lock(&inode->i_mutex);
-	if(!relay_file_read_avail(buf, *ppos))
-		goto out;
-
-	read_start = relay_file_read_start_pos(*ppos, buf);
-	avail = relay_file_read_subbuf_avail(read_start, buf);
-	if (!avail)
-		goto out;
+	int ret = 0;
 
 	from = buf->start + read_start;
-	ret = count = min(count, avail);
-	if (copy_to_user(buffer, from, count)) {
-		ret = -EFAULT;
-		goto out;
+	ret = avail;
+	if (copy_to_user(desc->arg.data, from, avail)) {
+		desc->error = -EFAULT;
+		ret = 0;
 	}
-	relay_file_read_consume(buf, read_start, count);
-	*ppos = relay_file_read_end_pos(buf, read_start, count);
-out:
-	mutex_unlock(&inode->i_mutex);
+	desc->arg.data += ret;
+	desc->written += ret;
+	desc->count -= ret;
+
 	return ret;
 }
 
-static ssize_t relay_file_sendsubbuf(struct file *filp, loff_t *ppos,
-				     size_t count, read_actor_t actor,
-				     void *target)
+/**
+ *	subbuf_send_actor - send up to one subbuf's worth of data
+ */
+static int subbuf_send_actor(size_t read_start,
+			     struct rchan_buf *buf,
+			     size_t avail,
+			     read_descriptor_t *desc,
+			     read_actor_t actor)
 {
-	struct rchan_buf *buf = filp->private_data;
-	read_descriptor_t desc;
-	size_t read_start, avail;
 	unsigned long pidx, poff;
 	unsigned int subbuf_pages;
-	ssize_t ret = 0;
-
-	if (!relay_file_read_avail(buf, *ppos))
-		return 0;
-
-	read_start = relay_file_read_start_pos(*ppos, buf);
-	avail = relay_file_read_subbuf_avail(read_start, buf);
-	if (!avail)
-		return 0;
-
-	count = min(count, avail);
-
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
+	int ret = 0;
 
 	subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
 	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
 	poff = read_start & ~PAGE_MASK;
-	while (count) {
+	while (avail) {
 		struct page *p = buf->page_array[pidx];
 		unsigned int len;
 
 		len = PAGE_SIZE - poff;
-		if (len > count)
-			len = count;
+		if (len > avail)
+			len = avail;
 
-		len = actor(&desc, p, poff, len);
-
-		if (desc.error) {
-			if (!ret)
-				ret = desc.error;
+		len = actor(desc, p, poff, len);
+		if (desc->error)
 			break;
-		}
 
-		count -= len;
+		avail -= len;
 		ret += len;
 		poff = 0;
 		pidx = (pidx + 1) % subbuf_pages;
 	}
 
-	if (ret > 0) {
-		relay_file_read_consume(buf, read_start, ret);
-		*ppos = relay_file_read_end_pos(buf, read_start, ret);
-	}
-
 	return ret;
 }
 
-static ssize_t relay_file_sendfile(struct file *filp, loff_t *ppos,
-				   size_t count, read_actor_t actor,
-				   void *target)
+typedef int (*subbuf_actor_t) (size_t read_start,
+			       struct rchan_buf *buf,
+			       size_t avail,
+			       read_descriptor_t *desc,
+			       read_actor_t actor);
+
+/**
+ *	relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
+ */
+static inline ssize_t relay_file_read_subbufs(struct file *filp,
+					      loff_t *ppos,
+					      size_t count,
+					      subbuf_actor_t subbuf_actor,
+					      read_actor_t actor,
+					      void *target)
 {
-	ssize_t sent = 0, ret = 0;
+	struct rchan_buf *buf = filp->private_data;
+	size_t read_start, avail;
+	read_descriptor_t desc;
+	int ret;
 
 	if (!count)
 		return 0;
 
-	mutex_lock(&filp->f_dentry->d_inode->i_mutex);
+	desc.written = 0;
+	desc.count = count;
+	desc.arg.data = target;
+	desc.error = 0;
 
+	mutex_lock(&filp->f_dentry->d_inode->i_mutex);
 	do {
-		ret = relay_file_sendsubbuf(filp, ppos, count, actor, target);
-		if (ret < 0) {
-			if (!sent)
-				sent = ret;
+		if (!relay_file_read_avail(buf, *ppos))
+			break;
+
+		read_start = relay_file_read_start_pos(*ppos, buf);
+		avail = relay_file_read_subbuf_avail(read_start, buf);
+		if (!avail)
 			break;
-		}
-		count -= ret;
-		sent += ret;
-	} while (count && ret);
 
+		avail = min(desc.count, avail);
+		ret = subbuf_actor(read_start, buf, avail, &desc, actor);
+		if (desc.error < 0)
+			break;
+
+		if (ret) {
+			relay_file_read_consume(buf, read_start, ret);
+			*ppos = relay_file_read_end_pos(buf, read_start, ret);
+		}
+	} while (desc.count && ret);
 	mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
-	return sent;
+
+	return desc.written;
+}
+
+static ssize_t relay_file_read(struct file *filp,
+			       char __user *buffer,
+			       size_t count,
+			       loff_t *ppos)
+{
+	return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor,
+				       NULL, buffer);
+}
+
+static ssize_t relay_file_sendfile(struct file *filp,
+				   loff_t *ppos,
+				   size_t count,
+				   read_actor_t actor,
+				   void *target)
+{
+	return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor,
+				       actor, target);
 }
 
 struct file_operations relay_file_operations = {
-- 
cgit v1.2.3


From 2056a782f8e7e65fd4bfd027506b4ce1c5e9ccd4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 23 Mar 2006 20:00:26 +0100
Subject: [PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index c79ae0b19a4..c21bae8c93b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -181,6 +181,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
+	tsk->btrace_seq = 0;
 	return tsk;
 }
 
-- 
cgit v1.2.3