aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.hz20
-rw-r--r--kernel/acct.c29
-rw-r--r--kernel/audit.c1
-rw-r--r--kernel/auditfilter.c3
-rw-r--r--kernel/auditsc.c13
-rw-r--r--kernel/compat.c33
-rw-r--r--kernel/configs.c2
-rw-r--r--kernel/cpu.c8
-rw-r--r--kernel/cpuset.c126
-rw-r--r--kernel/delayacct.c19
-rw-r--r--kernel/dma.c2
-rw-r--r--kernel/exit.c83
-rw-r--r--kernel/fork.c132
-rw-r--r--kernel/futex.c62
-rw-r--r--kernel/irq/chip.c2
-rw-r--r--kernel/irq/handle.c6
-rw-r--r--kernel/irq/manage.c9
-rw-r--r--kernel/irq/proc.c3
-rw-r--r--kernel/kallsyms.c33
-rw-r--r--kernel/kexec.c60
-rw-r--r--kernel/kmod.c26
-rw-r--r--kernel/kprobes.c117
-rw-r--r--kernel/kthread.c13
-rw-r--r--kernel/latency.c1
-rw-r--r--kernel/lockdep.c244
-rw-r--r--kernel/lockdep_internals.h2
-rw-r--r--kernel/lockdep_proc.c6
-rw-r--r--kernel/module.c75
-rw-r--r--kernel/mutex-debug.c3
-rw-r--r--kernel/mutex.c9
-rw-r--r--kernel/nsproxy.c38
-rw-r--r--kernel/pid.c77
-rw-r--r--kernel/posix-timers.c2
-rw-r--r--kernel/power/Kconfig11
-rw-r--r--kernel/power/disk.c101
-rw-r--r--kernel/power/main.c14
-rw-r--r--kernel/power/power.h32
-rw-r--r--kernel/power/poweroff.c4
-rw-r--r--kernel/power/process.c143
-rw-r--r--kernel/power/snapshot.c860
-rw-r--r--kernel/power/swap.c347
-rw-r--r--kernel/power/swsusp.c98
-rw-r--r--kernel/power/user.c102
-rw-r--r--kernel/printk.c45
-rw-r--r--kernel/profile.c47
-rw-r--r--kernel/rcupdate.c4
-rw-r--r--kernel/rcutorture.c4
-rw-r--r--kernel/relay.c20
-rw-r--r--kernel/resource.c6
-rw-r--r--kernel/rtmutex-tester.c1
-rw-r--r--kernel/sched.c554
-rw-r--r--kernel/signal.c38
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/spinlock.c21
-rw-r--r--kernel/sys.c31
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c446
-rw-r--r--kernel/taskstats.c193
-rw-r--r--kernel/time/clocksource.c8
-rw-r--r--kernel/timer.c162
-rw-r--r--kernel/tsacct.c19
-rw-r--r--kernel/unwind.c212
-rw-r--r--kernel/user.c15
-rw-r--r--kernel/workqueue.c214
64 files changed, 3336 insertions, 1678 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 248e1c396f8..4af15802ccd 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -7,7 +7,7 @@ choice
default HZ_250
help
Allows the configuration of the timer frequency. It is customary
- to have the timer interrupt run at 1000 HZ but 100 HZ may be more
+ to have the timer interrupt run at 1000 Hz but 100 Hz may be more
beneficial for servers and NUMA systems that do not need to have
a fast response for user interaction and that may experience bus
contention and cacheline bounces as a result of timer interrupts.
@@ -19,21 +19,30 @@ choice
config HZ_100
bool "100 HZ"
help
- 100 HZ is a typical choice for servers, SMP and NUMA systems
+ 100 Hz is a typical choice for servers, SMP and NUMA systems
with lots of processors that may show reduced performance if
too many timer interrupts are occurring.
config HZ_250
bool "250 HZ"
help
- 250 HZ is a good compromise choice allowing server performance
+ 250 Hz is a good compromise choice allowing server performance
while also showing good interactive responsiveness even
- on SMP and NUMA systems.
+ on SMP and NUMA systems. If you are going to be using NTSC video
+ or multimedia, selected 300Hz instead.
+
+ config HZ_300
+ bool "300 HZ"
+ help
+ 300 Hz is a good compromise choice allowing server performance
+ while also showing good interactive responsiveness even
+ on SMP and NUMA systems and exactly dividing by both PAL and
+ NTSC frame rates for video and multimedia work.
config HZ_1000
bool "1000 HZ"
help
- 1000 HZ is the preferred choice for desktop systems and other
+ 1000 Hz is the preferred choice for desktop systems and other
systems requiring fast interactive responses to events.
endchoice
@@ -42,5 +51,6 @@ config HZ
int
default 100 if HZ_100
default 250 if HZ_250
+ default 300 if HZ_300
default 1000 if HZ_1000
diff --git a/kernel/acct.c b/kernel/acct.c
index 0aad5ca36a8..70d0d88e555 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -89,7 +89,8 @@ struct acct_glbs {
struct timer_list timer;
};
-static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED};
+static struct acct_glbs acct_globals __cacheline_aligned =
+ {__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
/*
* Called whenever the timer says to check the free space.
@@ -117,7 +118,7 @@ static int check_free_space(struct file *file)
spin_unlock(&acct_globals.lock);
/* May block */
- if (vfs_statfs(file->f_dentry, &sbuf))
+ if (vfs_statfs(file->f_path.dentry, &sbuf))
return res;
suspend = sbuf.f_blocks * SUSPEND;
resume = sbuf.f_blocks * RESUME;
@@ -193,7 +194,7 @@ static void acct_file_reopen(struct file *file)
add_timer(&acct_globals.timer);
}
if (old_acct) {
- mnt_unpin(old_acct->f_vfsmnt);
+ mnt_unpin(old_acct->f_path.mnt);
spin_unlock(&acct_globals.lock);
do_acct_process(old_acct);
filp_close(old_acct, NULL);
@@ -211,7 +212,7 @@ static int acct_on(char *name)
if (IS_ERR(file))
return PTR_ERR(file);
- if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
+ if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
filp_close(file, NULL);
return -EACCES;
}
@@ -228,11 +229,11 @@ static int acct_on(char *name)
}
spin_lock(&acct_globals.lock);
- mnt_pin(file->f_vfsmnt);
+ mnt_pin(file->f_path.mnt);
acct_file_reopen(file);
spin_unlock(&acct_globals.lock);
- mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */
+ mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
return 0;
}
@@ -282,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name)
void acct_auto_close_mnt(struct vfsmount *m)
{
spin_lock(&acct_globals.lock);
- if (acct_globals.file && acct_globals.file->f_vfsmnt == m)
+ if (acct_globals.file && acct_globals.file->f_path.mnt == m)
acct_file_reopen(NULL);
spin_unlock(&acct_globals.lock);
}
@@ -298,7 +299,7 @@ void acct_auto_close(struct super_block *sb)
{
spin_lock(&acct_globals.lock);
if (acct_globals.file &&
- acct_globals.file->f_vfsmnt->mnt_sb == sb) {
+ acct_globals.file->f_path.mnt->mnt_sb == sb) {
acct_file_reopen(NULL);
}
spin_unlock(&acct_globals.lock);
@@ -427,6 +428,7 @@ static void do_acct_process(struct file *file)
u64 elapsed;
u64 run_time;
struct timespec uptime;
+ struct tty_struct *tty;
/*
* First check to see if there is enough free_space to continue
@@ -483,16 +485,9 @@ static void do_acct_process(struct file *file)
ac.ac_ppid = current->parent->tgid;
#endif
- mutex_lock(&tty_mutex);
- /* FIXME: Whoever is responsible for current->signal locking needs
- to use the same locking all over the kernel and document it */
- read_lock(&tasklist_lock);
- ac.ac_tty = current->signal->tty ?
- old_encode_dev(tty_devnum(current->signal->tty)) : 0;
- read_unlock(&tasklist_lock);
- mutex_unlock(&tty_mutex);
-
spin_lock_irq(&current->sighand->siglock);
+ tty = current->signal->tty;
+ ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
ac.ac_flag = pacct->ac_flag;
diff --git a/kernel/audit.c b/kernel/audit.c
index 98106f6078b..d9b690ac684 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -57,6 +57,7 @@
#include <linux/netlink.h>
#include <linux/selinux.h>
#include <linux/inotify.h>
+#include <linux/freezer.h>
#include "audit.h"
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f40d923af8..2e896f8ae29 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
struct audit_rule *rule;
int i;
- rule = kmalloc(sizeof(*rule), GFP_KERNEL);
+ rule = kzalloc(sizeof(*rule), GFP_KERNEL);
if (unlikely(!rule))
return NULL;
- memset(rule, 0, sizeof(*rule));
rule->flags = krule->flags | krule->listnr;
rule->action = krule->action;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 42f2f117971..298897559ca 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -64,6 +64,7 @@
#include <linux/tty.h>
#include <linux/selinux.h>
#include <linux/binfmts.h>
+#include <linux/highmem.h>
#include <linux/syscalls.h>
#include "audit.h"
@@ -730,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context)
printk(KERN_ERR "audit: freed %d contexts\n", count);
}
-static void audit_log_task_context(struct audit_buffer *ab)
+void audit_log_task_context(struct audit_buffer *ab)
{
char *ctx = NULL;
ssize_t len = 0;
@@ -759,6 +760,8 @@ error_path:
return;
}
+EXPORT_SYMBOL(audit_log_task_context);
+
static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
{
char name[sizeof(tsk->comm)];
@@ -778,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
if ((vma->vm_flags & VM_EXECUTABLE) &&
vma->vm_file) {
audit_log_d_path(ab, "exe=",
- vma->vm_file->f_dentry,
- vma->vm_file->f_vfsmnt);
+ vma->vm_file->f_path.dentry,
+ vma->vm_file->f_path.mnt);
break;
}
vma = vma->vm_next;
@@ -823,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
context->return_code);
mutex_lock(&tty_mutex);
+ read_lock(&tasklist_lock);
if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
tty = tsk->signal->tty->name;
else
tty = "(none)";
+ read_unlock(&tasklist_lock);
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
" ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1487,6 +1492,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
return ctx ? ctx->loginuid : -1;
}
+EXPORT_SYMBOL(audit_get_loginuid);
+
/**
* __audit_mq_open - record audit data for a POSIX MQ open
* @oflag: open flag
diff --git a/kernel/compat.c b/kernel/compat.c
index d4898aad6cf..6952dd05730 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
}
return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
}
+
+asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
+ compat_ulong_t maxnode,
+ const compat_ulong_t __user *old_nodes,
+ const compat_ulong_t __user *new_nodes)
+{
+ unsigned long __user *old = NULL;
+ unsigned long __user *new = NULL;
+ nodemask_t tmp_mask;
+ unsigned long nr_bits;
+ unsigned long size;
+
+ nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
+ size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+ if (old_nodes) {
+ if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
+ return -EFAULT;
+ old = compat_alloc_user_space(new_nodes ? size * 2 : size);
+ if (new_nodes)
+ new = old + size / sizeof(unsigned long);
+ if (copy_to_user(old, nodes_addr(tmp_mask), size))
+ return -EFAULT;
+ }
+ if (new_nodes) {
+ if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
+ return -EFAULT;
+ if (new == NULL)
+ new = compat_alloc_user_space(size);
+ if (copy_to_user(new, nodes_addr(tmp_mask), size))
+ return -EFAULT;
+ }
+ return sys_migrate_pages(pid, nr_bits + 1, old, new);
+}
#endif
diff --git a/kernel/configs.c b/kernel/configs.c
index f9e31974f4a..8fa1fb28f8a 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
return count;
}
-static struct file_operations ikconfig_file_ops = {
+static const struct file_operations ikconfig_file_ops = {
.owner = THIS_MODULE,
.read = ikconfig_read_current,
};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 663c920b223..9124669f458 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,8 +58,8 @@ void unlock_cpu_hotplug(void)
recursive_depth--;
return;
}
- mutex_unlock(&cpu_bitmask_lock);
recursive = NULL;
+ mutex_unlock(&cpu_bitmask_lock);
}
EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
@@ -270,11 +270,7 @@ int disable_nonboot_cpus(void)
goto out;
}
}
- error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
- if (error) {
- printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
- goto out;
- }
+
/* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
*/
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6313c38c930..232aed2b10f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = {
*
*
* When reading/writing to a file:
- * - the cpuset to use in file->f_dentry->d_parent->d_fsdata
- * - the 'cftype' of the file is file->f_dentry->d_fsdata
+ * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
+ * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
*/
struct cftype {
@@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
}
/* Remaining checks don't apply to root cpuset */
- if ((par = cur->parent) == NULL)
+ if (cur == &top_cpuset)
return 0;
+ par = cur->parent;
+
/* We must be a subset of our parent cpuset */
if (!is_cpuset_subset(trial, par))
return -EACCES;
@@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
cpu_exclusive_changed =
(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
mutex_lock(&callback_mutex);
- if (turning_on)
- set_bit(bit, &cs->flags);
- else
- clear_bit(bit, &cs->flags);
+ cs->flags = trialcs.flags;
mutex_unlock(&callback_mutex);
if (cpu_exclusive_changed)
@@ -1281,18 +1280,19 @@ typedef enum {
FILE_TASKLIST,
} cpuset_filetype_t;
-static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf,
+static ssize_t cpuset_common_file_write(struct file *file,
+ const char __user *userbuf,
size_t nbytes, loff_t *unused_ppos)
{
- struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
- struct cftype *cft = __d_cft(file->f_dentry);
+ struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
+ struct cftype *cft = __d_cft(file->f_path.dentry);
cpuset_filetype_t type = cft->private;
char *buffer;
char *pathbuf = NULL;
int retval = 0;
/* Crude upper limit on largest legitimate cpulist user might write. */
- if (nbytes > 100 + 6 * NR_CPUS)
+ if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
return -E2BIG;
/* +1 for nul-terminator */
@@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
size_t nbytes, loff_t *ppos)
{
ssize_t retval = 0;
- struct cftype *cft = __d_cft(file->f_dentry);
+ struct cftype *cft = __d_cft(file->f_path.dentry);
if (!cft)
return -ENODEV;
@@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+ struct cftype *cft = __d_cft(file->f_path.dentry);
+ struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
cpuset_filetype_t type = cft->private;
char *page;
ssize_t retval = 0;
@@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt
loff_t *ppos)
{
ssize_t retval = 0;
- struct cftype *cft = __d_cft(file->f_dentry);
+ struct cftype *cft = __d_cft(file->f_path.dentry);
if (!cft)
return -ENODEV;
@@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
if (err)
return err;
- cft = __d_cft(file->f_dentry);
+ cft = __d_cft(file->f_path.dentry);
if (!cft)
return -ENODEV;
if (cft->open)
@@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
static int cpuset_file_release(struct inode *inode, struct file *file)
{
- struct cftype *cft = __d_cft(file->f_dentry);
+ struct cftype *cft = __d_cft(file->f_path.dentry);
if (cft->release)
return cft->release(inode, file);
return 0;
@@ -1532,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
}
-static struct file_operations cpuset_file_operations = {
+static const struct file_operations cpuset_file_operations = {
.read = cpuset_file_read,
.write = cpuset_file_write,
.llseek = generic_file_llseek,
@@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
*/
static int cpuset_tasks_open(struct inode *unused, struct file *file)
{
- struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+ struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
struct ctr_struct *ctr;
pid_t *pidarray;
int npids;
@@ -2045,7 +2045,6 @@ out:
return err;
}
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
/*
* If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2109,9 +2108,7 @@ static void common_cpu_mem_hotplug_unplug(void)
mutex_unlock(&callback_mutex);
mutex_unlock(&manage_mutex);
}
-#endif
-#ifdef CONFIG_HOTPLUG_CPU
/*
* The top_cpuset tracks what CPUs and Memory Nodes are online,
* period. This is necessary in order to make cpusets transparent
@@ -2128,7 +2125,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
common_cpu_mem_hotplug_unplug();
return 0;
}
-#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/*
@@ -2346,32 +2342,48 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
}
/**
- * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
+ * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
* @z: is this zone on an allowed node?
- * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
+ * @gfp_mask: memory allocation flags
*
- * If we're in interrupt, yes, we can always allocate. If zone
+ * If we're in interrupt, yes, we can always allocate. If
+ * __GFP_THISNODE is set, yes, we can always allocate. If zone
* z's node is in our tasks mems_allowed, yes. If it's not a
* __GFP_HARDWALL request and this zone's nodes is in the nearest
* mem_exclusive cpuset ancestor to this tasks cpuset, yes.
* Otherwise, no.
*
+ * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
+ * reduces to cpuset_zone_allowed_hardwall(). Otherwise,
+ * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
+ * from an enclosing cpuset.
+ *
+ * cpuset_zone_allowed_hardwall() only handles the simpler case of
+ * hardwall cpusets, and never sleeps.
+ *
+ * The __GFP_THISNODE placement logic is really handled elsewhere,
+ * by forcibly using a zonelist starting at a specified node, and by
+ * (in get_page_from_freelist()) refusing to consider the zones for
+ * any node on the zonelist except the first. By the time any such
+ * calls get to this routine, we should just shut up and say 'yes'.
+ *
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset.
* GFP_KERNEL allocations are not so marked, so can escape to the
- * nearest mem_exclusive ancestor cpuset.
+ * nearest enclosing mem_exclusive ancestor cpuset.
*
- * Scanning up parent cpusets requires callback_mutex. The __alloc_pages()
- * routine only calls here with __GFP_HARDWALL bit _not_ set if
- * it's a GFP_KERNEL allocation, and all nodes in the current tasks
- * mems_allowed came up empty on the first pass over the zonelist.
- * So only GFP_KERNEL allocations, if all nodes in the cpuset are
- * short of memory, might require taking the callback_mutex mutex.
+ * Scanning up parent cpusets requires callback_mutex. The
+ * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
+ * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
+ * current tasks mems_allowed came up empty on the first pass over
+ * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
+ * cpuset are short of memory, might require taking the callback_mutex
+ * mutex.
*
* The first call here from mm/page_alloc:get_page_from_freelist()
- * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
- * no allocation on a node outside the cpuset is allowed (unless in
- * interrupt, of course).
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
+ * so no allocation on a node outside the cpuset is allowed (unless
+ * in interrupt, of course).
*
* The second pass through get_page_from_freelist() doesn't even call
* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
@@ -2384,12 +2396,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
* GFP_USER - only nodes in current tasks mems allowed ok.
*
* Rule:
- * Don't call cpuset_zone_allowed() if you can't sleep, unless you
+ * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
* pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
* the code that might scan up ancestor cpusets and sleep.
- **/
+ */
-int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
{
int node; /* node that zone z is on */
const struct cpuset *cs; /* current cpuset ancestors */
@@ -2419,6 +2431,40 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
return allowed;
}
+/*
+ * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags
+ *
+ * If we're in interrupt, yes, we can always allocate.
+ * If __GFP_THISNODE is set, yes, we can always allocate. If zone
+ * z's node is in our tasks mems_allowed, yes. Otherwise, no.
+ *
+ * The __GFP_THISNODE placement logic is really handled elsewhere,
+ * by forcibly using a zonelist starting at a specified node, and by
+ * (in get_page_from_freelist()) refusing to consider the zones for
+ * any node on the zonelist except the first. By the time any such
+ * calls get to this routine, we should just shut up and say 'yes'.
+ *
+ * Unlike the cpuset_zone_allowed_softwall() variant, above,
+ * this variant requires that the zone be in the current tasks
+ * mems_allowed or that we're in interrupt. It does not scan up the
+ * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
+ * It never sleeps.
+ */
+
+int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+{
+ int node; /* node that zone z is on */
+
+ if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
+ return 1;
+ node = zone_to_nid(z);
+ if (node_isset(node, current->mems_allowed))
+ return 1;
+ return 0;
+}
+
/**
* cpuset_lock - lock out any changes to cpuset structures
*
@@ -2610,7 +2656,7 @@ static int cpuset_open(struct inode *inode, struct file *file)
return single_open(file, proc_cpuset_show, pid);
}
-struct file_operations proc_cpuset_operations = {
+const struct file_operations proc_cpuset_operations = {
.open = cpuset_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 36752f124c6..766d5912b26 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -20,7 +20,7 @@
#include <linux/delayacct.h>
int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
-kmem_cache_t *delayacct_cache;
+struct kmem_cache *delayacct_cache;
static int __init delayacct_setup_disable(char *str)
{
@@ -41,7 +41,7 @@ void delayacct_init(void)
void __delayacct_tsk_init(struct task_struct *tsk)
{
- tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
+ tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
if (tsk->delays)
spin_lock_init(&tsk->delays->lock);
}
@@ -66,6 +66,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
{
struct timespec ts;
s64 ns;
+ unsigned long flags;
do_posix_clock_monotonic_gettime(end);
ts = timespec_sub(*end, *start);
@@ -73,10 +74,10 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
if (ns < 0)
return;
- spin_lock(&current->delays->lock);
+ spin_lock_irqsave(&current->delays->lock, flags);
*total += ns;
(*count)++;
- spin_unlock(&current->delays->lock);
+ spin_unlock_irqrestore(&current->delays->lock, flags);
}
void __delayacct_blkio_start(void)
@@ -104,6 +105,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
s64 tmp;
struct timespec ts;
unsigned long t1,t2,t3;
+ unsigned long flags;
/* Though tsk->delays accessed later, early exit avoids
* unnecessary returning of other data
@@ -136,14 +138,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
- spin_lock(&tsk->delays->lock);
+ spin_lock_irqsave(&tsk->delays->lock, flags);
tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
- spin_unlock(&tsk->delays->lock);
+ spin_unlock_irqrestore(&tsk->delays->lock, flags);
done:
return 0;
@@ -152,11 +154,12 @@ done:
__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
{
__u64 ret;
+ unsigned long flags;
- spin_lock(&tsk->delays->lock);
+ spin_lock_irqsave(&tsk->delays->lock, flags);
ret = nsec_to_clock_t(tsk->delays->blkio_delay +
tsk->delays->swapin_delay);
- spin_unlock(&tsk->delays->lock);
+ spin_unlock_irqrestore(&tsk->delays->lock, flags);
return ret;
}
diff --git a/kernel/dma.c b/kernel/dma.c
index 2020644c938..937b13ca33b 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file)
return single_open(file, proc_dma_show, NULL);
}
-static struct file_operations proc_dma_operations = {
+static const struct file_operations proc_dma_operations = {
.open = proc_dma_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/kernel/exit.c b/kernel/exit.c
index 06de6c4e8ca..122fadb972f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,7 +13,7 @@
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
#include <linux/key.h>
#include <linux/security.h>
#include <linux/cpu.h>
@@ -22,6 +22,7 @@
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
@@ -48,7 +49,6 @@
#include <asm/mmu_context.h>
extern void sem_exit (void);
-extern struct task_struct *child_reaper;
static void exit_mm(struct task_struct * tsk);
@@ -189,21 +189,18 @@ repeat:
int session_of_pgrp(int pgrp)
{
struct task_struct *p;
- int sid = -1;
+ int sid = 0;
read_lock(&tasklist_lock);
- do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
- if (p->signal->session > 0) {
- sid = p->signal->session;
- goto out;
- }
- } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
- p = find_task_by_pid(pgrp);
- if (p)
- sid = p->signal->session;
-out:
+
+ p = find_task_by_pid_type(PIDTYPE_PGID, pgrp);
+ if (p == NULL)
+ p = find_task_by_pid(pgrp);
+ if (p != NULL)
+ sid = process_session(p);
+
read_unlock(&tasklist_lock);
-
+
return sid;
}
@@ -225,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
|| p->exit_state
|| is_init(p->real_parent))
continue;
- if (process_group(p->real_parent) != pgrp
- && p->real_parent->signal->session == p->signal->session) {
+ if (process_group(p->real_parent) != pgrp &&
+ process_session(p->real_parent) == process_session(p)) {
ret = 0;
break;
}
@@ -260,7 +257,8 @@ static int has_stopped_jobs(int pgrp)
}
/**
- * reparent_to_init - Reparent the calling kernel thread to the init task.
+ * reparent_to_init - Reparent the calling kernel thread to the init task
+ * of the pid space that the thread belongs to.
*
* If a kernel thread is launched as a result of a system call, or if
* it ever exits, it should generally reparent itself to init so that
@@ -278,8 +276,8 @@ static void reparent_to_init(void)
ptrace_unlink(current);
/* Reparent to init */
remove_parent(current);
- current->parent = child_reaper;
- current->real_parent = child_reaper;
+ current->parent = child_reaper(current);
+ current->real_parent = child_reaper(current);
add_parent(current);
/* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -302,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp)
{
struct task_struct *curr = current->group_leader;
- if (curr->signal->session != session) {
+ if (process_session(curr) != session) {
detach_pid(curr, PIDTYPE_SID);
- curr->signal->session = session;
+ set_signal_session(curr->signal, session);
attach_pid(curr, PIDTYPE_SID, session);
}
if (process_group(curr) != pgrp) {
@@ -314,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp)
}
}
-void set_special_pids(pid_t session, pid_t pgrp)
+static void set_special_pids(pid_t session, pid_t pgrp)
{
write_lock_irq(&tasklist_lock);
__set_special_pids(session, pgrp);
@@ -384,9 +382,7 @@ void daemonize(const char *name, ...)
exit_mm(current);
set_special_pids(1, 1);
- mutex_lock(&tty_mutex);
- current->signal->tty = NULL;
- mutex_unlock(&tty_mutex);
+ proc_clear_tty(current);
/* Block and flush all signals */
sigfillset(&blocked);
@@ -429,7 +425,7 @@ static void close_files(struct files_struct * files)
for (;;) {
unsigned long set;
i = j * __NFDBITS;
- if (i >= fdt->max_fdset || i >= fdt->max_fds)
+ if (i >= fdt->max_fds)
break;
set = fdt->open_fds->fds_bits[j++];
while (set) {
@@ -470,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files)
* you can free files immediately.
*/
fdt = files_fdtable(files);
- if (fdt == &files->fdtab)
- fdt->free_files = files;
- else
+ if (fdt != &files->fdtab)
kmem_cache_free(files_cachep, files);
- free_fdtable(fdt);
+ call_rcu(&fdt->rcu, free_fdtable_rcu);
}
}
@@ -649,10 +643,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
* outside, so the child pgrp is now orphaned.
*/
if ((process_group(p) != process_group(father)) &&
- (p->signal->session == father->signal->session)) {
+ (process_session(p) == process_session(father))) {
int pgrp = process_group(p);
- if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
+ if (will_become_orphaned_pgrp(pgrp, NULL) &&
+ has_stopped_jobs(pgrp)) {
__kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
__kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
}
@@ -663,7 +658,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
* When we die, we re-parent all our children.
* Try to give them to another thread in our thread
* group, and if no such member exists, give it to
- * the global child reaper process (ie "init")
+ * the child reaper process (ie "init") in our pid
+ * space.
*/
static void
forget_original_parent(struct task_struct *father, struct list_head *to_release)
@@ -674,7 +670,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
do {
reaper = next_thread(reaper);
if (reaper == father) {
- reaper = child_reaper;
+ reaper = child_reaper(father);
break;
}
} while (reaper->exit_state);
@@ -786,7 +782,7 @@ static void exit_notify(struct task_struct *tsk)
t = tsk->real_parent;
if ((process_group(t) != process_group(tsk)) &&
- (t->signal->session == tsk->signal->session) &&
+ (process_session(t) == process_session(tsk)) &&
will_become_orphaned_pgrp(process_group(tsk), tsk) &&
has_stopped_jobs(process_group(tsk))) {
__kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
@@ -850,9 +846,7 @@ static void exit_notify(struct task_struct *tsk)
fastcall NORET_TYPE void do_exit(long code)
{
struct task_struct *tsk = current;
- struct taskstats *tidstats;
int group_dead;
- unsigned int mycpu;
profile_task_exit(tsk);
@@ -862,8 +856,13 @@ fastcall NORET_TYPE void do_exit(long code)
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
- if (unlikely(tsk == child_reaper))
- panic("Attempted to kill init!");
+ if (unlikely(tsk == child_reaper(tsk))) {
+ if (tsk->nsproxy->pid_ns != &init_pid_ns)
+ tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper;
+ else
+ panic("Attempted to kill init!");
+ }
+
if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
current->ptrace_message = code;
@@ -890,8 +889,6 @@ fastcall NORET_TYPE void do_exit(long code)
current->comm, current->pid,
preempt_count());
- taskstats_exit_alloc(&tidstats, &mycpu);
-
acct_update_integrals(tsk);
if (tsk->mm) {
update_hiwater_rss(tsk->mm);
@@ -911,8 +908,8 @@ fastcall NORET_TYPE void do_exit(long code)
#endif
if (unlikely(tsk->audit_context))
audit_free(tsk);
- taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
- taskstats_exit_free(tidstats);
+
+ taskstats_exit(tsk, group_dead);
exit_mm(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da978eec79..fc723e595cd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,7 +18,7 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
@@ -36,6 +36,7 @@
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
@@ -82,26 +83,26 @@ int nr_processes(void)
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
-static kmem_cache_t *task_struct_cachep;
+static struct kmem_cache *task_struct_cachep;
#endif
/* SLAB cache for signal_struct structures (tsk->signal) */
-static kmem_cache_t *signal_cachep;
+static struct kmem_cache *signal_cachep;
/* SLAB cache for sighand_struct structures (tsk->sighand) */
-kmem_cache_t *sighand_cachep;
+struct kmem_cache *sighand_cachep;
/* SLAB cache for files_struct structures (tsk->files) */
-kmem_cache_t *files_cachep;
+struct kmem_cache *files_cachep;
/* SLAB cache for fs_struct structures (tsk->fs) */
-kmem_cache_t *fs_cachep;
+struct kmem_cache *fs_cachep;
/* SLAB cache for vm_area_struct structures */
-kmem_cache_t *vm_area_cachep;
+struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
-static kmem_cache_t *mm_cachep;
+static struct kmem_cache *mm_cachep;
void free_task(struct task_struct *tsk)
{
@@ -202,7 +203,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
struct mempolicy *pol;
down_write(&oldmm->mmap_sem);
- flush_cache_mm(oldmm);
+ flush_cache_dup_mm(oldmm);
/*
* Not linked in yet - no deadlock potential:
*/
@@ -237,7 +238,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
goto fail_nomem;
charge = len;
}
- tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!tmp)
goto fail_nomem;
*tmp = *mpnt;
@@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
anon_vma_link(tmp);
file = tmp->vm_file;
if (file) {
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
@@ -319,7 +320,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
-#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
+#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
#include <linux/init_task.h>
@@ -448,7 +449,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
tsk->vfork_done = NULL;
complete(vfork_done);
}
- if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
+
+ /*
+ * If we're exiting normally, clear a user-space tid field if
+ * requested. We leave this alone when dying by signal, to leave
+ * the value intact in a core dump, and to save the unnecessary
+ * trouble otherwise. Userland only wants this done for a sys_exit.
+ */
+ if (tsk->clear_child_tid
+ && !(tsk->flags & PF_SIGNALED)
+ && atomic_read(&mm->mm_users) > 1) {
u32 __user * tidptr = tsk->clear_child_tid;
tsk->clear_child_tid = NULL;
@@ -479,6 +489,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
memcpy(mm, oldmm, sizeof(*mm));
+ /* Initializing for Swap token stuff */
+ mm->token_priority = 0;
+ mm->last_interval = 0;
+
if (!mm_init(mm))
goto fail_nomem;
@@ -542,6 +556,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
goto fail_nomem;
good_mm:
+ /* Initializing for Swap token stuff */
+ mm->token_priority = 0;
+ mm->last_interval = 0;
+
tsk->mm = mm;
tsk->active_mm = mm;
return 0;
@@ -596,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
static int count_open_files(struct fdtable *fdt)
{
- int size = fdt->max_fdset;
+ int size = fdt->max_fds;
int i;
/* Find the last open fd */
@@ -613,7 +631,7 @@ static struct files_struct *alloc_files(void)
struct files_struct *newf;
struct fdtable *fdt;
- newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+ newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
if (!newf)
goto out;
@@ -623,12 +641,10 @@ static struct files_struct *alloc_files(void)
newf->next_fd = 0;
fdt = &newf->fdtab;
fdt->max_fds = NR_OPEN_DEFAULT;
- fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
fdt->open_fds = (fd_set *)&newf->open_fds_init;
fdt->fd = &newf->fd_array[0];
INIT_RCU_HEAD(&fdt->rcu);
- fdt->free_files = NULL;
fdt->next = NULL;
rcu_assign_pointer(newf->fdt, fdt);
out:
@@ -644,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
struct files_struct *newf;
struct file **old_fds, **new_fds;
- int open_files, size, i, expand;
+ int open_files, size, i;
struct fdtable *old_fdt, *new_fdt;
*errorp = -ENOMEM;
@@ -655,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
new_fdt = files_fdtable(newf);
- size = old_fdt->max_fdset;
open_files = count_open_files(old_fdt);
- expand = 0;
/*
- * Check whether we need to allocate a larger fd array or fd set.
- * Note: we're not a clone task, so the open count won't change.
+ * Check whether we need to allocate a larger fd array and fd set.
+ * Note: we're not a clone task, so the open count won't change.
*/
- if (open_files > new_fdt->max_fdset) {
- new_fdt->max_fdset = 0;
- expand = 1;
- }
if (open_files > new_fdt->max_fds) {
new_fdt->max_fds = 0;
- expand = 1;
- }
-
- /* if the old fdset gets grown now, we'll only copy up to "size" fds */
- if (expand) {
spin_unlock(&oldf->file_lock);
spin_lock(&newf->file_lock);
*errorp = expand_files(newf, open_files-1);
@@ -693,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
- memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
- memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
+ memcpy(new_fdt->open_fds->fds_bits,
+ old_fdt->open_fds->fds_bits, open_files/8);
+ memcpy(new_fdt->close_on_exec->fds_bits,
+ old_fdt->close_on_exec->fds_bits, open_files/8);
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
@@ -719,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
/* This is long word aligned thus could use a optimized version */
memset(new_fds, 0, size);
- if (new_fdt->max_fdset > open_files) {
- int left = (new_fdt->max_fdset-open_files)/8;
+ if (new_fdt->max_fds > open_files) {
+ int left = (new_fdt->max_fds-open_files)/8;
int start = open_files / (8 * sizeof(unsigned long));
memset(&new_fdt->open_fds->fds_bits[start], 0, left);
memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
}
-out:
return newf;
out_release:
- free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
- free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
- free_fd_array(new_fdt->fd, new_fdt->max_fds);
kmem_cache_free(files_cachep, newf);
+out:
return NULL;
}
@@ -830,7 +834,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
if (clone_flags & CLONE_THREAD) {
atomic_inc(&current->signal->count);
atomic_inc(&current->signal->live);
- taskstats_tgid_alloc(current);
return 0;
}
sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -1039,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->wchar = 0; /* I/O counter: bytes written */
p->syscr = 0; /* I/O counter: read syscalls */
p->syscw = 0; /* I/O counter: write syscalls */
+ task_io_accounting_init(p);
acct_clear_integrals(p);
p->it_virt_expires = cputime_zero;
@@ -1243,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (thread_group_leader(p)) {
p->signal->tty = current->signal->tty;
p->signal->pgrp = process_group(current);
- p->signal->session = current->signal->session;
+ set_signal_session(p->signal, process_session(current));
attach_pid(p, PIDTYPE_PGID, process_group(p));
- attach_pid(p, PIDTYPE_SID, p->signal->session);
+ attach_pid(p, PIDTYPE_SID, process_session(p));
list_add_tail_rcu(&p->tasks, &init_task.tasks);
__get_cpu_var(process_counts)++;
@@ -1303,7 +1307,7 @@ fork_out:
return ERR_PTR(retval);
}
-struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
+noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
return regs;
@@ -1315,9 +1319,8 @@ struct task_struct * __devinit fork_idle(int cpu)
struct pt_regs regs;
task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
- if (!task)
- return ERR_PTR(-ENOMEM);
- init_idle(task, cpu);
+ if (!IS_ERR(task))
+ init_idle(task, cpu);
return task;
}
@@ -1414,7 +1417,7 @@ long do_fork(unsigned long clone_flags,
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
-static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags)
{
struct sighand_struct *sighand = data;
@@ -1510,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
}
/*
- * Unshare the namespace structure if it is being shared
+ * Unshare the mnt_namespace structure if it is being shared
*/
-static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
+static int unshare_mnt_namespace(unsigned long unshare_flags,
+ struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
{
- struct namespace *ns = current->nsproxy->namespace;
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
if ((unshare_flags & CLONE_NEWNS) && ns) {
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
+ *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
if (!*new_nsp)
return -ENOMEM;
}
@@ -1529,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new
}
/*
- * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
- * supported yet
+ * Unsharing of sighand is not supported yet
*/
static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
{
struct sighand_struct *sigh = current->sighand;
- if ((unshare_flags & CLONE_SIGHAND) &&
- (sigh && atomic_read(&sigh->count) > 1))
+ if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
return -EINVAL;
else
return 0;
@@ -1610,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
{
int err = 0;
struct fs_struct *fs, *new_fs = NULL;
- struct namespace *ns, *new_ns = NULL;
- struct sighand_struct *sigh, *new_sigh = NULL;
+ struct mnt_namespace *ns, *new_ns = NULL;
+ struct sighand_struct *new_sigh = NULL;
struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct sem_undo_list *new_ulist = NULL;
@@ -1632,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
goto bad_unshare_out;
if ((err = unshare_fs(unshare_flags, &new_fs)))
goto bad_unshare_cleanup_thread;
- if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
+ if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
goto bad_unshare_cleanup_fs;
if ((err = unshare_sighand(unshare_flags, &new_sigh)))
goto bad_unshare_cleanup_ns;
@@ -1656,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
}
}
- if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
+ if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
new_uts || new_ipc) {
task_lock(current);
@@ -1673,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
}
if (new_ns) {
- ns = current->nsproxy->namespace;
- current->nsproxy->namespace = new_ns;
+ ns = current->nsproxy->mnt_ns;
+ current->nsproxy->mnt_ns = new_ns;
new_ns = ns;
}
- if (new_sigh) {
- sigh = current->sighand;
- rcu_assign_pointer(current->sighand, new_sigh);
- new_sigh = sigh;
- }
-
if (new_mm) {
mm = current->mm;
active_mm = current->active_mm;
@@ -1741,7 +1737,7 @@ bad_unshare_cleanup_sigh:
bad_unshare_cleanup_ns:
if (new_ns)
- put_namespace(new_ns);
+ put_mnt_ns(new_ns);
bad_unshare_cleanup_fs:
if (new_fs)
diff --git a/kernel/futex.c b/kernel/futex.c
index b364e002619..5a737de857d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
/*
* Get parameters which are the keys for a futex.
*
- * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
+ * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
@@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
/*
* Linear file mappings are also simple.
*/
- key->shared.inode = vma->vm_file->f_dentry->d_inode;
+ key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
{
int ret;
- inc_preempt_count();
+ pagefault_disable();
ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
- dec_preempt_count();
+ pagefault_enable();
return ret ? -EFAULT : 0;
}
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void)
if (likely(current->pi_state_cache))
return 0;
- pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
if (!pi_state)
return -ENOMEM;
- memset(pi_state, 0, sizeof(*pi_state));
INIT_LIST_HEAD(&pi_state->list);
/* pi_mutex gets initialized later */
pi_state->owner = NULL;
@@ -553,7 +552,7 @@ static void wake_futex(struct futex_q *q)
* at the end of wake_up_all() does not prevent this store from
* moving.
*/
- wmb();
+ smp_wmb();
q->lock_ptr = NULL;
}
@@ -585,9 +584,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
if (!(uval & FUTEX_OWNER_DIED)) {
newval = FUTEX_WAITERS | new_owner->pid;
- inc_preempt_count();
+ pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
- dec_preempt_count();
+ pagefault_enable();
if (curval == -EFAULT)
return -EFAULT;
if (curval != uval)
@@ -618,9 +617,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
* There is no waiter, so we unlock the futex. The owner died
* bit has not to be preserved here. We are the owner:
*/
- inc_preempt_count();
+ pagefault_disable();
oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
- dec_preempt_count();
+ pagefault_enable();
if (oldval == -EFAULT)
return oldval;
@@ -1158,9 +1157,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
*/
newval = current->pid;
- inc_preempt_count();
+ pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
- dec_preempt_count();
+ pagefault_enable();
if (unlikely(curval == -EFAULT))
goto uaddr_faulted;
@@ -1183,9 +1182,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
uval = curval;
newval = uval | FUTEX_WAITERS;
- inc_preempt_count();
+ pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
- dec_preempt_count();
+ pagefault_enable();
if (unlikely(curval == -EFAULT))
goto uaddr_faulted;
@@ -1215,10 +1214,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
newval = current->pid |
FUTEX_OWNER_DIED | FUTEX_WAITERS;
- inc_preempt_count();
+ pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr,
uval, newval);
- dec_preempt_count();
+ pagefault_enable();
if (unlikely(curval == -EFAULT))
goto uaddr_faulted;
@@ -1390,9 +1389,9 @@ retry_locked:
* anyone else up:
*/
if (!(uval & FUTEX_OWNER_DIED)) {
- inc_preempt_count();
+ pagefault_disable();
uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
- dec_preempt_count();
+ pagefault_enable();
}
if (unlikely(uval == -EFAULT))
@@ -1493,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp,
return ret;
}
-static struct file_operations futex_fops = {
+static const struct file_operations futex_fops = {
.release = futex_close,
.poll = futex_poll,
};
@@ -1507,6 +1506,13 @@ static int futex_fd(u32 __user *uaddr, int signal)
struct futex_q *q;
struct file *filp;
int ret, err;
+ static unsigned long printk_interval;
+
+ if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
+ printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
+ "will be removed from the kernel in June 2007\n",
+ current->comm);
+ }
ret = -EINVAL;
if (!valid_signal(signal))
@@ -1522,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal)
goto out;
}
filp->f_op = &futex_fops;
- filp->f_vfsmnt = mntget(futex_mnt);
- filp->f_dentry = dget(futex_mnt->mnt_root);
- filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+ filp->f_path.mnt = mntget(futex_mnt);
+ filp->f_path.dentry = dget(futex_mnt->mnt_root);
+ filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
if (signal) {
err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
@@ -1851,10 +1857,16 @@ static struct file_system_type futex_fs_type = {
static int __init init(void)
{
- unsigned int i;
+ int i = register_filesystem(&futex_fs_type);
+
+ if (i)
+ return i;
- register_filesystem(&futex_fs_type);
futex_mnt = kern_mount(&futex_fs_type);
+ if (IS_ERR(futex_mnt)) {
+ unregister_filesystem(&futex_fs_type);
+ return PTR_ERR(futex_mnt);
+ }
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
INIT_LIST_HEAD(&futex_queues[i].chain);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2d0dc3efe81..ebfd24a4185 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -233,6 +233,8 @@ void irq_chip_set_defaults(struct irq_chip *chip)
chip->shutdown = chip->disable;
if (!chip->name)
chip->name = chip->typename;
+ if (!chip->end)
+ chip->end = dummy_irq_chip.end;
}
static inline void mask_ack_irq(struct irq_desc *desc, int irq)
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 42aa6f1a3f0..aff1f0fabb0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
.chip = &no_irq_chip,
.handle_irq = handle_bad_irq,
.depth = 1,
- .lock = SPIN_LOCK_UNLOCKED,
+ .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
#ifdef CONFIG_SMP
.affinity = CPU_MASK_ALL
#endif
@@ -231,10 +231,10 @@ fastcall unsigned int __do_IRQ(unsigned int irq)
spin_unlock(&desc->lock);
action_ret = handle_IRQ_event(irq, action);
-
- spin_lock(&desc->lock);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
+
+ spin_lock(&desc->lock);
if (likely(!(desc->status & IRQ_PENDING)))
break;
desc->status &= ~IRQ_PENDING;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6879202afe9..b385878c6e8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,6 +216,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
{
struct irq_desc *desc = irq_desc + irq;
struct irqaction *old, **p;
+ const char *old_name = NULL;
unsigned long flags;
int shared = 0;
@@ -255,8 +256,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
* set the trigger type must match.
*/
if (!((old->flags & new->flags) & IRQF_SHARED) ||
- ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK))
+ ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) {
+ old_name = old->name;
goto mismatch;
+ }
#if defined(CONFIG_IRQ_PER_CPU)
/* All handlers must agree on per-cpuness */
@@ -322,11 +325,13 @@ int setup_irq(unsigned int irq, struct irqaction *new)
return 0;
mismatch:
- spin_unlock_irqrestore(&desc->lock, flags);
if (!(new->flags & IRQF_PROBE_SHARED)) {
printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
+ if (old_name)
+ printk(KERN_ERR "current handler: %s\n", old_name);
dump_stack();
}
+ spin_unlock_irqrestore(&desc->lock, flags);
return -EBUSY;
}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9a352667007..61f5c717a8f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
unsigned int irq = (int)(long)data, full_count = count, err;
cpumask_t new_value, tmp;
- if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
+ if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
+ CHECK_IRQ_PER_CPU(irq_desc[irq].status))
return -EIO;
err = cpumask_parse_user(buffer, count, new_value);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index eeac3e313b2..6f294ff4f9e 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -20,6 +20,7 @@
#include <linux/proc_fs.h>
#include <linux/sched.h> /* for cond_resched */
#include <linux/mm.h>
+#include <linux/ctype.h>
#include <asm/sections.h>
@@ -30,14 +31,14 @@
#endif
/* These will be re-linked against their real values during the second link stage */
-extern unsigned long kallsyms_addresses[] __attribute__((weak));
-extern unsigned long kallsyms_num_syms __attribute__((weak,section("data")));
-extern u8 kallsyms_names[] __attribute__((weak));
+extern const unsigned long kallsyms_addresses[] __attribute__((weak));
+extern const unsigned long kallsyms_num_syms __attribute__((weak));
+extern const u8 kallsyms_names[] __attribute__((weak));
-extern u8 kallsyms_token_table[] __attribute__((weak));
-extern u16 kallsyms_token_index[] __attribute__((weak));
+extern const u8 kallsyms_token_table[] __attribute__((weak));
+extern const u16 kallsyms_token_index[] __attribute__((weak));
-extern unsigned long kallsyms_markers[] __attribute__((weak));
+extern const unsigned long kallsyms_markers[] __attribute__((weak));
static inline int is_kernel_inittext(unsigned long addr)
{
@@ -83,7 +84,7 @@ static int is_ksym_addr(unsigned long addr)
static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
{
int len, skipped_first = 0;
- u8 *tptr, *data;
+ const u8 *tptr, *data;
/* get the compressed symbol length from the first symbol byte */
data = &kallsyms_names[off];
@@ -131,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
* kallsyms array */
static unsigned int get_symbol_offset(unsigned long pos)
{
- u8 *name;
+ const u8 *name;
int i;
/* use the closest marker we have. We have markers every 256 positions,
@@ -301,13 +302,6 @@ struct kallsym_iter
char name[KSYM_NAME_LEN+1];
};
-/* Only label it "global" if it is exported. */
-static void upcase_if_global(struct kallsym_iter *iter)
-{
- if (is_exported(iter->name, iter->owner))
- iter->type += 'A' - 'a';
-}
-
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
@@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
if (iter->owner == NULL)
return 0;
- upcase_if_global(iter);
+ /* Label it "global" if it is exported, "local" if not exported. */
+ iter->type = is_exported(iter->name, iter->owner)
+ ? toupper(iter->type) : tolower(iter->type);
+
return 1;
}
@@ -401,7 +398,7 @@ static int s_show(struct seq_file *m, void *p)
return 0;
}
-static struct seq_operations kallsyms_op = {
+static const struct seq_operations kallsyms_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
@@ -436,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file)
return seq_release(inode, file);
}
-static struct file_operations kallsyms_operations = {
+static const struct file_operations kallsyms_operations = {
.open = kallsyms_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index fcdd5d2bc3f..2a59c8a01ae 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -20,6 +20,8 @@
#include <linux/syscalls.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -108,11 +110,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
/* Allocate a controlling structure */
result = -ENOMEM;
- image = kmalloc(sizeof(*image), GFP_KERNEL);
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
if (!image)
goto out;
- memset(image, 0, sizeof(*image));
image->head = 0;
image->entry = &image->head;
image->last_entry = &image->head;
@@ -851,6 +852,7 @@ static int kimage_load_crash_segment(struct kimage *image,
memset(ptr + uchunk, 0, mchunk - uchunk);
}
result = copy_from_user(ptr, buf, uchunk);
+ kexec_flush_icache_page(page);
kunmap(page);
if (result) {
result = (result < 0) ? result : -EIO;
@@ -1067,6 +1069,60 @@ void crash_kexec(struct pt_regs *regs)
}
}
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+ size_t data_len)
+{
+ struct elf_note note;
+
+ note.n_namesz = strlen(name) + 1;
+ note.n_descsz = data_len;
+ note.n_type = type;
+ memcpy(buf, &note, sizeof(note));
+ buf += (sizeof(note) + 3)/4;
+ memcpy(buf, name, note.n_namesz);
+ buf += (note.n_namesz + 3)/4;
+ memcpy(buf, data, note.n_descsz);
+ buf += (note.n_descsz + 3)/4;
+
+ return buf;
+}
+
+static void final_note(u32 *buf)
+{
+ struct elf_note note;
+
+ note.n_namesz = 0;
+ note.n_descsz = 0;
+ note.n_type = 0;
+ memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+ struct elf_prstatus prstatus;
+ u32 *buf;
+
+ if ((cpu < 0) || (cpu >= NR_CPUS))
+ return;
+
+ /* Using ELF notes here is opportunistic.
+ * I need a well defined structure format
+ * for the data I pass, and I need tags
+ * on the data to indicate what information I have
+ * squirrelled away. ELF notes happen to provide
+ * all of that, so there is no need to invent something new.
+ */
+ buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+ if (!buf)
+ return;
+ memset(&prstatus, 0, sizeof(prstatus));
+ prstatus.pr_pid = current->pid;
+ elf_core_copy_regs(&prstatus.pr_reg, regs);
+ buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+ sizeof(prstatus));
+ final_note(buf);
+}
+
static int __init crash_notes_memory_init(void)
{
/* Allocate memory for saving cpu registers. */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb4e29d924e..3a7379aa31c 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,7 +25,7 @@
#include <linux/kmod.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
#include <linux/completion.h>
#include <linux/file.h>
#include <linux/workqueue.h>
@@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module);
#endif /* CONFIG_KMOD */
struct subprocess_info {
+ struct work_struct work;
struct completion *complete;
char *path;
char **argv;
@@ -221,9 +222,10 @@ static int wait_for_helper(void *data)
}
/* This is run by khelper thread */
-static void __call_usermodehelper(void *data)
+static void __call_usermodehelper(struct work_struct *work)
{
- struct subprocess_info *sub_info = data;
+ struct subprocess_info *sub_info =
+ container_of(work, struct subprocess_info, work);
pid_t pid;
int wait = sub_info->wait;
@@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
{
DECLARE_COMPLETION_ONSTACK(done);
struct subprocess_info sub_info = {
+ .work = __WORK_INITIALIZER(sub_info.work,
+ __call_usermodehelper),
.complete = &done,
.path = path,
.argv = argv,
@@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
.wait = wait,
.retval = 0,
};
- DECLARE_WORK(work, __call_usermodehelper, &sub_info);
if (!khelper_wq)
return -EBUSY;
@@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
if (path[0] == '\0')
return 0;
- queue_work(khelper_wq, &work);
+ queue_work(khelper_wq, &sub_info.work);
wait_for_completion(&done);
return sub_info.retval;
}
@@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
{
DECLARE_COMPLETION(done);
struct subprocess_info sub_info = {
+ .work = __WORK_INITIALIZER(sub_info.work,
+ __call_usermodehelper),
.complete = &done,
.path = path,
.argv = argv,
@@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
.retval = 0,
};
struct file *f;
- DECLARE_WORK(work, __call_usermodehelper, &sub_info);
if (!khelper_wq)
return -EBUSY;
@@ -307,18 +311,18 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
return 0;
f = create_write_pipe();
- if (!f)
- return -ENOMEM;
+ if (IS_ERR(f))
+ return PTR_ERR(f);
*filp = f;
f = create_read_pipe(f);
- if (!f) {
+ if (IS_ERR(f)) {
free_write_pipe(*filp);
- return -ENOMEM;
+ return PTR_ERR(f);
}
sub_info.stdin = f;
- queue_work(khelper_wq, &work);
+ queue_work(khelper_wq, &sub_info.work);
wait_for_completion(&done);
return sub_info.retval;
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 610c837ad9e..17ec4afb099 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -38,6 +38,7 @@
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/kallsyms.h>
+#include <linux/freezer.h>
#include <asm-generic/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
@@ -83,9 +84,36 @@ struct kprobe_insn_page {
kprobe_opcode_t *insns; /* Page of instruction slots */
char slot_used[INSNS_PER_PAGE];
int nused;
+ int ngarbage;
};
static struct hlist_head kprobe_insn_pages;
+static int kprobe_garbage_slots;
+static int collect_garbage_slots(void);
+
+static int __kprobes check_safety(void)
+{
+ int ret = 0;
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
+ ret = freeze_processes();
+ if (ret == 0) {
+ struct task_struct *p, *q;
+ do_each_thread(p, q) {
+ if (p != current && p->state == TASK_RUNNING &&
+ p->pid != 0) {
+ printk("Check failed: %s is running\n",p->comm);
+ ret = -1;
+ goto loop_end;
+ }
+ } while_each_thread(p, q);
+ }
+loop_end:
+ thaw_processes();
+#else
+ synchronize_sched();
+#endif
+ return ret;
+}
/**
* get_insn_slot() - Find a slot on an executable page for an instruction.
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
struct kprobe_insn_page *kip;
struct hlist_node *pos;
+ retry:
hlist_for_each(pos, &kprobe_insn_pages) {
kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
if (kip->nused < INSNS_PER_PAGE) {
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
}
}
- /* All out of space. Need to allocate a new page. Use slot 0.*/
+ /* If there are any garbage slots, collect it and try again. */
+ if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
+ goto retry;
+ }
+ /* All out of space. Need to allocate a new page. Use slot 0. */
kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
if (!kip) {
return NULL;
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
memset(kip->slot_used, 0, INSNS_PER_PAGE);
kip->slot_used[0] = 1;
kip->nused = 1;
+ kip->ngarbage = 0;
return kip->insns;
}
-void __kprobes free_insn_slot(kprobe_opcode_t *slot)
+/* Return 1 if all garbages are collected, otherwise 0. */
+static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
+{
+ kip->slot_used[idx] = 0;
+ kip->nused--;
+ if (kip->nused == 0) {
+ /*
+ * Page is no longer in use. Free it unless
+ * it's the last one. We keep the last one
+ * so as not to have to set it up again the
+ * next time somebody inserts a probe.
+ */
+ hlist_del(&kip->hlist);
+ if (hlist_empty(&kprobe_insn_pages)) {
+ INIT_HLIST_NODE(&kip->hlist);
+ hlist_add_head(&kip->hlist,
+ &kprobe_insn_pages);
+ } else {
+ module_free(NULL, kip->insns);
+ kfree(kip);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int __kprobes collect_garbage_slots(void)
+{
+ struct kprobe_insn_page *kip;
+ struct hlist_node *pos, *next;
+
+ /* Ensure no-one is preepmted on the garbages */
+ if (check_safety() != 0)
+ return -EAGAIN;
+
+ hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
+ int i;
+ kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+ if (kip->ngarbage == 0)
+ continue;
+ kip->ngarbage = 0; /* we will collect all garbages */
+ for (i = 0; i < INSNS_PER_PAGE; i++) {
+ if (kip->slot_used[i] == -1 &&
+ collect_one_slot(kip, i))
+ break;
+ }
+ }
+ kprobe_garbage_slots = 0;
+ return 0;
+}
+
+void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
{
struct kprobe_insn_page *kip;
struct hlist_node *pos;
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
if (kip->insns <= slot &&
slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
int i = (slot - kip->insns) / MAX_INSN_SIZE;
- kip->slot_used[i] = 0;
- kip->nused--;
- if (kip->nused == 0) {
- /*
- * Page is no longer in use. Free it unless
- * it's the last one. We keep the last one
- * so as not to have to set it up again the
- * next time somebody inserts a probe.
- */
- hlist_del(&kip->hlist);
- if (hlist_empty(&kprobe_insn_pages)) {
- INIT_HLIST_NODE(&kip->hlist);
- hlist_add_head(&kip->hlist,
- &kprobe_insn_pages);
- } else {
- module_free(NULL, kip->insns);
- kfree(kip);
- }
+ if (dirty) {
+ kip->slot_used[i] = -1;
+ kip->ngarbage++;
+ } else {
+ collect_one_slot(kip, i);
}
- return;
+ break;
}
}
+ if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
+ collect_garbage_slots();
+ }
}
#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4f9c60ef95e..1db8c72d0d3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -31,6 +31,8 @@ struct kthread_create_info
/* Result passed back to kthread_create() from keventd. */
struct task_struct *result;
struct completion done;
+
+ struct work_struct work;
};
struct kthread_stop_info
@@ -111,9 +113,10 @@ static int kthread(void *_create)
}
/* We are keventd: create a thread. */
-static void keventd_create_kthread(void *_create)
+static void keventd_create_kthread(struct work_struct *work)
{
- struct kthread_create_info *create = _create;
+ struct kthread_create_info *create =
+ container_of(work, struct kthread_create_info, work);
int pid;
/* We want our own signal handler (we take no signals by default). */
@@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
...)
{
struct kthread_create_info create;
- DECLARE_WORK(work, keventd_create_kthread, &create);
create.threadfn = threadfn;
create.data = data;
init_completion(&create.started);
init_completion(&create.done);
+ INIT_WORK(&create.work, keventd_create_kthread);
/*
* The workqueue needs to start up first:
*/
if (!helper_wq)
- work.func(work.data);
+ create.work.func(&create.work);
else {
- queue_work(helper_wq, &work);
+ queue_work(helper_wq, &create.work);
wait_for_completion(&create.done);
}
if (!IS_ERR(create.result)) {
diff --git a/kernel/latency.c b/kernel/latency.c
index 258f2555abb..e63fcacb61a 100644
--- a/kernel/latency.c
+++ b/kernel/latency.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/notifier.h>
+#include <linux/jiffies.h>
#include <asm/atomic.h>
struct latency_info {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b739be2a6dc..01e75055903 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,13 +43,49 @@
#include "lockdep_internals.h"
/*
- * hash_lock: protects the lockdep hashes and class/list/hash allocators.
+ * lockdep_lock: protects the lockdep graph, the hashes and the
+ * class/list/hash allocators.
*
* This is one of the rare exceptions where it's justified
* to use a raw spinlock - we really dont want the spinlock
- * code to recurse back into the lockdep code.
+ * code to recurse back into the lockdep code...
*/
-static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+static int graph_lock(void)
+{
+ __raw_spin_lock(&lockdep_lock);
+ /*
+ * Make sure that if another CPU detected a bug while
+ * walking the graph we dont change it (while the other
+ * CPU is busy printing out stuff with the graph lock
+ * dropped already)
+ */
+ if (!debug_locks) {
+ __raw_spin_unlock(&lockdep_lock);
+ return 0;
+ }
+ return 1;
+}
+
+static inline int graph_unlock(void)
+{
+ __raw_spin_unlock(&lockdep_lock);
+ return 0;
+}
+
+/*
+ * Turn lock debugging off and return with 0 if it was off already,
+ * and also release the graph lock:
+ */
+static inline int debug_locks_off_graph_unlock(void)
+{
+ int ret = debug_locks_off();
+
+ __raw_spin_unlock(&lockdep_lock);
+
+ return ret;
+}
static int lockdep_initialized;
@@ -57,14 +93,15 @@ unsigned long nr_list_entries;
static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
/*
- * Allocate a lockdep entry. (assumes hash_lock held, returns
+ * Allocate a lockdep entry. (assumes the graph_lock held, returns
* with NULL on failure)
*/
static struct lock_list *alloc_list_entry(void)
{
if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
- __raw_spin_unlock(&hash_lock);
- debug_locks_off();
+ if (!debug_locks_off_graph_unlock())
+ return NULL;
+
printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
printk("turning off the locking correctness validator.\n");
return NULL;
@@ -140,21 +177,12 @@ void lockdep_on(void)
EXPORT_SYMBOL(lockdep_on);
-int lockdep_internal(void)
-{
- return current->lockdep_recursion != 0;
-}
-
-EXPORT_SYMBOL(lockdep_internal);
-
/*
* Debugging switches:
*/
#define VERBOSE 0
-#ifdef VERBOSE
-# define VERY_VERBOSE 0
-#endif
+#define VERY_VERBOSE 0
#if VERBOSE
# define HARDIRQ_VERBOSE 1
@@ -179,8 +207,8 @@ static int class_filter(struct lock_class *class)
!strcmp(class->name, "&struct->lockfield"))
return 1;
#endif
- /* Allow everything else. 0 would be filter everything else */
- return 1;
+ /* Filter everything else. 1 would be to allow everything else */
+ return 0;
}
#endif
@@ -214,7 +242,7 @@ static int softirq_verbose(struct lock_class *class)
/*
* Stack-trace: tightly packed array of stack backtrace
- * addresses. Protected by the hash_lock.
+ * addresses. Protected by the graph_lock.
*/
unsigned long nr_stack_trace_entries;
static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
@@ -228,25 +256,20 @@ static int save_trace(struct stack_trace *trace)
trace->skip = 3;
trace->all_contexts = 0;
- /* Make sure to not recurse in case the the unwinder needs to tak
-e locks. */
- lockdep_off();
save_stack_trace(trace, NULL);
- lockdep_on();
trace->max_entries = trace->nr_entries;
nr_stack_trace_entries += trace->nr_entries;
- if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
- return 0;
if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
- __raw_spin_unlock(&hash_lock);
- if (debug_locks_off()) {
- printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
- printk("turning off the locking correctness validator.\n");
- dump_stack();
- }
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ dump_stack();
+
return 0;
}
@@ -357,7 +380,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
static void print_lock_name(struct lock_class *class)
{
- char str[128], c1, c2, c3, c4;
+ char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4;
const char *name;
get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -379,7 +402,7 @@ static void print_lock_name(struct lock_class *class)
static void print_lockdep_cache(struct lockdep_map *lock)
{
const char *name;
- char str[128];
+ char str[KSYM_NAME_LEN + 1];
name = lock->name;
if (!name)
@@ -449,7 +472,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
print_lock_class_header(class, depth);
list_for_each_entry(entry, &class->locks_after, entry) {
- DEBUG_LOCKS_WARN_ON(!entry->class);
+ if (DEBUG_LOCKS_WARN_ON(!entry->class))
+ return;
+
print_lock_dependencies(entry->class, depth + 1);
printk("%*s ... acquired at:\n",depth,"");
@@ -474,7 +499,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
return 0;
entry->class = this;
- save_trace(&entry->trace);
+ if (!save_trace(&entry->trace))
+ return 0;
/*
* Since we never remove from the dependency list, the list can
@@ -532,9 +558,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
{
struct task_struct *curr = current;
- __raw_spin_unlock(&hash_lock);
- debug_locks_off();
- if (debug_locks_silent)
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
printk("\n=======================================================\n");
@@ -563,7 +587,9 @@ static noinline int print_circular_bug_tail(void)
return 0;
this.class = check_source->class;
- save_trace(&this.trace);
+ if (!save_trace(&this.trace))
+ return 0;
+
print_circular_bug_entry(&this, 0);
printk("\nother info that might help us debug this:\n\n");
@@ -579,8 +605,10 @@ static noinline int print_circular_bug_tail(void)
static int noinline print_infinite_recursion_bug(void)
{
- __raw_spin_unlock(&hash_lock);
- DEBUG_LOCKS_WARN_ON(1);
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ WARN_ON(1);
return 0;
}
@@ -715,9 +743,7 @@ print_bad_irq_dependency(struct task_struct *curr,
enum lock_usage_bit bit2,
const char *irqclass)
{
- __raw_spin_unlock(&hash_lock);
- debug_locks_off();
- if (debug_locks_silent)
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
printk("\n======================================================\n");
@@ -798,9 +824,7 @@ static int
print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
{
- debug_locks_off();
- __raw_spin_unlock(&hash_lock);
- if (debug_locks_silent)
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
printk("\n=============================================\n");
@@ -966,27 +990,24 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
&prev->class->locks_after, next->acquire_ip);
if (!ret)
return 0;
- /*
- * Return value of 2 signals 'dependency already added',
- * in that case we dont have to add the backlink either.
- */
- if (ret == 2)
- return 2;
+
ret = add_lock_to_list(next->class, prev->class,
&next->class->locks_before, next->acquire_ip);
+ if (!ret)
+ return 0;
/*
* Debugging printouts:
*/
if (verbose(prev->class) || verbose(next->class)) {
- __raw_spin_unlock(&hash_lock);
+ graph_unlock();
printk("\n new dependency: ");
print_lock_name(prev->class);
printk(" => ");
print_lock_name(next->class);
printk("\n");
dump_stack();
- __raw_spin_lock(&hash_lock);
+ return graph_lock();
}
return 1;
}
@@ -1025,7 +1046,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
* added:
*/
if (hlock->read != 2) {
- check_prev_add(curr, hlock, next);
+ if (!check_prev_add(curr, hlock, next))
+ return 0;
/*
* Stop after the first non-trylock entry,
* as non-trylock entries have added their
@@ -1050,8 +1072,10 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
}
return 1;
out_bug:
- __raw_spin_unlock(&hash_lock);
- DEBUG_LOCKS_WARN_ON(1);
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ WARN_ON(1);
return 0;
}
@@ -1081,7 +1105,8 @@ static int static_obj(void *obj)
*/
for_each_possible_cpu(i) {
start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
- end = (unsigned long) &__per_cpu_end + per_cpu_offset(i);
+ end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
+ + per_cpu_offset(i);
if ((addr >= start) && (addr < end))
return 1;
@@ -1181,6 +1206,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
struct lockdep_subclass_key *key;
struct list_head *hash_head;
struct lock_class *class;
+ unsigned long flags;
class = look_up_lock_class(lock, subclass);
if (likely(class))
@@ -1202,7 +1228,11 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
key = lock->key->subkeys + subclass;
hash_head = classhashentry(key);
- __raw_spin_lock(&hash_lock);
+ raw_local_irq_save(flags);
+ if (!graph_lock()) {
+ raw_local_irq_restore(flags);
+ return NULL;
+ }
/*
* We have to do the hash-walk again, to avoid races
* with another CPU:
@@ -1215,8 +1245,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* the hash:
*/
if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
- __raw_spin_unlock(&hash_lock);
- debug_locks_off();
+ if (!debug_locks_off_graph_unlock()) {
+ raw_local_irq_restore(flags);
+ return NULL;
+ }
+ raw_local_irq_restore(flags);
+
printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
printk("turning off the locking correctness validator.\n");
return NULL;
@@ -1237,16 +1271,24 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
list_add_tail_rcu(&class->hash_entry, hash_head);
if (verbose(class)) {
- __raw_spin_unlock(&hash_lock);
+ graph_unlock();
+ raw_local_irq_restore(flags);
+
printk("\nnew class %p: %s", class->key, class->name);
if (class->name_version > 1)
printk("#%d", class->name_version);
printk("\n");
dump_stack();
- __raw_spin_lock(&hash_lock);
+
+ raw_local_irq_save(flags);
+ if (!graph_lock()) {
+ raw_local_irq_restore(flags);
+ return NULL;
+ }
}
out_unlock_set:
- __raw_spin_unlock(&hash_lock);
+ graph_unlock();
+ raw_local_irq_restore(flags);
if (!subclass || force)
lock->class_cache = class;
@@ -1261,7 +1303,7 @@ out_unlock_set:
* add it and return 0 - in this case the new dependency chain is
* validated. If the key is already hashed, return 1.
*/
-static inline int lookup_chain_cache(u64 chain_key)
+static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
{
struct list_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
@@ -1275,34 +1317,32 @@ static inline int lookup_chain_cache(u64 chain_key)
if (chain->chain_key == chain_key) {
cache_hit:
debug_atomic_inc(&chain_lookup_hits);
- /*
- * In the debugging case, force redundant checking
- * by returning 1:
- */
-#ifdef CONFIG_DEBUG_LOCKDEP
- __raw_spin_lock(&hash_lock);
- return 1;
-#endif
+ if (very_verbose(class))
+ printk("\nhash chain already cached, key: %016Lx tail class: [%p] %s\n", chain_key, class->key, class->name);
return 0;
}
}
+ if (very_verbose(class))
+ printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", chain_key, class->key, class->name);
/*
* Allocate a new chain entry from the static array, and add
* it to the hash:
*/
- __raw_spin_lock(&hash_lock);
+ if (!graph_lock())
+ return 0;
/*
* We have to walk the chain again locked - to avoid duplicates:
*/
list_for_each_entry(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
- __raw_spin_unlock(&hash_lock);
+ graph_unlock();
goto cache_hit;
}
}
if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
- __raw_spin_unlock(&hash_lock);
- debug_locks_off();
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
printk("turning off the locking correctness validator.\n");
return 0;
@@ -1378,9 +1418,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
struct held_lock *this, int forwards,
const char *irqclass)
{
- __raw_spin_unlock(&hash_lock);
- debug_locks_off();
- if (debug_locks_silent)
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
printk("\n=========================================================\n");
@@ -1450,7 +1488,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
}
-static inline void print_irqtrace_events(struct task_struct *curr)
+void print_irqtrace_events(struct task_struct *curr)
{
printk("irq event stamp: %u\n", curr->irq_events);
printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
@@ -1463,19 +1501,13 @@ static inline void print_irqtrace_events(struct task_struct *curr)
print_ip_sym(curr->softirq_disable_ip);
}
-#else
-static inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
#endif
static int
print_usage_bug(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
{
- __raw_spin_unlock(&hash_lock);
- debug_locks_off();
- if (debug_locks_silent)
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
printk("\n=================================\n");
@@ -1536,12 +1568,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
if (likely(this->class->usage_mask & new_mask))
return 1;
- __raw_spin_lock(&hash_lock);
+ if (!graph_lock())
+ return 0;
/*
* Make sure we didnt race:
*/
if (unlikely(this->class->usage_mask & new_mask)) {
- __raw_spin_unlock(&hash_lock);
+ graph_unlock();
return 1;
}
@@ -1727,15 +1760,16 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
debug_atomic_dec(&nr_unused_locks);
break;
default:
- debug_locks_off();
+ if (!debug_locks_off_graph_unlock())
+ return 0;
WARN_ON(1);
return 0;
}
- __raw_spin_unlock(&hash_lock);
+ graph_unlock();
/*
- * We must printk outside of the hash_lock:
+ * We must printk outside of the graph_lock:
*/
if (ret == 2) {
printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
@@ -2133,9 +2167,9 @@ out_calc_hash:
* We look up the chain_key and do the O(N^2) check and update of
* the dependencies only if this is a new dependency chain.
* (If lookup_chain_cache() returns with 1 it acquires
- * hash_lock for us)
+ * graph_lock for us)
*/
- if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) {
+ if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) {
/*
* Check whether last held lock:
*
@@ -2166,7 +2200,7 @@ out_calc_hash:
if (!chain_head && ret != 2)
if (!check_prevs_add(curr, hlock))
return 0;
- __raw_spin_unlock(&hash_lock);
+ graph_unlock();
}
curr->lockdep_depth++;
check_chain_key(curr);
@@ -2429,6 +2463,7 @@ EXPORT_SYMBOL_GPL(lock_release);
void lockdep_reset(void)
{
unsigned long flags;
+ int i;
raw_local_irq_save(flags);
current->curr_chain_key = 0;
@@ -2439,6 +2474,8 @@ void lockdep_reset(void)
nr_softirq_chains = 0;
nr_process_chains = 0;
debug_locks = 1;
+ for (i = 0; i < CHAINHASH_SIZE; i++)
+ INIT_LIST_HEAD(chainhash_table + i);
raw_local_irq_restore(flags);
}
@@ -2475,7 +2512,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
int i;
raw_local_irq_save(flags);
- __raw_spin_lock(&hash_lock);
+ graph_lock();
/*
* Unhash all classes that were created by this module:
@@ -2489,7 +2526,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
zap_class(class);
}
- __raw_spin_unlock(&hash_lock);
+ graph_unlock();
raw_local_irq_restore(flags);
}
@@ -2517,20 +2554,20 @@ void lockdep_reset_lock(struct lockdep_map *lock)
* Debug check: in the end all mapped classes should
* be gone.
*/
- __raw_spin_lock(&hash_lock);
+ graph_lock();
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
if (list_empty(head))
continue;
list_for_each_entry_safe(class, next, head, hash_entry) {
if (unlikely(class == lock->class_cache)) {
- __raw_spin_unlock(&hash_lock);
- DEBUG_LOCKS_WARN_ON(1);
+ if (debug_locks_off_graph_unlock())
+ WARN_ON(1);
goto out_restore;
}
}
}
- __raw_spin_unlock(&hash_lock);
+ graph_unlock();
out_restore:
raw_local_irq_restore(flags);
@@ -2644,6 +2681,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
}
local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
static void print_held_locks_bug(struct task_struct *curr)
{
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index eab043c83bb..8ce09bc4613 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -20,7 +20,7 @@
#define MAX_LOCKDEP_KEYS_BITS 11
#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
-#define MAX_LOCKDEP_CHAINS_BITS 13
+#define MAX_LOCKDEP_CHAINS_BITS 14
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
/*
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index f6e72eaab3f..b554b40a4aa 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v)
return 0;
}
-static struct seq_operations lockdep_ops = {
+static const struct seq_operations lockdep_ops = {
.start = l_start,
.next = l_next,
.stop = l_stop,
@@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file)
return res;
}
-static struct file_operations proc_lockdep_operations = {
+static const struct file_operations proc_lockdep_operations = {
.open = lockdep_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file)
return single_open(file, lockdep_stats_show, NULL);
}
-static struct file_operations proc_lockdep_stats_operations = {
+static const struct file_operations proc_lockdep_stats_operations = {
.open = lockdep_stats_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/kernel/module.c b/kernel/module.c
index 5072a943fe3..b565eaeff7e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -34,10 +34,10 @@
#include <linux/err.h>
#include <linux/vermagic.h>
#include <linux/notifier.h>
+#include <linux/sched.h>
#include <linux/stop_machine.h>
#include <linux/device.h>
#include <linux/string.h>
-#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/unwind.h>
#include <asm/uaccess.h>
@@ -790,6 +790,19 @@ static struct module_attribute refcnt = {
.show = show_refcnt,
};
+void module_put(struct module *module)
+{
+ if (module) {
+ unsigned int cpu = get_cpu();
+ local_dec(&module->ref[cpu].count);
+ /* Maybe they're waiting for us to drop reference? */
+ if (unlikely(!module_is_live(module)))
+ wake_up_process(module->waiter);
+ put_cpu();
+ }
+}
+EXPORT_SYMBOL(module_put);
+
#else /* !CONFIG_MODULE_UNLOAD */
static void print_unload_info(struct seq_file *m, struct module *mod)
{
@@ -811,9 +824,34 @@ static inline void module_unload_init(struct module *mod)
}
#endif /* CONFIG_MODULE_UNLOAD */
+static ssize_t show_initstate(struct module_attribute *mattr,
+ struct module *mod, char *buffer)
+{
+ const char *state = "unknown";
+
+ switch (mod->state) {
+ case MODULE_STATE_LIVE:
+ state = "live";
+ break;
+ case MODULE_STATE_COMING:
+ state = "coming";
+ break;
+ case MODULE_STATE_GOING:
+ state = "going";
+ break;
+ }
+ return sprintf(buffer, "%s\n", state);
+}
+
+static struct module_attribute initstate = {
+ .attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE },
+ .show = show_initstate,
+};
+
static struct module_attribute *modinfo_attrs[] = {
&modinfo_version,
&modinfo_srcversion,
+ &initstate,
#ifdef CONFIG_MODULE_UNLOAD
&refcnt,
#endif
@@ -1086,22 +1124,35 @@ static int mod_sysfs_setup(struct module *mod,
goto out;
kobj_set_kset_s(&mod->mkobj, module_subsys);
mod->mkobj.mod = mod;
- err = kobject_register(&mod->mkobj.kobj);
+
+ /* delay uevent until full sysfs population */
+ kobject_init(&mod->mkobj.kobj);
+ err = kobject_add(&mod->mkobj.kobj);
if (err)
goto out;
+ mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers");
+ if (!mod->drivers_dir)
+ goto out_unreg;
+
err = module_param_sysfs_setup(mod, kparam, num_params);
if (err)
- goto out_unreg;
+ goto out_unreg_drivers;
err = module_add_modinfo_attrs(mod);
if (err)
- goto out_unreg;
+ goto out_unreg_param;
+ kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
return 0;
+out_unreg_drivers:
+ kobject_unregister(mod->drivers_dir);
+out_unreg_param:
+ module_param_sysfs_remove(mod);
out_unreg:
- kobject_unregister(&mod->mkobj.kobj);
+ kobject_del(&mod->mkobj.kobj);
+ kobject_put(&mod->mkobj.kobj);
out:
return err;
}
@@ -1110,6 +1161,7 @@ static void mod_kobject_remove(struct module *mod)
{
module_remove_modinfo_attrs(mod);
module_param_sysfs_remove(mod);
+ kobject_unregister(mod->drivers_dir);
kobject_unregister(&mod->mkobj.kobj);
}
@@ -1718,7 +1770,7 @@ static struct module *load_module(void __user *umod,
set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint(TAINT_PROPRIETARY_MODULE);
if (strcmp(mod->name, "driverloader") == 0)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
@@ -2182,7 +2234,7 @@ static int m_show(struct seq_file *m, void *p)
Where refcount is a number or -, and deps is a comma-separated list
of depends or -.
*/
-struct seq_operations modules_op = {
+const struct seq_operations modules_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
@@ -2275,11 +2327,14 @@ void print_modules(void)
void module_add_driver(struct module *mod, struct device_driver *drv)
{
+ int no_warn;
+
if (!mod || !drv)
return;
- /* Don't check return code; this call is idempotent */
- sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
+ /* Don't check return codes; these calls are idempotent */
+ no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
+ no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, drv->name);
}
EXPORT_SYMBOL(module_add_driver);
@@ -2288,6 +2343,8 @@ void module_remove_driver(struct device_driver *drv)
if (!drv)
return;
sysfs_remove_link(&drv->kobj, "module");
+ if (drv->owner && drv->owner->drivers_dir)
+ sysfs_remove_link(drv->owner->drivers_dir, drv->name);
}
EXPORT_SYMBOL(module_remove_driver);
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 18651641a7b..841539d72c5 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
void debug_mutex_unlock(struct mutex *lock)
{
+ if (unlikely(!debug_locks))
+ return;
+
DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 8c71cf72a49..e7cbbb82765 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
}
EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+int __sched
+mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass);
+}
+
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
#endif
/*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 674aceb7335..f5b9ee6f6bb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -17,8 +17,9 @@
#include <linux/version.h>
#include <linux/nsproxy.h>
#include <linux/init_task.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
@@ -60,12 +61,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
struct nsproxy *ns = clone_namespaces(orig);
if (ns) {
- if (ns->namespace)
- get_namespace(ns->namespace);
+ if (ns->mnt_ns)
+ get_mnt_ns(ns->mnt_ns);
if (ns->uts_ns)
get_uts_ns(ns->uts_ns);
if (ns->ipc_ns)
get_ipc_ns(ns->ipc_ns);
+ if (ns->pid_ns)
+ get_pid_ns(ns->pid_ns);
}
return ns;
@@ -97,7 +100,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
tsk->nsproxy = new_ns;
- err = copy_namespace(flags, tsk);
+ err = copy_mnt_ns(flags, tsk);
if (err)
goto out_ns;
@@ -109,16 +112,23 @@ int copy_namespaces(int flags, struct task_struct *tsk)
if (err)
goto out_ipc;
+ err = copy_pid_ns(flags, tsk);
+ if (err)
+ goto out_pid;
+
out:
put_nsproxy(old_ns);
return err;
+out_pid:
+ if (new_ns->ipc_ns)
+ put_ipc_ns(new_ns->ipc_ns);
out_ipc:
if (new_ns->uts_ns)
put_uts_ns(new_ns->uts_ns);
out_uts:
- if (new_ns->namespace)
- put_namespace(new_ns->namespace);
+ if (new_ns->mnt_ns)
+ put_mnt_ns(new_ns->mnt_ns);
out_ns:
tsk->nsproxy = old_ns;
kfree(new_ns);
@@ -127,11 +137,13 @@ out_ns:
void free_nsproxy(struct nsproxy *ns)
{
- if (ns->namespace)
- put_namespace(ns->namespace);
- if (ns->uts_ns)
- put_uts_ns(ns->uts_ns);
- if (ns->ipc_ns)
- put_ipc_ns(ns->ipc_ns);
- kfree(ns);
+ if (ns->mnt_ns)
+ put_mnt_ns(ns->mnt_ns);
+ if (ns->uts_ns)
+ put_uts_ns(ns->uts_ns);
+ if (ns->ipc_ns)
+ put_ipc_ns(ns->ipc_ns);
+ if (ns->pid_ns)
+ put_pid_ns(ns->pid_ns);
+ kfree(ns);
}
diff --git a/kernel/pid.c b/kernel/pid.c
index b914392085f..2efe9d8d367 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -26,12 +26,12 @@
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/hash.h>
-#include <linux/pspace.h>
+#include <linux/pid_namespace.h>
#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
static struct hlist_head *pid_hash;
static int pidhash_shift;
-static kmem_cache_t *pid_cachep;
+static struct kmem_cache *pid_cachep;
int pid_max = PID_MAX_DEFAULT;
@@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT;
#define BITS_PER_PAGE (PAGE_SIZE*8)
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
-static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
+static inline int mk_pid(struct pid_namespace *pid_ns,
+ struct pidmap *map, int off)
{
- return (map - pspace->pidmap)*BITS_PER_PAGE + off;
+ return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
}
#define find_next_offset(map, off) \
@@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
* value does not cause lots of bitmaps to be allocated, but
* the scheme scales to up to 4 million PIDs, runtime.
*/
-struct pspace init_pspace = {
+struct pid_namespace init_pid_ns = {
+ .kref = {
+ .refcount = ATOMIC_INIT(2),
+ },
.pidmap = {
[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
},
- .last_pid = 0
+ .last_pid = 0,
+ .child_reaper = &init_task
};
/*
@@ -80,25 +85,25 @@ struct pspace init_pspace = {
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-static fastcall void free_pidmap(struct pspace *pspace, int pid)
+static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid)
{
- struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE;
+ struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
int offset = pid & BITS_PER_PAGE_MASK;
clear_bit(offset, map->page);
atomic_inc(&map->nr_free);
}
-static int alloc_pidmap(struct pspace *pspace)
+static int alloc_pidmap(struct pid_namespace *pid_ns)
{
- int i, offset, max_scan, pid, last = pspace->last_pid;
+ int i, offset, max_scan, pid, last = pid_ns->last_pid;
struct pidmap *map;
pid = last + 1;
if (pid >= pid_max)
pid = RESERVED_PIDS;
offset = pid & BITS_PER_PAGE_MASK;
- map = &pspace->pidmap[pid/BITS_PER_PAGE];
+ map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
for (i = 0; i <= max_scan; ++i) {
if (unlikely(!map->page)) {
@@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace)
do {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
- pspace->last_pid = pid;
+ pid_ns->last_pid = pid;
return pid;
}
offset = find_next_offset(map, offset);
- pid = mk_pid(pspace, map, offset);
+ pid = mk_pid(pid_ns, map, offset);
/*
* find_next_offset() found a bit, the pid from it
* is in-bounds, and if we fell back to the last
@@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace)
(i != max_scan || pid < last ||
!((last+1) & BITS_PER_PAGE_MASK)));
}
- if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
+ if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
++map;
offset = 0;
} else {
- map = &pspace->pidmap[0];
+ map = &pid_ns->pidmap[0];
offset = RESERVED_PIDS;
if (unlikely(last == offset))
break;
}
- pid = mk_pid(pspace, map, offset);
+ pid = mk_pid(pid_ns, map, offset);
}
return -1;
}
-static int next_pidmap(struct pspace *pspace, int last)
+static int next_pidmap(struct pid_namespace *pid_ns, int last)
{
int offset;
struct pidmap *map, *end;
offset = (last + 1) & BITS_PER_PAGE_MASK;
- map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE];
- end = &pspace->pidmap[PIDMAP_ENTRIES];
+ map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
+ end = &pid_ns->pidmap[PIDMAP_ENTRIES];
for (; map < end; map++, offset = 0) {
if (unlikely(!map->page))
continue;
offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
if (offset < BITS_PER_PAGE)
- return mk_pid(pspace, map, offset);
+ return mk_pid(pid_ns, map, offset);
}
return -1;
}
@@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid)
hlist_del_rcu(&pid->pid_chain);
spin_unlock_irqrestore(&pidmap_lock, flags);
- free_pidmap(&init_pspace, pid->nr);
+ free_pidmap(current->nsproxy->pid_ns, pid->nr);
call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -206,7 +211,7 @@ struct pid *alloc_pid(void)
if (!pid)
goto out;
- nr = alloc_pidmap(&init_pspace);
+ nr = alloc_pidmap(current->nsproxy->pid_ns);
if (nr < 0)
goto out_free;
@@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr)
pid = find_pid(nr);
if (pid)
break;
- nr = next_pidmap(&init_pspace, nr);
+ nr = next_pidmap(current->nsproxy->pid_ns, nr);
} while (nr > 0);
return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);
+int copy_pid_ns(int flags, struct task_struct *tsk)
+{
+ struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
+ int err = 0;
+
+ if (!old_ns)
+ return 0;
+
+ get_pid_ns(old_ns);
+ return err;
+}
+
+void free_pid_ns(struct kref *kref)
+{
+ struct pid_namespace *ns;
+
+ ns = container_of(kref, struct pid_namespace, kref);
+ kfree(ns);
+}
+
/*
* The pid hash table is scaled according to the amount of memory in the
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -382,10 +407,10 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
- init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
/* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, init_pspace.pidmap[0].page);
- atomic_dec(&init_pspace.pidmap[0].nr_free);
+ set_bit(0, init_pid_ns.pidmap[0].page);
+ atomic_dec(&init_pid_ns.pidmap[0].nr_free);
pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
__alignof__(struct pid),
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9cbb5d1be06..5fe87de10ff 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -70,7 +70,7 @@
/*
* Lets keep our timers in a slab cache :-)
*/
-static kmem_cache_t *posix_timers_cache;
+static struct kmem_cache *posix_timers_cache;
static struct idr posix_timers_id;
static DEFINE_SPINLOCK(idr_lock);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 825068ca347..ed296225dcd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -20,13 +20,14 @@ config PM
sending the processor to sleep and saving power.
config PM_LEGACY
- bool "Legacy Power Management API"
+ bool "Legacy Power Management API (DEPRECATED)"
depends on PM
- default y
+ default n
---help---
- Support for pm_register() and friends.
+ Support for pm_register() and friends. This old API is obsoleted
+ by the driver model.
- If unsure, say Y.
+ If unsure, say N.
config PM_DEBUG
bool "Power Management Debug Support"
@@ -78,7 +79,7 @@ config PM_SYSFS_DEPRECATED
config SOFTWARE_SUSPEND
bool "Software Suspend"
- depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP))
+ depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
---help---
Enable the possibility of suspending the machine.
It doesn't need ACPI or APM.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index ae6bbc903b7..88fc5d7ac73 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -20,6 +20,7 @@
#include <linux/pm.h>
#include <linux/console.h>
#include <linux/cpu.h>
+#include <linux/freezer.h>
#include "power.h"
@@ -27,6 +28,23 @@
static int noresume = 0;
char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
+sector_t swsusp_resume_block;
+
+/**
+ * platform_prepare - prepare the machine for hibernation using the
+ * platform driver if so configured and return an error code if it fails
+ */
+
+static inline int platform_prepare(void)
+{
+ int error = 0;
+
+ if (pm_disk_mode == PM_DISK_PLATFORM) {
+ if (pm_ops && pm_ops->prepare)
+ error = pm_ops->prepare(PM_SUSPEND_DISK);
+ }
+ return error;
+}
/**
* power_down - Shut machine down for hibernate.
@@ -40,13 +58,11 @@ dev_t swsusp_resume_device;
static void power_down(suspend_disk_method_t mode)
{
- int error = 0;
-
switch(mode) {
case PM_DISK_PLATFORM:
if (pm_ops && pm_ops->enter) {
kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
- error = pm_ops->enter(PM_SUSPEND_DISK);
+ pm_ops->enter(PM_SUSPEND_DISK);
break;
}
case PM_DISK_SHUTDOWN:
@@ -73,7 +89,7 @@ static inline void platform_finish(void)
static int prepare_processes(void)
{
- int error;
+ int error = 0;
pm_prepare_console();
@@ -86,12 +102,24 @@ static int prepare_processes(void)
goto thaw;
}
+ if (pm_disk_mode == PM_DISK_TESTPROC) {
+ printk("swsusp debug: Waiting for 5 seconds.\n");
+ mdelay(5000);
+ goto thaw;
+ }
+
+ error = platform_prepare();
+ if (error)
+ goto thaw;
+
/* Free memory before shutting down devices. */
if (!(error = swsusp_shrink_memory()))
return 0;
-thaw:
+
+ platform_finish();
+ thaw:
thaw_processes();
-enable_cpus:
+ enable_cpus:
enable_nonboot_cpus();
pm_restore_console();
return error;
@@ -122,13 +150,21 @@ int pm_suspend_disk(void)
if (error)
return error;
+ if (pm_disk_mode == PM_DISK_TESTPROC)
+ return 0;
+
suspend_console();
error = device_suspend(PMSG_FREEZE);
if (error) {
resume_console();
printk("Some devices failed to suspend\n");
- unprepare_processes();
- return error;
+ goto Thaw;
+ }
+
+ if (pm_disk_mode == PM_DISK_TEST) {
+ printk("swsusp debug: Waiting for 5 seconds.\n");
+ mdelay(5000);
+ goto Done;
}
pr_debug("PM: snapshotting memory.\n");
@@ -145,16 +181,17 @@ int pm_suspend_disk(void)
power_down(pm_disk_mode);
else {
swsusp_free();
- unprepare_processes();
- return error;
+ goto Thaw;
}
- } else
+ } else {
pr_debug("PM: Image restored successfully.\n");
+ }
swsusp_free();
Done:
device_resume();
resume_console();
+ Thaw:
unprepare_processes();
return error;
}
@@ -176,10 +213,10 @@ static int software_resume(void)
{
int error;
- down(&pm_sem);
+ mutex_lock(&pm_mutex);
if (!swsusp_resume_device) {
if (!strlen(resume_file)) {
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
return -ENOENT;
}
swsusp_resume_device = name_to_dev_t(resume_file);
@@ -194,7 +231,7 @@ static int software_resume(void)
* FIXME: If noresume is specified, we need to find the partition
* and reset it back to normal swap space.
*/
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
return 0;
}
@@ -238,7 +275,7 @@ static int software_resume(void)
unprepare_processes();
Done:
/* For success case, the suspend path will release the lock */
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
pr_debug("PM: Resume from disk failed.\n");
return 0;
}
@@ -251,6 +288,8 @@ static const char * const pm_disk_modes[] = {
[PM_DISK_PLATFORM] = "platform",
[PM_DISK_SHUTDOWN] = "shutdown",
[PM_DISK_REBOOT] = "reboot",
+ [PM_DISK_TEST] = "test",
+ [PM_DISK_TESTPROC] = "testproc",
};
/**
@@ -297,7 +336,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- down(&pm_sem);
+ mutex_lock(&pm_mutex);
for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
if (!strncmp(buf, pm_disk_modes[i], len)) {
mode = i;
@@ -305,21 +344,23 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
}
}
if (mode) {
- if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT)
+ if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT ||
+ mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) {
pm_disk_mode = mode;
- else {
+ } else {
if (pm_ops && pm_ops->enter &&
(mode == pm_ops->pm_disk_mode))
pm_disk_mode = mode;
else
error = -EINVAL;
}
- } else
+ } else {
error = -EINVAL;
+ }
pr_debug("PM: suspend-to-disk mode set to '%s'\n",
pm_disk_modes[mode]);
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
return error ? error : n;
}
@@ -344,14 +385,14 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
if (maj != MAJOR(res) || min != MINOR(res))
goto out;
- down(&pm_sem);
+ mutex_lock(&pm_mutex);
swsusp_resume_device = res;
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
printk("Attempting manual resume\n");
noresume = 0;
software_resume();
ret = n;
-out:
+ out:
return ret;
}
@@ -406,6 +447,19 @@ static int __init resume_setup(char *str)
return 1;
}
+static int __init resume_offset_setup(char *str)
+{
+ unsigned long long offset;
+
+ if (noresume)
+ return 1;
+
+ if (sscanf(str, "%llu", &offset) == 1)
+ swsusp_resume_block = offset;
+
+ return 1;
+}
+
static int __init noresume_setup(char *str)
{
noresume = 1;
@@ -413,4 +467,5 @@ static int __init noresume_setup(char *str)
}
__setup("noresume", noresume_setup);
+__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1210961a5aa..ff3a6182f5f 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
*
*/
+#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/kobject.h>
#include <linux/string.h>
@@ -18,13 +19,14 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/resume-trace.h>
+#include <linux/freezer.h>
#include "power.h"
/*This is just an arbitrary number */
#define FREE_PAGE_NUMBER (100)
-DECLARE_MUTEX(pm_sem);
+DEFINE_MUTEX(pm_mutex);
struct pm_ops *pm_ops;
suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM;
@@ -36,9 +38,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM;
void pm_set_ops(struct pm_ops * ops)
{
- down(&pm_sem);
+ mutex_lock(&pm_mutex);
pm_ops = ops;
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
}
@@ -182,7 +184,7 @@ static int enter_state(suspend_state_t state)
if (!valid_state(state))
return -ENODEV;
- if (down_trylock(&pm_sem))
+ if (!mutex_trylock(&pm_mutex))
return -EBUSY;
if (state == PM_SUSPEND_DISK) {
@@ -200,7 +202,7 @@ static int enter_state(suspend_state_t state)
pr_debug("PM: Finishing wakeup.\n");
suspend_finish(state);
Unlock:
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
return error;
}
@@ -229,7 +231,7 @@ int pm_suspend(suspend_state_t state)
return -EINVAL;
}
-
+EXPORT_SYMBOL(pm_suspend);
decl_subsys(power,NULL,NULL);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index bfe999f7b27..eb461b816bf 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void)
return -EPERM;
}
#endif
-extern struct semaphore pm_sem;
+
+extern struct mutex pm_mutex;
+
#define power_attr(_name) \
static struct subsys_attribute _name##_attr = { \
.attr = { \
@@ -42,6 +44,7 @@ extern const void __nosave_begin, __nosave_end;
extern unsigned long image_size;
extern int in_suspend;
extern dev_t swsusp_resume_device;
+extern sector_t swsusp_resume_block;
extern asmlinkage int swsusp_arch_suspend(void);
extern asmlinkage int swsusp_arch_resume(void);
@@ -102,8 +105,18 @@ struct snapshot_handle {
extern unsigned int snapshot_additional_pages(struct zone *zone);
extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
+extern void snapshot_write_finalize(struct snapshot_handle *handle);
extern int snapshot_image_loaded(struct snapshot_handle *handle);
-extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
+
+/*
+ * This structure is used to pass the values needed for the identification
+ * of the resume swap area from a user space to the kernel via the
+ * SNAPSHOT_SET_SWAP_AREA ioctl
+ */
+struct resume_swap_area {
+ loff_t offset;
+ u_int32_t dev;
+} __attribute__((packed));
#define SNAPSHOT_IOC_MAGIC '3'
#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
@@ -117,7 +130,14 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
-#define SNAPSHOT_IOC_MAXNR 11
+#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
+#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \
+ struct resume_swap_area)
+#define SNAPSHOT_IOC_MAXNR 13
+
+#define PMOPS_PREPARE 1
+#define PMOPS_ENTER 2
+#define PMOPS_FINISH 3
/**
* The bitmap is used for tracing allocated swap pages
@@ -141,7 +161,7 @@ struct bitmap_page {
extern void free_bitmap(struct bitmap_page *bitmap);
extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
-extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
+extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
extern int swsusp_check(void);
@@ -153,3 +173,7 @@ extern int swsusp_read(void);
extern int swsusp_write(void);
extern void swsusp_close(void);
extern int suspend_enter(suspend_state_t state);
+
+struct timeval;
+extern void swsusp_show_speed(struct timeval *, struct timeval *,
+ unsigned int, char *);
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index f1f900ac316..678ec736076 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -16,12 +16,12 @@
* callback we use.
*/
-static void do_poweroff(void *dummy)
+static void do_poweroff(struct work_struct *dummy)
{
kernel_power_off();
}
-static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
+static DECLARE_WORK(poweroff_work, do_poweroff);
static void handle_poweroff(int key, struct tty_struct *tty)
{
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 72e72d2c61e..6d566bf7085 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -13,20 +13,22 @@
#include <linux/suspend.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/freezer.h>
/*
* Timeout for stopping processes
*/
#define TIMEOUT (20 * HZ)
+#define FREEZER_KERNEL_THREADS 0
+#define FREEZER_USER_SPACE 1
static inline int freezeable(struct task_struct * p)
{
if ((p == current) ||
(p->flags & PF_NOFREEZE) ||
(p->exit_state == EXIT_ZOMBIE) ||
- (p->exit_state == EXIT_DEAD) ||
- (p->state == TASK_STOPPED))
+ (p->exit_state == EXIT_DEAD))
return 0;
return 1;
}
@@ -39,7 +41,6 @@ void refrigerator(void)
long save;
save = current->state;
pr_debug("%s entered refrigerator\n", current->comm);
- printk("=");
frozen_process(current);
spin_lock_irq(&current->sighand->siglock);
@@ -59,10 +60,16 @@ static inline void freeze_process(struct task_struct *p)
unsigned long flags;
if (!freezing(p)) {
- freeze(p);
- spin_lock_irqsave(&p->sighand->siglock, flags);
- signal_wake_up(p, 0);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ rmb();
+ if (!frozen(p)) {
+ if (p->state == TASK_STOPPED)
+ force_sig_specific(SIGSTOP, p);
+
+ freeze(p);
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ signal_wake_up(p, p->state == TASK_STOPPED);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ }
}
}
@@ -79,96 +86,134 @@ static void cancel_freezing(struct task_struct *p)
}
}
-/* 0 = success, else # of processes that we failed to stop */
-int freeze_processes(void)
+static inline int is_user_space(struct task_struct *p)
+{
+ return p->mm && !(p->flags & PF_BORROWED_MM);
+}
+
+static unsigned int try_to_freeze_tasks(int freeze_user_space)
{
- int todo, nr_user, user_frozen;
- unsigned long start_time;
struct task_struct *g, *p;
+ unsigned long end_time;
+ unsigned int todo;
- printk( "Stopping tasks: " );
- start_time = jiffies;
- user_frozen = 0;
+ end_time = jiffies + TIMEOUT;
do {
- nr_user = todo = 0;
+ todo = 0;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
if (!freezeable(p))
continue;
+
if (frozen(p))
continue;
+
if (p->state == TASK_TRACED && frozen(p->parent)) {
cancel_freezing(p);
continue;
}
- if (p->mm && !(p->flags & PF_BORROWED_MM)) {
- /* The task is a user-space one.
- * Freeze it unless there's a vfork completion
- * pending
+ if (is_user_space(p)) {
+ if (!freeze_user_space)
+ continue;
+
+ /* Freeze the task unless there is a vfork
+ * completion pending
*/
if (!p->vfork_done)
freeze_process(p);
- nr_user++;
} else {
- /* Freeze only if the user space is frozen */
- if (user_frozen)
- freeze_process(p);
- todo++;
+ if (freeze_user_space)
+ continue;
+
+ freeze_process(p);
}
+ todo++;
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
- todo += nr_user;
- if (!user_frozen && !nr_user) {
- sys_sync();
- start_time = jiffies;
- }
- user_frozen = !nr_user;
yield(); /* Yield is okay here */
- if (todo && time_after(jiffies, start_time + TIMEOUT))
+ if (todo && time_after(jiffies, end_time))
break;
- } while(todo);
+ } while (todo);
- /* This does not unfreeze processes that are already frozen
- * (we have slightly ugly calling convention in that respect,
- * and caller must call thaw_processes() if something fails),
- * but it cleans up leftover PF_FREEZE requests.
- */
if (todo) {
- printk( "\n" );
- printk(KERN_ERR " stopping tasks timed out "
- "after %d seconds (%d tasks remaining):\n",
- TIMEOUT / HZ, todo);
+ /* This does not unfreeze processes that are already frozen
+ * (we have slightly ugly calling convention in that respect,
+ * and caller must call thaw_processes() if something fails),
+ * but it cleans up leftover PF_FREEZE requests.
+ */
+ printk("\n");
+ printk(KERN_ERR "Stopping %s timed out after %d seconds "
+ "(%d tasks refusing to freeze):\n",
+ freeze_user_space ? "user space processes" :
+ "kernel threads",
+ TIMEOUT / HZ, todo);
read_lock(&tasklist_lock);
do_each_thread(g, p) {
+ if (is_user_space(p) == !freeze_user_space)
+ continue;
+
if (freezeable(p) && !frozen(p))
- printk(KERN_ERR " %s\n", p->comm);
+ printk(KERN_ERR " %s\n", p->comm);
+
cancel_freezing(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
- return todo;
}
- printk( "|\n" );
+ return todo;
+}
+
+/**
+ * freeze_processes - tell processes to enter the refrigerator
+ *
+ * Returns 0 on success, or the number of processes that didn't freeze,
+ * although they were told to.
+ */
+int freeze_processes(void)
+{
+ unsigned int nr_unfrozen;
+
+ printk("Stopping tasks ... ");
+ nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE);
+ if (nr_unfrozen)
+ return nr_unfrozen;
+
+ sys_sync();
+ nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
+ if (nr_unfrozen)
+ return nr_unfrozen;
+
+ printk("done.\n");
BUG_ON(in_atomic());
return 0;
}
-void thaw_processes(void)
+static void thaw_tasks(int thaw_user_space)
{
struct task_struct *g, *p;
- printk( "Restarting tasks..." );
read_lock(&tasklist_lock);
do_each_thread(g, p) {
if (!freezeable(p))
continue;
+
+ if (is_user_space(p) == !thaw_user_space)
+ continue;
+
if (!thaw_process(p))
- printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
+ printk(KERN_WARNING " Strange, %s not stopped\n",
+ p->comm );
} while_each_thread(g, p);
-
read_unlock(&tasklist_lock);
+}
+
+void thaw_processes(void)
+{
+ printk("Restarting tasks ... ");
+ thaw_tasks(FREEZER_KERNEL_THREADS);
+ thaw_tasks(FREEZER_USER_SPACE);
schedule();
- printk( " done\n" );
+ printk("done.\n");
}
EXPORT_SYMBOL(refrigerator);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 99f9b7d177d..c024606221c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,15 +1,15 @@
/*
* linux/kernel/power/snapshot.c
*
- * This file provide system snapshot/restore functionality.
+ * This file provides system snapshot/restore functionality for swsusp.
*
* Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
*
- * This file is released under the GPLv2, and is based on swsusp.c.
+ * This file is released under the GPLv2.
*
*/
-
#include <linux/version.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -34,137 +34,24 @@
#include "power.h"
-/* List of PBEs used for creating and restoring the suspend image */
+/* List of PBEs needed for restoring the pages that were allocated before
+ * the suspend and included in the suspend image, but have also been
+ * allocated by the "resume" kernel, so their contents cannot be written
+ * directly to their "original" page frames.
+ */
struct pbe *restore_pblist;
-static unsigned int nr_copy_pages;
-static unsigned int nr_meta_pages;
+/* Pointer to an auxiliary buffer (1 page) */
static void *buffer;
-#ifdef CONFIG_HIGHMEM
-unsigned int count_highmem_pages(void)
-{
- struct zone *zone;
- unsigned long zone_pfn;
- unsigned int n = 0;
-
- for_each_zone (zone)
- if (is_highmem(zone)) {
- mark_free_pages(zone);
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
- struct page *page;
- unsigned long pfn = zone_pfn + zone->zone_start_pfn;
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- if (PageReserved(page))
- continue;
- if (PageNosaveFree(page))
- continue;
- n++;
- }
- }
- return n;
-}
-
-struct highmem_page {
- char *data;
- struct page *page;
- struct highmem_page *next;
-};
-
-static struct highmem_page *highmem_copy;
-
-static int save_highmem_zone(struct zone *zone)
-{
- unsigned long zone_pfn;
- mark_free_pages(zone);
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
- struct page *page;
- struct highmem_page *save;
- void *kaddr;
- unsigned long pfn = zone_pfn + zone->zone_start_pfn;
-
- if (!(pfn%10000))
- printk(".");
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- /*
- * This condition results from rvmalloc() sans vmalloc_32()
- * and architectural memory reservations. This should be
- * corrected eventually when the cases giving rise to this
- * are better understood.
- */
- if (PageReserved(page))
- continue;
- BUG_ON(PageNosave(page));
- if (PageNosaveFree(page))
- continue;
- save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
- if (!save)
- return -ENOMEM;
- save->next = highmem_copy;
- save->page = page;
- save->data = (void *) get_zeroed_page(GFP_ATOMIC);
- if (!save->data) {
- kfree(save);
- return -ENOMEM;
- }
- kaddr = kmap_atomic(page, KM_USER0);
- memcpy(save->data, kaddr, PAGE_SIZE);
- kunmap_atomic(kaddr, KM_USER0);
- highmem_copy = save;
- }
- return 0;
-}
-
-int save_highmem(void)
-{
- struct zone *zone;
- int res = 0;
-
- pr_debug("swsusp: Saving Highmem");
- drain_local_pages();
- for_each_zone (zone) {
- if (is_highmem(zone))
- res = save_highmem_zone(zone);
- if (res)
- return res;
- }
- printk("\n");
- return 0;
-}
-
-int restore_highmem(void)
-{
- printk("swsusp: Restoring Highmem\n");
- while (highmem_copy) {
- struct highmem_page *save = highmem_copy;
- void *kaddr;
- highmem_copy = save->next;
-
- kaddr = kmap_atomic(save->page, KM_USER0);
- memcpy(kaddr, save->data, PAGE_SIZE);
- kunmap_atomic(kaddr, KM_USER0);
- free_page((long) save->data);
- kfree(save);
- }
- return 0;
-}
-#else
-static inline unsigned int count_highmem_pages(void) {return 0;}
-static inline int save_highmem(void) {return 0;}
-static inline int restore_highmem(void) {return 0;}
-#endif
-
/**
* @safe_needed - on resume, for storing the PBE list and the image,
* we can only use memory pages that do not conflict with the pages
- * used before suspend.
+ * used before suspend. The unsafe pages have PageNosaveFree set
+ * and we count them using unsafe_pages.
*
- * The unsafe pages are marked with the PG_nosave_free flag
- * and we count them using unsafe_pages
+ * Each allocated image page is marked as PageNosave and PageNosaveFree
+ * so that swsusp_free() can release it.
*/
#define PG_ANY 0
@@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;}
static unsigned int allocated_unsafe_pages;
-static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
+static void *get_image_page(gfp_t gfp_mask, int safe_needed)
{
void *res;
@@ -195,20 +82,39 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
unsigned long get_safe_page(gfp_t gfp_mask)
{
- return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE);
+ return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
+}
+
+static struct page *alloc_image_page(gfp_t gfp_mask)
+{
+ struct page *page;
+
+ page = alloc_page(gfp_mask);
+ if (page) {
+ SetPageNosave(page);
+ SetPageNosaveFree(page);
+ }
+ return page;
}
/**
* free_image_page - free page represented by @addr, allocated with
- * alloc_image_page (page flags set by it must be cleared)
+ * get_image_page (page flags set by it must be cleared)
*/
static inline void free_image_page(void *addr, int clear_nosave_free)
{
- ClearPageNosave(virt_to_page(addr));
+ struct page *page;
+
+ BUG_ON(!virt_addr_valid(addr));
+
+ page = virt_to_page(addr);
+
+ ClearPageNosave(page);
if (clear_nosave_free)
- ClearPageNosaveFree(virt_to_page(addr));
- free_page((unsigned long)addr);
+ ClearPageNosaveFree(page);
+
+ __free_page(page);
}
/* struct linked_page is used to build chains of pages */
@@ -269,7 +175,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
struct linked_page *lp;
- lp = alloc_image_page(ca->gfp_mask, ca->safe_needed);
+ lp = get_image_page(ca->gfp_mask, ca->safe_needed);
if (!lp)
return NULL;
@@ -446,8 +352,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
/* Compute the number of zones */
nr = 0;
- for_each_zone (zone)
- if (populated_zone(zone) && !is_highmem(zone))
+ for_each_zone(zone)
+ if (populated_zone(zone))
nr++;
/* Allocate the list of zones bitmap objects */
@@ -459,10 +365,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
}
/* Initialize the zone bitmap objects */
- for_each_zone (zone) {
+ for_each_zone(zone) {
unsigned long pfn;
- if (!populated_zone(zone) || is_highmem(zone))
+ if (!populated_zone(zone))
continue;
zone_bm->start_pfn = zone->zone_start_pfn;
@@ -481,7 +387,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
while (bb) {
unsigned long *ptr;
- ptr = alloc_image_page(gfp_mask, safe_needed);
+ ptr = get_image_page(gfp_mask, safe_needed);
bb->data = ptr;
if (!ptr)
goto Free;
@@ -505,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
memory_bm_position_reset(bm);
return 0;
-Free:
+ Free:
bm->p_list = ca.chain;
memory_bm_free(bm, PG_UNSAFE_CLEAR);
return -ENOMEM;
@@ -651,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
memory_bm_position_reset(bm);
return BM_END_OF_MAP;
-Return_pfn:
+ Return_pfn:
bm->cur.chunk = chunk;
bm->cur.bit = bit;
return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
@@ -669,10 +575,82 @@ unsigned int snapshot_additional_pages(struct zone *zone)
res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
- return res;
+ return 2 * res;
+}
+
+#ifdef CONFIG_HIGHMEM
+/**
+ * count_free_highmem_pages - compute the total number of free highmem
+ * pages, system-wide.
+ */
+
+static unsigned int count_free_highmem_pages(void)
+{
+ struct zone *zone;
+ unsigned int cnt = 0;
+
+ for_each_zone(zone)
+ if (populated_zone(zone) && is_highmem(zone))
+ cnt += zone->free_pages;
+
+ return cnt;
+}
+
+/**
+ * saveable_highmem_page - Determine whether a highmem page should be
+ * included in the suspend image.
+ *
+ * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
+ * and it isn't a part of a free chunk of pages.
+ */
+
+static struct page *saveable_highmem_page(unsigned long pfn)
+{
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ page = pfn_to_page(pfn);
+
+ BUG_ON(!PageHighMem(page));
+
+ if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page))
+ return NULL;
+
+ return page;
}
/**
+ * count_highmem_pages - compute the total number of saveable highmem
+ * pages.
+ */
+
+unsigned int count_highmem_pages(void)
+{
+ struct zone *zone;
+ unsigned int n = 0;
+
+ for_each_zone(zone) {
+ unsigned long pfn, max_zone_pfn;
+
+ if (!is_highmem(zone))
+ continue;
+
+ mark_free_pages(zone);
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (saveable_highmem_page(pfn))
+ n++;
+ }
+ return n;
+}
+#else
+static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
+static inline unsigned int count_highmem_pages(void) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
+/**
* pfn_is_nosave - check if given pfn is in the 'nosave' section
*/
@@ -684,12 +662,12 @@ static inline int pfn_is_nosave(unsigned long pfn)
}
/**
- * saveable - Determine whether a page should be cloned or not.
- * @pfn: The page
+ * saveable - Determine whether a non-highmem page should be included in
+ * the suspend image.
*
- * We save a page if it isn't Nosave, and is not in the range of pages
- * statically defined as 'unsaveable', and it
- * isn't a part of a free chunk of pages.
+ * We should save the page if it isn't Nosave, and is not in the range
+ * of pages statically defined as 'unsaveable', and it isn't a part of
+ * a free chunk of pages.
*/
static struct page *saveable_page(unsigned long pfn)
@@ -701,76 +679,130 @@ static struct page *saveable_page(unsigned long pfn)
page = pfn_to_page(pfn);
- if (PageNosave(page))
+ BUG_ON(PageHighMem(page));
+
+ if (PageNosave(page) || PageNosaveFree(page))
return NULL;
+
if (PageReserved(page) && pfn_is_nosave(pfn))
return NULL;
- if (PageNosaveFree(page))
- return NULL;
return page;
}
+/**
+ * count_data_pages - compute the total number of saveable non-highmem
+ * pages.
+ */
+
unsigned int count_data_pages(void)
{
struct zone *zone;
unsigned long pfn, max_zone_pfn;
unsigned int n = 0;
- for_each_zone (zone) {
+ for_each_zone(zone) {
if (is_highmem(zone))
continue;
+
mark_free_pages(zone);
max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- n += !!saveable_page(pfn);
+ if(saveable_page(pfn))
+ n++;
}
return n;
}
-static inline void copy_data_page(long *dst, long *src)
+/* This is needed, because copy_page and memcpy are not usable for copying
+ * task structs.
+ */
+static inline void do_copy_page(long *dst, long *src)
{
int n;
- /* copy_page and memcpy are not usable for copying task structs. */
for (n = PAGE_SIZE / sizeof(long); n; n--)
*dst++ = *src++;
}
+#ifdef CONFIG_HIGHMEM
+static inline struct page *
+page_is_saveable(struct zone *zone, unsigned long pfn)
+{
+ return is_highmem(zone) ?
+ saveable_highmem_page(pfn) : saveable_page(pfn);
+}
+
+static inline void
+copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+{
+ struct page *s_page, *d_page;
+ void *src, *dst;
+
+ s_page = pfn_to_page(src_pfn);
+ d_page = pfn_to_page(dst_pfn);
+ if (PageHighMem(s_page)) {
+ src = kmap_atomic(s_page, KM_USER0);
+ dst = kmap_atomic(d_page, KM_USER1);
+ do_copy_page(dst, src);
+ kunmap_atomic(src, KM_USER0);
+ kunmap_atomic(dst, KM_USER1);
+ } else {
+ src = page_address(s_page);
+ if (PageHighMem(d_page)) {
+ /* Page pointed to by src may contain some kernel
+ * data modified by kmap_atomic()
+ */
+ do_copy_page(buffer, src);
+ dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
+ memcpy(dst, buffer, PAGE_SIZE);
+ kunmap_atomic(dst, KM_USER0);
+ } else {
+ dst = page_address(d_page);
+ do_copy_page(dst, src);
+ }
+ }
+}
+#else
+#define page_is_saveable(zone, pfn) saveable_page(pfn)
+
+static inline void
+copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+{
+ do_copy_page(page_address(pfn_to_page(dst_pfn)),
+ page_address(pfn_to_page(src_pfn)));
+}
+#endif /* CONFIG_HIGHMEM */
+
static void
copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
{
struct zone *zone;
unsigned long pfn;
- for_each_zone (zone) {
+ for_each_zone(zone) {
unsigned long max_zone_pfn;
- if (is_highmem(zone))
- continue;
-
mark_free_pages(zone);
max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (saveable_page(pfn))
+ if (page_is_saveable(zone, pfn))
memory_bm_set_bit(orig_bm, pfn);
}
memory_bm_position_reset(orig_bm);
memory_bm_position_reset(copy_bm);
do {
pfn = memory_bm_next_pfn(orig_bm);
- if (likely(pfn != BM_END_OF_MAP)) {
- struct page *page;
- void *src;
-
- page = pfn_to_page(pfn);
- src = page_address(page);
- page = pfn_to_page(memory_bm_next_pfn(copy_bm));
- copy_data_page(page_address(page), src);
- }
+ if (likely(pfn != BM_END_OF_MAP))
+ copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
} while (pfn != BM_END_OF_MAP);
}
+/* Total number of image pages */
+static unsigned int nr_copy_pages;
+/* Number of pages needed for saving the original pfns of the image pages */
+static unsigned int nr_meta_pages;
+
/**
* swsusp_free - free pages allocated for the suspend.
*
@@ -792,7 +824,7 @@ void swsusp_free(void)
if (PageNosave(page) && PageNosaveFree(page)) {
ClearPageNosave(page);
ClearPageNosaveFree(page);
- free_page((long) page_address(page));
+ __free_page(page);
}
}
}
@@ -802,34 +834,108 @@ void swsusp_free(void)
buffer = NULL;
}
+#ifdef CONFIG_HIGHMEM
+/**
+ * count_pages_for_highmem - compute the number of non-highmem pages
+ * that will be necessary for creating copies of highmem pages.
+ */
+
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
+{
+ unsigned int free_highmem = count_free_highmem_pages();
+
+ if (free_highmem >= nr_highmem)
+ nr_highmem = 0;
+ else
+ nr_highmem -= free_highmem;
+
+ return nr_highmem;
+}
+#else
+static unsigned int
+count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+#endif /* CONFIG_HIGHMEM */
/**
- * enough_free_mem - Make sure we enough free memory to snapshot.
- *
- * Returns TRUE or FALSE after checking the number of available
- * free pages.
+ * enough_free_mem - Make sure we have enough free memory for the
+ * snapshot image.
*/
-static int enough_free_mem(unsigned int nr_pages)
+static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
{
struct zone *zone;
unsigned int free = 0, meta = 0;
- for_each_zone (zone)
- if (!is_highmem(zone)) {
+ for_each_zone(zone) {
+ meta += snapshot_additional_pages(zone);
+ if (!is_highmem(zone))
free += zone->free_pages;
- meta += snapshot_additional_pages(zone);
- }
+ }
- pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n",
+ nr_pages += count_pages_for_highmem(nr_highmem);
+ pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n",
nr_pages, PAGES_FOR_IO, meta, free);
return free > nr_pages + PAGES_FOR_IO + meta;
}
+#ifdef CONFIG_HIGHMEM
+/**
+ * get_highmem_buffer - if there are some highmem pages in the suspend
+ * image, we may need the buffer to copy them and/or load their data.
+ */
+
+static inline int get_highmem_buffer(int safe_needed)
+{
+ buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+ return buffer ? 0 : -ENOMEM;
+}
+
+/**
+ * alloc_highmem_image_pages - allocate some highmem pages for the image.
+ * Try to allocate as many pages as needed, but if the number of free
+ * highmem pages is lesser than that, allocate them all.
+ */
+
+static inline unsigned int
+alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+{
+ unsigned int to_alloc = count_free_highmem_pages();
+
+ if (to_alloc > nr_highmem)
+ to_alloc = nr_highmem;
+
+ nr_highmem -= to_alloc;
+ while (to_alloc-- > 0) {
+ struct page *page;
+
+ page = alloc_image_page(__GFP_HIGHMEM);
+ memory_bm_set_bit(bm, page_to_pfn(page));
+ }
+ return nr_highmem;
+}
+#else
+static inline int get_highmem_buffer(int safe_needed) { return 0; }
+
+static inline unsigned int
+alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * swsusp_alloc - allocate memory for the suspend image
+ *
+ * We first try to allocate as many highmem pages as there are
+ * saveable highmem pages in the system. If that fails, we allocate
+ * non-highmem pages for the copies of the remaining highmem ones.
+ *
+ * In this approach it is likely that the copies of highmem pages will
+ * also be located in the high memory, because of the way in which
+ * copy_data_pages() works.
+ */
+
static int
swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
- unsigned int nr_pages)
+ unsigned int nr_pages, unsigned int nr_highmem)
{
int error;
@@ -841,46 +947,61 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
if (error)
goto Free;
+ if (nr_highmem > 0) {
+ error = get_highmem_buffer(PG_ANY);
+ if (error)
+ goto Free;
+
+ nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
+ }
while (nr_pages-- > 0) {
- struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+ struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+
if (!page)
goto Free;
- SetPageNosave(page);
- SetPageNosaveFree(page);
memory_bm_set_bit(copy_bm, page_to_pfn(page));
}
return 0;
-Free:
+ Free:
swsusp_free();
return -ENOMEM;
}
-/* Memory bitmap used for marking saveable pages */
+/* Memory bitmap used for marking saveable pages (during suspend) or the
+ * suspend image pages (during resume)
+ */
static struct memory_bitmap orig_bm;
-/* Memory bitmap used for marking allocated pages that will contain the copies
- * of saveable pages
+/* Memory bitmap used on suspend for marking allocated pages that will contain
+ * the copies of saveable pages. During resume it is initially used for
+ * marking the suspend image pages, but then its set bits are duplicated in
+ * @orig_bm and it is released. Next, on systems with high memory, it may be
+ * used for marking "safe" highmem pages, but it has to be reinitialized for
+ * this purpose.
*/
static struct memory_bitmap copy_bm;
asmlinkage int swsusp_save(void)
{
- unsigned int nr_pages;
+ unsigned int nr_pages, nr_highmem;
- pr_debug("swsusp: critical section: \n");
+ printk("swsusp: critical section: \n");
drain_local_pages();
nr_pages = count_data_pages();
- printk("swsusp: Need to copy %u pages\n", nr_pages);
+ nr_highmem = count_highmem_pages();
+ printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem);
- if (!enough_free_mem(nr_pages)) {
+ if (!enough_free_mem(nr_pages, nr_highmem)) {
printk(KERN_ERR "swsusp: Not enough free memory\n");
return -ENOMEM;
}
- if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
+ if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
+ printk(KERN_ERR "swsusp: Memory allocation failed\n");
return -ENOMEM;
+ }
/* During allocating of suspend pagedir, new cold pages may appear.
* Kill them.
@@ -894,10 +1015,12 @@ asmlinkage int swsusp_save(void)
* touch swap space! Except we must write out our image of course.
*/
+ nr_pages += nr_highmem;
nr_copy_pages = nr_pages;
- nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
+
return 0;
}
@@ -960,7 +1083,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
if (!buffer) {
/* This makes the buffer be freed by swsusp_free() */
- buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
+ buffer = get_image_page(GFP_ATOMIC, PG_ANY);
if (!buffer)
return -ENOMEM;
}
@@ -975,9 +1098,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
memset(buffer, 0, PAGE_SIZE);
pack_pfns(buffer, &orig_bm);
} else {
- unsigned long pfn = memory_bm_next_pfn(&copy_bm);
+ struct page *page;
- handle->buffer = page_address(pfn_to_page(pfn));
+ page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
+ if (PageHighMem(page)) {
+ /* Highmem pages are copied to the buffer,
+ * because we can't return with a kmapped
+ * highmem page (we may not be called again).
+ */
+ void *kaddr;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memcpy(buffer, kaddr, PAGE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+ handle->buffer = buffer;
+ } else {
+ handle->buffer = page_address(page);
+ }
}
handle->prev = handle->cur;
}
@@ -1005,7 +1142,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
unsigned long pfn, max_zone_pfn;
/* Clear page flags */
- for_each_zone (zone) {
+ for_each_zone(zone) {
max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn))
@@ -1101,6 +1238,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
}
}
+/* List of "safe" pages that may be used to store data loaded from the suspend
+ * image
+ */
+static struct linked_page *safe_pages_list;
+
+#ifdef CONFIG_HIGHMEM
+/* struct highmem_pbe is used for creating the list of highmem pages that
+ * should be restored atomically during the resume from disk, because the page
+ * frames they have occupied before the suspend are in use.
+ */
+struct highmem_pbe {
+ struct page *copy_page; /* data is here now */
+ struct page *orig_page; /* data was here before the suspend */
+ struct highmem_pbe *next;
+};
+
+/* List of highmem PBEs needed for restoring the highmem pages that were
+ * allocated before the suspend and included in the suspend image, but have
+ * also been allocated by the "resume" kernel, so their contents cannot be
+ * written directly to their "original" page frames.
+ */
+static struct highmem_pbe *highmem_pblist;
+
+/**
+ * count_highmem_image_pages - compute the number of highmem pages in the
+ * suspend image. The bits in the memory bitmap @bm that correspond to the
+ * image pages are assumed to be set.
+ */
+
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
+{
+ unsigned long pfn;
+ unsigned int cnt = 0;
+
+ memory_bm_position_reset(bm);
+ pfn = memory_bm_next_pfn(bm);
+ while (pfn != BM_END_OF_MAP) {
+ if (PageHighMem(pfn_to_page(pfn)))
+ cnt++;
+
+ pfn = memory_bm_next_pfn(bm);
+ }
+ return cnt;
+}
+
+/**
+ * prepare_highmem_image - try to allocate as many highmem pages as
+ * there are highmem image pages (@nr_highmem_p points to the variable
+ * containing the number of highmem image pages). The pages that are
+ * "safe" (ie. will not be overwritten when the suspend image is
+ * restored) have the corresponding bits set in @bm (it must be
+ * unitialized).
+ *
+ * NOTE: This function should not be called if there are no highmem
+ * image pages.
+ */
+
+static unsigned int safe_highmem_pages;
+
+static struct memory_bitmap *safe_highmem_bm;
+
+static int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+ unsigned int to_alloc;
+
+ if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
+ return -ENOMEM;
+
+ if (get_highmem_buffer(PG_SAFE))
+ return -ENOMEM;
+
+ to_alloc = count_free_highmem_pages();
+ if (to_alloc > *nr_highmem_p)
+ to_alloc = *nr_highmem_p;
+ else
+ *nr_highmem_p = to_alloc;
+
+ safe_highmem_pages = 0;
+ while (to_alloc-- > 0) {
+ struct page *page;
+
+ page = alloc_page(__GFP_HIGHMEM);
+ if (!PageNosaveFree(page)) {
+ /* The page is "safe", set its bit the bitmap */
+ memory_bm_set_bit(bm, page_to_pfn(page));
+ safe_highmem_pages++;
+ }
+ /* Mark the page as allocated */
+ SetPageNosave(page);
+ SetPageNosaveFree(page);
+ }
+ memory_bm_position_reset(bm);
+ safe_highmem_bm = bm;
+ return 0;
+}
+
+/**
+ * get_highmem_page_buffer - for given highmem image page find the buffer
+ * that suspend_write_next() should set for its caller to write to.
+ *
+ * If the page is to be saved to its "original" page frame or a copy of
+ * the page is to be made in the highmem, @buffer is returned. Otherwise,
+ * the copy of the page is to be made in normal memory, so the address of
+ * the copy is returned.
+ *
+ * If @buffer is returned, the caller of suspend_write_next() will write
+ * the page's contents to @buffer, so they will have to be copied to the
+ * right location on the next call to suspend_write_next() and it is done
+ * with the help of copy_last_highmem_page(). For this purpose, if
+ * @buffer is returned, @last_highmem page is set to the page to which
+ * the data will have to be copied from @buffer.
+ */
+
+static struct page *last_highmem_page;
+
+static void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+ struct highmem_pbe *pbe;
+ void *kaddr;
+
+ if (PageNosave(page) && PageNosaveFree(page)) {
+ /* We have allocated the "original" page frame and we can
+ * use it directly to store the loaded page.
+ */
+ last_highmem_page = page;
+ return buffer;
+ }
+ /* The "original" page frame has not been allocated and we have to
+ * use a "safe" page frame to store the loaded page.
+ */
+ pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
+ if (!pbe) {
+ swsusp_free();
+ return NULL;
+ }
+ pbe->orig_page = page;
+ if (safe_highmem_pages > 0) {
+ struct page *tmp;
+
+ /* Copy of the page will be stored in high memory */
+ kaddr = buffer;
+ tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
+ safe_highmem_pages--;
+ last_highmem_page = tmp;
+ pbe->copy_page = tmp;
+ } else {
+ /* Copy of the page will be stored in normal memory */
+ kaddr = safe_pages_list;
+ safe_pages_list = safe_pages_list->next;
+ pbe->copy_page = virt_to_page(kaddr);
+ }
+ pbe->next = highmem_pblist;
+ highmem_pblist = pbe;
+ return kaddr;
+}
+
+/**
+ * copy_last_highmem_page - copy the contents of a highmem image from
+ * @buffer, where the caller of snapshot_write_next() has place them,
+ * to the right location represented by @last_highmem_page .
+ */
+
+static void copy_last_highmem_page(void)
+{
+ if (last_highmem_page) {
+ void *dst;
+
+ dst = kmap_atomic(last_highmem_page, KM_USER0);
+ memcpy(dst, buffer, PAGE_SIZE);
+ kunmap_atomic(dst, KM_USER0);
+ last_highmem_page = NULL;
+ }
+}
+
+static inline int last_highmem_page_copied(void)
+{
+ return !last_highmem_page;
+}
+
+static inline void free_highmem_data(void)
+{
+ if (safe_highmem_bm)
+ memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
+
+ if (buffer)
+ free_image_page(buffer, PG_UNSAFE_CLEAR);
+}
+#else
+static inline int get_safe_write_buffer(void) { return 0; }
+
+static unsigned int
+count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+
+static inline int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+ return 0;
+}
+
+static inline void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+ return NULL;
+}
+
+static inline void copy_last_highmem_page(void) {}
+static inline int last_highmem_page_copied(void) { return 1; }
+static inline void free_highmem_data(void) {}
+#endif /* CONFIG_HIGHMEM */
+
/**
* prepare_image - use the memory bitmap @bm to mark the pages that will
* be overwritten in the process of restoring the system memory state
@@ -1110,20 +1459,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
* The idea is to allocate a new memory bitmap first and then allocate
* as many pages as needed for the image data, but not to assign these
* pages to specific tasks initially. Instead, we just mark them as
- * allocated and create a list of "safe" pages that will be used later.
+ * allocated and create a lists of "safe" pages that will be used
+ * later. On systems with high memory a list of "safe" highmem pages is
+ * also created.
*/
#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
-static struct linked_page *safe_pages_list;
-
static int
prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
{
- unsigned int nr_pages;
+ unsigned int nr_pages, nr_highmem;
struct linked_page *sp_list, *lp;
int error;
+ /* If there is no highmem, the buffer will not be necessary */
+ free_image_page(buffer, PG_UNSAFE_CLEAR);
+ buffer = NULL;
+
+ nr_highmem = count_highmem_image_pages(bm);
error = mark_unsafe_pages(bm);
if (error)
goto Free;
@@ -1134,6 +1488,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
duplicate_memory_bitmap(new_bm, bm);
memory_bm_free(bm, PG_UNSAFE_KEEP);
+ if (nr_highmem > 0) {
+ error = prepare_highmem_image(bm, &nr_highmem);
+ if (error)
+ goto Free;
+ }
/* Reserve some safe pages for potential later use.
*
* NOTE: This way we make sure there will be enough safe pages for the
@@ -1142,10 +1501,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
*/
sp_list = NULL;
/* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
- nr_pages = nr_copy_pages - allocated_unsafe_pages;
+ nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
while (nr_pages > 0) {
- lp = alloc_image_page(GFP_ATOMIC, PG_SAFE);
+ lp = get_image_page(GFP_ATOMIC, PG_SAFE);
if (!lp) {
error = -ENOMEM;
goto Free;
@@ -1156,7 +1515,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
}
/* Preallocate memory for the image */
safe_pages_list = NULL;
- nr_pages = nr_copy_pages - allocated_unsafe_pages;
+ nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
while (nr_pages > 0) {
lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
if (!lp) {
@@ -1181,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
}
return 0;
-Free:
+ Free:
swsusp_free();
return error;
}
@@ -1196,6 +1555,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
struct pbe *pbe;
struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
+ if (PageHighMem(page))
+ return get_highmem_page_buffer(page, ca);
+
if (PageNosave(page) && PageNosaveFree(page))
/* We have allocated the "original" page frame and we can
* use it directly to store the loaded page.
@@ -1210,12 +1572,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
swsusp_free();
return NULL;
}
- pbe->orig_address = (unsigned long)page_address(page);
- pbe->address = (unsigned long)safe_pages_list;
+ pbe->orig_address = page_address(page);
+ pbe->address = safe_pages_list;
safe_pages_list = safe_pages_list->next;
pbe->next = restore_pblist;
restore_pblist = pbe;
- return (void *)pbe->address;
+ return pbe->address;
}
/**
@@ -1249,14 +1611,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
return 0;
- if (!buffer) {
- /* This makes the buffer be freed by swsusp_free() */
- buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
+ if (handle->offset == 0) {
+ if (!buffer)
+ /* This makes the buffer be freed by swsusp_free() */
+ buffer = get_image_page(GFP_ATOMIC, PG_ANY);
+
if (!buffer)
return -ENOMEM;
- }
- if (!handle->offset)
+
handle->buffer = buffer;
+ }
handle->sync_read = 1;
if (handle->prev < handle->cur) {
if (handle->prev == 0) {
@@ -1284,8 +1648,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
return -ENOMEM;
}
} else {
+ copy_last_highmem_page();
handle->buffer = get_buffer(&orig_bm, &ca);
- handle->sync_read = 0;
+ if (handle->buffer != buffer)
+ handle->sync_read = 0;
}
handle->prev = handle->cur;
}
@@ -1301,15 +1667,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
return count;
}
+/**
+ * snapshot_write_finalize - must be called after the last call to
+ * snapshot_write_next() in case the last page in the image happens
+ * to be a highmem page and its contents should be stored in the
+ * highmem. Additionally, it releases the memory that will not be
+ * used any more.
+ */
+
+void snapshot_write_finalize(struct snapshot_handle *handle)
+{
+ copy_last_highmem_page();
+ /* Free only if we have loaded the image entirely */
+ if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
+ memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+ free_highmem_data();
+ }
+}
+
int snapshot_image_loaded(struct snapshot_handle *handle)
{
- return !(!nr_copy_pages ||
+ return !(!nr_copy_pages || !last_highmem_page_copied() ||
handle->cur <= nr_meta_pages + nr_copy_pages);
}
-void snapshot_free_unused_memory(struct snapshot_handle *handle)
+#ifdef CONFIG_HIGHMEM
+/* Assumes that @buf is ready and points to a "safe" page */
+static inline void
+swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
{
- /* Free only if we have loaded the image entirely */
- if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
- memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+ void *kaddr1, *kaddr2;
+
+ kaddr1 = kmap_atomic(p1, KM_USER0);
+ kaddr2 = kmap_atomic(p2, KM_USER1);
+ memcpy(buf, kaddr1, PAGE_SIZE);
+ memcpy(kaddr1, kaddr2, PAGE_SIZE);
+ memcpy(kaddr2, buf, PAGE_SIZE);
+ kunmap_atomic(kaddr1, KM_USER0);
+ kunmap_atomic(kaddr2, KM_USER1);
+}
+
+/**
+ * restore_highmem - for each highmem page that was allocated before
+ * the suspend and included in the suspend image, and also has been
+ * allocated by the "resume" kernel swap its current (ie. "before
+ * resume") contents with the previous (ie. "before suspend") one.
+ *
+ * If the resume eventually fails, we can call this function once
+ * again and restore the "before resume" highmem state.
+ */
+
+int restore_highmem(void)
+{
+ struct highmem_pbe *pbe = highmem_pblist;
+ void *buf;
+
+ if (!pbe)
+ return 0;
+
+ buf = get_image_page(GFP_ATOMIC, PG_SAFE);
+ if (!buf)
+ return -ENOMEM;
+
+ while (pbe) {
+ swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
+ pbe = pbe->next;
+ }
+ free_image_page(buf, PG_UNSAFE_CLEAR);
+ return 0;
}
+#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1a3b0dd2c3f..f133d4a6d81 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -34,34 +34,123 @@ extern char resume_file[];
#define SWSUSP_SIG "S1SUSPEND"
static struct swsusp_header {
- char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
- swp_entry_t image;
+ char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
+ sector_t image;
char orig_sig[10];
char sig[10];
} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
/*
- * Saving part...
+ * General things
*/
static unsigned short root_swap = 0xffff;
+static struct block_device *resume_bdev;
+
+/**
+ * submit - submit BIO request.
+ * @rw: READ or WRITE.
+ * @off physical offset of page.
+ * @page: page we're reading or writing.
+ * @bio_chain: list of pending biod (for async reading)
+ *
+ * Straight from the textbook - allocate and initialize the bio.
+ * If we're reading, make sure the page is marked as dirty.
+ * Then submit it and, if @bio_chain == NULL, wait.
+ */
+static int submit(int rw, pgoff_t page_off, struct page *page,
+ struct bio **bio_chain)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+ if (!bio)
+ return -ENOMEM;
+ bio->bi_sector = page_off * (PAGE_SIZE >> 9);
+ bio->bi_bdev = resume_bdev;
+ bio->bi_end_io = end_swap_bio_read;
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
+ bio_put(bio);
+ return -EFAULT;
+ }
+
+ lock_page(page);
+ bio_get(bio);
+
+ if (bio_chain == NULL) {
+ submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+ wait_on_page_locked(page);
+ if (rw == READ)
+ bio_set_pages_dirty(bio);
+ bio_put(bio);
+ } else {
+ if (rw == READ)
+ get_page(page); /* These pages are freed later */
+ bio->bi_private = *bio_chain;
+ *bio_chain = bio;
+ submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+ }
+ return 0;
+}
+
+static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
+{
+ return submit(READ, page_off, virt_to_page(addr), bio_chain);
+}
+
+static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
+{
+ return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
+}
+
+static int wait_on_bio_chain(struct bio **bio_chain)
+{
+ struct bio *bio;
+ struct bio *next_bio;
+ int ret = 0;
+
+ if (bio_chain == NULL)
+ return 0;
+
+ bio = *bio_chain;
+ if (bio == NULL)
+ return 0;
+ while (bio) {
+ struct page *page;
+
+ next_bio = bio->bi_private;
+ page = bio->bi_io_vec[0].bv_page;
+ wait_on_page_locked(page);
+ if (!PageUptodate(page) || PageError(page))
+ ret = -EIO;
+ put_page(page);
+ bio_put(bio);
+ bio = next_bio;
+ }
+ *bio_chain = NULL;
+ return ret;
+}
+
+/*
+ * Saving part
+ */
-static int mark_swapfiles(swp_entry_t start)
+static int mark_swapfiles(sector_t start)
{
int error;
- rw_swap_page_sync(READ, swp_entry(root_swap, 0),
- virt_to_page((unsigned long)&swsusp_header), NULL);
+ bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
swsusp_header.image = start;
- error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0),
- virt_to_page((unsigned long)&swsusp_header),
- NULL);
+ error = bio_write_page(swsusp_resume_block,
+ &swsusp_header, NULL);
} else {
- pr_debug("swsusp: Partition is not swap space.\n");
+ printk(KERN_ERR "swsusp: Swap header not found!\n");
error = -ENODEV;
}
return error;
@@ -74,12 +163,21 @@ static int mark_swapfiles(swp_entry_t start)
static int swsusp_swap_check(void) /* This is called before saving image */
{
- int res = swap_type_of(swsusp_resume_device);
+ int res;
+
+ res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
+ if (res < 0)
+ return res;
+
+ root_swap = res;
+ resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE);
+ if (IS_ERR(resume_bdev))
+ return PTR_ERR(resume_bdev);
+
+ res = set_blocksize(resume_bdev, PAGE_SIZE);
+ if (res < 0)
+ blkdev_put(resume_bdev);
- if (res >= 0) {
- root_swap = res;
- return 0;
- }
return res;
}
@@ -90,36 +188,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */
* @bio_chain: Link the next write BIO here
*/
-static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
+static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
{
- swp_entry_t entry;
- int error = -ENOSPC;
-
- if (offset) {
- struct page *page = virt_to_page(buf);
-
- if (bio_chain) {
- /*
- * Whether or not we successfully allocated a copy page,
- * we take a ref on the page here. It gets undone in
- * wait_on_bio_chain().
- */
- struct page *page_copy;
- page_copy = alloc_page(GFP_ATOMIC);
- if (page_copy == NULL) {
- WARN_ON_ONCE(1);
- bio_chain = NULL; /* Go synchronous */
- get_page(page);
- } else {
- memcpy(page_address(page_copy),
- page_address(page), PAGE_SIZE);
- page = page_copy;
- }
+ void *src;
+
+ if (!offset)
+ return -ENOSPC;
+
+ if (bio_chain) {
+ src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ if (src) {
+ memcpy(src, buf, PAGE_SIZE);
+ } else {
+ WARN_ON_ONCE(1);
+ bio_chain = NULL; /* Go synchronous */
+ src = buf;
}
- entry = swp_entry(root_swap, offset);
- error = rw_swap_page_sync(WRITE, entry, page, bio_chain);
+ } else {
+ src = buf;
}
- return error;
+ return bio_write_page(offset, src, bio_chain);
}
/*
@@ -137,11 +225,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
* at a time.
*/
-#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1)
+#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
struct swap_map_page {
- unsigned long entries[MAP_PAGE_ENTRIES];
- unsigned long next_swap;
+ sector_t entries[MAP_PAGE_ENTRIES];
+ sector_t next_swap;
};
/**
@@ -151,7 +239,7 @@ struct swap_map_page {
struct swap_map_handle {
struct swap_map_page *cur;
- unsigned long cur_swap;
+ sector_t cur_swap;
struct bitmap_page *bitmap;
unsigned int k;
};
@@ -166,26 +254,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
handle->bitmap = NULL;
}
-static void show_speed(struct timeval *start, struct timeval *stop,
- unsigned nr_pages, char *msg)
-{
- s64 elapsed_centisecs64;
- int centisecs;
- int k;
- int kps;
-
- elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
- do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
- centisecs = elapsed_centisecs64;
- if (centisecs == 0)
- centisecs = 1; /* avoid div-by-zero */
- k = nr_pages * (PAGE_SIZE / 1024);
- kps = (k * 100) / centisecs;
- printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
- centisecs / 100, centisecs % 100,
- kps / 1000, (kps % 1000) / 10);
-}
-
static int get_swap_writer(struct swap_map_handle *handle)
{
handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -196,7 +264,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
release_swap_writer(handle);
return -ENOMEM;
}
- handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
+ handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
if (!handle->cur_swap) {
release_swap_writer(handle);
return -ENOSPC;
@@ -205,43 +273,15 @@ static int get_swap_writer(struct swap_map_handle *handle)
return 0;
}
-static int wait_on_bio_chain(struct bio **bio_chain)
-{
- struct bio *bio;
- struct bio *next_bio;
- int ret = 0;
-
- if (bio_chain == NULL)
- return 0;
-
- bio = *bio_chain;
- if (bio == NULL)
- return 0;
- while (bio) {
- struct page *page;
-
- next_bio = bio->bi_private;
- page = bio->bi_io_vec[0].bv_page;
- wait_on_page_locked(page);
- if (!PageUptodate(page) || PageError(page))
- ret = -EIO;
- put_page(page);
- bio_put(bio);
- bio = next_bio;
- }
- *bio_chain = NULL;
- return ret;
-}
-
static int swap_write_page(struct swap_map_handle *handle, void *buf,
struct bio **bio_chain)
{
int error = 0;
- unsigned long offset;
+ sector_t offset;
if (!handle->cur)
return -EINVAL;
- offset = alloc_swap_page(root_swap, handle->bitmap);
+ offset = alloc_swapdev_block(root_swap, handle->bitmap);
error = write_page(buf, offset, bio_chain);
if (error)
return error;
@@ -250,7 +290,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
error = wait_on_bio_chain(bio_chain);
if (error)
goto out;
- offset = alloc_swap_page(root_swap, handle->bitmap);
+ offset = alloc_swapdev_block(root_swap, handle->bitmap);
if (!offset)
return -ENOSPC;
handle->cur->next_swap = offset;
@@ -261,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
handle->cur_swap = offset;
handle->k = 0;
}
-out:
+ out:
return error;
}
@@ -315,7 +355,7 @@ static int save_image(struct swap_map_handle *handle,
error = err2;
if (!error)
printk("\b\b\b\bdone\n");
- show_speed(&start, &stop, nr_to_write, "Wrote");
+ swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
return error;
}
@@ -350,100 +390,50 @@ int swsusp_write(void)
struct swsusp_info *header;
int error;
- if ((error = swsusp_swap_check())) {
+ error = swsusp_swap_check();
+ if (error) {
printk(KERN_ERR "swsusp: Cannot find swap device, try "
"swapon -a.\n");
return error;
}
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_read_next(&snapshot, PAGE_SIZE);
- if (error < PAGE_SIZE)
- return error < 0 ? error : -EFAULT;
+ if (error < PAGE_SIZE) {
+ if (error >= 0)
+ error = -EFAULT;
+
+ goto out;
+ }
header = (struct swsusp_info *)data_of(snapshot);
if (!enough_swap(header->pages)) {
printk(KERN_ERR "swsusp: Not enough free swap\n");
- return -ENOSPC;
+ error = -ENOSPC;
+ goto out;
}
error = get_swap_writer(&handle);
if (!error) {
- unsigned long start = handle.cur_swap;
+ sector_t start = handle.cur_swap;
+
error = swap_write_page(&handle, header, NULL);
if (!error)
error = save_image(&handle, &snapshot,
header->pages - 1);
+
if (!error) {
flush_swap_writer(&handle);
printk("S");
- error = mark_swapfiles(swp_entry(root_swap, start));
+ error = mark_swapfiles(start);
printk("|\n");
}
}
if (error)
free_all_swap_pages(root_swap, handle.bitmap);
release_swap_writer(&handle);
+ out:
+ swsusp_close();
return error;
}
-static struct block_device *resume_bdev;
-
-/**
- * submit - submit BIO request.
- * @rw: READ or WRITE.
- * @off physical offset of page.
- * @page: page we're reading or writing.
- * @bio_chain: list of pending biod (for async reading)
- *
- * Straight from the textbook - allocate and initialize the bio.
- * If we're reading, make sure the page is marked as dirty.
- * Then submit it and, if @bio_chain == NULL, wait.
- */
-static int submit(int rw, pgoff_t page_off, struct page *page,
- struct bio **bio_chain)
-{
- struct bio *bio;
-
- bio = bio_alloc(GFP_ATOMIC, 1);
- if (!bio)
- return -ENOMEM;
- bio->bi_sector = page_off * (PAGE_SIZE >> 9);
- bio->bi_bdev = resume_bdev;
- bio->bi_end_io = end_swap_bio_read;
-
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
- bio_put(bio);
- return -EFAULT;
- }
-
- lock_page(page);
- bio_get(bio);
-
- if (bio_chain == NULL) {
- submit_bio(rw | (1 << BIO_RW_SYNC), bio);
- wait_on_page_locked(page);
- if (rw == READ)
- bio_set_pages_dirty(bio);
- bio_put(bio);
- } else {
- if (rw == READ)
- get_page(page); /* These pages are freed later */
- bio->bi_private = *bio_chain;
- *bio_chain = bio;
- submit_bio(rw | (1 << BIO_RW_SYNC), bio);
- }
- return 0;
-}
-
-static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
- return submit(READ, page_off, virt_to_page(addr), bio_chain);
-}
-
-static int bio_write_page(pgoff_t page_off, void *addr)
-{
- return submit(WRITE, page_off, virt_to_page(addr), NULL);
-}
-
/**
* The following functions allow us to read data using a swap map
* in a file-alike way
@@ -456,17 +446,18 @@ static void release_swap_reader(struct swap_map_handle *handle)
handle->cur = NULL;
}
-static int get_swap_reader(struct swap_map_handle *handle,
- swp_entry_t start)
+static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
{
int error;
- if (!swp_offset(start))
+ if (!start)
return -EINVAL;
- handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+
+ handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
if (!handle->cur)
return -ENOMEM;
- error = bio_read_page(swp_offset(start), handle->cur, NULL);
+
+ error = bio_read_page(start, handle->cur, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -478,7 +469,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
static int swap_read_page(struct swap_map_handle *handle, void *buf,
struct bio **bio_chain)
{
- unsigned long offset;
+ sector_t offset;
int error;
if (!handle->cur)
@@ -547,11 +538,11 @@ static int load_image(struct swap_map_handle *handle,
error = err2;
if (!error) {
printk("\b\b\b\bdone\n");
- snapshot_free_unused_memory(snapshot);
+ snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
error = -ENODATA;
}
- show_speed(&start, &stop, nr_to_read, "Read");
+ swsusp_show_speed(&start, &stop, nr_to_read, "Read");
return error;
}
@@ -600,12 +591,16 @@ int swsusp_check(void)
if (!IS_ERR(resume_bdev)) {
set_blocksize(resume_bdev, PAGE_SIZE);
memset(&swsusp_header, 0, sizeof(swsusp_header));
- if ((error = bio_read_page(0, &swsusp_header, NULL)))
+ error = bio_read_page(swsusp_resume_block,
+ &swsusp_header, NULL);
+ if (error)
return error;
+
if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
/* Reset swap signature now */
- error = bio_write_page(0, &swsusp_header);
+ error = bio_write_page(swsusp_resume_block,
+ &swsusp_header, NULL);
} else {
return -EINVAL;
}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0b66659dc51..31aa0390c77 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -49,6 +49,7 @@
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <linux/highmem.h>
+#include <linux/time.h>
#include "power.h"
@@ -64,10 +65,8 @@ int in_suspend __nosavedata = 0;
#ifdef CONFIG_HIGHMEM
unsigned int count_highmem_pages(void);
-int save_highmem(void);
int restore_highmem(void);
#else
-static inline int save_highmem(void) { return 0; }
static inline int restore_highmem(void) { return 0; }
static inline unsigned int count_highmem_pages(void) { return 0; }
#endif
@@ -134,18 +133,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
return 0;
}
-unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
+sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
{
unsigned long offset;
offset = swp_offset(get_swap_page_of_type(swap));
if (offset) {
- if (bitmap_set(bitmap, offset)) {
+ if (bitmap_set(bitmap, offset))
swap_free(swp_entry(swap, offset));
- offset = 0;
- }
+ else
+ return swapdev_block(swap, offset);
}
- return offset;
+ return 0;
}
void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
@@ -166,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
}
/**
+ * swsusp_show_speed - print the time elapsed between two events represented by
+ * @start and @stop
+ *
+ * @nr_pages - number of pages processed between @start and @stop
+ * @msg - introductory message to print
+ */
+
+void swsusp_show_speed(struct timeval *start, struct timeval *stop,
+ unsigned nr_pages, char *msg)
+{
+ s64 elapsed_centisecs64;
+ int centisecs;
+ int k;
+ int kps;
+
+ elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+ do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+ centisecs = elapsed_centisecs64;
+ if (centisecs == 0)
+ centisecs = 1; /* avoid div-by-zero */
+ k = nr_pages * (PAGE_SIZE / 1024);
+ kps = (k * 100) / centisecs;
+ printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
+ centisecs / 100, centisecs % 100,
+ kps / 1000, (kps % 1000) / 10);
+}
+
+/**
* swsusp_shrink_memory - Try to free as much memory as needed
*
* ... but do not OOM-kill anyone
@@ -184,23 +211,37 @@ static inline unsigned long __shrink_memory(long tmp)
int swsusp_shrink_memory(void)
{
- long size, tmp;
+ long tmp;
struct zone *zone;
unsigned long pages = 0;
unsigned int i = 0;
char *p = "-\\|/";
+ struct timeval start, stop;
printk("Shrinking memory... ");
+ do_gettimeofday(&start);
do {
- size = 2 * count_highmem_pages();
- size += size / 50 + count_data_pages() + PAGES_FOR_IO;
+ long size, highmem_size;
+
+ highmem_size = count_highmem_pages();
+ size = count_data_pages() + PAGES_FOR_IO;
tmp = size;
+ size += highmem_size;
for_each_zone (zone)
- if (!is_highmem(zone) && populated_zone(zone)) {
- tmp -= zone->free_pages;
- tmp += zone->lowmem_reserve[ZONE_NORMAL];
- tmp += snapshot_additional_pages(zone);
+ if (populated_zone(zone)) {
+ if (is_highmem(zone)) {
+ highmem_size -= zone->free_pages;
+ } else {
+ tmp -= zone->free_pages;
+ tmp += zone->lowmem_reserve[ZONE_NORMAL];
+ tmp += snapshot_additional_pages(zone);
+ }
}
+
+ if (highmem_size < 0)
+ highmem_size = 0;
+
+ tmp += highmem_size;
if (tmp > 0) {
tmp = __shrink_memory(tmp);
if (!tmp)
@@ -212,7 +253,9 @@ int swsusp_shrink_memory(void)
}
printk("\b%c", p[i++%4]);
} while (tmp > 0);
+ do_gettimeofday(&stop);
printk("\bdone (%lu pages freed)\n", pages);
+ swsusp_show_speed(&start, &stop, pages, "Freed");
return 0;
}
@@ -223,6 +266,7 @@ int swsusp_suspend(void)
if ((error = arch_prepare_suspend()))
return error;
+
local_irq_disable();
/* At this point, device_suspend() has been called, but *not*
* device_power_down(). We *must* device_power_down() now.
@@ -235,23 +279,16 @@ int swsusp_suspend(void)
goto Enable_irqs;
}
- if ((error = save_highmem())) {
- printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
- goto Restore_highmem;
- }
-
save_processor_state();
if ((error = swsusp_arch_suspend()))
printk(KERN_ERR "Error %d suspending\n", error);
/* Restore control flow magically appears here */
restore_processor_state();
-Restore_highmem:
- restore_highmem();
/* NOTE: device_power_up() is just a resume() for devices
* that suspended with irqs off ... no overall powerup.
*/
device_power_up();
-Enable_irqs:
+ Enable_irqs:
local_irq_enable();
return error;
}
@@ -268,18 +305,23 @@ int swsusp_resume(void)
printk(KERN_ERR "Some devices failed to power down, very bad\n");
/* We'll ignore saved state, but this gets preempt count (etc) right */
save_processor_state();
- error = swsusp_arch_resume();
- /* Code below is only ever reached in case of failure. Otherwise
- * execution continues at place where swsusp_arch_suspend was called
- */
- BUG_ON(!error);
+ error = restore_highmem();
+ if (!error) {
+ error = swsusp_arch_resume();
+ /* The code below is only ever reached in case of a failure.
+ * Otherwise execution continues at place where
+ * swsusp_arch_suspend() was called
+ */
+ BUG_ON(!error);
+ /* This call to restore_highmem() undos the previous one */
+ restore_highmem();
+ }
/* The only reason why swsusp_arch_resume() can fail is memory being
* very tight, so we have to free it as soon as we can to avoid
* subsequent failures
*/
swsusp_free();
restore_processor_state();
- restore_highmem();
touch_softlockup_watchdog();
device_power_up();
local_irq_enable();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d991d3b0e5a..89443b85163 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -11,6 +11,7 @@
#include <linux/suspend.h>
#include <linux/syscalls.h>
+#include <linux/reboot.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/miscdevice.h>
@@ -21,6 +22,7 @@
#include <linux/fs.h>
#include <linux/console.h>
#include <linux/cpu.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
@@ -54,7 +56,8 @@ static int snapshot_open(struct inode *inode, struct file *filp)
filp->private_data = data;
memset(&data->handle, 0, sizeof(struct snapshot_handle));
if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
- data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
+ data->swap = swsusp_resume_device ?
+ swap_type_of(swsusp_resume_device, 0) : -1;
data->mode = O_RDONLY;
} else {
data->swap = -1;
@@ -76,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
free_all_swap_pages(data->swap, data->bitmap);
free_bitmap(data->bitmap);
if (data->frozen) {
- down(&pm_sem);
+ mutex_lock(&pm_mutex);
thaw_processes();
enable_nonboot_cpus();
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
}
atomic_inc(&device_available);
return 0;
@@ -124,7 +127,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
{
int error = 0;
struct snapshot_data *data;
- loff_t offset, avail;
+ loff_t avail;
+ sector_t offset;
if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
return -ENOTTY;
@@ -140,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
case SNAPSHOT_FREEZE:
if (data->frozen)
break;
- down(&pm_sem);
+ mutex_lock(&pm_mutex);
error = disable_nonboot_cpus();
if (!error) {
error = freeze_processes();
@@ -150,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
error = -EBUSY;
}
}
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
if (!error)
data->frozen = 1;
break;
@@ -158,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
case SNAPSHOT_UNFREEZE:
if (!data->frozen)
break;
- down(&pm_sem);
+ mutex_lock(&pm_mutex);
thaw_processes();
enable_nonboot_cpus();
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
data->frozen = 0;
break;
@@ -170,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
error = -EPERM;
break;
}
- down(&pm_sem);
+ mutex_lock(&pm_mutex);
/* Free memory before shutting down devices. */
error = swsusp_shrink_memory();
if (!error) {
@@ -183,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
}
resume_console();
}
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
if (!error)
error = put_user(in_suspend, (unsigned int __user *)arg);
if (!error)
@@ -191,13 +195,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
break;
case SNAPSHOT_ATOMIC_RESTORE:
+ snapshot_write_finalize(&data->handle);
if (data->mode != O_WRONLY || !data->frozen ||
!snapshot_image_loaded(&data->handle)) {
error = -EPERM;
break;
}
- snapshot_free_unused_memory(&data->handle);
- down(&pm_sem);
+ mutex_lock(&pm_mutex);
pm_prepare_console();
suspend_console();
error = device_suspend(PMSG_PRETHAW);
@@ -207,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
}
resume_console();
pm_restore_console();
- up(&pm_sem);
+ mutex_unlock(&pm_mutex);
break;
case SNAPSHOT_FREE:
@@ -238,10 +242,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
break;
}
}
- offset = alloc_swap_page(data->swap, data->bitmap);
+ offset = alloc_swapdev_block(data->swap, data->bitmap);
if (offset) {
offset <<= PAGE_SHIFT;
- error = put_user(offset, (loff_t __user *)arg);
+ error = put_user(offset, (sector_t __user *)arg);
} else {
error = -ENOSPC;
}
@@ -264,7 +268,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
* so we need to recode them
*/
if (old_decode_dev(arg)) {
- data->swap = swap_type_of(old_decode_dev(arg));
+ data->swap = swap_type_of(old_decode_dev(arg), 0);
if (data->swap < 0)
error = -ENODEV;
} else {
@@ -282,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
break;
}
- if (down_trylock(&pm_sem)) {
+ if (!mutex_trylock(&pm_mutex)) {
error = -EBUSY;
break;
}
@@ -309,8 +313,66 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
if (pm_ops->finish)
pm_ops->finish(PM_SUSPEND_MEM);
-OutS3:
- up(&pm_sem);
+ OutS3:
+ mutex_unlock(&pm_mutex);
+ break;
+
+ case SNAPSHOT_PMOPS:
+ switch (arg) {
+
+ case PMOPS_PREPARE:
+ if (pm_ops->prepare) {
+ error = pm_ops->prepare(PM_SUSPEND_DISK);
+ }
+ break;
+
+ case PMOPS_ENTER:
+ kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
+ error = pm_ops->enter(PM_SUSPEND_DISK);
+ break;
+
+ case PMOPS_FINISH:
+ if (pm_ops && pm_ops->finish) {
+ pm_ops->finish(PM_SUSPEND_DISK);
+ }
+ break;
+
+ default:
+ printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
+ error = -EINVAL;
+
+ }
+ break;
+
+ case SNAPSHOT_SET_SWAP_AREA:
+ if (data->bitmap) {
+ error = -EPERM;
+ } else {
+ struct resume_swap_area swap_area;
+ dev_t swdev;
+
+ error = copy_from_user(&swap_area, (void __user *)arg,
+ sizeof(struct resume_swap_area));
+ if (error) {
+ error = -EFAULT;
+ break;
+ }
+
+ /*
+ * User space encodes device types as two-byte values,
+ * so we need to recode them
+ */
+ swdev = old_decode_dev(swap_area.dev);
+ if (swdev) {
+ offset = swap_area.offset;
+ data->swap = swap_type_of(swdev, offset);
+ if (data->swap < 0)
+ error = -ENODEV;
+ } else {
+ data->swap = -1;
+ error = -EINVAL;
+ }
+ }
break;
default:
@@ -321,7 +383,7 @@ OutS3:
return error;
}
-static struct file_operations snapshot_fops = {
+static const struct file_operations snapshot_fops = {
.open = snapshot_open,
.release = snapshot_release,
.read = snapshot_read,
diff --git a/kernel/printk.c b/kernel/printk.c
index f7d427ef503..185bb45eacf 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/syscalls.h>
+#include <linux/jiffies.h>
#include <asm/uaccess.h>
@@ -52,8 +53,6 @@ int console_printk[4] = {
DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
};
-EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
-
/*
* Low lever drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -334,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
}
}
+static int __read_mostly ignore_loglevel;
+
+int __init ignore_loglevel_setup(char *str)
+{
+ ignore_loglevel = 1;
+ printk(KERN_INFO "debug: ignoring loglevel setting.\n");
+
+ return 1;
+}
+
+__setup("ignore_loglevel", ignore_loglevel_setup);
+
/*
* Write out chars from start to end - 1 inclusive
*/
static void _call_console_drivers(unsigned long start,
unsigned long end, int msg_log_level)
{
- if (msg_log_level < console_loglevel &&
+ if ((msg_log_level < console_loglevel || ignore_loglevel) &&
console_drivers && start != end) {
if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
/* wrapped write */
@@ -630,12 +641,7 @@ EXPORT_SYMBOL(vprintk);
asmlinkage long sys_syslog(int type, char __user *buf, int len)
{
- return 0;
-}
-
-int do_syslog(int type, char __user *buf, int len)
-{
- return 0;
+ return -ENOSYS;
}
static void call_console_drivers(unsigned long start, unsigned long end)
@@ -776,7 +782,6 @@ int is_console_locked(void)
{
return console_locked;
}
-EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
/**
* release_console_sem - unlock the console system
@@ -1101,3 +1106,23 @@ int printk_ratelimit(void)
printk_ratelimit_burst);
}
EXPORT_SYMBOL(printk_ratelimit);
+
+/**
+ * printk_timed_ratelimit - caller-controlled printk ratelimiting
+ * @caller_jiffies: pointer to caller's state
+ * @interval_msecs: minimum interval between prints
+ *
+ * printk_timed_ratelimit() returns true if more than @interval_msecs
+ * milliseconds have elapsed since the last time printk_timed_ratelimit()
+ * returned true.
+ */
+bool printk_timed_ratelimit(unsigned long *caller_jiffies,
+ unsigned int interval_msecs)
+{
+ if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
+ *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(printk_timed_ratelimit);
diff --git a/kernel/profile.c b/kernel/profile.c
index f940b462eec..fb5e03d57e9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift;
-static int prof_on __read_mostly;
+int prof_on __read_mostly;
static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
static int __init profile_setup(char * str)
{
static char __initdata schedstr[] = "schedule";
+ static char __initdata sleepstr[] = "sleep";
int par;
- if (!strncmp(str, schedstr, strlen(schedstr))) {
+ if (!strncmp(str, sleepstr, strlen(sleepstr))) {
+ prof_on = SLEEP_PROFILING;
+ if (str[strlen(sleepstr)] == ',')
+ str += strlen(sleepstr) + 1;
+ if (get_option(&str, &par))
+ prof_shift = par;
+ printk(KERN_INFO
+ "kernel sleep profiling enabled (shift: %ld)\n",
+ prof_shift);
+ } else if (!strncmp(str, sleepstr, strlen(sleepstr))) {
prof_on = SCHED_PROFILING;
if (str[strlen(schedstr)] == ',')
str += strlen(schedstr) + 1;
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
* positions to which hits are accounted during short intervals (e.g.
* several seconds) is usually very small. Exclusion from buffer
* flipping is provided by interrupt disablement (note that for
- * SCHED_PROFILING profile_hit() may be called from process context).
+ * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
+ * process context).
* The hash function is meant to be lightweight as opposed to strong,
* and was vaguely inspired by ppc64 firmware-supported inverted
* pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
mutex_unlock(&profile_flip_mutex);
}
-void profile_hit(int type, void *__pc)
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
put_cpu();
return;
}
+ /*
+ * We buffer the global profiler buffer into a per-CPU
+ * queue and thus reduce the number of global (and possibly
+ * NUMA-alien) accesses. The write-queue is self-coalescing:
+ */
local_irq_save(flags);
do {
for (j = 0; j < PROFILE_GRPSZ; ++j) {
if (hits[i + j].pc == pc) {
- hits[i + j].hits++;
+ hits[i + j].hits += nr_hits;
goto out;
} else if (!hits[i + j].hits) {
hits[i + j].pc = pc;
- hits[i + j].hits = 1;
+ hits[i + j].hits = nr_hits;
goto out;
}
}
i = (i + secondary) & (NR_PROFILE_HIT - 1);
} while (i != primary);
- atomic_inc(&prof_buffer[pc]);
+
+ /*
+ * Add the current hit(s) and flush the write-queue out
+ * to the global buffer:
+ */
+ atomic_add(nr_hits, &prof_buffer[pc]);
for (i = 0; i < NR_PROFILE_HIT; ++i) {
atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
hits[i].pc = hits[i].hits = 0;
@@ -298,7 +319,6 @@ out:
put_cpu();
}
-#ifdef CONFIG_HOTPLUG_CPU
static int __devinit profile_cpu_callback(struct notifier_block *info,
unsigned long action, void *__cpu)
{
@@ -351,19 +371,19 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
}
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
#else /* !CONFIG_SMP */
#define profile_flip_buffers() do { } while (0)
#define profile_discard_flip_buffers() do { } while (0)
+#define profile_cpu_callback NULL
-void profile_hit(int type, void *__pc)
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
unsigned long pc;
if (prof_on != type || !prof_buffer)
return;
pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
- atomic_inc(&prof_buffer[min(pc, prof_len - 1)]);
+ atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */
@@ -442,7 +462,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
read = 0;
while (p < sizeof(unsigned int) && count > 0) {
- put_user(*((char *)(&sample_step)+p),buf);
+ if (put_user(*((char *)(&sample_step)+p),buf))
+ return -EFAULT;
buf++; p++; count--; read++;
}
pnt = (char *)prof_buffer + p - sizeof(atomic_t);
@@ -480,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
return count;
}
-static struct file_operations proc_profile_operations = {
+static const struct file_operations proc_profile_operations = {
.read = read_profile,
.write = write_profile,
};
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 26bb5ffe1ef..3554b76da84 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
list = rdp->donelist;
while (list) {
- next = rdp->donelist = list->next;
+ next = list->next;
+ prefetch(next);
list->func(list);
list = next;
if (++count >= rdp->blimit)
break;
}
+ rdp->donelist = list;
local_irq_disable();
rdp->qlen -= count;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e2bda18f6f4..c52f981ea00 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void)
cleanup_srcu_struct(&srcu_ctl);
}
-static int srcu_torture_read_lock(void)
+static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
{
return srcu_read_lock(&srcu_ctl);
}
@@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
schedule_timeout_interruptible(longdelay);
}
-static void srcu_torture_read_unlock(int idx)
+static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
{
srcu_read_unlock(&srcu_ctl, idx);
}
diff --git a/kernel/relay.c b/kernel/relay.c
index f04bbdb56ac..a4701e7ba7d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -138,7 +138,7 @@ depopulate:
*/
struct rchan_buf *relay_create_buf(struct rchan *chan)
{
- struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
+ struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
if (!buf)
return NULL;
@@ -308,9 +308,10 @@ static struct rchan_callbacks default_channel_callbacks = {
* reason waking is deferred is that calling directly from write
* causes problems if you're writing from say the scheduler.
*/
-static void wakeup_readers(void *private)
+static void wakeup_readers(struct work_struct *work)
{
- struct rchan_buf *buf = private;
+ struct rchan_buf *buf =
+ container_of(work, struct rchan_buf, wake_readers.work);
wake_up_interruptible(&buf->read_wait);
}
@@ -328,7 +329,7 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
if (init) {
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
- INIT_WORK(&buf->wake_readers, NULL, NULL);
+ INIT_DELAYED_WORK(&buf->wake_readers, NULL);
} else {
cancel_delayed_work(&buf->wake_readers);
flush_scheduled_work();
@@ -478,7 +479,7 @@ struct rchan *relay_open(const char *base_filename,
if (!(subbuf_size && n_subbufs))
return NULL;
- chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
+ chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
if (!chan)
return NULL;
@@ -549,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
buf->padding[old_subbuf];
smp_mb();
if (waitqueue_active(&buf->read_wait)) {
- PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
+ PREPARE_DELAYED_WORK(&buf->wake_readers,
+ wakeup_readers);
schedule_delayed_work(&buf->wake_readers, 1);
}
}
@@ -957,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
if (!desc->count)
return 0;
- mutex_lock(&filp->f_dentry->d_inode->i_mutex);
+ mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
do {
if (!relay_file_read_avail(buf, *ppos))
break;
@@ -977,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
*ppos = relay_file_read_end_pos(buf, read_start, ret);
}
} while (desc->count && ret);
- mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
+ mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
return desc->written;
}
@@ -1011,7 +1013,7 @@ static ssize_t relay_file_sendfile(struct file *filp,
actor, &desc);
}
-struct file_operations relay_file_operations = {
+const struct file_operations relay_file_operations = {
.open = relay_file_open,
.poll = relay_file_poll,
.mmap = relay_file_mmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index 6de60c12143..7b9a497419d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v)
return 0;
}
-static struct seq_operations resource_op = {
+static const struct seq_operations resource_op = {
.start = r_start,
.next = r_next,
.stop = r_stop,
@@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file)
return res;
}
-static struct file_operations proc_ioports_operations = {
+static const struct file_operations proc_ioports_operations = {
.open = ioports_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
-static struct file_operations proc_iomem_operations = {
+static const struct file_operations proc_iomem_operations = {
.open = iomem_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 6dcea9dd8c9..015fc633c96 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -13,6 +13,7 @@
#include <linux/spinlock.h>
#include <linux/sysdev.h>
#include <linux/timer.h>
+#include <linux/freezer.h>
#include "rtmutex.h"
diff --git a/kernel/sched.c b/kernel/sched.c
index 3399701c680..5cd833bc217 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,7 +34,7 @@
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
-#include <linux/suspend.h>
+#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
@@ -225,8 +225,10 @@ struct rq {
unsigned long nr_uninterruptible;
unsigned long expired_timestamp;
- unsigned long long timestamp_last_tick;
+ /* Cached timestamp set by update_cpu_clock() */
+ unsigned long long most_recent_timestamp;
struct task_struct *curr, *idle;
+ unsigned long next_balance;
struct mm_struct *prev_mm;
struct prio_array *active, *expired, arrays[2];
int best_expired_prio;
@@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 12
+#define SCHEDSTAT_VERSION 14
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
itype++) {
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
+ "%lu",
sd->lb_cnt[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
@@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->lb_nobusyq[itype],
sd->lb_nobusyg[itype]);
}
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
+ " %lu %lu %lu\n",
sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
- sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
+ sd->ttwu_wake_remote, sd->ttwu_move_affine,
+ sd->ttwu_move_balance);
}
preempt_enable();
#endif
@@ -505,7 +510,7 @@ static int schedstat_open(struct inode *inode, struct file *file)
return res;
}
-struct file_operations proc_schedstat_operations = {
+const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
#endif
/*
- * rq_lock - lock a given runqueue and disable interrupts.
+ * this_rq_lock - lock this runqueue and disable interrupts.
*/
static inline struct rq *this_rq_lock(void)
__acquires(rq->lock)
@@ -938,18 +943,31 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
{
unsigned long long now;
+ if (rt_task(p))
+ goto out;
+
now = sched_clock();
#ifdef CONFIG_SMP
if (!local) {
/* Compensate for drifting sched_clock */
struct rq *this_rq = this_rq();
- now = (now - this_rq->timestamp_last_tick)
- + rq->timestamp_last_tick;
+ now = (now - this_rq->most_recent_timestamp)
+ + rq->most_recent_timestamp;
}
#endif
- if (!rt_task(p))
- p->prio = recalc_task_prio(p, now);
+ /*
+ * Sleep time is in units of nanosecs, so shift by 20 to get a
+ * milliseconds-range estimation of the amount of time that the task
+ * spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ if (p->state == TASK_UNINTERRUPTIBLE)
+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+ (now - p->timestamp) >> 20);
+ }
+
+ p->prio = recalc_task_prio(p, now);
/*
* This checks to make sure it's not an uninterruptible task
@@ -974,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
}
}
p->timestamp = now;
-
+out:
__activate_task(p, rq);
}
@@ -1439,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
if (this_sd->flags & SD_WAKE_AFFINE) {
unsigned long tl = this_load;
- unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+ unsigned long tl_per_task;
+
+ tl_per_task = cpu_avg_load_per_task(this_cpu);
/*
* If sync wakeup then subtract the (maximum possible)
@@ -1677,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
* Not the local CPU - must adjust timestamp. This should
* get optimised away in the !CONFIG_SMP case.
*/
- p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
- + rq->timestamp_last_tick;
+ p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
+ + rq->most_recent_timestamp;
__activate_task(p, rq);
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
@@ -1941,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
__acquires(rq1->lock)
__acquires(rq2->lock)
{
+ BUG_ON(!irqs_disabled());
if (rq1 == rq2) {
spin_lock(&rq1->lock);
__acquire(rq2->lock); /* Fake it out ;) */
@@ -1980,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work good under rq->lock */
+ spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
if (unlikely(!spin_trylock(&busiest->lock))) {
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
@@ -2050,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
set_task_cpu(p, this_cpu);
inc_nr_running(p, this_rq);
enqueue_task(p, this_array);
- p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
- + this_rq->timestamp_last_tick;
+ p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
+ + this_rq->most_recent_timestamp;
/*
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
@@ -2087,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 2) too many balance attempts have failed.
*/
- if (sd->nr_balance_failed > sd->cache_nice_tries)
+ if (sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+ if (task_hot(p, rq->most_recent_timestamp, sd))
+ schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
return 1;
+ }
- if (task_hot(p, rq->timestamp_last_tick, sd))
+ if (task_hot(p, rq->most_recent_timestamp, sd))
return 0;
return 1;
}
@@ -2188,11 +2219,6 @@ skip_queue:
goto skip_bitmap;
}
-#ifdef CONFIG_SCHEDSTATS
- if (task_hot(tmp, busiest->timestamp_last_tick, sd))
- schedstat_inc(sd, lb_hot_gained[idle]);
-#endif
-
pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
pulled++;
rem_load_move -= tmp->load_weight;
@@ -2230,7 +2256,7 @@ out:
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum idle_type idle, int *sd_idle,
- cpumask_t *cpus)
+ cpumask_t *cpus, int *balance)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2259,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long load, group_capacity;
int local_group;
int i;
+ unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long sum_nr_running, sum_weighted_load;
local_group = cpu_isset(this_cpu, group->cpumask);
+ if (local_group)
+ balance_cpu = first_cpu(group->cpumask);
+
/* Tally up the load of all CPUs in the group */
sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -2278,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
*sd_idle = 0;
/* Bias balancing toward cpus of our domain */
- if (local_group)
+ if (local_group) {
+ if (idle_cpu(i) && !first_idle_cpu) {
+ first_idle_cpu = 1;
+ balance_cpu = i;
+ }
+
load = target_load(i, load_idx);
- else
+ } else
load = source_load(i, load_idx);
avg_load += load;
@@ -2288,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
sum_weighted_load += rq->raw_weighted_load;
}
+ /*
+ * First idle cpu or the first cpu(busiest) in this sched group
+ * is eligible for doing load balancing at this and above
+ * domains.
+ */
+ if (local_group && balance_cpu != this_cpu && balance) {
+ *balance = 0;
+ goto ret;
+ }
+
total_load += avg_load;
total_pwr += group->cpu_power;
@@ -2447,18 +2492,21 @@ small_imbalance:
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
+ tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
+ busiest->cpu_power;
if (max_load > tmp)
pwr_move += busiest->cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
- if (max_load*busiest->cpu_power <
- busiest_load_per_task*SCHED_LOAD_SCALE)
- tmp = max_load*busiest->cpu_power/this->cpu_power;
+ if (max_load * busiest->cpu_power <
+ busiest_load_per_task * SCHED_LOAD_SCALE)
+ tmp = max_load * busiest->cpu_power / this->cpu_power;
else
- tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
- pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
+ tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
+ this->cpu_power;
+ pwr_move += this->cpu_power *
+ min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
@@ -2479,8 +2527,8 @@ out_balanced:
*imbalance = min_load_per_task;
return group_min;
}
-ret:
#endif
+ret:
*imbalance = 0;
return NULL;
}
@@ -2529,17 +2577,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
- *
- * Called with this_rq unlocked.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
- struct sched_domain *sd, enum idle_type idle)
+ struct sched_domain *sd, enum idle_type idle,
+ int *balance)
{
int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
cpumask_t cpus = CPU_MASK_ALL;
+ unsigned long flags;
/*
* When power savings policy is enabled for the parent domain, idle
@@ -2555,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
redo:
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
- &cpus);
+ &cpus, balance);
+
+ if (*balance == 0)
+ goto out_balanced;
+
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
@@ -2579,11 +2631,13 @@ redo:
* still unbalanced. nr_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
+ local_irq_save(flags);
double_rq_lock(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
minus_1_or_zero(busiest->nr_running),
imbalance, sd, idle, &all_pinned);
double_rq_unlock(this_rq, busiest);
+ local_irq_restore(flags);
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned)) {
@@ -2600,13 +2654,13 @@ redo:
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
- spin_lock(&busiest->lock);
+ spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the migration_thread, if the curr
* task on busiest cpu can't be moved to this_cpu
*/
if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
- spin_unlock(&busiest->lock);
+ spin_unlock_irqrestore(&busiest->lock, flags);
all_pinned = 1;
goto out_one_pinned;
}
@@ -2616,7 +2670,7 @@ redo:
busiest->push_cpu = this_cpu;
active_balance = 1;
}
- spin_unlock(&busiest->lock);
+ spin_unlock_irqrestore(&busiest->lock, flags);
if (active_balance)
wake_up_process(busiest->migration_thread);
@@ -2695,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
redo:
group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
- &sd_idle, &cpus);
+ &sd_idle, &cpus, NULL);
if (!group) {
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out_balanced;
@@ -2755,14 +2809,28 @@ out_balanced:
static void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
+ int pulled_task = 0;
+ unsigned long next_balance = jiffies + 60 * HZ;
for_each_domain(this_cpu, sd) {
if (sd->flags & SD_BALANCE_NEWIDLE) {
/* If we've pulled tasks over stop searching: */
- if (load_balance_newidle(this_cpu, this_rq, sd))
+ pulled_task = load_balance_newidle(this_cpu,
+ this_rq, sd);
+ if (time_after(next_balance,
+ sd->last_balance + sd->balance_interval))
+ next_balance = sd->last_balance
+ + sd->balance_interval;
+ if (pulled_task)
break;
}
}
+ if (!pulled_task)
+ /*
+ * We are going idle. next_balance may be set based on
+ * a busy processor. So reset next_balance.
+ */
+ this_rq->next_balance = next_balance;
}
/*
@@ -2815,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
spin_unlock(&target_rq->lock);
}
-/*
- * rebalance_tick will get called every timer tick, on every CPU.
- *
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in arch_init_sched_domains.
- */
-
-/* Don't have all balancing operations going off at once: */
-static inline unsigned long cpu_offset(int cpu)
+static void update_load(struct rq *this_rq)
{
- return jiffies + cpu * HZ / NR_CPUS;
-}
-
-static void
-rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
-{
- unsigned long this_load, interval, j = cpu_offset(this_cpu);
- struct sched_domain *sd;
+ unsigned long this_load;
int i, scale;
this_load = this_rq->raw_weighted_load;
@@ -2854,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
new_load += scale-1;
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
}
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ *
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ */
+static DEFINE_SPINLOCK(balancing);
+
+static void run_rebalance_domains(struct softirq_action *h)
+{
+ int this_cpu = smp_processor_id(), balance = 1;
+ struct rq *this_rq = cpu_rq(this_cpu);
+ unsigned long interval;
+ struct sched_domain *sd;
+ /*
+ * We are idle if there are no processes running. This
+ * is valid even if we are the idle process (SMT).
+ */
+ enum idle_type idle = !this_rq->nr_running ?
+ SCHED_IDLE : NOT_IDLE;
+ /* Earliest time when we have to call run_rebalance_domains again */
+ unsigned long next_balance = jiffies + 60*HZ;
for_each_domain(this_cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
@@ -2868,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
if (unlikely(!interval))
interval = 1;
- if (j - sd->last_balance >= interval) {
- if (load_balance(this_cpu, this_rq, sd, idle)) {
+ if (sd->flags & SD_SERIALIZE) {
+ if (!spin_trylock(&balancing))
+ goto out;
+ }
+
+ if (time_after_eq(jiffies, sd->last_balance + interval)) {
+ if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
@@ -2877,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
*/
idle = NOT_IDLE;
}
- sd->last_balance += interval;
+ sd->last_balance = jiffies;
}
+ if (sd->flags & SD_SERIALIZE)
+ spin_unlock(&balancing);
+out:
+ if (time_after(next_balance, sd->last_balance + interval))
+ next_balance = sd->last_balance + interval;
+
+ /*
+ * Stop the load balance at this level. There is another
+ * CPU in our sched group which is doing load balancing more
+ * actively.
+ */
+ if (!balance)
+ break;
}
+ this_rq->next_balance = next_balance;
}
#else
/*
* on UP we do not need to balance between CPUs:
*/
-static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
-{
-}
static inline void idle_balance(int cpu, struct rq *rq)
{
}
#endif
-static inline int wake_priority_sleeper(struct rq *rq)
+static inline void wake_priority_sleeper(struct rq *rq)
{
- int ret = 0;
-
#ifdef CONFIG_SCHED_SMT
+ if (!rq->nr_running)
+ return;
+
spin_lock(&rq->lock);
/*
* If an SMT sibling task has been put to sleep for priority
* reasons reschedule the idle task to see if it can now run.
*/
- if (rq->nr_running) {
+ if (rq->nr_running)
resched_task(rq->idle);
- ret = 1;
- }
spin_unlock(&rq->lock);
#endif
- return ret;
}
DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -2923,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat);
static inline void
update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
{
- p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick);
+ p->sched_time += now - p->last_ran;
+ p->last_ran = rq->most_recent_timestamp = now;
}
/*
@@ -2936,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
unsigned long flags;
local_irq_save(flags);
- ns = max(p->timestamp, task_rq(p)->timestamp_last_tick);
- ns = p->sched_time + sched_clock() - ns;
+ ns = p->sched_time + sched_clock() - p->last_ran;
local_irq_restore(flags);
return ns;
@@ -3037,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
cpustat->steal = cputime64_add(cpustat->steal, tmp);
}
-/*
- * This function gets called by the timer code, with HZ frequency.
- * We call it with interrupts disabled.
- *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
- */
-void scheduler_tick(void)
+static void task_running_tick(struct rq *rq, struct task_struct *p)
{
- unsigned long long now = sched_clock();
- struct task_struct *p = current;
- int cpu = smp_processor_id();
- struct rq *rq = cpu_rq(cpu);
-
- update_cpu_clock(p, rq, now);
-
- rq->timestamp_last_tick = now;
-
- if (p == rq->idle) {
- if (wake_priority_sleeper(rq))
- goto out;
- rebalance_tick(cpu, rq, SCHED_IDLE);
- return;
- }
-
- /* Task might have expired already, but not scheduled off yet */
if (p->array != rq->active) {
+ /* Task has expired but was not scheduled yet */
set_tsk_need_resched(p);
- goto out;
+ return;
}
spin_lock(&rq->lock);
/*
@@ -3133,8 +3201,34 @@ void scheduler_tick(void)
}
out_unlock:
spin_unlock(&rq->lock);
-out:
- rebalance_tick(cpu, rq, NOT_IDLE);
+}
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ *
+ * It also gets called by the fork code, when changing the parent's
+ * timeslices.
+ */
+void scheduler_tick(void)
+{
+ unsigned long long now = sched_clock();
+ struct task_struct *p = current;
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+
+ update_cpu_clock(p, rq, now);
+
+ if (p == rq->idle)
+ /* Task on the idle queue */
+ wake_priority_sleeper(rq);
+ else
+ task_running_tick(rq, p);
+#ifdef CONFIG_SMP
+ update_load(rq);
+ if (time_after_eq(jiffies, rq->next_balance))
+ raise_softirq(SCHED_SOFTIRQ);
+#endif
}
#ifdef CONFIG_SCHED_SMT
@@ -3280,7 +3374,8 @@ void fastcall add_preempt_count(int val)
/*
* Spinlock count overflowing soon?
*/
- DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+ PREEMPT_MASK - 10);
}
EXPORT_SYMBOL(add_preempt_count);
@@ -3333,6 +3428,9 @@ asmlinkage void __sched schedule(void)
printk(KERN_ERR "BUG: scheduling while atomic: "
"%s/0x%08x/%d\n",
current->comm, preempt_count(), current->pid);
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
dump_stack();
}
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4804,18 +4902,18 @@ static void show_task(struct task_struct *p)
show_stack(p, NULL);
}
-void show_state(void)
+void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;
#if (BITS_PER_LONG == 32)
printk("\n"
- " sibling\n");
- printk(" task PC pid father child younger older\n");
+ " free sibling\n");
+ printk(" task PC stack pid father child younger older\n");
#else
printk("\n"
- " sibling\n");
- printk(" task PC pid father child younger older\n");
+ " free sibling\n");
+ printk(" task PC stack pid father child younger older\n");
#endif
read_lock(&tasklist_lock);
do_each_thread(g, p) {
@@ -4824,11 +4922,16 @@ void show_state(void)
* console might take alot of time:
*/
touch_nmi_watchdog();
- show_task(p);
+ if (p->state & state_filter)
+ show_task(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
- debug_show_all_locks();
+ /*
+ * Only show locks if all tasks are dumped:
+ */
+ if (state_filter == -1)
+ debug_show_all_locks();
}
/**
@@ -4973,8 +5076,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* afterwards, and pretending it was a local activate.
* This way is cleaner and logically correct.
*/
- p->timestamp = p->timestamp - rq_src->timestamp_last_tick
- + rq_dest->timestamp_last_tick;
+ p->timestamp = p->timestamp - rq_src->most_recent_timestamp
+ + rq_dest->most_recent_timestamp;
deactivate_task(p, rq_src);
__activate_task(p, rq_dest);
if (TASK_PREEMPTS_CURR(p, rq_dest))
@@ -5050,7 +5153,10 @@ wait_to_die:
}
#ifdef CONFIG_HOTPLUG_CPU
-/* Figure out where task on dead CPU should go, use force if neccessary. */
+/*
+ * Figure out where task on dead CPU should go, use force if neccessary.
+ * NOTE: interrupts should be disabled by the caller
+ */
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
unsigned long flags;
@@ -5170,6 +5276,7 @@ void idle_task_exit(void)
mmdrop(mm);
}
+/* called under rq->lock with disabled interrupts */
static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
{
struct rq *rq = cpu_rq(dead_cpu);
@@ -5186,10 +5293,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
* Drop lock around migration; if someone else moves it,
* that's OK. No task can be added to this CPU, so iteration is
* fine.
+ * NOTE: interrupts should be left disabled --dev@
*/
- spin_unlock_irq(&rq->lock);
+ spin_unlock(&rq->lock);
move_task_off_dead_cpu(dead_cpu, p);
- spin_lock_irq(&rq->lock);
+ spin_lock(&rq->lock);
put_task_struct(p);
}
@@ -5342,16 +5450,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
+ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+ " has parent");
break;
}
printk("span %s\n", str);
if (!cpu_isset(cpu, sd->span))
- printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
+ printk(KERN_ERR "ERROR: domain->span does not contain "
+ "CPU%d\n", cpu);
if (!cpu_isset(cpu, group->cpumask))
- printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+ printk(KERN_ERR "ERROR: domain->groups does not contain"
+ " CPU%d\n", cpu);
printk(KERN_DEBUG);
for (i = 0; i < level + 2; i++)
@@ -5366,7 +5477,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
if (!group->cpu_power) {
printk("\n");
- printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
+ printk(KERN_ERR "ERROR: domain->cpu_power not "
+ "set\n");
}
if (!cpus_weight(group->cpumask)) {
@@ -5389,15 +5501,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
printk("\n");
if (!cpus_equal(sd->span, groupmask))
- printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+ printk(KERN_ERR "ERROR: groups don't span "
+ "domain->span\n");
level++;
sd = sd->parent;
+ if (!sd)
+ continue;
- if (sd) {
- if (!cpus_subset(groupmask, sd->span))
- printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
- }
+ if (!cpus_subset(groupmask, sd->span))
+ printk(KERN_ERR "ERROR: parent span is not a superset "
+ "of domain->span\n");
} while (sd);
}
@@ -5511,28 +5625,27 @@ static int __init isolated_cpu_setup(char *str)
__setup ("isolcpus=", isolated_cpu_setup);
/*
- * init_sched_build_groups takes an array of groups, the cpumask we wish
- * to span, and a pointer to a function which identifies what group a CPU
- * belongs to. The return value of group_fn must be a valid index into the
- * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
- * keep track of groups covered with a cpumask_t).
+ * init_sched_build_groups takes the cpumask we wish to span, and a pointer
+ * to a function which identifies what group(along with sched group) a CPU
+ * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
+ * (due to the fact that we keep track of groups covered with a cpumask_t).
*
* init_sched_build_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
static void
-init_sched_build_groups(struct sched_group groups[], cpumask_t span,
- const cpumask_t *cpu_map,
- int (*group_fn)(int cpu, const cpumask_t *cpu_map))
+init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
+ int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg))
{
struct sched_group *first = NULL, *last = NULL;
cpumask_t covered = CPU_MASK_NONE;
int i;
for_each_cpu_mask(i, span) {
- int group = group_fn(i, cpu_map);
- struct sched_group *sg = &groups[group];
+ struct sched_group *sg;
+ int group = group_fn(i, cpu_map, &sg);
int j;
if (cpu_isset(i, covered))
@@ -5542,7 +5655,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span,
sg->cpu_power = 0;
for_each_cpu_mask(j, span) {
- if (group_fn(j, cpu_map) != group)
+ if (group_fn(j, cpu_map, NULL) != group)
continue;
cpu_set(j, covered);
@@ -5716,8 +5829,9 @@ __setup("max_cache_size=", setup_max_cache_size);
*/
static void touch_cache(void *__cache, unsigned long __size)
{
- unsigned long size = __size/sizeof(long), chunk1 = size/3,
- chunk2 = 2*size/3;
+ unsigned long size = __size / sizeof(long);
+ unsigned long chunk1 = size / 3;
+ unsigned long chunk2 = 2 * size / 3;
unsigned long *cache = __cache;
int i;
@@ -5826,11 +5940,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
*/
measure_one(cache, size, cpu1, cpu2);
for (i = 0; i < ITERATIONS; i++)
- cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+ cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
measure_one(cache, size, cpu2, cpu1);
for (i = 0; i < ITERATIONS; i++)
- cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+ cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
/*
* (We measure the non-migrating [cached] cost on both
@@ -5840,17 +5954,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
measure_one(cache, size, cpu1, cpu1);
for (i = 0; i < ITERATIONS; i++)
- cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+ cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
measure_one(cache, size, cpu2, cpu2);
for (i = 0; i < ITERATIONS; i++)
- cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+ cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
/*
* Get the per-iteration migration cost:
*/
- do_div(cost1, 2*ITERATIONS);
- do_div(cost2, 2*ITERATIONS);
+ do_div(cost1, 2 * ITERATIONS);
+ do_div(cost2, 2 * ITERATIONS);
return cost1 - cost2;
}
@@ -5888,7 +6002,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
*/
cache = vmalloc(max_size);
if (!cache) {
- printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
+ printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
return 1000000; /* return 1 msec on very small boxen */
}
@@ -5913,7 +6027,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
avg_fluct = (avg_fluct + fluct)/2;
if (migration_debug)
- printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+ printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
+ "(%8Ld %8Ld)\n",
cpu1, cpu2, size,
(long)cost / 1000000,
((long)cost / 100000) % 10,
@@ -6008,20 +6123,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
-1
#endif
);
- if (system_state == SYSTEM_BOOTING) {
- if (num_online_cpus() > 1) {
- printk("migration_cost=");
- for (distance = 0; distance <= max_distance; distance++) {
- if (distance)
- printk(",");
- printk("%ld", (long)migration_cost[distance] / 1000);
- }
- printk("\n");
+ if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
+ printk("migration_cost=");
+ for (distance = 0; distance <= max_distance; distance++) {
+ if (distance)
+ printk(",");
+ printk("%ld", (long)migration_cost[distance] / 1000);
}
+ printk("\n");
}
j1 = jiffies;
if (migration_debug)
- printk("migration: %ld seconds\n", (j1-j0)/HZ);
+ printk("migration: %ld seconds\n", (j1-j0) / HZ);
/*
* Move back to the original CPU. NUMA-Q gets confused
@@ -6118,10 +6231,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
*/
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
-static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg)
{
+ if (sg)
+ *sg = &per_cpu(sched_group_cpus, cpu);
return cpu;
}
#endif
@@ -6131,39 +6247,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
*/
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group sched_group_core[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_core);
#endif
#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
-static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg)
{
+ int group;
cpumask_t mask = cpu_sibling_map[cpu];
cpus_and(mask, mask, *cpu_map);
- return first_cpu(mask);
+ group = first_cpu(mask);
+ if (sg)
+ *sg = &per_cpu(sched_group_core, group);
+ return group;
}
#elif defined(CONFIG_SCHED_MC)
-static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg)
{
+ if (sg)
+ *sg = &per_cpu(sched_group_core, cpu);
return cpu;
}
#endif
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
-static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg)
{
+ int group;
#ifdef CONFIG_SCHED_MC
cpumask_t mask = cpu_coregroup_map(cpu);
cpus_and(mask, mask, *cpu_map);
- return first_cpu(mask);
+ group = first_cpu(mask);
#elif defined(CONFIG_SCHED_SMT)
cpumask_t mask = cpu_sibling_map[cpu];
cpus_and(mask, mask, *cpu_map);
- return first_cpu(mask);
+ group = first_cpu(mask);
#else
- return cpu;
+ group = cpu;
#endif
+ if (sg)
+ *sg = &per_cpu(sched_group_phys, group);
+ return group;
}
#ifdef CONFIG_NUMA
@@ -6176,12 +6305,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
-static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg)
{
- return cpu_to_node(cpu);
+ cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
+ int group;
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ group = first_cpu(nodemask);
+
+ if (sg)
+ *sg = &per_cpu(sched_group_allnodes, group);
+ return group;
}
+
static void init_numa_sched_groups_power(struct sched_group *group_head)
{
struct sched_group *sg = group_head;
@@ -6217,16 +6356,9 @@ static void free_sched_groups(const cpumask_t *cpu_map)
int cpu, i;
for_each_cpu_mask(cpu, *cpu_map) {
- struct sched_group *sched_group_allnodes
- = sched_group_allnodes_bycpu[cpu];
struct sched_group **sched_group_nodes
= sched_group_nodes_bycpu[cpu];
- if (sched_group_allnodes) {
- kfree(sched_group_allnodes);
- sched_group_allnodes_bycpu[cpu] = NULL;
- }
-
if (!sched_group_nodes)
continue;
@@ -6320,7 +6452,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
struct sched_domain *sd;
#ifdef CONFIG_NUMA
struct sched_group **sched_group_nodes = NULL;
- struct sched_group *sched_group_allnodes = NULL;
+ int sd_allnodes = 0;
/*
* Allocate the per-node list of sched groups
@@ -6338,7 +6470,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
* Set up domains for cpus specified by the cpu_map.
*/
for_each_cpu_mask(i, *cpu_map) {
- int group;
struct sched_domain *sd = NULL, *p;
cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
@@ -6347,26 +6478,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
#ifdef CONFIG_NUMA
if (cpus_weight(*cpu_map)
> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
- if (!sched_group_allnodes) {
- sched_group_allnodes
- = kmalloc_node(sizeof(struct sched_group)
- * MAX_NUMNODES,
- GFP_KERNEL,
- cpu_to_node(i));
- if (!sched_group_allnodes) {
- printk(KERN_WARNING
- "Can not alloc allnodes sched group\n");
- goto error;
- }
- sched_group_allnodes_bycpu[i]
- = sched_group_allnodes;
- }
sd = &per_cpu(allnodes_domains, i);
*sd = SD_ALLNODES_INIT;
sd->span = *cpu_map;
- group = cpu_to_allnodes_group(i, cpu_map);
- sd->groups = &sched_group_allnodes[group];
+ cpu_to_allnodes_group(i, cpu_map, &sd->groups);
p = sd;
+ sd_allnodes = 1;
} else
p = NULL;
@@ -6381,36 +6498,33 @@ static int build_sched_domains(const cpumask_t *cpu_map)
p = sd;
sd = &per_cpu(phys_domains, i);
- group = cpu_to_phys_group(i, cpu_map);
*sd = SD_CPU_INIT;
sd->span = nodemask;
sd->parent = p;
if (p)
p->child = sd;
- sd->groups = &sched_group_phys[group];
+ cpu_to_phys_group(i, cpu_map, &sd->groups);
#ifdef CONFIG_SCHED_MC
p = sd;
sd = &per_cpu(core_domains, i);
- group = cpu_to_core_group(i, cpu_map);
*sd = SD_MC_INIT;
sd->span = cpu_coregroup_map(i);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
- sd->groups = &sched_group_core[group];
+ cpu_to_core_group(i, cpu_map, &sd->groups);
#endif
#ifdef CONFIG_SCHED_SMT
p = sd;
sd = &per_cpu(cpu_domains, i);
- group = cpu_to_cpu_group(i, cpu_map);
*sd = SD_SIBLING_INIT;
sd->span = cpu_sibling_map[i];
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
- sd->groups = &sched_group_cpus[group];
+ cpu_to_cpu_group(i, cpu_map, &sd->groups);
#endif
}
@@ -6422,8 +6536,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
if (i != first_cpu(this_sibling_map))
continue;
- init_sched_build_groups(sched_group_cpus, this_sibling_map,
- cpu_map, &cpu_to_cpu_group);
+ init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
}
#endif
@@ -6434,8 +6547,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
cpus_and(this_core_map, this_core_map, *cpu_map);
if (i != first_cpu(this_core_map))
continue;
- init_sched_build_groups(sched_group_core, this_core_map,
- cpu_map, &cpu_to_core_group);
+ init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
}
#endif
@@ -6448,15 +6560,13 @@ static int build_sched_domains(const cpumask_t *cpu_map)
if (cpus_empty(nodemask))
continue;
- init_sched_build_groups(sched_group_phys, nodemask,
- cpu_map, &cpu_to_phys_group);
+ init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
}
#ifdef CONFIG_NUMA
/* Set up node groups */
- if (sched_group_allnodes)
- init_sched_build_groups(sched_group_allnodes, *cpu_map,
- cpu_map, &cpu_to_allnodes_group);
+ if (sd_allnodes)
+ init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
for (i = 0; i < MAX_NUMNODES; i++) {
/* Set up node groups */
@@ -6548,10 +6658,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
for (i = 0; i < MAX_NUMNODES; i++)
init_numa_sched_groups_power(sched_group_nodes[i]);
- if (sched_group_allnodes) {
- int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map);
- struct sched_group *sg = &sched_group_allnodes[group];
+ if (sd_allnodes) {
+ struct sched_group *sg;
+ cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
init_numa_sched_groups_power(sg);
}
#endif
@@ -6723,8 +6833,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
sched_smt_power_savings_store);
#endif
-
-#ifdef CONFIG_HOTPLUG_CPU
/*
* Force a reinitialization of the sched domains hierarchy. The domains
* and groups cannot be updated in place without racing with the balancing
@@ -6757,7 +6865,6 @@ static int update_sched_domains(struct notifier_block *nfb,
return NOTIFY_OK;
}
-#endif
void __init sched_init_smp(void)
{
@@ -6833,6 +6940,10 @@ void __init sched_init(void)
set_load_weight(&init_task);
+#ifdef CONFIG_SMP
+ open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
+#endif
+
#ifdef CONFIG_RT_MUTEXES
plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
#endif
@@ -6867,6 +6978,9 @@ void __might_sleep(char *file, int line)
" context at %s:%d\n", file, line);
printk("in_atomic():%d, irqs_disabled():%d\n",
in_atomic(), irqs_disabled());
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
dump_stack();
}
#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 7ed8d5304be..5630255d2e2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,10 @@
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/capability.h>
+#include <linux/freezer.h>
+#include <linux/pid_namespace.h>
+#include <linux/nsproxy.h>
+
#include <asm/param.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -33,7 +37,7 @@
* SLAB caches for signal bits.
*/
-static kmem_cache_t *sigqueue_cachep;
+static struct kmem_cache *sigqueue_cachep;
/*
* In POSIX a signal is sent either to a specific thread (Linux task)
@@ -267,18 +271,25 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
int override_rlimit)
{
struct sigqueue *q = NULL;
+ struct user_struct *user;
- atomic_inc(&t->user->sigpending);
+ /*
+ * In order to avoid problems with "switch_user()", we want to make
+ * sure that the compiler doesn't re-load "t->user"
+ */
+ user = t->user;
+ barrier();
+ atomic_inc(&user->sigpending);
if (override_rlimit ||
- atomic_read(&t->user->sigpending) <=
+ atomic_read(&user->sigpending) <=
t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
q = kmem_cache_alloc(sigqueue_cachep, flags);
if (unlikely(q == NULL)) {
- atomic_dec(&t->user->sigpending);
+ atomic_dec(&user->sigpending);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = 0;
- q->user = get_uid(t->user);
+ q->user = get_uid(user);
}
return(q);
}
@@ -575,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
error = -EPERM;
if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
&& ((sig != SIGCONT) ||
- (current->signal->session != t->signal->session))
+ (process_session(current) != process_session(t)))
&& (current->euid ^ t->suid) && (current->euid ^ t->uid)
&& (current->uid ^ t->suid) && (current->uid ^ t->uid)
&& !capable(CAP_KILL))
@@ -1126,8 +1137,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
return error;
}
-int
-kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
{
int error;
rcu_read_lock();
@@ -1695,7 +1705,9 @@ finish_stop(int stop_count)
read_unlock(&tasklist_lock);
}
- schedule();
+ do {
+ schedule();
+ } while (try_to_freeze());
/*
* Now we don't run again until continued.
*/
@@ -1870,8 +1882,12 @@ relock:
if (sig_kernel_ignore(signr)) /* Default is nothing. */
continue;
- /* Init gets no signals it doesn't want. */
- if (current == child_reaper)
+ /*
+ * Init of a pid space gets no signals it doesn't want from
+ * within that pid space. It can of course get signals from
+ * its parent pid space.
+ */
+ if (current == child_reaper(current))
continue;
if (sig_kernel_stop(signr)) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf25015dce1..918e52df090 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
- BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
- BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
if (IS_ERR(p)) {
printk("ksoftirqd for %i failed\n", hotcpu);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 476c3741511..2c6c2bf8551 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -293,6 +293,27 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
}
EXPORT_SYMBOL(_spin_lock_nested);
+unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ preempt_disable();
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ /*
+ * On lockdep we dont want the hand-coded irq-enable of
+ * _raw_spin_lock_flags() code, because lockdep assumes
+ * that interrupts are not re-enabled during lock-acquire:
+ */
+#ifdef CONFIG_PROVE_SPIN_LOCKING
+ _raw_spin_lock(lock);
+#else
+ _raw_spin_lock_flags(lock, &flags);
+#endif
+ return flags;
+}
+
+EXPORT_SYMBOL(_spin_lock_irqsave_nested);
#endif
diff --git a/kernel/sys.c b/kernel/sys.c
index 98489d82801..c7675c1bfdf 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
return 0;
}
-static void deferred_cad(void *dummy)
+static void deferred_cad(struct work_struct *dummy)
{
kernel_restart(NULL);
}
@@ -892,7 +892,7 @@ static void deferred_cad(void *dummy)
*/
void ctrl_alt_del(void)
{
- static DECLARE_WORK(cad_work, deferred_cad, NULL);
+ static DECLARE_WORK(cad_work, deferred_cad);
if (C_A_D)
schedule_work(&cad_work);
@@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
asmlinkage long sys_setuid(uid_t uid)
{
int old_euid = current->euid;
- int old_ruid, old_suid, new_ruid, new_suid;
+ int old_ruid, old_suid, new_suid;
int retval;
retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
if (retval)
return retval;
- old_ruid = new_ruid = current->uid;
+ old_ruid = current->uid;
old_suid = current->suid;
new_suid = old_suid;
@@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
if (p->real_parent == group_leader) {
err = -EPERM;
- if (p->signal->session != group_leader->signal->session)
+ if (process_session(p) != process_session(group_leader))
goto out;
err = -EACCES;
if (p->did_exec)
@@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
goto out;
if (pgid != pid) {
- struct task_struct *p;
+ struct task_struct *g =
+ find_task_by_pid_type(PIDTYPE_PGID, pgid);
- do_each_task_pid(pgid, PIDTYPE_PGID, p) {
- if (p->signal->session == group_leader->signal->session)
- goto ok_pgid;
- } while_each_task_pid(pgid, PIDTYPE_PGID, p);
- goto out;
+ if (!g || process_session(g) != process_session(group_leader))
+ goto out;
}
-ok_pgid:
err = security_task_setpgid(p, pgid);
if (err)
goto out;
@@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void)
asmlinkage long sys_getsid(pid_t pid)
{
if (!pid)
- return current->signal->session;
+ return process_session(current);
else {
int retval;
struct task_struct *p;
@@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid)
if (p) {
retval = security_task_getsid(p);
if (!retval)
- retval = p->signal->session;
+ retval = process_session(p);
}
read_unlock(&tasklist_lock);
return retval;
@@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void)
pid_t session;
int err = -EPERM;
- mutex_lock(&tty_mutex);
write_lock_irq(&tasklist_lock);
/* Fail if I am already a session leader */
@@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void)
group_leader->signal->leader = 1;
__set_special_pids(session, session);
+
+ spin_lock(&group_leader->sighand->siglock);
group_leader->signal->tty = NULL;
group_leader->signal->tty_old_pgrp = 0;
+ spin_unlock(&group_leader->sighand->siglock);
+
err = process_group(group_leader);
out:
write_unlock_irq(&tasklist_lock);
- mutex_unlock(&tty_mutex);
return err;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0e53314b14d..d7306d0f3df 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -135,6 +135,7 @@ cond_syscall(sys_madvise);
cond_syscall(sys_mremap);
cond_syscall(sys_remap_file_pages);
cond_syscall(compat_sys_move_pages);
+cond_syscall(compat_sys_migrate_pages);
/* block-layer dependent */
cond_syscall(sys_bdflush);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8bff2c18fb5..600b33358de 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
#ifdef CONFIG_X86
#include <asm/nmi.h>
+#include <asm/stacktrace.h>
#endif
#if defined(CONFIG_SYSCTL)
@@ -64,7 +65,6 @@ extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern int sysctl_panic_on_oom;
extern int max_threads;
-extern int sysrq_enabled;
extern int core_uses_pid;
extern int suid_dumpable;
extern char core_pattern[];
@@ -91,7 +91,9 @@ extern char modprobe_path[];
extern int sg_big_buff;
#endif
#ifdef CONFIG_SYSVIPC
-static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
+static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
@@ -130,12 +132,22 @@ extern int max_lock_depth;
#ifdef CONFIG_SYSCTL_SYSCALL
static int parse_table(int __user *, int, void __user *, size_t __user *,
- void __user *, size_t, ctl_table *, void **);
+ void __user *, size_t, ctl_table *);
#endif
static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen);
+
+#ifdef CONFIG_SYSVIPC
+static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen);
+#endif
+
#ifdef CONFIG_PROC_SYSCTL
static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -162,6 +174,40 @@ extern ctl_table inotify_table[];
int sysctl_legacy_va_layout;
#endif
+static void *get_uts(ctl_table *table, int write)
+{
+ char *which = table->data;
+#ifdef CONFIG_UTS_NS
+ struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+ which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
+#endif
+ if (!write)
+ down_read(&uts_sem);
+ else
+ down_write(&uts_sem);
+ return which;
+}
+
+static void put_uts(ctl_table *table, int write, void *which)
+{
+ if (!write)
+ up_read(&uts_sem);
+ else
+ up_write(&uts_sem);
+}
+
+#ifdef CONFIG_SYSVIPC
+static void *get_ipc(ctl_table *table, int write)
+{
+ char *which = table->data;
+ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+ which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
+ return which;
+}
+#else
+#define get_ipc(T,W) ((T)->data)
+#endif
+
/* /proc declarations: */
#ifdef CONFIG_PROC_SYSCTL
@@ -170,7 +216,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
static int proc_opensys(struct inode *, struct file *);
-struct file_operations proc_sys_file_operations = {
+const struct file_operations proc_sys_file_operations = {
.open = proc_opensys,
.read = proc_readsys,
.write = proc_writesys,
@@ -228,7 +274,6 @@ static ctl_table root_table[] = {
};
static ctl_table kern_table[] = {
-#ifndef CONFIG_UTS_NS
{
.ctl_name = KERN_OSTYPE,
.procname = "ostype",
@@ -236,7 +281,7 @@ static ctl_table kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.sysname),
.mode = 0444,
.proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
+ .strategy = &sysctl_uts_string,
},
{
.ctl_name = KERN_OSRELEASE,
@@ -245,7 +290,7 @@ static ctl_table kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.release),
.mode = 0444,
.proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
+ .strategy = &sysctl_uts_string,
},
{
.ctl_name = KERN_VERSION,
@@ -254,7 +299,7 @@ static ctl_table kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.version),
.mode = 0444,
.proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
+ .strategy = &sysctl_uts_string,
},
{
.ctl_name = KERN_NODENAME,
@@ -263,7 +308,7 @@ static ctl_table kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.nodename),
.mode = 0644,
.proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
+ .strategy = &sysctl_uts_string,
},
{
.ctl_name = KERN_DOMAINNAME,
@@ -272,56 +317,8 @@ static ctl_table kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.domainname),
.mode = 0644,
.proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
- },
-#else /* !CONFIG_UTS_NS */
- {
- .ctl_name = KERN_OSTYPE,
- .procname = "ostype",
- .data = NULL,
- /* could maybe use __NEW_UTS_LEN here? */
- .maxlen = FIELD_SIZEOF(struct new_utsname, sysname),
- .mode = 0444,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
- },
- {
- .ctl_name = KERN_OSRELEASE,
- .procname = "osrelease",
- .data = NULL,
- .maxlen = FIELD_SIZEOF(struct new_utsname, release),
- .mode = 0444,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
- },
- {
- .ctl_name = KERN_VERSION,
- .procname = "version",
- .data = NULL,
- .maxlen = FIELD_SIZEOF(struct new_utsname, version),
- .mode = 0444,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
- },
- {
- .ctl_name = KERN_NODENAME,
- .procname = "hostname",
- .data = NULL,
- .maxlen = FIELD_SIZEOF(struct new_utsname, nodename),
- .mode = 0644,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
- },
- {
- .ctl_name = KERN_DOMAINNAME,
- .procname = "domainname",
- .data = NULL,
- .maxlen = FIELD_SIZEOF(struct new_utsname, domainname),
- .mode = 0644,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
+ .strategy = &sysctl_uts_string,
},
-#endif /* !CONFIG_UTS_NS */
{
.ctl_name = KERN_PANIC,
.procname = "panic",
@@ -480,65 +477,72 @@ static ctl_table kern_table[] = {
{
.ctl_name = KERN_SHMMAX,
.procname = "shmmax",
- .data = NULL,
- .maxlen = sizeof (size_t),
+ .data = &init_ipc_ns.shm_ctlmax,
+ .maxlen = sizeof (init_ipc_ns.shm_ctlmax),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_doulongvec_minmax,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_SHMALL,
.procname = "shmall",
- .data = NULL,
- .maxlen = sizeof (size_t),
+ .data = &init_ipc_ns.shm_ctlall,
+ .maxlen = sizeof (init_ipc_ns.shm_ctlall),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_doulongvec_minmax,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_SHMMNI,
.procname = "shmmni",
- .data = NULL,
- .maxlen = sizeof (int),
+ .data = &init_ipc_ns.shm_ctlmni,
+ .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_dointvec,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_MSGMAX,
.procname = "msgmax",
- .data = NULL,
- .maxlen = sizeof (int),
+ .data = &init_ipc_ns.msg_ctlmax,
+ .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_dointvec,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_MSGMNI,
.procname = "msgmni",
- .data = NULL,
- .maxlen = sizeof (int),
+ .data = &init_ipc_ns.msg_ctlmni,
+ .maxlen = sizeof (init_ipc_ns.msg_ctlmni),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_dointvec,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_MSGMNB,
.procname = "msgmnb",
- .data = NULL,
- .maxlen = sizeof (int),
+ .data = &init_ipc_ns.msg_ctlmnb,
+ .maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_dointvec,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_SEM,
.procname = "sem",
- .data = NULL,
+ .data = &init_ipc_ns.sem_ctls,
.maxlen = 4*sizeof (int),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_dointvec,
+ .strategy = sysctl_ipc_data,
},
#endif
#ifdef CONFIG_MAGIC_SYSRQ
{
.ctl_name = KERN_SYSRQ,
.procname = "sysrq",
- .data = &sysrq_enabled,
+ .data = &__sysrq_enabled,
.maxlen = sizeof (int),
.mode = 0644,
.proc_handler = &proc_dointvec,
@@ -707,6 +711,14 @@ static ctl_table kern_table[] = {
.mode = 0444,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "kstack_depth_to_print",
+ .data = &kstack_depth_to_print,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#endif
#if defined(CONFIG_MMU)
{
@@ -977,17 +989,6 @@ static ctl_table vm_table[] = {
.extra1 = &zero,
},
#endif
-#ifdef CONFIG_SWAP
- {
- .ctl_name = VM_SWAP_TOKEN_TIMEOUT,
- .procname = "swap_token_timeout",
- .data = &swap_token_default_timeout,
- .maxlen = sizeof(swap_token_default_timeout),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
-#endif
#ifdef CONFIG_NUMA
{
.ctl_name = VM_ZONE_RECLAIM_MODE,
@@ -1241,7 +1242,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
do {
struct ctl_table_header *head =
list_entry(tmp, struct ctl_table_header, ctl_entry);
- void *context = NULL;
if (!use_table(head))
continue;
@@ -1249,9 +1249,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
spin_unlock(&sysctl_lock);
error = parse_table(name, nlen, oldval, oldlenp,
- newval, newlen, head->ctl_table,
- &context);
- kfree(context);
+ newval, newlen, head->ctl_table);
spin_lock(&sysctl_lock);
unuse_table(head);
@@ -1307,7 +1305,7 @@ static inline int ctl_perm(ctl_table *table, int op)
static int parse_table(int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
void __user *newval, size_t newlen,
- ctl_table *table, void **context)
+ ctl_table *table)
{
int n;
repeat:
@@ -1315,7 +1313,9 @@ repeat:
return -ENOTDIR;
if (get_user(n, name))
return -EFAULT;
- for ( ; table->ctl_name; table++) {
+ for ( ; table->ctl_name || table->procname; table++) {
+ if (!table->ctl_name)
+ continue;
if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
int error;
if (table->child) {
@@ -1325,7 +1325,7 @@ repeat:
error = table->strategy(
table, name, nlen,
oldval, oldlenp,
- newval, newlen, context);
+ newval, newlen);
if (error)
return error;
}
@@ -1336,7 +1336,7 @@ repeat:
}
error = do_sysctl_strategy(table, name, nlen,
oldval, oldlenp,
- newval, newlen, context);
+ newval, newlen);
return error;
}
}
@@ -1347,7 +1347,7 @@ repeat:
int do_sysctl_strategy (ctl_table *table,
int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
int op = 0, rc;
size_t len;
@@ -1361,7 +1361,7 @@ int do_sysctl_strategy (ctl_table *table,
if (table->strategy) {
rc = table->strategy(table, name, nlen, oldval, oldlenp,
- newval, newlen, context);
+ newval, newlen);
if (rc < 0)
return rc;
if (rc > 0)
@@ -1532,7 +1532,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
int len;
mode_t mode;
- for (; table->ctl_name; table++) {
+ for (; table->ctl_name || table->procname; table++) {
/* Can't do anything without a proc name. */
if (!table->procname)
continue;
@@ -1579,7 +1579,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
{
struct proc_dir_entry *de;
- for (; table->ctl_name; table++) {
+ for (; table->ctl_name || table->procname; table++) {
if (!(de = table->de))
continue;
if (de->mode & S_IFDIR) {
@@ -1614,7 +1614,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
int op;
- struct proc_dir_entry *de = PDE(file->f_dentry->d_inode);
+ struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode);
struct ctl_table *table;
size_t res;
ssize_t error = -ENOTDIR;
@@ -1753,66 +1753,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
*/
-
-#ifndef CONFIG_UTS_NS
-static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int r;
- if (!write) {
- down_read(&uts_sem);
- r=proc_dostring(table,0,filp,buffer,lenp, ppos);
- up_read(&uts_sem);
- } else {
- down_write(&uts_sem);
- r=proc_dostring(table,1,filp,buffer,lenp, ppos);
- up_write(&uts_sem);
- }
- return r;
-}
-#else /* !CONFIG_UTS_NS */
static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int r;
- struct uts_namespace* uts_ns = current->nsproxy->uts_ns;
- char* which;
-
- switch (table->ctl_name) {
- case KERN_OSTYPE:
- which = uts_ns->name.sysname;
- break;
- case KERN_NODENAME:
- which = uts_ns->name.nodename;
- break;
- case KERN_OSRELEASE:
- which = uts_ns->name.release;
- break;
- case KERN_VERSION:
- which = uts_ns->name.version;
- break;
- case KERN_DOMAINNAME:
- which = uts_ns->name.domainname;
- break;
- default:
- r = -EINVAL;
- goto out;
- }
-
- if (!write) {
- down_read(&uts_sem);
- r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos);
- up_read(&uts_sem);
- } else {
- down_write(&uts_sem);
- r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos);
- up_write(&uts_sem);
- }
- out:
+ void *which;
+ which = get_uts(table, write);
+ r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos);
+ put_uts(table, write, which);
return r;
}
-#endif /* !CONFIG_UTS_NS */
static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
int *valp,
@@ -1884,7 +1835,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
p = buf;
if (*p == '-' && left > 1) {
neg = 1;
- left--, p++;
+ p++;
}
if (*p < '0' || *p > '9')
break;
@@ -1976,9 +1927,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp,
#define OP_SET 0
#define OP_AND 1
-#define OP_OR 2
-#define OP_MAX 3
-#define OP_MIN 4
static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
int *valp,
@@ -1990,13 +1938,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
switch(op) {
case OP_SET: *valp = val; break;
case OP_AND: *valp &= val; break;
- case OP_OR: *valp |= val; break;
- case OP_MAX: if(*valp < val)
- *valp = val;
- break;
- case OP_MIN: if(*valp > val)
- *valp = val;
- break;
}
} else {
int val = *valp;
@@ -2135,7 +2076,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
p = buf;
if (*p == '-' && left > 1) {
neg = 1;
- left--, p++;
+ p++;
}
if (*p < '0' || *p > '9')
break;
@@ -2391,46 +2332,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
}
#ifdef CONFIG_SYSVIPC
-static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
- void *data;
- struct ipc_namespace *ns;
-
- ns = current->nsproxy->ipc_ns;
-
- switch (table->ctl_name) {
- case KERN_SHMMAX:
- data = &ns->shm_ctlmax;
- goto proc_minmax;
- case KERN_SHMALL:
- data = &ns->shm_ctlall;
- goto proc_minmax;
- case KERN_SHMMNI:
- data = &ns->shm_ctlmni;
- break;
- case KERN_MSGMAX:
- data = &ns->msg_ctlmax;
- break;
- case KERN_MSGMNI:
- data = &ns->msg_ctlmni;
- break;
- case KERN_MSGMNB:
- data = &ns->msg_ctlmnb;
- break;
- case KERN_SEM:
- data = &ns->sem_ctls;
- break;
- default:
- return -EINVAL;
- }
-
- return __do_proc_dointvec(data, table, write, filp, buffer,
+ void *which;
+ which = get_ipc(table, write);
+ return __do_proc_dointvec(which, table, write, filp, buffer,
lenp, ppos, NULL, NULL);
-proc_minmax:
- return __do_proc_doulongvec_minmax(data, table, write, filp, buffer,
+}
+
+static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ void *which;
+ which = get_ipc(table, write);
+ return __do_proc_doulongvec_minmax(which, table, write, filp, buffer,
lenp, ppos, 1l, 1l);
}
+
#endif
static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
@@ -2475,6 +2394,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
{
return -ENOSYS;
}
+static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
#endif
int proc_dointvec(ctl_table *table, int write, struct file *filp,
@@ -2539,7 +2469,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
/* The generic string strategy routine: */
int sysctl_string(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
if (!table->data || !table->maxlen)
return -ENOTDIR;
@@ -2585,7 +2515,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
*/
int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
if (newval && newlen) {
@@ -2621,7 +2551,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
/* Strategy function to convert jiffies to seconds */
int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
if (oldval) {
size_t olen;
@@ -2649,7 +2579,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
/* Strategy function to convert jiffies to seconds */
int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
if (oldval) {
size_t olen;
@@ -2674,50 +2604,140 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
return 1;
}
+
+/* The generic string strategy routine: */
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ struct ctl_table uts_table;
+ int r, write;
+ write = newval && newlen;
+ memcpy(&uts_table, table, sizeof(uts_table));
+ uts_table.data = get_uts(table, write);
+ r = sysctl_string(&uts_table, name, nlen,
+ oldval, oldlenp, newval, newlen);
+ put_uts(table, write, uts_table.data);
+ return r;
+}
+
+#ifdef CONFIG_SYSVIPC
+/* The generic sysctl ipc data routine. */
+static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ size_t len;
+ void *data;
+
+ /* Get out of I don't have a variable */
+ if (!table->data || !table->maxlen)
+ return -ENOTDIR;
+
+ data = get_ipc(table, 1);
+ if (!data)
+ return -ENOTDIR;
+
+ if (oldval && oldlenp) {
+ if (get_user(len, oldlenp))
+ return -EFAULT;
+ if (len) {
+ if (len > table->maxlen)
+ len = table->maxlen;
+ if (copy_to_user(oldval, data, len))
+ return -EFAULT;
+ if (put_user(len, oldlenp))
+ return -EFAULT;
+ }
+ }
+
+ if (newval && newlen) {
+ if (newlen > table->maxlen)
+ newlen = table->maxlen;
+
+ if (copy_from_user(data, newval, newlen))
+ return -EFAULT;
+ }
+ return 1;
+}
+#endif
+
#else /* CONFIG_SYSCTL_SYSCALL */
asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
{
static int msg_count;
+ struct __sysctl_args tmp;
+ int name[CTL_MAXNAME];
+ int i;
+
+ /* Read in the sysctl name for better debug message logging */
+ if (copy_from_user(&tmp, args, sizeof(tmp)))
+ return -EFAULT;
+ if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME)
+ return -ENOTDIR;
+ for (i = 0; i < tmp.nlen; i++)
+ if (get_user(name[i], tmp.name + i))
+ return -EFAULT;
+
+ /* Ignore accesses to kernel.version */
+ if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
+ goto out;
if (msg_count < 5) {
msg_count++;
printk(KERN_INFO
"warning: process `%s' used the removed sysctl "
- "system call\n", current->comm);
+ "system call with ", current->comm);
+ for (i = 0; i < tmp.nlen; i++)
+ printk("%d.", name[i]);
+ printk("\n");
}
+out:
return -ENOSYS;
}
int sysctl_string(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
return -ENOSYS;
}
int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
return -ENOSYS;
}
int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
return -ENOSYS;
}
int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
return -ENOSYS;
}
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ return -ENOSYS;
+}
+static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ return -ENOSYS;
+}
#endif /* CONFIG_SYSCTL_SYSCALL */
/*
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2039585ec5e..4c3476fa058 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -34,7 +34,7 @@
static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
static int family_registered;
-kmem_cache_t *taskstats_cache;
+struct kmem_cache *taskstats_cache;
static struct genl_family family = {
.id = GENL_ID_GENERATE,
@@ -69,7 +69,7 @@ enum actions {
};
static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
- void **replyp, size_t size)
+ size_t size)
{
struct sk_buff *skb;
void *reply;
@@ -77,8 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
/*
* If new attributes are added, please revisit this allocation
*/
- size = nlmsg_total_size(genlmsg_total_size(size));
- skb = nlmsg_new(size, GFP_KERNEL);
+ skb = genlmsg_new(size, GFP_KERNEL);
if (!skb)
return -ENOMEM;
@@ -86,20 +85,15 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
int seq = get_cpu_var(taskstats_seqnum)++;
put_cpu_var(taskstats_seqnum);
- reply = genlmsg_put(skb, 0, seq,
- family.id, 0, 0,
- cmd, family.version);
+ reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
} else
- reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
- family.id, 0, 0,
- cmd, family.version);
+ reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
if (reply == NULL) {
nlmsg_free(skb);
return -EINVAL;
}
*skbp = skb;
- *replyp = reply;
return 0;
}
@@ -124,10 +118,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
/*
* Send taskstats data in @skb to listeners registered for @cpu's exit data
*/
-static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+static void send_cpu_listeners(struct sk_buff *skb,
+ struct listener_list *listeners)
{
struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
- struct listener_list *listeners;
struct listener *s, *tmp;
struct sk_buff *skb_next, *skb_cur = skb;
void *reply = genlmsg_data(genlhdr);
@@ -140,7 +134,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
}
rc = 0;
- listeners = &per_cpu(listener_array, cpu);
down_read(&listeners->sem);
list_for_each_entry(s, &listeners->list, list) {
skb_next = NULL;
@@ -191,6 +184,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
} else
get_task_struct(tsk);
+ memset(stats, 0, sizeof(*stats));
/*
* Each accounting subsystem adds calls to its functions to
* fill in relevant parts of struct taskstsats as follows
@@ -233,6 +227,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
if (first->signal->stats)
memcpy(stats, first->signal->stats, sizeof(*stats));
+ else
+ memset(stats, 0, sizeof(*stats));
tsk = first;
do {
@@ -349,14 +345,36 @@ static int parse(struct nlattr *na, cpumask_t *mask)
return ret;
}
+static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
+{
+ struct nlattr *na, *ret;
+ int aggr;
+
+ aggr = (type == TASKSTATS_TYPE_PID)
+ ? TASKSTATS_TYPE_AGGR_PID
+ : TASKSTATS_TYPE_AGGR_TGID;
+
+ na = nla_nest_start(skb, aggr);
+ if (!na)
+ goto err;
+ if (nla_put(skb, type, sizeof(pid), &pid) < 0)
+ goto err;
+ ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
+ if (!ret)
+ goto err;
+ nla_nest_end(skb, na);
+
+ return nla_data(ret);
+err:
+ return NULL;
+}
+
static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
int rc = 0;
struct sk_buff *rep_skb;
- struct taskstats stats;
- void *reply;
+ struct taskstats *stats;
size_t size;
- struct nlattr *na;
cpumask_t mask;
rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
@@ -377,141 +395,122 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
size = nla_total_size(sizeof(u32)) +
nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
- memset(&stats, 0, sizeof(stats));
- rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+ rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
if (rc < 0)
return rc;
+ rc = -EINVAL;
if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
- rc = fill_pid(pid, NULL, &stats);
- if (rc < 0)
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
+ if (!stats)
goto err;
- na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
- NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
- NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
- stats);
+ rc = fill_pid(pid, NULL, stats);
+ if (rc < 0)
+ goto err;
} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
- rc = fill_tgid(tgid, NULL, &stats);
- if (rc < 0)
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+ if (!stats)
goto err;
- na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
- NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
- NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
- stats);
- } else {
- rc = -EINVAL;
+ rc = fill_tgid(tgid, NULL, stats);
+ if (rc < 0)
+ goto err;
+ } else
goto err;
- }
-
- nla_nest_end(rep_skb, na);
return send_reply(rep_skb, info->snd_pid);
-
-nla_put_failure:
- rc = genlmsg_cancel(rep_skb, reply);
err:
nlmsg_free(rep_skb);
return rc;
}
-void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
+static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
- struct listener_list *listeners;
- struct taskstats *tmp;
- /*
- * This is the cpu on which the task is exiting currently and will
- * be the one for which the exit event is sent, even if the cpu
- * on which this function is running changes later.
- */
- *mycpu = raw_smp_processor_id();
+ struct signal_struct *sig = tsk->signal;
+ struct taskstats *stats;
- *ptidstats = NULL;
- tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
- if (!tmp)
- return;
+ if (sig->stats || thread_group_empty(tsk))
+ goto ret;
- listeners = &per_cpu(listener_array, *mycpu);
- down_read(&listeners->sem);
- if (!list_empty(&listeners->list)) {
- *ptidstats = tmp;
- tmp = NULL;
+ /* No problem if kmem_cache_zalloc() fails */
+ stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (!sig->stats) {
+ sig->stats = stats;
+ stats = NULL;
}
- up_read(&listeners->sem);
- kfree(tmp);
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ if (stats)
+ kmem_cache_free(taskstats_cache, stats);
+ret:
+ return sig->stats;
}
/* Send pid data out on exit */
-void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
- int group_dead, unsigned int mycpu)
+void taskstats_exit(struct task_struct *tsk, int group_dead)
{
int rc;
+ struct listener_list *listeners;
+ struct taskstats *stats;
struct sk_buff *rep_skb;
- void *reply;
size_t size;
int is_thread_group;
- struct nlattr *na;
- if (!family_registered || !tidstats)
+ if (!family_registered)
return;
- rc = 0;
/*
* Size includes space for nested attributes
*/
size = nla_total_size(sizeof(u32)) +
nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
- is_thread_group = (tsk->signal->stats != NULL);
- if (is_thread_group)
- size = 2 * size; /* PID + STATS + TGID + STATS */
+ is_thread_group = !!taskstats_tgid_alloc(tsk);
+ if (is_thread_group) {
+ /* PID + STATS + TGID + STATS */
+ size = 2 * size;
+ /* fill the tsk->signal->stats structure */
+ fill_tgid_exit(tsk);
+ }
- rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
- if (rc < 0)
- goto ret;
+ listeners = &__raw_get_cpu_var(listener_array);
+ if (list_empty(&listeners->list))
+ return;
- rc = fill_pid(tsk->pid, tsk, tidstats);
+ rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
if (rc < 0)
- goto err_skb;
+ return;
- na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
- NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
- NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
- *tidstats);
- nla_nest_end(rep_skb, na);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
+ if (!stats)
+ goto err;
- if (!is_thread_group)
- goto send;
+ rc = fill_pid(tsk->pid, tsk, stats);
+ if (rc < 0)
+ goto err;
/*
- * tsk has/had a thread group so fill the tsk->signal->stats structure
* Doesn't matter if tsk is the leader or the last group member leaving
*/
-
- fill_tgid_exit(tsk);
- if (!group_dead)
+ if (!is_thread_group || !group_dead)
goto send;
- na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
- NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
- /* No locking needed for tsk->signal->stats since group is dead */
- NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
- *tsk->signal->stats);
- nla_nest_end(rep_skb, na);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
+ if (!stats)
+ goto err;
+
+ memcpy(stats, tsk->signal->stats, sizeof(*stats));
send:
- send_cpu_listeners(rep_skb, mycpu);
+ send_cpu_listeners(rep_skb, listeners);
return;
-
-nla_put_failure:
- genlmsg_cancel(rep_skb, reply);
-err_skb:
+err:
nlmsg_free(rep_skb);
-ret:
- return;
}
static struct genl_ops taskstats_ops = {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 74eca5939bd..22504afc0d3 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c)
/* check if clocksource is already registered */
if (is_registered_source(c)) {
printk("register_clocksource: Cannot register %s. "
- "Already registered!", c->name);
+ "Already registered!", c->name);
ret = -EBUSY;
} else {
/* register it */
@@ -186,6 +186,7 @@ void clocksource_reselect(void)
}
EXPORT_SYMBOL(clocksource_reselect);
+#ifdef CONFIG_SYSFS
/**
* sysfs_show_current_clocksources - sysfs interface for current clocksource
* @dev: unused
@@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
* Sysfs setup bits:
*/
static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
- sysfs_override_clocksource);
+ sysfs_override_clocksource);
static SYSDEV_ATTR(available_clocksource, 0600,
- sysfs_show_available_clocksources, NULL);
+ sysfs_show_available_clocksources, NULL);
static struct sysdev_class clocksource_sysclass = {
set_kset_name("clocksource"),
@@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void)
}
device_initcall(init_clocksource_sysfs);
+#endif /* CONFIG_SYSFS */
/**
* boot_override_clocksource - boot clock override
diff --git a/kernel/timer.c b/kernel/timer.c
index c1c7fbcffec..feddf817baa 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+ int rem;
+ unsigned long original = j;
+
+ /*
+ * We don't want all cpus firing their timers at once hitting the
+ * same lock or cachelines, so we skew each extra cpu with an extra
+ * 3 jiffies. This 3 jiffies came originally from the mm/ code which
+ * already did this.
+ * The skew is done by adding 3*cpunr, then round, then subtract this
+ * extra offset again.
+ */
+ j += cpu * 3;
+
+ rem = j % HZ;
+
+ /*
+ * If the target jiffie is just after a whole second (which can happen
+ * due to delays of the timer irq, long irq off times etc etc) then
+ * we should round down to the whole second, not up. Use 1/4th second
+ * as cutoff for this rounding as an extreme upper bound for this.
+ */
+ if (rem < HZ/4) /* round down */
+ j = j - rem;
+ else /* round up */
+ j = j - rem + HZ;
+
+ /* now that we have rounded, subtract the extra skew again */
+ j -= cpu * 3;
+
+ if (j <= jiffies) /* rounding ate our timeout entirely; */
+ return original;
+ return j;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies);
+
+/**
+ * __round_jiffies_relative - function to round jiffies to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies_relative rounds a time delta in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long __round_jiffies_relative(unsigned long j, int cpu)
+{
+ /*
+ * In theory the following code can skip a jiffy in case jiffies
+ * increments right between the addition and the later subtraction.
+ * However since the entire point of this function is to use approximate
+ * timeouts, it's entirely ok to not handle that.
+ */
+ return __round_jiffies(j + jiffies, cpu) - jiffies;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_relative);
+
+/**
+ * round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * round_jiffies rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long round_jiffies(unsigned long j)
+{
+ return __round_jiffies(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies);
+
+/**
+ * round_jiffies_relative - function to round jiffies to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * round_jiffies_relative rounds a time delta in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long round_jiffies_relative(unsigned long j)
+{
+ return __round_jiffies_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_relative);
+
+
static inline void set_running_timer(tvec_base_t *base,
struct timer_list *timer)
{
@@ -714,7 +846,7 @@ static int change_clocksource(void)
clock = new;
clock->cycle_last = now;
printk(KERN_INFO "Time: %s clocksource has been installed.\n",
- clock->name);
+ clock->name);
return 1;
} else if (clock->update_callback) {
return clock->update_callback();
@@ -722,7 +854,10 @@ static int change_clocksource(void)
return 0;
}
#else
-#define change_clocksource() (0)
+static inline int change_clocksource(void)
+{
+ return 0;
+}
#endif
/**
@@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device);
* If the error is already larger, we look ahead even further
* to compensate for late or lost adjustments.
*/
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset)
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+ s64 *offset)
{
s64 tick_error, i;
u32 look_ahead, adj;
@@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *
* Now calculate the error in (1 << look_ahead) ticks, but first
* remove the single look ahead already included in the error.
*/
- tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
+ tick_error = current_tick_length() >>
+ (TICK_LENGTH_SHIFT - clock->shift + 1);
tick_error -= clock->xtime_interval >> 1;
error = ((error - tick_error) >> look_ahead) + tick_error;
@@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
clock->mult += adj;
clock->xtime_interval += interval;
clock->xtime_nsec -= offset;
- clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
+ clock->error -= (interval - offset) <<
+ (TICK_LENGTH_SHIFT - clock->shift);
}
/**
@@ -1008,11 +1146,15 @@ static inline void calc_load(unsigned long ticks)
unsigned long active_tasks; /* fixed-point */
static int count = LOAD_FREQ;
- active_tasks = count_active_tasks();
- for (count -= ticks; count < 0; count += LOAD_FREQ) {
- CALC_LOAD(avenrun[0], EXP_1, active_tasks);
- CALC_LOAD(avenrun[1], EXP_5, active_tasks);
- CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+ count -= ticks;
+ if (unlikely(count < 0)) {
+ active_tasks = count_active_tasks();
+ do {
+ CALC_LOAD(avenrun[0], EXP_1, active_tasks);
+ CALC_LOAD(avenrun[1], EXP_5, active_tasks);
+ CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+ count += LOAD_FREQ;
+ } while (count < 0);
}
}
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 65a5036a3d9..baacc369141 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -80,18 +80,31 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
*/
void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
{
+ struct mm_struct *mm;
+
/* convert pages-jiffies to Mbyte-usec */
stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
- if (p->mm) {
+ mm = get_task_mm(p);
+ if (mm) {
/* adjust to KB unit */
- stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB;
- stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB;
+ stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB;
+ stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB;
+ mmput(mm);
}
stats->read_char = p->rchar;
stats->write_char = p->wchar;
stats->read_syscalls = p->syscr;
stats->write_syscalls = p->syscw;
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ stats->read_bytes = p->ioac.read_bytes;
+ stats->write_bytes = p->ioac.write_bytes;
+ stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
+#else
+ stats->read_bytes = 0;
+ stats->write_bytes = 0;
+ stats->cancelled_write_bytes = 0;
+#endif
}
#undef KB
#undef MB
diff --git a/kernel/unwind.c b/kernel/unwind.c
index f7e50d16dbf..09c26132924 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -14,11 +14,12 @@
#include <linux/bootmem.h>
#include <linux/sort.h>
#include <linux/stop_machine.h>
+#include <linux/uaccess.h>
#include <asm/sections.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
-extern char __start_unwind[], __end_unwind[];
+extern const char __start_unwind[], __end_unwind[];
extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
#define MAX_STACK_DEPTH 8
@@ -94,6 +95,7 @@ static const struct {
typedef unsigned long uleb128_t;
typedef signed long sleb128_t;
+#define sleb128abs __builtin_labs
static struct unwind_table {
struct {
@@ -135,6 +137,17 @@ struct unwind_state {
static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
+static unsigned unwind_debug;
+static int __init unwind_debug_setup(char *s)
+{
+ unwind_debug = simple_strtoul(s, NULL, 0);
+ return 1;
+}
+__setup("unwind_debug=", unwind_debug_setup);
+#define dprintk(lvl, fmt, args...) \
+ ((void)(lvl > unwind_debug \
+ || printk(KERN_DEBUG "unwind: " fmt "\n", ##args)))
+
static struct unwind_table *find_table(unsigned long pc)
{
struct unwind_table *table;
@@ -151,7 +164,9 @@ static struct unwind_table *find_table(unsigned long pc)
static unsigned long read_pointer(const u8 **pLoc,
const void *end,
- signed ptrType);
+ signed ptrType,
+ unsigned long text_base,
+ unsigned long data_base);
static void init_unwind_table(struct unwind_table *table,
const char *name,
@@ -176,10 +191,13 @@ static void init_unwind_table(struct unwind_table *table,
/* See if the linker provided table looks valid. */
if (header_size <= 4
|| header_start[0] != 1
- || (void *)read_pointer(&ptr, end, header_start[1]) != table_start
- || header_start[2] == DW_EH_PE_omit
- || read_pointer(&ptr, end, header_start[2]) <= 0
- || header_start[3] == DW_EH_PE_omit)
+ || (void *)read_pointer(&ptr, end, header_start[1], 0, 0)
+ != table_start
+ || !read_pointer(&ptr, end, header_start[2], 0, 0)
+ || !read_pointer(&ptr, end, header_start[3], 0,
+ (unsigned long)header_start)
+ || !read_pointer(&ptr, end, header_start[3], 0,
+ (unsigned long)header_start))
header_start = NULL;
table->hdrsz = header_size;
smp_wmb();
@@ -269,7 +287,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
ptr = (const u8 *)(fde + 2);
if (!read_pointer(&ptr,
(const u8 *)(fde + 1) + *fde,
- ptrType))
+ ptrType, 0, 0))
return;
++n;
}
@@ -279,6 +297,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
+ 2 * n * sizeof(unsigned long);
+ dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize);
header = alloc(hdrSize);
if (!header)
return;
@@ -303,7 +322,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
ptr = (const u8 *)(fde + 2);
header->table[n].start = read_pointer(&ptr,
(const u8 *)(fde + 1) + *fde,
- fde_pointer_type(cie));
+ fde_pointer_type(cie), 0, 0);
header->table[n].fde = (unsigned long)fde;
++n;
}
@@ -486,7 +505,9 @@ static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table)
static unsigned long read_pointer(const u8 **pLoc,
const void *end,
- signed ptrType)
+ signed ptrType,
+ unsigned long text_base,
+ unsigned long data_base)
{
unsigned long value = 0;
union {
@@ -498,13 +519,17 @@ static unsigned long read_pointer(const u8 **pLoc,
const unsigned long *pul;
} ptr;
- if (ptrType < 0 || ptrType == DW_EH_PE_omit)
+ if (ptrType < 0 || ptrType == DW_EH_PE_omit) {
+ dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end);
return 0;
+ }
ptr.p8 = *pLoc;
switch(ptrType & DW_EH_PE_FORM) {
case DW_EH_PE_data2:
- if (end < (const void *)(ptr.p16u + 1))
+ if (end < (const void *)(ptr.p16u + 1)) {
+ dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end);
return 0;
+ }
if(ptrType & DW_EH_PE_signed)
value = get_unaligned(ptr.p16s++);
else
@@ -512,8 +537,10 @@ static unsigned long read_pointer(const u8 **pLoc,
break;
case DW_EH_PE_data4:
#ifdef CONFIG_64BIT
- if (end < (const void *)(ptr.p32u + 1))
+ if (end < (const void *)(ptr.p32u + 1)) {
+ dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end);
return 0;
+ }
if(ptrType & DW_EH_PE_signed)
value = get_unaligned(ptr.p32s++);
else
@@ -525,8 +552,10 @@ static unsigned long read_pointer(const u8 **pLoc,
BUILD_BUG_ON(sizeof(u32) != sizeof(value));
#endif
case DW_EH_PE_native:
- if (end < (const void *)(ptr.pul + 1))
+ if (end < (const void *)(ptr.pul + 1)) {
+ dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end);
return 0;
+ }
value = get_unaligned(ptr.pul++);
break;
case DW_EH_PE_leb128:
@@ -534,10 +563,14 @@ static unsigned long read_pointer(const u8 **pLoc,
value = ptrType & DW_EH_PE_signed
? get_sleb128(&ptr.p8, end)
: get_uleb128(&ptr.p8, end);
- if ((const void *)ptr.p8 > end)
+ if ((const void *)ptr.p8 > end) {
+ dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end);
return 0;
+ }
break;
default:
+ dprintk(2, "Cannot decode pointer type %02X (%p,%p).",
+ ptrType, ptr.p8, end);
return 0;
}
switch(ptrType & DW_EH_PE_ADJUST) {
@@ -546,12 +579,33 @@ static unsigned long read_pointer(const u8 **pLoc,
case DW_EH_PE_pcrel:
value += (unsigned long)*pLoc;
break;
+ case DW_EH_PE_textrel:
+ if (likely(text_base)) {
+ value += text_base;
+ break;
+ }
+ dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.",
+ ptrType, *pLoc, end);
+ return 0;
+ case DW_EH_PE_datarel:
+ if (likely(data_base)) {
+ value += data_base;
+ break;
+ }
+ dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.",
+ ptrType, *pLoc, end);
+ return 0;
default:
+ dprintk(2, "Cannot adjust pointer type %02X (%p,%p).",
+ ptrType, *pLoc, end);
return 0;
}
if ((ptrType & DW_EH_PE_indirect)
- && __get_user(value, (unsigned long *)value))
+ && probe_kernel_address((unsigned long *)value, value)) {
+ dprintk(1, "Cannot read indirect value %lx (%p,%p).",
+ value, *pLoc, end);
return 0;
+ }
*pLoc = ptr.p8;
return value;
@@ -594,7 +648,8 @@ static signed fde_pointer_type(const u32 *cie)
case 'P': {
signed ptrType = *ptr++;
- if (!read_pointer(&ptr, end, ptrType) || ptr > end)
+ if (!read_pointer(&ptr, end, ptrType, 0, 0)
+ || ptr > end)
return -1;
}
break;
@@ -654,7 +709,8 @@ static int processCFI(const u8 *start,
case DW_CFA_nop:
break;
case DW_CFA_set_loc:
- if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
+ state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0);
+ if (state->loc == 0)
result = 0;
break;
case DW_CFA_advance_loc1:
@@ -700,8 +756,10 @@ static int processCFI(const u8 *start,
state->label = NULL;
return 1;
}
- if (state->stackDepth >= MAX_STACK_DEPTH)
+ if (state->stackDepth >= MAX_STACK_DEPTH) {
+ dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end);
return 0;
+ }
state->stack[state->stackDepth++] = ptr.p8;
break;
case DW_CFA_restore_state:
@@ -716,8 +774,10 @@ static int processCFI(const u8 *start,
result = processCFI(start, end, 0, ptrType, state);
state->loc = loc;
state->label = label;
- } else
+ } else {
+ dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end);
return 0;
+ }
break;
case DW_CFA_def_cfa:
state->cfa.reg = get_uleb128(&ptr.p8, end);
@@ -749,6 +809,7 @@ static int processCFI(const u8 *start,
break;
case DW_CFA_GNU_window_save:
default:
+ dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end);
result = 0;
break;
}
@@ -764,12 +825,17 @@ static int processCFI(const u8 *start,
set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
break;
}
- if (ptr.p8 > end)
+ if (ptr.p8 > end) {
+ dprintk(1, "Data overrun (%p,%p).", ptr.p8, end);
result = 0;
+ }
if (result && targetLoc != 0 && targetLoc < state->loc)
return 1;
}
+ if (result && ptr.p8 < end)
+ dprintk(1, "Data underrun (%p,%p).", ptr.p8, end);
+
return result
&& ptr.p8 == end
&& (targetLoc == 0
@@ -786,7 +852,7 @@ int unwind(struct unwind_frame_info *frame)
#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
const u32 *fde = NULL, *cie = NULL;
const u8 *ptr = NULL, *end = NULL;
- unsigned long pc = UNW_PC(frame) - frame->call_frame;
+ unsigned long pc = UNW_PC(frame) - frame->call_frame, sp;
unsigned long startLoc = 0, endLoc = 0, cfa;
unsigned i;
signed ptrType = -1;
@@ -813,9 +879,9 @@ int unwind(struct unwind_frame_info *frame)
ptr = hdr + 4;
end = hdr + table->hdrsz;
if (tableSize
- && read_pointer(&ptr, end, hdr[1])
+ && read_pointer(&ptr, end, hdr[1], 0, 0)
== (unsigned long)table->address
- && (i = read_pointer(&ptr, end, hdr[2])) > 0
+ && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0
&& i == (end - ptr) / (2 * tableSize)
&& !((end - ptr) % (2 * tableSize))) {
do {
@@ -823,7 +889,8 @@ int unwind(struct unwind_frame_info *frame)
startLoc = read_pointer(&cur,
cur + tableSize,
- hdr[3]);
+ hdr[3], 0,
+ (unsigned long)hdr);
if (pc < startLoc)
i /= 2;
else {
@@ -834,13 +901,17 @@ int unwind(struct unwind_frame_info *frame)
if (i == 1
&& (startLoc = read_pointer(&ptr,
ptr + tableSize,
- hdr[3])) != 0
+ hdr[3], 0,
+ (unsigned long)hdr)) != 0
&& pc >= startLoc)
fde = (void *)read_pointer(&ptr,
ptr + tableSize,
- hdr[3]);
+ hdr[3], 0,
+ (unsigned long)hdr);
}
}
+ if(hdr && !fde)
+ dprintk(3, "Binary lookup for %lx failed.", pc);
if (fde != NULL) {
cie = cie_for_fde(fde, table);
@@ -851,17 +922,19 @@ int unwind(struct unwind_frame_info *frame)
&& (ptrType = fde_pointer_type(cie)) >= 0
&& read_pointer(&ptr,
(const u8 *)(fde + 1) + *fde,
- ptrType) == startLoc) {
+ ptrType, 0, 0) == startLoc) {
if (!(ptrType & DW_EH_PE_indirect))
ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
endLoc = startLoc
+ read_pointer(&ptr,
(const u8 *)(fde + 1) + *fde,
- ptrType);
+ ptrType, 0, 0);
if(pc >= endLoc)
fde = NULL;
} else
fde = NULL;
+ if(!fde)
+ dprintk(1, "Binary lookup result for %lx discarded.", pc);
}
if (fde == NULL) {
for (fde = table->address, tableSize = table->size;
@@ -881,7 +954,7 @@ int unwind(struct unwind_frame_info *frame)
ptr = (const u8 *)(fde + 2);
startLoc = read_pointer(&ptr,
(const u8 *)(fde + 1) + *fde,
- ptrType);
+ ptrType, 0, 0);
if (!startLoc)
continue;
if (!(ptrType & DW_EH_PE_indirect))
@@ -889,10 +962,12 @@ int unwind(struct unwind_frame_info *frame)
endLoc = startLoc
+ read_pointer(&ptr,
(const u8 *)(fde + 1) + *fde,
- ptrType);
+ ptrType, 0, 0);
if (pc >= startLoc && pc < endLoc)
break;
}
+ if(!fde)
+ dprintk(3, "Linear lookup for %lx failed.", pc);
}
}
if (cie != NULL) {
@@ -926,6 +1001,8 @@ int unwind(struct unwind_frame_info *frame)
if (ptr >= end || *ptr)
cie = NULL;
}
+ if(!cie)
+ dprintk(1, "CIE unusable (%p,%p).", ptr, end);
++ptr;
}
if (cie != NULL) {
@@ -935,17 +1012,27 @@ int unwind(struct unwind_frame_info *frame)
state.dataAlign = get_sleb128(&ptr, end);
if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
cie = NULL;
- else {
+ else if (UNW_PC(frame) % state.codeAlign
+ || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
+ dprintk(1, "Input pointer(s) misaligned (%lx,%lx).",
+ UNW_PC(frame), UNW_SP(frame));
+ return -EPERM;
+ } else {
retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
/* skip augmentation */
- if (((const char *)(cie + 2))[1] == 'z')
- ptr += get_uleb128(&ptr, end);
+ if (((const char *)(cie + 2))[1] == 'z') {
+ uleb128_t augSize = get_uleb128(&ptr, end);
+
+ ptr += augSize;
+ }
if (ptr > end
|| retAddrReg >= ARRAY_SIZE(reg_info)
|| REG_INVALID(retAddrReg)
|| reg_info[retAddrReg].width != sizeof(unsigned long))
cie = NULL;
}
+ if(!cie)
+ dprintk(1, "CIE validation failed (%p,%p).", ptr, end);
}
if (cie != NULL) {
state.cieStart = ptr;
@@ -959,13 +1046,15 @@ int unwind(struct unwind_frame_info *frame)
if ((ptr += augSize) > end)
fde = NULL;
}
+ if(!fde)
+ dprintk(1, "FDE validation failed (%p,%p).", ptr, end);
}
if (cie == NULL || fde == NULL) {
#ifdef CONFIG_FRAME_POINTER
unsigned long top, bottom;
-#endif
-#ifdef CONFIG_FRAME_POINTER
+ if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long))
+ return -EPERM;
top = STACK_TOP(frame->task);
bottom = STACK_BOTTOM(frame->task);
# if FRAME_RETADDR_OFFSET < 0
@@ -981,18 +1070,19 @@ int unwind(struct unwind_frame_info *frame)
& (sizeof(unsigned long) - 1))) {
unsigned long link;
- if (!__get_user(link,
+ if (!probe_kernel_address(
(unsigned long *)(UNW_FP(frame)
- + FRAME_LINK_OFFSET))
+ + FRAME_LINK_OFFSET),
+ link)
# if FRAME_RETADDR_OFFSET < 0
&& link > bottom && link < UNW_FP(frame)
# else
&& link > UNW_FP(frame) && link < bottom
# endif
&& !(link & (sizeof(link) - 1))
- && !__get_user(UNW_PC(frame),
+ && !probe_kernel_address(
(unsigned long *)(UNW_FP(frame)
- + FRAME_RETADDR_OFFSET))) {
+ + FRAME_RETADDR_OFFSET), UNW_PC(frame))) {
UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
# if FRAME_RETADDR_OFFSET < 0
-
@@ -1015,8 +1105,11 @@ int unwind(struct unwind_frame_info *frame)
|| state.regs[retAddrReg].where == Nowhere
|| state.cfa.reg >= ARRAY_SIZE(reg_info)
|| reg_info[state.cfa.reg].width != sizeof(unsigned long)
- || state.cfa.offs % sizeof(unsigned long))
+ || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long)
+ || state.cfa.offs % sizeof(unsigned long)) {
+ dprintk(1, "Unusable unwind info (%p,%p).", ptr, end);
return -EIO;
+ }
/* update frame */
#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
if(frame->call_frame
@@ -1035,10 +1128,14 @@ int unwind(struct unwind_frame_info *frame)
#else
# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
#endif
+ pc = UNW_PC(frame);
+ sp = UNW_SP(frame);
for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
if (REG_INVALID(i)) {
if (state.regs[i].where == Nowhere)
continue;
+ dprintk(1, "Cannot restore register %u (%d).",
+ i, state.regs[i].where);
return -EIO;
}
switch(state.regs[i].where) {
@@ -1047,8 +1144,11 @@ int unwind(struct unwind_frame_info *frame)
case Register:
if (state.regs[i].value >= ARRAY_SIZE(reg_info)
|| REG_INVALID(state.regs[i].value)
- || reg_info[i].width > reg_info[state.regs[i].value].width)
+ || reg_info[i].width > reg_info[state.regs[i].value].width) {
+ dprintk(1, "Cannot restore register %u from register %lu.",
+ i, state.regs[i].value);
return -EIO;
+ }
switch(reg_info[state.regs[i].value].width) {
#define CASE(n) \
case sizeof(u##n): \
@@ -1058,6 +1158,9 @@ int unwind(struct unwind_frame_info *frame)
CASES;
#undef CASE
default:
+ dprintk(1, "Unsupported register size %u (%lu).",
+ reg_info[state.regs[i].value].width,
+ state.regs[i].value);
return -EIO;
}
break;
@@ -1082,12 +1185,17 @@ int unwind(struct unwind_frame_info *frame)
CASES;
#undef CASE
default:
+ dprintk(1, "Unsupported register size %u (%u).",
+ reg_info[i].width, i);
return -EIO;
}
break;
case Value:
- if (reg_info[i].width != sizeof(unsigned long))
+ if (reg_info[i].width != sizeof(unsigned long)) {
+ dprintk(1, "Unsupported value size %u (%u).",
+ reg_info[i].width, i);
return -EIO;
+ }
FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
* state.dataAlign;
break;
@@ -1099,15 +1207,20 @@ int unwind(struct unwind_frame_info *frame)
% sizeof(unsigned long)
|| addr < startLoc
|| addr + sizeof(unsigned long) < addr
- || addr + sizeof(unsigned long) > endLoc)
+ || addr + sizeof(unsigned long) > endLoc) {
+ dprintk(1, "Bad memory location %lx (%lx).",
+ addr, state.regs[i].value);
return -EIO;
+ }
switch(reg_info[i].width) {
#define CASE(n) case sizeof(u##n): \
- __get_user(FRAME_REG(i, u##n), (u##n *)addr); \
+ probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \
break
CASES;
#undef CASE
default:
+ dprintk(1, "Unsupported memory size %u (%u).",
+ reg_info[i].width, i);
return -EIO;
}
}
@@ -1115,6 +1228,17 @@ int unwind(struct unwind_frame_info *frame)
}
}
+ if (UNW_PC(frame) % state.codeAlign
+ || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
+ dprintk(1, "Output pointer(s) misaligned (%lx,%lx).",
+ UNW_PC(frame), UNW_SP(frame));
+ return -EIO;
+ }
+ if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) {
+ dprintk(1, "No progress (%lx,%lx).", pc, sp);
+ return -EIO;
+ }
+
return 0;
#undef CASES
#undef FRAME_REG
diff --git a/kernel/user.c b/kernel/user.c
index 6408c042429..4869563080e 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,7 +26,7 @@
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
-static kmem_cache_t *uid_cachep;
+static struct kmem_cache *uid_cachep;
static struct list_head uidhash_table[UIDHASH_SZ];
/*
@@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid)
if (!up) {
struct user_struct *new;
- new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
+ new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
if (!new)
return NULL;
new->uid = uid;
@@ -187,6 +187,17 @@ void switch_uid(struct user_struct *new_user)
atomic_dec(&old_user->processes);
switch_uid_keyring(new_user);
current->user = new_user;
+
+ /*
+ * We need to synchronize with __sigqueue_alloc()
+ * doing a get_uid(p->user).. If that saw the old
+ * user value, we need to wait until it has exited
+ * its critical region before we can free the old
+ * structure.
+ */
+ smp_mb();
+ spin_unlock_wait(&current->sighand->siglock);
+
free_uid(old_user);
suid_keys(current);
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 17c2f03d2c2..db49886bfae 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,6 +29,9 @@
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
+#include <linux/freezer.h>
+#include <linux/kallsyms.h>
+#include <linux/debug_locks.h>
/*
* The per-CPU workqueue (if single thread, we always use the first
@@ -55,6 +58,8 @@ struct cpu_workqueue_struct {
struct task_struct *thread;
int run_depth; /* Detect run_workqueue() recursion depth */
+
+ int freezeable; /* Freeze the thread during suspend */
} ____cacheline_aligned;
/*
@@ -80,6 +85,99 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
return list_empty(&wq->list);
}
+/*
+ * Set the workqueue on which a work item is to be run
+ * - Must *only* be called if the pending flag is set
+ */
+static inline void set_wq_data(struct work_struct *work, void *wq)
+{
+ unsigned long new;
+
+ BUG_ON(!work_pending(work));
+
+ new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
+ new |= work->management & WORK_STRUCT_FLAG_MASK;
+ work->management = new;
+}
+
+static inline void *get_wq_data(struct work_struct *work)
+{
+ return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK);
+}
+
+static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
+{
+ int ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cwq->lock, flags);
+ /*
+ * We need to re-validate the work info after we've gotten
+ * the cpu_workqueue lock. We can run the work now iff:
+ *
+ * - the wq_data still matches the cpu_workqueue_struct
+ * - AND the work is still marked pending
+ * - AND the work is still on a list (which will be this
+ * workqueue_struct list)
+ *
+ * All these conditions are important, because we
+ * need to protect against the work being run right
+ * now on another CPU (all but the last one might be
+ * true if it's currently running and has not been
+ * released yet, for example).
+ */
+ if (get_wq_data(work) == cwq
+ && work_pending(work)
+ && !list_empty(&work->entry)) {
+ work_func_t f = work->func;
+ list_del_init(&work->entry);
+ spin_unlock_irqrestore(&cwq->lock, flags);
+
+ if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
+ work_release(work);
+ f(work);
+
+ spin_lock_irqsave(&cwq->lock, flags);
+ cwq->remove_sequence++;
+ wake_up(&cwq->work_done);
+ ret = 1;
+ }
+ spin_unlock_irqrestore(&cwq->lock, flags);
+ return ret;
+}
+
+/**
+ * run_scheduled_work - run scheduled work synchronously
+ * @work: work to run
+ *
+ * This checks if the work was pending, and runs it
+ * synchronously if so. It returns a boolean to indicate
+ * whether it had any scheduled work to run or not.
+ *
+ * NOTE! This _only_ works for normal work_structs. You
+ * CANNOT use this for delayed work, because the wq data
+ * for delayed work will not point properly to the per-
+ * CPU workqueue struct, but will change!
+ */
+int fastcall run_scheduled_work(struct work_struct *work)
+{
+ for (;;) {
+ struct cpu_workqueue_struct *cwq;
+
+ if (!work_pending(work))
+ return 0;
+ if (list_empty(&work->entry))
+ return 0;
+ /* NOTE! This depends intimately on __queue_work! */
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return 0;
+ if (__run_work(cwq, work))
+ return 1;
+ }
+}
+EXPORT_SYMBOL(run_scheduled_work);
+
/* Preempt must be disabled. */
static void __queue_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work)
@@ -87,7 +185,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
unsigned long flags;
spin_lock_irqsave(&cwq->lock, flags);
- work->wq_data = cwq;
+ set_wq_data(work, cwq);
list_add_tail(&work->entry, &cwq->worklist);
cwq->insert_sequence++;
wake_up(&cwq->more_work);
@@ -108,7 +206,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
int ret = 0, cpu = get_cpu();
- if (!test_and_set_bit(0, &work->pending)) {
+ if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
if (unlikely(is_single_threaded(wq)))
cpu = singlethread_cpu;
BUG_ON(!list_empty(&work->entry));
@@ -122,38 +220,42 @@ EXPORT_SYMBOL_GPL(queue_work);
static void delayed_work_timer_fn(unsigned long __data)
{
- struct work_struct *work = (struct work_struct *)__data;
- struct workqueue_struct *wq = work->wq_data;
+ struct delayed_work *dwork = (struct delayed_work *)__data;
+ struct workqueue_struct *wq = get_wq_data(&dwork->work);
int cpu = smp_processor_id();
if (unlikely(is_single_threaded(wq)))
cpu = singlethread_cpu;
- __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+ __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
}
/**
* queue_delayed_work - queue work on a workqueue after delay
* @wq: workqueue to use
- * @work: work to queue
+ * @work: delayable work to queue
* @delay: number of jiffies to wait before queueing
*
* Returns 0 if @work was already on a queue, non-zero otherwise.
*/
int fastcall queue_delayed_work(struct workqueue_struct *wq,
- struct work_struct *work, unsigned long delay)
+ struct delayed_work *dwork, unsigned long delay)
{
int ret = 0;
- struct timer_list *timer = &work->timer;
+ struct timer_list *timer = &dwork->timer;
+ struct work_struct *work = &dwork->work;
+
+ if (delay == 0)
+ return queue_work(wq, work);
- if (!test_and_set_bit(0, &work->pending)) {
+ if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));
/* This stores wq for the moment, for the timer_fn */
- work->wq_data = wq;
+ set_wq_data(work, wq);
timer->expires = jiffies + delay;
- timer->data = (unsigned long)work;
+ timer->data = (unsigned long)dwork;
timer->function = delayed_work_timer_fn;
add_timer(timer);
ret = 1;
@@ -172,19 +274,20 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
* Returns 0 if @work was already on a queue, non-zero otherwise.
*/
int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
- struct work_struct *work, unsigned long delay)
+ struct delayed_work *dwork, unsigned long delay)
{
int ret = 0;
- struct timer_list *timer = &work->timer;
+ struct timer_list *timer = &dwork->timer;
+ struct work_struct *work = &dwork->work;
- if (!test_and_set_bit(0, &work->pending)) {
+ if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));
/* This stores wq for the moment, for the timer_fn */
- work->wq_data = wq;
+ set_wq_data(work, wq);
timer->expires = jiffies + delay;
- timer->data = (unsigned long)work;
+ timer->data = (unsigned long)dwork;
timer->function = delayed_work_timer_fn;
add_timer_on(timer, cpu);
ret = 1;
@@ -212,15 +315,26 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
while (!list_empty(&cwq->worklist)) {
struct work_struct *work = list_entry(cwq->worklist.next,
struct work_struct, entry);
- void (*f) (void *) = work->func;
- void *data = work->data;
+ work_func_t f = work->func;
list_del_init(cwq->worklist.next);
spin_unlock_irqrestore(&cwq->lock, flags);
- BUG_ON(work->wq_data != cwq);
- clear_bit(0, &work->pending);
- f(data);
+ BUG_ON(get_wq_data(work) != cwq);
+ if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
+ work_release(work);
+ f(work);
+
+ if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
+ printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
+ "%s/0x%08x/%d\n",
+ current->comm, preempt_count(),
+ current->pid);
+ printk(KERN_ERR " last function: ");
+ print_symbol("%s\n", (unsigned long)f);
+ debug_show_held_locks(current);
+ dump_stack();
+ }
spin_lock_irqsave(&cwq->lock, flags);
cwq->remove_sequence++;
@@ -237,7 +351,8 @@ static int worker_thread(void *__cwq)
struct k_sigaction sa;
sigset_t blocked;
- current->flags |= PF_NOFREEZE;
+ if (!cwq->freezeable)
+ current->flags |= PF_NOFREEZE;
set_user_nice(current, -5);
@@ -260,6 +375,9 @@ static int worker_thread(void *__cwq)
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
+ if (cwq->freezeable)
+ try_to_freeze();
+
add_wait_queue(&cwq->more_work, &wait);
if (list_empty(&cwq->worklist))
schedule();
@@ -336,7 +454,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
EXPORT_SYMBOL_GPL(flush_workqueue);
static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
- int cpu)
+ int cpu, int freezeable)
{
struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
struct task_struct *p;
@@ -346,6 +464,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
cwq->thread = NULL;
cwq->insert_sequence = 0;
cwq->remove_sequence = 0;
+ cwq->freezeable = freezeable;
INIT_LIST_HEAD(&cwq->worklist);
init_waitqueue_head(&cwq->more_work);
init_waitqueue_head(&cwq->work_done);
@@ -361,7 +480,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
}
struct workqueue_struct *__create_workqueue(const char *name,
- int singlethread)
+ int singlethread, int freezeable)
{
int cpu, destroy = 0;
struct workqueue_struct *wq;
@@ -381,7 +500,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
mutex_lock(&workqueue_mutex);
if (singlethread) {
INIT_LIST_HEAD(&wq->list);
- p = create_workqueue_thread(wq, singlethread_cpu);
+ p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
if (!p)
destroy = 1;
else
@@ -389,7 +508,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
} else {
list_add(&wq->list, &workqueues);
for_each_online_cpu(cpu) {
- p = create_workqueue_thread(wq, cpu);
+ p = create_workqueue_thread(wq, cpu, freezeable);
if (p) {
kthread_bind(p, cpu);
wake_up_process(p);
@@ -468,38 +587,37 @@ EXPORT_SYMBOL(schedule_work);
/**
* schedule_delayed_work - put work task in global workqueue after delay
- * @work: job to be done
- * @delay: number of jiffies to wait
+ * @dwork: job to be done
+ * @delay: number of jiffies to wait or 0 for immediate execution
*
* After waiting for a given time this puts a job in the kernel-global
* workqueue.
*/
-int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
+int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
{
- return queue_delayed_work(keventd_wq, work, delay);
+ return queue_delayed_work(keventd_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work);
/**
* schedule_delayed_work_on - queue work in global workqueue on CPU after delay
* @cpu: cpu to use
- * @work: job to be done
+ * @dwork: job to be done
* @delay: number of jiffies to wait
*
* After waiting for a given time this puts a job in the kernel-global
* workqueue on the specified CPU.
*/
int schedule_delayed_work_on(int cpu,
- struct work_struct *work, unsigned long delay)
+ struct delayed_work *dwork, unsigned long delay)
{
- return queue_delayed_work_on(cpu, keventd_wq, work, delay);
+ return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work_on);
/**
* schedule_on_each_cpu - call a function on each online CPU from keventd
* @func: the function to call
- * @info: a pointer to pass to func()
*
* Returns zero on success.
* Returns -ve errno on failure.
@@ -508,7 +626,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
*
* schedule_on_each_cpu() is very slow.
*/
-int schedule_on_each_cpu(void (*func)(void *info), void *info)
+int schedule_on_each_cpu(work_func_t func)
{
int cpu;
struct work_struct *works;
@@ -519,7 +637,7 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info)
mutex_lock(&workqueue_mutex);
for_each_online_cpu(cpu) {
- INIT_WORK(per_cpu_ptr(works, cpu), func, info);
+ INIT_WORK(per_cpu_ptr(works, cpu), func);
__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
per_cpu_ptr(works, cpu));
}
@@ -539,12 +657,12 @@ EXPORT_SYMBOL(flush_scheduled_work);
* cancel_rearming_delayed_workqueue - reliably kill off a delayed
* work whose handler rearms the delayed work.
* @wq: the controlling workqueue structure
- * @work: the delayed work struct
+ * @dwork: the delayed work struct
*/
void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
- struct work_struct *work)
+ struct delayed_work *dwork)
{
- while (!cancel_delayed_work(work))
+ while (!cancel_delayed_work(dwork))
flush_workqueue(wq);
}
EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
@@ -552,18 +670,17 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
/**
* cancel_rearming_delayed_work - reliably kill off a delayed keventd
* work whose handler rearms the delayed work.
- * @work: the delayed work struct
+ * @dwork: the delayed work struct
*/
-void cancel_rearming_delayed_work(struct work_struct *work)
+void cancel_rearming_delayed_work(struct delayed_work *dwork)
{
- cancel_rearming_delayed_workqueue(keventd_wq, work);
+ cancel_rearming_delayed_workqueue(keventd_wq, dwork);
}
EXPORT_SYMBOL(cancel_rearming_delayed_work);
/**
* execute_in_process_context - reliably execute the routine with user context
* @fn: the function to execute
- * @data: data to pass to the function
* @ew: guaranteed storage for the execute work structure (must
* be available when the work executes)
*
@@ -573,15 +690,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
* Returns: 0 - function was executed
* 1 - function was scheduled for execution
*/
-int execute_in_process_context(void (*fn)(void *data), void *data,
- struct execute_work *ew)
+int execute_in_process_context(work_func_t fn, struct execute_work *ew)
{
if (!in_interrupt()) {
- fn(data);
+ fn(&ew->work);
return 0;
}
- INIT_WORK(&ew->work, fn, data);
+ INIT_WORK(&ew->work, fn);
schedule_work(&ew->work);
return 1;
@@ -609,7 +725,6 @@ int current_is_keventd(void)
}
-#ifdef CONFIG_HOTPLUG_CPU
/* Take the work from this (downed) CPU. */
static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
{
@@ -642,7 +757,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
mutex_lock(&workqueue_mutex);
/* Create a new workqueue thread for it. */
list_for_each_entry(wq, &workqueues, list) {
- if (!create_workqueue_thread(wq, hotcpu)) {
+ if (!create_workqueue_thread(wq, hotcpu, 0)) {
printk("workqueue for %i failed\n", hotcpu);
return NOTIFY_BAD;
}
@@ -692,7 +807,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
-#endif
void init_workqueues(void)
{