From d110271e1f4140a9fb06d968b1afe9ca56a6064e Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Wed, 25 Mar 2009 15:11:36 -0600 Subject: sysfs: don't use global workqueue in sysfs_schedule_callback() A sysfs attribute using sysfs_schedule_callback() to commit suicide may end up calling device_unregister(), which will eventually call a driver's ->remove function. Drivers may call flush_scheduled_work() in their shutdown routines, in which case lockdep will complain with something like the following: ============================================= [ INFO: possible recursive locking detected ] 2.6.29-rc8-kk #1 --------------------------------------------- events/4/56 is trying to acquire lock: (events){--..}, at: [] flush_workqueue+0x0/0xa0 but task is already holding lock: (events){--..}, at: [] run_workqueue+0x108/0x230 other info that might help us debug this: 3 locks held by events/4/56: #0: (events){--..}, at: [] run_workqueue+0x108/0x230 #1: (&ss->work){--..}, at: [] run_workqueue+0x108/0x230 #2: (pci_remove_rescan_mutex){--..}, at: [] remove_callback+0x21/0x40 stack backtrace: Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1 Call Trace: [] validate_chain+0xb7d/0x1260 [] __lock_acquire+0x42e/0xa40 [] lock_acquire+0x58/0x80 [] ? flush_workqueue+0x0/0xa0 [] flush_workqueue+0x4d/0xa0 [] ? flush_workqueue+0x0/0xa0 [] flush_scheduled_work+0x10/0x20 [] e1000_remove+0x55/0xfe [e1000e] [] ? sysfs_schedule_callback_work+0x0/0x50 [] pci_device_remove+0x32/0x70 [] __device_release_driver+0x59/0x90 [] device_release_driver+0x2b/0x40 [] bus_remove_device+0xa6/0x120 [] device_del+0x12b/0x190 [] device_unregister+0x26/0x70 [] pci_stop_dev+0x49/0x60 [] pci_remove_bus_device+0x40/0xc0 [] remove_callback+0x29/0x40 [] sysfs_schedule_callback_work+0x1f/0x50 [] run_workqueue+0x15a/0x230 [] ? run_workqueue+0x108/0x230 [] worker_thread+0x9f/0x100 [] ? autoremove_wake_function+0x0/0x40 [] ? worker_thread+0x0/0x100 [] kthread+0x4d/0x80 [] child_rip+0xa/0x20 [] ? 
restore_args+0x0/0x30 [] ? kthread+0x0/0x80 [] ? child_rip+0x0/0x20 Although we know that the device_unregister path will never acquire a lock that a driver might try to acquire in its ->remove, in general we should never attempt to flush a workqueue from within the same workqueue, and lockdep rightly complains. So as long as sysfs attributes cannot commit suicide directly and we are stuck with this callback mechanism, put the sysfs callbacks on their own workqueue instead of the global one. This has the side benefit that if a suicidal sysfs attribute kicks off a long chain of ->remove callbacks, we no longer induce a long delay on the global queue. This also fixes a missing module_put in the error path introduced by sysfs-only-allow-one-scheduled-removal-callback-per-kobj.patch. We never destroy the workqueue, but I'm not sure that's a problem. Reported-by: Kenji Kaneshige Tested-by: Kenji Kaneshige Signed-off-by: Alex Chiang Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 289c43a4726..979e9379fb5 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -667,6 +667,7 @@ struct sysfs_schedule_callback_struct { struct work_struct work; }; +static struct workqueue_struct *sysfs_workqueue; static DEFINE_MUTEX(sysfs_workq_mutex); static LIST_HEAD(sysfs_workq); static void sysfs_schedule_callback_work(struct work_struct *work) @@ -715,11 +716,20 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), mutex_lock(&sysfs_workq_mutex); list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) if (ss->kobj == kobj) { + module_put(owner); mutex_unlock(&sysfs_workq_mutex); return -EAGAIN; } mutex_unlock(&sysfs_workq_mutex); + if (sysfs_workqueue == NULL) { + sysfs_workqueue = create_workqueue("sysfsd"); + if (sysfs_workqueue == NULL) { + module_put(owner); + return -ENOMEM; + } + } + ss = kmalloc(sizeof(*ss), 
GFP_KERNEL); if (!ss) { module_put(owner); @@ -735,7 +745,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), mutex_lock(&sysfs_workq_mutex); list_add_tail(&ss->workq_list, &sysfs_workq); mutex_unlock(&sysfs_workq_mutex); - schedule_work(&ss->work); + queue_work(sysfs_workqueue, &ss->work); return 0; } EXPORT_SYMBOL_GPL(sysfs_schedule_callback); -- cgit v1.2.3 From 1af3557abdef34ee036a6de4cb79e24468544b8d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 9 Apr 2009 13:53:22 +0900 Subject: sysfs: sysfs poll keep the poll rule of regular file. Currently, the following test program never finishes. % ruby -e ' Thread.new { sleep } File.read("/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies") ' strace exposes the reason. ... open("/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies", O_RDONLY|O_LARGEFILE) = 3 ioctl(3, SNDCTL_TMR_TIMEBASE or TCGETS, 0xbf9fa6b8) = -1 ENOTTY (Inappropriate ioctl for device) fstat64(3, {st_mode=S_IFREG|0444, st_size=4096, ...}) = 0 _llseek(3, 0, [0], SEEK_CUR) = 0 select(4, [3], NULL, NULL, NULL) = 1 (in [3]) read(3, "1400000 1300000 1200000 1100000 1"..., 4096) = 62 select(4, [3], NULL, NULL, NULL This happens because the Ruby (the scripting language) VM assumes that a select system call against a regular file does not block. It assumes so because SUSv3 says "Regular files shall always poll TRUE for reading and writing"; see http://www.opengroup.org/onlinepubs/009695399/functions/poll.html This seems to be a valid assumption. But sysfs_poll() does not keep this rule, although a sysfs file can always be read and written. This patch restores proper poll behavior to sysfs. Applications that poll /sys/block/md*/md/sync_action, and other applications that are sensitive to sysfs updates, can still use POLLERR and POLLPRI. 
Cc: Neil Brown Signed-off-by: KOSAKI Motohiro Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 979e9379fb5..b1606e07b7a 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -446,11 +446,11 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) if (buffer->event != atomic_read(&od->event)) goto trigger; - return 0; + return DEFAULT_POLLMASK; trigger: buffer->needs_read_fill = 1; - return POLLERR|POLLPRI; + return DEFAULT_POLLMASK|POLLERR|POLLPRI; } void sysfs_notify_dirent(struct sysfs_dirent *sd) -- cgit v1.2.3 From 31b07093c44a7a442394d44423e21d783f5523b8 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 9 Apr 2009 13:57:59 +0900 Subject: proc: mounts_poll() make consistent to mdstat_poll In the recent sysfs_poll discussion, Neil Brown pointed out that /proc/mounts should also be fixed. SUSv3 says "Regular files shall always poll TRUE for reading and writing"; see http://www.opengroup.org/onlinepubs/009695399/functions/poll.html Therefore, mounts_poll()'s default result should be "POLLIN | POLLRDNORM", which means the file is always readable. In addition, the event trigger should use "POLLERR | POLLPRI" instead of POLLERR alone. This makes it consistent with mdstat_poll() and sysfs_poll(), and select(2) can handle POLLPRI easily. 
Reported-by: Neil Brown Signed-off-by: KOSAKI Motohiro Cc: Ram Pai Cc: Miklos Szeredi Cc: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index f71559784bf..aa763ab0077 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -648,14 +648,14 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) { struct proc_mounts *p = file->private_data; struct mnt_namespace *ns = p->ns; - unsigned res = 0; + unsigned res = POLLIN | POLLRDNORM; poll_wait(file, &ns->poll, wait); spin_lock(&vfsmount_lock); if (p->event != ns->event) { p->event = ns->event; - res = POLLERR; + res |= POLLERR | POLLPRI; } spin_unlock(&vfsmount_lock); -- cgit v1.2.3