From 710027a48ede75428cc68eaa8ae2269b1e356e2c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Aug 2008 20:13:11 +0200 Subject: Add some block/ source files to the kernel-api docbook. Fix kernel-doc notation in them as needed. Fix changed function parameter names. Fix typos/spellos. In comments, change REQ_SPECIAL to REQ_TYPE_SPECIAL and REQ_BLOCK_PC to REQ_TYPE_BLOCK_PC. Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/genhd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index e0ce23ac2ec..c114a43052d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -211,10 +211,11 @@ void unlink_gendisk(struct gendisk *disk) /** * get_gendisk - get partitioning information for a given device - * @dev: device to get partitioning information for + * @devt: device to get partitioning information for + * @part: returned partition index * * This function gets the structure containing partitioning - * information for the given device @dev. + * information for the given device @devt. */ struct gendisk *get_gendisk(dev_t devt, int *part) { -- cgit v1.2.3 From ac65ece4eee10b03ac29ee925cadc179dc810bab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:30:12 +0900 Subject: block: fix partition info printouts Recent block_class iteration updates 5c6f35c5..27f3025 broke partition info printouts. * printk_all_partitions(): Partition print out stops when it meets a partition hole. Partition printing inner loop should continue instead of exiting on empty partition slot. * /proc/partitions and /proc/diskstats: If all information can't be read in single read(), the information is truncated. This is because find_start() doesn't actually update the counter containing the initial seek. It runs to the end and ends up always reporting EOF on the second read. This patch fixes both problems. 
Signed-off-by: Tejun Heo Cc: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- block/genhd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index c114a43052d..0be95135c40 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -236,7 +236,7 @@ static int printk_partition(struct device *dev, void *data) int n; if (dev->type != &disk_type) - goto exit; + return 0; sgp = dev_to_disk(dev); /* @@ -244,7 +244,7 @@ static int printk_partition(struct device *dev, void *data) */ if (get_capacity(sgp) == 0 || (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) - goto exit; + return 0; /* * Note, unlike /proc/partitions, I am showing the numbers in @@ -264,15 +264,15 @@ static int printk_partition(struct device *dev, void *data) /* now show the partitions */ for (n = 0; n < sgp->minors - 1; ++n) { if (sgp->part[n] == NULL) - goto exit; + continue; if (sgp->part[n]->nr_sects == 0) - goto exit; + continue; printk(" %02x%02x %10llu %s\n", sgp->major, n + 1 + sgp->first_minor, (unsigned long long)sgp->part[n]->nr_sects >> 1, disk_name(sgp, n + 1, buf)); } -exit: + return 0; } -- cgit v1.2.3 From 2ac3cee5298a247b2774f3319b28a05f588c3f0e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 08:53:37 +0200 Subject: block: don't grab block_class_lock unnecessarily block_class_lock protects major_names array and bdev_map and doesn't have anything to do with block class devices. Don't grab them while iterating over block class devices. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 0be95135c40..9eb8b3e212c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -283,9 +283,7 @@ static int printk_partition(struct device *dev, void *data) */ void __init printk_all_partitions(void) { - mutex_lock(&block_class_lock); class_for_each_device(&block_class, NULL, NULL, printk_partition); - mutex_unlock(&block_class_lock); } #ifdef CONFIG_PROC_FS @@ -305,17 +303,15 @@ static int find_start(struct device *dev, void *data) static void *part_start(struct seq_file *part, loff_t *pos) { struct device *dev; - loff_t k = *pos; + loff_t n = *pos; - if (!k) + if (!n) part->private = (void *)1LU; /* tell show to print header */ - mutex_lock(&block_class_lock); - dev = class_find_device(&block_class, NULL, &k, find_start); - if (dev) { - put_device(dev); + dev = class_find_device(&block_class, NULL, &n, find_start); + if (dev) return dev_to_disk(dev); - } + return NULL; } @@ -341,7 +337,6 @@ static void *part_next(struct seq_file *part, void *v, loff_t *pos) static void part_stop(struct seq_file *part, void *v) { - mutex_unlock(&block_class_lock); } static int show_partition(struct seq_file *part, void *v) @@ -583,14 +578,12 @@ static struct device_type disk_type = { static void *diskstats_start(struct seq_file *part, loff_t *pos) { struct device *dev; - loff_t k = *pos; + loff_t n = *pos; - mutex_lock(&block_class_lock); - dev = class_find_device(&block_class, NULL, &k, find_start); - if (dev) { - put_device(dev); + dev = class_find_device(&block_class, NULL, &n, find_start); + if (dev) return dev_to_disk(dev); - } + return NULL; } @@ -610,7 +603,6 @@ static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos) static void diskstats_stop(struct seq_file *part, void *v) { - mutex_unlock(&block_class_lock); } static int 
diskstats_show(struct seq_file *s, void *v) @@ -729,7 +721,6 @@ dev_t blk_lookup_devt(const char *name, int part) dev_t devt = MKDEV(0, 0); struct find_block find; - mutex_lock(&block_class_lock); find.name = name; find.part = part; dev = class_find_device(&block_class, NULL, &find, match_id); @@ -738,7 +729,6 @@ dev_t blk_lookup_devt(const char *name, int part) devt = MKDEV(MAJOR(dev->devt), MINOR(dev->devt) + part); } - mutex_unlock(&block_class_lock); return devt; } -- cgit v1.2.3 From def4e38ddda9bef20b69bfa939195c2f79da7979 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 08:57:12 +0200 Subject: block: use class_dev_iterator instead of class_for_each_device() Recent block_class iteration updates 5c6f35c5..27f3025 converted all class device iteration to class_for_each_device() and class_find_device(), which are correct but pain in the ass to use. This patch converts them to newly introduced class_dev_iterator so that they can use more natural control structures instead of separate callbacks and struct to pass parameters to them. This results in smaller and easier code. This patch also restores the original behavior of not printing header in /proc/partitions if there's no partition to print. This is trivial but still user-visible behavior. Signed-off-by: Tejun Heo Cc: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- block/genhd.c | 252 ++++++++++++++++++++++------------------------------------ 1 file changed, 97 insertions(+), 155 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 9eb8b3e212c..8b9a9ff1a84 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -225,57 +225,6 @@ struct gendisk *get_gendisk(dev_t devt, int *part) return kobj ? 
dev_to_disk(dev) : NULL; } -/* - * print a partitions - intended for places where the root filesystem can't be - * mounted and thus to give the victim some idea of what went wrong - */ -static int printk_partition(struct device *dev, void *data) -{ - struct gendisk *sgp; - char buf[BDEVNAME_SIZE]; - int n; - - if (dev->type != &disk_type) - return 0; - - sgp = dev_to_disk(dev); - /* - * Don't show empty devices or things that have been surpressed - */ - if (get_capacity(sgp) == 0 || - (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) - return 0; - - /* - * Note, unlike /proc/partitions, I am showing the numbers in - * hex - the same format as the root= option takes. - */ - printk("%02x%02x %10llu %s", - sgp->major, sgp->first_minor, - (unsigned long long)get_capacity(sgp) >> 1, - disk_name(sgp, 0, buf)); - if (sgp->driverfs_dev != NULL && - sgp->driverfs_dev->driver != NULL) - printk(" driver: %s\n", - sgp->driverfs_dev->driver->name); - else - printk(" (driver?)\n"); - - /* now show the partitions */ - for (n = 0; n < sgp->minors - 1; ++n) { - if (sgp->part[n] == NULL) - continue; - if (sgp->part[n]->nr_sects == 0) - continue; - printk(" %02x%02x %10llu %s\n", - sgp->major, n + 1 + sgp->first_minor, - (unsigned long long)sgp->part[n]->nr_sects >> 1, - disk_name(sgp, n + 1, buf)); - } - - return 0; -} - /* * print a full list of all partitions - intended for places where the root * filesystem can't be mounted and thus to give the victim some idea of what @@ -283,60 +232,108 @@ static int printk_partition(struct device *dev, void *data) */ void __init printk_all_partitions(void) { - class_for_each_device(&block_class, NULL, NULL, printk_partition); + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + char buf[BDEVNAME_SIZE]; + int n; + + /* + * Don't show empty devices or things that have been + * surpressed + */ + 
if (get_capacity(disk) == 0 || + (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) + continue; + + /* + * Note, unlike /proc/partitions, I am showing the + * numbers in hex - the same format as the root= + * option takes. + */ + printk("%02x%02x %10llu %s", + disk->major, disk->first_minor, + (unsigned long long)get_capacity(disk) >> 1, + disk_name(disk, 0, buf)); + if (disk->driverfs_dev != NULL && + disk->driverfs_dev->driver != NULL) + printk(" driver: %s\n", + disk->driverfs_dev->driver->name); + else + printk(" (driver?)\n"); + + /* now show the partitions */ + for (n = 0; n < disk->minors - 1; ++n) { + if (disk->part[n] == NULL) + continue; + if (disk->part[n]->nr_sects == 0) + continue; + printk(" %02x%02x %10llu %s\n", + disk->major, n + 1 + disk->first_minor, + (unsigned long long)disk->part[n]->nr_sects >> 1, + disk_name(disk, n + 1, buf)); + } + } + class_dev_iter_exit(&iter); } #ifdef CONFIG_PROC_FS /* iterator */ -static int find_start(struct device *dev, void *data) +static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) { - loff_t *k = data; + loff_t skip = *pos; + struct class_dev_iter *iter; + struct device *dev; - if (dev->type != &disk_type) - return 0; - if (!*k) - return 1; - (*k)--; - return 0; + iter = kmalloc(GFP_KERNEL, sizeof(*iter)); + if (!iter) + return ERR_PTR(-ENOMEM); + + seqf->private = iter; + class_dev_iter_init(iter, &block_class, NULL, &disk_type); + do { + dev = class_dev_iter_next(iter); + if (!dev) + return NULL; + } while (skip--); + + return dev_to_disk(dev); } -static void *part_start(struct seq_file *part, loff_t *pos) +static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos) { struct device *dev; - loff_t n = *pos; - - if (!n) - part->private = (void *)1LU; /* tell show to print header */ - dev = class_find_device(&block_class, NULL, &n, find_start); + (*pos)++; + dev = class_dev_iter_next(seqf->private); if (dev) return dev_to_disk(dev); return NULL; } -static int find_next(struct device *dev, 
void *data) +static void disk_seqf_stop(struct seq_file *seqf, void *v) { - if (dev->type == &disk_type) - return 1; - return 0; -} + struct class_dev_iter *iter = seqf->private; -static void *part_next(struct seq_file *part, void *v, loff_t *pos) -{ - struct gendisk *gp = v; - struct device *dev; - ++*pos; - dev = class_find_device(&block_class, &gp->dev, NULL, find_next); - if (dev) { - put_device(dev); - return dev_to_disk(dev); + /* stop is called even after start failed :-( */ + if (iter) { + class_dev_iter_exit(iter); + kfree(iter); } - return NULL; } -static void part_stop(struct seq_file *part, void *v) +static void *show_partition_start(struct seq_file *seqf, loff_t *pos) { + static void *p; + + p = disk_seqf_start(seqf, pos); + if (!IS_ERR(p) && p) + seq_puts(seqf, "major minor #blocks name\n\n"); + return p; } static int show_partition(struct seq_file *part, void *v) @@ -383,9 +380,9 @@ static int show_partition(struct seq_file *part, void *v) } const struct seq_operations partitions_op = { - .start = part_start, - .next = part_next, - .stop = part_stop, + .start = show_partition_start, + .next = disk_seqf_next, + .stop = disk_seqf_stop, .show = show_partition }; #endif @@ -567,44 +564,6 @@ static struct device_type disk_type = { }; #ifdef CONFIG_PROC_FS -/* - * aggregate disk stat collector. Uses the same stats that the sysfs - * entries do, above, but makes them available through one seq_file. - * - * The output looks suspiciously like /proc/partitions with a bunch of - * extra fields. 
- */ - -static void *diskstats_start(struct seq_file *part, loff_t *pos) -{ - struct device *dev; - loff_t n = *pos; - - dev = class_find_device(&block_class, NULL, &n, find_start); - if (dev) - return dev_to_disk(dev); - - return NULL; -} - -static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos) -{ - struct gendisk *gp = v; - struct device *dev; - - ++*pos; - dev = class_find_device(&block_class, &gp->dev, NULL, find_next); - if (dev) { - put_device(dev); - return dev_to_disk(dev); - } - return NULL; -} - -static void diskstats_stop(struct seq_file *part, void *v) -{ -} - static int diskstats_show(struct seq_file *s, void *v) { struct gendisk *gp = v; @@ -666,9 +625,9 @@ static int diskstats_show(struct seq_file *s, void *v) } const struct seq_operations diskstats_op = { - .start = diskstats_start, - .next = diskstats_next, - .stop = diskstats_stop, + .start = disk_seqf_start, + .next = disk_seqf_next, + .stop = disk_seqf_stop, .show = diskstats_show }; #endif /* CONFIG_PROC_FS */ @@ -696,40 +655,23 @@ void genhd_media_change_notify(struct gendisk *disk) EXPORT_SYMBOL_GPL(genhd_media_change_notify); #endif /* 0 */ -struct find_block { - const char *name; - int part; -}; - -static int match_id(struct device *dev, void *data) +dev_t blk_lookup_devt(const char *name, int part) { - struct find_block *find = data; + dev_t devt = MKDEV(0, 0); + struct class_dev_iter iter; + struct device *dev; - if (dev->type != &disk_type) - return 0; - if (strcmp(dev->bus_id, find->name) == 0) { + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - if (find->part < disk->minors) - return 1; - } - return 0; -} -dev_t blk_lookup_devt(const char *name, int part) -{ - struct device *dev; - dev_t devt = MKDEV(0, 0); - struct find_block find; - - find.name = name; - find.part = part; - dev = class_find_device(&block_class, NULL, &find, match_id); - if (dev) { - 
put_device(dev); - devt = MKDEV(MAJOR(dev->devt), - MINOR(dev->devt) + part); + if (!strcmp(dev->bus_id, name) && part < disk->minors) { + devt = MKDEV(MAJOR(dev->devt), + MINOR(dev->devt) + part); + break; + } } - + class_dev_iter_exit(&iter); return devt; } EXPORT_SYMBOL(blk_lookup_devt); -- cgit v1.2.3 From 310a2c1012934f590192377f65940cad4aa72b15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:17 +0900 Subject: block: misc updates This patch makes the following misc updates in preparation for disk->part dereference fix and extended block devt support. * implement part_to_disk() * fix comment about gendisk->part indexing * rename get_part() to disk_map_sector() * don't use n which is always zero while printing disk information in diskstats_show() Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 8b9a9ff1a84..11038fbc75e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -568,7 +568,7 @@ static int diskstats_show(struct seq_file *s, void *v) { struct gendisk *gp = v; char buf[BDEVNAME_SIZE]; - int n = 0; + int n; /* if (&gp->dev.kobj.entry == block_class.devices.next) @@ -582,7 +582,7 @@ static int diskstats_show(struct seq_file *s, void *v) disk_round_stats(gp); preempt_enable(); seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", - gp->major, n + gp->first_minor, disk_name(gp, n, buf), + gp->major, gp->first_minor, disk_name(gp, 0, buf), disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), (unsigned long long)disk_stat_read(gp, sectors[0]), jiffies_to_msecs(disk_stat_read(gp, ticks[0])), -- cgit v1.2.3 From cf771cb5a7b716f3f9e532fd42a1e3a0a75adec5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:01:09 +0200 Subject: block: make variable and argument names more consistent In hd_struct, @partno is used to denote partition number and a number of other 
places use @part to denote hd_struct. Functions use @part and @index instead. This causes confusion and makes it difficult to use consistent variable names for hd_struct. Always use @partno if a variable represents partition number. Also, print out functions use @f or @part for seq_file argument. Use @seqf uniformly instead. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 54 +++++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 11038fbc75e..dc9ad4c171e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -43,14 +43,14 @@ static inline int major_to_index(int major) } #ifdef CONFIG_PROC_FS -void blkdev_show(struct seq_file *f, off_t offset) +void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; if (offset < BLKDEV_MAJOR_HASH_SIZE) { mutex_lock(&block_class_lock); for (dp = major_names[offset]; dp; dp = dp->next) - seq_printf(f, "%3d %s\n", dp->major, dp->name); + seq_printf(seqf, "%3d %s\n", dp->major, dp->name); mutex_unlock(&block_class_lock); } } @@ -157,7 +157,7 @@ void blk_unregister_region(dev_t devt, unsigned long range) EXPORT_SYMBOL(blk_unregister_region); -static struct kobject *exact_match(dev_t devt, int *part, void *data) +static struct kobject *exact_match(dev_t devt, int *partno, void *data) { struct gendisk *p = data; @@ -217,9 +217,9 @@ void unlink_gendisk(struct gendisk *disk) * This function gets the structure containing partitioning * information for the given device @devt. */ -struct gendisk *get_gendisk(dev_t devt, int *part) +struct gendisk *get_gendisk(dev_t devt, int *partno) { - struct kobject *kobj = kobj_lookup(bdev_map, devt, part); + struct kobject *kobj = kobj_lookup(bdev_map, devt, partno); struct device *dev = kobj_to_dev(kobj); return kobj ? 
dev_to_disk(dev) : NULL; @@ -336,23 +336,12 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) return p; } -static int show_partition(struct seq_file *part, void *v) +static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; int n; char buf[BDEVNAME_SIZE]; - /* - * Print header if start told us to do. This is to preserve - * the original behavior of not printing header if no - * partition exists. This hackery will be removed later with - * class iteration clean up. - */ - if (part->private) { - seq_puts(part, "major minor #blocks name\n\n"); - part->private = NULL; - } - /* Don't show non-partitionable removeable devices or empty devices */ if (!get_capacity(sgp) || (sgp->minors == 1 && (sgp->flags & GENHD_FL_REMOVABLE))) @@ -361,7 +350,7 @@ static int show_partition(struct seq_file *part, void *v) return 0; /* show the full disk and all non-0 size partitions of it */ - seq_printf(part, "%4d %4d %10llu %s\n", + seq_printf(seqf, "%4d %4d %10llu %s\n", sgp->major, sgp->first_minor, (unsigned long long)get_capacity(sgp) >> 1, disk_name(sgp, 0, buf)); @@ -370,7 +359,7 @@ static int show_partition(struct seq_file *part, void *v) continue; if (sgp->part[n]->nr_sects == 0) continue; - seq_printf(part, "%4d %4d %10llu %s\n", + seq_printf(seqf, "%4d %4d %10llu %s\n", sgp->major, n + 1 + sgp->first_minor, (unsigned long long)sgp->part[n]->nr_sects >> 1 , disk_name(sgp, n + 1, buf)); @@ -388,7 +377,7 @@ const struct seq_operations partitions_op = { #endif -static struct kobject *base_probe(dev_t devt, int *part, void *data) +static struct kobject *base_probe(dev_t devt, int *partno, void *data) { if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) /* Make old-style 2.4 aliases work */ @@ -564,7 +553,14 @@ static struct device_type disk_type = { }; #ifdef CONFIG_PROC_FS -static int diskstats_show(struct seq_file *s, void *v) +/* + * aggregate disk stat collector. 
Uses the same stats that the sysfs + * entries do, above, but makes them available through one seq_file. + * + * The output looks suspiciously like /proc/partitions with a bunch of + * extra fields. + */ +static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; char buf[BDEVNAME_SIZE]; @@ -572,7 +568,7 @@ static int diskstats_show(struct seq_file *s, void *v) /* if (&gp->dev.kobj.entry == block_class.devices.next) - seq_puts(s, "major minor name" + seq_puts(seqf, "major minor name" " rio rmerge rsect ruse wio wmerge " "wsect wuse running use aveq" "\n\n"); @@ -581,7 +577,7 @@ static int diskstats_show(struct seq_file *s, void *v) preempt_disable(); disk_round_stats(gp); preempt_enable(); - seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", + seq_printf(seqf, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", gp->major, gp->first_minor, disk_name(gp, 0, buf), disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), (unsigned long long)disk_stat_read(gp, sectors[0]), @@ -603,7 +599,7 @@ static int diskstats_show(struct seq_file *s, void *v) preempt_disable(); part_round_stats(hd); preempt_enable(); - seq_printf(s, "%4d %4d %s %lu %lu %llu " + seq_printf(seqf, "%4d %4d %s %lu %lu %llu " "%u %lu %lu %llu %u %u %u %u\n", gp->major, n + gp->first_minor + 1, disk_name(gp, n + 1, buf), @@ -655,7 +651,7 @@ void genhd_media_change_notify(struct gendisk *disk) EXPORT_SYMBOL_GPL(genhd_media_change_notify); #endif /* 0 */ -dev_t blk_lookup_devt(const char *name, int part) +dev_t blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); struct class_dev_iter iter; @@ -665,9 +661,9 @@ dev_t blk_lookup_devt(const char *name, int part) while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - if (!strcmp(dev->bus_id, name) && part < disk->minors) { + if (!strcmp(dev->bus_id, name) && partno < disk->minors) { devt = MKDEV(MAJOR(dev->devt), - MINOR(dev->devt) + part); + 
MINOR(dev->devt) + partno); break; } } @@ -777,10 +773,10 @@ int bdev_read_only(struct block_device *bdev) EXPORT_SYMBOL(bdev_read_only); -int invalidate_partition(struct gendisk *disk, int index) +int invalidate_partition(struct gendisk *disk, int partno) { int res = 0; - struct block_device *bdev = bdget_disk(disk, index); + struct block_device *bdev = bdget_disk(disk, partno); if (bdev) { fsync_bdev(bdev); res = __invalidate_device(bdev); -- cgit v1.2.3 From f331c0296f2a9fee0d396a70598b954062603015 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:01:48 +0200 Subject: block: don't depend on consecutive minor space * Implement disk_devt() and part_devt() and use them to directly access devt instead of computing it from ->major and ->first_minor. Note that all references to ->major and ->first_minor outside of block layer are used to determine devt of the disk (the part0) and as ->major and ->first_minor will continue to represent devt for the disk, converting these users isn't strictly necessary. However, convert them for consistency. * Implement disk_max_parts() to avoid directly dereferencing genhd->minors. * Update bdget_disk() such that it doesn't assume consecutive minor space. * Move devt computation from register_disk() to add_disk() and make it the only one (all other usages use the initially determined value). These changes clean up the code and will help disk->part dereference fix and extended block device numbers. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 107 +++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 76 insertions(+), 31 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index dc9ad4c171e..fa32d09fda2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -186,13 +186,14 @@ void add_disk(struct gendisk *disk) int retval; disk->flags |= GENHD_FL_UP; - blk_register_region(MKDEV(disk->major, disk->first_minor), - disk->minors, NULL, exact_match, exact_lock, disk); + disk->dev.devt = MKDEV(disk->major, disk->first_minor); + blk_register_region(disk_devt(disk), disk->minors, NULL, + exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, MKDEV(disk->major, disk->first_minor)); + bdi_register_dev(bdi, disk_devt(disk)); retval = sysfs_create_link(&disk->dev.kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); } @@ -205,8 +206,7 @@ void unlink_gendisk(struct gendisk *disk) sysfs_remove_link(&disk->dev.kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); - blk_unregister_region(MKDEV(disk->major, disk->first_minor), - disk->minors); + blk_unregister_region(disk_devt(disk), disk->minors); } /** @@ -225,6 +225,38 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) return kobj ? dev_to_disk(dev) : NULL; } +/** + * bdget_disk - do bdget() by gendisk and partition number + * @disk: gendisk of interest + * @partno: partition number + * + * Find partition @partno from @disk, do bdget() on it. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * Resulting block_device on success, NULL on failure. 
+ */ +extern struct block_device *bdget_disk(struct gendisk *disk, int partno) +{ + dev_t devt = MKDEV(0, 0); + + if (partno == 0) + devt = disk_devt(disk); + else { + struct hd_struct *part = disk->part[partno - 1]; + + if (part && part->nr_sects) + devt = part_devt(part); + } + + if (likely(devt != MKDEV(0, 0))) + return bdget(devt); + return NULL; +} +EXPORT_SYMBOL(bdget_disk); + /* * print a full list of all partitions - intended for places where the root * filesystem can't be mounted and thus to give the victim some idea of what @@ -255,7 +287,7 @@ void __init printk_all_partitions(void) * option takes. */ printk("%02x%02x %10llu %s", - disk->major, disk->first_minor, + MAJOR(disk_devt(disk)), MINOR(disk_devt(disk)), (unsigned long long)get_capacity(disk) >> 1, disk_name(disk, 0, buf)); if (disk->driverfs_dev != NULL && @@ -266,15 +298,15 @@ void __init printk_all_partitions(void) printk(" (driver?)\n"); /* now show the partitions */ - for (n = 0; n < disk->minors - 1; ++n) { - if (disk->part[n] == NULL) - continue; - if (disk->part[n]->nr_sects == 0) + for (n = 0; n < disk_max_parts(disk); ++n) { + struct hd_struct *part = disk->part[n]; + + if (!part || !part->nr_sects) continue; printk(" %02x%02x %10llu %s\n", - disk->major, n + 1 + disk->first_minor, - (unsigned long long)disk->part[n]->nr_sects >> 1, - disk_name(disk, n + 1, buf)); + MAJOR(part_devt(part)), MINOR(part_devt(part)), + (unsigned long long)part->nr_sects >> 1, + disk_name(disk, part->partno, buf)); } } class_dev_iter_exit(&iter); @@ -343,26 +375,27 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || - (sgp->minors == 1 && (sgp->flags & GENHD_FL_REMOVABLE))) + if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) return 0; /* show the full disk and all non-0 size 
partitions of it */ seq_printf(seqf, "%4d %4d %10llu %s\n", - sgp->major, sgp->first_minor, + MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)), (unsigned long long)get_capacity(sgp) >> 1, disk_name(sgp, 0, buf)); - for (n = 0; n < sgp->minors - 1; n++) { - if (!sgp->part[n]) + for (n = 0; n < disk_max_parts(sgp); n++) { + struct hd_struct *part = sgp->part[n]; + if (!part) continue; - if (sgp->part[n]->nr_sects == 0) + if (part->nr_sects == 0) continue; seq_printf(seqf, "%4d %4d %10llu %s\n", - sgp->major, n + 1 + sgp->first_minor, - (unsigned long long)sgp->part[n]->nr_sects >> 1 , - disk_name(sgp, n + 1, buf)); + MAJOR(part_devt(part)), MINOR(part_devt(part)), + (unsigned long long)part->nr_sects >> 1, + disk_name(sgp, part->partno, buf)); } return 0; @@ -578,7 +611,8 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_round_stats(gp); preempt_enable(); seq_printf(seqf, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", - gp->major, gp->first_minor, disk_name(gp, 0, buf), + MAJOR(disk_devt(gp)), MINOR(disk_devt(gp)), + disk_name(gp, 0, buf), disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), (unsigned long long)disk_stat_read(gp, sectors[0]), jiffies_to_msecs(disk_stat_read(gp, ticks[0])), @@ -590,7 +624,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) jiffies_to_msecs(disk_stat_read(gp, time_in_queue))); /* now show all non-0 size partitions of it */ - for (n = 0; n < gp->minors - 1; n++) { + for (n = 0; n < disk_max_parts(gp); n++) { struct hd_struct *hd = gp->part[n]; if (!hd || !hd->nr_sects) @@ -601,8 +635,8 @@ static int diskstats_show(struct seq_file *seqf, void *v) preempt_enable(); seq_printf(seqf, "%4d %4d %s %lu %lu %llu " "%u %lu %lu %llu %u %u %u %u\n", - gp->major, n + gp->first_minor + 1, - disk_name(gp, n + 1, buf), + MAJOR(part_devt(hd)), MINOR(part_devt(hd)), + disk_name(gp, hd->partno, buf), part_stat_read(hd, ios[0]), part_stat_read(hd, merges[0]), (unsigned long long)part_stat_read(hd, sectors[0]), @@ 
-661,11 +695,22 @@ dev_t blk_lookup_devt(const char *name, int partno) while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - if (!strcmp(dev->bus_id, name) && partno < disk->minors) { - devt = MKDEV(MAJOR(dev->devt), - MINOR(dev->devt) + partno); - break; + if (strcmp(dev->bus_id, name)) + continue; + if (partno < 0 || partno > disk_max_parts(disk)) + continue; + + if (partno == 0) + devt = disk_devt(disk); + else { + struct hd_struct *part = disk->part[partno - 1]; + + if (!part || !part->nr_sects) + continue; + + devt = part_devt(part); } + break; } class_dev_iter_exit(&iter); return devt; @@ -755,7 +800,7 @@ void set_disk_ro(struct gendisk *disk, int flag) { int i; disk->policy = flag; - for (i = 0; i < disk->minors - 1; i++) + for (i = 0; i < disk_max_parts(disk); i++) if (disk->part[i]) disk->part[i]->policy = flag; } -- cgit v1.2.3 From e71bf0d0ee89e51b92776391c5634938236977d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:03:02 +0200 Subject: block: fix disk->part[] dereferencing race disk->part[] is protected by its matching bdev's lock. However, non-critical accesses like collecting stats and printing out sysfs and proc information used to be performed without any locking. As partitions can come and go dynamically, partitions can go away underneath those non-critical accesses. As some of those accesses are writes, this theoretically can lead to silent corruption. This patch fixes the race by using RCU for the partition array and dev reference counter to hold partitions. * Rename disk->part[] to disk->__part[] to make sure no one outside genhd layer proper accesses it directly. * Use RCU for disk->__part[] dereferencing. * Implement disk_{get|put}_part() which can be used to get and put partitions from gendisk respectively. * Iterators are implemented to help iterate through all partitions safely. * Functions which require RCU readlock are marked with _rcu suffix. 
* Use disk_put_part() in __blkdev_put() instead of directly putting the contained kobject. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 187 insertions(+), 31 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index fa32d09fda2..b431d654394 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -26,6 +26,158 @@ struct kobject *block_depr; static struct device_type disk_type; +/** + * disk_get_part - get partition + * @disk: disk to look partition from + * @partno: partition number + * + * Look for partition @partno from @disk. If found, increment + * reference count and return it. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * Pointer to the found partition on success, NULL if not found. + */ +struct hd_struct *disk_get_part(struct gendisk *disk, int partno) +{ + struct hd_struct *part; + + if (unlikely(partno < 1 || partno > disk_max_parts(disk))) + return NULL; + rcu_read_lock(); + part = rcu_dereference(disk->__part[partno - 1]); + if (part) + get_device(&part->dev); + rcu_read_unlock(); + + return part; +} +EXPORT_SYMBOL_GPL(disk_get_part); + +/** + * disk_part_iter_init - initialize partition iterator + * @piter: iterator to initialize + * @disk: disk to iterate over + * @flags: DISK_PITER_* flags + * + * Initialize @piter so that it iterates over partitions of @disk. + * + * CONTEXT: + * Don't care. + */ +void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, + unsigned int flags) +{ + piter->disk = disk; + piter->part = NULL; + + if (flags & DISK_PITER_REVERSE) + piter->idx = disk_max_parts(piter->disk) - 1; + else + piter->idx = 0; + + piter->flags = flags; +} +EXPORT_SYMBOL_GPL(disk_part_iter_init); + +/** + * disk_part_iter_next - proceed iterator to the next partition and return it + * @piter: iterator of interest + * + * Proceed @piter to the next partition and return it. 
+ * + * CONTEXT: + * Don't care. + */ +struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) +{ + int inc, end; + + /* put the last partition */ + disk_put_part(piter->part); + piter->part = NULL; + + rcu_read_lock(); + + /* determine iteration parameters */ + if (piter->flags & DISK_PITER_REVERSE) { + inc = -1; + end = -1; + } else { + inc = 1; + end = disk_max_parts(piter->disk); + } + + /* iterate to the next partition */ + for (; piter->idx != end; piter->idx += inc) { + struct hd_struct *part; + + part = rcu_dereference(piter->disk->__part[piter->idx]); + if (!part) + continue; + if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects) + continue; + + get_device(&part->dev); + piter->part = part; + piter->idx += inc; + break; + } + + rcu_read_unlock(); + + return piter->part; +} +EXPORT_SYMBOL_GPL(disk_part_iter_next); + +/** + * disk_part_iter_exit - finish up partition iteration + * @piter: iter of interest + * + * Called when iteration is over. Cleans up @piter. + * + * CONTEXT: + * Don't care. + */ +void disk_part_iter_exit(struct disk_part_iter *piter) +{ + disk_put_part(piter->part); + piter->part = NULL; +} +EXPORT_SYMBOL_GPL(disk_part_iter_exit); + +/** + * disk_map_sector_rcu - map sector to partition + * @disk: gendisk of interest + * @sector: sector to map + * + * Find out which partition @sector maps to on @disk. This is + * primarily used for stats accounting. + * + * CONTEXT: + * RCU read locked. The returned partition pointer is valid only + * while preemption is disabled. + * + * RETURNS: + * Found partition on success, NULL if there's no matching partition. 
+ */ +struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) +{ + int i; + + for (i = 0; i < disk_max_parts(disk); i++) { + struct hd_struct *part = rcu_dereference(disk->__part[i]); + + if (part && part->start_sect <= sector && + sector < part->start_sect + part->nr_sects) + return part; + } + return NULL; +} +EXPORT_SYMBOL_GPL(disk_map_sector_rcu); + /* * Can be deleted altogether. Later. * @@ -245,10 +397,12 @@ extern struct block_device *bdget_disk(struct gendisk *disk, int partno) if (partno == 0) devt = disk_devt(disk); else { - struct hd_struct *part = disk->part[partno - 1]; + struct hd_struct *part; + part = disk_get_part(disk, partno); if (part && part->nr_sects) devt = part_devt(part); + disk_put_part(part); } if (likely(devt != MKDEV(0, 0))) @@ -270,8 +424,9 @@ void __init printk_all_partitions(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); + struct disk_part_iter piter; + struct hd_struct *part; char buf[BDEVNAME_SIZE]; - int n; /* * Don't show empty devices or things that have been @@ -298,16 +453,13 @@ void __init printk_all_partitions(void) printk(" (driver?)\n"); /* now show the partitions */ - for (n = 0; n < disk_max_parts(disk); ++n) { - struct hd_struct *part = disk->part[n]; - - if (!part || !part->nr_sects) - continue; + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) printk(" %02x%02x %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), (unsigned long long)part->nr_sects >> 1, disk_name(disk, part->partno, buf)); - } + disk_part_iter_exit(&piter); } class_dev_iter_exit(&iter); } @@ -371,7 +523,8 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; - int n; + struct disk_part_iter piter; + struct hd_struct *part; char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable 
removeable devices or empty devices */ @@ -386,17 +539,14 @@ static int show_partition(struct seq_file *seqf, void *v) MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)), (unsigned long long)get_capacity(sgp) >> 1, disk_name(sgp, 0, buf)); - for (n = 0; n < disk_max_parts(sgp); n++) { - struct hd_struct *part = sgp->part[n]; - if (!part) - continue; - if (part->nr_sects == 0) - continue; + + disk_part_iter_init(&piter, sgp, 0); + while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %4d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), (unsigned long long)part->nr_sects >> 1, disk_name(sgp, part->partno, buf)); - } + disk_part_iter_exit(&piter); return 0; } @@ -571,7 +721,7 @@ static void disk_release(struct device *dev) struct gendisk *disk = dev_to_disk(dev); kfree(disk->random); - kfree(disk->part); + kfree(disk->__part); free_disk_stats(disk); kfree(disk); } @@ -596,8 +746,9 @@ static struct device_type disk_type = { static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; + struct disk_part_iter piter; + struct hd_struct *hd; char buf[BDEVNAME_SIZE]; - int n; /* if (&gp->dev.kobj.entry == block_class.devices.next) @@ -624,12 +775,8 @@ static int diskstats_show(struct seq_file *seqf, void *v) jiffies_to_msecs(disk_stat_read(gp, time_in_queue))); /* now show all non-0 size partitions of it */ - for (n = 0; n < disk_max_parts(gp); n++) { - struct hd_struct *hd = gp->part[n]; - - if (!hd || !hd->nr_sects) - continue; - + disk_part_iter_init(&piter, gp, 0); + while ((hd = disk_part_iter_next(&piter))) { preempt_disable(); part_round_stats(hd); preempt_enable(); @@ -650,6 +797,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); } + disk_part_iter_exit(&piter); return 0; } @@ -703,12 +851,16 @@ dev_t blk_lookup_devt(const char *name, int partno) if (partno == 0) devt = disk_devt(disk); else { - struct hd_struct *part = disk->part[partno - 1]; + struct 
hd_struct *part; - if (!part || !part->nr_sects) + part = disk_get_part(disk, partno); + if (!part || !part->nr_sects) { + disk_put_part(part); continue; + } devt = part_devt(part); + disk_put_part(part); } break; } @@ -735,9 +887,9 @@ struct gendisk *alloc_disk_node(int minors, int node_id) } if (minors > 1) { int size = (minors - 1) * sizeof(struct hd_struct *); - disk->part = kmalloc_node(size, + disk->__part = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, node_id); - if (!disk->part) { + if (!disk->__part) { free_disk_stats(disk); kfree(disk); return NULL; @@ -798,10 +950,14 @@ EXPORT_SYMBOL(set_device_ro); void set_disk_ro(struct gendisk *disk, int flag) { - int i; + struct disk_part_iter piter; + struct hd_struct *part; + disk->policy = flag; - for (i = 0; i < disk_max_parts(disk); i++) - if (disk->part[i]) disk->part[i]->policy = flag; + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) + part->policy = flag; + disk_part_iter_exit(&piter); } EXPORT_SYMBOL(set_disk_ro); -- cgit v1.2.3 From c9959059161ddd7bf4670cf47367033d6b2f79c4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:21 +0900 Subject: block: fix diskstats access There are two variants of stat functions - ones prefixed with double underbars which don't care about preemption and ones without which disable preemption before manipulating per-cpu counters. It's unclear whether the underbarred ones assume that preemption is disabled on entry as some callers don't do that. This patch unifies diskstats access by implementing disk_stat_lock() and disk_stat_unlock() which take care of both RCU (for partition access) and preemption (for per-cpu counter access). diskstats access should always be enclosed between the two functions. As such, there's no need for the versions which disable preemption. They're removed and double underbars ones are renamed to drop the underbars.
As an extra argument is added, there's no danger of using the old version unconverted. disk_stat_lock() uses get_cpu() and returns the cpu index and all diskstat functions which access per-cpu counters now have a @cpu argument to help RT. This change adds RCU or preemption operations at some places but also collapses several preemption ops into one at others. Overall, the performance difference should be negligible as all involved ops are very lightweight per-cpu ones. Signed-off-by: Tejun Heo Cc: Peter Zijlstra Signed-off-by: Jens Axboe --- block/genhd.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index b431d654394..430626e440f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -633,10 +633,11 @@ static ssize_t disk_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); + int cpu; - preempt_disable(); - disk_round_stats(disk); - preempt_enable(); + cpu = disk_stat_lock(); + disk_round_stats(cpu, disk); + disk_stat_unlock(); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -749,6 +750,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct disk_part_iter piter; struct hd_struct *hd; char buf[BDEVNAME_SIZE]; + int cpu; /* if (&gp->dev.kobj.entry == block_class.devices.next) @@ -758,9 +760,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n\n"); */ - preempt_disable(); - disk_round_stats(gp); - preempt_enable(); + cpu = disk_stat_lock(); + disk_round_stats(cpu, gp); + disk_stat_unlock(); seq_printf(seqf, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", MAJOR(disk_devt(gp)), MINOR(disk_devt(gp)), disk_name(gp, 0, buf), @@ -777,9 +779,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) /* now show all non-0 size partitions of it */ disk_part_iter_init(&piter, gp, 0); while ((hd = disk_part_iter_next(&piter))) { - preempt_disable(); -
part_round_stats(hd); - preempt_enable(); + cpu = disk_stat_lock(); + part_round_stats(cpu, hd); + disk_stat_unlock(); seq_printf(seqf, "%4d %4d %s %lu %lu %llu " "%u %lu %lu %llu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), -- cgit v1.2.3 From bcce3de1be61e424deef35d1e86e86a35c4b6e65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:22 +0900 Subject: block: implement extended dev numbers Implement extended device numbers. A block driver can tell block layer that it wants to use extended device numbers. After the usual minor space is used up, block layer automatically allocates devt's from EXT_BLOCK_MAJOR. Currently only one major number is allocated for this but as the allocation is strictly on-demand, ~1mil minor space under it should suffice unless the system actually has more than ~1mil partitions and if that ever happens adding more majors to the extended devt area is easy. Due to internal implementation issues, the first partition can't be allocated on the extended area. In other words, genhd->minors should at least be 1. This limitation will be lifted by later changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 5 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 430626e440f..7bbfed05cec 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "blk.h" @@ -24,6 +25,15 @@ static DEFINE_MUTEX(block_class_lock); struct kobject *block_depr; #endif +/* for extended dynamic devt allocation, currently only one major is used */ +#define MAX_EXT_DEVT (1 << MINORBITS) + +/* For extended devt allocation. ext_devt_mutex prevents look up + * results from going away underneath its user. 
+ */ +static DEFINE_MUTEX(ext_devt_mutex); +static DEFINE_IDR(ext_devt_idr); + static struct device_type disk_type; /** @@ -288,6 +298,74 @@ EXPORT_SYMBOL(unregister_blkdev); static struct kobj_map *bdev_map; +/** + * blk_alloc_devt - allocate a dev_t for a partition + * @part: partition to allocate dev_t for + * @gfp_mask: memory allocation flag + * @devt: out parameter for resulting dev_t + * + * Allocate a dev_t for block device. + * + * RETURNS: + * 0 on success, allocated dev_t is returned in *@devt. -errno on + * failure. + * + * CONTEXT: + * Might sleep. + */ +int blk_alloc_devt(struct hd_struct *part, dev_t *devt) +{ + struct gendisk *disk = part_to_disk(part); + int idx, rc; + + /* in consecutive minor range? */ + if (part->partno < disk->minors) { + *devt = MKDEV(disk->major, disk->first_minor + part->partno); + return 0; + } + + /* allocate ext devt */ + do { + if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL)) + return -ENOMEM; + rc = idr_get_new(&ext_devt_idr, part, &idx); + } while (rc == -EAGAIN); + + if (rc) + return rc; + + if (idx > MAX_EXT_DEVT) { + idr_remove(&ext_devt_idr, idx); + return -EBUSY; + } + + *devt = MKDEV(BLOCK_EXT_MAJOR, idx); + return 0; +} + +/** + * blk_free_devt - free a dev_t + * @devt: dev_t to free + * + * Free @devt which was allocated using blk_alloc_devt(). + * + * CONTEXT: + * Might sleep. 
+ */ +void blk_free_devt(dev_t devt) +{ + might_sleep(); + + if (devt == MKDEV(0, 0)) + return; + + if (MAJOR(devt) == BLOCK_EXT_MAJOR) { + mutex_lock(&ext_devt_mutex); + idr_remove(&ext_devt_idr, MINOR(devt)); + mutex_unlock(&ext_devt_mutex); + } +} + /* * Register device numbers dev..(dev+range-1) * range must be nonzero @@ -371,10 +449,27 @@ void unlink_gendisk(struct gendisk *disk) */ struct gendisk *get_gendisk(dev_t devt, int *partno) { - struct kobject *kobj = kobj_lookup(bdev_map, devt, partno); - struct device *dev = kobj_to_dev(kobj); + struct gendisk *disk = NULL; + + if (MAJOR(devt) != BLOCK_EXT_MAJOR) { + struct kobject *kobj; + + kobj = kobj_lookup(bdev_map, devt, partno); + if (kobj) + disk = dev_to_disk(kobj_to_dev(kobj)); + } else { + struct hd_struct *part; + + mutex_lock(&ext_devt_mutex); + part = idr_find(&ext_devt_idr, MINOR(devt)); + if (part && get_disk(part_to_disk(part))) { + *partno = part->partno; + disk = part_to_disk(part); + } + mutex_unlock(&ext_devt_mutex); + } - return kobj ? 
dev_to_disk(dev) : NULL; + return disk; } /** @@ -877,18 +972,30 @@ struct gendisk *alloc_disk(int minors) } struct gendisk *alloc_disk_node(int minors, int node_id) +{ + return alloc_disk_ext_node(minors, 0, node_id); +} + +struct gendisk *alloc_disk_ext(int minors, int ext_minors) +{ + return alloc_disk_ext_node(minors, ext_minors, -1); +} + +struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id) { struct gendisk *disk; disk = kmalloc_node(sizeof(struct gendisk), GFP_KERNEL | __GFP_ZERO, node_id); if (disk) { + int tot_minors = minors + ext_minors; + if (!init_disk_stats(disk)) { kfree(disk); return NULL; } - if (minors > 1) { - int size = (minors - 1) * sizeof(struct hd_struct *); + if (tot_minors > 1) { + int size = (tot_minors - 1) * sizeof(struct hd_struct *); disk->__part = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, node_id); if (!disk->__part) { @@ -898,6 +1005,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) } } disk->minors = minors; + disk->ext_minors = ext_minors; rand_initialize_disk(disk); disk->dev.class = &block_class; disk->dev.type = &disk_type; @@ -910,6 +1018,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) EXPORT_SYMBOL(alloc_disk); EXPORT_SYMBOL(alloc_disk_node); +EXPORT_SYMBOL(alloc_disk_ext); +EXPORT_SYMBOL(alloc_disk_ext_node); struct kobject *get_disk(struct gendisk *disk) { -- cgit v1.2.3 From 1f0142905d4812966831613847db38a66da29eb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:23 +0900 Subject: block: adjust formatting for large minors and add ext_range sysfs attr With extended minors and the soon-to-follow debug feature, large minor numbers for block devices will be common. This patch does the following to make printouts pretty. * Adapt print formats such that large minors don't break the formatting. * For extended MAJ:MIN, %02x%02x for MAJ:MIN used in printk_all_partitions() doesn't cut it anymore.
Update it such that %03x:%05x is used if either MAJ or MIN doesn't fit in %02x. * Implement ext_range sysfs attribute which shows total minors the device can use including both conventional minor space and the extended one. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 45 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 7bbfed05cec..ee4b13520e5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -366,6 +366,18 @@ void blk_free_devt(dev_t devt) } } +static char *bdevt_str(dev_t devt, char *buf) +{ + if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { + char tbuf[BDEVT_SIZE]; + snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt)); + snprintf(buf, BDEVT_SIZE, "%-9s", tbuf); + } else + snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); + + return buf; +} + /* * Register device numbers dev..(dev+range-1) * range must be nonzero @@ -521,7 +533,8 @@ void __init printk_all_partitions(void) struct gendisk *disk = dev_to_disk(dev); struct disk_part_iter piter; struct hd_struct *part; - char buf[BDEVNAME_SIZE]; + char name_buf[BDEVNAME_SIZE]; + char devt_buf[BDEVT_SIZE]; /* * Don't show empty devices or things that have been @@ -536,10 +549,10 @@ void __init printk_all_partitions(void) * numbers in hex - the same format as the root= * option takes. 
*/ - printk("%02x%02x %10llu %s", - MAJOR(disk_devt(disk)), MINOR(disk_devt(disk)), + printk("%s %10llu %s", + bdevt_str(disk_devt(disk), devt_buf), (unsigned long long)get_capacity(disk) >> 1, - disk_name(disk, 0, buf)); + disk_name(disk, 0, name_buf)); if (disk->driverfs_dev != NULL && disk->driverfs_dev->driver != NULL) printk(" driver: %s\n", @@ -550,10 +563,10 @@ void __init printk_all_partitions(void) /* now show the partitions */ disk_part_iter_init(&piter, disk, 0); while ((part = disk_part_iter_next(&piter))) - printk(" %02x%02x %10llu %s\n", - MAJOR(part_devt(part)), MINOR(part_devt(part)), + printk(" %s %10llu %s\n", + bdevt_str(part_devt(part), devt_buf), (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, buf)); + disk_name(disk, part->partno, name_buf)); disk_part_iter_exit(&piter); } class_dev_iter_exit(&iter); @@ -630,14 +643,14 @@ static int show_partition(struct seq_file *seqf, void *v) return 0; /* show the full disk and all non-0 size partitions of it */ - seq_printf(seqf, "%4d %4d %10llu %s\n", + seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)), (unsigned long long)get_capacity(sgp) >> 1, disk_name(sgp, 0, buf)); disk_part_iter_init(&piter, sgp, 0); while ((part = disk_part_iter_next(&piter))) - seq_printf(seqf, "%4d %4d %10llu %s\n", + seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), (unsigned long long)part->nr_sects >> 1, disk_name(sgp, part->partno, buf)); @@ -691,6 +704,14 @@ static ssize_t disk_range_show(struct device *dev, return sprintf(buf, "%d\n", disk->minors); } +static ssize_t disk_ext_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk_max_parts(disk) + 1); +} + static ssize_t disk_removable_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -780,6 +801,7 @@ static ssize_t disk_fail_store(struct device *dev, 
#endif static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); +static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, disk_size_show, NULL); @@ -792,6 +814,7 @@ static struct device_attribute dev_attr_fail = static struct attribute *disk_attrs[] = { &dev_attr_range.attr, + &dev_attr_ext_range.attr, &dev_attr_removable.attr, &dev_attr_ro.attr, &dev_attr_size.attr, @@ -858,7 +881,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) cpu = disk_stat_lock(); disk_round_stats(cpu, gp); disk_stat_unlock(); - seq_printf(seqf, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", + seq_printf(seqf, "%4d %7d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", MAJOR(disk_devt(gp)), MINOR(disk_devt(gp)), disk_name(gp, 0, buf), disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), @@ -877,7 +900,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) cpu = disk_stat_lock(); part_round_stats(cpu, hd); disk_stat_unlock(); - seq_printf(seqf, "%4d %4d %s %lu %lu %llu " + seq_printf(seqf, "%4d %7d %s %lu %lu %llu " "%u %lu %lu %llu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), -- cgit v1.2.3 From 870d6656126add8e383645732b03df2b7ccd4f94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:25 +0900 Subject: block: implement CONFIG_DEBUG_BLOCK_EXT_DEVT Extended devt introduces non-contiguous device numbers. This patch implements a debug option which forces most devt allocations to be from the extended area and spreads them out. This is enabled by default if DEBUG_KERNEL is set and achieves... 1. Detects code paths in kernel or userland which expect predetermined consecutive device numbers. 2. When something goes wrong, avoid corruption as adding to the minor of earlier partition won't lead to the wrong but valid device.
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index ee4b13520e5..67e5a59ced2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -298,6 +298,38 @@ EXPORT_SYMBOL(unregister_blkdev); static struct kobj_map *bdev_map; +/** + * blk_mangle_minor - scatter minor numbers apart + * @minor: minor number to mangle + * + * Scatter consecutively allocated @minor number apart if MANGLE_DEVT + * is enabled. Mangling twice gives the original value. + * + * RETURNS: + * Mangled value. + * + * CONTEXT: + * Don't care. + */ +static int blk_mangle_minor(int minor) +{ +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + int i; + + for (i = 0; i < MINORBITS / 2; i++) { + int low = minor & (1 << i); + int high = minor & (1 << (MINORBITS - 1 - i)); + int distance = MINORBITS - 1 - 2 * i; + + minor ^= low | high; /* clear both bits */ + low <<= distance; /* swap the positions */ + high >>= distance; + minor |= low | high; /* and set */ + } +#endif + return minor; +} + /** * blk_alloc_devt - allocate a dev_t for a partition * @part: partition to allocate dev_t for @@ -339,7 +371,7 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) return -EBUSY; } - *devt = MKDEV(BLOCK_EXT_MAJOR, idx); + *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); return 0; } @@ -361,7 +393,7 @@ void blk_free_devt(dev_t devt) if (MAJOR(devt) == BLOCK_EXT_MAJOR) { mutex_lock(&ext_devt_mutex); - idr_remove(&ext_devt_idr, MINOR(devt)); + idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); mutex_unlock(&ext_devt_mutex); } } @@ -473,7 +505,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) struct hd_struct *part; mutex_lock(&ext_devt_mutex); - part = idr_find(&ext_devt_idr, MINOR(devt)); + part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); if (part && get_disk(part_to_disk(part))) { *partno = part->partno; disk = 
part_to_disk(part); -- cgit v1.2.3 From ed9e1982347b36573cd622ee5f4e2a7ccd79b3fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:05 +0900 Subject: block: implement and use {disk|part}_to_dev() Implement {disk|part}_to_dev() and use them to access generic device instead of directly dereferencing {disk|part}->dev. To make sure no user is left behind, rename generic devices fields to __dev. This is in preparation of unifying partition 0 handling with other partitions. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 67e5a59ced2..0a2f16bd54b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -59,7 +59,7 @@ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) rcu_read_lock(); part = rcu_dereference(disk->__part[partno - 1]); if (part) - get_device(&part->dev); + get_device(part_to_dev(part)); rcu_read_unlock(); return part; @@ -130,7 +130,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects) continue; - get_device(&part->dev); + get_device(part_to_dev(part)); piter->part = part; piter->idx += inc; break; @@ -435,7 +435,7 @@ static struct kobject *exact_match(dev_t devt, int *partno, void *data) { struct gendisk *p = data; - return &p->dev.kobj; + return &disk_to_dev(p)->kobj; } static int exact_lock(dev_t devt, void *data) @@ -460,7 +460,7 @@ void add_disk(struct gendisk *disk) int retval; disk->flags |= GENHD_FL_UP; - disk->dev.devt = MKDEV(disk->major, disk->first_minor); + disk_to_dev(disk)->devt = MKDEV(disk->major, disk->first_minor); blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); @@ -468,7 +468,8 @@ void add_disk(struct gendisk *disk) bdi = &disk->queue->backing_dev_info; bdi_register_dev(bdi, 
disk_devt(disk)); - retval = sysfs_create_link(&disk->dev.kobj, &bdi->dev->kobj, "bdi"); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, + "bdi"); WARN_ON(retval); } @@ -477,7 +478,7 @@ EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ void unlink_gendisk(struct gendisk *disk) { - sysfs_remove_link(&disk->dev.kobj, "bdi"); + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); @@ -903,7 +904,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) int cpu; /* - if (&gp->dev.kobj.entry == block_class.devices.next) + if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) seq_puts(seqf, "major minor name" " rio rmerge rsect ruse wio wmerge " "wsect wuse running use aveq" @@ -972,7 +973,7 @@ static void media_change_notify_thread(struct work_struct *work) * set enviroment vars to indicate which event this is for * so that user space will know to go check the media status. 
*/ - kobject_uevent_env(&gd->dev.kobj, KOBJ_CHANGE, envp); + kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); put_device(gd->driverfs_dev); } @@ -1062,9 +1063,9 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id) disk->minors = minors; disk->ext_minors = ext_minors; rand_initialize_disk(disk); - disk->dev.class = &block_class; - disk->dev.type = &disk_type; - device_initialize(&disk->dev); + disk_to_dev(disk)->class = &block_class; + disk_to_dev(disk)->type = &disk_type; + device_initialize(disk_to_dev(disk)); INIT_WORK(&disk->async_notify, media_change_notify_thread); } @@ -1086,7 +1087,7 @@ struct kobject *get_disk(struct gendisk *disk) owner = disk->fops->owner; if (owner && !try_module_get(owner)) return NULL; - kobj = kobject_get(&disk->dev.kobj); + kobj = kobject_get(&disk_to_dev(disk)->kobj); if (kobj == NULL) { module_put(owner); return NULL; @@ -1100,7 +1101,7 @@ EXPORT_SYMBOL(get_disk); void put_disk(struct gendisk *disk) { if (disk) - kobject_put(&disk->dev.kobj); + kobject_put(&disk_to_dev(disk)->kobj); } EXPORT_SYMBOL(put_disk); -- cgit v1.2.3 From b5d0b9df0ba5d9a044f3a21e7544f53d90bd1465 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:06:42 +0200 Subject: block: introduce partition 0 genhd and partition code handled disk and partitions separately. All information about the whole disk was in struct genhd and partitions in struct hd_struct. However, the whole disk (part0) and other partitions have a lot in common and the data structures end up having good number of common fields and thus separate code paths doing the same thing. Also, the partition array was indexed by partno - 1 which gets pretty confusing at times. This patch introduces partition 0 and makes the partition array indexed by partno. Following patches will unify the handling of disk and parts piece-by-piece. This patch also implements disk_partitionable() which tests whether a disk is partitionable. 
With coming dynamic partition array change, the most common usage of disk_max_parts() will be testing whether a disk is partitionable and the number of max partitions will become much less important. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 0a2f16bd54b..65b7386c26d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -54,10 +54,10 @@ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) { struct hd_struct *part; - if (unlikely(partno < 1 || partno > disk_max_parts(disk))) + if (unlikely(partno < 0 || partno >= disk_max_parts(disk))) return NULL; rcu_read_lock(); - part = rcu_dereference(disk->__part[partno - 1]); + part = rcu_dereference(disk->__part[partno]); if (part) get_device(part_to_dev(part)); rcu_read_unlock(); @@ -85,8 +85,10 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, if (flags & DISK_PITER_REVERSE) piter->idx = disk_max_parts(piter->disk) - 1; - else + else if (flags & DISK_PITER_INCL_PART0) piter->idx = 0; + else + piter->idx = 1; piter->flags = flags; } @@ -114,7 +116,10 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) /* determine iteration parameters */ if (piter->flags & DISK_PITER_REVERSE) { inc = -1; - end = -1; + if (piter->flags & DISK_PITER_INCL_PART0) + end = -1; + else + end = 0; } else { inc = 1; end = disk_max_parts(piter->disk); @@ -177,7 +182,7 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) { int i; - for (i = 0; i < disk_max_parts(disk); i++) { + for (i = 1; i < disk_max_parts(disk); i++) { struct hd_struct *part = rcu_dereference(disk->__part[i]); if (part && part->start_sect <= sector && @@ -669,7 +674,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices 
or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!disk_partitionable(sgp) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) @@ -742,7 +747,7 @@ static ssize_t disk_ext_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", disk_max_parts(disk) + 1); + return sprintf(buf, "%d\n", disk_max_parts(disk)); } static ssize_t disk_removable_show(struct device *dev, @@ -998,7 +1003,7 @@ dev_t blk_lookup_devt(const char *name, int partno) if (strcmp(dev->bus_id, name)) continue; - if (partno < 0 || partno > disk_max_parts(disk)) + if (partno < 0 || partno >= disk_max_parts(disk)) continue; if (partno == 0) @@ -1045,21 +1050,22 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id) GFP_KERNEL | __GFP_ZERO, node_id); if (disk) { int tot_minors = minors + ext_minors; + int size = tot_minors * sizeof(struct hd_struct *); if (!init_disk_stats(disk)) { kfree(disk); return NULL; } - if (tot_minors > 1) { - int size = (tot_minors - 1) * sizeof(struct hd_struct *); - disk->__part = kmalloc_node(size, - GFP_KERNEL | __GFP_ZERO, node_id); - if (!disk->__part) { - free_disk_stats(disk); - kfree(disk); - return NULL; - } + + disk->__part = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, + node_id); + if (!disk->__part) { + free_disk_stats(disk); + kfree(disk); + return NULL; } + disk->__part[0] = &disk->part0; + disk->minors = minors; disk->ext_minors = ext_minors; rand_initialize_disk(disk); -- cgit v1.2.3 From 548b10eb2959c96cef6fc29fc96e0931eeb53bc5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Aug 2008 09:01:47 +0200 Subject: block: move __dev from disk to part0 Move disk->__dev to part0->__dev. This simplifies bdget_disk() and lookup_devt() and allows common sysfs attributes to be unified. part_to_disk() is updated to handle part0 -> disk. 
Updated to include a fix from Bartlomiej Zolnierkiewicz , he writes: "part0 is a "special" partition and doesn't need to have capacity set - this fixes regression caused by "block: move __dev from disk to part0" commit." Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 65b7386c26d..36b9f1bdd91 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -537,22 +537,15 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) */ extern struct block_device *bdget_disk(struct gendisk *disk, int partno) { - dev_t devt = MKDEV(0, 0); + struct hd_struct *part; + struct block_device *bdev = NULL; - if (partno == 0) - devt = disk_devt(disk); - else { - struct hd_struct *part; + part = disk_get_part(disk, partno); + if (part && (part->nr_sects || partno == 0)) + bdev = bdget(part_devt(part)); + disk_put_part(part); - part = disk_get_part(disk, partno); - if (part && part->nr_sects) - devt = part_devt(part); - disk_put_part(part); - } - - if (likely(devt != MKDEV(0, 0))) - return bdget(devt); - return NULL; + return bdev; } EXPORT_SYMBOL(bdget_disk); @@ -1000,27 +993,18 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); + struct hd_struct *part; if (strcmp(dev->bus_id, name)) continue; - if (partno < 0 || partno >= disk_max_parts(disk)) - continue; - - if (partno == 0) - devt = disk_devt(disk); - else { - struct hd_struct *part; - - part = disk_get_part(disk, partno); - if (!part || !part->nr_sects) { - disk_put_part(part); - continue; - } + part = disk_get_part(disk, partno); + if (part && (part->nr_sects || partno == 0)) { devt = part_devt(part); disk_put_part(part); + break; } - break; + disk_put_part(part); } class_dev_iter_exit(&iter); return 
devt; -- cgit v1.2.3 From e56105214943ce5f0901d20e972a7cfd0d1d0656 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:09 +0900 Subject: block: unify sysfs size node handling Now that capacity and __dev are moved to part0, part0 and others can share the same method. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 36b9f1bdd91..c70db35076a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -760,14 +760,6 @@ static ssize_t disk_ro_show(struct device *dev, return sprintf(buf, "%d\n", disk->policy ? 1 : 0); } -static ssize_t disk_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - return sprintf(buf, "%llu\n", (unsigned long long)get_capacity(disk)); -} - static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -835,7 +827,7 @@ static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); -static DEVICE_ATTR(size, S_IRUGO, disk_size_show, NULL); +static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, disk_stat_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST -- cgit v1.2.3 From b7db9956e57c8151b930d5e5fe5c766e6aad3ff7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:10 +0900 Subject: block: move policy from disk to part0 Move disk->policy to part0->policy. Implement and use get_disk_ro(). 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index c70db35076a..70358f3c742 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -757,7 +757,7 @@ static ssize_t disk_ro_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", disk->policy ? 1 : 0); + return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } static ssize_t disk_capability_show(struct device *dev, @@ -1090,10 +1090,7 @@ EXPORT_SYMBOL(put_disk); void set_device_ro(struct block_device *bdev, int flag) { - if (bdev->bd_contains != bdev) - bdev->bd_part->policy = flag; - else - bdev->bd_disk->policy = flag; + bdev->bd_part->policy = flag; } EXPORT_SYMBOL(set_device_ro); @@ -1103,8 +1100,8 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - disk->policy = flag; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) part->policy = flag; disk_part_iter_exit(&piter); @@ -1116,10 +1113,7 @@ int bdev_read_only(struct block_device *bdev) { if (!bdev) return 0; - else if (bdev->bd_contains != bdev) - return bdev->bd_part->policy; - else - return bdev->bd_disk->policy; + return bdev->bd_part->policy; } EXPORT_SYMBOL(bdev_read_only); -- cgit v1.2.3 From eddb2e26b5ee3c5da68ba4bf1921ba20e2097bff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:13 +0900 Subject: block: kill GENHD_FL_FAIL and use part0->make_it_fail GENHD_FL_FAIL for disk is what make_it_fail is for parts. Kill it and use part0->make_it_fail. Sysfs node handling is unified too. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 70358f3c742..06a252f2b96 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -795,34 +795,6 @@ static ssize_t disk_stat_show(struct device *dev, jiffies_to_msecs(disk_stat_read(disk, time_in_queue))); } -#ifdef CONFIG_FAIL_MAKE_REQUEST -static ssize_t disk_fail_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - return sprintf(buf, "%d\n", disk->flags & GENHD_FL_FAIL ? 1 : 0); -} - -static ssize_t disk_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct gendisk *disk = dev_to_disk(dev); - int i; - - if (count > 0 && sscanf(buf, "%d", &i) > 0) { - if (i == 0) - disk->flags &= ~GENHD_FL_FAIL; - else - disk->flags |= GENHD_FL_FAIL; - } - - return count; -} - -#endif - static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); @@ -832,7 +804,7 @@ static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, disk_stat_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = - __ATTR(make-it-fail, S_IRUGO|S_IWUSR, disk_fail_show, disk_fail_store); + __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); #endif static struct attribute *disk_attrs[] = { -- cgit v1.2.3 From 074a7aca7afa6f230104e8e65eba3420263714a5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:14 +0900 Subject: block: move stats from disk to part0 Move stats related fields - stamp, in_flight, dkstats - from disk to part0 and unify stat handling such that... 
* part_stat_*() now updates part0 together if the specified partition is not part0. i.e. part_stat_*() are now essentially all_stat_*(). * {disk|all}_stat_*() are gone. * part_round_stats() is updated similarly. It handles part0 stats automatically and disk_round_stats() is killed. * part_{inc|dec}_in_flight() is implemented which automatically updates part0 stats for parts other than part0. * disk_map_sector_rcu() is updated to return part0 if no part matches. Combined with the above changes, this makes NULL special case handling in callers unnecessary. * Separate stats show code paths for disk are collapsed into part stats show code paths. * Rename disk_stat_lock/unlock() to part_stat_lock/unlock() While at it, reposition stat handling macros a bit and add missing parentheses around macro parameters. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 97 +++++++++++++++-------------------------------------------- 1 file changed, 24 insertions(+), 73 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 06a252f2b96..e1cb96fb883 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -176,7 +176,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); * while preemption is disabled. * * RETURNS: - * Found partition on success, NULL if there's no matching partition. + * Found partition on success, part0 is returned if no partition matches */ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) { @@ -189,7 +189,7 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) sector < part->start_sect + part->nr_sects) return part; } - return NULL; + return &disk->part0; } EXPORT_SYMBOL_GPL(disk_map_sector_rcu); @@ -580,24 +580,24 @@ void __init printk_all_partitions(void) * numbers in hex - the same format as the root= * option takes.
*/ - printk("%s %10llu %s", - bdevt_str(disk_devt(disk), devt_buf), - (unsigned long long)get_capacity(disk) >> 1, - disk_name(disk, 0, name_buf)); - if (disk->driverfs_dev != NULL && - disk->driverfs_dev->driver != NULL) - printk(" driver: %s\n", - disk->driverfs_dev->driver->name); - else - printk(" (driver?)\n"); + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) { + bool is_part0 = part == &disk->part0; - /* now show the partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - printk(" %s %10llu %s\n", + printk("%s%s %10llu %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), (unsigned long long)part->nr_sects >> 1, disk_name(disk, part->partno, name_buf)); + if (is_part0) { + if (disk->driverfs_dev != NULL && + disk->driverfs_dev->driver != NULL) + printk(" driver: %s\n", + disk->driverfs_dev->driver->name); + else + printk(" (driver?)\n"); + } else + printk("\n"); + } disk_part_iter_exit(&piter); } class_dev_iter_exit(&iter); @@ -674,12 +674,7 @@ static int show_partition(struct seq_file *seqf, void *v) return 0; /* show the full disk and all non-0 size partitions of it */ - seq_printf(seqf, "%4d %7d %10llu %s\n", - MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)), - (unsigned long long)get_capacity(sgp) >> 1, - disk_name(sgp, 0, buf)); - - disk_part_iter_init(&piter, sgp, 0); + disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), @@ -768,40 +763,13 @@ static ssize_t disk_capability_show(struct device *dev, return sprintf(buf, "%x\n", disk->flags); } -static ssize_t disk_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - int cpu; - - cpu = disk_stat_lock(); - disk_round_stats(cpu, disk); - disk_stat_unlock(); - return sprintf(buf, - "%8lu %8lu 
%8llu %8u " - "%8lu %8lu %8llu %8u " - "%8u %8u %8u" - "\n", - disk_stat_read(disk, ios[READ]), - disk_stat_read(disk, merges[READ]), - (unsigned long long)disk_stat_read(disk, sectors[READ]), - jiffies_to_msecs(disk_stat_read(disk, ticks[READ])), - disk_stat_read(disk, ios[WRITE]), - disk_stat_read(disk, merges[WRITE]), - (unsigned long long)disk_stat_read(disk, sectors[WRITE]), - jiffies_to_msecs(disk_stat_read(disk, ticks[WRITE])), - disk->in_flight, - jiffies_to_msecs(disk_stat_read(disk, io_ticks)), - jiffies_to_msecs(disk_stat_read(disk, time_in_queue))); -} - static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); -static DEVICE_ATTR(stat, S_IRUGO, disk_stat_show, NULL); +static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -836,7 +804,7 @@ static void disk_release(struct device *dev) kfree(disk->random); kfree(disk->__part); - free_disk_stats(disk); + free_part_stats(&disk->part0); kfree(disk); } struct class block_class = { @@ -873,28 +841,11 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n\n"); */ - cpu = disk_stat_lock(); - disk_round_stats(cpu, gp); - disk_stat_unlock(); - seq_printf(seqf, "%4d %7d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", - MAJOR(disk_devt(gp)), MINOR(disk_devt(gp)), - disk_name(gp, 0, buf), - disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), - (unsigned long long)disk_stat_read(gp, sectors[0]), - jiffies_to_msecs(disk_stat_read(gp, ticks[0])), - disk_stat_read(gp, ios[1]), disk_stat_read(gp, merges[1]), - (unsigned long 
long)disk_stat_read(gp, sectors[1]), - jiffies_to_msecs(disk_stat_read(gp, ticks[1])), - gp->in_flight, - jiffies_to_msecs(disk_stat_read(gp, io_ticks)), - jiffies_to_msecs(disk_stat_read(gp, time_in_queue))); - - /* now show all non-0 size partitions of it */ - disk_part_iter_init(&piter, gp, 0); + disk_part_iter_init(&piter, gp, DISK_PITER_INCL_PART0); while ((hd = disk_part_iter_next(&piter))) { - cpu = disk_stat_lock(); + cpu = part_stat_lock(); part_round_stats(cpu, hd); - disk_stat_unlock(); + part_stat_unlock(); seq_printf(seqf, "%4d %7d %s %lu %lu %llu " "%u %lu %lu %llu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), @@ -1000,7 +951,7 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id) int tot_minors = minors + ext_minors; int size = tot_minors * sizeof(struct hd_struct *); - if (!init_disk_stats(disk)) { + if (!init_part_stats(&disk->part0)) { kfree(disk); return NULL; } @@ -1008,7 +959,7 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id) disk->__part = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, node_id); if (!disk->__part) { - free_disk_stats(disk); + free_part_stats(&disk->part0); kfree(disk); return NULL; } -- cgit v1.2.3 From 540eed5637b766bb1e881ef744c42617760b4815 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:15 +0900 Subject: block: make partition array dynamic disk->__part used to be statically allocated to the maximum possible number of partitions. This patch makes partition array allocation dynamic. The added overhead is minimal as only real change is one memory dereference changed to RCU one. This saves both a bit of memory and cpu cycles iterating through unoccupied slots and makes increasing partition limit easier. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 109 insertions(+), 20 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index e1cb96fb883..c2b14aa69d5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -52,14 +52,21 @@ static struct device_type disk_type; */ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) { - struct hd_struct *part; + struct hd_struct *part = NULL; + struct disk_part_tbl *ptbl; - if (unlikely(partno < 0 || partno >= disk_max_parts(disk))) + if (unlikely(partno < 0)) return NULL; + rcu_read_lock(); - part = rcu_dereference(disk->__part[partno]); - if (part) - get_device(part_to_dev(part)); + + ptbl = rcu_dereference(disk->part_tbl); + if (likely(partno < ptbl->len)) { + part = rcu_dereference(ptbl->part[partno]); + if (part) + get_device(part_to_dev(part)); + } + rcu_read_unlock(); return part; @@ -80,17 +87,24 @@ EXPORT_SYMBOL_GPL(disk_get_part); void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, unsigned int flags) { + struct disk_part_tbl *ptbl; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + piter->disk = disk; piter->part = NULL; if (flags & DISK_PITER_REVERSE) - piter->idx = disk_max_parts(piter->disk) - 1; + piter->idx = ptbl->len - 1; else if (flags & DISK_PITER_INCL_PART0) piter->idx = 0; else piter->idx = 1; piter->flags = flags; + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(disk_part_iter_init); @@ -105,13 +119,16 @@ EXPORT_SYMBOL_GPL(disk_part_iter_init); */ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) { + struct disk_part_tbl *ptbl; int inc, end; /* put the last partition */ disk_put_part(piter->part); piter->part = NULL; + /* get part_tbl */ rcu_read_lock(); + ptbl = rcu_dereference(piter->disk->part_tbl); /* determine iteration parameters */ if (piter->flags & DISK_PITER_REVERSE) { @@ -122,14 +139,14 @@ struct hd_struct 
*disk_part_iter_next(struct disk_part_iter *piter) end = 0; } else { inc = 1; - end = disk_max_parts(piter->disk); + end = ptbl->len; } /* iterate to the next partition */ for (; piter->idx != end; piter->idx += inc) { struct hd_struct *part; - part = rcu_dereference(piter->disk->__part[piter->idx]); + part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects) @@ -180,10 +197,13 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); */ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) { + struct disk_part_tbl *ptbl; int i; - for (i = 1; i < disk_max_parts(disk); i++) { - struct hd_struct *part = rcu_dereference(disk->__part[i]); + ptbl = rcu_dereference(disk->part_tbl); + + for (i = 1; i < ptbl->len; i++) { + struct hd_struct *part = rcu_dereference(ptbl->part[i]); if (part && part->start_sect <= sector && sector < part->start_sect + part->nr_sects) @@ -798,12 +818,86 @@ static struct attribute_group *disk_attr_groups[] = { NULL }; +static void disk_free_ptbl_rcu_cb(struct rcu_head *head) +{ + struct disk_part_tbl *ptbl = + container_of(head, struct disk_part_tbl, rcu_head); + + kfree(ptbl); +} + +/** + * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way + * @disk: disk to replace part_tbl for + * @new_ptbl: new part_tbl to install + * + * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The + * original ptbl is freed using RCU callback. + * + * LOCKING: + * Matching bd_mutx locked. + */ +static void disk_replace_part_tbl(struct gendisk *disk, + struct disk_part_tbl *new_ptbl) +{ + struct disk_part_tbl *old_ptbl = disk->part_tbl; + + rcu_assign_pointer(disk->part_tbl, new_ptbl); + if (old_ptbl) + call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); +} + +/** + * disk_expand_part_tbl - expand disk->part_tbl + * @disk: disk to expand part_tbl for + * @partno: expand such that this partno can fit in + * + * Expand disk->part_tbl such that @partno can fit in. 
disk->part_tbl + * uses RCU to allow unlocked dereferencing for stats and other stuff. + * + * LOCKING: + * Matching bd_mutex locked, might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int disk_expand_part_tbl(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct disk_part_tbl *new_ptbl; + int len = old_ptbl ? old_ptbl->len : 0; + int target = partno + 1; + size_t size; + int i; + + /* disk_max_parts() is zero during initialization, ignore if so */ + if (disk_max_parts(disk) && target > disk_max_parts(disk)) + return -EINVAL; + + if (target <= len) + return 0; + + size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]); + new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id); + if (!new_ptbl) + return -ENOMEM; + + INIT_RCU_HEAD(&new_ptbl->rcu_head); + new_ptbl->len = target; + + for (i = 0; i < len; i++) + rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); + + disk_replace_part_tbl(disk, new_ptbl); + return 0; +} + static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); kfree(disk->random); - kfree(disk->__part); + disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); kfree(disk); } @@ -948,22 +1042,16 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id) disk = kmalloc_node(sizeof(struct gendisk), GFP_KERNEL | __GFP_ZERO, node_id); if (disk) { - int tot_minors = minors + ext_minors; - int size = tot_minors * sizeof(struct hd_struct *); - if (!init_part_stats(&disk->part0)) { kfree(disk); return NULL; } - - disk->__part = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, - node_id); - if (!disk->__part) { - free_part_stats(&disk->part0); + if (disk_expand_part_tbl(disk, 0)) { + free_part_stats(&disk->part0); kfree(disk); return NULL; } - disk->__part[0] = &disk->part0; + disk->part_tbl->part[0] = &disk->part0; disk->minors = minors; disk->ext_minors = ext_minors; @@ -973,6 +1061,7 @@ struct gendisk *alloc_disk_ext_node(int 
minors, int ext_minors, int node_id) device_initialize(disk_to_dev(disk)); INIT_WORK(&disk->async_notify, media_change_notify_thread); + disk->node_id = node_id; } return disk; } -- cgit v1.2.3 From 689d6fac40b41c7bf154f362deaf442548e4dc81 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:16 +0900 Subject: block: replace @ext_minors with GENHD_FL_EXT_DEVT With previous changes, it's meaningless to limit the number of partitions. Replace @ext_minors with GENHD_FL_EXT_DEVT such that setting the flag allows the disk to have maximum number of allowed partitions (only limited by the number of entries in parsed_partitions as determined by MAX_PART constant). This kills not-too-pretty alloc_disk_ext[_node]() functions and makes @minors parameter to alloc_disk[_node]() unnecessary. The parameter is left alone to avoid disturbing the users. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index c2b14aa69d5..eedab5b4685 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1024,18 +1024,9 @@ struct gendisk *alloc_disk(int minors) { return alloc_disk_node(minors, -1); } +EXPORT_SYMBOL(alloc_disk); struct gendisk *alloc_disk_node(int minors, int node_id) -{ - return alloc_disk_ext_node(minors, 0, node_id); -} - -struct gendisk *alloc_disk_ext(int minors, int ext_minors) -{ - return alloc_disk_ext_node(minors, ext_minors, -1); -} - -struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id) { struct gendisk *disk; @@ -1054,7 +1045,6 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id) disk->part_tbl->part[0] = &disk->part0; disk->minors = minors; - disk->ext_minors = ext_minors; rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; @@ -1065,11 +1055,7 @@ struct gendisk *alloc_disk_ext_node(int minors, int 
ext_minors, int node_id) } return disk; } - -EXPORT_SYMBOL(alloc_disk); EXPORT_SYMBOL(alloc_disk_node); -EXPORT_SYMBOL(alloc_disk_ext); -EXPORT_SYMBOL(alloc_disk_ext_node); struct kobject *get_disk(struct gendisk *disk) { -- cgit v1.2.3 From 3e1a7ff8a0a7b948f2684930166954f9e8e776fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:17 +0900 Subject: block: allow disk to have extended device number Now that disk and partition handlings are mostly unified, it's easy to allow disk to have extended device number. This patch makes add_disk() use extended device number if disk->minors is zero. Both sd and ide-disk are updated to use this. * sd_format_disk_name() is implemented which can generically determine the drive name. This removes disk number restriction stemming from limited device names. * If sd index goes over SD_MAX_DISKS (which can be increased now BTW), sd simply doesn't initialize minors letting block layer choose extended device number. * If CONFIG_DEBUG_EXT_DEVT is set, both sd and ide-disk always set minors to 0 and use extended device numbers. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index eedab5b4685..d9de3e482d1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -478,14 +478,37 @@ static int exact_lock(dev_t devt, void *data) * * This function registers the partitioning information in @disk * with the kernel. + * + * FIXME: error handling */ void add_disk(struct gendisk *disk) { struct backing_dev_info *bdi; + dev_t devt; int retval; + /* minors == 0 indicates to use ext devt from part0 and should + * be accompanied with EXT_DEVT flag. Make sure all + * parameters make sense. 
+ */ + WARN_ON(disk->minors && !(disk->major || disk->first_minor)); + WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); + disk->flags |= GENHD_FL_UP; - disk_to_dev(disk)->devt = MKDEV(disk->major, disk->first_minor); + + retval = blk_alloc_devt(&disk->part0, &devt); + if (retval) { + WARN_ON(1); + return; + } + disk_to_dev(disk)->devt = devt; + + /* ->major and ->first_minor aren't supposed to be + * dereferenced from here on, but set them just in case. + */ + disk->major = MAJOR(devt); + disk->first_minor = MINOR(devt); + blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); -- cgit v1.2.3 From aeb3d3a81e81c6323a17fe914e91eb228b3f1aa1 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Thu, 28 Aug 2008 09:27:42 +0200 Subject: block: kmalloc args reversed, small function definition fixes Noticed by sparse: block/blk-softirq.c:156:12: warning: symbol 'blk_softirq_init' was not declared. Should it be static? block/genhd.c:583:28: warning: function 'bdget_disk' with external linkage has definition block/genhd.c:659:17: warning: incorrect type in argument 1 (different base types) block/genhd.c:659:17: expected unsigned int [unsigned] [usertype] size block/genhd.c:659:17: got restricted gfp_t block/genhd.c:659:29: warning: incorrect type in argument 2 (different base types) block/genhd.c:659:29: expected restricted gfp_t [usertype] flags block/genhd.c:659:29: got unsigned int block: kmalloc args reversed Signed-off-by: Harvey Harrison Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index d9de3e482d1..32ee73c6756 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -578,7 +578,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) * RETURNS: * Resulting block_device on success, NULL on failure. 
*/ -extern struct block_device *bdget_disk(struct gendisk *disk, int partno) +struct block_device *bdget_disk(struct gendisk *disk, int partno) { struct hd_struct *part; struct block_device *bdev = NULL; @@ -654,7 +654,7 @@ static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) struct class_dev_iter *iter; struct device *dev; - iter = kmalloc(GFP_KERNEL, sizeof(*iter)); + iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 2bbedcb4c1abac498f18e5770d62ae66ff235ada Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Aug 2008 11:41:51 +0200 Subject: block: don't test for partition size in bdget_disk() and blk_lookup_devt() bdget_disk() and blk_lookup_devt() never cared whether the specified partition (or disk) is zero sized or not. I got confused while converting those not to depend on consecutive minor numbers in commit 5a6411b1178baf534aa9138052864dfa89d3eada and later when dev0 was added it broke callers which expected to get valid return for zero sized disk devices. So, they never needed nr_sects checks in the first place. Kill them. This problem was spotted and debugged by Bartlomiej Zolnierkiewicz.
Signed-off-by: Tejun Heo Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 32ee73c6756..ed926b760ca 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -584,7 +584,7 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno) struct block_device *bdev = NULL; part = disk_get_part(disk, partno); - if (part && (part->nr_sects || partno == 0)) + if (part) bdev = bdget(part_devt(part)); disk_put_part(part); @@ -1031,7 +1031,7 @@ dev_t blk_lookup_devt(const char *name, int partno) continue; part = disk_get_part(disk, partno); - if (part && (part->nr_sects || partno == 0)) { + if (part) { devt = part_devt(part); disk_put_part(part); break; -- cgit v1.2.3 From 243294dae09c909c0442c8f04d470b69c3c19d6e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 4 Sep 2008 09:17:31 +0200 Subject: block: fix duplicate headers for /proc/partitions seqf can be started multiple times for a read and the header should be printed only for the initial one. Fix it. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index ed926b760ca..8acaff0154e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -697,7 +697,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) static void *p; p = disk_seqf_start(seqf, pos); - if (!IS_ERR(p) && p) + if (!IS_ERR(p) && p && !*pos) seq_puts(seqf, "major minor #blocks name\n\n"); return p; } -- cgit v1.2.3 From 581d4e28d9195aa8b2231383dbabc288988d615e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 14 Sep 2008 05:56:33 -0700 Subject: block: add fault injection mechanism for faking request timeouts Only works for the generic request timer handling. 
Allows one to sporadically ignore request completions, thus exercising the timeout handling. Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 8acaff0154e..4cd3433c99a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -817,6 +817,11 @@ static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); #endif +#ifdef CONFIG_FAIL_IO_TIMEOUT +static struct device_attribute dev_attr_fail_timeout = + __ATTR(io-timeout-fail, S_IRUGO|S_IWUSR, part_timeout_show, + part_timeout_store); +#endif static struct attribute *disk_attrs[] = { &dev_attr_range.attr, @@ -828,6 +833,9 @@ static struct attribute *disk_attrs[] = { &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, +#endif +#ifdef CONFIG_FAIL_IO_TIMEOUT + &dev_attr_fail_timeout.attr, #endif NULL }; -- cgit v1.2.3 From e6d63840ba55ffd3a79aea6792aac6f29f338083 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Oct 2008 08:49:34 +0200 Subject: block: fix kernel-doc for blk_alloc_devt() No argument 'gfp_mask' for blk_alloc_devt(). Signed-off-by: Li Zefan Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index 4cd3433c99a..b8defae2ec0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -358,7 +358,6 @@ static int blk_mangle_minor(int minor) /** * blk_alloc_devt - allocate a dev_t for a partition * @part: partition to allocate dev_t for - * @gfp_mask: memory allocation flag * @devt: out parameter for resulting dev_t * * Allocate a dev_t for block device. 
-- cgit v1.2.3 From 496aa8a98f5ab22ced46be5dc2087cdf3d029bd7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 16 Oct 2008 07:46:23 +0200 Subject: block: fix current kernel-doc warnings Fix block kernel-doc warnings: Warning(linux-2.6.27-git4//fs/block_dev.c:1272): No description found for parameter 'path' Warning(linux-2.6.27-git4//block/blk-core.c:1021): No description found for parameter 'cpu' Warning(linux-2.6.27-git4//block/blk-core.c:1021): No description found for parameter 'part' Warning(/var/linsrc/linux-2.6.27-git4//block/genhd.c:544): No description found for parameter 'partno' Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/genhd.c') diff --git a/block/genhd.c b/block/genhd.c index b8defae2ec0..646e1d2507c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -534,7 +534,7 @@ void unlink_gendisk(struct gendisk *disk) /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for - * @part: returned partition index + * @partno: returned partition index * * This function gets the structure containing partitioning * information for the given device @devt. -- cgit v1.2.3