diff options
Diffstat (limited to 'fs')
302 files changed, 21924 insertions, 6428 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index cef8b18ceaa..86b203fc3c5 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -66,6 +66,13 @@ config GENERIC_ACL bool select FS_POSIX_ACL +menu "Caches" + +source "fs/fscache/Kconfig" +source "fs/cachefiles/Kconfig" + +endmenu + if BLOCK menu "CD-ROM/DVD Filesystems" @@ -169,6 +176,8 @@ source "fs/romfs/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" +source "fs/exofs/Kconfig" + endif # MISC_FILESYSTEMS menuconfig NETWORK_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 6e82a307bcd..70b2aed8713 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ - stack.o + stack.o fs_struct.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o @@ -63,6 +63,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line +obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 obj-$(CONFIG_EXT2_FS) += ext2/ @@ -116,7 +117,9 @@ obj-$(CONFIG_AFS_FS) += afs/ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ +obj-$(CONFIG_CACHEFILES) += cachefiles/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ +obj-$(CONFIG_EXOFS_FS) += exofs/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 7f83a46f2b7..dd9becca424 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -219,16 +219,20 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data) static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb); + struct super_block *sb = dentry->d_sb; + struct adfs_sb_info *sbi = ADFS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = ADFS_SUPER_MAGIC; - buf->f_namelen = asb->s_namelen; - buf->f_bsize = dentry->d_sb->s_blocksize; - buf->f_blocks = asb->s_size; - buf->f_files = asb->s_ids_per_zone * asb->s_map_size; + buf->f_namelen = sbi->s_namelen; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->s_size; + buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size; buf->f_bavail = - buf->f_bfree = adfs_map_free(dentry->d_sb); + buf->f_bfree = adfs_map_free(sb); buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); return 0; } diff --git a/fs/affs/super.c b/fs/affs/super.c index a19d64b582a..5ce695e707f 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -533,6 +533,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; int free; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size, AFFS_SB(sb)->s_reserved); @@ -543,6 +544,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved; buf->f_bfree = free; buf->f_bavail = free; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = 30; return 0; } diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index e7b522fe15e..5c4e61d3c77 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -19,3 +19,11 @@ config AFS_DEBUG See <file:Documentation/filesystems/afs.txt> for more information. If unsure, say N. + +config AFS_FSCACHE + bool "Provide AFS client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y + help + Say Y here if you want AFS data to be cached locally on disk through + the generic filesystem cache manager diff --git a/fs/afs/Makefile b/fs/afs/Makefile index a66671082cf..4f64b95d57b 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -2,7 +2,10 @@ # Makefile for Red Hat Linux AFS client. # +afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o + kafs-objs := \ + $(afs-cache-y) \ callback.o \ cell.o \ cmservice.o \ diff --git a/fs/afs/cache.c b/fs/afs/cache.c index de0d7de69ed..e2b1d3f1651 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -1,6 +1,6 @@ /* AFS caching stuff * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -9,248 +9,395 @@ * 2 of the License, or (at your option) any later version. */ -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_cell_cache_match(void *target, - const void *entry); -static void afs_cell_cache_update(void *source, void *entry); - -struct cachefs_index_def afs_cache_cell_index_def = { - .name = "cell_ix", - .data_size = sizeof(struct afs_cache_cell), - .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 }, - .match = afs_cell_cache_match, - .update = afs_cell_cache_update, +#include <linux/slab.h> +#include <linux/sched.h> +#include "internal.h" + +static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen); + +static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_vlocation_cache_check_aux( + void *cookie_netfs_data, const void *buffer, uint16_t buflen); + +static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); + +static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, + uint64_t *size); +static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen); +static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); + +struct fscache_netfs afs_cache_netfs = { + .name = "afs", + .version = 0, +}; + +struct fscache_cookie_def afs_cell_cache_index_def = { + .name = "AFS.cell", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_cell_cache_get_key, + .get_aux = afs_cell_cache_get_aux, + .check_aux = afs_cell_cache_check_aux, +}; + +struct fscache_cookie_def afs_vlocation_cache_index_def = { + .name = "AFS.vldb", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_vlocation_cache_get_key, + .get_aux = afs_vlocation_cache_get_aux, + .check_aux = afs_vlocation_cache_check_aux, +}; + +struct fscache_cookie_def afs_volume_cache_index_def = { + .name = "AFS.volume", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_volume_cache_get_key, +}; + +struct fscache_cookie_def afs_vnode_cache_index_def = { + .name = "AFS.vnode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = afs_vnode_cache_get_key, + .get_attr = afs_vnode_cache_get_attr, + .get_aux = afs_vnode_cache_get_aux, + .check_aux = afs_vnode_cache_check_aux, + .now_uncached = afs_vnode_cache_now_uncached, }; -#endif /* - * match a cell record obtained from the cache + * set the key for the index entry */ -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_cell_cache_match(void *target, - const void *entry) +static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) { - const struct afs_cache_cell *ccell = entry; - struct afs_cell *cell = target; + const struct afs_cell *cell = cookie_netfs_data; + uint16_t klen; - _enter("{%s},{%s}", ccell->name, cell->name); + _enter("%p,%p,%u", cell, buffer, bufmax); - if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) { - _leave(" = SUCCESS"); - return CACHEFS_MATCH_SUCCESS; - } + klen = strlen(cell->name); + if (klen > bufmax) + return 0; - _leave(" = FAILED"); - return CACHEFS_MATCH_FAILED; + memcpy(buffer, cell->name, klen); + return klen; } -#endif /* - * update a cell record in the cache + * provide new auxilliary cache data */ -#ifdef AFS_CACHING_SUPPORT -static void afs_cell_cache_update(void *source, void *entry) +static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) { - struct afs_cache_cell *ccell = entry; - struct afs_cell *cell = source; + const struct afs_cell *cell = cookie_netfs_data; + uint16_t dlen; - _enter("%p,%p", source, entry); + _enter("%p,%p,%u", cell, buffer, bufmax); - strncpy(ccell->name, cell->name, sizeof(ccell->name)); + dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]); + dlen = min(dlen, bufmax); + dlen &= ~(sizeof(cell->vl_addrs[0]) - 1); - memcpy(ccell->vl_servers, - cell->vl_addrs, - min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs))); + memcpy(buffer, cell->vl_addrs, dlen); + return dlen; +} +/* + * check that the auxilliary data indicates that the entry is still valid + */ +static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; } -#endif - -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_vlocation_cache_match(void *target, - const void *entry); -static void afs_vlocation_cache_update(void *source, void *entry); - -struct cachefs_index_def afs_vlocation_cache_index_def = { - .name = "vldb", - .data_size = sizeof(struct afs_cache_vlocation), - .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 }, - .match = afs_vlocation_cache_match, - .update = afs_vlocation_cache_update, -}; -#endif +/*****************************************************************************/ /* - * match a VLDB record stored in the cache - * - may also load target from entry + * set the key for the index entry */ -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_vlocation_cache_match(void *target, - const void *entry) +static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) { - const struct afs_cache_vlocation *vldb = entry; - struct afs_vlocation *vlocation = target; + const struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t klen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); + + klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name)); + if (klen > bufmax) + return 0; - _enter("{%s},{%s}", vlocation->vldb.name, vldb->name); + memcpy(buffer, vlocation->vldb.name, klen); - if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0 - ) { - if (!vlocation->valid || - vlocation->vldb.rtime == vldb->rtime + _leave(" = %u", klen); + return klen; +} + +/* + * provide new auxilliary cache data + */ +static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t dlen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); + + dlen = sizeof(struct afs_cache_vlocation); + dlen -= offsetof(struct afs_cache_vlocation, nservers); + if (dlen > bufmax) + return 0; + + memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen); + + _leave(" = %u", dlen); + return dlen; +} + +/* + * check that the auxilliary data indicates that the entry is still valid + */ +static +enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + const struct afs_cache_vlocation *cvldb; + struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t dlen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen); + + /* check the size of the data is what we're expecting */ + dlen = sizeof(struct afs_cache_vlocation); + dlen -= offsetof(struct afs_cache_vlocation, nservers); + if (dlen != buflen) + return FSCACHE_CHECKAUX_OBSOLETE; + + cvldb = container_of(buffer, struct afs_cache_vlocation, nservers); + + /* if what's on disk is more valid than what's in memory, then use the + * VL record from the cache */ + if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) { + memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen); + vlocation->valid = 1; + _leave(" = SUCCESS [c->m]"); + return FSCACHE_CHECKAUX_OKAY; + } + + /* need to update the cache if the cached info differs */ + if (memcmp(&vlocation->vldb, buffer, dlen) != 0) { + /* delete if the volume IDs for this name differ */ + if (memcmp(&vlocation->vldb.vid, &cvldb->vid, + sizeof(cvldb->vid)) != 0 ) { - vlocation->vldb = *vldb; - vlocation->valid = 1; - _leave(" = SUCCESS [c->m]"); - return CACHEFS_MATCH_SUCCESS; - } else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) { - /* delete if VIDs for this name differ */ - if (memcmp(&vlocation->vldb.vid, - &vldb->vid, - sizeof(vldb->vid)) != 0) { - _leave(" = DELETE"); - return CACHEFS_MATCH_SUCCESS_DELETE; - } - - _leave(" = UPDATE"); - return CACHEFS_MATCH_SUCCESS_UPDATE; - } else { - _leave(" = SUCCESS"); - return CACHEFS_MATCH_SUCCESS; + _leave(" = OBSOLETE"); + return FSCACHE_CHECKAUX_OBSOLETE; } + + _leave(" = UPDATE"); + return FSCACHE_CHECKAUX_NEEDS_UPDATE; } - _leave(" = FAILED"); - return CACHEFS_MATCH_FAILED; + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; } -#endif +/*****************************************************************************/ /* - * update a VLDB record stored in the cache + * set the key for the volume index entry */ -#ifdef AFS_CACHING_SUPPORT -static void afs_vlocation_cache_update(void *source, void *entry) +static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) { - struct afs_cache_vlocation *vldb = entry; - struct afs_vlocation *vlocation = source; + const struct afs_volume *volume = cookie_netfs_data; + uint16_t klen; + + _enter("{%u},%p,%u", volume->type, buffer, bufmax); + + klen = sizeof(volume->type); + if (klen > bufmax) + return 0; - _enter(""); + memcpy(buffer, &volume->type, sizeof(volume->type)); + + _leave(" = %u", klen); + return klen; - *vldb = vlocation->vldb; } -#endif - -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_volume_cache_match(void *target, - const void *entry); -static void afs_volume_cache_update(void *source, void *entry); - -struct cachefs_index_def afs_volume_cache_index_def = { - .name = "volume", - .data_size = sizeof(struct afs_cache_vhash), - .keys[0] = { CACHEFS_INDEX_KEYS_BIN, 1 }, - .keys[1] = { CACHEFS_INDEX_KEYS_BIN, 1 }, - .match = afs_volume_cache_match, - .update = afs_volume_cache_update, -}; -#endif +/*****************************************************************************/ /* - * match a volume hash record stored in the cache + * set the key for the index entry */ -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_volume_cache_match(void *target, - const void *entry) +static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) { - const struct afs_cache_vhash *vhash = entry; - struct afs_volume *volume = target; + const struct afs_vnode *vnode = cookie_netfs_data; + uint16_t klen; - _enter("{%u},{%u}", volume->type, vhash->vtype); + _enter("{%x,%x,%llx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, bufmax); - if (volume->type == vhash->vtype) { - _leave(" = SUCCESS"); - return CACHEFS_MATCH_SUCCESS; - } + klen = sizeof(vnode->fid.vnode); + if (klen > bufmax) + return 0; + + memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode)); - _leave(" = FAILED"); - return CACHEFS_MATCH_FAILED; + _leave(" = %u", klen); + return klen; } -#endif /* - * update a volume hash record stored in the cache + * provide updated file attributes */ -#ifdef AFS_CACHING_SUPPORT -static void afs_volume_cache_update(void *source, void *entry) +static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, + uint64_t *size) { - struct afs_cache_vhash *vhash = entry; - struct afs_volume *volume = source; + const struct afs_vnode *vnode = cookie_netfs_data; - _enter(""); + _enter("{%x,%x,%llx},", + vnode->fid.vnode, vnode->fid.unique, + vnode->status.data_version); - vhash->vtype = volume->type; + *size = vnode->status.size; } -#endif - -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_vnode_cache_match(void *target, - const void *entry); -static void afs_vnode_cache_update(void *source, void *entry); - -struct cachefs_index_def afs_vnode_cache_index_def = { - .name = "vnode", - .data_size = sizeof(struct afs_cache_vnode), - .keys[0] = { CACHEFS_INDEX_KEYS_BIN, 4 }, - .match = afs_vnode_cache_match, - .update = afs_vnode_cache_update, -}; -#endif /* - * match a vnode record stored in the cache + * provide new auxilliary cache data + */ +static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vnode *vnode = cookie_netfs_data; + uint16_t dlen; + + _enter("{%x,%x,%Lx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, bufmax); + + dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); + if (dlen > bufmax) + return 0; + + memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique)); + buffer += sizeof(vnode->fid.unique); + memcpy(buffer, &vnode->status.data_version, + sizeof(vnode->status.data_version)); + + _leave(" = %u", dlen); + return dlen; +} + +/* + * check that the auxilliary data indicates that the entry is still valid */ -#ifdef AFS_CACHING_SUPPORT -static cachefs_match_val_t afs_vnode_cache_match(void *target, - const void *entry) +static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) { - const struct afs_cache_vnode *cvnode = entry; - struct afs_vnode *vnode = target; - - _enter("{%x,%x,%Lx},{%x,%x,%Lx}", - vnode->fid.vnode, - vnode->fid.unique, - vnode->status.version, - cvnode->vnode_id, - cvnode->vnode_unique, - cvnode->data_version); - - if (vnode->fid.vnode != cvnode->vnode_id) { - _leave(" = FAILED"); - return CACHEFS_MATCH_FAILED; + struct afs_vnode *vnode = cookie_netfs_data; + uint16_t dlen; + + _enter("{%x,%x,%llx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, buflen); + + /* check the size of the data is what we're expecting */ + dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); + if (dlen != buflen) { + _leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen); + return FSCACHE_CHECKAUX_OBSOLETE; } - if (vnode->fid.unique != cvnode->vnode_unique || - vnode->status.version != cvnode->data_version) { - _leave(" = DELETE"); - return CACHEFS_MATCH_SUCCESS_DELETE; + if (memcmp(buffer, + &vnode->fid.unique, + sizeof(vnode->fid.unique) + ) != 0) { + unsigned unique; + + memcpy(&unique, buffer, sizeof(unique)); + + _leave(" = OBSOLETE [uniq %x != %x]", + unique, vnode->fid.unique); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + if (memcmp(buffer + sizeof(vnode->fid.unique), + &vnode->status.data_version, + sizeof(vnode->status.data_version) + ) != 0) { + afs_dataversion_t version; + + memcpy(&version, buffer + sizeof(vnode->fid.unique), + sizeof(version)); + + _leave(" = OBSOLETE [vers %llx != %llx]", + version, vnode->status.data_version); + return FSCACHE_CHECKAUX_OBSOLETE; } _leave(" = SUCCESS"); - return CACHEFS_MATCH_SUCCESS; + return FSCACHE_CHECKAUX_OKAY; } -#endif /* - * update a vnode record stored in the cache + * indication the cookie is no longer uncached + * - this function is called when the backing store currently caching a cookie + * is removed + * - the netfs should use this to clean up any markers indicating cached pages + * - this is mandatory for any object that may have data */ -#ifdef AFS_CACHING_SUPPORT -static void afs_vnode_cache_update(void *source, void *entry) +static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) { - struct afs_cache_vnode *cvnode = entry; - struct afs_vnode *vnode = source; + struct afs_vnode *vnode = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + _enter("{%x,%x,%Lx}", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); + + pagevec_init(&pvec, 0); + first = 0; + + for (;;) { + /* grab a bunch of pages to clean */ + nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; - _enter(""); + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } - cvnode->vnode_id = vnode->fid.vnode; - cvnode->vnode_unique = vnode->fid.unique; - cvnode->data_version = vnode->status.version; + _leave(""); } -#endif diff --git a/fs/afs/cache.h b/fs/afs/cache.h index 36a3642cf90..5c4f6b499e9 100644 --- a/fs/afs/cache.h +++ b/fs/afs/cache.h @@ -1,6 +1,6 @@ /* AFS local cache management interface * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -9,15 +9,4 @@ * 2 of the License, or (at your option) any later version. */ -#ifndef AFS_CACHE_H -#define AFS_CACHE_H - -#undef AFS_CACHING_SUPPORT - -#include <linux/mm.h> -#ifdef AFS_CACHING_SUPPORT -#include <linux/cachefs.h> -#endif -#include "types.h" - -#endif /* AFS_CACHE_H */ +#include <linux/fscache.h> diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 5e1df14e16b..e19c13f059e 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -147,12 +147,11 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist) if (ret < 0) goto error; -#ifdef AFS_CACHING_SUPPORT - /* put it up for caching */ - cachefs_acquire_cookie(afs_cache_netfs.primary_index, - &afs_vlocation_cache_index_def, - cell, - &cell->cache); +#ifdef CONFIG_AFS_FSCACHE + /* put it up for caching (this never returns an error) */ + cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, + &afs_cell_cache_index_def, + cell); #endif /* add to the cell lists */ @@ -362,10 +361,9 @@ static void afs_cell_destroy(struct afs_cell *cell) list_del_init(&cell->proc_link); up_write(&afs_proc_cells_sem); -#ifdef AFS_CACHING_SUPPORT - cachefs_relinquish_cookie(cell->cache, 0); +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(cell->cache, 0); #endif - key_put(cell->anonymous_key); kfree(cell); diff --git a/fs/afs/file.c b/fs/afs/file.c index a3901769a96..7a1d942ef68 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -23,6 +23,9 @@ static void afs_invalidatepage(struct page *page, unsigned long offset); static int afs_releasepage(struct page *page, gfp_t gfp_flags); static int afs_launder_page(struct page *page); +static int afs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + const struct file_operations afs_file_operations = { .open = afs_open, .release = afs_release, @@ -46,6 +49,7 @@ const struct inode_operations afs_file_inode_operations = { const struct address_space_operations afs_fs_aops = { .readpage = afs_readpage, + .readpages = afs_readpages, .set_page_dirty = afs_set_page_dirty, .launder_page = afs_launder_page, .releasepage = afs_releasepage, @@ -101,37 +105,18 @@ int afs_release(struct inode *inode, struct file *file) /* * deal with notification that a page was read from the cache */ -#ifdef AFS_CACHING_SUPPORT -static void afs_readpage_read_complete(void *cookie_data, - struct page *page, - void *data, - int error) +static void afs_file_readpage_read_complete(struct page *page, + void *data, + int error) { - _enter("%p,%p,%p,%d", cookie_data, page, data, error); + _enter("%p,%p,%d", page, data, error); - if (error) - SetPageError(page); - else + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) SetPageUptodate(page); unlock_page(page); - } -#endif - -/* - * deal with notification that a page was written to the cache - */ -#ifdef AFS_CACHING_SUPPORT -static void afs_readpage_write_complete(void *cookie_data, - struct page *page, - void *data, - int error) -{ - _enter("%p,%p,%p,%d", cookie_data, page, data, error); - - unlock_page(page); -} -#endif /* * AFS read page from file, directory or symlink @@ -161,9 +146,9 @@ static int afs_readpage(struct file *file, struct page *page) if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) goto error; -#ifdef AFS_CACHING_SUPPORT /* is it cached? */ - ret = cachefs_read_or_alloc_page(vnode->cache, +#ifdef CONFIG_AFS_FSCACHE + ret = fscache_read_or_alloc_page(vnode->cache, page, afs_file_readpage_read_complete, NULL, @@ -171,20 +156,21 @@ static int afs_readpage(struct file *file, struct page *page) #else ret = -ENOBUFS; #endif - switch (ret) { - /* read BIO submitted and wb-journal entry found */ - case 1: - BUG(); // TODO - handle wb-journal match - /* read BIO submitted (page in cache) */ case 0: break; - /* no page available in cache */ - case -ENOBUFS: + /* page not yet cached */ case -ENODATA: + _debug("cache said ENODATA"); + goto go_on; + + /* page will not be cached */ + case -ENOBUFS: + _debug("cache said ENOBUFS"); default: + go_on: offset = page->index << PAGE_CACHE_SHIFT; len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE); @@ -198,27 +184,25 @@ static int afs_readpage(struct file *file, struct page *page) set_bit(AFS_VNODE_DELETED, &vnode->flags); ret = -ESTALE; } -#ifdef AFS_CACHING_SUPPORT - cachefs_uncache_page(vnode->cache, page); + +#ifdef CONFIG_AFS_FSCACHE + fscache_uncache_page(vnode->cache, page); #endif + BUG_ON(PageFsCache(page)); goto error; } SetPageUptodate(page); -#ifdef AFS_CACHING_SUPPORT - if (cachefs_write_page(vnode->cache, - page, - afs_file_readpage_write_complete, - NULL, - GFP_KERNEL) != 0 - ) { - cachefs_uncache_page(vnode->cache, page); - unlock_page(page); + /* send the page to the cache */ +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page) && + fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) { + fscache_uncache_page(vnode->cache, page); + BUG_ON(PageFsCache(page)); } -#else - unlock_page(page); #endif + unlock_page(page); } _leave(" = 0"); @@ -232,34 +216,59 @@ error: } /* - * invalidate part or all of a page + * read a set of pages */ -static void afs_invalidatepage(struct page *page, unsigned long offset) +static int afs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) { - int ret = 1; + struct afs_vnode *vnode; + int ret = 0; - _enter("{%lu},%lu", page->index, offset); + _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages); - BUG_ON(!PageLocked(page)); + vnode = AFS_FS_I(mapping->host); + if (vnode->flags & AFS_VNODE_DELETED) { + _leave(" = -ESTALE"); + return -ESTALE; + } - if (PagePrivate(page)) { - /* We release buffers only if the entire page is being - * invalidated. - * The get_block cached value has been unconditionally - * invalidated, so real IO is not possible anymore. - */ - if (offset == 0) { - BUG_ON(!PageLocked(page)); - - ret = 0; - if (!PageWriteback(page)) - ret = page->mapping->a_ops->releasepage(page, - 0); - /* possibly should BUG_ON(!ret); - neilb */ - } + /* attempt to read as many of the pages as possible */ +#ifdef CONFIG_AFS_FSCACHE + ret = fscache_read_or_alloc_pages(vnode->cache, + mapping, + pages, + &nr_pages, + afs_file_readpage_read_complete, + NULL, + mapping_gfp_mask(mapping)); +#else + ret = -ENOBUFS; +#endif + + switch (ret) { + /* all pages are being read from the cache */ + case 0: + BUG_ON(!list_empty(pages)); + BUG_ON(nr_pages != 0); + _leave(" = 0 [reading all]"); + return 0; + + /* there were pages that couldn't be read from the cache */ + case -ENODATA: + case -ENOBUFS: + break; + + /* other error */ + default: + _leave(" = %d", ret); + return ret; } - _leave(" = %d", ret); + /* load the missing pages from the network */ + ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file); + + _leave(" = %d [netting]", ret); + return ret; } /* @@ -273,25 +282,82 @@ static int afs_launder_page(struct page *page) } /* - * release a page and cleanup its private data + * invalidate part or all of a page + * - release a page and clean up its private data if offset is 0 (indicating + * the entire page) + */ +static void afs_invalidatepage(struct page *page, unsigned long offset) +{ + struct afs_writeback *wb = (struct afs_writeback *) page_private(page); + + _enter("{%lu},%lu", page->index, offset); + + BUG_ON(!PageLocked(page)); + + /* we clean up only if the entire page is being invalidated */ + if (offset == 0) { +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page)) { + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + fscache_wait_on_page_write(vnode->cache, page); + fscache_uncache_page(vnode->cache, page); + ClearPageFsCache(page); + } +#endif + + if (PagePrivate(page)) { + if (wb && !PageWriteback(page)) { + set_page_private(page, 0); + afs_put_writeback(wb); + } + + if (!page_private(page)) + ClearPagePrivate(page); + } + } + + _leave(""); +} + +/* + * release a page and clean up its private state if it's not busy + * - return true if the page can now be released, false if not */ static int afs_releasepage(struct page *page, gfp_t gfp_flags) { + struct afs_writeback *wb = (struct afs_writeback *) page_private(page); struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - struct afs_writeback *wb; _enter("{{%x:%u}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, gfp_flags); + /* deny if page is being written to the cache and the caller hasn't + * elected to wait */ +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page)) { + if (fscache_check_page_write(vnode->cache, page)) { + if (!(gfp_flags & __GFP_WAIT)) { + _leave(" = F [cache busy]"); + return 0; + } + fscache_wait_on_page_write(vnode->cache, page); + } + + fscache_uncache_page(vnode->cache, page); + ClearPageFsCache(page); + } +#endif + if (PagePrivate(page)) { - wb = (struct afs_writeback *) page_private(page); - ASSERT(wb != NULL); - set_page_private(page, 0); + if (wb) { + set_page_private(page, 0); + afs_put_writeback(wb); + } ClearPagePrivate(page); - afs_put_writeback(wb); } - _leave(" = 0"); - return 0; + /* indicate that the page can be released */ + _leave(" = T"); + return 1; } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index bb47217f6a1..c048f065875 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -61,6 +61,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) return -EBADMSG; } +#ifdef CONFIG_AFS_FSCACHE + if (vnode->status.size != inode->i_size) + fscache_attr_changed(vnode->cache); +#endif + inode->i_nlink = vnode->status.nlink; inode->i_uid = vnode->status.owner; inode->i_gid = 0; @@ -149,15 +154,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, return inode; } -#ifdef AFS_CACHING_SUPPORT - /* set up caching before reading the status, as fetch-status reads the - * first page of symlinks to see if they're really mntpts */ - cachefs_acquire_cookie(vnode->volume->cache, - NULL, - vnode, - &vnode->cache); -#endif - if (!status) { /* it's a remotely extant inode */ set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); @@ -183,6 +179,15 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, } } + /* set up caching before mapping the status, as map-status reads the + * first page of symlinks to see if they're really mountpoints */ + inode->i_size = vnode->status.size; +#ifdef CONFIG_AFS_FSCACHE + vnode->cache = fscache_acquire_cookie(vnode->volume->cache, + &afs_vnode_cache_index_def, + vnode); +#endif + ret = afs_inode_map_status(vnode, key); if (ret < 0) goto bad_inode; @@ -196,6 +201,10 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, /* failure */ bad_inode: +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vnode->cache, 0); + vnode->cache = NULL; +#endif iget_failed(inode); _leave(" = %d [bad]", ret); return ERR_PTR(ret); @@ -340,8 +349,8 @@ void afs_clear_inode(struct inode *inode) ASSERT(list_empty(&vnode->writebacks)); ASSERT(!vnode->cb_promised); -#ifdef AFS_CACHING_SUPPORT - cachefs_relinquish_cookie(vnode->cache, 0); +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vnode->cache, 0); vnode->cache = NULL; #endif diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 67f259d99cd..106be66dafd 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -21,6 +21,7 @@ #include "afs.h" #include "afs_vl.h" +#include "cache.h" #define AFS_CELL_MAX_ADDRS 15 @@ -193,8 +194,8 @@ struct afs_cell { struct key *anonymous_key; /* anonymous user key for this cell */ struct list_head proc_link; /* /proc cell list link */ struct proc_dir_entry *proc_dir; /* /proc dir for this cell */ -#ifdef AFS_CACHING_SUPPORT - struct cachefs_cookie *cache; /* caching cookie */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ #endif /* server record management */ @@ -249,8 +250,8 @@ struct afs_vlocation { struct list_head grave; /* link in master graveyard list */ struct list_head update; /* link in master update list */ struct afs_cell *cell; /* cell to which volume belongs */ -#ifdef AFS_CACHING_SUPPORT - struct cachefs_cookie *cache; /* caching cookie */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ #endif struct afs_cache_vlocation vldb; /* volume information DB record */ struct afs_volume *vols[3]; /* volume access record pointer (index by type) */ @@ -302,8 +303,8 @@ struct afs_volume { atomic_t usage; struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */ struct afs_vlocation *vlocation; /* volume location */ -#ifdef AFS_CACHING_SUPPORT - struct cachefs_cookie *cache; /* caching cookie */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ #endif afs_volid_t vid; /* volume ID */ afs_voltype_t type; /* type of volume */ @@ -333,8 +334,8 @@ struct afs_vnode { struct afs_server *server; /* server currently supplying this file */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ -#ifdef AFS_CACHING_SUPPORT - struct cachefs_cookie *cache; /* caching cookie */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ #endif struct afs_permits *permits; /* cache of permits so far obtained */ struct mutex permits_lock; /* lock for altering permits list */ @@ -428,6 +429,22 @@ struct afs_uuid { /*****************************************************************************/ /* + * cache.c + */ +#ifdef CONFIG_AFS_FSCACHE +extern struct fscache_netfs afs_cache_netfs; +extern struct fscache_cookie_def afs_cell_cache_index_def; +extern struct fscache_cookie_def afs_vlocation_cache_index_def; +extern struct fscache_cookie_def afs_volume_cache_index_def; +extern struct fscache_cookie_def afs_vnode_cache_index_def; +#else +#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_vlocation_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL) +#endif + +/* * callback.c */ extern void afs_init_callback_state(struct afs_server *); @@ -446,9 +463,6 @@ extern void afs_callback_update_kill(void); */ extern struct rw_semaphore afs_proc_cells_sem; extern struct list_head afs_proc_cells; -#ifdef AFS_CACHING_SUPPORT -extern struct cachefs_index_def afs_cache_cell_index_def; -#endif #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) extern int afs_cell_init(char *); @@ -554,9 +568,6 @@ extern void afs_clear_inode(struct inode *); * main.c */ extern struct afs_uuid afs_uuid; -#ifdef AFS_CACHING_SUPPORT -extern struct cachefs_netfs afs_cache_netfs; -#endif /* * misc.c @@ -637,10 +648,6 @@ extern int afs_get_MAC_address(u8 *, size_t); /* * vlclient.c */ -#ifdef AFS_CACHING_SUPPORT -extern struct cachefs_index_def afs_vlocation_cache_index_def; -#endif - extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *, const char *, struct afs_cache_vlocation *, const struct afs_wait_mode *); @@ -664,12 +671,6 @@ extern void afs_vlocation_purge(void); /* * vnode.c */ -#ifdef AFS_CACHING_SUPPORT -extern struct cachefs_index_def afs_vnode_cache_index_def; -#endif - -extern struct afs_timer_ops afs_vnode_cb_timed_out_ops; - static inline struct afs_vnode *AFS_FS_I(struct inode *inode) { return container_of(inode, struct afs_vnode, vfs_inode); @@ -711,10 +712,6 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *); /* * volume.c */ -#ifdef AFS_CACHING_SUPPORT -extern struct cachefs_index_def afs_volume_cache_index_def; -#endif - #define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0) extern void afs_put_volume(struct afs_volume *); diff --git a/fs/afs/main.c b/fs/afs/main.c index 2d3e5d4fb9f..66d54d348c5 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -1,6 +1,6 @@ /* AFS client file system * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -29,18 +29,6 @@ static char *rootcell; module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); -#ifdef AFS_CACHING_SUPPORT -static struct cachefs_netfs_operations afs_cache_ops = { - .get_page_cookie = afs_cache_get_page_cookie, -}; - -struct cachefs_netfs afs_cache_netfs = { - .name = "afs", - .version = 0, - .ops = &afs_cache_ops, -}; -#endif - struct afs_uuid afs_uuid; /* @@ -104,10 +92,9 @@ static int __init afs_init(void) if (ret < 0) return ret; -#ifdef AFS_CACHING_SUPPORT +#ifdef CONFIG_AFS_FSCACHE /* we want to be able to cache */ - ret = cachefs_register_netfs(&afs_cache_netfs, - &afs_cache_cell_index_def); + ret = fscache_register_netfs(&afs_cache_netfs); if (ret < 0) goto error_cache; #endif @@ -142,8 +129,8 @@ error_fs: error_open_socket: error_vl_update_init: error_cell_init: -#ifdef AFS_CACHING_SUPPORT - cachefs_unregister_netfs(&afs_cache_netfs); +#ifdef CONFIG_AFS_FSCACHE + fscache_unregister_netfs(&afs_cache_netfs); error_cache: #endif afs_callback_update_kill(); @@ -175,8 +162,8 @@ static void __exit afs_exit(void) afs_vlocation_purge(); flush_scheduled_work(); afs_cell_purge(); -#ifdef AFS_CACHING_SUPPORT - cachefs_unregister_netfs(&afs_cache_netfs); +#ifdef CONFIG_AFS_FSCACHE + fscache_unregister_netfs(&afs_cache_netfs); #endif afs_proc_cleanup(); rcu_barrier(); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 78db4953a80..2b9e2d03a39 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -173,9 +173,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) if (PageError(page)) goto error; - buf = kmap(page); + buf = kmap_atomic(page, KM_USER0); memcpy(devname, buf, size); - kunmap(page); + kunmap_atomic(buf, KM_USER0); page_cache_release(page); page = NULL; diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 849fc3160cb..ec2a7431e45 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -281,9 +281,8 @@ static void afs_vlocation_apply_update(struct afs_vlocation *vl, vl->vldb = *vldb; -#ifdef AFS_CACHING_SUPPORT - /* update volume entry in local cache */ - cachefs_update_cookie(vl->cache); +#ifdef CONFIG_AFS_FSCACHE + fscache_update_cookie(vl->cache); #endif } @@ -304,11 +303,9 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl, memset(&vldb, 0, sizeof(vldb)); /* see if we have an in-cache copy (will set vl->valid if there is) */ -#ifdef AFS_CACHING_SUPPORT - cachefs_acquire_cookie(cell->cache, - &afs_volume_cache_index_def, - vlocation, - &vl->cache); +#ifdef CONFIG_AFS_FSCACHE + vl->cache = fscache_acquire_cookie(vl->cell->cache, + &afs_vlocation_cache_index_def, vl); #endif if (vl->valid) { @@ -420,6 +417,11 @@ fill_in_record: spin_unlock(&vl->lock); wake_up(&vl->waitq); + /* update volume entry in local cache */ +#ifdef CONFIG_AFS_FSCACHE + fscache_update_cookie(vl->cache); +#endif + /* schedule for regular updates */ afs_vlocation_queue_for_updates(vl); goto success; @@ -465,7 +467,7 @@ found_in_memory: spin_unlock(&vl->lock); success: - _leave(" = %p",vl); + _leave(" = %p", vl); return vl; error_abandon: @@ -523,10 +525,9 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl) { _enter("%p", vl); -#ifdef AFS_CACHING_SUPPORT - cachefs_relinquish_cookie(vl->cache, 0); +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vl->cache, 0); #endif - afs_put_cell(vl->cell); kfree(vl); } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 8bab0e3437f..a353e69e239 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -124,13 +124,11 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) } /* attach the cache and volume location */ -#ifdef AFS_CACHING_SUPPORT - cachefs_acquire_cookie(vlocation->cache, - &afs_vnode_cache_index_def, - volume, - &volume->cache); +#ifdef CONFIG_AFS_FSCACHE + volume->cache = fscache_acquire_cookie(vlocation->cache, + &afs_volume_cache_index_def, + volume); #endif - afs_get_vlocation(vlocation); volume->vlocation = vlocation; @@ -194,8 +192,8 @@ void afs_put_volume(struct afs_volume *volume) up_write(&vlocation->cell->vl_sem); /* finish cleaning up the volume */ -#ifdef AFS_CACHING_SUPPORT - cachefs_relinquish_cookie(volume->cache, 0); +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(volume->cache, 0); #endif afs_put_vlocation(vlocation); diff --git a/fs/afs/write.c b/fs/afs/write.c index 3fb36d43362..c2e7a7ff008 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -780,3 +780,24 @@ int afs_fsync(struct file *file, struct dentry *dentry, int datasync) _leave(" = %d", ret); return ret; } + +/* + * notification that a previously read-only page is about to become writable + * - if it returns an error, the caller will deliver a bus error signal + */ +int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); + + _enter("{{%x:%u}},{%lx}", + vnode->fid.vid, vnode->fid.vnode, page->index); + + /* wait for the page to be written to the cache before we allow it to + * be modified */ +#ifdef CONFIG_AFS_FSCACHE + fscache_wait_on_page_write(vnode->cache, page); +#endif + + _leave(" = 0"); + return 0; +} diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index a76803108d0..b7ff33c6310 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -186,6 +186,8 @@ int autofs4_expire_wait(struct dentry *dentry); int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); +int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when); int autofs4_expire_multi(struct super_block *, struct vfsmount *, struct autofs_sb_info *, int __user *); struct dentry *autofs4_expire_direct(struct super_block *sb, diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 025e105bffe..9e5ae8a4f5c 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -525,40 +525,13 @@ static int autofs_dev_ioctl_expire(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - struct dentry *dentry; struct vfsmount *mnt; - int err = -EAGAIN; int how; how = param->expire.how; mnt = fp->f_path.mnt; - if (autofs_type_trigger(sbi->type)) - dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); - else - dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); - - if (dentry) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); - - /* - * This is synchronous because it makes the daemon a - * little easier - */ - err = autofs4_wait(sbi, dentry, NFY_EXPIRE); - - spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_MOUNTPOINT) { - ino->flags &= ~AUTOFS_INF_MOUNTPOINT; - sbi->sb->s_root->d_mounted++; - } - ino->flags &= ~AUTOFS_INF_EXPIRING; - complete_all(&ino->expire_complete); - spin_unlock(&sbi->fs_lock); - dput(dentry); - } - - return err; + return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how); } /* Check if autofs mount point is in use */ diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index e3bd50776f9..75f7ddacf7d 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -478,22 +478,16 @@ int autofs4_expire_run(struct super_block *sb, return ret; } -/* Call repeatedly until it returns -EAGAIN, meaning there's nothing - more to be done */ -int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int __user *arg) +int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when) { struct dentry *dentry; int ret = -EAGAIN; - int do_now = 0; - - if (arg && get_user(do_now, arg)) - return -EFAULT; if (autofs_type_trigger(sbi->type)) - dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); + dentry = autofs4_expire_direct(sb, mnt, sbi, when); else - dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); + dentry = autofs4_expire_indirect(sb, mnt, sbi, when); if (dentry) { struct autofs_info *ino = autofs4_dentry_ino(dentry); @@ -516,3 +510,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, return ret; } +/* Call repeatedly until it returns -EAGAIN, meaning there's nothing + more to be done */ +int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int __user *arg) +{ + int do_now = 0; + + if (arg && get_user(do_now, arg)) + return -EFAULT; + + return autofs4_do_expire_multi(sb, mnt, sbi, do_now); +} + diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 74b1469a950..e383bf0334f 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -485,22 +485,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); - expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name); - if (expiring) { - /* - * If we are racing with expire the request might not - * be quite complete but the directory has been removed - * so it must have been successful, so just wait for it. - */ - ino = autofs4_dentry_ino(expiring); - autofs4_expire_wait(expiring); - spin_lock(&sbi->lookup_lock); - if (!list_empty(&ino->expiring)) - list_del_init(&ino->expiring); - spin_unlock(&sbi->lookup_lock); - dput(expiring); - } - unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name); if (unhashed) dentry = unhashed; @@ -538,14 +522,31 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s } if (!oz_mode) { + mutex_unlock(&dir->i_mutex); + expiring = autofs4_lookup_expiring(sbi, + dentry->d_parent, + &dentry->d_name); + if (expiring) { + /* + * If we are racing with expire the request might not + * be quite complete but the directory has been removed + * so it must have been successful, so just wait for it. + */ + ino = autofs4_dentry_ino(expiring); + autofs4_expire_wait(expiring); + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->expiring)) + list_del_init(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + dput(expiring); + } + spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - if (dentry->d_op && dentry->d_op->d_revalidate) { - mutex_unlock(&dir->i_mutex); + if (dentry->d_op && dentry->d_op->d_revalidate) (dentry->d_op->d_revalidate)(dentry, nd); - mutex_lock(&dir->i_mutex); - } + mutex_lock(&dir->i_mutex); } /* diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index d06cb023ad0..76afd0d6b86 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -900,6 +900,7 @@ static int befs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); befs_debug(sb, "---> befs_statfs()"); @@ -910,6 +911,8 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = 0; /* UNKNOWN */ buf->f_ffree = 0; /* UNKNOWN */ + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = BEFS_NAME_LEN; befs_debug(sb, "<--- befs_statfs()"); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 33b7235f853..40381df3486 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -12,8 +12,6 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/fs.h> -#include <linux/stat.h> -#include <linux/time.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/errno.h> @@ -21,20 +19,15 @@ #include <linux/binfmts.h> #include <linux/string.h> #include <linux/file.h> -#include <linux/fcntl.h> -#include <linux/ptrace.h> #include <linux/slab.h> -#include <linux/shm.h> #include <linux/personality.h> #include <linux/elfcore.h> #include <linux/init.h> #include <linux/highuid.h> -#include <linux/smp.h> #include <linux/compiler.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/security.h> -#include <linux/syscalls.h> #include <linux/random.h> #include <linux/elf.h> #include <linux/utsname.h> @@ -576,7 +569,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata; unsigned long elf_bss, elf_brk; - int elf_exec_fileno; int retval, i; unsigned int size; unsigned long elf_entry; @@ -631,12 +623,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out_free_ph; } - retval = get_unused_fd(); - if (retval < 0) - goto out_free_ph; - get_file(bprm->file); - fd_install(elf_exec_fileno = retval, bprm->file); - elf_ppnt = elf_phdata; elf_bss = 0; elf_brk = 0; @@ -655,13 +641,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) retval = -ENOEXEC; if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2) - goto out_free_file; + goto out_free_ph; retval = -ENOMEM; elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL); if (!elf_interpreter) - goto out_free_file; + goto out_free_ph; retval = kernel_read(bprm->file, elf_ppnt->p_offset, elf_interpreter, @@ -956,8 +942,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) kfree(elf_phdata); - sys_close(elf_exec_fileno); - set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES @@ -1028,8 +1012,6 @@ out_free_dentry: fput(interpreter); out_free_interp: kfree(elf_interpreter); -out_free_file: - sys_close(elf_exec_fileno); out_free_ph: kfree(elf_phdata); goto out; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index f3e72c5c19f..70cfc4b84ae 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -972,9 +972,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( params->elfhdr_addr = seg->addr; /* clear any space allocated but not loaded */ - if (phdr->p_filesz < phdr->p_memsz) - clear_user((void *) (seg->addr + phdr->p_filesz), - phdr->p_memsz - phdr->p_filesz); + if (phdr->p_filesz < phdr->p_memsz) { + ret = clear_user((void *) (seg->addr + phdr->p_filesz), + phdr->p_memsz - phdr->p_filesz); + if (ret) + return ret; + } if (mm) { if (phdr->p_flags & PF_X) { @@ -1014,7 +1017,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; unsigned long load_addr, delta_vaddr; - int loop, dvset; + int loop, dvset, ret; load_addr = params->load_addr; delta_vaddr = 0; @@ -1114,7 +1117,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, * PT_LOAD */ if (prot & PROT_WRITE && disp > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); - clear_user((void __user *) maddr, disp); + ret = clear_user((void __user *) maddr, disp); + if (ret) + return ret; maddr += disp; } @@ -1149,15 +1154,19 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (prot & PROT_WRITE && excess1 > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr + phdr->p_filesz, excess1); - clear_user((void __user *) maddr + phdr->p_filesz, - excess1); + ret = clear_user((void __user *) maddr + phdr->p_filesz, + excess1); + if (ret) + return ret; } #else if (excess > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr + phdr->p_filesz, excess); - clear_user((void *) maddr + phdr->p_filesz, excess); + ret = clear_user((void *) maddr + phdr->p_filesz, excess); + if (ret) + return ret; } #endif diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 08644a61616..eff74b9c9e7 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -188,7 +188,6 @@ out: static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) { - int som_exec_fileno; int retval; unsigned int size; unsigned long som_entry; @@ -220,12 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto out_free; } - retval = get_unused_fd(); - if (retval < 0) - goto out_free; - get_file(bprm->file); - fd_install(som_exec_fileno = retval, bprm->file); - /* Flush all traces of the currently running executable */ retval = flush_old_exec(bprm); if (retval) diff --git a/fs/block_dev.c b/fs/block_dev.c index 8c3c6899ccf..f45dbc18dd1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev) } return sync_blockdev(bdev); } +EXPORT_SYMBOL(fsync_bdev); /** * freeze_bdev -- lock a filesystem and force it into a consistent state diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d2cf5a54a4b..9adf5e4f7e9 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ - compression.o + compression.o delayed-ref.o else # Normal Makefile diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 1d53b62dbba..7fdd184a528 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (IS_POSIXACL(dir) && acl) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 72677ce2b74..b30986f00b9 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -66,6 +66,12 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; + /* + * list for tracking inodes that must be sent to disk before a + * rename or truncate commit + */ + struct list_head ordered_operations; + /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; @@ -86,12 +92,6 @@ struct btrfs_inode { */ u64 logged_trans; - /* - * trans that last made a change that should be fully fsync'd. This - * gets reset to zero each time the inode is logged - */ - u64 log_dirty_trans; - /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file */ @@ -121,6 +121,25 @@ struct btrfs_inode { /* the start of block group preferred for allocations. */ u64 block_group; + /* the fsync log has some corner cases that mean we have to check + * directories to see if any unlinks have been done before + * the directory was logged. See tree-log.c for all the + * details + */ + u64 last_unlink_trans; + + /* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + * + * yes, its silly to have a single bitflag, but we might grow more + * of these. + */ + unsigned ordered_data_close:1; + struct inode vfs_inode; }; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 37f31b5529a..dbb72412463 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, * empty_size -- a hint that you plan on doing more cow. This is the size in * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. - * - * prealloc_dest -- if you have already reserved a destination for the cow, - * this uses that block instead of allocating a new one. - * btrfs_alloc_reserved_extent is used to finish the allocation. */ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size, - u64 prealloc_dest) + u64 search_start, u64 empty_size) { u64 parent_start; struct extent_buffer *cow; @@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, level = btrfs_header_level(buf); nritems = btrfs_header_nritems(buf); - if (prealloc_dest) { - struct btrfs_key ins; - - ins.objectid = prealloc_dest; - ins.offset = buf->len; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - ret = btrfs_alloc_reserved_extent(trans, root, parent_start, - root->root_key.objectid, - trans->transid, level, &ins); - BUG_ON(ret); - cow = btrfs_init_new_buffer(trans, root, prealloc_dest, - buf->len, level); - } else { - cow = btrfs_alloc_free_block(trans, root, buf->len, - parent_start, - root->root_key.objectid, - trans->transid, level, - search_start, empty_size); - } + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, root->root_key.objectid, + trans->transid, level, + search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, u64 prealloc_dest) + struct extent_buffer **cow_ret) { u64 search_start; int ret; @@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_owner(buf) == root->root_key.objectid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *cow_ret = buf; - WARN_ON(prealloc_dest); return 0; } @@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0, - prealloc_dest); + parent_slot, cow_ret, search_start, 0); return ret; } @@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize), 0); + (end_slot - i) * blocksize)); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(!child); btrfs_tree_lock(child); btrfs_set_lock_blocking(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child); BUG_ON(ret); spin_lock(&root->node_lock); @@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, spin_unlock(&root->node_lock); ret = btrfs_update_extent_ref(trans, root, child->start, + child->len, mid->start, child->start, root->root_key.objectid, trans->transid, level - 1); @@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; + if (trans->transaction->delayed_refs.flushing && + btrfs_header_nritems(mid) > 2) + return 0; + if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; @@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(left); btrfs_set_lock_blocking(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left, 0); + parent, pslot - 1, &left); if (wret) { ret = wret; goto enospc; @@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(right); btrfs_set_lock_blocking(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right, 0); + parent, pslot + 1, &right); if (wret) { ret = wret; goto enospc; @@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left, 0); + pslot - 1, &left); if (ret) wret = 1; else { @@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right, 0); + &right); if (ret) wret = 1; else { @@ -1492,7 +1474,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root u8 lowest_level = 0; u64 blocknr; u64 gen; - struct btrfs_key prealloc_block; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1501,8 +1482,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (ins_len < 0) lowest_unlock = 2; - prealloc_block.objectid = 0; - again: if (p->skip_locking) b = btrfs_root_node(root); @@ -1529,44 +1508,11 @@ again: !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { goto cow_done; } - - /* ok, we have to cow, is our old prealloc the right - * size? - */ - if (prealloc_block.objectid && - prealloc_block.offset != b->len) { - btrfs_release_path(root, p); - btrfs_free_reserved_extent(root, - prealloc_block.objectid, - prealloc_block.offset); - prealloc_block.objectid = 0; - goto again; - } - - /* - * for higher level blocks, try not to allocate blocks - * with the block and the parent locks held. - */ - if (level > 0 && !prealloc_block.objectid) { - u32 size = b->len; - u64 hint = b->start; - - btrfs_release_path(root, p); - ret = btrfs_reserve_extent(trans, root, - size, size, 0, - hint, (u64)-1, - &prealloc_block, 0); - BUG_ON(ret); - goto again; - } - btrfs_set_path_blocking(p); wret = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], - &b, prealloc_block.objectid); - prealloc_block.objectid = 0; + p->slots[level + 1], &b); if (wret) { free_extent_buffer(b); ret = wret; @@ -1742,12 +1688,8 @@ done: * we don't really know what they plan on doing with the path * from here on, so for now just mark it as blocking */ - btrfs_set_path_blocking(p); - if (prealloc_block.objectid) { - btrfs_free_reserved_extent(root, - prealloc_block.objectid, - prealloc_block.offset); - } + if (!p->leave_spinning) + btrfs_set_path_blocking(p); return ret; } @@ -1768,7 +1710,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, int ret; eb = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb); BUG_ON(ret); btrfs_set_lock_blocking(eb); @@ -1826,7 +1768,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, } ret = btrfs_cow_block(trans, root, eb, parent, slot, - &eb, 0); + &eb); BUG_ON(ret); if (root->root_key.objectid == @@ -2139,7 +2081,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, spin_unlock(&root->node_lock); ret = btrfs_update_extent_ref(trans, root, lower->start, - lower->start, c->start, + lower->len, lower->start, c->start, root->root_key.objectid, trans->transid, level - 1); BUG_ON(ret); @@ -2221,7 +2163,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; - } else { + } else if (!trans->transaction->delayed_refs.flushing) { ret = push_nodes_for_insert(trans, root, path, level); c = path->nodes[level]; if (!ret && btrfs_header_nritems(c) < @@ -2329,66 +2271,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, return ret; } -/* - * push some data in the path leaf to the right, trying to free up at - * least data_size bytes. returns zero if the push worked, nonzero otherwise - * - * returns 1 if the push failed because the other node didn't have enough - * room, 0 if everything worked out and < 0 if there were major errors. - */ -static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size, int empty, + struct extent_buffer *right, + int free_space, u32 left_nritems) { struct extent_buffer *left = path->nodes[0]; - struct extent_buffer *right; - struct extent_buffer *upper; + struct extent_buffer *upper = path->nodes[1]; struct btrfs_disk_key disk_key; int slot; u32 i; - int free_space; int push_space = 0; int push_items = 0; struct btrfs_item *item; - u32 left_nritems; u32 nr; u32 right_nritems; u32 data_end; u32 this_item_size; int ret; - slot = path->slots[1]; - if (!path->nodes[1]) - return 1; - - upper = path->nodes[1]; - if (slot >= btrfs_header_nritems(upper) - 1) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - right = read_node_slot(root, upper, slot + 1); - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right, 0); - if (ret) - goto out_unlock; - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - left_nritems = btrfs_header_nritems(left); - if (left_nritems == 0) - goto out_unlock; - if (empty) nr = 0; else @@ -2397,6 +2300,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (path->slots[0] >= left_nritems) push_space += data_size; + slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { item = btrfs_item_nr(left, i); @@ -2528,24 +2432,82 @@ out_unlock: } /* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. + */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + int slot; + int free_space; + u32 left_nritems; + int ret; + + if (!path->nodes[1]) + return 1; + + slot = path->slots[1]; + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + return __push_leaf_right(trans, root, path, data_size, empty, + right, free_space, left_nritems); +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise */ -static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int data_size, + int empty, struct extent_buffer *left, + int free_space, int right_nritems) { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; - struct extent_buffer *left; int slot; int i; - int free_space; int push_space = 0; int push_items = 0; struct btrfs_item *item; u32 old_left_nritems; - u32 right_nritems; u32 nr; int ret = 0; int wret; @@ -2553,41 +2515,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root u32 old_left_item_size; slot = path->slots[1]; - if (slot == 0) - return 1; - if (!path->nodes[1]) - return 1; - - right_nritems = btrfs_header_nritems(right); - if (right_nritems == 0) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - left = read_node_slot(root, path->nodes[1], slot - 1); - btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left, 0); - if (ret) { - /* we hit -ENOSPC, but it isn't fatal here */ - ret = 1; - goto out; - } - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } if (empty) nr = right_nritems; @@ -2755,6 +2682,154 @@ out: } /* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int free_space; + u32 right_nritems; + int ret = 0; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + left = read_node_slot(root, path->nodes[1], slot - 1); + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + return __push_leaf_left(trans, root, path, data_size, + empty, left, free_space, right_nritems); +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) +{ + int data_copy_size; + int rt_data_off; + int i; + int ret = 0; + int wret; + struct btrfs_disk_key disk_key; + + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff; + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_set_header_nritems(l, mid); + ret = 0; + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); + + ret = btrfs_update_ref(trans, root, l, right, 0, nritems); + BUG_ON(ret); + + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + + return ret; +} + +/* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. * @@ -2771,17 +2846,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int mid; int slot; struct extent_buffer *right; - int data_copy_size; - int rt_data_off; - int i; int ret = 0; int wret; int double_split; int num_doubles = 0; - struct btrfs_disk_key disk_key; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && + !trans->transaction->delayed_refs.flushing) { wret = push_leaf_right(trans, root, path, data_size, 0); if (wret < 0) return wret; @@ -2830,11 +2902,14 @@ again: write_extent_buffer(right, root->fs_info->chunk_tree_uuid, (unsigned long)btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); + if (mid <= slot) { if (nritems == 1 || leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { if (slot >= nritems) { + struct btrfs_disk_key disk_key; + btrfs_cpu_key_to_disk(&disk_key, ins_key); btrfs_set_header_nritems(right, 0); wret = insert_ptr(trans, root, path, @@ -2862,6 +2937,8 @@ again: if (leaf_space_used(l, 0, mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { if (!extend && data_size && slot == 0) { + struct btrfs_disk_key disk_key; + btrfs_cpu_key_to_disk(&disk_key, ins_key); btrfs_set_header_nritems(right, 0); wret = insert_ptr(trans, root, path, @@ -2894,76 +2971,16 @@ again: } } } - nritems = nritems - mid; - btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); - - copy_extent_buffer(right, l, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(mid), - nritems * sizeof(struct btrfs_item)); - - copy_extent_buffer(right, l, - btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - - data_copy_size, btrfs_leaf_data(l) + - leaf_data_end(root, l), data_copy_size); - - rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_end_nr(l, mid); - - for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(right, i); - u32 ioff; - - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - - ioff = btrfs_item_offset(right, item); - btrfs_set_item_offset(right, item, ioff + rt_data_off); - } - - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } - - btrfs_set_header_nritems(l, mid); - ret = 0; - btrfs_item_key(right, &disk_key, 0); - wret = insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; - - btrfs_mark_buffer_dirty(right); - btrfs_mark_buffer_dirty(l); - BUG_ON(path->slots[0] != slot); - ret = btrfs_update_ref(trans, root, l, right, 0, nritems); + ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); BUG_ON(ret); - if (mid <= slot) { - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] -= mid; - path->slots[1] += 1; - } else { - btrfs_tree_unlock(right); - free_extent_buffer(right); - } - - BUG_ON(path->slots[0] < 0); - if (double_split) { BUG_ON(num_doubles != 0); num_doubles++; goto again; } + return ret; } @@ -3021,26 +3038,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, return -EAGAIN; } + btrfs_set_path_blocking(path); ret = split_leaf(trans, root, &orig_key, path, sizeof(struct btrfs_item), 1); path->keep_locks = 0; BUG_ON(ret); + btrfs_unlock_up_safe(path, 1); + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + +split: /* * make sure any changes to the path from split_leaf leave it * in a blocking state */ btrfs_set_path_blocking(path); - leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); - -split: item = btrfs_item_nr(leaf, path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); - buf = kmalloc(item_size, GFP_NOFS); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, path->slots[0]), item_size); @@ -3445,39 +3463,27 @@ out: } /* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. + * this is a helper for btrfs_insert_empty_items, the main goal here is + * to save stack depth by doing the bulk of the work in a function + * that doesn't call btrfs_search_slot */ -int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) +static noinline_for_stack int +setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) { - struct extent_buffer *leaf; struct btrfs_item *item; - int ret = 0; - int slot; - int slot_orig; int i; u32 nritems; - u32 total_size = 0; - u32 total_data = 0; unsigned int data_end; struct btrfs_disk_key disk_key; + int ret; + struct extent_buffer *leaf; + int slot; - for (i = 0; i < nr; i++) - total_data += data_size[i]; - - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) - return -EEXIST; - if (ret < 0) - goto out; - - slot_orig = path->slots[0]; leaf = path->nodes[0]; + slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); @@ -3489,9 +3495,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, BUG(); } - slot = path->slots[0]; - BUG_ON(slot < 0); - if (slot != nritems) { unsigned int old_data = btrfs_item_end_nr(leaf, slot); @@ -3547,21 +3550,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, data_end -= data_size[i]; btrfs_set_item_size(leaf, item, data_size[i]); } + btrfs_set_header_nritems(leaf, nritems + nr); - btrfs_mark_buffer_dirty(leaf); ret = 0; if (slot == 0) { + struct btrfs_disk_key disk_key; btrfs_cpu_key_to_disk(&disk_key, cpu_key); ret = fixup_low_keys(trans, root, path, &disk_key, 1); } + btrfs_unlock_up_safe(path, 1); + btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(root, leaf) < 0) { btrfs_print_leaf(root, leaf); BUG(); } + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + int ret = 0; + int slot; + int i; + u32 total_size = 0; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + BUG_ON(slot < 0); + + ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, + total_data, total_size, nr); + out: - btrfs_unlock_up_safe(path, 1); return ret; } @@ -3749,7 +3791,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && + !trans->transaction->delayed_refs.flushing) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below @@ -3757,6 +3800,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, slot = path->slots[1]; extent_buffer_get(leaf); + btrfs_set_path_blocking(path); wret = push_leaf_left(trans, root, path, 1, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e1d4e30e9d..9417713542a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -45,6 +45,13 @@ struct btrfs_ordered_sum; #define BTRFS_MAX_LEVEL 8 +/* + * files bigger than this get some pre-flushing when they are added + * to the ordered operations list. That way we limit the total + * work done by the commit + */ +#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) + /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -401,15 +408,16 @@ struct btrfs_path { int locks[BTRFS_MAX_LEVEL]; int reada; /* keep some upper locks as we walk down */ - int keep_locks; - int skip_locking; int lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ - int search_for_split; + unsigned int search_for_split:1; + unsigned int keep_locks:1; + unsigned int skip_locking:1; + unsigned int leave_spinning:1; }; /* @@ -688,15 +696,18 @@ struct btrfs_fs_info { struct rb_root block_group_cache_tree; struct extent_io_tree pinned_extents; - struct extent_io_tree pending_del; - struct extent_io_tree extent_ins; /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; u64 generation; u64 last_trans_committed; - u64 last_trans_new_blockgroup; + + /* + * this is updated to the current trans every time a full commit + * is required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; u64 open_ioctl_trans; unsigned long mount_opt; u64 max_extent; @@ -717,12 +728,21 @@ struct btrfs_fs_info { struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; - struct mutex extent_ins_mutex; struct mutex pinned_mutex; struct mutex chunk_mutex; struct mutex drop_mutex; struct mutex volume_mutex; struct mutex tree_reloc_mutex; + + /* + * this protects the ordered operations list only while we are + * processing all of the entries on it. This way we make + * sure the commit code doesn't find the list temporarily empty + * because another function happens to be doing non-waiting preflush + * before jumping into the main commit. + */ + struct mutex ordered_operations_mutex; + struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -737,10 +757,29 @@ struct btrfs_fs_info { * ordered extents */ spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ struct list_head ordered_extents; + + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ struct list_head delalloc_inodes; /* + * special rename and truncate targets that must be on disk before + * we're allowed to commit. This is basically the ext3 style + * data=ordered list. + */ + struct list_head ordered_operations; + + /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming after reads. This is because readers * can run with FS locks held, and the writers may be waiting for @@ -781,6 +820,11 @@ struct btrfs_fs_info { atomic_t throttle_gen; u64 total_pinned; + + /* protected by the delalloc lock, used to keep from writing + * metadata until there is a nice batch + */ + u64 dirty_metadata_bytes; struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; @@ -1704,18 +1748,15 @@ static inline struct dentry *fdentry(struct file *file) } /* extent-tree.c */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs); int btrfs_update_pinned_extents(struct btrfs_root *root, u64 bytenr, u64 num, int pin); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 bytenr); -int btrfs_extent_post_op(struct btrfs_trans_handle *trans, - struct btrfs_root *root); int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, @@ -1777,7 +1818,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 ref_generation, u64 owner_objectid); int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 orig_parent, u64 parent, u64 root_objectid, u64 ref_generation, u64 owner_objectid); @@ -1838,7 +1879,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, u64 prealloc_dest); + struct extent_buffer **cow_ret); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2060,7 +2101,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c new file mode 100644 index 00000000000..cbf7dc8ae3e --- /dev/null +++ b/fs/btrfs/delayed-ref.c @@ -0,0 +1,669 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/sort.h> +#include <linux/ftrace.h> +#include "ctree.h" +#include "delayed-ref.h" +#include "transaction.h" + +/* + * delayed back reference update tracking. For subvolume trees + * we queue up extent allocations and backref maintenance for + * delayed processing. This avoids deep call chains where we + * add extents in the middle of btrfs_search_slot, and it allows + * us to buffer up frequently modified backrefs in an rb tree instead + * of hammering updates on the extent allocation tree. + * + * Right now this code is only used for reference counted trees, but + * the long term goal is to get rid of the similar code for delayed + * extent tree modifications. + */ + +/* + * entries in the rb tree are ordered by the byte number of the extent + * and by the byte number of the parent block. + */ +static int comp_entry(struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 parent) +{ + if (bytenr < ref->bytenr) + return -1; + if (bytenr > ref->bytenr) + return 1; + if (parent < ref->parent) + return -1; + if (parent > ref->parent) + return 1; + return 0; +} + +/* + * insert a new ref into the rbtree. This returns any existing refs + * for the same (bytenr,parent) tuple, or NULL if the new node was properly + * inserted. + */ +static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, + u64 bytenr, u64 parent, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_node *entry; + int cmp; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + rb_node); + + cmp = comp_entry(entry, bytenr, parent); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else + return entry; + } + + entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * find an entry based on (bytenr,parent). This returns the delayed + * ref if it was able to find one, or NULL if nothing was in that spot + */ +static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, + u64 bytenr, u64 parent, + struct btrfs_delayed_ref_node **last) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int cmp; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + if (last) + *last = entry; + + cmp = comp_entry(entry, bytenr, parent); + if (cmp < 0) + n = n->rb_left; + else if (cmp > 0) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + assert_spin_locked(&delayed_refs->lock); + if (mutex_trylock(&head->mutex)) + return 0; + + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + mutex_lock(&head->mutex); + spin_lock(&delayed_refs->lock); + if (!head->node.in_tree) { + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + btrfs_put_delayed_ref(&head->node); + return 0; +} + +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 start) +{ + int count = 0; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; + + delayed_refs = &trans->transaction->delayed_refs; + if (start == 0) { + node = rb_first(&delayed_refs->root); + } else { + ref = NULL; + tree_search(&delayed_refs->root, start, (u64)-1, &ref); + if (ref) { + struct btrfs_delayed_ref_node *tmp; + + node = rb_prev(&ref->rb_node); + while (node) { + tmp = rb_entry(node, + struct btrfs_delayed_ref_node, + rb_node); + if (tmp->bytenr < start) + break; + ref = tmp; + node = rb_prev(&ref->rb_node); + } + node = &ref->rb_node; + } else + node = rb_first(&delayed_refs->root); + } +again: + while (node && count < 32) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + if (btrfs_delayed_ref_is_head(ref)) { + head = btrfs_delayed_node_to_head(ref); + if (list_empty(&head->cluster)) { + list_add_tail(&head->cluster, cluster); + delayed_refs->run_delayed_start = + head->node.bytenr; + count++; + + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + } else if (count) { + /* the goal of the clustering is to find extents + * that are likely to end up in the same extent + * leaf on disk. So, we don't want them spread + * all over the tree. Stop now if we've hit + * a head that was already in use + */ + break; + } + } + node = rb_next(node); + } + if (count) { + return 0; + } else if (start) { + /* + * we've gone to the end of the rbtree without finding any + * clusters. start from the beginning and try again + */ + start = 0; + node = rb_first(&delayed_refs->root); + goto again; + } + return 1; +} + +/* + * This checks to see if there are any delayed refs in the + * btree for a given bytenr. It returns one if it finds any + * and zero otherwise. + * + * If it only finds a head node, it returns 0. + * + * The idea is to use this when deciding if you can safely delete an + * extent from the extent allocation tree. There may be a pending + * ref in the rbtree that adds or removes references, so as long as this + * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent + * allocation tree. + */ +int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *prev_node; + int ret = 0; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); + if (ref) { + prev_node = rb_prev(&ref->rb_node); + if (!prev_node) + goto out; + ref = rb_entry(prev_node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr == bytenr) + ret = 1; + } +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + +/* + * helper function to lookup reference count + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. This way you + * can check to see what the reference count would be if all of the + * delayed refs are processed. + */ +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u32 num_refs; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + delayed_refs = &trans->transaction->delayed_refs; +again: + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret == 0) { + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + } else { + num_refs = 0; + ret = 0; + } + + spin_lock(&delayed_refs->lock); + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); + if (ref) { + head = btrfs_delayed_node_to_head(ref); + if (mutex_trylock(&head->mutex)) { + num_refs += ref->ref_mod; + mutex_unlock(&head->mutex); + *refs = num_refs; + goto out; + } + + atomic_inc(&ref->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + goto again; + } else { + *refs = num_refs; + } +out: + spin_unlock(&delayed_refs->lock); + btrfs_free_path(path); + return ret; +} + +/* + * helper function to update an extent delayed ref in the + * rbtree. existing and update must both have the same + * bytenr and parent + * + * This may free existing if the update cancels out whatever + * operation it was doing. + */ +static noinline void +update_existing_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + struct btrfs_delayed_ref *existing_ref; + struct btrfs_delayed_ref *ref; + + existing_ref = btrfs_delayed_node_to_ref(existing); + ref = btrfs_delayed_node_to_ref(update); + + if (ref->pin) + existing_ref->pin = 1; + + if (ref->action != existing_ref->action) { + /* + * this is effectively undoing either an add or a + * drop. We decrement the ref_mod, and if it goes + * down to zero we just delete the entry without + * every changing the extent allocation tree. + */ + existing->ref_mod--; + if (existing->ref_mod == 0) { + rb_erase(&existing->rb_node, + &delayed_refs->root); + existing->in_tree = 0; + btrfs_put_delayed_ref(existing); + delayed_refs->num_entries--; + if (trans->delayed_ref_updates) + trans->delayed_ref_updates--; + } + } else { + if (existing_ref->action == BTRFS_ADD_DELAYED_REF) { + /* if we're adding refs, make sure all the + * details match up. The extent could + * have been totally freed and reallocated + * by a different owner before the delayed + * ref entries were removed. + */ + existing_ref->owner_objectid = ref->owner_objectid; + existing_ref->generation = ref->generation; + existing_ref->root = ref->root; + existing->num_bytes = update->num_bytes; + } + /* + * the action on the existing ref matches + * the action on the ref we're trying to add. + * Bump the ref_mod by one so the backref that + * is eventually added/removed has the correct + * reference count + */ + existing->ref_mod += update->ref_mod; + } +} + +/* + * helper function to update the accounting in the head ref + * existing and update must have the same bytenr + */ +static noinline void +update_existing_head_ref(struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + struct btrfs_delayed_ref_head *existing_ref; + struct btrfs_delayed_ref_head *ref; + + existing_ref = btrfs_delayed_node_to_head(existing); + ref = btrfs_delayed_node_to_head(update); + + if (ref->must_insert_reserved) { + /* if the extent was freed and then + * reallocated before the delayed ref + * entries were processed, we can end up + * with an existing head ref without + * the must_insert_reserved flag set. + * Set it again here + */ + existing_ref->must_insert_reserved = ref->must_insert_reserved; + + /* + * update the num_bytes so we make sure the accounting + * is done correctly + */ + existing->num_bytes = update->num_bytes; + + } + + /* + * update the reference mod on the head to reflect this new operation + */ + existing->ref_mod += update->ref_mod; +} + +/* + * helper function to actually insert a delayed ref into the rbtree. + * this does all the dirty work in terms of maintaining the correct + * overall modification count in the head node and properly dealing + * with updating existing nodes as new modifications are queued. + */ +static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, + u64 ref_generation, u64 owner_objectid, int action, + int pin) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_ref *full_ref; + struct btrfs_delayed_ref_head *head_ref = NULL; + struct btrfs_delayed_ref_root *delayed_refs; + int count_mod = 1; + int must_insert_reserved = 0; + + /* + * the head node stores the sum of all the mods, so dropping a ref + * should drop the sum in the head node by one. + */ + if (parent == (u64)-1) { + if (action == BTRFS_DROP_DELAYED_REF) + count_mod = -1; + else if (action == BTRFS_UPDATE_DELAYED_HEAD) + count_mod = 0; + } + + /* + * BTRFS_ADD_DELAYED_EXTENT means that we need to update + * the reserved accounting when the extent is finally added, or + * if a later modification deletes the delayed ref without ever + * inserting the extent into the extent allocation tree. + * ref->must_insert_reserved is the flag used to record + * that accounting mods are required. + * + * Once we record must_insert_reserved, switch the action to + * BTRFS_ADD_DELAYED_REF because other special casing is not required. + */ + if (action == BTRFS_ADD_DELAYED_EXTENT) { + must_insert_reserved = 1; + action = BTRFS_ADD_DELAYED_REF; + } else { + must_insert_reserved = 0; + } + + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->parent = parent; + ref->ref_mod = count_mod; + ref->in_tree = 1; + ref->num_bytes = num_bytes; + + if (btrfs_delayed_ref_is_head(ref)) { + head_ref = btrfs_delayed_node_to_head(ref); + head_ref->must_insert_reserved = must_insert_reserved; + INIT_LIST_HEAD(&head_ref->cluster); + mutex_init(&head_ref->mutex); + } else { + full_ref = btrfs_delayed_node_to_ref(ref); + full_ref->root = ref_root; + full_ref->generation = ref_generation; + full_ref->owner_objectid = owner_objectid; + full_ref->pin = pin; + full_ref->action = action; + } + + existing = tree_insert(&delayed_refs->root, bytenr, + parent, &ref->rb_node); + + if (existing) { + if (btrfs_delayed_ref_is_head(ref)) + update_existing_head_ref(existing, ref); + else + update_existing_ref(trans, delayed_refs, existing, ref); + + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + if (btrfs_delayed_ref_is_head(ref)) { + delayed_refs->num_heads++; + delayed_refs->num_heads_ready++; + } + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * add a delayed ref to the tree. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. + */ +int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, + u64 ref_generation, u64 owner_objectid, int action, + int pin) +{ + struct btrfs_delayed_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + /* + * the parent = 0 case comes from cases where we don't actually + * know the parent yet. It will get updated later via a add/drop + * pair. + */ + if (parent == 0) + parent = bytenr; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, + (u64)-1, 0, 0, 0, action, pin); + BUG_ON(ret); + + ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, ref_generation, + owner_objectid, action, pin); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * this does a simple search for the head node for a given extent. + * It must be called with the delayed ref spinlock held, and it returns + * the head node if any where found, or NULL if not. + */ +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); + if (ref) + return btrfs_delayed_node_to_head(ref); + return NULL; +} + +/* + * add a delayed ref to the tree. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. + * + * The main point of this call is to add and remove a backreference in a single + * shot, taking the lock only once, and only searching for the head node once. + * + * It is the same as doing a ref add and delete in two separate calls. + */ +int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 orig_parent, + u64 parent, u64 orig_ref_root, u64 ref_root, + u64 orig_ref_generation, u64 ref_generation, + u64 owner_objectid, int pin) +{ + struct btrfs_delayed_ref *ref; + struct btrfs_delayed_ref *old_ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS); + if (!old_ref) { + kfree(ref); + return -ENOMEM; + } + + /* + * the parent = 0 case comes from cases where we don't actually + * know the parent yet. It will get updated later via a add/drop + * pair. + */ + if (parent == 0) + parent = bytenr; + if (orig_parent == 0) + orig_parent = bytenr; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + kfree(old_ref); + return -ENOMEM; + } + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, + (u64)-1, 0, 0, 0, + BTRFS_UPDATE_DELAYED_HEAD, 0); + BUG_ON(ret); + + ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, ref_generation, + owner_objectid, BTRFS_ADD_DELAYED_REF, 0); + BUG_ON(ret); + + ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes, + orig_parent, orig_ref_root, + orig_ref_generation, owner_objectid, + BTRFS_DROP_DELAYED_REF, pin); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h new file mode 100644 index 00000000000..3bec2ff0b15 --- /dev/null +++ b/fs/btrfs/delayed-ref.h @@ -0,0 +1,193 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __DELAYED_REF__ +#define __DELAYED_REF__ + +/* these are the possible values of struct btrfs_delayed_ref->action */ +#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ +#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ +#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ +#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ + +struct btrfs_delayed_ref_node { + struct rb_node rb_node; + + /* the starting bytenr of the extent */ + u64 bytenr; + + /* the parent our backref will point to */ + u64 parent; + + /* the size of the extent */ + u64 num_bytes; + + /* ref count on this data structure */ + atomic_t refs; + + /* + * how many refs is this entry adding or deleting. For + * head refs, this may be a negative number because it is keeping + * track of the total mods done to the reference count. + * For individual refs, this will always be a positive number + * + * It may be more than one, since it is possible for a single + * parent to have more than one ref on an extent + */ + int ref_mod; + + /* is this node still in the rbtree? */ + unsigned int in_tree:1; +}; + +/* + * the head refs are used to hold a lock on a given extent, which allows us + * to make sure that only one process is running the delayed refs + * at a time for a single extent. They also store the sum of all the + * reference count modifications we've queued up. + */ +struct btrfs_delayed_ref_head { + struct btrfs_delayed_ref_node node; + + /* + * the mutex is held while running the refs, and it is also + * held when checking the sum of reference modifications. + */ + struct mutex mutex; + + struct list_head cluster; + + /* + * when a new extent is allocated, it is just reserved in memory + * The actual extent isn't inserted into the extent allocation tree + * until the delayed ref is processed. must_insert_reserved is + * used to flag a delayed ref so the accounting can be updated + * when a full insert is done. + * + * It is possible the extent will be freed before it is ever + * inserted into the extent allocation tree. In this case + * we need to update the in ram accounting to properly reflect + * the free has happened. + */ + unsigned int must_insert_reserved:1; +}; + +struct btrfs_delayed_ref { + struct btrfs_delayed_ref_node node; + + /* the root objectid our ref will point to */ + u64 root; + + /* the generation for the backref */ + u64 generation; + + /* owner_objectid of the backref */ + u64 owner_objectid; + + /* operation done by this entry in the rbtree */ + u8 action; + + /* if pin == 1, when the extent is freed it will be pinned until + * transaction commit + */ + unsigned int pin:1; +}; + +struct btrfs_delayed_ref_root { + struct rb_root root; + + /* this spin lock protects the rbtree and the entries inside */ + spinlock_t lock; + + /* how many delayed ref updates we've queued, used by the + * throttling code + */ + unsigned long num_entries; + + /* total number of head nodes in tree */ + unsigned long num_heads; + + /* total number of head nodes ready for processing */ + unsigned long num_heads_ready; + + /* + * set when the tree is flushing before a transaction commit, + * used by the throttling code to decide if new updates need + * to be run right away + */ + int flushing; + + u64 run_delayed_start; +}; + +static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + WARN_ON(atomic_read(&ref->refs) == 0); + if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(ref->in_tree); + kfree(ref); + } +} + +int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, + u64 ref_generation, u64 owner_objectid, int action, + int pin); + +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); +int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs); +int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 orig_parent, + u64 parent, u64 orig_ref_root, u64 ref_root, + u64 orig_ref_generation, u64 ref_generation, + u64 owner_objectid, int pin); +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head); +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 search_start); +/* + * a node might live in a head or a regular ref, this lets you + * test for the proper type to use. + */ +static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) +{ + return node->parent == (u64)-1; +} + +/* + * helper functions to cast a node into its container + */ +static inline struct btrfs_delayed_ref * +btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_ref, node); + +} + +static inline struct btrfs_delayed_ref_head * +btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(!btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_ref_head, node); + +} +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 926a0b287a7..1d70236ba00 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root key.objectid = dir; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); key.offset = btrfs_name_hash(name, name_len); + path = btrfs_alloc_path(); + path->leave_spinning = 1; + data_size = sizeof(*dir_item) + name_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6ec80c0fc86..92d73929d38 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -668,14 +668,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct extent_buffer *eb; + int was_dirty; + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (!(current->flags & PF_MEMALLOC)) { + return extent_write_full_page(tree, page, + btree_get_extent, wbc); + } - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; + redirty_page_for_writepage(wbc, page); + eb = btrfs_find_tree_block(root, page_offset(page), + PAGE_CACHE_SIZE); + WARN_ON(!eb); + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; + spin_unlock(&root->fs_info->delalloc_lock); } - return extent_write_full_page(tree, page, btree_get_extent, wbc); + free_extent_buffer(eb); + + unlock_page(page); + return 0; } static int btree_writepages(struct address_space *mapping, @@ -684,15 +701,15 @@ static int btree_writepages(struct address_space *mapping, struct extent_io_tree *tree; tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_root *root = BTRFS_I(mapping->host)->root; u64 num_dirty; - u64 start = 0; unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; - num_dirty = count_range_bits(tree, &start, (u64)-1, - thresh, EXTENT_DIRTY); + /* this is a bit racy, but that's ok */ + num_dirty = root->fs_info->dirty_metadata_bytes; if (num_dirty < thresh) return 0; } @@ -859,9 +876,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, root->fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); - /* ugh, clear_extent_buffer_dirty can be expensive */ - btrfs_set_lock_blocking(buf); + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= buf->len) + root->fs_info->dirty_metadata_bytes -= buf->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } @@ -1471,12 +1496,6 @@ static int transaction_kthread(void *arg) vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); - if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) { - printk(KERN_INFO "btrfs: total reference cache " - "size %llu\n", - root->fs_info->total_ref_cache_size); - } - mutex_lock(&root->fs_info->trans_mutex); cur = root->fs_info->running_transaction; if (!cur) { @@ -1493,6 +1512,7 @@ static int transaction_kthread(void *arg) mutex_unlock(&root->fs_info->trans_mutex); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); + sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); @@ -1552,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->ordered_operations); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); @@ -1611,10 +1632,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, extent_io_tree_init(&fs_info->pinned_extents, fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&fs_info->pending_del, - fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&fs_info->extent_ins, - fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->do_barriers = 1; INIT_LIST_HEAD(&fs_info->dead_reloc_roots); @@ -1627,9 +1644,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, insert_inode_hash(fs_info->btree_inode); mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->drop_mutex); - mutex_init(&fs_info->extent_ins_mutex); mutex_init(&fs_info->pinned_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); @@ -2358,8 +2375,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; - - btrfs_set_lock_blocking(buf); + int was_dirty; btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) { @@ -2370,7 +2386,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)root->fs_info->generation); WARN_ON(1); } - set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); + was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += buf->len; + spin_unlock(&root->fs_info->delalloc_lock); + } } void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) @@ -2410,6 +2432,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) int btree_lock_page_hook(struct page *page) { struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; unsigned long len; @@ -2425,6 +2448,16 @@ int btree_lock_page_hook(struct page *page) btrfs_tree_lock(eb); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= eb->len) + root->fs_info->dirty_metadata_bytes -= eb->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + btrfs_tree_unlock(eb); free_extent_buffer(eb); out: diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 95029db227b..c958ecbc191 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root, void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int wait_on_tree_block_writeback(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fefe83ad205..f5e7cae63d8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -49,17 +49,23 @@ struct pending_extent_op { int del; }; -static int finish_current_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all); -static int del_pending_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all); -static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int is_data); +static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins, + int ref_mod); +static int update_reserved_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int reserve); static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, int mark_free); +static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, + int ref_to_drop); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, @@ -554,262 +560,13 @@ out: return ret; } -/* - * updates all the backrefs that are pending on update_list for the - * extent_root - */ -static noinline int update_backrefs(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_path *path, - struct list_head *update_list) -{ - struct btrfs_key key; - struct btrfs_extent_ref *ref; - struct btrfs_fs_info *info = extent_root->fs_info; - struct pending_extent_op *op; - struct extent_buffer *leaf; - int ret = 0; - struct list_head *cur = update_list->next; - u64 ref_objectid; - u64 ref_root = extent_root->root_key.objectid; - - op = list_entry(cur, struct pending_extent_op, list); - -search: - key.objectid = op->bytenr; - key.type = BTRFS_EXTENT_REF_KEY; - key.offset = op->orig_parent; - - ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); - BUG_ON(ret); - - leaf = path->nodes[0]; - -loop: - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - - ref_objectid = btrfs_ref_objectid(leaf, ref); - - if (btrfs_ref_root(leaf, ref) != ref_root || - btrfs_ref_generation(leaf, ref) != op->orig_generation || - (ref_objectid != op->level && - ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { - printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, " - "root %llu, owner %u\n", - (unsigned long long)op->bytenr, - (unsigned long long)op->orig_parent, - (unsigned long long)ref_root, op->level); - btrfs_print_leaf(extent_root, leaf); - BUG(); - } - - key.objectid = op->bytenr; - key.offset = op->parent; - key.type = BTRFS_EXTENT_REF_KEY; - ret = btrfs_set_item_key_safe(trans, extent_root, path, &key); - BUG_ON(ret); - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - btrfs_set_ref_generation(leaf, ref, op->generation); - - cur = cur->next; - - list_del_init(&op->list); - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, GFP_NOFS); - kfree(op); - - if (cur == update_list) { - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(extent_root, path); - goto out; - } - - op = list_entry(cur, struct pending_extent_op, list); - - path->slots[0]++; - while (path->slots[0] < btrfs_header_nritems(leaf)) { - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid == op->bytenr && - key.type == BTRFS_EXTENT_REF_KEY) - goto loop; - path->slots[0]++; - } - - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(extent_root, path); - goto search; - -out: - return 0; -} - -static noinline int insert_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_path *path, - struct list_head *insert_list, int nr) -{ - struct btrfs_key *keys; - u32 *data_size; - struct pending_extent_op *op; - struct extent_buffer *leaf; - struct list_head *cur = insert_list->next; - struct btrfs_fs_info *info = extent_root->fs_info; - u64 ref_root = extent_root->root_key.objectid; - int i = 0, last = 0, ret; - int total = nr * 2; - - if (!nr) - return 0; - - keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS); - if (!keys) - return -ENOMEM; - - data_size = kzalloc(total * sizeof(u32), GFP_NOFS); - if (!data_size) { - kfree(keys); - return -ENOMEM; - } - - list_for_each_entry(op, insert_list, list) { - keys[i].objectid = op->bytenr; - keys[i].offset = op->num_bytes; - keys[i].type = BTRFS_EXTENT_ITEM_KEY; - data_size[i] = sizeof(struct btrfs_extent_item); - i++; - - keys[i].objectid = op->bytenr; - keys[i].offset = op->parent; - keys[i].type = BTRFS_EXTENT_REF_KEY; - data_size[i] = sizeof(struct btrfs_extent_ref); - i++; - } - - op = list_entry(cur, struct pending_extent_op, list); - i = 0; - while (i < total) { - int c; - ret = btrfs_insert_some_items(trans, extent_root, path, - keys+i, data_size+i, total-i); - BUG_ON(ret < 0); - - if (last && ret > 1) - BUG(); - - leaf = path->nodes[0]; - for (c = 0; c < ret; c++) { - int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY; - - /* - * if the first item we inserted was a backref, then - * the EXTENT_ITEM will be the odd c's, else it will - * be the even c's - */ - if ((ref_first && (c % 2)) || - (!ref_first && !(c % 2))) { - struct btrfs_extent_item *itm; - - itm = btrfs_item_ptr(leaf, path->slots[0] + c, - struct btrfs_extent_item); - btrfs_set_extent_refs(path->nodes[0], itm, 1); - op->del++; - } else { - struct btrfs_extent_ref *ref; - - ref = btrfs_item_ptr(leaf, path->slots[0] + c, - struct btrfs_extent_ref); - btrfs_set_ref_root(leaf, ref, ref_root); - btrfs_set_ref_generation(leaf, ref, - op->generation); - btrfs_set_ref_objectid(leaf, ref, op->level); - btrfs_set_ref_num_refs(leaf, ref, 1); - op->del++; - } - - /* - * using del to see when its ok to free up the - * pending_extent_op. In the case where we insert the - * last item on the list in order to help do batching - * we need to not free the extent op until we actually - * insert the extent_item - */ - if (op->del == 2) { - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, - GFP_NOFS); - cur = cur->next; - list_del_init(&op->list); - kfree(op); - if (cur != insert_list) - op = list_entry(cur, - struct pending_extent_op, - list); - } - } - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(extent_root, path); - - /* - * Ok backref's and items usually go right next to eachother, - * but if we could only insert 1 item that means that we - * inserted on the end of a leaf, and we have no idea what may - * be on the next leaf so we just play it safe. In order to - * try and help this case we insert the last thing on our - * insert list so hopefully it will end up being the last - * thing on the leaf and everything else will be before it, - * which will let us insert a whole bunch of items at the same - * time. - */ - if (ret == 1 && !last && (i + ret < total)) { - /* - * last: where we will pick up the next time around - * i: our current key to insert, will be total - 1 - * cur: the current op we are screwing with - * op: duh - */ - last = i + ret; - i = total - 1; - cur = insert_list->prev; - op = list_entry(cur, struct pending_extent_op, list); - } else if (last) { - /* - * ok we successfully inserted the last item on the - * list, lets reset everything - * - * i: our current key to insert, so where we left off - * last time - * last: done with this - * cur: the op we are messing with - * op: duh - * total: since we inserted the last key, we need to - * decrement total so we dont overflow - */ - i = last; - last = 0; - total--; - if (i < total) { - cur = insert_list->next; - op = list_entry(cur, struct pending_extent_op, - list); - } - } else { - i += ret; - } - - cond_resched(); - } - ret = 0; - kfree(keys); - kfree(data_size); - return ret; -} - static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 bytenr, u64 parent, u64 ref_root, u64 ref_generation, - u64 owner_objectid) + u64 owner_objectid, + int refs_to_add) { struct btrfs_key key; struct extent_buffer *leaf; @@ -829,9 +586,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, btrfs_set_ref_root(leaf, ref, ref_root); btrfs_set_ref_generation(leaf, ref, ref_generation); btrfs_set_ref_objectid(leaf, ref, owner_objectid); - btrfs_set_ref_num_refs(leaf, ref, 1); + btrfs_set_ref_num_refs(leaf, ref, refs_to_add); } else if (ret == -EEXIST) { u64 existing_owner; + BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], @@ -845,7 +603,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, num_refs = btrfs_ref_num_refs(leaf, ref); BUG_ON(num_refs == 0); - btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); + btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); existing_owner = btrfs_ref_objectid(leaf, ref); if (existing_owner != owner_objectid && @@ -857,6 +615,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, } else { goto out; } + btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(path->nodes[0]); out: btrfs_release_path(root, path); @@ -865,7 +624,8 @@ out: static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path) + struct btrfs_path *path, + int refs_to_drop) { struct extent_buffer *leaf; struct btrfs_extent_ref *ref; @@ -875,8 +635,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); num_refs = btrfs_ref_num_refs(leaf, ref); - BUG_ON(num_refs == 0); - num_refs -= 1; + BUG_ON(num_refs < refs_to_drop); + num_refs -= refs_to_drop; if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); } else { @@ -927,332 +687,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, #endif } -static noinline int free_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct list_head *del_list) -{ - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_path *path; - struct btrfs_key key, found_key; - struct extent_buffer *leaf; - struct list_head *cur; - struct pending_extent_op *op; - struct btrfs_extent_item *ei; - int ret, num_to_del, extent_slot = 0, found_extent = 0; - u32 refs; - u64 bytes_freed = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 1; - -search: - /* search for the backref for the current ref we want to delete */ - cur = del_list->next; - op = list_entry(cur, struct pending_extent_op, list); - ret = lookup_extent_backref(trans, extent_root, path, op->bytenr, - op->orig_parent, - extent_root->root_key.objectid, - op->orig_generation, op->level, 1); - if (ret) { - printk(KERN_ERR "btrfs unable to find backref byte nr %llu " - "root %llu gen %llu owner %u\n", - (unsigned long long)op->bytenr, - (unsigned long long)extent_root->root_key.objectid, - (unsigned long long)op->orig_generation, op->level); - btrfs_print_leaf(extent_root, path->nodes[0]); - WARN_ON(1); - goto out; - } - - extent_slot = path->slots[0]; - num_to_del = 1; - found_extent = 0; - - /* - * if we aren't the first item on the leaf we can move back one and see - * if our ref is right next to our extent item - */ - if (likely(extent_slot)) { - extent_slot--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - extent_slot); - if (found_key.objectid == op->bytenr && - found_key.type == BTRFS_EXTENT_ITEM_KEY && - found_key.offset == op->num_bytes) { - num_to_del++; - found_extent = 1; - } - } - - /* - * if we didn't find the extent we need to delete the backref and then - * search for the extent item key so we can update its ref count - */ - if (!found_extent) { - key.objectid = op->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = op->num_bytes; - - ret = remove_extent_backref(trans, extent_root, path); - BUG_ON(ret); - btrfs_release_path(extent_root, path); - ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); - BUG_ON(ret); - extent_slot = path->slots[0]; - } - - /* this is where we update the ref count for the extent */ - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - BUG_ON(refs == 0); - refs--; - btrfs_set_extent_refs(leaf, ei, refs); - - btrfs_mark_buffer_dirty(leaf); - - /* - * This extent needs deleting. The reason cur_slot is extent_slot + - * num_to_del is because extent_slot points to the slot where the extent - * is, and if the backref was not right next to the extent we will be - * deleting at least 1 item, and will want to start searching at the - * slot directly next to extent_slot. However if we did find the - * backref next to the extent item them we will be deleting at least 2 - * items and will want to start searching directly after the ref slot - */ - if (!refs) { - struct list_head *pos, *n, *end; - int cur_slot = extent_slot+num_to_del; - u64 super_used; - u64 root_used; - - path->slots[0] = extent_slot; - bytes_freed = op->num_bytes; - - mutex_lock(&info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, op->bytenr, - op->num_bytes, op->level >= - BTRFS_FIRST_FREE_OBJECTID); - mutex_unlock(&info->pinned_mutex); - BUG_ON(ret < 0); - op->del = ret; - - /* - * we need to see if we can delete multiple things at once, so - * start looping through the list of extents we are wanting to - * delete and see if their extent/backref's are right next to - * eachother and the extents only have 1 ref - */ - for (pos = cur->next; pos != del_list; pos = pos->next) { - struct pending_extent_op *tmp; - - tmp = list_entry(pos, struct pending_extent_op, list); - - /* we only want to delete extent+ref at this stage */ - if (cur_slot >= btrfs_header_nritems(leaf) - 1) - break; - - btrfs_item_key_to_cpu(leaf, &found_key, cur_slot); - if (found_key.objectid != tmp->bytenr || - found_key.type != BTRFS_EXTENT_ITEM_KEY || - found_key.offset != tmp->num_bytes) - break; - - /* check to make sure this extent only has one ref */ - ei = btrfs_item_ptr(leaf, cur_slot, - struct btrfs_extent_item); - if (btrfs_extent_refs(leaf, ei) != 1) - break; - - btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1); - if (found_key.objectid != tmp->bytenr || - found_key.type != BTRFS_EXTENT_REF_KEY || - found_key.offset != tmp->orig_parent) - break; - - /* - * the ref is right next to the extent, we can set the - * ref count to 0 since we will delete them both now - */ - btrfs_set_extent_refs(leaf, ei, 0); - - /* pin down the bytes for this extent */ - mutex_lock(&info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, tmp->bytenr, - tmp->num_bytes, tmp->level >= - BTRFS_FIRST_FREE_OBJECTID); - mutex_unlock(&info->pinned_mutex); - BUG_ON(ret < 0); - - /* - * use the del field to tell if we need to go ahead and - * free up the extent when we delete the item or not. - */ - tmp->del = ret; - bytes_freed += tmp->num_bytes; - - num_to_del += 2; - cur_slot += 2; - } - end = pos; - - /* update the free space counters */ - spin_lock(&info->delalloc_lock); - super_used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, - super_used - bytes_freed); - - root_used = btrfs_root_used(&extent_root->root_item); - btrfs_set_root_used(&extent_root->root_item, - root_used - bytes_freed); - spin_unlock(&info->delalloc_lock); - - /* delete the items */ - ret = btrfs_del_items(trans, extent_root, path, - path->slots[0], num_to_del); - BUG_ON(ret); - - /* - * loop through the extents we deleted and do the cleanup work - * on them - */ - for (pos = cur, n = pos->next; pos != end; - pos = n, n = pos->next) { - struct pending_extent_op *tmp; - tmp = list_entry(pos, struct pending_extent_op, list); - - /* - * remember tmp->del tells us wether or not we pinned - * down the extent - */ - ret = update_block_group(trans, extent_root, - tmp->bytenr, tmp->num_bytes, 0, - tmp->del); - BUG_ON(ret); - - list_del_init(&tmp->list); - unlock_extent(&info->extent_ins, tmp->bytenr, - tmp->bytenr + tmp->num_bytes - 1, - GFP_NOFS); - kfree(tmp); - } - } else if (refs && found_extent) { - /* - * the ref and extent were right next to eachother, but the - * extent still has a ref, so just free the backref and keep - * going - */ - ret = remove_extent_backref(trans, extent_root, path); - BUG_ON(ret); - - list_del_init(&op->list); - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, GFP_NOFS); - kfree(op); - } else { - /* - * the extent has multiple refs and the backref we were looking - * for was not right next to it, so just unlock and go next, - * we're good to go - */ - list_del_init(&op->list); - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, GFP_NOFS); - kfree(op); - } - - btrfs_release_path(extent_root, path); - if (!list_empty(del_list)) - goto search; - -out: - btrfs_free_path(path); - return ret; -} - static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_root, u64 ref_root, u64 orig_generation, u64 ref_generation, u64 owner_objectid) { int ret; - struct btrfs_root *extent_root = root->fs_info->extent_root; - struct btrfs_path *path; - - if (root == root->fs_info->extent_root) { - struct pending_extent_op *extent_op; - u64 num_bytes; - - BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL); - num_bytes = btrfs_level_size(root, (int)owner_objectid); - mutex_lock(&root->fs_info->extent_ins_mutex); - if (test_range_bit(&root->fs_info->extent_ins, bytenr, - bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { - u64 priv; - ret = get_state_private(&root->fs_info->extent_ins, - bytenr, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *) - (unsigned long)priv; - BUG_ON(extent_op->parent != orig_parent); - BUG_ON(extent_op->generation != orig_generation); + int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; - extent_op->parent = parent; - extent_op->generation = ref_generation; - } else { - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - - extent_op->type = PENDING_BACKREF_UPDATE; - extent_op->bytenr = bytenr; - extent_op->num_bytes = num_bytes; - extent_op->parent = parent; - extent_op->orig_parent = orig_parent; - extent_op->generation = ref_generation; - extent_op->orig_generation = orig_generation; - extent_op->level = (int)owner_objectid; - INIT_LIST_HEAD(&extent_op->list); - extent_op->del = 0; - - set_extent_bits(&root->fs_info->extent_ins, - bytenr, bytenr + num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - set_state_private(&root->fs_info->extent_ins, - bytenr, (unsigned long)extent_op); - } - mutex_unlock(&root->fs_info->extent_ins_mutex); - return 0; - } - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = lookup_extent_backref(trans, extent_root, path, - bytenr, orig_parent, orig_root, - orig_generation, owner_objectid, 1); - if (ret) - goto out; - ret = remove_extent_backref(trans, extent_root, path); - if (ret) - goto out; - ret = insert_extent_backref(trans, extent_root, path, bytenr, - parent, ref_root, ref_generation, - owner_objectid); + ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, + orig_parent, parent, orig_root, + ref_root, orig_generation, + ref_generation, owner_objectid, pin); BUG_ON(ret); - finish_current_insert(trans, extent_root, 0); - del_pending_extents(trans, extent_root, 0); -out: - btrfs_free_path(path); return ret; } int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 orig_parent, u64 parent, + u64 num_bytes, u64 orig_parent, u64 parent, u64 ref_root, u64 ref_generation, u64 owner_objectid) { @@ -1260,20 +716,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, if (ref_root == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) return 0; - ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, - parent, ref_root, ref_root, - ref_generation, ref_generation, - owner_objectid); + + ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, parent, ref_root, + ref_root, ref_generation, + ref_generation, owner_objectid); return ret; } - static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_root, u64 ref_root, u64 orig_generation, u64 ref_generation, u64 owner_objectid) { + int ret; + + ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root, + ref_generation, owner_objectid, + BTRFS_ADD_DELAYED_REF, 0); + BUG_ON(ret); + return ret; +} + +static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, + u64 ref_generation, u64 owner_objectid, + int refs_to_add) +{ struct btrfs_path *path; int ret; struct btrfs_key key; @@ -1286,17 +758,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = 1; + path->leave_spinning = 1; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)-1; + key.offset = num_bytes; - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, - 0, 1); - if (ret < 0) + /* first find the extent item and update its reference count */ + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, + path, 0, 1); + if (ret < 0) { + btrfs_set_path_blocking(path); return ret; - BUG_ON(ret == 0 || path->slots[0] == 0); + } - path->slots[0]--; + if (ret > 0) { + WARN_ON(1); + btrfs_free_path(path); + return -EIO; + } l = path->nodes[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); @@ -1310,21 +789,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(l, item); - btrfs_set_extent_refs(l, item, refs + 1); + btrfs_set_extent_refs(l, item, refs + refs_to_add); + btrfs_unlock_up_safe(path, 1); + btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(root->fs_info->extent_root, path); path->reada = 1; + path->leave_spinning = 1; + + /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, ref_root, ref_generation, - owner_objectid); + owner_objectid, refs_to_add); BUG_ON(ret); - finish_current_insert(trans, root->fs_info->extent_root, 0); - del_pending_extents(trans, root->fs_info->extent_root, 0); - btrfs_free_path(path); return 0; } @@ -1339,68 +821,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (ref_root == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) return 0; - ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, + + ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, 0, ref_root, 0, ref_generation, owner_objectid); return ret; } -int btrfs_extent_post_op(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int drop_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node) +{ + int ret = 0; + struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); + + BUG_ON(node->ref_mod == 0); + ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, + node->parent, ref->root, ref->generation, + ref->owner_objectid, ref->pin, node->ref_mod); + + return ret; +} + +/* helper function to actually process a single delayed ref entry */ +static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + int insert_reserved) { - u64 start; - u64 end; int ret; + struct btrfs_delayed_ref *ref; + + if (node->parent == (u64)-1) { + struct btrfs_delayed_ref_head *head; + /* + * we've hit the end of the chain and we were supposed + * to insert this extent into the tree. But, it got + * deleted before we ever needed to insert it, so all + * we have to do is clean up the accounting + */ + if (insert_reserved) { + update_reserved_extents(root, node->bytenr, + node->num_bytes, 0); + } + head = btrfs_delayed_node_to_head(node); + mutex_unlock(&head->mutex); + return 0; + } - while(1) { - finish_current_insert(trans, root->fs_info->extent_root, 1); - del_pending_extents(trans, root->fs_info->extent_root, 1); + ref = btrfs_delayed_node_to_ref(node); + if (ref->action == BTRFS_ADD_DELAYED_REF) { + if (insert_reserved) { + struct btrfs_key ins; - /* is there more work to do? */ - ret = find_first_extent_bit(&root->fs_info->pending_del, - 0, &start, &end, EXTENT_WRITEBACK); - if (!ret) - continue; - ret = find_first_extent_bit(&root->fs_info->extent_ins, - 0, &start, &end, EXTENT_WRITEBACK); - if (!ret) - continue; - break; + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + /* record the full extent allocation */ + ret = __btrfs_alloc_reserved_extent(trans, root, + node->parent, ref->root, + ref->generation, ref->owner_objectid, + &ins, node->ref_mod); + update_reserved_extents(root, node->bytenr, + node->num_bytes, 0); + } else { + /* just add one backref */ + ret = add_extent_ref(trans, root, node->bytenr, + node->num_bytes, + node->parent, ref->root, ref->generation, + ref->owner_objectid, node->ref_mod); + } + BUG_ON(ret); + } else if (ref->action == BTRFS_DROP_DELAYED_REF) { + WARN_ON(insert_reserved); + ret = drop_delayed_ref(trans, root, node); } return 0; } -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs) +static noinline struct btrfs_delayed_ref_node * +select_delayed_ref(struct btrfs_delayed_ref_head *head) { - struct btrfs_path *path; + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + int action = BTRFS_ADD_DELAYED_REF; +again: + /* + * select delayed ref of type BTRFS_ADD_DELAYED_REF first. + * this prevents ref count from going down to zero when + * there still are pending delayed ref. + */ + node = rb_prev(&head->node.rb_node); + while (1) { + if (!node) + break; + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr != head->node.bytenr) + break; + if (btrfs_delayed_node_to_ref(ref)->action == action) + return ref; + node = rb_prev(node); + } + if (action == BTRFS_ADD_DELAYED_REF) { + action = BTRFS_DROP_DELAYED_REF; + goto again; + } + return NULL; +} + +static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct list_head *cluster) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *locked_ref = NULL; int ret; - struct btrfs_key key; - struct extent_buffer *l; - struct btrfs_extent_item *item; + int count = 0; + int must_insert_reserved = 0; - WARN_ON(num_bytes < root->sectorsize); - path = btrfs_alloc_path(); - path->reada = 1; - key.objectid = bytenr; - key.offset = num_bytes; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, - 0, 0); - if (ret < 0) - goto out; - if (ret != 0) { - btrfs_print_leaf(root, path->nodes[0]); - printk(KERN_INFO "btrfs failed to find block number %llu\n", - (unsigned long long)bytenr); - BUG(); + delayed_refs = &trans->transaction->delayed_refs; + while (1) { + if (!locked_ref) { + /* pick a new head ref from the cluster list */ + if (list_empty(cluster)) + break; + + locked_ref = list_entry(cluster->next, + struct btrfs_delayed_ref_head, cluster); + + /* grab the lock that says we are going to process + * all the refs for this head */ + ret = btrfs_delayed_ref_lock(trans, locked_ref); + + /* + * we may have dropped the spin lock to get the head + * mutex lock, and that might have given someone else + * time to free the head. If that's true, it has been + * removed from our list and we can move on. + */ + if (ret == -EAGAIN) { + locked_ref = NULL; + count++; + continue; + } + } + + /* + * record the must insert reserved flag before we + * drop the spin lock. + */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; + + /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); + if (!ref) { + /* All delayed refs have been processed, Go ahead + * and send the head node to run_one_delayed_ref, + * so that any accounting fixes can happen + */ + ref = &locked_ref->node; + list_del_init(&locked_ref->cluster); + locked_ref = NULL; + } + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + spin_unlock(&delayed_refs->lock); + + ret = run_one_delayed_ref(trans, root, ref, + must_insert_reserved); + BUG_ON(ret); + btrfs_put_delayed_ref(ref); + + count++; + cond_resched(); + spin_lock(&delayed_refs->lock); + } + return count; +} + +/* + * this starts processing the delayed reference count updates and + * extent insertions we have queued up so far. count can be + * 0, which means to process everything in the tree at the start + * of the run (but not newly added entries), or it can be some target + * number you'd like to process. + */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct list_head cluster; + int ret; + int run_all = count == (unsigned long)-1; + int run_most = 0; + + if (root == root->fs_info->extent_root) + root = root->fs_info->tree_root; + + delayed_refs = &trans->transaction->delayed_refs; + INIT_LIST_HEAD(&cluster); +again: + spin_lock(&delayed_refs->lock); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + while (1) { + if (!(run_all || run_most) && + delayed_refs->num_heads_ready < 64) + break; + + /* + * go find something we can process in the rbtree. We start at + * the beginning of the tree, and then build a cluster + * of refs to process starting at the first one we are able to + * lock + */ + ret = btrfs_find_ref_cluster(trans, &cluster, + delayed_refs->run_delayed_start); + if (ret) + break; + + ret = run_clustered_refs(trans, root, &cluster); + BUG_ON(ret < 0); + + count -= min_t(unsigned long, ret, count); + + if (count == 0) + break; + } + + if (run_all) { + node = rb_first(&delayed_refs->root); + if (!node) + goto out; + count = (unsigned long)-1; + + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + atomic_inc(&ref->refs); + + spin_unlock(&delayed_refs->lock); + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + + btrfs_put_delayed_ref(ref); + cond_resched(); + goto again; + } + node = rb_next(node); + } + spin_unlock(&delayed_refs->lock); + schedule_timeout(1); + goto again; } - l = path->nodes[0]; - item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); - *refs = btrfs_extent_refs(l, item); out: - btrfs_free_path(path); + spin_unlock(&delayed_refs->lock); return 0; } @@ -1624,7 +1316,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, int refi = 0; int slot; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64, u64, u64, u64); ref_root = btrfs_header_owner(buf); ref_generation = btrfs_header_generation(buf); @@ -1696,12 +1388,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, if (level == 0) { btrfs_item_key_to_cpu(buf, &key, slot); + fi = btrfs_item_ptr(buf, slot, + struct btrfs_file_extent_item); + + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; ret = process_func(trans, root, bytenr, - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - key.objectid); + btrfs_file_extent_disk_num_bytes(buf, fi), + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); if (ret) { faili = slot; @@ -1709,7 +1408,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, goto fail; } } else { - ret = process_func(trans, root, bytenr, + ret = process_func(trans, root, bytenr, buf->len, orig_buf->start, buf->start, orig_root, ref_root, orig_generation, ref_generation, @@ -1786,17 +1485,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans, if (bytenr == 0) continue; ret = __btrfs_update_extent_ref(trans, root, bytenr, - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - key.objectid); + btrfs_file_extent_disk_num_bytes(buf, fi), + orig_buf->start, buf->start, + orig_root, ref_root, orig_generation, + ref_generation, key.objectid); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, slot); ret = __btrfs_update_extent_ref(trans, root, bytenr, - orig_buf->start, buf->start, - orig_root, ref_root, + buf->len, orig_buf->start, + buf->start, orig_root, ref_root, orig_generation, ref_generation, level - 1); if (ret) @@ -1815,7 +1514,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache) { int ret; - int pending_ret; struct btrfs_root *extent_root = root->fs_info->extent_root; unsigned long bi; struct extent_buffer *leaf; @@ -1831,12 +1529,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(extent_root, path); fail: - finish_current_insert(trans, extent_root, 0); - pending_ret = del_pending_extents(trans, extent_root, 0); if (ret) return ret; - if (pending_ret) - return pending_ret; return 0; } @@ -2361,6 +2055,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, clear_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); } + mutex_unlock(&root->fs_info->pinned_mutex); + while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); BUG_ON(!cache); @@ -2452,8 +2148,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, u64 end; int ret; - mutex_lock(&root->fs_info->pinned_mutex); while (1) { + mutex_lock(&root->fs_info->pinned_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); if (ret) @@ -2461,209 +2157,21 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = btrfs_discard_extent(root, start, end + 1 - start); + /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, start, end + 1 - start, 0); clear_extent_dirty(unpin, start, end, GFP_NOFS); - if (need_resched()) { - mutex_unlock(&root->fs_info->pinned_mutex); - cond_resched(); - mutex_lock(&root->fs_info->pinned_mutex); - } + cond_resched(); } mutex_unlock(&root->fs_info->pinned_mutex); return ret; } -static int finish_current_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all) -{ - u64 start; - u64 end; - u64 priv; - u64 search = 0; - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_path *path; - struct pending_extent_op *extent_op, *tmp; - struct list_head insert_list, update_list; - int ret; - int num_inserts = 0, max_inserts, restart = 0; - - path = btrfs_alloc_path(); - INIT_LIST_HEAD(&insert_list); - INIT_LIST_HEAD(&update_list); - - max_inserts = extent_root->leafsize / - (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) + - sizeof(struct btrfs_extent_ref) + - sizeof(struct btrfs_extent_item)); -again: - mutex_lock(&info->extent_ins_mutex); - while (1) { - ret = find_first_extent_bit(&info->extent_ins, search, &start, - &end, EXTENT_WRITEBACK); - if (ret) { - if (restart && !num_inserts && - list_empty(&update_list)) { - restart = 0; - search = 0; - continue; - } - break; - } - - ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); - if (!ret) { - if (all) - restart = 1; - search = end + 1; - if (need_resched()) { - mutex_unlock(&info->extent_ins_mutex); - cond_resched(); - mutex_lock(&info->extent_ins_mutex); - } - continue; - } - - ret = get_state_private(&info->extent_ins, start, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *)(unsigned long) priv; - - if (extent_op->type == PENDING_EXTENT_INSERT) { - num_inserts++; - list_add_tail(&extent_op->list, &insert_list); - search = end + 1; - if (num_inserts == max_inserts) { - restart = 1; - break; - } - } else if (extent_op->type == PENDING_BACKREF_UPDATE) { - list_add_tail(&extent_op->list, &update_list); - search = end + 1; - } else { - BUG(); - } - } - - /* - * process the update list, clear the writeback bit for it, and if - * somebody marked this thing for deletion then just unlock it and be - * done, the free_extents will handle it - */ - list_for_each_entry_safe(extent_op, tmp, &update_list, list) { - clear_extent_bits(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - if (extent_op->del) { - list_del_init(&extent_op->list); - unlock_extent(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - - 1, GFP_NOFS); - kfree(extent_op); - } - } - mutex_unlock(&info->extent_ins_mutex); - - /* - * still have things left on the update list, go ahead an update - * everything - */ - if (!list_empty(&update_list)) { - ret = update_backrefs(trans, extent_root, path, &update_list); - BUG_ON(ret); - - /* we may have COW'ed new blocks, so lets start over */ - if (all) - restart = 1; - } - - /* - * if no inserts need to be done, but we skipped some extents and we - * need to make sure everything is cleaned then reset everything and - * go back to the beginning - */ - if (!num_inserts && restart) { - search = 0; - restart = 0; - INIT_LIST_HEAD(&update_list); - INIT_LIST_HEAD(&insert_list); - goto again; - } else if (!num_inserts) { - goto out; - } - - /* - * process the insert extents list. Again if we are deleting this - * extent, then just unlock it, pin down the bytes if need be, and be - * done with it. Saves us from having to actually insert the extent - * into the tree and then subsequently come along and delete it - */ - mutex_lock(&info->extent_ins_mutex); - list_for_each_entry_safe(extent_op, tmp, &insert_list, list) { - clear_extent_bits(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - if (extent_op->del) { - u64 used; - list_del_init(&extent_op->list); - unlock_extent(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - - 1, GFP_NOFS); - - mutex_lock(&extent_root->fs_info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, - extent_op->bytenr, - extent_op->num_bytes, 0); - mutex_unlock(&extent_root->fs_info->pinned_mutex); - - spin_lock(&info->delalloc_lock); - used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, - used - extent_op->num_bytes); - used = btrfs_root_used(&extent_root->root_item); - btrfs_set_root_used(&extent_root->root_item, - used - extent_op->num_bytes); - spin_unlock(&info->delalloc_lock); - - ret = update_block_group(trans, extent_root, - extent_op->bytenr, - extent_op->num_bytes, - 0, ret > 0); - BUG_ON(ret); - kfree(extent_op); - num_inserts--; - } - } - mutex_unlock(&info->extent_ins_mutex); - - ret = insert_extents(trans, extent_root, path, &insert_list, - num_inserts); - BUG_ON(ret); - - /* - * if restart is set for whatever reason we need to go back and start - * searching through the pending list again. - * - * We just inserted some extents, which could have resulted in new - * blocks being allocated, which would result in new blocks needing - * updates, so if all is set we _must_ restart to get the updated - * blocks. - */ - if (restart || all) { - INIT_LIST_HEAD(&insert_list); - INIT_LIST_HEAD(&update_list); - search = 0; - restart = 0; - num_inserts = 0; - goto again; - } -out: - btrfs_free_path(path); - return 0; -} - static int pin_down_bytes(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int is_data) + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, int is_data, + struct extent_buffer **must_clean) { int err = 0; struct extent_buffer *buf; @@ -2686,17 +2194,19 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, u64 header_transid = btrfs_header_generation(buf); if (header_owner != BTRFS_TREE_LOG_OBJECTID && header_owner != BTRFS_TREE_RELOC_OBJECTID && + header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && header_transid == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - clean_tree_block(NULL, root, buf); - btrfs_tree_unlock(buf); - free_extent_buffer(buf); + *must_clean = buf; return 1; } btrfs_tree_unlock(buf); } free_extent_buffer(buf); pinit: + btrfs_set_path_blocking(path); + mutex_lock(&root->fs_info->pinned_mutex); + /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); BUG_ON(err < 0); @@ -2710,7 +2220,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, int mark_free) + u64 owner_objectid, int pin, int mark_free, + int refs_to_drop) { struct btrfs_path *path; struct btrfs_key key; @@ -2732,6 +2243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = 1; + path->leave_spinning = 1; ret = lookup_extent_backref(trans, extent_root, path, bytenr, parent, root_objectid, ref_generation, owner_objectid, 1); @@ -2753,9 +2265,11 @@ static int __free_extent(struct btrfs_trans_handle *trans, break; } if (!found_extent) { - ret = remove_extent_backref(trans, extent_root, path); + ret = remove_extent_backref(trans, extent_root, path, + refs_to_drop); BUG_ON(ret); btrfs_release_path(extent_root, path); + path->leave_spinning = 1; ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { @@ -2771,8 +2285,9 @@ static int __free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "root %llu gen %llu owner %llu\n", + "parent %llu root %llu gen %llu owner %llu\n", (unsigned long long)bytenr, + (unsigned long long)parent, (unsigned long long)root_objectid, (unsigned long long)ref_generation, (unsigned long long)owner_objectid); @@ -2782,17 +2297,23 @@ static int __free_extent(struct btrfs_trans_handle *trans, ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); - BUG_ON(refs == 0); - refs -= 1; - btrfs_set_extent_refs(leaf, ei, refs); + /* + * we're not allowed to delete the extent item if there + * are other delayed ref updates pending + */ + + BUG_ON(refs < refs_to_drop); + refs -= refs_to_drop; + btrfs_set_extent_refs(leaf, ei, refs); btrfs_mark_buffer_dirty(leaf); - if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { + if (refs == 0 && found_extent && + path->slots[0] == extent_slot + 1) { struct btrfs_extent_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); + BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); /* if the back ref and the extent are next to each other * they get deleted below in one shot */ @@ -2800,11 +2321,13 @@ static int __free_extent(struct btrfs_trans_handle *trans, num_to_del = 2; } else if (found_extent) { /* otherwise delete the extent back ref */ - ret = remove_extent_backref(trans, extent_root, path); + ret = remove_extent_backref(trans, extent_root, path, + refs_to_drop); BUG_ON(ret); /* if refs are 0, we need to setup the path for deletion */ if (refs == 0) { btrfs_release_path(extent_root, path); + path->leave_spinning = 1; ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); BUG_ON(ret); @@ -2814,16 +2337,18 @@ static int __free_extent(struct btrfs_trans_handle *trans, if (refs == 0) { u64 super_used; u64 root_used; + struct extent_buffer *must_clean = NULL; if (pin) { - mutex_lock(&root->fs_info->pinned_mutex); - ret = pin_down_bytes(trans, root, bytenr, num_bytes, - owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); - mutex_unlock(&root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, root, path, + bytenr, num_bytes, + owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, + &must_clean); if (ret > 0) mark_free = 1; BUG_ON(ret < 0); } + /* block accounting for super block */ spin_lock(&info->delalloc_lock); super_used = btrfs_super_bytes_used(&info->super_copy); @@ -2835,14 +2360,34 @@ static int __free_extent(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, root_used - num_bytes); spin_unlock(&info->delalloc_lock); + + /* + * it is going to be very rare for someone to be waiting + * on the block we're freeing. del_items might need to + * schedule, so rather than get fancy, just force it + * to blocking here + */ + if (must_clean) + btrfs_set_lock_blocking(must_clean); + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); BUG_ON(ret); btrfs_release_path(extent_root, path); + if (must_clean) { + clean_tree_block(NULL, root, must_clean); + btrfs_tree_unlock(must_clean); + free_extent_buffer(must_clean); + } + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); + } else { + invalidate_mapping_pages(info->btree_inode->i_mapping, + bytenr >> PAGE_CACHE_SHIFT, + (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } ret = update_block_group(trans, root, bytenr, num_bytes, 0, @@ -2850,218 +2395,103 @@ static int __free_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } btrfs_free_path(path); - finish_current_insert(trans, extent_root, 0); return ret; } /* - * find all the blocks marked as pending in the radix tree and remove - * them from the extent map + * remove an extent from the root, returns 0 on success */ -static int del_pending_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all) +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, + int refs_to_drop) { - int ret; - int err = 0; - u64 start; - u64 end; - u64 priv; - u64 search = 0; - int nr = 0, skipped = 0; - struct extent_io_tree *pending_del; - struct extent_io_tree *extent_ins; - struct pending_extent_op *extent_op; - struct btrfs_fs_info *info = extent_root->fs_info; - struct list_head delete_list; - - INIT_LIST_HEAD(&delete_list); - extent_ins = &extent_root->fs_info->extent_ins; - pending_del = &extent_root->fs_info->pending_del; - -again: - mutex_lock(&info->extent_ins_mutex); - while (1) { - ret = find_first_extent_bit(pending_del, search, &start, &end, - EXTENT_WRITEBACK); - if (ret) { - if (all && skipped && !nr) { - search = 0; - skipped = 0; - continue; - } - mutex_unlock(&info->extent_ins_mutex); - break; - } - - ret = try_lock_extent(extent_ins, start, end, GFP_NOFS); - if (!ret) { - search = end+1; - skipped = 1; - - if (need_resched()) { - mutex_unlock(&info->extent_ins_mutex); - cond_resched(); - mutex_lock(&info->extent_ins_mutex); - } - - continue; - } - BUG_ON(ret < 0); - - ret = get_state_private(pending_del, start, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *)(unsigned long)priv; - - clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK, - GFP_NOFS); - if (!test_range_bit(extent_ins, start, end, - EXTENT_WRITEBACK, 0)) { - list_add_tail(&extent_op->list, &delete_list); - nr++; - } else { - kfree(extent_op); - - ret = get_state_private(&info->extent_ins, start, - &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *) - (unsigned long)priv; - - clear_extent_bits(&info->extent_ins, start, end, - EXTENT_WRITEBACK, GFP_NOFS); - - if (extent_op->type == PENDING_BACKREF_UPDATE) { - list_add_tail(&extent_op->list, &delete_list); - search = end + 1; - nr++; - continue; - } - - mutex_lock(&extent_root->fs_info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, start, - end + 1 - start, 0); - mutex_unlock(&extent_root->fs_info->pinned_mutex); - - ret = update_block_group(trans, extent_root, start, - end + 1 - start, 0, ret > 0); - - unlock_extent(extent_ins, start, end, GFP_NOFS); - BUG_ON(ret); - kfree(extent_op); - } - if (ret) - err = ret; - - search = end + 1; - - if (need_resched()) { - mutex_unlock(&info->extent_ins_mutex); - cond_resched(); - mutex_lock(&info->extent_ins_mutex); - } - } + WARN_ON(num_bytes < root->sectorsize); - if (nr) { - ret = free_extents(trans, extent_root, &delete_list); - BUG_ON(ret); - } + /* + * if metadata always pin + * if data pin when any transaction has committed this + */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID || + ref_generation != trans->transid) + pin = 1; - if (all && skipped) { - INIT_LIST_HEAD(&delete_list); - search = 0; - nr = 0; - goto again; - } + if (ref_generation != trans->transid) + pin = 1; - if (!err) - finish_current_insert(trans, extent_root, 0); - return err; + return __free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin, pin == 0, refs_to_drop); } /* - * remove an extent from the root, returns 0 on success + * when we free an extent, it is possible (and likely) that we free the last + * delayed ref for that extent as well. This searches the delayed ref tree for + * a given extent, and if there are no other delayed refs to be processed, it + * removes it from the tree. */ -static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin) +static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr) { - struct btrfs_root *extent_root = root->fs_info->extent_root; - int pending_ret; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct rb_node *node; int ret; - WARN_ON(num_bytes < root->sectorsize); - if (root == extent_root) { - struct pending_extent_op *extent_op = NULL; - - mutex_lock(&root->fs_info->extent_ins_mutex); - if (test_range_bit(&root->fs_info->extent_ins, bytenr, - bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { - u64 priv; - ret = get_state_private(&root->fs_info->extent_ins, - bytenr, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *) - (unsigned long)priv; + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; - extent_op->del = 1; - if (extent_op->type == PENDING_EXTENT_INSERT) { - mutex_unlock(&root->fs_info->extent_ins_mutex); - return 0; - } - } + node = rb_prev(&head->node.rb_node); + if (!node) + goto out; - if (extent_op) { - ref_generation = extent_op->orig_generation; - parent = extent_op->orig_parent; - } + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - - extent_op->type = PENDING_EXTENT_DELETE; - extent_op->bytenr = bytenr; - extent_op->num_bytes = num_bytes; - extent_op->parent = parent; - extent_op->orig_parent = parent; - extent_op->generation = ref_generation; - extent_op->orig_generation = ref_generation; - extent_op->level = (int)owner_objectid; - INIT_LIST_HEAD(&extent_op->list); - extent_op->del = 0; - - set_extent_bits(&root->fs_info->pending_del, - bytenr, bytenr + num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - set_state_private(&root->fs_info->pending_del, - bytenr, (unsigned long)extent_op); - mutex_unlock(&root->fs_info->extent_ins_mutex); - return 0; - } - /* if metadata always pin */ - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { - mutex_lock(&root->fs_info->pinned_mutex); - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); - mutex_unlock(&root->fs_info->pinned_mutex); - update_reserved_extents(root, bytenr, num_bytes, 0); - return 0; - } - pin = 1; - } + /* there are still entries for this ref, we can't drop it */ + if (ref->bytenr == bytenr) + goto out; - /* if data pin when any transaction has committed this */ - if (ref_generation != trans->transid) - pin = 1; + /* + * waiting for the lock here would deadlock. If someone else has it + * locked they are already in the process of dropping it anyway + */ + if (!mutex_trylock(&head->mutex)) + goto out; - ret = __free_extent(trans, root, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, pin, pin == 0); + /* + * at this point we have a head with no other entries. Go + * ahead and process it. + */ + head->node.in_tree = 0; + rb_erase(&head->node.rb_node, &delayed_refs->root); - finish_current_insert(trans, root->fs_info->extent_root, 0); - pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); - return ret ? ret : pending_ret; + delayed_refs->num_entries--; + + /* + * we don't take a ref on the node because we're removing it from the + * tree, so we just steal the ref the tree was holding. + */ + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + + list_del_init(&head->cluster); + spin_unlock(&delayed_refs->lock); + + ret = run_one_delayed_ref(trans, root->fs_info->tree_root, + &head->node, head->must_insert_reserved); + BUG_ON(ret); + btrfs_put_delayed_ref(&head->node); + return 0; +out: + spin_unlock(&delayed_refs->lock); + return 0; } int btrfs_free_extent(struct btrfs_trans_handle *trans, @@ -3072,9 +2502,30 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, { int ret; - ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, pin); + /* + * tree log blocks never actually go into the extent allocation + * tree, just update pinning info and exit early. + * + * data extents referenced by the tree log do need to have + * their reference counts bumped. + */ + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + mutex_lock(&root->fs_info->pinned_mutex); + + /* unlocks the pinned mutex */ + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + update_reserved_extents(root, bytenr, num_bytes, 0); + ret = 0; + } else { + ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, + BTRFS_DROP_DELAYED_REF, 1); + BUG_ON(ret); + ret = check_ref_cleanup(trans, root, bytenr); + BUG_ON(ret); + } return ret; } @@ -3475,10 +2926,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins) + u64 owner, struct btrfs_key *ins, + int ref_mod) { int ret; - int pending_ret; u64 super_used; u64 root_used; u64 num_bytes = ins->offset; @@ -3503,33 +2954,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, root_used + num_bytes); spin_unlock(&info->delalloc_lock); - if (root == extent_root) { - struct pending_extent_op *extent_op; - - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - - extent_op->type = PENDING_EXTENT_INSERT; - extent_op->bytenr = ins->objectid; - extent_op->num_bytes = ins->offset; - extent_op->parent = parent; - extent_op->orig_parent = 0; - extent_op->generation = ref_generation; - extent_op->orig_generation = 0; - extent_op->level = (int)owner; - INIT_LIST_HEAD(&extent_op->list); - extent_op->del = 0; - - mutex_lock(&root->fs_info->extent_ins_mutex); - set_extent_bits(&root->fs_info->extent_ins, ins->objectid, - ins->objectid + ins->offset - 1, - EXTENT_WRITEBACK, GFP_NOFS); - set_state_private(&root->fs_info->extent_ins, - ins->objectid, (unsigned long)extent_op); - mutex_unlock(&root->fs_info->extent_ins_mutex); - goto update_block; - } - memcpy(&keys[0], ins, sizeof(*ins)); keys[1].objectid = ins->objectid; keys[1].type = BTRFS_EXTENT_REF_KEY; @@ -3540,37 +2964,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, extent_root, path, keys, sizes, 2); BUG_ON(ret); extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(path->nodes[0], extent_item, 1); + btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_extent_ref); btrfs_set_ref_root(path->nodes[0], ref, root_objectid); btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); btrfs_set_ref_objectid(path->nodes[0], ref, owner); - btrfs_set_ref_num_refs(path->nodes[0], ref, 1); + btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); btrfs_mark_buffer_dirty(path->nodes[0]); trans->alloc_exclude_start = 0; trans->alloc_exclude_nr = 0; btrfs_free_path(path); - finish_current_insert(trans, extent_root, 0); - pending_ret = del_pending_extents(trans, extent_root, 0); if (ret) goto out; - if (pending_ret) { - ret = pending_ret; - goto out; - } -update_block: ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0); if (ret) { @@ -3592,9 +3010,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, if (root_objectid == BTRFS_TREE_LOG_OBJECTID) return 0; - ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, - ref_generation, owner, ins); - update_reserved_extents(root, ins->objectid, ins->offset, 0); + + ret = btrfs_add_delayed_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + ref_generation, owner, + BTRFS_ADD_DELAYED_EXTENT, 0); + BUG_ON(ret); return ret; } @@ -3621,7 +3042,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); put_block_group(block_group); ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, - ref_generation, owner, ins); + ref_generation, owner, ins, 1); return ret; } @@ -3640,20 +3061,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data) { int ret; - ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, empty_size, hint_byte, search_end, ins, data); BUG_ON(ret); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = __btrfs_alloc_reserved_extent(trans, root, parent, - root_objectid, ref_generation, - owner_objectid, ins); + ret = btrfs_add_delayed_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + ref_generation, owner_objectid, + BTRFS_ADD_DELAYED_EXTENT, 0); BUG_ON(ret); - - } else { - update_reserved_extents(root, ins->objectid, ins->offset, 1); } + update_reserved_extents(root, ins->objectid, ins->offset, 1); return ret; } @@ -3789,7 +3208,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - ret = __btrfs_free_extent(trans, root, disk_bytenr, + ret = btrfs_free_extent(trans, root, disk_bytenr, btrfs_file_extent_disk_num_bytes(leaf, fi), leaf->start, leaf_owner, leaf_generation, key.objectid, 0); @@ -3829,7 +3248,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, */ for (i = 0; i < ref->nritems; i++) { info = ref->extents + sorted[i].slot; - ret = __btrfs_free_extent(trans, root, info->bytenr, + ret = btrfs_free_extent(trans, root, info->bytenr, info->num_bytes, ref->bytenr, ref->owner, ref->generation, info->objectid, 0); @@ -3846,12 +3265,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } -static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, +static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 start, u64 len, u32 *refs) { int ret; - ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); + ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); BUG_ON(ret); #if 0 /* some debugging code in case we see problems here */ @@ -3959,7 +3379,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, * we just decrement it below and don't update any * of the refs the leaf points to. */ - ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + ret = drop_snap_lookup_refcount(trans, root, bytenr, + blocksize, &refs); BUG_ON(ret); if (refs != 1) continue; @@ -4010,7 +3431,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, */ for (i = 0; i < refi; i++) { bytenr = sorted[i].bytenr; - ret = __btrfs_free_extent(trans, root, bytenr, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, eb->start, root_owner, root_gen, 0, 1); BUG_ON(ret); @@ -4053,7 +3474,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); - ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, + ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, path->nodes[*level]->len, &refs); BUG_ON(ret); if (refs > 1) @@ -4104,7 +3525,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); blocksize = btrfs_level_size(root, *level - 1); - ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + ret = drop_snap_lookup_refcount(trans, root, bytenr, + blocksize, &refs); BUG_ON(ret); /* @@ -4119,7 +3541,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, root_gen = btrfs_header_generation(parent); path->slots[*level]++; - ret = __btrfs_free_extent(trans, root, bytenr, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level - 1, 1); @@ -4165,7 +3587,7 @@ out: * cleanup and free the reference on the last node * we processed */ - ret = __btrfs_free_extent(trans, root, bytenr, blocksize, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level, 1); free_extent_buffer(path->nodes[*level]); @@ -4354,6 +3776,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_path *path; int i; int orig_level; + int update_count; struct btrfs_root_item *root_item = &root->root_item; WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); @@ -4395,6 +3818,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root } } while (1) { + unsigned long update; wret = walk_down_tree(trans, root, path, &level); if (wret > 0) break; @@ -4407,12 +3831,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root break; if (wret < 0) ret = wret; - if (trans->transaction->in_commit) { + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) { ret = -EAGAIN; break; } atomic_inc(&root->fs_info->throttle_gen); wake_up(&root->fs_info->transaction_throttle); + for (update_count = 0; update_count < 16; update_count++) { + update = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (update) + btrfs_run_delayed_refs(trans, root, update); + else + break; + } } for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { @@ -5457,6 +4890,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, root->root_key.objectid, trans->transid, key.objectid); BUG_ON(ret); + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, leaf->start, btrfs_header_owner(leaf), @@ -5768,9 +5202,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, ref_path, NULL, NULL); BUG_ON(ret); - if (root == root->fs_info->extent_root) - btrfs_extent_post_op(trans, root); - return 0; } @@ -6038,6 +5469,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; ret = btrfs_insert_empty_inode(trans, root, path, objectid); if (ret) goto out; @@ -6208,6 +5640,9 @@ again: btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); mutex_unlock(&root->fs_info->cleaner_mutex); + trans = btrfs_start_transaction(info->tree_root, 1); + btrfs_commit_transaction(trans, info->tree_root); + while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -6466,7 +5901,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_new_blockgroup = trans->transid; + root->fs_info->last_trans_log_full_commit = trans->transid; cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) @@ -6500,9 +5935,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, sizeof(cache->item)); BUG_ON(ret); - finish_current_insert(trans, extent_root, 0); - ret = del_pending_extents(trans, extent_root, 0); - BUG_ON(ret); set_avail_alloc_bits(extent_root->fs_info, type); return 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ebe6b29e606..08085af089e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3124,20 +3124,15 @@ void free_extent_buffer(struct extent_buffer *eb) int clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb) { - int set; unsigned long i; unsigned long num_pages; struct page *page; - u64 start = eb->start; - u64 end = start + eb->len - 1; - - set = clear_extent_dirty(tree, start, end, GFP_NOFS); num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!set && !PageDirty(page)) + if (!PageDirty(page)) continue; lock_page(page); @@ -3146,22 +3141,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, else set_page_private(page, EXTENT_PAGE_PRIVATE); - /* - * if we're on the last page or the first page and the - * block isn't aligned on a page boundary, do extra checks - * to make sure we don't clean page that is partially dirty - */ - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - start = (u64)page->index << PAGE_CACHE_SHIFT; - end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, - EXTENT_DIRTY, 0)) { - unlock_page(page); - continue; - } - } clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -3187,29 +3166,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, { unsigned long i; unsigned long num_pages; + int was_dirty = 0; + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - struct page *page = extent_buffer_page(eb, i); - /* writepage may need to do something special for the - * first page, we have to make sure page->private is - * properly set. releasepage may drop page->private - * on us if the page isn't already dirty. - */ - lock_page(page); - if (i == 0) { - set_page_extent_head(page, eb->len); - } else if (PagePrivate(page) && - page->private != EXTENT_PAGE_PRIVATE) { - set_page_extent_mapped(page); - } + for (i = 0; i < num_pages; i++) __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); - set_extent_dirty(tree, page_offset(page), - page_offset(page) + PAGE_CACHE_SIZE - 1, - GFP_NOFS); - unlock_page(page); - } - return 0; + return was_dirty; } int clear_extent_buffer_uptodate(struct extent_io_tree *tree, @@ -3789,6 +3752,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) ret = 0; goto out; } + if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + ret = 0; + goto out; + } /* at this point we can safely release the extent buffer */ num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1f9df88afbf..5bc20abf3f3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -25,6 +25,7 @@ /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 #define EXTENT_BUFFER_BLOCKING 1 +#define EXTENT_BUFFER_DIRTY 2 /* * page->private values. Every page that is controlled by the extent @@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); +int test_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); int set_extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb); int clear_extent_buffer_uptodate(struct extent_io_tree *tree, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 964652435fd..9b99886562d 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, file_key.offset = pos; btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) @@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; + path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { if (path->slots[0] == 0) @@ -757,8 +759,10 @@ insert: } else { ins_size = csum_size; } + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, ins_size); + path->leave_spinning = 0; if (ret < 0) goto fail_unlock; if (ret != 0) { @@ -776,7 +780,6 @@ found: item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + btrfs_item_size_nr(leaf, path->slots[0])); eb_token = NULL; - cond_resched(); next_sector: if (!eb_token || @@ -817,9 +820,9 @@ next_sector: eb_token = NULL; } btrfs_mark_buffer_dirty(path->nodes[0]); - cond_resched(); if (total_bytes < sums->len) { btrfs_release_path(root, path); + cond_resched(); goto again; } out: diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index dc78954861b..9c9fb46ccd0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -606,6 +606,7 @@ next_slot: btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); btrfs_release_path(root, path); + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*extent)); BUG_ON(ret); @@ -639,17 +640,22 @@ next_slot: ram_bytes); btrfs_set_file_extent_type(leaf, extent, found_type); + btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_set_lock_blocking(path->nodes[0]); if (disk_bytenr != 0) { ret = btrfs_update_extent_ref(trans, root, - disk_bytenr, orig_parent, + disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + orig_parent, leaf->start, root->root_key.objectid, trans->transid, ins.objectid); BUG_ON(ret); } + path->leave_spinning = 0; btrfs_release_path(root, path); if (disk_bytenr != 0) inode_add_bytes(inode, extent_end - end); @@ -912,7 +918,7 @@ again: btrfs_set_file_extent_other_encoding(leaf, fi, 0); if (orig_parent != leaf->start) { - ret = btrfs_update_extent_ref(trans, root, bytenr, + ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes, orig_parent, leaf->start, root->root_key.objectid, trans->transid, inode->i_ino); @@ -1155,6 +1161,20 @@ out_nolock: page_cache_release(pinned[1]); *ppos = pos; + /* + * we want to make sure fsync finds this change + * but we haven't joined a transaction running right now. + * + * Later on, someone is sure to update the inode and get the + * real transid recorded. + * + * We set last_trans now to the fs_info generation + 1, + * this will either be one more than the running transaction + * or the generation used for the next transaction if there isn't + * one running right now. + */ + BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + if (num_written > 0 && will_write) { struct btrfs_trans_handle *trans; @@ -1167,8 +1187,11 @@ out_nolock: ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); if (ret == 0) { - btrfs_sync_log(trans, root); - btrfs_end_transaction(trans, root); + ret = btrfs_sync_log(trans, root); + if (ret == 0) + btrfs_end_transaction(trans, root); + else + btrfs_commit_transaction(trans, root); } else { btrfs_commit_transaction(trans, root); } @@ -1185,6 +1208,18 @@ out_nolock: int btrfs_release_file(struct inode *inode, struct file *filp) { + /* + * ordered_data_close is set by settattr when we are about to truncate + * a file from a non-zero size to a zero size. This tries to + * flush down new bytes that may have been written if the + * application were using truncate to replace a file in place. + */ + if (BTRFS_I(inode)->ordered_data_close) { + BTRFS_I(inode)->ordered_data_close = 0; + btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); + if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(inode->i_mapping); + } if (filp->private_data) btrfs_ioctl_trans_end(filp); return 0; @@ -1260,8 +1295,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ret > 0) { ret = btrfs_commit_transaction(trans, root); } else { - btrfs_sync_log(trans, root); - ret = btrfs_end_transaction(trans, root); + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); } mutex_lock(&dentry->d_inode->i_mutex); out: diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 3d46fa1f29a..6b627c61180 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = -ENOENT; @@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22..06d8db5afb0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; btrfs_set_trans_block_group(trans, inode); key.objectid = inode->i_ino; @@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_CACHE_SIZE); - kaddr = kmap(cpage); + kaddr = kmap_atomic(cpage, KM_USER0); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap(cpage); + kunmap_atomic(kaddr, KM_USER0); i++; ptr += cur_size; @@ -204,7 +205,7 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static int cow_file_range_inline(struct btrfs_trans_handle *trans, +static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, size_t compressed_size, @@ -854,11 +855,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; int limit = 10 * 1024 * 1042; - if (!btrfs_test_opt(root, COMPRESS)) { - return cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); - } - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC, 1, 0, GFP_NOFS); while (start < end) { @@ -935,7 +931,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root, * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ -static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, +static noinline int run_delalloc_nocow(struct inode *inode, + struct page *locked_page, u64 start, u64 end, int *page_started, int force, unsigned long *nr_written) { @@ -1133,6 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, unsigned long *nr_written) { int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; if (btrfs_test_flag(inode, NODATACOW)) ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1140,10 +1138,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, else if (btrfs_test_flag(inode, PREALLOC)) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); + else if (!btrfs_test_opt(root, COMPRESS)) + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); else ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); - return ret; } @@ -1453,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, file_pos + num_bytes, file_pos, &hint); BUG_ON(ret); @@ -1475,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, fi, compression); btrfs_set_file_extent_encryption(leaf, fi, encryption); btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + + btrfs_unlock_up_safe(path, 1); + btrfs_set_lock_blocking(leaf); + btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, num_bytes); @@ -1487,11 +1492,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, root->root_key.objectid, trans->transid, inode->i_ino, &ins); BUG_ON(ret); - btrfs_free_path(path); + return 0; } +/* + * helper function for btrfs_finish_ordered_io, this + * just reads in some of the csum leaves to prime them into ram + * before we start the transaction. It limits the amount of btree + * reads required while inside the transaction. + */ +static noinline void reada_csum(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_ordered_extent *ordered_extent) +{ + struct btrfs_ordered_sum *sum; + u64 bytenr; + + sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum, + list); + bytenr = sum->sums[0].bytenr; + + /* + * we don't care about the results, the point of this search is + * just to get the btree leaves into ram + */ + btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0); +} + /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -1500,8 +1529,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - struct btrfs_ordered_extent *ordered_extent; + struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_path *path; int compressed = 0; int ret; @@ -1509,9 +1539,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) if (!ret) return 0; + /* + * before we join the transaction, try to do some of our IO. + * This will limit the amount of IO that we have to do with + * the transaction running. We're unlikely to need to do any + * IO if the file extents are new, the disk_i_size checks + * covers the most common case. + */ + if (start < BTRFS_I(inode)->disk_i_size) { + path = btrfs_alloc_path(); + if (path) { + ret = btrfs_lookup_file_extent(NULL, root, path, + inode->i_ino, + start, 0); + ordered_extent = btrfs_lookup_ordered_extent(inode, + start); + if (!list_empty(&ordered_extent->list)) { + btrfs_release_path(root, path); + reada_csum(root, path, ordered_extent); + } + btrfs_free_path(path); + } + } + trans = btrfs_join_transaction(root, 1); - ordered_extent = btrfs_lookup_ordered_extent(inode, start); + if (!ordered_extent) + ordered_extent = btrfs_lookup_ordered_extent(inode, start); BUG_ON(!ordered_extent); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) goto nocow; @@ -2101,6 +2155,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 1); if (ret) { @@ -2147,6 +2202,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } + path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, name, name_len, -1); if (IS_ERR(di)) { @@ -2190,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir->i_ino); BUG_ON(ret != 0 && ret != -ENOENT); - if (ret != -ENOENT) - BTRFS_I(dir)->log_dirty_trans = trans->transid; ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); @@ -2224,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); + + btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); @@ -2498,6 +2555,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.type = (u8)-1; search_again: + path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto error; @@ -2644,6 +2702,7 @@ delete: break; } if (found_extent) { + btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, leaf->start, root_owner, @@ -2848,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { - err = btrfs_cont_expand(inode, attr->ia_size); - if (err) - return err; + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + if (attr->ia_size > inode->i_size) { + err = btrfs_cont_expand(inode, attr->ia_size); + if (err) + return err; + } else if (inode->i_size > 0 && + attr->ia_size == 0) { + + /* we're truncating a file that used to have good + * data down to zero. Make sure it gets into + * the ordered flush list so that any new writes + * get down to disk quickly. + */ + BTRFS_I(inode)->ordered_data_close = 1; + } } err = inode_setattr(inode, attr); @@ -2984,13 +3053,14 @@ static noinline void init_btrfs_i(struct inode *inode) bi->disk_i_size = 0; bi->flags = 0; bi->index_cnt = (u64)-1; - bi->log_dirty_trans = 0; + bi->last_unlink_trans = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, inode->i_mapping, GFP_NOFS); INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); + INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); @@ -3449,6 +3519,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[0] = sizeof(struct btrfs_inode_item); sizes[1] = name_len + sizeof(*ref); + path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); if (ret != 0) goto fail; @@ -3727,6 +3798,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; nr = trans->blocks_used; + + btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); btrfs_end_transaction_throttle(trans, root); fail: if (drop_inode) { @@ -4292,8 +4365,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = fdentry(vma->vm_file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -4306,10 +4380,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) u64 page_end; ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) + if (ret) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; goto out; + } - ret = -EINVAL; + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); size = i_size_read(inode); @@ -4357,6 +4436,8 @@ again: } ClearPageChecked(page); set_page_dirty(page); + + BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: @@ -4382,6 +4463,27 @@ static void btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); + + /* + * setattr is responsible for setting the ordered_data_close flag, + * but that is only tested during the last file release. That + * could happen well after the next commit, leaving a great big + * window where new writes may get lost if someone chooses to write + * to this file after truncating to zero + * + * The inode doesn't have any dirty data here, and so if we commit + * this is a noop. If someone immediately starts writing to the inode + * it is very likely we'll catch some of their writes in this + * transaction, and the commit will find this file on the ordered + * data list with good things to send down. + * + * This is a best effort solution, there is still a window where + * using truncate to replace the contents of the file will + * end up with a zero length file after a crash. + */ + if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + btrfs_add_ordered_operation(trans, root, inode); + btrfs_set_trans_block_group(trans, inode); btrfs_i_size_write(inode, inode->i_size); @@ -4458,12 +4560,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->i_acl = BTRFS_ACL_NOT_CACHED; ei->i_default_acl = BTRFS_ACL_NOT_CACHED; INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->ordered_operations); return &ei->vfs_inode; } void btrfs_destroy_inode(struct inode *inode) { struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); @@ -4474,13 +4579,24 @@ void btrfs_destroy_inode(struct inode *inode) BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) posix_acl_release(BTRFS_I(inode)->i_default_acl); - spin_lock(&BTRFS_I(inode)->root->list_lock); + /* + * Make sure we're properly removed from the ordered operation + * lists. + */ + smp_mb(); + if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { + spin_lock(&root->fs_info->ordered_extent_lock); + list_del_init(&BTRFS_I(inode)->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); + } + + spin_lock(&root->list_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" " list\n", inode->i_ino); dump_stack(); } - spin_unlock(&BTRFS_I(inode)->root->list_lock); + spin_unlock(&root->list_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -4605,8 +4721,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_unlock; + /* + * we're using rename to replace one file with another. + * and the replacement file is large. Start IO on it now so + * we don't add too much work to the end of the transaction + */ + if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && + new_inode->i_size && + old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(old_inode->i_mapping); + trans = btrfs_start_transaction(root, 1); + /* + * make sure the inode gets flushed if it is replacing + * something. + */ + if (new_inode && new_inode->i_size && + old_inode && S_ISREG(old_inode->i_mode)) { + btrfs_add_ordered_operation(trans, root, old_inode); + } + + /* + * this is an ugly little race, but the rename is required to make + * sure that if we crash, the inode is either at the old name + * or the new one. pinning the log transaction lets us make sure + * we don't allow a log commit to come in after we unlink the + * name but before we add the new name back in. + */ + btrfs_pin_log_trans(root); + btrfs_set_trans_block_group(trans, new_dir); btrfs_inc_nlink(old_dentry->d_inode); @@ -4614,6 +4758,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, old_dentry->d_name.name, old_dentry->d_name.len); @@ -4645,7 +4792,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; + btrfs_log_new_name(trans, old_inode, old_dir, + new_dentry->d_parent); out_fail: + + /* this btrfs_end_log_trans just allows the current + * log-sub transaction to complete + */ + btrfs_end_log_trans(root); btrfs_end_transaction_throttle(trans, root); out_unlock: return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bca729fc80c..7594bec1be1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, goto out_dput; if (!IS_POSIXACL(parent->dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(parent->mnt); if (error) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 47b0a88c12a..a5310c0f41e 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb) static int btrfs_spin_on_block(struct extent_buffer *eb) { int i; + for (i = 0; i < 512; i++) { - cpu_relax(); if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) return 1; if (need_resched()) break; + cpu_relax(); } return 0; } @@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb) { int i; - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); - + if (btrfs_spin_on_block(eb)) { + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } /* spin for a bit on the BLOCKING flag */ for (i = 0; i < 2; i++) { + cpu_relax(); if (!btrfs_spin_on_block(eb)) break; @@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb) DEFINE_WAIT(wait); wait.func = btrfs_wake_function; + if (!btrfs_spin_on_block(eb)) + goto sleep; + while(1) { spin_nested(eb); @@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb) * spin for a bit, and if the blocking flag goes away, * loop around */ + cpu_relax(); if (btrfs_spin_on_block(eb)) continue; - +sleep: prepare_to_wait_exclusive(&eb->lock_wq, &wait, TASK_UNINTERRUPTIBLE); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 77c2411a5f0..53c87b197d7 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); + + /* + * we have no more ordered extents for this inode and + * no dirty pages. We can safely remove it from the + * list of ordered extents + */ + if (RB_EMPTY_ROOT(&tree->tree) && + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + list_del_init(&BTRFS_I(inode)->ordered_operations); + } spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); mutex_unlock(&tree->mutex); @@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) } /* + * this is used during transaction commit to write all the inodes + * added to the ordered operation list. These files must be fully on + * disk before the transaction commits. + * + * we have two modes here, one is to just start the IO via filemap_flush + * and the other is to wait for all the io. When we wait, we have an + * extra check to make sure the ordered operation list really is empty + * before we return + */ +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +{ + struct btrfs_inode *btrfs_inode; + struct inode *inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); +again: + list_splice_init(&root->fs_info->ordered_operations, &splice); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + inode = &btrfs_inode->vfs_inode; + + list_del_init(&btrfs_inode->ordered_operations); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(inode); + + if (!wait && inode) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + if (wait) + btrfs_wait_ordered_range(inode, 0, (u64)-1); + else + filemap_flush(inode->i_mapping); + iput(inode); + } + + cond_resched(); + spin_lock(&root->fs_info->ordered_extent_lock); + } + if (wait && !list_empty(&root->fs_info->ordered_operations)) + goto again; + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + +/* * Used to start IO or wait for a given ordered extent to finish. * * If wait is one, this effectively waits on page writeback for all the pages @@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, return ret; } + +/* + * add a given inode to the list of inodes that must be fully on + * disk before a transaction commit finishes. + * + * This basically gives us the ext3 style data=ordered mode, and it is mostly + * used to make sure renamed files are fully on disk. + * + * It is a noop if the inode is already fully on disk. + * + * If trans is not null, we'll do a friendly check for a transaction that + * is already flushing things and force the IO down ourselves. + */ +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + u64 last_mod; + + last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); + + /* + * if this file hasn't been changed since the last transaction + * commit, we can safely return without doing anything + */ + if (last_mod < root->fs_info->last_trans_committed) + return 0; + + /* + * the transaction is already committing. Just start the IO and + * don't bother with all of this list nonsense + */ + if (trans && root->fs_info->running_transaction->blocked) { + btrfs_wait_ordered_range(inode, 0, (u64)-1); + return 0; + } + + spin_lock(&root->fs_info->ordered_extent_lock); + if (list_empty(&BTRFS_I(inode)->ordered_operations)) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ab66d5e8d6d..3d31c8827b0 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4112d53d4f4..664782c6a2d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -65,6 +65,15 @@ static noinline int join_transaction(struct btrfs_root *root) cur_trans->use_count = 1; cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); + + cur_trans->delayed_refs.root.rb_node = NULL; + cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.num_heads = 0; + cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; + spin_lock_init(&cur_trans->delayed_refs.lock); + INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, @@ -182,6 +191,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->block_group = 0; h->alloc_exclude_nr = 0; h->alloc_exclude_start = 0; + h->delayed_ref_updates = 0; + root->fs_info->running_transaction->use_count++; mutex_unlock(&root->fs_info->trans_mutex); return h; @@ -271,7 +282,6 @@ void btrfs_throttle(struct btrfs_root *root) if (!root->fs_info->open_ioctl_trans) wait_current_trans(root); mutex_unlock(&root->fs_info->trans_mutex); - throttle_on_drops(root); } @@ -280,6 +290,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *info = root->fs_info; + int count = 0; + + while (count < 4) { + unsigned long cur = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (cur && + trans->transaction->delayed_refs.num_heads_ready > 64) { + trans->delayed_ref_updates = 0; + + /* + * do a full flush if the transaction is trying + * to close + */ + if (trans->transaction->delayed_refs.flushing) + cur = 0; + btrfs_run_delayed_refs(trans, root, cur); + } else { + break; + } + count++; + } mutex_lock(&info->trans_mutex); cur_trans = info->running_transaction; @@ -424,9 +455,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; struct btrfs_root *tree_root = root->fs_info->tree_root; - btrfs_extent_post_op(trans, root); btrfs_write_dirty_block_groups(trans, root); - btrfs_extent_post_op(trans, root); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); @@ -438,14 +470,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, btrfs_header_level(root->node)); btrfs_set_root_generation(&root->root_item, trans->transid); - btrfs_extent_post_op(trans, root); - ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); BUG_ON(ret); btrfs_write_dirty_block_groups(trans, root); - btrfs_extent_post_op(trans, root); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); } return 0; } @@ -459,15 +491,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *next; struct extent_buffer *eb; + int ret; - btrfs_extent_post_op(trans, fs_info->tree_root); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); eb = btrfs_lock_root_node(fs_info->tree_root); - btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); btrfs_tree_unlock(eb); free_extent_buffer(eb); - btrfs_extent_post_op(trans, fs_info->tree_root); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; @@ -475,6 +510,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, root = list_entry(next, struct btrfs_root, dirty_list); update_cowonly_root(trans, root); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); } return 0; } @@ -635,6 +673,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) } /* + * when dropping snapshots, we generate a ton of delayed refs, and it makes + * sense not to join the transaction while it is trying to flush the current + * queue of delayed refs out. + * + * This is used by the drop snapshot code only + */ +static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) +{ + DEFINE_WAIT(wait); + + mutex_lock(&info->trans_mutex); + while (info->running_transaction && + info->running_transaction->delayed_refs.flushing) { + prepare_to_wait(&info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&info->trans_mutex); + schedule(); + mutex_lock(&info->trans_mutex); + finish_wait(&info->transaction_wait, &wait); + } + mutex_unlock(&info->trans_mutex); + return 0; +} + +/* * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on * all of them */ @@ -661,7 +724,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, atomic_inc(&root->fs_info->throttles); while (1) { + /* + * we don't want to jump in and create a bunch of + * delayed refs if the transaction is starting to close + */ + wait_transaction_pre_flush(tree_root->fs_info); trans = btrfs_start_transaction(tree_root, 1); + + /* + * we've joined a transaction, make sure it isn't + * closing right now + */ + if (trans->transaction->delayed_refs.flushing) { + btrfs_end_transaction(trans, tree_root); + continue; + } + mutex_lock(&root->fs_info->drop_mutex); ret = btrfs_drop_snapshot(trans, dirty->root); if (ret != -EAGAIN) @@ -766,7 +844,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); old = btrfs_lock_root_node(root); - btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); + btrfs_cow_block(trans, root, old, NULL, 0, &old); btrfs_copy_root(trans, root, old, &tmp, objectid); btrfs_tree_unlock(old); @@ -894,12 +972,31 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; + int should_grow = 0; + unsigned long now = get_seconds(); + + btrfs_run_ordered_operations(root, 0); + + /* make a pass through all the delayed refs we have so far + * any runnings procs may add more while we are here + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + BUG_ON(ret); + + cur_trans = trans->transaction; + /* + * set the flushing flag so procs in this transaction have to + * start sending their work down. + */ + cur_trans->delayed_refs.flushing = 1; + + ret = btrfs_run_delayed_refs(trans, root, 0); + BUG_ON(ret); - INIT_LIST_HEAD(&dirty_fs_roots); mutex_lock(&root->fs_info->trans_mutex); - if (trans->transaction->in_commit) { - cur_trans = trans->transaction; - trans->transaction->use_count++; + INIT_LIST_HEAD(&dirty_fs_roots); + if (cur_trans->in_commit) { + cur_trans->use_count++; mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); @@ -922,7 +1019,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, trans->transaction->in_commit = 1; trans->transaction->blocked = 1; - cur_trans = trans->transaction; if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); @@ -937,6 +1033,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } } + if (now < cur_trans->start_time || now - cur_trans->start_time < 1) + should_grow = 1; + do { int snap_pending = 0; joined = cur_trans->num_joined; @@ -949,7 +1048,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (cur_trans->num_writers > 1) timeout = MAX_SCHEDULE_TIMEOUT; - else + else if (should_grow) timeout = 1; mutex_unlock(&root->fs_info->trans_mutex); @@ -959,16 +1058,30 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, BUG_ON(ret); } - schedule_timeout(timeout); + /* + * rename don't use btrfs_join_transaction, so, once we + * set the transaction to blocked above, we aren't going + * to get any new ordered operations. We can safely run + * it here and no for sure that nothing new will be added + * to the list + */ + btrfs_run_ordered_operations(root, 1); + + smp_mb(); + if (cur_trans->num_writers > 1 || should_grow) + schedule_timeout(timeout); mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); } while (cur_trans->num_writers > 1 || - (cur_trans->num_joined != joined)); + (should_grow && cur_trans->num_joined != joined)); ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + WARN_ON(cur_trans != trans->transaction); /* btrfs_commit_tree_roots is responsible for getting the @@ -1032,6 +1145,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_copy_pinned(root, pinned_copy); trans->transaction->blocked = 0; + wake_up(&root->fs_info->transaction_throttle); wake_up(&root->fs_info->transaction_wait); @@ -1058,6 +1172,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->trans_mutex); cur_trans->commit_done = 1; + root->fs_info->last_trans_committed = cur_trans->transid; wake_up(&cur_trans->commit_wait); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index ea292117f88..94f5bde2b58 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -19,10 +19,16 @@ #ifndef __BTRFS_TRANSACTION__ #define __BTRFS_TRANSACTION__ #include "btrfs_inode.h" +#include "delayed-ref.h" struct btrfs_transaction { u64 transid; + /* + * total writers in this transaction, it must be zero before the + * transaction can end + */ unsigned long num_writers; + unsigned long num_joined; int in_commit; int use_count; @@ -34,6 +40,7 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; + struct btrfs_delayed_ref_root delayed_refs; }; struct btrfs_trans_handle { @@ -44,6 +51,7 @@ struct btrfs_trans_handle { u64 block_group; u64 alloc_exclude_start; u64 alloc_exclude_nr; + unsigned long delayed_ref_updates; }; struct btrfs_pending_snapshot { diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 98d25fa4570..b10eacdb162 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, } btrfs_release_path(root, path); - if (is_extent) - btrfs_extent_post_op(trans, root); out: if (path) btrfs_free_path(path); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9c462fbd60f..fc9b87a7975 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -35,6 +35,49 @@ #define LOG_INODE_EXISTS 1 /* + * directory trouble cases + * + * 1) on rename or unlink, if the inode being unlinked isn't in the fsync + * log, we must force a full commit before doing an fsync of the directory + * where the unlink was done. + * ---> record transid of last unlink/rename per directory + * + * mkdir foo/some_dir + * normal commit + * rename foo/some_dir foo2/some_dir + * mkdir foo/some_dir + * fsync foo/some_dir/some_file + * + * The fsync above will unlink the original some_dir without recording + * it in its new location (foo2). After a crash, some_dir will be gone + * unless the fsync of some_file forces a full commit + * + * 2) we must log any new names for any file or dir that is in the fsync + * log. ---> check inode while renaming/linking. + * + * 2a) we must log any new names for any file or dir during rename + * when the directory they are being removed from was logged. + * ---> check inode and old parent dir during rename + * + * 2a is actually the more important variant. With the extra logging + * a crash might unlink the old name without recreating the new one + * + * 3) after a crash, we must go through any directories with a link count + * of zero and redo the rm -rf + * + * mkdir f1/foo + * normal commit + * rm -rf f1/foo + * fsync(f1) + * + * The directory f1 was fully removed from the FS, but fsync was never + * called on f1, only its parent dir. After a crash the rm -rf must + * be replayed. This must be able to recurse down the entire + * directory tree. The inode link count fixup code takes care of the + * ugly details. + */ + +/* * stages for the tree walking. The first * stage (0) is to only pin down the blocks we find * the second stage (1) is to make sure that all the inodes @@ -47,12 +90,17 @@ #define LOG_WALK_REPLAY_INODES 1 #define LOG_WALK_REPLAY_ALL 2 -static int __btrfs_log_inode(struct btrfs_trans_handle *trans, +static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all); /* * tree logging is a special write ahead log used to make sure that @@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root) } /* + * This either makes the current running log transaction wait + * until you call btrfs_end_log_trans() or it makes any future + * log transactions wait until you call btrfs_end_log_trans() + */ +int btrfs_pin_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + mutex_lock(&root->log_mutex); + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return ret; +} + +/* * indicate we're done making changes to the log tree * and wake up anyone waiting to do a sync */ -static int end_log_trans(struct btrfs_root *root) +int btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { smp_mb(); @@ -203,7 +266,6 @@ static int process_one_buffer(struct btrfs_root *log, mutex_lock(&log->fs_info->pinned_mutex); btrfs_update_pinned_extents(log->fs_info->extent_root, eb->start, eb->len, 1); - mutex_unlock(&log->fs_info->pinned_mutex); } if (btrfs_buffer_uptodate(eb, gen)) { @@ -603,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = link_to_fixup_dir(trans, root, path, location.objectid); BUG_ON(ret); + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); BUG_ON(ret); kfree(name); @@ -804,6 +867,7 @@ conflict_again: victim_name_len)) { btrfs_inc_nlink(inode); btrfs_release_path(root, path); + ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); @@ -922,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.offset--; btrfs_release_path(root, path); } - btrfs_free_path(path); + btrfs_release_path(root, path); if (nlink != inode->i_nlink) { inode->i_nlink = nlink; btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; + if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + inode->i_ino, 1); + BUG_ON(ret); + } + btrfs_free_path(path); + return 0; } @@ -971,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, iput(inode); - if (key.offset == 0) - break; - key.offset--; + /* + * fixup on a directory may create new entries, + * make sure we always look for the highset possible + * offset + */ + key.offset = (u64)-1; } btrfs_release_path(root, path); return 0; @@ -1313,11 +1387,11 @@ again: read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); log_di = NULL; - if (dir_key->type == BTRFS_DIR_ITEM_KEY) { + if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { log_di = btrfs_lookup_dir_item(trans, log, log_path, dir_key->objectid, name, name_len, 0); - } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { + } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { log_di = btrfs_lookup_dir_index_item(trans, log, log_path, dir_key->objectid, @@ -1378,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid) + u64 dirid, int del_all) { u64 range_start; u64 range_end; @@ -1408,10 +1482,14 @@ again: range_start = 0; range_end = 0; while (1) { - ret = find_dir_range(log, path, dirid, key_type, - &range_start, &range_end); - if (ret != 0) - break; + if (del_all) + range_end = (u64)-1; + else { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + } dir_key.offset = range_start; while (1) { @@ -1437,7 +1515,8 @@ again: break; ret = check_item_in_log(trans, root, log, path, - log_path, dir, &found_key); + log_path, dir, + &found_key); BUG_ON(ret); if (found_key.offset == (u64)-1) break; @@ -1514,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, - root, log, path, key.objectid); + root, log, path, key.objectid, 0); BUG_ON(ret); } ret = overwrite_item(wc->trans, root, path, @@ -1533,6 +1612,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); BUG_ON(ret); + + /* if the nlink count is zero here, the iput + * will free the inode. We bump it to make + * sure it doesn't get freed until the link + * count fixup is done + */ + if (inode->i_nlink == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(wc->trans, + root, inode); + } iput(inode); } ret = link_to_fixup_dir(wc->trans, root, @@ -1840,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_root *root, unsigned long transid) +static int wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -1854,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid) prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->log_transid < transid + 2 && + + if (root->fs_info->last_trans_log_full_commit != + trans->transid && root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])) schedule(); + finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); } while (root->log_transid < transid + 2 && @@ -1864,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid) return 0; } -static int wait_for_writer(struct btrfs_root *root) +static int wait_for_writer(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { DEFINE_WAIT(wait); while (atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (atomic_read(&root->log_writers)) + if (root->fs_info->last_trans_log_full_commit != + trans->transid && atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); @@ -1882,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root) /* * btrfs_sync_log does sends a given tree log down to the disk and * updates the super blocks to record it. When this call is done, - * you know that any inodes previously logged are safely on disk + * you know that any inodes previously logged are safely on disk only + * if it returns 0. + * + * Any other return value means you need to call btrfs_commit_transaction. + * Some of the edge cases for fsyncing directories that have had unlinks + * or renames done in the past mean that sometimes the only safe + * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, + * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -1896,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(root, root->log_transid); + wait_log_commit(trans, root, root->log_transid); mutex_unlock(&root->log_mutex); return 0; } @@ -1904,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(root, root->log_transid - 1); + wait_log_commit(trans, root, root->log_transid - 1); while (1) { unsigned long batch = root->log_batch; mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); - wait_for_writer(root); + + wait_for_writer(trans, root); if (batch == root->log_batch) break; } + /* bail out if we need to do a full commit */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + ret = -EAGAIN; + mutex_unlock(&root->log_mutex); + goto out; + } + ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); @@ -1951,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { - wait_log_commit(log_root_tree, log_root_tree->log_transid); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); - if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) - wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid - 1); + } + + wait_for_writer(trans, log_root_tree); - wait_for_writer(log_root_tree); + /* + * now that we've moved on to the tree of log tree roots, + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; + } ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages); @@ -1985,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * in and cause problems either. */ write_ctree_super(trans, root->fs_info->tree_root, 2); + ret = 0; +out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) @@ -1998,7 +2124,8 @@ out: return 0; } -/* * free all the extents used by the tree log. This should be called +/* + * free all the extents used by the tree log. This should be called * at commit time of the full transaction */ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -2132,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_free_path(path); mutex_unlock(&BTRFS_I(dir)->log_mutex); - end_log_trans(root); + btrfs_end_log_trans(root); return 0; } @@ -2159,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); - end_log_trans(root); + btrfs_end_log_trans(root); return ret; } @@ -2559,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * * This handles both files and directories. */ -static int __btrfs_log_inode(struct btrfs_trans_handle *trans, +static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only) { @@ -2585,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans, min_key.offset = 0; max_key.objectid = inode->i_ino; + + /* today the code can only do partial logging of directories */ + if (!S_ISDIR(inode->i_mode)) + inode_only = LOG_INODE_ALL; + if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; max_key.offset = (u64)-1; - /* - * if this inode has already been logged and we're in inode_only - * mode, we don't want to delete the things that have already - * been written to the log. - * - * But, if the inode has been through an inode_only log, - * the logged_trans field is not set. This allows us to catch - * any new names for this inode in the backrefs by logging it - * again - */ - if (inode_only == LOG_INODE_EXISTS && - BTRFS_I(inode)->logged_trans == trans->transid) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - goto out; - } mutex_lock(&BTRFS_I(inode)->log_mutex); /* @@ -2693,7 +2809,6 @@ next_slot: if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { btrfs_release_path(root, path); btrfs_release_path(log, dst_path); - BTRFS_I(inode)->log_dirty_trans = 0; ret = log_directory_changes(trans, root, inode, path, dst_path); BUG_ON(ret); } @@ -2702,19 +2817,69 @@ next_slot: btrfs_free_path(path); btrfs_free_path(dst_path); -out: return 0; } -int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only) +/* + * follow the dentry parent pointers up the chain and see if any + * of the directories in it require a full commit before they can + * be logged. Returns zero if nothing special needs to be done or 1 if + * a full commit is required. + */ +static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + struct inode *inode, + struct dentry *parent, + struct super_block *sb, + u64 last_committed) { - int ret; + int ret = 0; + struct btrfs_root *root; - start_log_trans(trans, root); - ret = __btrfs_log_inode(trans, root, inode, inode_only); - end_log_trans(root); + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto out; + + if (!S_ISDIR(inode->i_mode)) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + goto out; + inode = parent->d_inode; + } + + while (1) { + BTRFS_I(inode)->logged_trans = trans->transid; + smp_mb(); + + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + root = BTRFS_I(inode)->root; + + /* + * make sure any commits to the log are forced + * to be full commits + */ + root->fs_info->last_trans_log_full_commit = + trans->transid; + ret = 1; + break; + } + + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + + if (parent == sb->s_root) + break; + + parent = parent->d_parent; + inode = parent->d_inode; + + } +out: return ret; } @@ -2724,31 +2889,65 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans, * only logging is done of any parent directories that are older than * the last committed transaction */ -int btrfs_log_dentry(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry) +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only) { - int inode_only = LOG_INODE_ALL; + int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; - int ret; + int ret = 0; + u64 last_committed = root->fs_info->last_trans_committed; + + sb = inode->i_sb; + + if (root->fs_info->last_trans_log_full_commit > + root->fs_info->last_trans_committed) { + ret = 1; + goto end_no_trans; + } + + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; start_log_trans(trans, root); - sb = dentry->d_inode->i_sb; - while (1) { - ret = __btrfs_log_inode(trans, root, dentry->d_inode, - inode_only); - BUG_ON(ret); - inode_only = LOG_INODE_EXISTS; - dentry = dentry->d_parent; - if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto no_parent; + + inode_only = LOG_INODE_EXISTS; + while (1) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; - if (BTRFS_I(dentry->d_inode)->generation <= - root->fs_info->last_trans_committed) + inode = parent->d_inode; + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + } + if (parent == sb->s_root) break; + + parent = parent->d_parent; } - end_log_trans(root); - return 0; +no_parent: + ret = 0; + btrfs_end_log_trans(root); +end_no_trans: + return ret; } /* @@ -2760,12 +2959,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans, int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry) { - u64 gen; - gen = root->fs_info->last_trans_new_blockgroup; - if (gen > root->fs_info->last_trans_committed) - return 1; - else - return btrfs_log_dentry(trans, root, dentry); + return btrfs_log_inode_parent(trans, root, dentry->d_inode, + dentry->d_parent, 0); } /* @@ -2884,3 +3079,94 @@ again: kfree(log_root_tree); return 0; } + +/* + * there are some corner cases where we want to force a full + * commit instead of allowing a directory to be logged. + * + * They revolve around files there were unlinked from the directory, and + * this function updates the parent directory so that a full commit is + * properly done if it is fsync'd later after the unlinks are done. + */ +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename) +{ + /* + * when we're logging a file, if it hasn't been renamed + * or unlinked, and its inode is fully committed on disk, + * we don't have to worry about walking up the directory chain + * to log its parents. + * + * So, we use the last_unlink_trans field to put this transid + * into the file. When the file is logged we check it and + * don't log the parents if the file is fully on disk. + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this directory was already logged any new + * names for this file/dir will get recorded + */ + smp_mb(); + if (BTRFS_I(dir)->logged_trans == trans->transid) + return; + + /* + * if the inode we're about to unlink was logged, + * the log will be properly updated for any new names + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) + return; + + /* + * when renaming files across directories, if the directory + * there we're unlinking from gets fsync'd later on, there's + * no way to find the destination directory later and fsync it + * properly. So, we have to be conservative and force commits + * so the new name gets discovered. + */ + if (for_rename) + goto record; + + /* we can safely do the unlink without any special recording */ + return; + +record: + BTRFS_I(dir)->last_unlink_trans = trans->transid; +} + +/* + * Call this after adding a new name for a file and it will properly + * update the log to reflect the new name. + * + * It will return zero if all goes well, and it will return 1 if a + * full transaction commit is required. + */ +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent) +{ + struct btrfs_root * root = BTRFS_I(inode)->root; + + /* + * this will force the logging code to walk the dentry chain + * up for the file + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this inode hasn't been logged and directory we're renaming it + * from hasn't been logged, we don't need to log it + */ + if (BTRFS_I(inode)->logged_trans <= + root->fs_info->last_trans_committed && + (!old_dir || BTRFS_I(old_dir)->logged_trans <= + root->fs_info->last_trans_committed)) + return 0; + + return btrfs_log_inode_parent(trans, root, inode, parent, 1); +} + diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index b9409b32ed0..d09c7609e16 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -22,14 +22,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_log_dentry(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry); -int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *inode, u64 dirid); +int btrfs_join_running_log_trans(struct btrfs_root *root); +int btrfs_end_log_trans(struct btrfs_root *root); +int btrfs_pin_log_trans(struct btrfs_root *root); +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only); +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename); +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent); #endif diff --git a/fs/buffer.c b/fs/buffer.c index a2fd743d97c..c2fa1be4923 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -199,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) head = page_buffers(page); bh = head; do { - if (bh->b_blocknr == block) { + if (!buffer_mapped(bh)) + all_mapped = 0; + else if (bh->b_blocknr == block) { ret = bh; get_bh(bh); goto out_unlock; } - if (!buffer_mapped(bh)) - all_mapped = 0; bh = bh->b_this_page; } while (bh != head); @@ -290,7 +290,7 @@ static void free_more_memory(void) &zone); if (zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, - GFP_NOFS); + GFP_NOFS, NULL); } } @@ -547,6 +547,39 @@ repeat: return err; } +void do_thaw_all(unsigned long unused) +{ + struct super_block *sb; + char b[BDEVNAME_SIZE]; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + pdflush_operation(do_thaw_all, 0); +} + /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written @@ -621,14 +654,7 @@ static void __set_page_dirty(struct page *page, spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } @@ -2320,13 +2346,14 @@ int block_commit_write(struct page *page, unsigned from, unsigned to) * unlock the page. */ int -block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; - int ret = -EINVAL; + int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ lock_page(page); size = i_size_read(inode); @@ -2346,6 +2373,13 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page, if (!ret) ret = block_commit_write(page, 0, end); + if (unlikely(ret)) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; + } + out_unlock: unlock_page(page); return ret; @@ -3281,7 +3315,6 @@ EXPORT_SYMBOL(cont_write_begin); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); EXPORT_SYMBOL(file_fsync); -EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig new file mode 100644 index 00000000000..80e9c6167f0 --- /dev/null +++ b/fs/cachefiles/Kconfig @@ -0,0 +1,39 @@ + +config CACHEFILES + tristate "Filesystem caching on files" + depends on FSCACHE && BLOCK + help + This permits use of a mounted filesystem as a cache for other + filesystems - primarily networking filesystems - thus allowing fast + local disk to enhance the speed of slower devices. + + See Documentation/filesystems/caching/cachefiles.txt for more + information. + +config CACHEFILES_DEBUG + bool "Debug CacheFiles" + depends on CACHEFILES + help + This permits debugging to be dynamically enabled in the filesystem + caching on files module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/cachefiles/parameter/debug or + by including a debugging specifier in /etc/cachefilesd.conf. + +config CACHEFILES_HISTOGRAM + bool "Gather latency information on CacheFiles" + depends on CACHEFILES && PROC_FS + help + + This option causes latency information to be gathered on CacheFiles + operation and exported through file: + + /proc/fs/cachefiles/histogram + + The generation of this histogram adds a certain amount of overhead to + execution as there are a number of points at which data is gathered, + and on a multi-CPU system these may be on cachelines that keep + bouncing between CPUs. On the other hand, the histogram may be + useful for debugging purposes. Saying 'N' here is recommended. + + See Documentation/filesystems/caching/cachefiles.txt for more + information. diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile new file mode 100644 index 00000000000..32cbab0ffce --- /dev/null +++ b/fs/cachefiles/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for caching in a mounted filesystem +# + +cachefiles-y := \ + bind.o \ + daemon.o \ + interface.o \ + key.o \ + main.o \ + namei.o \ + rdwr.o \ + security.o \ + xattr.o + +cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o + +obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c new file mode 100644 index 00000000000..3797e0077b3 --- /dev/null +++ b/fs/cachefiles/bind.c @@ -0,0 +1,286 @@ +/* Bind and unbind a cache from the filesystem backing it + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/completion.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/statfs.h> +#include <linux/ctype.h> +#include "internal.h" + +static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches); + +/* + * bind a directory as a cache + */ +int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) +{ + _enter("{%u,%u,%u,%u,%u,%u},%s", + cache->frun_percent, + cache->fcull_percent, + cache->fstop_percent, + cache->brun_percent, + cache->bcull_percent, + cache->bstop_percent, + args); + + /* start by checking things over */ + ASSERT(cache->fstop_percent >= 0 && + cache->fstop_percent < cache->fcull_percent && + cache->fcull_percent < cache->frun_percent && + cache->frun_percent < 100); + + ASSERT(cache->bstop_percent >= 0 && + cache->bstop_percent < cache->bcull_percent && + cache->bcull_percent < cache->brun_percent && + cache->brun_percent < 100); + + if (*args) { + kerror("'bind' command doesn't take an argument"); + return -EINVAL; + } + + if (!cache->rootdirname) { + kerror("No cache directory specified"); + return -EINVAL; + } + + /* don't permit already bound caches to be re-bound */ + if (test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("Cache already bound"); + return -EBUSY; + } + + /* make sure we have copies of the tag and dirname strings */ + if (!cache->tag) { + /* the tag string is released by the fops->release() + * function, so we don't release it on error here */ + cache->tag = kstrdup("CacheFiles", GFP_KERNEL); + if (!cache->tag) + return -ENOMEM; + } + + /* add the cache */ + return cachefiles_daemon_add_cache(cache); +} + +/* + * add a cache + */ +static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) +{ + struct cachefiles_object *fsdef; + struct nameidata nd; + struct kstatfs stats; + struct dentry *graveyard, *cachedir, *root; + const struct cred *saved_cred; + int ret; + + _enter(""); + + /* we want to work under the module's security ID */ + ret = cachefiles_get_security_ID(cache); + if (ret < 0) + return ret; + + cachefiles_begin_secure(cache, &saved_cred); + + /* allocate the root index object */ + ret = -ENOMEM; + + fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); + if (!fsdef) + goto error_root_object; + + ASSERTCMP(fsdef->backer, ==, NULL); + + atomic_set(&fsdef->usage, 1); + fsdef->type = FSCACHE_COOKIE_TYPE_INDEX; + + _debug("- fsdef %p", fsdef); + + /* look up the directory at the root of the cache */ + memset(&nd, 0, sizeof(nd)); + + ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd); + if (ret < 0) + goto error_open_root; + + cache->mnt = mntget(nd.path.mnt); + root = dget(nd.path.dentry); + path_put(&nd.path); + + /* check parameters */ + ret = -EOPNOTSUPP; + if (!root->d_inode || + !root->d_inode->i_op || + !root->d_inode->i_op->lookup || + !root->d_inode->i_op->mkdir || + !root->d_inode->i_op->setxattr || + !root->d_inode->i_op->getxattr || + !root->d_sb || + !root->d_sb->s_op || + !root->d_sb->s_op->statfs || + !root->d_sb->s_op->sync_fs) + goto error_unsupported; + + ret = -EROFS; + if (root->d_sb->s_flags & MS_RDONLY) + goto error_unsupported; + + /* determine the security of the on-disk cache as this governs + * security ID of files we create */ + ret = cachefiles_determine_cache_security(cache, root, &saved_cred); + if (ret < 0) + goto error_unsupported; + + /* get the cache size and blocksize */ + ret = vfs_statfs(root, &stats); + if (ret < 0) + goto error_unsupported; + + ret = -ERANGE; + if (stats.f_bsize <= 0) + goto error_unsupported; + + ret = -EOPNOTSUPP; + if (stats.f_bsize > PAGE_SIZE) + goto error_unsupported; + + cache->bsize = stats.f_bsize; + cache->bshift = 0; + if (stats.f_bsize < PAGE_SIZE) + cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize); + + _debug("blksize %u (shift %u)", + cache->bsize, cache->bshift); + + _debug("size %llu, avail %llu", + (unsigned long long) stats.f_blocks, + (unsigned long long) stats.f_bavail); + + /* set up caching limits */ + do_div(stats.f_files, 100); + cache->fstop = stats.f_files * cache->fstop_percent; + cache->fcull = stats.f_files * cache->fcull_percent; + cache->frun = stats.f_files * cache->frun_percent; + + _debug("limits {%llu,%llu,%llu} files", + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop); + + stats.f_blocks >>= cache->bshift; + do_div(stats.f_blocks, 100); + cache->bstop = stats.f_blocks * cache->bstop_percent; + cache->bcull = stats.f_blocks * cache->bcull_percent; + cache->brun = stats.f_blocks * cache->brun_percent; + + _debug("limits {%llu,%llu,%llu} blocks", + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop); + + /* get the cache directory and check its type */ + cachedir = cachefiles_get_directory(cache, root, "cache"); + if (IS_ERR(cachedir)) { + ret = PTR_ERR(cachedir); + goto error_unsupported; + } + + fsdef->dentry = cachedir; + fsdef->fscache.cookie = NULL; + + ret = cachefiles_check_object_type(fsdef); + if (ret < 0) + goto error_unsupported; + + /* get the graveyard directory */ + graveyard = cachefiles_get_directory(cache, root, "graveyard"); + if (IS_ERR(graveyard)) { + ret = PTR_ERR(graveyard); + goto error_unsupported; + } + + cache->graveyard = graveyard; + + /* publish the cache */ + fscache_init_cache(&cache->cache, + &cachefiles_cache_ops, + "%s", + fsdef->dentry->d_sb->s_id); + + fscache_object_init(&fsdef->fscache, NULL, &cache->cache); + + ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag); + if (ret < 0) + goto error_add_cache; + + /* done */ + set_bit(CACHEFILES_READY, &cache->flags); + dput(root); + + printk(KERN_INFO "CacheFiles:" + " File cache on %s registered\n", + cache->cache.identifier); + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0); + cachefiles_end_secure(cache, saved_cred); + return 0; + +error_add_cache: + dput(cache->graveyard); + cache->graveyard = NULL; +error_unsupported: + mntput(cache->mnt); + cache->mnt = NULL; + dput(fsdef->dentry); + fsdef->dentry = NULL; + dput(root); +error_open_root: + kmem_cache_free(cachefiles_object_jar, fsdef); +error_root_object: + cachefiles_end_secure(cache, saved_cred); + kerror("Failed to register: %d", ret); + return ret; +} + +/* + * unbind a cache on fd release + */ +void cachefiles_daemon_unbind(struct cachefiles_cache *cache) +{ + _enter(""); + + if (test_bit(CACHEFILES_READY, &cache->flags)) { + printk(KERN_INFO "CacheFiles:" + " File cache on %s unregistering\n", + cache->cache.identifier); + + fscache_withdraw_cache(&cache->cache); + } + + dput(cache->graveyard); + mntput(cache->mnt); + + kfree(cache->rootdirname); + kfree(cache->secctx); + kfree(cache->tag); + + _leave(""); +} diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c new file mode 100644 index 00000000000..4618516dd99 --- /dev/null +++ b/fs/cachefiles/daemon.c @@ -0,0 +1,755 @@ +/* Daemon interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/completion.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/poll.h> +#include <linux/mount.h> +#include <linux/statfs.h> +#include <linux/ctype.h> +#include <linux/fs_struct.h> +#include "internal.h" + +static int cachefiles_daemon_open(struct inode *, struct file *); +static int cachefiles_daemon_release(struct inode *, struct file *); +static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t, + loff_t *); +static ssize_t cachefiles_daemon_write(struct file *, const char __user *, + size_t, loff_t *); +static unsigned int cachefiles_daemon_poll(struct file *, + struct poll_table_struct *); +static int cachefiles_daemon_frun(struct cachefiles_cache *, char *); +static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *); +static int cachefiles_daemon_brun(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *); +static int cachefiles_daemon_cull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_debug(struct cachefiles_cache *, char *); +static int cachefiles_daemon_dir(struct cachefiles_cache *, char *); +static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *); +static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *); +static int cachefiles_daemon_tag(struct cachefiles_cache *, char *); + +static unsigned long cachefiles_open; + +const struct file_operations cachefiles_daemon_fops = { + .owner = THIS_MODULE, + .open = cachefiles_daemon_open, + .release = cachefiles_daemon_release, + .read = cachefiles_daemon_read, + .write = cachefiles_daemon_write, + .poll = cachefiles_daemon_poll, +}; + +struct cachefiles_daemon_cmd { + char name[8]; + int (*handler)(struct cachefiles_cache *cache, char *args); +}; + +static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { + { "bind", cachefiles_daemon_bind }, + { "brun", cachefiles_daemon_brun }, + { "bcull", cachefiles_daemon_bcull }, + { "bstop", cachefiles_daemon_bstop }, + { "cull", cachefiles_daemon_cull }, + { "debug", cachefiles_daemon_debug }, + { "dir", cachefiles_daemon_dir }, + { "frun", cachefiles_daemon_frun }, + { "fcull", cachefiles_daemon_fcull }, + { "fstop", cachefiles_daemon_fstop }, + { "inuse", cachefiles_daemon_inuse }, + { "secctx", cachefiles_daemon_secctx }, + { "tag", cachefiles_daemon_tag }, + { "", NULL } +}; + + +/* + * do various checks + */ +static int cachefiles_daemon_open(struct inode *inode, struct file *file) +{ + struct cachefiles_cache *cache; + + _enter(""); + + /* only the superuser may do this */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* the cachefiles device may only be open once at a time */ + if (xchg(&cachefiles_open, 1) == 1) + return -EBUSY; + + /* allocate a cache record */ + cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL); + if (!cache) { + cachefiles_open = 0; + return -ENOMEM; + } + + mutex_init(&cache->daemon_mutex); + cache->active_nodes = RB_ROOT; + rwlock_init(&cache->active_lock); + init_waitqueue_head(&cache->daemon_pollwq); + + /* set default caching limits + * - limit at 1% free space and/or free files + * - cull below 5% free space and/or free files + * - cease culling above 7% free space and/or free files + */ + cache->frun_percent = 7; + cache->fcull_percent = 5; + cache->fstop_percent = 1; + cache->brun_percent = 7; + cache->bcull_percent = 5; + cache->bstop_percent = 1; + + file->private_data = cache; + cache->cachefilesd = file; + return 0; +} + +/* + * release a cache + */ +static int cachefiles_daemon_release(struct inode *inode, struct file *file) +{ + struct cachefiles_cache *cache = file->private_data; + + _enter(""); + + ASSERT(cache); + + set_bit(CACHEFILES_DEAD, &cache->flags); + + cachefiles_daemon_unbind(cache); + + ASSERT(!cache->active_nodes.rb_node); + + /* clean up the control file interface */ + cache->cachefilesd = NULL; + file->private_data = NULL; + cachefiles_open = 0; + + kfree(cache); + + _leave(""); + return 0; +} + +/* + * read the cache state + */ +static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, + size_t buflen, loff_t *pos) +{ + struct cachefiles_cache *cache = file->private_data; + char buffer[256]; + int n; + + //_enter(",,%zu,", buflen); + + if (!test_bit(CACHEFILES_READY, &cache->flags)) + return 0; + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0); + + /* summarise */ + clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags); + + n = snprintf(buffer, sizeof(buffer), + "cull=%c" + " frun=%llx" + " fcull=%llx" + " fstop=%llx" + " brun=%llx" + " bcull=%llx" + " bstop=%llx", + test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0', + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop, + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop + ); + + if (n > buflen) + return -EMSGSIZE; + + if (copy_to_user(_buffer, buffer, n) != 0) + return -EFAULT; + + return n; +} + +/* + * command the cache + */ +static ssize_t cachefiles_daemon_write(struct file *file, + const char __user *_data, + size_t datalen, + loff_t *pos) +{ + const struct cachefiles_daemon_cmd *cmd; + struct cachefiles_cache *cache = file->private_data; + ssize_t ret; + char *data, *args, *cp; + + //_enter(",,%zu,", datalen); + + ASSERT(cache); + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) + return -EIO; + + if (datalen < 0 || datalen > PAGE_SIZE - 1) + return -EOPNOTSUPP; + + /* drag the command string into the kernel so we can parse it */ + data = kmalloc(datalen + 1, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(data, _data, datalen) != 0) + goto error; + + data[datalen] = '\0'; + + ret = -EINVAL; + if (memchr(data, '\0', datalen)) + goto error; + + /* strip any newline */ + cp = memchr(data, '\n', datalen); + if (cp) { + if (cp == data) + goto error; + + *cp = '\0'; + } + + /* parse the command */ + ret = -EOPNOTSUPP; + + for (args = data; *args; args++) + if (isspace(*args)) + break; + if (*args) { + if (args == data) + goto error; + *args = '\0'; + for (args++; isspace(*args); args++) + continue; + } + + /* run the appropriate command handler */ + for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++) + if (strcmp(cmd->name, data) == 0) + goto found_command; + +error: + kfree(data); + //_leave(" = %zd", ret); + return ret; + +found_command: + mutex_lock(&cache->daemon_mutex); + + ret = -EIO; + if (!test_bit(CACHEFILES_DEAD, &cache->flags)) + ret = cmd->handler(cache, args); + + mutex_unlock(&cache->daemon_mutex); + + if (ret == 0) + ret = datalen; + goto error; +} + +/* + * poll for culling state + * - use POLLOUT to indicate culling state + */ +static unsigned int cachefiles_daemon_poll(struct file *file, + struct poll_table_struct *poll) +{ + struct cachefiles_cache *cache = file->private_data; + unsigned int mask; + + poll_wait(file, &cache->daemon_pollwq, poll); + mask = 0; + + if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) + mask |= POLLIN; + + if (test_bit(CACHEFILES_CULLING, &cache->flags)) + mask |= POLLOUT; + + return mask; +} + +/* + * give a range error for cache space constraints + * - can be tail-called + */ +static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, + char *args) +{ + kerror("Free space limits must be in range" + " 0%%<=stop<cull<run<100%%"); + + return -EINVAL; +} + +/* + * set the percentage of files at which to stop culling + * - command: "frun <N>%" + */ +static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args) +{ + unsigned long frun; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + frun = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (frun <= cache->fcull_percent || frun >= 100) + return cachefiles_daemon_range_error(cache, args); + + cache->frun_percent = frun; + return 0; +} + +/* + * set the percentage of files at which to start culling + * - command: "fcull <N>%" + */ +static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args) +{ + unsigned long fcull; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + fcull = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->fcull_percent = fcull; + return 0; +} + +/* + * set the percentage of files at which to stop allocating + * - command: "fstop <N>%" + */ +static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) +{ + unsigned long fstop; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + fstop = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (fstop < 0 || fstop >= cache->fcull_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->fstop_percent = fstop; + return 0; +} + +/* + * set the percentage of blocks at which to stop culling + * - command: "brun <N>%" + */ +static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args) +{ + unsigned long brun; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + brun = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (brun <= cache->bcull_percent || brun >= 100) + return cachefiles_daemon_range_error(cache, args); + + cache->brun_percent = brun; + return 0; +} + +/* + * set the percentage of blocks at which to start culling + * - command: "bcull <N>%" + */ +static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args) +{ + unsigned long bcull; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + bcull = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->bcull_percent = bcull; + return 0; +} + +/* + * set the percentage of blocks at which to stop allocating + * - command: "bstop <N>%" + */ +static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) +{ + unsigned long bstop; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + bstop = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (bstop < 0 || bstop >= cache->bcull_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->bstop_percent = bstop; + return 0; +} + +/* + * set the cache directory + * - command: "dir <name>" + */ +static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) +{ + char *dir; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty directory specified"); + return -EINVAL; + } + + if (cache->rootdirname) { + kerror("Second cache directory specified"); + return -EEXIST; + } + + dir = kstrdup(args, GFP_KERNEL); + if (!dir) + return -ENOMEM; + + cache->rootdirname = dir; + return 0; +} + +/* + * set the cache security context + * - command: "secctx <ctx>" + */ +static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) +{ + char *secctx; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty security context specified"); + return -EINVAL; + } + + if (cache->secctx) { + kerror("Second security context specified"); + return -EINVAL; + } + + secctx = kstrdup(args, GFP_KERNEL); + if (!secctx) + return -ENOMEM; + + cache->secctx = secctx; + return 0; +} + +/* + * set the cache tag + * - command: "tag <name>" + */ +static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) +{ + char *tag; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty tag specified"); + return -EINVAL; + } + + if (cache->tag) + return -EEXIST; + + tag = kstrdup(args, GFP_KERNEL); + if (!tag) + return -ENOMEM; + + cache->tag = tag; + return 0; +} + +/* + * request a node in the cache be culled from the current working directory + * - command: "cull <name>" + */ +static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) +{ + struct fs_struct *fs; + struct dentry *dir; + const struct cred *saved_cred; + int ret; + + _enter(",%s", args); + + if (strchr(args, '/')) + goto inval; + + if (!test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("cull applied to unready cache"); + return -EIO; + } + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + kerror("cull applied to dead cache"); + return -EIO; + } + + /* extract the directory dentry from the cwd */ + fs = current->fs; + read_lock(&fs->lock); + dir = dget(fs->pwd.dentry); + read_unlock(&fs->lock); + + if (!S_ISDIR(dir->d_inode->i_mode)) + goto notdir; + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_cull(cache, dir, args); + cachefiles_end_secure(cache, saved_cred); + + dput(dir); + _leave(" = %d", ret); + return ret; + +notdir: + dput(dir); + kerror("cull command requires dirfd to be a directory"); + return -ENOTDIR; + +inval: + kerror("cull command requires dirfd and filename"); + return -EINVAL; +} + +/* + * set debugging mode + * - command: "debug <mask>" + */ +static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) +{ + unsigned long mask; + + _enter(",%s", args); + + mask = simple_strtoul(args, &args, 0); + if (args[0] != '\0') + goto inval; + + cachefiles_debug = mask; + _leave(" = 0"); + return 0; + +inval: + kerror("debug command requires mask"); + return -EINVAL; +} + +/* + * find out whether an object in the current working directory is in use or not + * - command: "inuse <name>" + */ +static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) +{ + struct fs_struct *fs; + struct dentry *dir; + const struct cred *saved_cred; + int ret; + + //_enter(",%s", args); + + if (strchr(args, '/')) + goto inval; + + if (!test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("inuse applied to unready cache"); + return -EIO; + } + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + kerror("inuse applied to dead cache"); + return -EIO; + } + + /* extract the directory dentry from the cwd */ + fs = current->fs; + read_lock(&fs->lock); + dir = dget(fs->pwd.dentry); + read_unlock(&fs->lock); + + if (!S_ISDIR(dir->d_inode->i_mode)) + goto notdir; + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_check_in_use(cache, dir, args); + cachefiles_end_secure(cache, saved_cred); + + dput(dir); + //_leave(" = %d", ret); + return ret; + +notdir: + dput(dir); + kerror("inuse command requires dirfd to be a directory"); + return -ENOTDIR; + +inval: + kerror("inuse command requires dirfd and filename"); + return -EINVAL; +} + +/* + * see if we have space for a number of pages and/or a number of files in the + * cache + */ +int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr) +{ + struct kstatfs stats; + int ret; + + //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", + // (unsigned long long) cache->frun, + // (unsigned long long) cache->fcull, + // (unsigned long long) cache->fstop, + // (unsigned long long) cache->brun, + // (unsigned long long) cache->bcull, + // (unsigned long long) cache->bstop, + // fnr, bnr); + + /* find out how many pages of blockdev are available */ + memset(&stats, 0, sizeof(stats)); + + ret = vfs_statfs(cache->mnt->mnt_root, &stats); + if (ret < 0) { + if (ret == -EIO) + cachefiles_io_error(cache, "statfs failed"); + _leave(" = %d", ret); + return ret; + } + + stats.f_bavail >>= cache->bshift; + + //_debug("avail %llu,%llu", + // (unsigned long long) stats.f_ffree, + // (unsigned long long) stats.f_bavail); + + /* see if there is sufficient space */ + if (stats.f_ffree > fnr) + stats.f_ffree -= fnr; + else + stats.f_ffree = 0; + + if (stats.f_bavail > bnr) + stats.f_bavail -= bnr; + else + stats.f_bavail = 0; + + ret = -ENOBUFS; + if (stats.f_ffree < cache->fstop || + stats.f_bavail < cache->bstop) + goto begin_cull; + + ret = 0; + if (stats.f_ffree < cache->fcull || + stats.f_bavail < cache->bcull) + goto begin_cull; + + if (test_bit(CACHEFILES_CULLING, &cache->flags) && + stats.f_ffree >= cache->frun && + stats.f_bavail >= cache->brun && + test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) + ) { + _debug("cease culling"); + cachefiles_state_changed(cache); + } + + //_leave(" = 0"); + return 0; + +begin_cull: + if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { + _debug("### CULL CACHE ###"); + cachefiles_state_changed(cache); + } + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c new file mode 100644 index 00000000000..1e962348d11 --- /dev/null +++ b/fs/cachefiles/interface.c @@ -0,0 +1,449 @@ +/* FS-Cache interface to CacheFiles + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/mount.h> +#include <linux/buffer_head.h> +#include "internal.h" + +#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) + +struct cachefiles_lookup_data { + struct cachefiles_xattr *auxdata; /* auxiliary data */ + char *key; /* key path */ +}; + +static int cachefiles_attr_changed(struct fscache_object *_object); + +/* + * allocate an object record for a cookie lookup and prepare the lookup data + */ +static struct fscache_object *cachefiles_alloc_object( + struct fscache_cache *_cache, + struct fscache_cookie *cookie) +{ + struct cachefiles_lookup_data *lookup_data; + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct cachefiles_xattr *auxdata; + unsigned keylen, auxlen; + void *buffer; + char *key; + + cache = container_of(_cache, struct cachefiles_cache, cache); + + _enter("{%s},%p,", cache->cache.identifier, cookie); + + lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL); + if (!lookup_data) + goto nomem_lookup_data; + + /* create a new object record and a temporary leaf image */ + object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); + if (!object) + goto nomem_object; + + ASSERTCMP(object->backer, ==, NULL); + + BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + atomic_set(&object->usage, 1); + + fscache_object_init(&object->fscache, cookie, &cache->cache); + + object->type = cookie->def->type; + + /* get hold of the raw key + * - stick the length on the front and leave space on the back for the + * encoder + */ + buffer = kmalloc((2 + 512) + 3, GFP_KERNEL); + if (!buffer) + goto nomem_buffer; + + keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512); + ASSERTCMP(keylen, <, 512); + + *(uint16_t *)buffer = keylen; + ((char *)buffer)[keylen + 2] = 0; + ((char *)buffer)[keylen + 3] = 0; + ((char *)buffer)[keylen + 4] = 0; + + /* turn the raw key into something that can work with as a filename */ + key = cachefiles_cook_key(buffer, keylen + 2, object->type); + if (!key) + goto nomem_key; + + /* get hold of the auxiliary data and prepend the object type */ + auxdata = buffer; + auxlen = 0; + if (cookie->def->get_aux) { + auxlen = cookie->def->get_aux(cookie->netfs_data, + auxdata->data, 511); + ASSERTCMP(auxlen, <, 511); + } + + auxdata->len = auxlen + 1; + auxdata->type = cookie->def->type; + + lookup_data->auxdata = auxdata; + lookup_data->key = key; + object->lookup_data = lookup_data; + + _leave(" = %p [%p]", &object->fscache, lookup_data); + return &object->fscache; + +nomem_key: + kfree(buffer); +nomem_buffer: + BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + kmem_cache_free(cachefiles_object_jar, object); + fscache_object_destroyed(&cache->cache); +nomem_object: + kfree(lookup_data); +nomem_lookup_data: + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * attempt to look up the nominated node in this cache + */ +static void cachefiles_lookup_object(struct fscache_object *_object) +{ + struct cachefiles_lookup_data *lookup_data; + struct cachefiles_object *parent, *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("{OBJ%x}", _object->debug_id); + + cache = container_of(_object->cache, struct cachefiles_cache, cache); + parent = container_of(_object->parent, + struct cachefiles_object, fscache); + object = container_of(_object, struct cachefiles_object, fscache); + lookup_data = object->lookup_data; + + ASSERTCMP(lookup_data, !=, NULL); + + /* look up the key, creating any missing bits */ + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_walk_to_object(parent, object, + lookup_data->key, + lookup_data->auxdata); + cachefiles_end_secure(cache, saved_cred); + + /* polish off by setting the attributes of non-index files */ + if (ret == 0 && + object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) + cachefiles_attr_changed(&object->fscache); + + if (ret < 0) { + printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n", + ret); + fscache_object_lookup_error(&object->fscache); + } + + _leave(" [%d]", ret); +} + +/* + * indication of lookup completion + */ +static void cachefiles_lookup_complete(struct fscache_object *_object) +{ + struct cachefiles_object *object; + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data); + + if (object->lookup_data) { + kfree(object->lookup_data->key); + kfree(object->lookup_data->auxdata); + kfree(object->lookup_data); + object->lookup_data = NULL; + } +} + +/* + * increment the usage count on an inode object (may fail if unmounting) + */ +static +struct fscache_object *cachefiles_grab_object(struct fscache_object *_object) +{ + struct cachefiles_object *object = + container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage)); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + atomic_inc(&object->usage); + return &object->fscache; +} + +/* + * update the auxilliary data for an object object on disk + */ +static void cachefiles_update_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_xattr *auxdata; + struct cachefiles_cache *cache; + struct fscache_cookie *cookie; + const struct cred *saved_cred; + unsigned auxlen; + + _enter("{OBJ%x}", _object->debug_id); + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, struct cachefiles_cache, + cache); + cookie = object->fscache.cookie; + + if (!cookie->def->get_aux) { + _leave(" [no aux]"); + return; + } + + auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL); + if (!auxdata) { + _leave(" [nomem]"); + return; + } + + auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); + ASSERTCMP(auxlen, <, 511); + + auxdata->len = auxlen + 1; + auxdata->type = cookie->def->type; + + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_update_object_xattr(object, auxdata); + cachefiles_end_secure(cache, saved_cred); + kfree(auxdata); + _leave(""); +} + +/* + * discard the resources pinned by an object and effect retirement if + * requested + */ +static void cachefiles_drop_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + + ASSERT(_object); + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", + object->fscache.debug_id, atomic_read(&object->usage)); + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + /* delete retired objects */ + if (object->fscache.state == FSCACHE_OBJECT_RECYCLING && + _object != cache->cache.fsdef + ) { + _debug("- retire object OBJ%x", object->fscache.debug_id); + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_delete_object(cache, object); + cachefiles_end_secure(cache, saved_cred); + } + + /* close the filesystem stuff attached to the object */ + if (object->backer != object->dentry) + dput(object->backer); + object->backer = NULL; + + /* note that the object is now inactive */ + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { + write_lock(&cache->active_lock); + if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE, + &object->flags)) + BUG(); + rb_erase(&object->active_node, &cache->active_nodes); + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + write_unlock(&cache->active_lock); + } + + dput(object->dentry); + object->dentry = NULL; + + _leave(""); +} + +/* + * dispose of a reference to an object + */ +static void cachefiles_put_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct fscache_cache *cache; + + ASSERT(_object); + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", + object->fscache.debug_id, atomic_read(&object->usage)); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + ASSERTIFCMP(object->fscache.parent, + object->fscache.parent->n_children, >, 0); + + if (atomic_dec_and_test(&object->usage)) { + _debug("- kill object OBJ%x", object->fscache.debug_id); + + ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + ASSERTCMP(object->fscache.parent, ==, NULL); + ASSERTCMP(object->backer, ==, NULL); + ASSERTCMP(object->dentry, ==, NULL); + ASSERTCMP(object->fscache.n_ops, ==, 0); + ASSERTCMP(object->fscache.n_children, ==, 0); + + if (object->lookup_data) { + kfree(object->lookup_data->key); + kfree(object->lookup_data->auxdata); + kfree(object->lookup_data); + object->lookup_data = NULL; + } + + cache = object->fscache.cache; + kmem_cache_free(cachefiles_object_jar, object); + fscache_object_destroyed(cache); + } + + _leave(""); +} + +/* + * sync a cache + */ +static void cachefiles_sync_cache(struct fscache_cache *_cache) +{ + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("%p", _cache); + + cache = container_of(_cache, struct cachefiles_cache, cache); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disc */ + cachefiles_begin_secure(cache, &saved_cred); + ret = fsync_super(cache->mnt->mnt_sb); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) + cachefiles_io_error(cache, + "Attempt to sync backing fs superblock" + " returned error %d", + ret); +} + +/* + * notification the attributes on an object have changed + * - called with reads/writes excluded by FS-Cache + */ +static int cachefiles_attr_changed(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + struct iattr newattrs; + uint64_t ni_size; + loff_t oi_size; + int ret; + + _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size); + + _enter("{OBJ%x},[%llu]", + _object->debug_id, (unsigned long long) ni_size); + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + if (ni_size == object->i_size) + return 0; + + if (!object->backer) + return -ENOBUFS; + + ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + + fscache_set_store_limit(&object->fscache, ni_size); + + oi_size = i_size_read(object->backer->d_inode); + if (oi_size == ni_size) + return 0; + + newattrs.ia_size = ni_size; + newattrs.ia_valid = ATTR_SIZE; + + cachefiles_begin_secure(cache, &saved_cred); + mutex_lock(&object->backer->d_inode->i_mutex); + ret = notify_change(object->backer, &newattrs); + mutex_unlock(&object->backer->d_inode->i_mutex); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) { + fscache_set_store_limit(&object->fscache, 0); + cachefiles_io_error_obj(object, "Size set failed"); + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * dissociate a cache from all the pages it was backing + */ +static void cachefiles_dissociate_pages(struct fscache_cache *cache) +{ + _enter(""); +} + +const struct fscache_cache_ops cachefiles_cache_ops = { + .name = "cachefiles", + .alloc_object = cachefiles_alloc_object, + .lookup_object = cachefiles_lookup_object, + .lookup_complete = cachefiles_lookup_complete, + .grab_object = cachefiles_grab_object, + .update_object = cachefiles_update_object, + .drop_object = cachefiles_drop_object, + .put_object = cachefiles_put_object, + .sync_cache = cachefiles_sync_cache, + .attr_changed = cachefiles_attr_changed, + .read_or_alloc_page = cachefiles_read_or_alloc_page, + .read_or_alloc_pages = cachefiles_read_or_alloc_pages, + .allocate_page = cachefiles_allocate_page, + .allocate_pages = cachefiles_allocate_pages, + .write_page = cachefiles_write_page, + .uncache_page = cachefiles_uncache_page, + .dissociate_pages = cachefiles_dissociate_pages, +}; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h new file mode 100644 index 00000000000..19218e1463d --- /dev/null +++ b/fs/cachefiles/internal.h @@ -0,0 +1,360 @@ +/* General netfs cache on cache files internal defs + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/fscache-cache.h> +#include <linux/timer.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/security.h> + +struct cachefiles_cache; +struct cachefiles_object; + +extern unsigned cachefiles_debug; +#define CACHEFILES_DEBUG_KENTER 1 +#define CACHEFILES_DEBUG_KLEAVE 2 +#define CACHEFILES_DEBUG_KDEBUG 4 + +/* + * node records + */ +struct cachefiles_object { + struct fscache_object fscache; /* fscache handle */ + struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ + struct dentry *dentry; /* the file/dir representing this object */ + struct dentry *backer; /* backing file */ + loff_t i_size; /* object size */ + unsigned long flags; +#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ + atomic_t usage; /* object usage count */ + uint8_t type; /* object type */ + uint8_t new; /* T if object new */ + spinlock_t work_lock; + struct rb_node active_node; /* link in active tree (dentry is key) */ +}; + +extern struct kmem_cache *cachefiles_object_jar; + +/* + * Cache files cache definition + */ +struct cachefiles_cache { + struct fscache_cache cache; /* FS-Cache record */ + struct vfsmount *mnt; /* mountpoint holding the cache */ + struct dentry *graveyard; /* directory into which dead objects go */ + struct file *cachefilesd; /* manager daemon handle */ + const struct cred *cache_cred; /* security override for accessing cache */ + struct mutex daemon_mutex; /* command serialisation mutex */ + wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ + struct rb_root active_nodes; /* active nodes (can't be culled) */ + rwlock_t active_lock; /* lock for active_nodes */ + atomic_t gravecounter; /* graveyard uniquifier */ + unsigned frun_percent; /* when to stop culling (% files) */ + unsigned fcull_percent; /* when to start culling (% files) */ + unsigned fstop_percent; /* when to stop allocating (% files) */ + unsigned brun_percent; /* when to stop culling (% blocks) */ + unsigned bcull_percent; /* when to start culling (% blocks) */ + unsigned bstop_percent; /* when to stop allocating (% blocks) */ + unsigned bsize; /* cache's block size */ + unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ + uint64_t frun; /* when to stop culling */ + uint64_t fcull; /* when to start culling */ + uint64_t fstop; /* when to stop allocating */ + sector_t brun; /* when to stop culling */ + sector_t bcull; /* when to start culling */ + sector_t bstop; /* when to stop allocating */ + unsigned long flags; +#define CACHEFILES_READY 0 /* T if cache prepared */ +#define CACHEFILES_DEAD 1 /* T if cache dead */ +#define CACHEFILES_CULLING 2 /* T if cull engaged */ +#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ + char *rootdirname; /* name of cache root directory */ + char *secctx; /* LSM security context */ + char *tag; /* cache binding tag */ +}; + +/* + * backing file read tracking + */ +struct cachefiles_one_read { + wait_queue_t monitor; /* link into monitored waitqueue */ + struct page *back_page; /* backing file page we're waiting for */ + struct page *netfs_page; /* netfs page we're going to fill */ + struct fscache_retrieval *op; /* retrieval op covering this */ + struct list_head op_link; /* link in op's todo list */ +}; + +/* + * backing file write tracking + */ +struct cachefiles_one_write { + struct page *netfs_page; /* netfs page to copy */ + struct cachefiles_object *object; + struct list_head obj_link; /* link in object's lists */ + fscache_rw_complete_t end_io_func; + void *context; +}; + +/* + * auxiliary data xattr buffer + */ +struct cachefiles_xattr { + uint16_t len; + uint8_t type; + uint8_t data[]; +}; + +/* + * note change of state for daemon + */ +static inline void cachefiles_state_changed(struct cachefiles_cache *cache) +{ + set_bit(CACHEFILES_STATE_CHANGED, &cache->flags); + wake_up_all(&cache->daemon_pollwq); +} + +/* + * cf-bind.c + */ +extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); +extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); + +/* + * cf-daemon.c + */ +extern const struct file_operations cachefiles_daemon_fops; + +extern int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr); + +/* + * cf-interface.c + */ +extern const struct fscache_cache_ops cachefiles_cache_ops; + +/* + * cf-key.c + */ +extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); + +/* + * cf-namei.c + */ +extern int cachefiles_delete_object(struct cachefiles_cache *cache, + struct cachefiles_object *object); +extern int cachefiles_walk_to_object(struct cachefiles_object *parent, + struct cachefiles_object *object, + const char *key, + struct cachefiles_xattr *auxdata); +extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *name); + +extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, + char *filename); + +extern int cachefiles_check_in_use(struct cachefiles_cache *cache, + struct dentry *dir, char *filename); + +/* + * cf-proc.c + */ +#ifdef CONFIG_CACHEFILES_HISTOGRAM +extern atomic_t cachefiles_lookup_histogram[HZ]; +extern atomic_t cachefiles_mkdir_histogram[HZ]; +extern atomic_t cachefiles_create_histogram[HZ]; + +extern int __init cachefiles_proc_init(void); +extern void cachefiles_proc_cleanup(void); +static inline +void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +#else +#define cachefiles_proc_init() (0) +#define cachefiles_proc_cleanup() do {} while (0) +#define cachefiles_hist(hist, start_jif) do {} while (0) +#endif + +/* + * cf-rdwr.c + */ +extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, + struct page *, gfp_t); +extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, + struct list_head *, unsigned *, + gfp_t); +extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, + gfp_t); +extern int cachefiles_allocate_pages(struct fscache_retrieval *, + struct list_head *, unsigned *, gfp_t); +extern int cachefiles_write_page(struct fscache_storage *, struct page *); +extern void cachefiles_uncache_page(struct fscache_object *, struct page *); + +/* + * cf-security.c + */ +extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); +extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, + struct dentry *root, + const struct cred **_saved_cred); + +static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, + const struct cred **_saved_cred) +{ + *_saved_cred = override_creds(cache->cache_cred); +} + +static inline void cachefiles_end_secure(struct cachefiles_cache *cache, + const struct cred *saved_cred) +{ + revert_creds(saved_cred); +} + +/* + * cf-xattr.c + */ +extern int cachefiles_check_object_type(struct cachefiles_object *object); +extern int cachefiles_set_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_update_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_check_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct dentry *dentry); + + +/* + * error handling + */ +#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__) + +#define cachefiles_io_error(___cache, FMT, ...) \ +do { \ + kerror("I/O Error: " FMT, ##__VA_ARGS__); \ + fscache_io_error(&(___cache)->cache); \ + set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ +} while (0) + +#define cachefiles_io_error_obj(object, FMT, ...) \ +do { \ + struct cachefiles_cache *___cache; \ + \ + ___cache = container_of((object)->fscache.cache, \ + struct cachefiles_cache, cache); \ + cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ +} while (0) + + +/* + * debug tracing + */ +#define dbgprintk(FMT, ...) \ + printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + +/* make sure we maintain the format strings, even when debugging is disabled */ +static inline void _dbprintk(const char *fmt, ...) + __attribute__((format(printf, 1, 2))); +static inline void _dbprintk(const char *fmt, ...) +{ +} + +#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) + + +#if defined(__KDEBUG) +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_CACHEFILES_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) +#endif + +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c new file mode 100644 index 00000000000..81b8b2b3a67 --- /dev/null +++ b/fs/cachefiles/key.c @@ -0,0 +1,159 @@ +/* Key to pathname encoder + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/slab.h> +#include "internal.h" + +static const char cachefiles_charmap[64] = + "0123456789" /* 0 - 9 */ + "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */ + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */ + "_-" /* 62 - 63 */ + ; + +static const char cachefiles_filecharmap[256] = { + /* we skip space and tab and control chars */ + [33 ... 46] = 1, /* '!' -> '.' */ + /* we skip '/' as it's significant to pathwalk */ + [48 ... 127] = 1, /* '0' -> '~' */ +}; + +/* + * turn the raw key into something cooked + * - the raw key should include the length in the two bytes at the front + * - the key may be up to 514 bytes in length (including the length word) + * - "base64" encode the strange keys, mapping 3 bytes of raw to four of + * cooked + * - need to cut the cooked key into 252 char lengths (189 raw bytes) + */ +char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) +{ + unsigned char csum, ch; + unsigned int acc; + char *key; + int loop, len, max, seg, mark, print; + + _enter(",%d", keylen); + + BUG_ON(keylen < 2 || keylen > 514); + + csum = raw[0] + raw[1]; + print = 1; + for (loop = 2; loop < keylen; loop++) { + ch = raw[loop]; + csum += ch; + print &= cachefiles_filecharmap[ch]; + } + + if (print) { + /* if the path is usable ASCII, then we render it directly */ + max = keylen - 2; + max += 2; /* two base64'd length chars on the front */ + max += 5; /* @checksum/M */ + max += 3 * 2; /* maximum number of segment dividers (".../M") + * is ((514 + 251) / 252) = 3 + */ + max += 1; /* NUL on end */ + } else { + /* calculate the maximum length of the cooked key */ + keylen = (keylen + 2) / 3; + + max = keylen * 4; + max += 5; /* @checksum/M */ + max += 3 * 2; /* maximum number of segment dividers (".../M") + * is ((514 + 188) / 189) = 3 + */ + max += 1; /* NUL on end */ + } + + max += 1; /* 2nd NUL on end */ + + _debug("max: %d", max); + + key = kmalloc(max, GFP_KERNEL); + if (!key) + return NULL; + + len = 0; + + /* build the cooked key */ + sprintf(key, "@%02x%c+", (unsigned) csum, 0); + len = 5; + mark = len - 1; + + if (print) { + acc = *(uint16_t *) raw; + raw += 2; + + key[len + 1] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len] = cachefiles_charmap[acc & 63]; + len += 2; + + seg = 250; + for (loop = keylen; loop > 0; loop--) { + if (seg <= 0) { + key[len++] = '\0'; + mark = len; + key[len++] = '+'; + seg = 252; + } + + key[len++] = *raw++; + ASSERT(len < max); + } + + switch (type) { + case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break; + case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break; + default: type = 'S'; break; + } + } else { + seg = 252; + for (loop = keylen; loop > 0; loop--) { + if (seg <= 0) { + key[len++] = '\0'; + mark = len; + key[len++] = '+'; + seg = 252; + } + + acc = *raw++; + acc |= *raw++ << 8; + acc |= *raw++ << 16; + + _debug("acc: %06x", acc); + + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + + ASSERT(len < max); + } + + switch (type) { + case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break; + case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break; + default: type = 'T'; break; + } + } + + key[mark] = type; + key[len++] = 0; + key[len] = 0; + + _leave(" = %p %d", key, len); + return key; +} diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c new file mode 100644 index 00000000000..4bfa8cf43bf --- /dev/null +++ b/fs/cachefiles/main.c @@ -0,0 +1,106 @@ +/* Network filesystem caching backend to use cache files on a premounted + * filesystem + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/completion.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/statfs.h> +#include <linux/sysctl.h> +#include <linux/miscdevice.h> +#include "internal.h" + +unsigned cachefiles_debug; +module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask"); + +MODULE_DESCRIPTION("Mounted-filesystem based cache"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +struct kmem_cache *cachefiles_object_jar; + +static struct miscdevice cachefiles_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cachefiles", + .fops = &cachefiles_daemon_fops, +}; + +static void cachefiles_object_init_once(void *_object) +{ + struct cachefiles_object *object = _object; + + memset(object, 0, sizeof(*object)); + spin_lock_init(&object->work_lock); +} + +/* + * initialise the fs caching module + */ +static int __init cachefiles_init(void) +{ + int ret; + + ret = misc_register(&cachefiles_dev); + if (ret < 0) + goto error_dev; + + /* create an object jar */ + ret = -ENOMEM; + cachefiles_object_jar = + kmem_cache_create("cachefiles_object_jar", + sizeof(struct cachefiles_object), + 0, + SLAB_HWCACHE_ALIGN, + cachefiles_object_init_once); + if (!cachefiles_object_jar) { + printk(KERN_NOTICE + "CacheFiles: Failed to allocate an object jar\n"); + goto error_object_jar; + } + + ret = cachefiles_proc_init(); + if (ret < 0) + goto error_proc; + + printk(KERN_INFO "CacheFiles: Loaded\n"); + return 0; + +error_proc: + kmem_cache_destroy(cachefiles_object_jar); +error_object_jar: + misc_deregister(&cachefiles_dev); +error_dev: + kerror("failed to register: %d", ret); + return ret; +} + +fs_initcall(cachefiles_init); + +/* + * clean up on module removal + */ +static void __exit cachefiles_exit(void) +{ + printk(KERN_INFO "CacheFiles: Unloading\n"); + + cachefiles_proc_cleanup(); + kmem_cache_destroy(cachefiles_object_jar); + misc_deregister(&cachefiles_dev); +} + +module_exit(cachefiles_exit); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c new file mode 100644 index 00000000000..4ce818ae39e --- /dev/null +++ b/fs/cachefiles/namei.c @@ -0,0 +1,771 @@ +/* CacheFiles path walking and related routines + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/quotaops.h> +#include <linux/xattr.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/security.h> +#include "internal.h" + +static int cachefiles_wait_bit(void *flags) +{ + schedule(); + return 0; +} + +/* + * record the fact that an object is now active + */ +static void cachefiles_mark_object_active(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + struct cachefiles_object *xobject; + struct rb_node **_p, *_parent = NULL; + struct dentry *dentry; + + _enter(",%p", object); + +try_again: + write_lock(&cache->active_lock); + + if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) + BUG(); + + dentry = object->dentry; + _p = &cache->active_nodes.rb_node; + while (*_p) { + _parent = *_p; + xobject = rb_entry(_parent, + struct cachefiles_object, active_node); + + ASSERT(xobject != object); + + if (xobject->dentry > dentry) + _p = &(*_p)->rb_left; + else if (xobject->dentry < dentry) + _p = &(*_p)->rb_right; + else + goto wait_for_old_object; + } + + rb_link_node(&object->active_node, _parent, _p); + rb_insert_color(&object->active_node, &cache->active_nodes); + + write_unlock(&cache->active_lock); + _leave(""); + return; + + /* an old object from a previous incarnation is hogging the slot - we + * need to wait for it to be destroyed */ +wait_for_old_object: + if (xobject->fscache.state < FSCACHE_OBJECT_DYING) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error:" + " Unexpected object collision\n"); + printk(KERN_ERR "xobject: OBJ%x\n", + xobject->fscache.debug_id); + printk(KERN_ERR "xobjstate=%s\n", + fscache_object_states[xobject->fscache.state]); + printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags); + printk(KERN_ERR "xobjevent=%lx [%lx]\n", + xobject->fscache.events, xobject->fscache.event_mask); + printk(KERN_ERR "xops=%u inp=%u exc=%u\n", + xobject->fscache.n_ops, xobject->fscache.n_in_progress, + xobject->fscache.n_exclusive); + printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n", + xobject->fscache.cookie, + xobject->fscache.cookie->parent, + xobject->fscache.cookie->netfs_data, + xobject->fscache.cookie->flags); + printk(KERN_ERR "xparent=%p\n", + xobject->fscache.parent); + printk(KERN_ERR "object: OBJ%x\n", + object->fscache.debug_id); + printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n", + object->fscache.cookie, + object->fscache.cookie->parent, + object->fscache.cookie->netfs_data, + object->fscache.cookie->flags); + printk(KERN_ERR "parent=%p\n", + object->fscache.parent); + BUG(); + } + atomic_inc(&xobject->usage); + write_unlock(&cache->active_lock); + + _debug(">>> wait"); + wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE, + cachefiles_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< waited"); + + cache->cache.ops->put_object(&xobject->fscache); + goto try_again; +} + +/* + * delete an object representation from the cache + * - file backed objects are unlinked + * - directory backed objects are stuffed into the graveyard for userspace to + * delete + * - unlocks the directory mutex + */ +static int cachefiles_bury_object(struct cachefiles_cache *cache, + struct dentry *dir, + struct dentry *rep) +{ + struct dentry *grave, *trap; + char nbuffer[8 + 8 + 1]; + int ret; + + _enter(",'%*.*s','%*.*s'", + dir->d_name.len, dir->d_name.len, dir->d_name.name, + rep->d_name.len, rep->d_name.len, rep->d_name.name); + + /* non-directories can just be unlinked */ + if (!S_ISDIR(rep->d_inode->i_mode)) { + _debug("unlink stale object"); + ret = vfs_unlink(dir->d_inode, rep); + + mutex_unlock(&dir->d_inode->i_mutex); + + if (ret == -EIO) + cachefiles_io_error(cache, "Unlink failed"); + + _leave(" = %d", ret); + return ret; + } + + /* directories have to be moved to the graveyard */ + _debug("move stale object to graveyard"); + mutex_unlock(&dir->d_inode->i_mutex); + +try_again: + /* first step is to make up a grave dentry in the graveyard */ + sprintf(nbuffer, "%08x%08x", + (uint32_t) get_seconds(), + (uint32_t) atomic_inc_return(&cache->gravecounter)); + + /* do the multiway lock magic */ + trap = lock_rename(cache->graveyard, dir); + + /* do some checks before getting the grave dentry */ + if (rep->d_parent != dir) { + /* the entry was probably culled when we dropped the parent dir + * lock */ + unlock_rename(cache->graveyard, dir); + _leave(" = 0 [culled?]"); + return 0; + } + + if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "Graveyard no longer a directory"); + return -EIO; + } + + if (trap == rep) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "May not make directory loop"); + return -EIO; + } + + if (d_mountpoint(rep)) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "Mountpoint in cache"); + return -EIO; + } + + grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); + if (IS_ERR(grave)) { + unlock_rename(cache->graveyard, dir); + + if (PTR_ERR(grave) == -ENOMEM) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + cachefiles_io_error(cache, "Lookup error %ld", + PTR_ERR(grave)); + return -EIO; + } + + if (grave->d_inode) { + unlock_rename(cache->graveyard, dir); + dput(grave); + grave = NULL; + cond_resched(); + goto try_again; + } + + if (d_mountpoint(grave)) { + unlock_rename(cache->graveyard, dir); + dput(grave); + cachefiles_io_error(cache, "Mountpoint in graveyard"); + return -EIO; + } + + /* target should not be an ancestor of source */ + if (trap == grave) { + unlock_rename(cache->graveyard, dir); + dput(grave); + cachefiles_io_error(cache, "May not make directory loop"); + return -EIO; + } + + /* attempt the rename */ + ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave); + if (ret != 0 && ret != -ENOMEM) + cachefiles_io_error(cache, "Rename failed with error %d", ret); + + unlock_rename(cache->graveyard, dir); + dput(grave); + _leave(" = 0"); + return 0; +} + +/* + * delete an object representation from the cache + */ +int cachefiles_delete_object(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + struct dentry *dir; + int ret; + + _enter(",{%p}", object->dentry); + + ASSERT(object->dentry); + ASSERT(object->dentry->d_inode); + ASSERT(object->dentry->d_parent); + + dir = dget_parent(object->dentry); + + mutex_lock(&dir->d_inode->i_mutex); + ret = cachefiles_bury_object(cache, dir, object->dentry); + + dput(dir); + _leave(" = %d", ret); + return ret; +} + +/* + * walk from the parent object to the child object through the backing + * filesystem, creating directories as we go + */ +int cachefiles_walk_to_object(struct cachefiles_object *parent, + struct cachefiles_object *object, + const char *key, + struct cachefiles_xattr *auxdata) +{ + struct cachefiles_cache *cache; + struct dentry *dir, *next = NULL; + unsigned long start; + const char *name; + int ret, nlen; + + _enter("{%p},,%s,", parent->dentry, key); + + cache = container_of(parent->fscache.cache, + struct cachefiles_cache, cache); + + ASSERT(parent->dentry); + ASSERT(parent->dentry->d_inode); + + if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) { + // TODO: convert file to dir + _leave("looking up in none directory"); + return -ENOBUFS; + } + + dir = dget(parent->dentry); + +advance: + /* attempt to transit the first directory component */ + name = key; + nlen = strlen(key); + + /* key ends in a double NUL */ + key = key + nlen + 1; + if (!*key) + key = NULL; + +lookup_again: + /* search the current directory for the element name */ + _debug("lookup '%s'", name); + + mutex_lock(&dir->d_inode->i_mutex); + + start = jiffies; + next = lookup_one_len(name, dir, nlen); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(next)) + goto lookup_error; + + _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative"); + + if (!key) + object->new = !next->d_inode; + + /* if this element of the path doesn't exist, then the lookup phase + * failed, and we can release any readers in the certain knowledge that + * there's nothing for them to actually read */ + if (!next->d_inode) + fscache_object_lookup_negative(&object->fscache); + + /* we need to create the object if it's negative */ + if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { + /* index objects and intervening tree levels must be subdirs */ + if (!next->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto create_error; + + start = jiffies; + ret = vfs_mkdir(dir->d_inode, next, 0); + cachefiles_hist(cachefiles_mkdir_histogram, start); + if (ret < 0) + goto create_error; + + ASSERT(next->d_inode); + + _debug("mkdir -> %p{%p{ino=%lu}}", + next, next->d_inode, next->d_inode->i_ino); + + } else if (!S_ISDIR(next->d_inode->i_mode)) { + kerror("inode %lu is not a directory", + next->d_inode->i_ino); + ret = -ENOBUFS; + goto error; + } + + } else { + /* non-index objects start out life as files */ + if (!next->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto create_error; + + start = jiffies; + ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); + cachefiles_hist(cachefiles_create_histogram, start); + if (ret < 0) + goto create_error; + + ASSERT(next->d_inode); + + _debug("create -> %p{%p{ino=%lu}}", + next, next->d_inode, next->d_inode->i_ino); + + } else if (!S_ISDIR(next->d_inode->i_mode) && + !S_ISREG(next->d_inode->i_mode) + ) { + kerror("inode %lu is not a file or directory", + next->d_inode->i_ino); + ret = -ENOBUFS; + goto error; + } + } + + /* process the next component */ + if (key) { + _debug("advance"); + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + dir = next; + next = NULL; + goto advance; + } + + /* we've found the object we were looking for */ + object->dentry = next; + + /* if we've found that the terminal object exists, then we need to + * check its attributes and delete it if it's out of date */ + if (!object->new) { + _debug("validate '%*.*s'", + next->d_name.len, next->d_name.len, next->d_name.name); + + ret = cachefiles_check_object_xattr(object, auxdata); + if (ret == -ESTALE) { + /* delete the object (the deleter drops the directory + * mutex) */ + object->dentry = NULL; + + ret = cachefiles_bury_object(cache, dir, next); + dput(next); + next = NULL; + + if (ret < 0) + goto delete_error; + + _debug("redo lookup"); + goto lookup_again; + } + } + + /* note that we're now using this object */ + cachefiles_mark_object_active(cache, object); + + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + dir = NULL; + + _debug("=== OBTAINED_OBJECT ==="); + + if (object->new) { + /* attach data to a newly constructed terminal object */ + ret = cachefiles_set_object_xattr(object, auxdata); + if (ret < 0) + goto check_error; + } else { + /* always update the atime on an object we've just looked up + * (this is used to keep track of culling, and atimes are only + * updated by read, write and readdir but not lookup or + * open) */ + touch_atime(cache->mnt, next); + } + + /* open a file interface onto a data file */ + if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (S_ISREG(object->dentry->d_inode->i_mode)) { + const struct address_space_operations *aops; + + ret = -EPERM; + aops = object->dentry->d_inode->i_mapping->a_ops; + if (!aops->bmap) + goto check_error; + + object->backer = object->dentry; + } else { + BUG(); // TODO: open file in data-class subdir + } + } + + object->new = 0; + fscache_obtained_object(&object->fscache); + + _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); + return 0; + +create_error: + _debug("create error %d", ret); + if (ret == -EIO) + cachefiles_io_error(cache, "Create/mkdir failed"); + goto error; + +check_error: + _debug("check error %d", ret); + write_lock(&cache->active_lock); + rb_erase(&object->active_node, &cache->active_nodes); + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + write_unlock(&cache->active_lock); + + dput(object->dentry); + object->dentry = NULL; + goto error_out; + +delete_error: + _debug("delete error %d", ret); + goto error_out2; + +lookup_error: + _debug("lookup error %ld", PTR_ERR(next)); + ret = PTR_ERR(next); + if (ret == -EIO) + cachefiles_io_error(cache, "Lookup failed"); + next = NULL; +error: + mutex_unlock(&dir->d_inode->i_mutex); + dput(next); +error_out2: + dput(dir); +error_out: + if (ret == -ENOSPC) + ret = -ENOBUFS; + + _leave(" = error %d", -ret); + return ret; +} + +/* + * get a subdirectory + */ +struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *dirname) +{ + struct dentry *subdir; + unsigned long start; + int ret; + + _enter(",,%s", dirname); + + /* search the current directory for the element name */ + mutex_lock(&dir->d_inode->i_mutex); + + start = jiffies; + subdir = lookup_one_len(dirname, dir, strlen(dirname)); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(subdir)) { + if (PTR_ERR(subdir) == -ENOMEM) + goto nomem_d_alloc; + goto lookup_error; + } + + _debug("subdir -> %p %s", + subdir, subdir->d_inode ? "positive" : "negative"); + + /* we need to create the subdir if it doesn't exist yet */ + if (!subdir->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto mkdir_error; + + _debug("attempt mkdir"); + + ret = vfs_mkdir(dir->d_inode, subdir, 0700); + if (ret < 0) + goto mkdir_error; + + ASSERT(subdir->d_inode); + + _debug("mkdir -> %p{%p{ino=%lu}}", + subdir, + subdir->d_inode, + subdir->d_inode->i_ino); + } + + mutex_unlock(&dir->d_inode->i_mutex); + + /* we need to make sure the subdir is a directory */ + ASSERT(subdir->d_inode); + + if (!S_ISDIR(subdir->d_inode->i_mode)) { + kerror("%s is not a directory", dirname); + ret = -EIO; + goto check_error; + } + + ret = -EPERM; + if (!subdir->d_inode->i_op || + !subdir->d_inode->i_op->setxattr || + !subdir->d_inode->i_op->getxattr || + !subdir->d_inode->i_op->lookup || + !subdir->d_inode->i_op->mkdir || + !subdir->d_inode->i_op->create || + !subdir->d_inode->i_op->rename || + !subdir->d_inode->i_op->rmdir || + !subdir->d_inode->i_op->unlink) + goto check_error; + + _leave(" = [%lu]", subdir->d_inode->i_ino); + return subdir; + +check_error: + dput(subdir); + _leave(" = %d [check]", ret); + return ERR_PTR(ret); + +mkdir_error: + mutex_unlock(&dir->d_inode->i_mutex); + dput(subdir); + kerror("mkdir %s failed with error %d", dirname, ret); + return ERR_PTR(ret); + +lookup_error: + mutex_unlock(&dir->d_inode->i_mutex); + ret = PTR_ERR(subdir); + kerror("Lookup %s failed with error %d", dirname, ret); + return ERR_PTR(ret); + +nomem_d_alloc: + mutex_unlock(&dir->d_inode->i_mutex); + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * find out if an object is in use or not + * - if finds object and it's not in use: + * - returns a pointer to the object and a reference on it + * - returns with the directory locked + */ +static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, + struct dentry *dir, + char *filename) +{ + struct cachefiles_object *object; + struct rb_node *_n; + struct dentry *victim; + unsigned long start; + int ret; + + //_enter(",%*.*s/,%s", + // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + /* look up the victim */ + mutex_lock_nested(&dir->d_inode->i_mutex, 1); + + start = jiffies; + victim = lookup_one_len(filename, dir, strlen(filename)); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(victim)) + goto lookup_error; + + //_debug("victim -> %p %s", + // victim, victim->d_inode ? "positive" : "negative"); + + /* if the object is no longer there then we probably retired the object + * at the netfs's request whilst the cull was in progress + */ + if (!victim->d_inode) { + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + _leave(" = -ENOENT [absent]"); + return ERR_PTR(-ENOENT); + } + + /* check to see if we're using this object */ + read_lock(&cache->active_lock); + + _n = cache->active_nodes.rb_node; + + while (_n) { + object = rb_entry(_n, struct cachefiles_object, active_node); + + if (object->dentry > victim) + _n = _n->rb_left; + else if (object->dentry < victim) + _n = _n->rb_right; + else + goto object_in_use; + } + + read_unlock(&cache->active_lock); + + //_leave(" = %p", victim); + return victim; + +object_in_use: + read_unlock(&cache->active_lock); + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + //_leave(" = -EBUSY [in use]"); + return ERR_PTR(-EBUSY); + +lookup_error: + mutex_unlock(&dir->d_inode->i_mutex); + ret = PTR_ERR(victim); + if (ret == -ENOENT) { + /* file or dir now absent - probably retired by netfs */ + _leave(" = -ESTALE [absent]"); + return ERR_PTR(-ESTALE); + } + + if (ret == -EIO) { + cachefiles_io_error(cache, "Lookup failed"); + } else if (ret != -ENOMEM) { + kerror("Internal error: %d", ret); + ret = -EIO; + } + + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * cull an object if it's not in use + * - called only by cache manager daemon + */ +int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, + char *filename) +{ + struct dentry *victim; + int ret; + + _enter(",%*.*s/,%s", + dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + victim = cachefiles_check_active(cache, dir, filename); + if (IS_ERR(victim)) + return PTR_ERR(victim); + + _debug("victim -> %p %s", + victim, victim->d_inode ? "positive" : "negative"); + + /* okay... the victim is not being used so we can cull it + * - start by marking it as stale + */ + _debug("victim is cullable"); + + ret = cachefiles_remove_object_xattr(cache, victim); + if (ret < 0) + goto error_unlock; + + /* actually remove the victim (drops the dir mutex) */ + _debug("bury"); + + ret = cachefiles_bury_object(cache, dir, victim); + if (ret < 0) + goto error; + + dput(victim); + _leave(" = 0"); + return 0; + +error_unlock: + mutex_unlock(&dir->d_inode->i_mutex); +error: + dput(victim); + if (ret == -ENOENT) { + /* file or dir now absent - probably retired by netfs */ + _leave(" = -ESTALE [absent]"); + return -ESTALE; + } + + if (ret != -ENOMEM) { + kerror("Internal error: %d", ret); + ret = -EIO; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * find out if an object is in use or not + * - called only by cache manager daemon + * - returns -EBUSY or 0 to indicate whether an object is in use or not + */ +int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, + char *filename) +{ + struct dentry *victim; + + //_enter(",%*.*s/,%s", + // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + victim = cachefiles_check_active(cache, dir, filename); + if (IS_ERR(victim)) + return PTR_ERR(victim); + + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + //_leave(" = 0"); + return 0; +} diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c new file mode 100644 index 00000000000..eccd3394119 --- /dev/null +++ b/fs/cachefiles/proc.c @@ -0,0 +1,134 @@ +/* CacheFiles statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "internal.h" + +atomic_t cachefiles_lookup_histogram[HZ]; +atomic_t cachefiles_mkdir_histogram[HZ]; +atomic_t cachefiles_create_histogram[HZ]; + +/* + * display the latency histogram + */ +static int cachefiles_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned x, y, z, t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + x = atomic_read(&cachefiles_lookup_histogram[index]); + y = atomic_read(&cachefiles_mkdir_histogram[index]); + z = atomic_read(&cachefiles_create_histogram[index]); + if (x == 0 && y == 0 && z == 0) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? + NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void cachefiles_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations cachefiles_histogram_ops = { + .start = cachefiles_histogram_start, + .stop = cachefiles_histogram_stop, + .next = cachefiles_histogram_next, + .show = cachefiles_histogram_show, +}; + +/* + * open "/proc/fs/cachefiles/XXX" which provide statistics summaries + */ +static int cachefiles_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cachefiles_histogram_ops); +} + +static const struct file_operations cachefiles_histogram_fops = { + .owner = THIS_MODULE, + .open = cachefiles_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * initialise the /proc/fs/cachefiles/ directory + */ +int __init cachefiles_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/cachefiles", NULL)) + goto error_dir; + + if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL, + &cachefiles_histogram_fops)) + goto error_histogram; + + _leave(" = 0"); + return 0; + +error_histogram: + remove_proc_entry("fs/cachefiles", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/cachefiles/ directory + */ +void cachefiles_proc_cleanup(void) +{ + remove_proc_entry("fs/cachefiles/histogram", NULL); + remove_proc_entry("fs/cachefiles", NULL); +} diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c new file mode 100644 index 00000000000..a69787e7dd9 --- /dev/null +++ b/fs/cachefiles/rdwr.c @@ -0,0 +1,879 @@ +/* Storage object read/write + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/mount.h> +#include <linux/file.h> +#include "internal.h" + +/* + * detect wake up events generated by the unlocking of pages in which we're + * interested + * - we use this to detect read completion of backing pages + * - the caller holds the waitqueue lock + */ +static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, + int sync, void *_key) +{ + struct cachefiles_one_read *monitor = + container_of(wait, struct cachefiles_one_read, monitor); + struct cachefiles_object *object; + struct wait_bit_key *key = _key; + struct page *page = wait->private; + + ASSERT(key); + + _enter("{%lu},%u,%d,{%p,%u}", + monitor->netfs_page->index, mode, sync, + key->flags, key->bit_nr); + + if (key->flags != &page->flags || + key->bit_nr != PG_locked) + return 0; + + _debug("--- monitor %p %lx ---", page, page->flags); + + if (!PageUptodate(page) && !PageError(page)) + dump_stack(); + + /* remove from the waitqueue */ + list_del(&wait->task_list); + + /* move onto the action list and queue for FS-Cache thread pool */ + ASSERT(monitor->op); + + object = container_of(monitor->op->op.object, + struct cachefiles_object, fscache); + + spin_lock(&object->work_lock); + list_add_tail(&monitor->op_link, &monitor->op->to_do); + spin_unlock(&object->work_lock); + + fscache_enqueue_retrieval(monitor->op); + return 0; +} + +/* + * copy data from backing pages to netfs pages to complete a read operation + * - driven by FS-Cache's thread pool + */ +static void cachefiles_read_copier(struct fscache_operation *_op) +{ + struct cachefiles_one_read *monitor; + struct cachefiles_object *object; + struct fscache_retrieval *op; + struct pagevec pagevec; + int error, max; + + op = container_of(_op, struct fscache_retrieval, op); + object = container_of(op->op.object, + struct cachefiles_object, fscache); + + _enter("{ino=%lu}", object->backer->d_inode->i_ino); + + pagevec_init(&pagevec, 0); + + max = 8; + spin_lock_irq(&object->work_lock); + + while (!list_empty(&op->to_do)) { + monitor = list_entry(op->to_do.next, + struct cachefiles_one_read, op_link); + list_del(&monitor->op_link); + + spin_unlock_irq(&object->work_lock); + + _debug("- copy {%lu}", monitor->back_page->index); + + error = -EIO; + if (PageUptodate(monitor->back_page)) { + copy_highpage(monitor->netfs_page, monitor->back_page); + + pagevec_add(&pagevec, monitor->netfs_page); + fscache_mark_pages_cached(monitor->op, &pagevec); + error = 0; + } + + if (error) + cachefiles_io_error_obj( + object, + "Readpage failed on backing file %lx", + (unsigned long) monitor->back_page->flags); + + page_cache_release(monitor->back_page); + + fscache_end_io(op, monitor->netfs_page, error); + page_cache_release(monitor->netfs_page); + fscache_put_retrieval(op); + kfree(monitor); + + /* let the thread pool have some air occasionally */ + max--; + if (max < 0 || need_resched()) { + if (!list_empty(&op->to_do)) + fscache_enqueue_retrieval(op); + _leave(" [maxed out]"); + return; + } + + spin_lock_irq(&object->work_lock); + } + + spin_unlock_irq(&object->work_lock); + _leave(""); +} + +/* + * read the corresponding page to the given set from the backing file + * - an uncertain page is simply discarded, to be tried again another time + */ +static int cachefiles_read_backing_file_one(struct cachefiles_object *object, + struct fscache_retrieval *op, + struct page *netpage, + struct pagevec *pagevec) +{ + struct cachefiles_one_read *monitor; + struct address_space *bmapping; + struct page *newpage, *backpage; + int ret; + + _enter(""); + + pagevec_reinit(pagevec); + + _debug("read back %p{%lu,%d}", + netpage, netpage->index, page_count(netpage)); + + monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); + if (!monitor) + goto nomem; + + monitor->netfs_page = netpage; + monitor->op = fscache_get_retrieval(op); + + init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter); + + /* attempt to get hold of the backing page */ + bmapping = object->backer->d_inode->i_mapping; + newpage = NULL; + + for (;;) { + backpage = find_get_page(bmapping, netpage->index); + if (backpage) + goto backing_page_already_present; + + if (!newpage) { + newpage = page_cache_alloc_cold(bmapping); + if (!newpage) + goto nomem_monitor; + } + + ret = add_to_page_cache(newpage, bmapping, + netpage->index, GFP_KERNEL); + if (ret == 0) + goto installed_new_backing_page; + if (ret != -EEXIST) + goto nomem_page; + } + + /* we've installed a new backing page, so now we need to add it + * to the LRU list and start it reading */ +installed_new_backing_page: + _debug("- new %p", newpage); + + backpage = newpage; + newpage = NULL; + + page_cache_get(backpage); + pagevec_add(pagevec, backpage); + __pagevec_lru_add_file(pagevec); + +read_backing_page: + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto read_error; + + /* set the monitor to transfer the data across */ +monitor_backing_page: + _debug("- monitor add"); + + /* install the monitor */ + page_cache_get(monitor->netfs_page); + page_cache_get(backpage); + monitor->back_page = backpage; + monitor->monitor.private = backpage; + add_page_wait_queue(backpage, &monitor->monitor); + monitor = NULL; + + /* but the page may have been read before the monitor was installed, so + * the monitor may miss the event - so we have to ensure that we do get + * one in such a case */ + if (trylock_page(backpage)) { + _debug("jumpstart %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + goto success; + + /* if the backing page is already present, it can be in one of + * three states: read in progress, read failed or read okay */ +backing_page_already_present: + _debug("- present"); + + if (newpage) { + page_cache_release(newpage); + newpage = NULL; + } + + if (PageError(backpage)) + goto io_error; + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate; + + if (!trylock_page(backpage)) + goto monitor_backing_page; + _debug("read %p {%lx}", backpage, backpage->flags); + goto read_backing_page; + + /* the backing page is already up to date, attach the netfs + * page to the pagecache and LRU and copy the data across */ +backing_page_already_uptodate: + _debug("- uptodate"); + + pagevec_add(pagevec, netpage); + fscache_mark_pages_cached(op, pagevec); + + copy_highpage(netpage, backpage); + fscache_end_io(op, netpage, 0); + +success: + _debug("success"); + ret = 0; + +out: + if (backpage) + page_cache_release(backpage); + if (monitor) { + fscache_put_retrieval(monitor->op); + kfree(monitor); + } + _leave(" = %d", ret); + return ret; + +read_error: + _debug("read error %d", ret); + if (ret == -ENOMEM) + goto out; +io_error: + cachefiles_io_error_obj(object, "Page read error on backing file"); + ret = -ENOBUFS; + goto out; + +nomem_page: + page_cache_release(newpage); +nomem_monitor: + fscache_put_retrieval(monitor->op); + kfree(monitor); +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if no buffers can be made available + * - returns -ENOBUFS if page is beyond EOF + * - if the page is backed by a block in the cache: + * - a read will be started which will call the callback on completion + * - 0 will be returned + * - else if the page is unbacked: + * - the metadata will be retained + * - -ENODATA will be returned + */ +int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + struct inode *inode; + sector_t block0, block; + unsigned shift; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("{%p},{%lx},,,", object, page->index); + + if (!object->backer) + return -ENOBUFS; + + inode = object->backer->d_inode; + ASSERT(S_ISREG(inode->i_mode)); + ASSERT(inode->i_mapping->a_ops->bmap); + ASSERT(inode->i_mapping->a_ops->readpages); + + /* calculate the shift required to use bmap */ + if (inode->i_sb->s_blocksize > PAGE_SIZE) + return -ENOBUFS; + + shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; + + op->op.flags = FSCACHE_OP_FAST; + op->op.processor = cachefiles_read_copier; + + pagevec_init(&pagevec, 0); + + /* we assume the absence or presence of the first block is a good + * enough indication for the page as a whole + * - TODO: don't use bmap() for this as it is _not_ actually good + * enough for this as it doesn't indicate errors, but it's all we've + * got for the moment + */ + block0 = page->index; + block0 <<= shift; + + block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0); + _debug("%llx -> %llx", + (unsigned long long) block0, + (unsigned long long) block); + + if (block) { + /* submit the apparently valid page to the backing fs to be + * read from disk */ + ret = cachefiles_read_backing_file_one(object, op, page, + &pagevec); + } else if (cachefiles_has_space(cache, 0, 1) == 0) { + /* there's space in the cache we can use */ + pagevec_add(&pagevec, page); + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * read the corresponding pages to the given set from the backing file + * - any uncertain pages are simply discarded, to be tried again another time + */ +static int cachefiles_read_backing_file(struct cachefiles_object *object, + struct fscache_retrieval *op, + struct list_head *list, + struct pagevec *mark_pvec) +{ + struct cachefiles_one_read *monitor = NULL; + struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct pagevec lru_pvec; + struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; + int ret = 0; + + _enter(""); + + pagevec_init(&lru_pvec, 0); + + list_for_each_entry_safe(netpage, _n, list, lru) { + list_del(&netpage->lru); + + _debug("read back %p{%lu,%d}", + netpage, netpage->index, page_count(netpage)); + + if (!monitor) { + monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); + if (!monitor) + goto nomem; + + monitor->op = fscache_get_retrieval(op); + init_waitqueue_func_entry(&monitor->monitor, + cachefiles_read_waiter); + } + + for (;;) { + backpage = find_get_page(bmapping, netpage->index); + if (backpage) + goto backing_page_already_present; + + if (!newpage) { + newpage = page_cache_alloc_cold(bmapping); + if (!newpage) + goto nomem; + } + + ret = add_to_page_cache(newpage, bmapping, + netpage->index, GFP_KERNEL); + if (ret == 0) + goto installed_new_backing_page; + if (ret != -EEXIST) + goto nomem; + } + + /* we've installed a new backing page, so now we need to add it + * to the LRU list and start it reading */ + installed_new_backing_page: + _debug("- new %p", newpage); + + backpage = newpage; + newpage = NULL; + + page_cache_get(backpage); + if (!pagevec_add(&lru_pvec, backpage)) + __pagevec_lru_add_file(&lru_pvec); + + reread_backing_page: + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto read_error; + + /* add the netfs page to the pagecache and LRU, and set the + * monitor to transfer the data across */ + monitor_backing_page: + _debug("- monitor add"); + + ret = add_to_page_cache(netpage, op->mapping, netpage->index, + GFP_KERNEL); + if (ret < 0) { + if (ret == -EEXIST) { + page_cache_release(netpage); + continue; + } + goto nomem; + } + + page_cache_get(netpage); + if (!pagevec_add(&lru_pvec, netpage)) + __pagevec_lru_add_file(&lru_pvec); + + /* install a monitor */ + page_cache_get(netpage); + monitor->netfs_page = netpage; + + page_cache_get(backpage); + monitor->back_page = backpage; + monitor->monitor.private = backpage; + add_page_wait_queue(backpage, &monitor->monitor); + monitor = NULL; + + /* but the page may have been read before the monitor was + * installed, so the monitor may miss the event - so we have to + * ensure that we do get one in such a case */ + if (trylock_page(backpage)) { + _debug("2unlock %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + + page_cache_release(backpage); + backpage = NULL; + + page_cache_release(netpage); + netpage = NULL; + continue; + + /* if the backing page is already present, it can be in one of + * three states: read in progress, read failed or read okay */ + backing_page_already_present: + _debug("- present %p", backpage); + + if (PageError(backpage)) + goto io_error; + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate; + + _debug("- not ready %p{%lx}", backpage, backpage->flags); + + if (!trylock_page(backpage)) + goto monitor_backing_page; + + if (PageError(backpage)) { + _debug("error %lx", backpage->flags); + unlock_page(backpage); + goto io_error; + } + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate_unlock; + + /* we've locked a page that's neither up to date nor erroneous, + * so we need to attempt to read it again */ + goto reread_backing_page; + + /* the backing page is already up to date, attach the netfs + * page to the pagecache and LRU and copy the data across */ + backing_page_already_uptodate_unlock: + _debug("uptodate %lx", backpage->flags); + unlock_page(backpage); + backing_page_already_uptodate: + _debug("- uptodate"); + + ret = add_to_page_cache(netpage, op->mapping, netpage->index, + GFP_KERNEL); + if (ret < 0) { + if (ret == -EEXIST) { + page_cache_release(netpage); + continue; + } + goto nomem; + } + + copy_highpage(netpage, backpage); + + page_cache_release(backpage); + backpage = NULL; + + if (!pagevec_add(mark_pvec, netpage)) + fscache_mark_pages_cached(op, mark_pvec); + + page_cache_get(netpage); + if (!pagevec_add(&lru_pvec, netpage)) + __pagevec_lru_add_file(&lru_pvec); + + fscache_end_io(op, netpage, 0); + page_cache_release(netpage); + netpage = NULL; + continue; + } + + netpage = NULL; + + _debug("out"); + +out: + /* tidy up */ + pagevec_lru_add_file(&lru_pvec); + + if (newpage) + page_cache_release(newpage); + if (netpage) + page_cache_release(netpage); + if (backpage) + page_cache_release(backpage); + if (monitor) { + fscache_put_retrieval(op); + kfree(monitor); + } + + list_for_each_entry_safe(netpage, _n, list, lru) { + list_del(&netpage->lru); + page_cache_release(netpage); + } + + _leave(" = %d", ret); + return ret; + +nomem: + _debug("nomem"); + ret = -ENOMEM; + goto out; + +read_error: + _debug("read error %d", ret); + if (ret == -ENOMEM) + goto out; +io_error: + cachefiles_io_error_obj(object, "Page read error on backing file"); + ret = -ENOBUFS; + goto out; +} + +/* + * read a list of pages from the cache or allocate blocks in which to store + * them + */ +int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct list_head backpages; + struct pagevec pagevec; + struct inode *inode; + struct page *page, *_n; + unsigned shift, nrbackpages; + int ret, ret2, space; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("{OBJ%x,%d},,%d,,", + object->fscache.debug_id, atomic_read(&op->op.usage), + *nr_pages); + + if (!object->backer) + return -ENOBUFS; + + space = 1; + if (cachefiles_has_space(cache, 0, *nr_pages) < 0) + space = 0; + + inode = object->backer->d_inode; + ASSERT(S_ISREG(inode->i_mode)); + ASSERT(inode->i_mapping->a_ops->bmap); + ASSERT(inode->i_mapping->a_ops->readpages); + + /* calculate the shift required to use bmap */ + if (inode->i_sb->s_blocksize > PAGE_SIZE) + return -ENOBUFS; + + shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; + + pagevec_init(&pagevec, 0); + + op->op.flags = FSCACHE_OP_FAST; + op->op.processor = cachefiles_read_copier; + + INIT_LIST_HEAD(&backpages); + nrbackpages = 0; + + ret = space ? -ENODATA : -ENOBUFS; + list_for_each_entry_safe(page, _n, pages, lru) { + sector_t block0, block; + + /* we assume the absence or presence of the first block is a + * good enough indication for the page as a whole + * - TODO: don't use bmap() for this as it is _not_ actually + * good enough for this as it doesn't indicate errors, but + * it's all we've got for the moment + */ + block0 = page->index; + block0 <<= shift; + + block = inode->i_mapping->a_ops->bmap(inode->i_mapping, + block0); + _debug("%llx -> %llx", + (unsigned long long) block0, + (unsigned long long) block); + + if (block) { + /* we have data - add it to the list to give to the + * backing fs */ + list_move(&page->lru, &backpages); + (*nr_pages)--; + nrbackpages++; + } else if (space && pagevec_add(&pagevec, page) == 0) { + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + + if (list_empty(pages)) + ret = 0; + + /* submit the apparently valid pages to the backing fs to be read from + * disk */ + if (nrbackpages > 0) { + ret2 = cachefiles_read_backing_file(object, op, &backpages, + &pagevec); + if (ret2 == -ENOMEM || ret2 == -EINTR) + ret = ret2; + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + + _leave(" = %d [nr=%u%s]", + ret, *nr_pages, list_empty(pages) ? " empty" : ""); + return ret; +} + +/* + * allocate a block in the cache in which to store a page + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if no buffers can be made available + * - returns -ENOBUFS if page is beyond EOF + * - otherwise: + * - the metadata will be retained + * - 0 will be returned + */ +int cachefiles_allocate_page(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,{%lx},", object, page->index); + + ret = cachefiles_has_space(cache, 0, 1); + if (ret == 0) { + pagevec_init(&pagevec, 0); + pagevec_add(&pagevec, page); + fscache_mark_pages_cached(op, &pagevec); + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * allocate blocks in the cache in which to store a set of pages + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if some buffers couldn't be made available + * - returns -ENOBUFS if some pages are beyond EOF + * - otherwise: + * - -ENODATA will be returned + * - metadata will be retained for any page marked + */ +int cachefiles_allocate_pages(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + struct page *page; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,,,%d,", object, *nr_pages); + + ret = cachefiles_has_space(cache, 0, *nr_pages); + if (ret == 0) { + pagevec_init(&pagevec, 0); + + list_for_each_entry(page, pages, lru) { + if (pagevec_add(&pagevec, page) == 0) + fscache_mark_pages_cached(op, &pagevec); + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * request a page be stored in the cache + * - cache withdrawal is prevented by the caller + * - this request may be ignored if there's no cache block available, in which + * case -ENOBUFS will be returned + * - if the op is in progress, 0 will be returned + */ +int cachefiles_write_page(struct fscache_storage *op, struct page *page) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + mm_segment_t old_fs; + struct file *file; + loff_t pos; + void *data; + int ret; + + ASSERT(op != NULL); + ASSERT(page != NULL); + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + + _enter("%p,%p{%lx},,,", object, page, page->index); + + if (!object->backer) { + _leave(" = -ENOBUFS"); + return -ENOBUFS; + } + + ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + /* write the page to the backing filesystem and let it store it in its + * own time */ + dget(object->backer); + mntget(cache->mnt); + file = dentry_open(object->backer, cache->mnt, O_RDWR, + cache->cache_cred); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + } else { + ret = -EIO; + if (file->f_op->write) { + pos = (loff_t) page->index << PAGE_SHIFT; + data = kmap(page); + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = file->f_op->write( + file, (const void __user *) data, PAGE_SIZE, + &pos); + set_fs(old_fs); + kunmap(page); + if (ret != PAGE_SIZE) + ret = -EIO; + } + fput(file); + } + + if (ret < 0) { + if (ret == -EIO) + cachefiles_io_error_obj( + object, "Write page to backing file failed"); + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * detach a backing block from a page + * - cache withdrawal is prevented by the caller + */ +void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,{%lu}", object, page->index); + + spin_unlock(&object->fscache.cookie->lock); +} diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c new file mode 100644 index 00000000000..b5808cdb223 --- /dev/null +++ b/fs/cachefiles/security.c @@ -0,0 +1,116 @@ +/* CacheFiles security management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/fs.h> +#include <linux/cred.h> +#include "internal.h" + +/* + * determine the security context within which we access the cache from within + * the kernel + */ +int cachefiles_get_security_ID(struct cachefiles_cache *cache) +{ + struct cred *new; + int ret; + + _enter("{%s}", cache->secctx); + + new = prepare_kernel_cred(current); + if (!new) { + ret = -ENOMEM; + goto error; + } + + if (cache->secctx) { + ret = set_security_override_from_ctx(new, cache->secctx); + if (ret < 0) { + put_cred(new); + printk(KERN_ERR "CacheFiles:" + " Security denies permission to nominate" + " security context: error %d\n", + ret); + goto error; + } + } + + cache->cache_cred = new; + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * see if mkdir and create can be performed in the root directory + */ +static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, + struct dentry *root) +{ + int ret; + + ret = security_inode_mkdir(root->d_inode, root, 0); + if (ret < 0) { + printk(KERN_ERR "CacheFiles:" + " Security denies permission to make dirs: error %d", + ret); + return ret; + } + + ret = security_inode_create(root->d_inode, root, 0); + if (ret < 0) + printk(KERN_ERR "CacheFiles:" + " Security denies permission to create files: error %d", + ret); + + return ret; +} + +/* + * check the security details of the on-disk cache + * - must be called with security override in force + */ +int cachefiles_determine_cache_security(struct cachefiles_cache *cache, + struct dentry *root, + const struct cred **_saved_cred) +{ + struct cred *new; + int ret; + + _enter(""); + + /* duplicate the cache creds for COW (the override is currently in + * force, so we can use prepare_creds() to do this) */ + new = prepare_creds(); + if (!new) + return -ENOMEM; + + cachefiles_end_secure(cache, *_saved_cred); + + /* use the cache root dir's security context as the basis with + * which create files */ + ret = set_create_files_as(new, root->d_inode); + if (ret < 0) { + _leave(" = %d [cfa]", ret); + return ret; + } + + put_cred(cache->cache_cred); + cache->cache_cred = new; + + cachefiles_begin_secure(cache, _saved_cred); + ret = cachefiles_check_cache_dir(cache, root); + + if (ret == -EOPNOTSUPP) + ret = 0; + _leave(" = %d", ret); + return ret; +} diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c new file mode 100644 index 00000000000..f3e7a0bf068 --- /dev/null +++ b/fs/cachefiles/xattr.c @@ -0,0 +1,291 @@ +/* CacheFiles extended attribute management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/quotaops.h> +#include <linux/xattr.h> +#include "internal.h" + +static const char cachefiles_xattr_cache[] = + XATTR_USER_PREFIX "CacheFiles.cache"; + +/* + * check the type label on an object + * - done using xattrs + */ +int cachefiles_check_object_type(struct cachefiles_object *object) +{ + struct dentry *dentry = object->dentry; + char type[3], xtype[3]; + int ret; + + ASSERT(dentry); + ASSERT(dentry->d_inode); + + if (!object->fscache.cookie) + strcpy(type, "C3"); + else + snprintf(type, 3, "%02x", object->fscache.cookie->def->type); + + _enter("%p{%s}", object, type); + + /* attempt to install a type label directly */ + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, + XATTR_CREATE); + if (ret == 0) { + _debug("SET"); /* we succeeded */ + goto error; + } + + if (ret != -EEXIST) { + kerror("Can't set xattr on %*.*s [%lu] (err %d)", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + -ret); + goto error; + } + + /* read the current type label */ + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); + if (ret < 0) { + if (ret == -ERANGE) + goto bad_type_length; + + kerror("Can't read xattr on %*.*s [%lu] (err %d)", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + -ret); + goto error; + } + + /* check the type is what we're expecting */ + if (ret != 2) + goto bad_type_length; + + if (xtype[0] != type[0] || xtype[1] != type[1]) + goto bad_type; + + ret = 0; + +error: + _leave(" = %d", ret); + return ret; + +bad_type_length: + kerror("Cache object %lu type xattr length incorrect", + dentry->d_inode->i_ino); + ret = -EIO; + goto error; + +bad_type: + xtype[2] = 0; + kerror("Cache object %*.*s [%lu] type %s not %s", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + xtype, type); + ret = -EIO; + goto error; +} + +/* + * set the state xattr on a cache file + */ +int cachefiles_set_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct dentry *dentry = object->dentry; + int ret; + + ASSERT(object->fscache.cookie); + ASSERT(dentry); + + _enter("%p,#%d", object, auxdata->len); + + /* attempt to install the cache metadata directly */ + _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_CREATE); + if (ret < 0 && ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to set xattr with error %d", ret); + + _leave(" = %d", ret); + return ret; +} + +/* + * update the state xattr on a cache file + */ +int cachefiles_update_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct dentry *dentry = object->dentry; + int ret; + + ASSERT(object->fscache.cookie); + ASSERT(dentry); + + _enter("%p,#%d", object, auxdata->len); + + /* attempt to install the cache metadata directly */ + _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_REPLACE); + if (ret < 0 && ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to update xattr with error %d", ret); + + _leave(" = %d", ret); + return ret; +} + +/* + * check the state xattr on a cache file + * - return -ESTALE if the object should be deleted + */ +int cachefiles_check_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct cachefiles_xattr *auxbuf; + struct dentry *dentry = object->dentry; + int ret; + + _enter("%p,#%d", object, auxdata->len); + + ASSERT(dentry); + ASSERT(dentry->d_inode); + + auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); + if (!auxbuf) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* read the current type label */ + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, + &auxbuf->type, 512 + 1); + if (ret < 0) { + if (ret == -ENODATA) + goto stale; /* no attribute - power went off + * mid-cull? */ + + if (ret == -ERANGE) + goto bad_type_length; + + cachefiles_io_error_obj(object, + "Can't read xattr on %lu (err %d)", + dentry->d_inode->i_ino, -ret); + goto error; + } + + /* check the on-disk object */ + if (ret < 1) + goto bad_type_length; + + if (auxbuf->type != auxdata->type) + goto stale; + + auxbuf->len = ret; + + /* consult the netfs */ + if (object->fscache.cookie->def->check_aux) { + enum fscache_checkaux result; + unsigned int dlen; + + dlen = auxbuf->len - 1; + + _debug("checkaux %s #%u", + object->fscache.cookie->def->name, dlen); + + result = fscache_check_aux(&object->fscache, + &auxbuf->data, dlen); + + switch (result) { + /* entry okay as is */ + case FSCACHE_CHECKAUX_OKAY: + goto okay; + + /* entry requires update */ + case FSCACHE_CHECKAUX_NEEDS_UPDATE: + break; + + /* entry requires deletion */ + case FSCACHE_CHECKAUX_OBSOLETE: + goto stale; + + default: + BUG(); + } + + /* update the current label */ + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_REPLACE); + if (ret < 0) { + cachefiles_io_error_obj(object, + "Can't update xattr on %lu" + " (error %d)", + dentry->d_inode->i_ino, -ret); + goto error; + } + } + +okay: + ret = 0; + +error: + kfree(auxbuf); + _leave(" = %d", ret); + return ret; + +bad_type_length: + kerror("Cache object %lu xattr length incorrect", + dentry->d_inode->i_ino); + ret = -EIO; + goto error; + +stale: + ret = -ESTALE; + goto error; +} + +/* + * remove the object's xattr to mark it stale + */ +int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct dentry *dentry) +{ + int ret; + + ret = vfs_removexattr(dentry, cachefiles_xattr_cache); + if (ret < 0) { + if (ret == -ENOENT || ret == -ENODATA) + ret = 0; + else if (ret != -ENOMEM) + cachefiles_io_error(cache, + "Can't remove xattr from %lu" + " (error %d)", + dentry->d_inode->i_ino, -ret); + } + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 2f35cccfcd8..54dce78fbb7 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -254,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, return -ENOMEM; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (oplockEnabled) oplock = REQ_OPLOCK; @@ -479,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, rc = -ENOMEM; else if (pTcon->unix_ext) { struct cifs_unix_set_info_args args = { - .mode = mode & ~current->fs->umask, + .mode = mode & ~current_umask(), .ctime = NO_CHANGE_64, .atime = NO_CHANGE_64, .mtime = NO_CHANGE_64, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a8797cc6080..f121a80fdd6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1125,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) goto mkdir_out; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode, NULL /* netfid */, pInfo, &oplock, full_path, cifs_sb->local_nls, @@ -1204,7 +1204,7 @@ mkdir_get_info: if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) direntry->d_inode->i_nlink = 2; - mode &= ~current->fs->umask; + mode &= ~current_umask(); /* must turn on setgid bit if parent dir has it */ if (inode->i_mode & S_ISGID) mode |= S_ISGID; diff --git a/fs/compat.c b/fs/compat.c index 55efdfebdf5..1c859dae758 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -51,6 +51,7 @@ #include <linux/poll.h> #include <linux/mm.h> #include <linux/eventpoll.h> +#include <linux/fs_struct.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -1195,16 +1196,12 @@ out: return ret; } -asmlinkage ssize_t -compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) +static size_t compat_readv(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) { - struct file *file; ssize_t ret = -EBADF; - file = fget(fd); - if (!file) - return -EBADF; - if (!(file->f_mode & FMODE_READ)) goto out; @@ -1212,25 +1209,56 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) goto out; - ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); + ret = compat_do_readv_writev(READ, file, vec, vlen, pos); out: if (ret > 0) add_rchar(current, ret); inc_syscr(current); - fput(file); return ret; } asmlinkage ssize_t -compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) +compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen) { struct file *file; - ssize_t ret = -EBADF; + int fput_needed; + ssize_t ret; - file = fget(fd); + file = fget_light(fd, &fput_needed); if (!file) return -EBADF; + ret = compat_readv(file, vec, vlen, &file->f_pos); + fput_light(file, fput_needed); + return ret; +} + +asmlinkage ssize_t +compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + int fput_needed; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = compat_readv(file, vec, vlen, &pos); + fput_light(file, fput_needed); + return ret; +} + +static size_t compat_writev(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + if (!(file->f_mode & FMODE_WRITE)) goto out; @@ -1238,13 +1266,47 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) goto out; - ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); out: if (ret > 0) add_wchar(current, ret); inc_syscw(current); - fput(file); + return ret; +} + +asmlinkage ssize_t +compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen) +{ + struct file *file; + int fput_needed; + ssize_t ret; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = compat_writev(file, vec, vlen, &file->f_pos); + fput_light(file, fput_needed); + return ret; +} + +asmlinkage ssize_t +compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + int fput_needed; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = compat_writev(file, vec, vlen, &pos); + fput_light(file, fput_needed); return ret; } @@ -1441,12 +1503,15 @@ int compat_do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1488,6 +1553,9 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1506,6 +1574,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index ff786687e93..3e87ce443ea 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -23,7 +23,7 @@ #include <linux/if.h> #include <linux/if_bridge.h> #include <linux/slab.h> -#include <linux/raid/md.h> +#include <linux/raid/md_u.h> #include <linux/kd.h> #include <linux/route.h> #include <linux/in6.h> diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index a07338d2d14..dd3634e4c96 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -318,6 +318,7 @@ out: static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = CRAMFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; @@ -326,6 +327,8 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_files = CRAMFS_SB(sb)->files; buf->f_ffree = 0; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = CRAMFS_MAXPATHLEN; return 0; } @@ -459,11 +462,14 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s static int cramfs_readpage(struct file *file, struct page * page) { struct inode *inode = page->mapping->host; - u32 maxblock, bytes_filled; + u32 maxblock; + int bytes_filled; void *pgdata; maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; bytes_filled = 0; + pgdata = kmap(page); + if (page->index < maxblock) { struct super_block *sb = inode->i_sb; u32 blkptr_offset = OFFSET(inode) + page->index*4; @@ -472,30 +478,43 @@ static int cramfs_readpage(struct file *file, struct page * page) start_offset = OFFSET(inode) + maxblock*4; mutex_lock(&read_mutex); if (page->index) - start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, 4); - compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - start_offset); + start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, + 4); + compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - + start_offset); mutex_unlock(&read_mutex); - pgdata = kmap(page); + if (compr_len == 0) ; /* hole */ - else if (compr_len > (PAGE_CACHE_SIZE << 1)) - printk(KERN_ERR "cramfs: bad compressed blocksize %u\n", compr_len); - else { + else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) { + pr_err("cramfs: bad compressed blocksize %u\n", + compr_len); + goto err; + } else { mutex_lock(&read_mutex); bytes_filled = cramfs_uncompress_block(pgdata, PAGE_CACHE_SIZE, cramfs_read(sb, start_offset, compr_len), compr_len); mutex_unlock(&read_mutex); + if (unlikely(bytes_filled < 0)) + goto err; } - } else - pgdata = kmap(page); + } + memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); - kunmap(page); flush_dcache_page(page); + kunmap(page); SetPageUptodate(page); unlock_page(page); return 0; + +err: + kunmap(page); + ClearPageUptodate(page); + SetPageError(page); + unlock_page(page); + return 0; } static const struct address_space_operations cramfs_aops = { diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c index fc3ccb74626..023329800d2 100644 --- a/fs/cramfs/uncompress.c +++ b/fs/cramfs/uncompress.c @@ -50,7 +50,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen) err: printk("Error %d while decompressing!\n", err); printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); - return 0; + return -EIO; } int cramfs_uncompress_init(void) diff --git a/fs/dcache.c b/fs/dcache.c index 90bbd7e1b11..761d30be268 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -17,7 +17,6 @@ #include <linux/syscalls.h> #include <linux/string.h> #include <linux/mm.h> -#include <linux/fdtable.h> #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/slab.h> @@ -32,6 +31,7 @@ #include <linux/seqlock.h> #include <linux/swap.h> #include <linux/bootmem.h> +#include <linux/fs_struct.h> #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 44d725f612c..b6a719a909f 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb) spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; if (inode->i_mapping->nrpages == 0) continue; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index e4a6223c314..af737bb56cb 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -740,8 +740,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, out_release_free_unlock: crypto_free_hash(s->hash_desc.tfm); out_free_unlock: - memset(s->block_aligned_filename, 0, s->block_aligned_filename_size); - kfree(s->block_aligned_filename); + kzfree(s->block_aligned_filename); out_unlock: mutex_unlock(s->tfm_mutex); out: diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 96ef51489e0..295e7fa5675 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -291,8 +291,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon) if (daemon->user_ns) put_user_ns(daemon->user_ns); mutex_unlock(&daemon->mux); - memset(daemon, 0, sizeof(*daemon)); - kfree(daemon); + kzfree(daemon); out: return rc; } diff --git a/fs/efs/super.c b/fs/efs/super.c index 73b19cfc91f..f0494281081 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -329,18 +329,22 @@ out_no_fs: } static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb); + struct super_block *sb = dentry->d_sb; + struct efs_sb_info *sbi = SUPER_INFO(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ - buf->f_blocks = sb->total_groups * /* total data blocks */ - (sb->group_size - sb->inode_blocks); - buf->f_bfree = sb->data_free; /* free data blocks */ - buf->f_bavail = sb->data_free; /* free blocks for non-root */ - buf->f_files = sb->total_groups * /* total inodes */ - sb->inode_blocks * + buf->f_blocks = sbi->total_groups * /* total data blocks */ + (sbi->group_size - sbi->inode_blocks); + buf->f_bfree = sbi->data_free; /* free data blocks */ + buf->f_bavail = sbi->data_free; /* free blocks for non-root */ + buf->f_files = sbi->total_groups * /* total inodes */ + sbi->inode_blocks * (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); - buf->f_ffree = sb->inode_free; /* free inodes */ + buf->f_ffree = sbi->inode_free; /* free inodes */ + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ return 0; diff --git a/fs/eventfd.c b/fs/eventfd.c index 5de2c2db3aa..2a701d593d3 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -28,6 +28,7 @@ struct eventfd_ctx { * issue a wakeup. */ __u64 count; + unsigned int flags; }; /* @@ -50,7 +51,7 @@ int eventfd_signal(struct file *file, int n) n = (int) (ULLONG_MAX - ctx->count); ctx->count += n; if (waitqueue_active(&ctx->wqh)) - wake_up_locked(&ctx->wqh); + wake_up_locked_poll(&ctx->wqh, POLLIN); spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; @@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, { struct eventfd_ctx *ctx = file->private_data; ssize_t res; - __u64 ucnt; + __u64 ucnt = 0; DECLARE_WAITQUEUE(wait, current); if (count < sizeof(ucnt)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; - ucnt = ctx->count; - if (ucnt > 0) + if (ctx->count > 0) res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { __add_wait_queue(&ctx->wqh, &wait); for (res = 0;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { - ucnt = ctx->count; res = sizeof(ucnt); break; } @@ -117,10 +116,11 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (res > 0) { - ctx->count = 0; + if (likely(res > 0)) { + ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= ucnt; if (waitqueue_active(&ctx->wqh)) - wake_up_locked(&ctx->wqh); + wake_up_locked_poll(&ctx->wqh, POLLOUT); } spin_unlock_irq(&ctx->wqh.lock); if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) @@ -166,10 +166,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (res > 0) { + if (likely(res > 0)) { ctx->count += ucnt; if (waitqueue_active(&ctx->wqh)) - wake_up_locked(&ctx->wqh); + wake_up_locked_poll(&ctx->wqh, POLLIN); } spin_unlock_irq(&ctx->wqh.lock); @@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); - if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) + if (flags & ~EFD_FLAGS_SET) return -EINVAL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); @@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; + ctx->flags = flags; /* * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + flags & EFD_SHARED_FCNTL_FLAGS); if (fd < 0) kfree(ctx); return fd; @@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count) { return sys_eventfd2(count, 0); } + diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c5c424f23fd..a89f370fadb 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1,6 +1,6 @@ /* - * fs/eventpoll.c (Efficent event polling implementation) - * Copyright (C) 2001,...,2007 Davide Libenzi + * fs/eventpoll.c (Efficient event retrieval implementation) + * Copyright (C) 2001,...,2009 Davide Libenzi * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -71,29 +71,11 @@ * a better scalability. */ -#define DEBUG_EPOLL 0 - -#if DEBUG_EPOLL > 0 -#define DPRINTK(x) printk x -#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0) -#else /* #if DEBUG_EPOLL > 0 */ -#define DPRINTK(x) (void) 0 -#define DNPRINTK(n, x) (void) 0 -#endif /* #if DEBUG_EPOLL > 0 */ - -#define DEBUG_EPI 0 - -#if DEBUG_EPI != 0 -#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */) -#else /* #if DEBUG_EPI != 0 */ -#define EPI_SLAB_DEBUG 0 -#endif /* #if DEBUG_EPI != 0 */ - /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) -/* Maximum number of poll wake up nests we are allowing */ -#define EP_MAX_POLLWAKE_NESTS 4 +/* Maximum number of nesting allowed inside epoll sets */ +#define EP_MAX_NESTS 4 /* Maximum msec timeout value storeable in a long int */ #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) @@ -110,24 +92,21 @@ struct epoll_filefd { }; /* - * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". - * It is used to keep track on all tasks that are currently inside the wake_up() code - * to 1) short-circuit the one coming from the same task and same wait queue head - * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting - * 3) let go the ones coming from other tasks. + * Structure used to track possible nested calls, for too deep recursions + * and loop cycles. */ -struct wake_task_node { +struct nested_call_node { struct list_head llink; - struct task_struct *task; - wait_queue_head_t *wq; + void *cookie; + int cpu; }; /* - * This is used to implement the safe poll wake up avoiding to reenter - * the poll callback from inside wake_up(). + * This structure is used as collector for nested calls, to check for + * maximum recursion dept and loop cycles. */ -struct poll_safewake { - struct list_head wake_task_list; +struct nested_calls { + struct list_head tasks_call_list; spinlock_t lock; }; @@ -213,7 +192,7 @@ struct eppoll_entry { struct list_head llink; /* The "base" pointer is set to the container "struct epitem" */ - void *base; + struct epitem *base; /* * Wait queue item that will be linked to the target file wait @@ -231,6 +210,12 @@ struct ep_pqueue { struct epitem *epi; }; +/* Used by the ep_send_events() function as callback private data */ +struct ep_send_events_data { + int maxevents; + struct epoll_event __user *events; +}; + /* * Configuration options available inside /proc/sys/fs/epoll/ */ @@ -242,8 +227,11 @@ static int max_user_watches __read_mostly; */ static DEFINE_MUTEX(epmutex); -/* Safe wake up implementation */ -static struct poll_safewake psw; +/* Used for safe wake up implementation */ +static struct nested_calls poll_safewake_ncalls; + +/* Used to call file's f_op->poll() under the nested calls boundaries */ +static struct nested_calls poll_readywalk_ncalls; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -312,89 +300,230 @@ static inline int ep_op_has_event(int op) } /* Initialize the poll safe wake up structure */ -static void ep_poll_safewake_init(struct poll_safewake *psw) +static void ep_nested_calls_init(struct nested_calls *ncalls) { - - INIT_LIST_HEAD(&psw->wake_task_list); - spin_lock_init(&psw->lock); + INIT_LIST_HEAD(&ncalls->tasks_call_list); + spin_lock_init(&ncalls->lock); } -/* - * Perform a safe wake up of the poll wait list. The problem is that - * with the new callback'd wake up system, it is possible that the - * poll callback is reentered from inside the call to wake_up() done - * on the poll wait queue head. The rule is that we cannot reenter the - * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, - * and we cannot reenter the same wait queue head at all. This will - * enable to have a hierarchy of epoll file descriptor of no more than - * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock - * because this one gets called by the poll callback, that in turn is called - * from inside a wake_up(), that might be called from irq context. +/** + * ep_call_nested - Perform a bound (possibly) nested call, by checking + * that the recursion limit is not exceeded, and that + * the same nested call (by the meaning of same cookie) is + * no re-entered. + * + * @ncalls: Pointer to the nested_calls structure to be used for this call. + * @max_nests: Maximum number of allowed nesting calls. + * @nproc: Nested call core function pointer. + * @priv: Opaque data to be passed to the @nproc callback. + * @cookie: Cookie to be used to identify this nested call. + * + * Returns: Returns the code returned by the @nproc callback, or -1 if + * the maximum recursion limit has been exceeded. */ -static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) +static int ep_call_nested(struct nested_calls *ncalls, int max_nests, + int (*nproc)(void *, void *, int), void *priv, + void *cookie) { - int wake_nests = 0; + int error, call_nests = 0; unsigned long flags; - struct task_struct *this_task = current; - struct list_head *lsthead = &psw->wake_task_list; - struct wake_task_node *tncur; - struct wake_task_node tnode; + int this_cpu = get_cpu(); + struct list_head *lsthead = &ncalls->tasks_call_list; + struct nested_call_node *tncur; + struct nested_call_node tnode; - spin_lock_irqsave(&psw->lock, flags); + spin_lock_irqsave(&ncalls->lock, flags); - /* Try to see if the current task is already inside this wakeup call */ + /* + * Try to see if the current task is already inside this wakeup call. + * We use a list here, since the population inside this set is always + * very much limited. + */ list_for_each_entry(tncur, lsthead, llink) { - - if (tncur->wq == wq || - (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) { + if (tncur->cpu == this_cpu && + (tncur->cookie == cookie || ++call_nests > max_nests)) { /* * Ops ... loop detected or maximum nest level reached. * We abort this wake by breaking the cycle itself. */ - spin_unlock_irqrestore(&psw->lock, flags); - return; + error = -1; + goto out_unlock; } } - /* Add the current task to the list */ - tnode.task = this_task; - tnode.wq = wq; + /* Add the current task and cookie to the list */ + tnode.cpu = this_cpu; + tnode.cookie = cookie; list_add(&tnode.llink, lsthead); - spin_unlock_irqrestore(&psw->lock, flags); + spin_unlock_irqrestore(&ncalls->lock, flags); - /* Do really wake up now */ - wake_up_nested(wq, 1 + wake_nests); + /* Call the nested function */ + error = (*nproc)(priv, cookie, call_nests); /* Remove the current task from the list */ - spin_lock_irqsave(&psw->lock, flags); + spin_lock_irqsave(&ncalls->lock, flags); list_del(&tnode.llink); - spin_unlock_irqrestore(&psw->lock, flags); + out_unlock: + spin_unlock_irqrestore(&ncalls->lock, flags); + + put_cpu(); + return error; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + unsigned long flags; + + spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); + wake_up_locked_poll(wqueue, events); + spin_unlock_irqrestore(&wqueue->lock, flags); +} +#else +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + wake_up_poll(wqueue, events); +} +#endif + +static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) +{ + ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, + 1 + call_nests); + return 0; +} + +/* + * Perform a safe wake up of the poll wait list. The problem is that + * with the new callback'd wake up system, it is possible that the + * poll callback is reentered from inside the call to wake_up() done + * on the poll wait queue head. The rule is that we cannot reenter the + * wake up code from the same task more than EP_MAX_NESTS times, + * and we cannot reenter the same wait queue head at all. This will + * enable to have a hierarchy of epoll file descriptor of no more than + * EP_MAX_NESTS deep. + */ +static void ep_poll_safewake(wait_queue_head_t *wq) +{ + ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, + ep_poll_wakeup_proc, NULL, wq); } /* - * This function unregister poll callbacks from the associated file descriptor. - * Since this must be called without holding "ep->lock" the atomic exchange trick - * will protect us from multiple unregister. + * This function unregisters poll callbacks from the associated file + * descriptor. Must be called with "mtx" held (or "epmutex" if called from + * ep_free). */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { - int nwait; struct list_head *lsthead = &epi->pwqlist; struct eppoll_entry *pwq; - /* This is called without locks, so we need the atomic exchange */ - nwait = xchg(&epi->nwait, 0); + while (!list_empty(lsthead)) { + pwq = list_first_entry(lsthead, struct eppoll_entry, llink); - if (nwait) { - while (!list_empty(lsthead)) { - pwq = list_first_entry(lsthead, struct eppoll_entry, llink); + list_del(&pwq->llink); + remove_wait_queue(pwq->whead, &pwq->wait); + kmem_cache_free(pwq_cache, pwq); + } +} - list_del_init(&pwq->llink); - remove_wait_queue(pwq->whead, &pwq->wait); - kmem_cache_free(pwq_cache, pwq); - } +/** + * ep_scan_ready_list - Scans the ready list in a way that makes possible for + * the scan code, to call f_op->poll(). Also allows for + * O(NumReady) performance. + * + * @ep: Pointer to the epoll private data structure. + * @sproc: Pointer to the scan callback. + * @priv: Private opaque data passed to the @sproc callback. + * + * Returns: The same integer error code returned by the @sproc callback. + */ +static int ep_scan_ready_list(struct eventpoll *ep, + int (*sproc)(struct eventpoll *, + struct list_head *, void *), + void *priv) +{ + int error, pwake = 0; + unsigned long flags; + struct epitem *epi, *nepi; + LIST_HEAD(txlist); + + /* + * We need to lock this because we could be hit by + * eventpoll_release_file() and epoll_ctl(). + */ + mutex_lock(&ep->mtx); + + /* + * Steal the ready list, and re-init the original one to the + * empty list. Also, set ep->ovflist to NULL so that events + * happening while looping w/out locks, are not lost. We cannot + * have the poll callback to queue directly on ep->rdllist, + * because we want the "sproc" callback to be able to do it + * in a lockless way. + */ + spin_lock_irqsave(&ep->lock, flags); + list_splice_init(&ep->rdllist, &txlist); + ep->ovflist = NULL; + spin_unlock_irqrestore(&ep->lock, flags); + + /* + * Now call the callback function. + */ + error = (*sproc)(ep, &txlist, priv); + + spin_lock_irqsave(&ep->lock, flags); + /* + * During the time we spent inside the "sproc" callback, some + * other events might have been queued by the poll callback. + * We re-insert them inside the main ready-list here. + */ + for (nepi = ep->ovflist; (epi = nepi) != NULL; + nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { + /* + * We need to check if the item is already in the list. + * During the "sproc" callback execution time, items are + * queued into ->ovflist but the "txlist" might already + * contain them, and the list_splice() below takes care of them. + */ + if (!ep_is_linked(&epi->rdllink)) + list_add_tail(&epi->rdllink, &ep->rdllist); + } + /* + * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after + * releasing the lock, events will be queued in the normal way inside + * ep->rdllist. + */ + ep->ovflist = EP_UNACTIVE_PTR; + + /* + * Quickly re-inject items left on "txlist". + */ + list_splice(&txlist, &ep->rdllist); + + if (!list_empty(&ep->rdllist)) { + /* + * Wake up (if active) both the eventpoll wait list and + * the ->poll() wait list (delayed after we release the lock). + */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; } + spin_unlock_irqrestore(&ep->lock, flags); + + mutex_unlock(&ep->mtx); + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return error; } /* @@ -434,9 +563,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) atomic_dec(&ep->user->epoll_watches); - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", - current, ep, file)); - return 0; } @@ -447,7 +573,7 @@ static void ep_free(struct eventpoll *ep) /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) - ep_poll_safewake(&psw, &ep->poll_wait); + ep_poll_safewake(&ep->poll_wait); /* * We need to lock this because we could be hit by @@ -492,26 +618,54 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) if (ep) ep_free(ep); - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep)); return 0; } +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) +{ + struct epitem *epi, *tmp; + + list_for_each_entry_safe(epi, tmp, head, rdllink) { + if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + epi->event.events) + return POLLIN | POLLRDNORM; + else { + /* + * Item has been dropped into the ready list by the poll + * callback, but it's not actually ready, as far as + * caller requested events goes. We can remove it here. + */ + list_del_init(&epi->rdllink); + } + } + + return 0; +} + +static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) +{ + return ep_scan_ready_list(priv, ep_read_events_proc, NULL); +} + static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { - unsigned int pollflags = 0; - unsigned long flags; + int pollflags; struct eventpoll *ep = file->private_data; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); - /* Check our condition */ - spin_lock_irqsave(&ep->lock, flags); - if (!list_empty(&ep->rdllist)) - pollflags = POLLIN | POLLRDNORM; - spin_unlock_irqrestore(&ep->lock, flags); + /* + * Proceed to find out if wanted events are really available inside + * the ready list. This need to be done under ep_call_nested() + * supervision, since the call to f_op->poll() done on listed files + * could re-enter here. + */ + pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, + ep_poll_readyevents_proc, ep, ep); - return pollflags; + return pollflags != -1 ? pollflags : 0; } /* File callbacks that implement the eventpoll file behaviour */ @@ -541,7 +695,7 @@ void eventpoll_release_file(struct file *file) * We don't want to get "file->f_lock" because it is not * necessary. It is not necessary because we're in the "struct file" * cleanup path, and this means that noone is using this file anymore. - * So, for example, epoll_ctl() cannot hit here sicne if we reach this + * So, for example, epoll_ctl() cannot hit here since if we reach this * point, the file counter already went to zero and fget() would fail. * The only hit might come from ep_free() but by holding the mutex * will correctly serialize the operation. We do need to acquire @@ -588,8 +742,6 @@ static int ep_alloc(struct eventpoll **pep) *pep = ep; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", - current, ep)); return 0; free_uid: @@ -623,9 +775,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) } } - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n", - current, file, epir)); - return epir; } @@ -641,9 +790,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", - current, epi->ffd.file, epi, ep)); - spin_lock_irqsave(&ep->lock, flags); /* @@ -656,6 +802,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k goto out_unlock; /* + * Check the events coming with the callback. At this stage, not + * every device reports the events in the "key" parameter of the + * callback. We need to be able to handle both cases here, hence the + * test for "key" != NULL before the event match test. + */ + if (key && !((unsigned long) key & epi->event.events)) + goto out_unlock; + + /* * If we are trasfering events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happens during that period of time are @@ -670,12 +825,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k } /* If this file is already in the ready list we exit soon */ - if (ep_is_linked(&epi->rdllink)) - goto is_linked; - - list_add_tail(&epi->rdllink, &ep->rdllist); + if (!ep_is_linked(&epi->rdllink)) + list_add_tail(&epi->rdllink, &ep->rdllist); -is_linked: /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. @@ -690,7 +842,7 @@ out_unlock: /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&psw, &ep->poll_wait); + ep_poll_safewake(&ep->poll_wait); return 1; } @@ -817,10 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&psw, &ep->poll_wait); - - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n", - current, ep, tfile, fd)); + ep_poll_safewake(&ep->poll_wait); return 0; @@ -851,15 +1000,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even { int pwake = 0; unsigned int revents; - unsigned long flags; /* - * Set the new event interest mask before calling f_op->poll(), otherwise - * a potential race might occur. In fact if we do this operation inside - * the lock, an event might happen between the f_op->poll() call and the - * new event set registering. + * Set the new event interest mask before calling f_op->poll(); + * otherwise we might miss an event that happens between the + * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; + epi->event.data = event->data; /* protected by mtx */ /* * Get current event bits. We can safely use the file* here because @@ -867,16 +1015,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even */ revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); - spin_lock_irqsave(&ep->lock, flags); - - /* Copy the data member from inside the lock */ - epi->event.data = event->data; - /* * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (revents & event->events) { + spin_lock_irq(&ep->lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); @@ -886,142 +1030,84 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even if (waitqueue_active(&ep->poll_wait)) pwake++; } + spin_unlock_irq(&ep->lock); } - spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&psw, &ep->poll_wait); + ep_poll_safewake(&ep->poll_wait); return 0; } -static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, - int maxevents) +static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) { - int eventcnt, error = -EFAULT, pwake = 0; + struct ep_send_events_data *esed = priv; + int eventcnt; unsigned int revents; - unsigned long flags; - struct epitem *epi, *nepi; - struct list_head txlist; - - INIT_LIST_HEAD(&txlist); - - /* - * We need to lock this because we could be hit by - * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL). - */ - mutex_lock(&ep->mtx); - - /* - * Steal the ready list, and re-init the original one to the - * empty list. Also, set ep->ovflist to NULL so that events - * happening while looping w/out locks, are not lost. We cannot - * have the poll callback to queue directly on ep->rdllist, - * because we are doing it in the loop below, in a lockless way. - */ - spin_lock_irqsave(&ep->lock, flags); - list_splice(&ep->rdllist, &txlist); - INIT_LIST_HEAD(&ep->rdllist); - ep->ovflist = NULL; - spin_unlock_irqrestore(&ep->lock, flags); + struct epitem *epi; + struct epoll_event __user *uevent; /* - * We can loop without lock because this is a task private list. - * We just splice'd out the ep->rdllist in ep_collect_ready_items(). - * Items cannot vanish during the loop because we are holding "mtx". + * We can loop without lock because we are passed a task private list. + * Items cannot vanish during the loop because ep_scan_ready_list() is + * holding "mtx" during this call. */ - for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { - epi = list_first_entry(&txlist, struct epitem, rdllink); + for (eventcnt = 0, uevent = esed->events; + !list_empty(head) && eventcnt < esed->maxevents;) { + epi = list_first_entry(head, struct epitem, rdllink); list_del_init(&epi->rdllink); - /* - * Get the ready file event set. We can safely use the file - * because we are holding the "mtx" and this will guarantee - * that both the file and the item will not vanish. - */ - revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); - revents &= epi->event.events; + revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + epi->event.events; /* - * Is the event mask intersect the caller-requested one, - * deliver the event to userspace. Again, we are holding - * "mtx", so no operations coming from userspace can change - * the item. + * If the event mask intersect the caller-requested one, + * deliver the event to userspace. Again, ep_scan_ready_list() + * is holding "mtx", so no operations coming from userspace + * can change the item. */ if (revents) { - if (__put_user(revents, - &events[eventcnt].events) || - __put_user(epi->event.data, - &events[eventcnt].data)) - goto errxit; + if (__put_user(revents, &uevent->events) || + __put_user(epi->event.data, &uevent->data)) { + list_add(&epi->rdllink, head); + return eventcnt ? eventcnt : -EFAULT; + } + eventcnt++; + uevent++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; - eventcnt++; + else if (!(epi->event.events & EPOLLET)) { + /* + * If this file has been added with Level + * Trigger mode, we need to insert back inside + * the ready list, so that the next call to + * epoll_wait() will check again the events + * availability. At this point, noone can insert + * into ep->rdllist besides us. The epoll_ctl() + * callers are locked out by + * ep_scan_ready_list() holding "mtx" and the + * poll callback will queue them in ep->ovflist. + */ + list_add_tail(&epi->rdllink, &ep->rdllist); + } } - /* - * At this point, noone can insert into ep->rdllist besides - * us. The epoll_ctl() callers are locked out by us holding - * "mtx" and the poll callback will queue them in ep->ovflist. - */ - if (!(epi->event.events & EPOLLET) && - (revents & epi->event.events)) - list_add_tail(&epi->rdllink, &ep->rdllist); - } - error = 0; - -errxit: - - spin_lock_irqsave(&ep->lock, flags); - /* - * During the time we spent in the loop above, some other events - * might have been queued by the poll callback. We re-insert them - * inside the main ready-list here. - */ - for (nepi = ep->ovflist; (epi = nepi) != NULL; - nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { - /* - * If the above loop quit with errors, the epoll item might still - * be linked to "txlist", and the list_splice() done below will - * take care of those cases. - */ - if (!ep_is_linked(&epi->rdllink)) - list_add_tail(&epi->rdllink, &ep->rdllist); } - /* - * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after - * releasing the lock, events will be queued in the normal way inside - * ep->rdllist. - */ - ep->ovflist = EP_UNACTIVE_PTR; - /* - * In case of error in the event-send loop, or in case the number of - * ready events exceeds the userspace limit, we need to splice the - * "txlist" back inside ep->rdllist. - */ - list_splice(&txlist, &ep->rdllist); - - if (!list_empty(&ep->rdllist)) { - /* - * Wake up (if active) both the eventpoll wait list and the ->poll() - * wait list (delayed after we release the lock). - */ - if (waitqueue_active(&ep->wq)) - wake_up_locked(&ep->wq); - if (waitqueue_active(&ep->poll_wait)) - pwake++; - } - spin_unlock_irqrestore(&ep->lock, flags); + return eventcnt; +} - mutex_unlock(&ep->mtx); +static int ep_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) +{ + struct ep_send_events_data esed; - /* We have to call this outside the lock */ - if (pwake) - ep_poll_safewake(&psw, &ep->poll_wait); + esed.maxevents = maxevents; + esed.events = events; - return eventcnt == 0 ? error: eventcnt; + return ep_scan_ready_list(ep, ep_send_events_proc, &esed); } static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, @@ -1033,7 +1119,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, wait_queue_t wait; /* - * Calculate the timeout by checking for the "infinite" value ( -1 ) + * Calculate the timeout by checking for the "infinite" value (-1) * and the overflow condition. The passed timeout is in milliseconds, * that why (t * HZ) / 1000. */ @@ -1076,9 +1162,8 @@ retry: set_current_state(TASK_RUNNING); } - /* Is it worth to try to dig for events ? */ - eavail = !list_empty(&ep->rdllist); + eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; spin_unlock_irqrestore(&ep->lock, flags); @@ -1099,41 +1184,30 @@ retry: */ SYSCALL_DEFINE1(epoll_create1, int, flags) { - int error, fd = -1; - struct eventpoll *ep; + int error; + struct eventpoll *ep = NULL; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); if (flags & ~EPOLL_CLOEXEC) return -EINVAL; - - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", - current, flags)); - /* - * Create the internal data structure ( "struct eventpoll" ). + * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); - if (error < 0) { - fd = error; - goto error_return; - } - + if (error < 0) + return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, - flags & O_CLOEXEC); - if (fd < 0) + error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + flags & O_CLOEXEC); + if (error < 0) ep_free(ep); -error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", - current, flags, fd)); - - return fd; + return error; } SYSCALL_DEFINE1(epoll_create, int, size) @@ -1158,9 +1232,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epitem *epi; struct epoll_event epds; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n", - current, epfd, op, fd, event)); - error = -EFAULT; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) @@ -1211,7 +1282,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; - error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; @@ -1237,8 +1307,6 @@ error_tgt_fput: error_fput: fput(file); error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n", - current, epfd, op, fd, event, error)); return error; } @@ -1254,9 +1322,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, struct file *file; struct eventpoll *ep; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n", - current, epfd, events, maxevents, timeout)); - /* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; @@ -1293,8 +1358,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, error_fput: fput(file); error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n", - current, epfd, events, maxevents, timeout, error)); return error; } @@ -1359,17 +1422,18 @@ static int __init eventpoll_init(void) EP_ITEM_COST; /* Initialize the structure used to perform safe poll wait head wake ups */ - ep_poll_safewake_init(&psw); + ep_nested_calls_init(&poll_safewake_ncalls); + + /* Initialize the structure used to perform file's f_op->poll() calls */ + ep_nested_calls_init(&poll_readywalk_ncalls); /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), - 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC, - NULL); + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", - sizeof(struct eppoll_entry), 0, - EPI_SLAB_DEBUG|SLAB_PANIC, NULL); + sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); return 0; } diff --git a/fs/exec.c b/fs/exec.c index c5128fbc916..052a961e41a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -53,6 +53,7 @@ #include <linux/tracehook.h> #include <linux/kmod.h> #include <linux/fsnotify.h> +#include <linux/fs_struct.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -1056,28 +1057,35 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold current->cred_exec_mutex to protect against * PTRACE_ATTACH */ -void check_unsafe_exec(struct linux_binprm *bprm) +int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned long flags; - unsigned n_fs, n_sighand; + unsigned n_fs; + int res = 0; bprm->unsafe = tracehook_unsafe_exec(p); n_fs = 1; - n_sighand = 1; + write_lock(&p->fs->lock); lock_task_sighand(p, &flags); for (t = next_thread(p); t != p; t = next_thread(t)) { if (t->fs == p->fs) n_fs++; - n_sighand++; } - if (atomic_read(&p->fs->count) > n_fs || - atomic_read(&p->sighand->count) > n_sighand) + if (p->fs->users > n_fs) { bprm->unsafe |= LSM_UNSAFE_SHARE; + } else { + if (p->fs->in_exec) + res = -EAGAIN; + p->fs->in_exec = 1; + } unlock_task_sighand(p, &flags); + write_unlock(&p->fs->lock); + + return res; } /* @@ -1296,12 +1304,15 @@ int do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1344,6 +1355,9 @@ int do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1362,6 +1376,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS new file mode 100644 index 00000000000..1b2d4c63a57 --- /dev/null +++ b/fs/exofs/BUGS @@ -0,0 +1,3 @@ +- Out-of-space may cause a severe problem if the object (and directory entry) + were written, but the inode attributes failed. Then if the filesystem was + unmounted and mounted the kernel can get into an endless loop doing a readdir. diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild new file mode 100644 index 00000000000..cc2d22db119 --- /dev/null +++ b/fs/exofs/Kbuild @@ -0,0 +1,16 @@ +# +# Kbuild for the EXOFS module +# +# Copyright (C) 2008 Panasas Inc. All rights reserved. +# +# Authors: +# Boaz Harrosh <bharrosh@panasas.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 +# +# Kbuild - Gets included from the Kernels Makefile and build system +# + +exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o +obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig new file mode 100644 index 00000000000..86194b2f799 --- /dev/null +++ b/fs/exofs/Kconfig @@ -0,0 +1,13 @@ +config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. + +# Debugging-related stuff +config EXOFS_DEBUG + bool "Enable debugging" + depends on EXOFS_FS + help + This option enables EXOFS debug prints. diff --git a/fs/exofs/common.h b/fs/exofs/common.h new file mode 100644 index 00000000000..b1512c4bb8c --- /dev/null +++ b/fs/exofs/common.h @@ -0,0 +1,184 @@ +/* + * common.h - Common definitions for both Kernel and user-mode utilities + * + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __EXOFS_COM_H__ +#define __EXOFS_COM_H__ + +#include <linux/types.h> + +#include <scsi/osd_attributes.h> +#include <scsi/osd_initiator.h> +#include <scsi/osd_sec.h> + +/**************************************************************************** + * Object ID related defines + * NOTE: inode# = object ID - EXOFS_OBJ_OFF + ****************************************************************************/ +#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ +#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ +#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ +#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ + +/* exofs Application specific page/attribute */ +# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) +# define EXOFS_ATTR_INODE_DATA 1 + +/* + * The maximum number of files we can have is limited by the size of the + * inode number. This is the largest object ID that the file system supports. + * Object IDs 0, 1, and 2 are always in use (see above defines). + */ +enum { + EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX : + (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)), + EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF), +}; + +/**************************************************************************** + * Misc. + ****************************************************************************/ +#define EXOFS_BLKSHIFT 12 +#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT) + +/**************************************************************************** + * superblock-related things + ****************************************************************************/ +#define EXOFS_SUPER_MAGIC 0x5DF5 + +/* + * The file system control block - stored in an object's data (mainly, the one + * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored + * on disk. Right now it just has a magic value, which is basically a sanity + * check on our ability to communicate with the object store. + */ +struct exofs_fscb { + __le64 s_nextid; /* Highest object ID used */ + __le32 s_numfiles; /* Number of files on fs */ + __le16 s_magic; /* Magic signature */ + __le16 s_newfs; /* Non-zero if this is a new fs */ +}; + +/**************************************************************************** + * inode-related things + ****************************************************************************/ +#define EXOFS_IDATA 5 + +/* + * The file control block - stored in an object's attributes. This is where + * the in-memory inode is stored on disk. + */ +struct exofs_fcb { + __le64 i_size; /* Size of the file */ + __le16 i_mode; /* File mode */ + __le16 i_links_count; /* Links count */ + __le32 i_uid; /* Owner Uid */ + __le32 i_gid; /* Group Id */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Creation time */ + __le32 i_mtime; /* Modification time */ + __le32 i_flags; /* File flags (unused for now)*/ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */ +}; + +#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb) + +/* This is the Attribute the fcb is stored in */ +static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_DATA, + EXOFS_INO_ATTR_SIZE); + +/**************************************************************************** + * dentry-related things + ****************************************************************************/ +#define EXOFS_NAME_LEN 255 + +/* + * The on-disk directory entry + */ +struct exofs_dir_entry { + __le64 inode_no; /* inode number */ + __le16 rec_len; /* directory entry length */ + u8 name_len; /* name length */ + u8 file_type; /* umm...file type */ + char name[EXOFS_NAME_LEN]; /* file name */ +}; + +enum { + EXOFS_FT_UNKNOWN, + EXOFS_FT_REG_FILE, + EXOFS_FT_DIR, + EXOFS_FT_CHRDEV, + EXOFS_FT_BLKDEV, + EXOFS_FT_FIFO, + EXOFS_FT_SOCK, + EXOFS_FT_SYMLINK, + EXOFS_FT_MAX +}; + +#define EXOFS_DIR_PAD 4 +#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1) +#define EXOFS_DIR_REC_LEN(name_len) \ + (((name_len) + offsetof(struct exofs_dir_entry, name) + \ + EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) + +/************************* + * function declarations * + *************************/ +/* osd.c */ +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], + const struct osd_obj_id *obj); + +int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid); +static inline int exofs_check_ok(struct osd_request *or) +{ + return exofs_check_ok_resid(or, NULL, NULL); +} +int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred); +int exofs_async_op(struct osd_request *or, + osd_req_done_fn *async_done, void *caller_context, u8 *cred); + +int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr); + +int osd_req_read_kern(struct osd_request *or, + const struct osd_obj_id *obj, u64 offset, void *buff, u64 len); + +int osd_req_write_kern(struct osd_request *or, + const struct osd_obj_id *obj, u64 offset, void *buff, u64 len); + +#endif /*ifndef __EXOFS_COM_H__*/ diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c new file mode 100644 index 00000000000..65b0c8c776a --- /dev/null +++ b/fs/exofs/dir.c @@ -0,0 +1,672 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "exofs.h" + +static inline unsigned exofs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void exofs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/* Accesses dir's inode->i_size must be called under inode lock */ +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) +{ + loff_t last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + dir->i_version++; + + if (!PageUptodate(page)) + SetPageUptodate(page); + + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + set_page_dirty(page); + + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + + return err; +} + +static void exofs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + unsigned chunk_size = exofs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct exofs_dir_entry *p; + char *error; + + /* if the page is the last one in the directory */ + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct exofs_dir_entry *)(kaddr + offs); + rec_len = le16_to_cpu(p->rec_len); + + if (rec_len < EXOFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < EXOFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + +Ebadsize: + EXOFS_ERR("ERROR [exofs_check_page]: " + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +bad_entry: + EXOFS_ERR( + "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - " + "offset=%lu, inode=%llu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + _LLU(le64_to_cpu(p->inode_no)), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct exofs_dir_entry *)(kaddr + offs); + EXOFS_ERR("ERROR [exofs_check_page]: " + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%llu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + _LLU(le64_to_cpu(p->inode_no))); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *exofs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + exofs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + exofs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline int exofs_match(int len, const unsigned char *name, + struct exofs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode_no) + return 0; + return !memcmp(name, de->name, len); +} + +static inline +struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p) +{ + return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); +} + +static inline unsigned +exofs_validate_entry(char *base, unsigned offset, unsigned mask) +{ + struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset); + struct exofs_dir_entry *p = + (struct exofs_dir_entry *)(base + (offset&mask)); + while ((char *)p < (char *)de) { + if (p->rec_len == 0) + break; + p = exofs_next_entry(p); + } + return (char *)p - base; +} + +static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = { + [EXOFS_FT_UNKNOWN] = DT_UNKNOWN, + [EXOFS_FT_REG_FILE] = DT_REG, + [EXOFS_FT_DIR] = DT_DIR, + [EXOFS_FT_CHRDEV] = DT_CHR, + [EXOFS_FT_BLKDEV] = DT_BLK, + [EXOFS_FT_FIFO] = DT_FIFO, + [EXOFS_FT_SOCK] = DT_SOCK, + [EXOFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK, +}; + +static inline +void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; +} + +static int +exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); + unsigned char *types = NULL; + int need_revalidate = (filp->f_version != inode->i_version); + + if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) + return 0; + + types = exofs_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct exofs_dir_entry *de; + struct page *page = exofs_get_page(inode, n); + + if (IS_ERR(page)) { + EXOFS_ERR("ERROR: " + "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + return PTR_ERR(page); + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = exofs_validate_entry(kaddr, offset, + chunk_mask); + filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + } + filp->f_version = inode->i_version; + need_revalidate = 0; + } + de = (struct exofs_dir_entry *)(kaddr + offset); + limit = kaddr + exofs_last_byte(inode, n) - + EXOFS_DIR_REC_LEN(1); + for (; (char *)de <= limit; de = exofs_next_entry(de)) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: " + "zero-length directory entry"); + exofs_put_page(page); + return -EIO; + } + if (de->inode_no) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < EXOFS_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<<PAGE_CACHE_SHIFT) | offset, + le64_to_cpu(de->inode_no), + d_type); + if (over) { + exofs_put_page(page); + return 0; + } + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + exofs_put_page(page); + } + + return 0; +} + +struct exofs_dir_entry *exofs_find_entry(struct inode *dir, + struct dentry *dentry, struct page **res_page) +{ + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned reclen = EXOFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct exofs_i_info *oi = exofs_i(dir); + struct exofs_dir_entry *de; + + if (npages == 0) + goto out; + + *res_page = NULL; + + start = oi->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = exofs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct exofs_dir_entry *) kaddr; + kaddr += exofs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + EXOFS_ERR( + "ERROR: exofs_find_entry: " + "zero-length directory entry"); + exofs_put_page(page); + goto out; + } + if (exofs_match(namelen, name, de)) + goto found; + de = exofs_next_entry(de); + } + exofs_put_page(page); + } + if (++n >= npages) + n = 0; + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + oi->i_dir_start_lookup = n; + return de; +} + +struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = exofs_get_page(dir, 0); + struct exofs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = exofs_next_entry( + (struct exofs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t exofs_parent_ino(struct dentry *child) +{ + struct page *page; + struct exofs_dir_entry *de; + ino_t ino; + + de = exofs_dotdot(child->d_inode, &page); + if (!de) + return 0; + + ino = le64_to_cpu(de->inode_no); + exofs_put_page(page); + return ino; +} + +ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry) +{ + ino_t res = 0; + struct exofs_dir_entry *de; + struct page *page; + + de = exofs_find_entry(dir, dentry, &page); + if (de) { + res = le64_to_cpu(de->inode_no); + exofs_put_page(page); + } + return res; +} + +int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, + struct page *page, struct inode *inode) +{ + loff_t pos = page_offset(page) + + (char *) de - (char *) page_address(page); + unsigned len = le16_to_cpu(de->rec_len); + int err; + + lock_page(page); + err = exofs_write_begin(NULL, page->mapping, pos, len, + AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + if (err) + EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n", + err); + + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + if (likely(!err)) + err = exofs_commit_chunk(page, pos, len); + exofs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + return err; +} + +int exofs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = exofs_chunk_size(dir); + unsigned reclen = EXOFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + struct exofs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = exofs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + exofs_last_byte(dir, n); + de = (struct exofs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + name_len = 0; + rec_len = chunk_size; + de->rec_len = cpu_to_le16(chunk_size); + de->inode_no = 0; + goto got_it; + } + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_add_link: " + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (exofs_match(namelen, name, de)) + goto out_unlock; + name_len = EXOFS_DIR_REC_LEN(de->name_len); + rec_len = le16_to_cpu(de->rec_len); + if (!de->inode_no && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct exofs_dir_entry *) ((char *) de + rec_len); + } + unlock_page(page); + exofs_put_page(page); + } + + EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char *)de - (char *)page_address(page); + err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0, + &page, NULL); + if (err) + goto out_unlock; + if (de->inode_no) { + struct exofs_dir_entry *de1 = + (struct exofs_dir_entry *)((char *)de + name_len); + de1->rec_len = cpu_to_le16(rec_len - name_len); + de->rec_len = cpu_to_le16(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + err = exofs_commit_chunk(page, pos, rec_len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + sbi->s_numfiles++; + +out_put: + exofs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + char *kaddr = page_address(page); + unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1); + unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); + loff_t pos; + struct exofs_dir_entry *pde = NULL; + struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from); + int err; + + while (de < dir) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_delete_entry:" + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = exofs_next_entry(de); + } + if (pde) + from = (char *)pde - (char *)page_address(page); + pos = page_offset(page) + from; + lock_page(page); + err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, + &page, NULL); + if (err) + EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n", + err); + if (pde) + pde->rec_len = cpu_to_le16(to - from); + dir->inode_no = 0; + if (likely(!err)) + err = exofs_commit_chunk(page, pos, to - from); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty(inode); + sbi->s_numfiles--; +out: + exofs_put_page(page); + return err; +} + +/* kept aligned on 4 bytes */ +#define THIS_DIR ".\0\0" +#define PARENT_DIR "..\0" + +int exofs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = exofs_chunk_size(inode); + struct exofs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0, + &page, NULL); + if (err) { + unlock_page(page); + goto fail; + } + + kaddr = kmap_atomic(page, KM_USER0); + de = (struct exofs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1)); + memcpy(de->name, THIS_DIR, sizeof(THIS_DIR)); + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + + de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1)); + de->inode_no = cpu_to_le64(parent->i_ino); + memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); + exofs_set_de_type(de, inode); + kunmap_atomic(page, KM_USER0); + err = exofs_commit_chunk(page, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +int exofs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct exofs_dir_entry *de; + page = exofs_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct exofs_dir_entry *)kaddr; + kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_empty_dir: " + "zero-length directory entry" + "kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; + } + if (de->inode_no != 0) { + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (le64_to_cpu(de->inode_no) != + inode->i_ino) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = exofs_next_entry(de); + } + exofs_put_page(page); + } + return 1; + +not_empty: + exofs_put_page(page); + return 0; +} + +const struct file_operations exofs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = exofs_readdir, +}; diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h new file mode 100644 index 00000000000..0fd4c785967 --- /dev/null +++ b/fs/exofs/exofs.h @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/fs.h> +#include <linux/time.h> +#include "common.h" + +#ifndef __EXOFS_H__ +#define __EXOFS_H__ + +#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define EXOFS_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define EXOFS_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +/* + * our extension to the in-memory superblock + */ +struct exofs_sb_info { + struct osd_dev *s_dev; /* returned by get_osd_dev */ + osd_id s_pid; /* partition ID of file system*/ + int s_timeout; /* timeout for OSD operations */ + uint64_t s_nextid; /* highest object ID used */ + uint32_t s_numfiles; /* number of files on fs */ + spinlock_t s_next_gen_lock; /* spinlock for gen # update */ + u32 s_next_generation; /* next gen # to use */ + atomic_t s_curr_pending; /* number of pending commands */ + uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */ +}; + +/* + * our extension to the in-memory inode + */ +struct exofs_i_info { + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ + wait_queue_head_t i_wq; /* wait queue for inode */ + uint64_t i_commit_size; /* the object's written length */ + uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ + struct inode vfs_inode; /* normal in-memory inode */ +}; + +/* + * our inode flags + */ +#define OBJ_2BCREATED 0 /* object will be created soon*/ +#define OBJ_CREATED 1 /* object has been created on the osd*/ + +static inline int obj_2bcreated(struct exofs_i_info *oi) +{ + return test_bit(OBJ_2BCREATED, &oi->i_flags); +} + +static inline void set_obj_2bcreated(struct exofs_i_info *oi) +{ + set_bit(OBJ_2BCREATED, &oi->i_flags); +} + +static inline int obj_created(struct exofs_i_info *oi) +{ + return test_bit(OBJ_CREATED, &oi->i_flags); +} + +static inline void set_obj_created(struct exofs_i_info *oi) +{ + set_bit(OBJ_CREATED, &oi->i_flags); +} + +int __exofs_wait_obj_created(struct exofs_i_info *oi); +static inline int wait_obj_created(struct exofs_i_info *oi) +{ + if (likely(obj_created(oi))) + return 0; + + return __exofs_wait_obj_created(oi); +} + +/* + * get to our inode from the vfs inode + */ +static inline struct exofs_i_info *exofs_i(struct inode *inode) +{ + return container_of(inode, struct exofs_i_info, vfs_inode); +} + +/* + * Maximum count of links to a file + */ +#define EXOFS_LINK_MAX 32000 + +/************************* + * function declarations * + *************************/ +/* inode.c */ +void exofs_truncate(struct inode *inode); +int exofs_setattr(struct dentry *, struct iattr *); +int exofs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern struct inode *exofs_iget(struct super_block *, unsigned long); +struct inode *exofs_new_inode(struct inode *, int); +extern int exofs_write_inode(struct inode *, int); +extern void exofs_delete_inode(struct inode *); + +/* dir.c: */ +int exofs_add_link(struct dentry *, struct inode *); +ino_t exofs_inode_by_name(struct inode *, struct dentry *); +int exofs_delete_entry(struct exofs_dir_entry *, struct page *); +int exofs_make_empty(struct inode *, struct inode *); +struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *, + struct page **); +int exofs_empty_dir(struct inode *); +struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **); +ino_t exofs_parent_ino(struct dentry *child); +int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, + struct inode *); + +/********************* + * operation vectors * + *********************/ +/* dir.c: */ +extern const struct file_operations exofs_dir_operations; + +/* file.c */ +extern const struct inode_operations exofs_file_inode_operations; +extern const struct file_operations exofs_file_operations; + +/* inode.c */ +extern const struct address_space_operations exofs_aops; + +/* namei.c */ +extern const struct inode_operations exofs_dir_inode_operations; +extern const struct inode_operations exofs_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations exofs_symlink_inode_operations; +extern const struct inode_operations exofs_fast_symlink_inode_operations; + +#endif diff --git a/fs/exofs/file.c b/fs/exofs/file.c new file mode 100644 index 00000000000..6ed7fe48475 --- /dev/null +++ b/fs/exofs/file.c @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/buffer_head.h> + +#include "exofs.h" + +static int exofs_release_file(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int exofs_file_fsync(struct file *filp, struct dentry *dentry, + int datasync) +{ + int ret; + struct address_space *mapping = filp->f_mapping; + + ret = filemap_write_and_wait(mapping); + if (ret) + return ret; + + /*Note: file_fsync below also calles sync_blockdev, which is a no-op + * for exofs, but other then that it does sync_inode and + * sync_superblock which is what we need here. + */ + return file_fsync(filp, dentry, datasync); +} + +static int exofs_flush(struct file *file, fl_owner_t id) +{ + exofs_file_fsync(file, file->f_path.dentry, 1); + /* TODO: Flush the OSD target */ + return 0; +} + +const struct file_operations exofs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .open = generic_file_open, + .release = exofs_release_file, + .fsync = exofs_file_fsync, + .flush = exofs_flush, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +const struct inode_operations exofs_file_inode_operations = { + .truncate = exofs_truncate, + .setattr = exofs_setattr, +}; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c new file mode 100644 index 00000000000..ba8d9fab469 --- /dev/null +++ b/fs/exofs/inode.c @@ -0,0 +1,1303 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/writeback.h> +#include <linux/buffer_head.h> +#include <scsi/scsi_device.h> + +#include "exofs.h" + +#ifdef CONFIG_EXOFS_DEBUG +# define EXOFS_DEBUG_OBJ_ISIZE 1 +#endif + +struct page_collect { + struct exofs_sb_info *sbi; + struct request_queue *req_q; + struct inode *inode; + unsigned expected_pages; + + struct bio *bio; + unsigned nr_pages; + unsigned long length; + loff_t pg_first; /* keep 64bit also in 32-arches */ +}; + +static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, + struct inode *inode) +{ + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue; + + pcol->sbi = sbi; + pcol->req_q = req_q; + pcol->inode = inode; + pcol->expected_pages = expected_pages; + + pcol->bio = NULL; + pcol->nr_pages = 0; + pcol->length = 0; + pcol->pg_first = -1; + + EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino, + expected_pages); +} + +static void _pcol_reset(struct page_collect *pcol) +{ + pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); + + pcol->bio = NULL; + pcol->nr_pages = 0; + pcol->length = 0; + pcol->pg_first = -1; + EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n", + pcol->inode->i_ino, pcol->expected_pages); + + /* this is probably the end of the loop but in writes + * it might not end here. don't be left with nothing + */ + if (!pcol->expected_pages) + pcol->expected_pages = 128; +} + +static int pcol_try_alloc(struct page_collect *pcol) +{ + int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES); + + for (; pages; pages >>= 1) { + pcol->bio = bio_alloc(GFP_KERNEL, pages); + if (likely(pcol->bio)) + return 0; + } + + EXOFS_ERR("Failed to kcalloc expected_pages=%u\n", + pcol->expected_pages); + return -ENOMEM; +} + +static void pcol_free(struct page_collect *pcol) +{ + bio_put(pcol->bio); + pcol->bio = NULL; +} + +static int pcol_add_page(struct page_collect *pcol, struct page *page, + unsigned len) +{ + int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0); + if (unlikely(len != added_len)) + return -ENOMEM; + + ++pcol->nr_pages; + pcol->length += len; + return 0; +} + +static int update_read_page(struct page *page, int ret) +{ + if (ret == 0) { + /* Everything is OK */ + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } else if (ret == -EFAULT) { + /* In this case we were trying to read something that wasn't on + * disk yet - return a page full of zeroes. This should be OK, + * because the object should be empty (if there was a write + * before this read, the read would be waiting with the page + * locked */ + clear_highpage(page); + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + ret = 0; /* recovered error */ + EXOFS_DBGMSG("recovered read error\n"); + } else /* Error */ + SetPageError(page); + + return ret; +} + +static void update_write_page(struct page *page, int ret) +{ + if (ret) { + mapping_set_error(page->mapping, ret); + SetPageError(page); + } + end_page_writeback(page); +} + +/* Called at the end of reads, to optionally unlock pages and update their + * status. + */ +static int __readpages_done(struct osd_request *or, struct page_collect *pcol, + bool do_unlock) +{ + struct bio_vec *bvec; + int i; + u64 resid; + u64 good_bytes; + u64 length = 0; + int ret = exofs_check_ok_resid(or, &resid, NULL); + + osd_end_request(or); + + if (likely(!ret)) + good_bytes = pcol->length; + else if (!resid) + good_bytes = 0; + else + good_bytes = pcol->length - resid; + + EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx" + " length=0x%lx nr_pages=%u\n", + pcol->inode->i_ino, _LLU(good_bytes), pcol->length, + pcol->nr_pages); + + __bio_for_each_segment(bvec, pcol->bio, i, 0) { + struct page *page = bvec->bv_page; + struct inode *inode = page->mapping->host; + int page_stat; + + if (inode != pcol->inode) + continue; /* osd might add more pages at end */ + + if (likely(length < good_bytes)) + page_stat = 0; + else + page_stat = ret; + + EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n", + inode->i_ino, page->index, + page_stat ? "bad_bytes" : "good_bytes"); + + ret = update_read_page(page, page_stat); + if (do_unlock) + unlock_page(page); + length += bvec->bv_len; + } + + pcol_free(pcol); + EXOFS_DBGMSG("readpages_done END\n"); + return ret; +} + +/* callback of async reads */ +static void readpages_done(struct osd_request *or, void *p) +{ + struct page_collect *pcol = p; + + __readpages_done(or, pcol, true); + atomic_dec(&pcol->sbi->s_curr_pending); + kfree(p); +} + +static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) +{ + struct bio_vec *bvec; + int i; + + __bio_for_each_segment(bvec, pcol->bio, i, 0) { + struct page *page = bvec->bv_page; + + if (rw == READ) + update_read_page(page, ret); + else + update_write_page(page, ret); + + unlock_page(page); + } + pcol_free(pcol); +} + +static int read_exec(struct page_collect *pcol, bool is_sync) +{ + struct exofs_i_info *oi = exofs_i(pcol->inode); + struct osd_obj_id obj = {pcol->sbi->s_pid, + pcol->inode->i_ino + EXOFS_OBJ_OFF}; + struct osd_request *or = NULL; + struct page_collect *pcol_copy = NULL; + loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT; + int ret; + + if (!pcol->bio) + return 0; + + /* see comment in _readpage() about sync reads */ + WARN_ON(is_sync && (pcol->nr_pages != 1)); + + or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + ret = -ENOMEM; + goto err; + } + + osd_req_read(or, &obj, pcol->bio, i_start); + + if (is_sync) { + exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred); + return __readpages_done(or, pcol, false); + } + + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); + if (!pcol_copy) { + ret = -ENOMEM; + goto err; + } + + *pcol_copy = *pcol; + ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred); + if (unlikely(ret)) + goto err; + + atomic_inc(&pcol->sbi->s_curr_pending); + + EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", + obj.id, _LLU(i_start), pcol->length); + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + return 0; + +err: + if (!is_sync) + _unlock_pcol_pages(pcol, ret, READ); + kfree(pcol_copy); + if (or) + osd_end_request(or); + return ret; +} + +/* readpage_strip is called either directly from readpage() or by the VFS from + * within read_cache_pages(), to add one more page to be read. It will try to + * collect as many contiguous pages as posible. If a discontinuity is + * encountered, or it runs out of resources, it will submit the previous segment + * and will start a new collection. Eventually caller must submit the last + * segment if present. + */ +static int readpage_strip(void *data, struct page *page) +{ + struct page_collect *pcol = data; + struct inode *inode = pcol->inode; + struct exofs_i_info *oi = exofs_i(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t len; + int ret; + + /* FIXME: Just for debugging, will be removed */ + if (PageUptodate(page)) + EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, + page->index); + + if (page->index < end_index) + len = PAGE_CACHE_SIZE; + else if (page->index == end_index) + len = i_size & ~PAGE_CACHE_MASK; + else + len = 0; + + if (!len || !obj_created(oi)) { + /* this will be out of bounds, or doesn't exist yet. + * Current page is cleared and the request is split + */ + clear_highpage(page); + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + + unlock_page(page); + EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," + " splitting\n", inode->i_ino, page->index); + + return read_exec(pcol, false); + } + +try_again: + + if (unlikely(pcol->pg_first == -1)) { + pcol->pg_first = page->index; + } else if (unlikely((pcol->pg_first + pcol->nr_pages) != + page->index)) { + /* Discontinuity detected, split the request */ + ret = read_exec(pcol, false); + if (unlikely(ret)) + goto fail; + goto try_again; + } + + if (!pcol->bio) { + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + goto fail; + } + + if (len != PAGE_CACHE_SIZE) + zero_user(page, len, PAGE_CACHE_SIZE - len); + + EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", + inode->i_ino, page->index, len); + + ret = pcol_add_page(pcol, page, len); + if (ret) { + EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p " + "this_len=0x%zx nr_pages=%u length=0x%lx\n", + page, len, pcol->nr_pages, pcol->length); + + /* split the request, and start again with current page */ + ret = read_exec(pcol, false); + if (unlikely(ret)) + goto fail; + + goto try_again; + } + + return 0; + +fail: + /* SetPageError(page); ??? */ + unlock_page(page); + return ret; +} + +static int exofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, nr_pages, mapping->host); + + ret = read_cache_pages(mapping, pages, readpage_strip, &pcol); + if (ret) { + EXOFS_ERR("read_cache_pages => %d\n", ret); + return ret; + } + + return read_exec(&pcol, false); +} + +static int _readpage(struct page *page, bool is_sync) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, 1, page->mapping->host); + + /* readpage_strip might call read_exec(,async) inside at several places + * but this is safe for is_async=0 since read_exec will not do anything + * when we have a single page. + */ + ret = readpage_strip(&pcol, page); + if (ret) { + EXOFS_ERR("_readpage => %d\n", ret); + return ret; + } + + return read_exec(&pcol, is_sync); +} + +/* + * We don't need the file + */ +static int exofs_readpage(struct file *file, struct page *page) +{ + return _readpage(page, false); +} + +/* Callback for osd_write. All writes are asynchronouse */ +static void writepages_done(struct osd_request *or, void *p) +{ + struct page_collect *pcol = p; + struct bio_vec *bvec; + int i; + u64 resid; + u64 good_bytes; + u64 length = 0; + + int ret = exofs_check_ok_resid(or, NULL, &resid); + + osd_end_request(or); + atomic_dec(&pcol->sbi->s_curr_pending); + + if (likely(!ret)) + good_bytes = pcol->length; + else if (!resid) + good_bytes = 0; + else + good_bytes = pcol->length - resid; + + EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx" + " length=0x%lx nr_pages=%u\n", + pcol->inode->i_ino, _LLU(good_bytes), pcol->length, + pcol->nr_pages); + + __bio_for_each_segment(bvec, pcol->bio, i, 0) { + struct page *page = bvec->bv_page; + struct inode *inode = page->mapping->host; + int page_stat; + + if (inode != pcol->inode) + continue; /* osd might add more pages to a bio */ + + if (likely(length < good_bytes)) + page_stat = 0; + else + page_stat = ret; + + update_write_page(page, page_stat); + unlock_page(page); + EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n", + inode->i_ino, page->index, page_stat); + + length += bvec->bv_len; + } + + pcol_free(pcol); + kfree(pcol); + EXOFS_DBGMSG("writepages_done END\n"); +} + +static int write_exec(struct page_collect *pcol) +{ + struct exofs_i_info *oi = exofs_i(pcol->inode); + struct osd_obj_id obj = {pcol->sbi->s_pid, + pcol->inode->i_ino + EXOFS_OBJ_OFF}; + struct osd_request *or = NULL; + struct page_collect *pcol_copy = NULL; + loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT; + int ret; + + if (!pcol->bio) + return 0; + + or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("write_exec: Faild to osd_start_request()\n"); + ret = -ENOMEM; + goto err; + } + + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); + if (!pcol_copy) { + EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); + ret = -ENOMEM; + goto err; + } + + *pcol_copy = *pcol; + + osd_req_write(or, &obj, pcol_copy->bio, i_start); + ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred); + if (unlikely(ret)) { + EXOFS_ERR("write_exec: exofs_async_op() Faild\n"); + goto err; + } + + atomic_inc(&pcol->sbi->s_curr_pending); + EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", + pcol->inode->i_ino, pcol->pg_first, _LLU(i_start), + pcol->length); + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + return 0; + +err: + _unlock_pcol_pages(pcol, ret, WRITE); + kfree(pcol_copy); + if (or) + osd_end_request(or); + return ret; +} + +/* writepage_strip is called either directly from writepage() or by the VFS from + * within write_cache_pages(), to add one more page to be written to storage. + * It will try to collect as many contiguous pages as possible. If a + * discontinuity is encountered or it runs out of resources it will submit the + * previous segment and will start a new collection. + * Eventually caller must submit the last segment if present. + */ +static int writepage_strip(struct page *page, + struct writeback_control *wbc_unused, void *data) +{ + struct page_collect *pcol = data; + struct inode *inode = pcol->inode; + struct exofs_i_info *oi = exofs_i(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t len; + int ret; + + BUG_ON(!PageLocked(page)); + + ret = wait_obj_created(oi); + if (unlikely(ret)) + goto fail; + + if (page->index < end_index) + /* in this case, the page is within the limits of the file */ + len = PAGE_CACHE_SIZE; + else { + len = i_size & ~PAGE_CACHE_MASK; + + if (page->index > end_index || !len) { + /* in this case, the page is outside the limits + * (truncate in progress) + */ + ret = write_exec(pcol); + if (unlikely(ret)) + goto fail; + if (PageError(page)) + ClearPageError(page); + unlock_page(page); + return 0; + } + } + +try_again: + + if (unlikely(pcol->pg_first == -1)) { + pcol->pg_first = page->index; + } else if (unlikely((pcol->pg_first + pcol->nr_pages) != + page->index)) { + /* Discontinuity detected, split the request */ + ret = write_exec(pcol); + if (unlikely(ret)) + goto fail; + goto try_again; + } + + if (!pcol->bio) { + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + goto fail; + } + + EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", + inode->i_ino, page->index, len); + + ret = pcol_add_page(pcol, page, len); + if (unlikely(ret)) { + EXOFS_DBGMSG("Failed pcol_add_page " + "nr_pages=%u total_length=0x%lx\n", + pcol->nr_pages, pcol->length); + + /* split the request, next loop will start again */ + ret = write_exec(pcol); + if (unlikely(ret)) { + EXOFS_DBGMSG("write_exec faild => %d", ret); + goto fail; + } + + goto try_again; + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + return 0; + +fail: + set_bit(AS_EIO, &page->mapping->flags); + unlock_page(page); + return ret; +} + +static int exofs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct page_collect pcol; + long start, end, expected_pages; + int ret; + + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = (wbc->range_end == LLONG_MAX) ? + start + mapping->nrpages : + wbc->range_end >> PAGE_CACHE_SHIFT; + + if (start || end) + expected_pages = min(end - start + 1, 32L); + else + expected_pages = mapping->nrpages; + + EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx" + " m->nrpages=%lu start=0x%lx end=0x%lx\n", + mapping->host->i_ino, wbc->range_start, wbc->range_end, + mapping->nrpages, start, end); + + _pcol_init(&pcol, expected_pages, mapping->host); + + ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); + if (ret) { + EXOFS_ERR("write_cache_pages => %d\n", ret); + return ret; + } + + return write_exec(&pcol); +} + +static int exofs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, 1, page->mapping->host); + + ret = writepage_strip(page, NULL, &pcol); + if (ret) { + EXOFS_ERR("exofs_writepage => %d\n", ret); + return ret; + } + + return write_exec(&pcol); +} + +int exofs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret = 0; + struct page *page; + + page = *pagep; + if (page == NULL) { + ret = simple_write_begin(file, mapping, pos, len, flags, pagep, + fsdata); + if (ret) { + EXOFS_DBGMSG("simple_write_begin faild\n"); + return ret; + } + + page = *pagep; + } + + /* read modify write */ + if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + ret = _readpage(page, true); + if (ret) { + /*SetPageError was done by _readpage. Is it ok?*/ + unlock_page(page); + EXOFS_DBGMSG("__readpage_filler faild\n"); + } + } + + return ret; +} + +static int exofs_write_begin_export(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + + return exofs_write_begin(file, mapping, pos, len, flags, pagep, + fsdata); +} + +const struct address_space_operations exofs_aops = { + .readpage = exofs_readpage, + .readpages = exofs_readpages, + .writepage = exofs_writepage, + .writepages = exofs_writepages, + .write_begin = exofs_write_begin_export, + .write_end = simple_write_end, +}; + +/****************************************************************************** + * INODE OPERATIONS + *****************************************************************************/ + +/* + * Test whether an inode is a fast symlink. + */ +static inline int exofs_inode_is_fast_symlink(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + + return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); +} + +/* + * get_block_t - Fill in a buffer_head + * An OSD takes care of block allocation so we just fake an allocation by + * putting in the inode's sector_t in the buffer_head. + * TODO: What about the case of create==0 and @iblock does not exist in the + * object? + */ +static int exofs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + map_bh(bh_result, inode->i_sb, iblock); + return 0; +} + +const struct osd_attr g_attr_logical_length = ATTR_DEF( + OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); + +/* + * Truncate a file to the specified size - all we have to do is set the size + * attribute. We make sure the object exists first. + */ +void exofs_truncate(struct inode *inode) +{ + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + struct exofs_i_info *oi = exofs_i(inode); + struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; + struct osd_request *or; + struct osd_attr attr; + loff_t isize = i_size_read(inode); + __be64 newsize; + int ret; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + return; + if (exofs_inode_is_fast_symlink(inode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + nobh_truncate_page(inode->i_mapping, isize, exofs_get_block); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n"); + goto fail; + } + + osd_req_set_attributes(or, &obj); + + newsize = cpu_to_be64((u64)isize); + attr = g_attr_logical_length; + attr.val_ptr = &newsize; + osd_req_add_set_attr_list(or, &attr, 1); + + /* if we are about to truncate an object, and it hasn't been + * created yet, wait + */ + if (unlikely(wait_obj_created(oi))) + goto fail; + + ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); + osd_end_request(or); + if (ret) + goto fail; + +out: + mark_inode_dirty(inode); + return; +fail: + make_bad_inode(inode); + goto out; +} + +/* + * Set inode attributes - just call generic functions. + */ +int exofs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + error = inode_setattr(inode, iattr); + return error; +} + +/* + * Read an inode from the OSD, and return it as is. We also return the size + * attribute in the 'sanity' argument if we got compiled with debugging turned + * on. + */ +static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, + struct exofs_fcb *inode, uint64_t *sanity) +{ + struct exofs_sb_info *sbi = sb->s_fs_info; + struct osd_request *or; + struct osd_attr attr; + struct osd_obj_id obj = {sbi->s_pid, + oi->vfs_inode.i_ino + EXOFS_OBJ_OFF}; + int ret; + + exofs_make_credential(oi->i_cred, &obj); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n"); + return -ENOMEM; + } + osd_req_get_attributes(or, &obj); + + /* we need the inode attribute */ + osd_req_add_get_attr_list(or, &g_attr_inode_data, 1); + +#ifdef EXOFS_DEBUG_OBJ_ISIZE + /* we get the size attributes to do a sanity check */ + osd_req_add_get_attr_list(or, &g_attr_logical_length, 1); +#endif + + ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); + if (ret) + goto out; + + attr = g_attr_inode_data; + ret = extract_attr_from_req(or, &attr); + if (ret) { + EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n"); + goto out; + } + + WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE); + memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE); + +#ifdef EXOFS_DEBUG_OBJ_ISIZE + attr = g_attr_logical_length; + ret = extract_attr_from_req(or, &attr); + if (ret) { + EXOFS_ERR("ERROR: extract attr from or failed\n"); + goto out; + } + *sanity = get_unaligned_be64(attr.val_ptr); +#endif + +out: + osd_end_request(or); + return ret; +} + +/* + * Fill in an inode read from the OSD and set it up for use + */ +struct inode *exofs_iget(struct super_block *sb, unsigned long ino) +{ + struct exofs_i_info *oi; + struct exofs_fcb fcb; + struct inode *inode; + uint64_t uninitialized_var(sanity); + int ret; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + oi = exofs_i(inode); + + /* read the inode from the osd */ + ret = exofs_get_inode(sb, oi, &fcb, &sanity); + if (ret) + goto bad_inode; + + init_waitqueue_head(&oi->i_wq); + set_obj_created(oi); + + /* copy stuff from on-disk struct to in-memory struct */ + inode->i_mode = le16_to_cpu(fcb.i_mode); + inode->i_uid = le32_to_cpu(fcb.i_uid); + inode->i_gid = le32_to_cpu(fcb.i_gid); + inode->i_nlink = le16_to_cpu(fcb.i_links_count); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); + inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); + inode->i_ctime.tv_nsec = + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; + oi->i_commit_size = le64_to_cpu(fcb.i_size); + i_size_write(inode, oi->i_commit_size); + inode->i_blkbits = EXOFS_BLKSHIFT; + inode->i_generation = le32_to_cpu(fcb.i_generation); + +#ifdef EXOFS_DEBUG_OBJ_ISIZE + if ((inode->i_size != sanity) && + (!exofs_inode_is_fast_symlink(inode))) { + EXOFS_ERR("WARNING: Size of object from inode and " + "attributes differ (%lld != %llu)\n", + inode->i_size, _LLU(sanity)); + } +#endif + + oi->i_dir_start_lookup = 0; + + if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { + ret = -ESTALE; + goto bad_inode; + } + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (fcb.i_data[0]) + inode->i_rdev = + old_decode_dev(le32_to_cpu(fcb.i_data[0])); + else + inode->i_rdev = + new_decode_dev(le32_to_cpu(fcb.i_data[1])); + } else { + memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); + } + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &exofs_file_inode_operations; + inode->i_fop = &exofs_file_operations; + inode->i_mapping->a_ops = &exofs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &exofs_dir_inode_operations; + inode->i_fop = &exofs_dir_operations; + inode->i_mapping->a_ops = &exofs_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (exofs_inode_is_fast_symlink(inode)) + inode->i_op = &exofs_fast_symlink_inode_operations; + else { + inode->i_op = &exofs_symlink_inode_operations; + inode->i_mapping->a_ops = &exofs_aops; + } + } else { + inode->i_op = &exofs_special_inode_operations; + if (fcb.i_data[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(fcb.i_data[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(fcb.i_data[1]))); + } + + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +int __exofs_wait_obj_created(struct exofs_i_info *oi) +{ + if (!obj_created(oi)) { + BUG_ON(!obj_2bcreated(oi)); + wait_event(oi->i_wq, obj_created(oi)); + } + return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; +} +/* + * Callback function from exofs_new_inode(). The important thing is that we + * set the obj_created flag so that other methods know that the object exists on + * the OSD. + */ +static void create_done(struct osd_request *or, void *p) +{ + struct inode *inode = p; + struct exofs_i_info *oi = exofs_i(inode); + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + int ret; + + ret = exofs_check_ok(or); + osd_end_request(or); + atomic_dec(&sbi->s_curr_pending); + + if (unlikely(ret)) { + EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", + _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF)); + make_bad_inode(inode); + } else + set_obj_created(oi); + + atomic_dec(&inode->i_count); + wake_up(&oi->i_wq); +} + +/* + * Set up a new inode and create an object for it on the OSD + */ +struct inode *exofs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb; + struct inode *inode; + struct exofs_i_info *oi; + struct exofs_sb_info *sbi; + struct osd_request *or; + struct osd_obj_id obj; + int ret; + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + oi = exofs_i(inode); + + init_waitqueue_head(&oi->i_wq); + set_obj_2bcreated(oi); + + sbi = sb->s_fs_info; + + sb->s_dirt = 1; + inode->i_uid = current->cred->fsuid; + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else { + inode->i_gid = current->cred->fsgid; + } + inode->i_mode = mode; + + inode->i_ino = sbi->s_nextid++; + inode->i_blkbits = EXOFS_BLKSHIFT; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + oi->i_commit_size = inode->i_size = 0; + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + insert_inode_hash(inode); + + mark_inode_dirty(inode); + + obj.partition = sbi->s_pid; + obj.id = inode->i_ino + EXOFS_OBJ_OFF; + exofs_make_credential(oi->i_cred, &obj); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("exofs_new_inode: osd_start_request failed\n"); + return ERR_PTR(-ENOMEM); + } + + osd_req_create_object(or, &obj); + + /* increment the refcount so that the inode will still be around when we + * reach the callback + */ + atomic_inc(&inode->i_count); + + ret = exofs_async_op(or, create_done, inode, oi->i_cred); + if (ret) { + atomic_dec(&inode->i_count); + osd_end_request(or); + return ERR_PTR(-EIO); + } + atomic_inc(&sbi->s_curr_pending); + + return inode; +} + +/* + * struct to pass two arguments to update_inode's callback + */ +struct updatei_args { + struct exofs_sb_info *sbi; + struct exofs_fcb fcb; +}; + +/* + * Callback function from exofs_update_inode(). + */ +static void updatei_done(struct osd_request *or, void *p) +{ + struct updatei_args *args = p; + + osd_end_request(or); + + atomic_dec(&args->sbi->s_curr_pending); + + kfree(args); +} + +/* + * Write the inode to the OSD. Just fill up the struct, and set the attribute + * synchronously or asynchronously depending on the do_sync flag. + */ +static int exofs_update_inode(struct inode *inode, int do_sync) +{ + struct exofs_i_info *oi = exofs_i(inode); + struct super_block *sb = inode->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; + struct osd_request *or; + struct osd_attr attr; + struct exofs_fcb *fcb; + struct updatei_args *args; + int ret; + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) + return -ENOMEM; + + fcb = &args->fcb; + + fcb->i_mode = cpu_to_le16(inode->i_mode); + fcb->i_uid = cpu_to_le32(inode->i_uid); + fcb->i_gid = cpu_to_le32(inode->i_gid); + fcb->i_links_count = cpu_to_le16(inode->i_nlink); + fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + oi->i_commit_size = i_size_read(inode); + fcb->i_size = cpu_to_le64(oi->i_commit_size); + fcb->i_generation = cpu_to_le32(inode->i_generation); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + fcb->i_data[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + fcb->i_data[1] = 0; + } else { + fcb->i_data[0] = 0; + fcb->i_data[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + fcb->i_data[2] = 0; + } + } else + memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n"); + ret = -ENOMEM; + goto free_args; + } + + osd_req_set_attributes(or, &obj); + + attr = g_attr_inode_data; + attr.val_ptr = fcb; + osd_req_add_set_attr_list(or, &attr, 1); + + if (!obj_created(oi)) { + EXOFS_DBGMSG("!obj_created\n"); + BUG_ON(!obj_2bcreated(oi)); + wait_event(oi->i_wq, obj_created(oi)); + EXOFS_DBGMSG("wait_event done\n"); + } + + if (do_sync) { + ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); + osd_end_request(or); + goto free_args; + } else { + args->sbi = sbi; + + ret = exofs_async_op(or, updatei_done, args, oi->i_cred); + if (ret) { + osd_end_request(or); + goto free_args; + } + atomic_inc(&sbi->s_curr_pending); + goto out; /* deallocation in updatei_done */ + } + +free_args: + kfree(args); +out: + EXOFS_DBGMSG("ret=>%d\n", ret); + return ret; +} + +int exofs_write_inode(struct inode *inode, int wait) +{ + return exofs_update_inode(inode, wait); +} + +/* + * Callback function from exofs_delete_inode() - don't have much cleaning up to + * do. + */ +static void delete_done(struct osd_request *or, void *p) +{ + struct exofs_sb_info *sbi; + osd_end_request(or); + sbi = p; + atomic_dec(&sbi->s_curr_pending); +} + +/* + * Called when the refcount of an inode reaches zero. We remove the object + * from the OSD here. We make sure the object was created before we try and + * delete it. + */ +void exofs_delete_inode(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + struct super_block *sb = inode->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; + struct osd_request *or; + int ret; + + truncate_inode_pages(&inode->i_data, 0); + + if (is_bad_inode(inode)) + goto no_delete; + + mark_inode_dirty(inode); + exofs_update_inode(inode, inode_needs_sync(inode)); + + inode->i_size = 0; + if (inode->i_blocks) + exofs_truncate(inode); + + clear_inode(inode); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n"); + return; + } + + osd_req_remove_object(or, &obj); + + /* if we are deleting an obj that hasn't been created yet, wait */ + if (!obj_created(oi)) { + BUG_ON(!obj_2bcreated(oi)); + wait_event(oi->i_wq, obj_created(oi)); + } + + ret = exofs_async_op(or, delete_done, sbi, oi->i_cred); + if (ret) { + EXOFS_ERR( + "ERROR: @exofs_delete_inode exofs_async_op failed\n"); + osd_end_request(or); + return; + } + atomic_inc(&sbi->s_curr_pending); + + return; + +no_delete: + clear_inode(inode); +} diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c new file mode 100644 index 00000000000..77fdd765e76 --- /dev/null +++ b/fs/exofs/namei.c @@ -0,0 +1,342 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "exofs.h" + +static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = exofs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > EXOFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = exofs_inode_by_name(dir, dentry); + inode = NULL; + if (ino) { + inode = exofs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +static int exofs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode = exofs_new_inode(dir, mode); + int err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &exofs_file_inode_operations; + inode->i_fop = &exofs_file_operations; + inode->i_mapping->a_ops = &exofs_aops; + mark_inode_dirty(inode); + err = exofs_add_nondir(dentry, inode); + } + return err; +} + +static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + inode = exofs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + mark_inode_dirty(inode); + err = exofs_add_nondir(dentry, inode); + } + return err; +} + +static int exofs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct super_block *sb = dir->i_sb; + int err = -ENAMETOOLONG; + unsigned l = strlen(symname)+1; + struct inode *inode; + struct exofs_i_info *oi; + + if (l > sb->s_blocksize) + goto out; + + inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + oi = exofs_i(inode); + if (l > sizeof(oi->i_data)) { + /* slow symlink */ + inode->i_op = &exofs_symlink_inode_operations; + inode->i_mapping->a_ops = &exofs_aops; + memset(oi->i_data, 0, sizeof(oi->i_data)); + + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + } else { + /* fast symlink */ + inode->i_op = &exofs_fast_symlink_inode_operations; + memcpy(oi->i_data, symname, l); + inode->i_size = l-1; + } + mark_inode_dirty(inode); + + err = exofs_add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int exofs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= EXOFS_LINK_MAX) + return -EMLINK; + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + atomic_inc(&inode->i_count); + + return exofs_add_nondir(dentry, inode); +} + +static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + int err = -EMLINK; + + if (dir->i_nlink >= EXOFS_LINK_MAX) + goto out; + + inode_inc_link_count(dir); + + inode = exofs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &exofs_dir_inode_operations; + inode->i_fop = &exofs_dir_operations; + inode->i_mapping->a_ops = &exofs_aops; + + inode_inc_link_count(inode); + + err = exofs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = exofs_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int exofs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct exofs_dir_entry *de; + struct page *page; + int err = -ENOENT; + + de = exofs_find_entry(dir, dentry, &page); + if (!de) + goto out; + + err = exofs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + return err; +} + +static int exofs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = -ENOTEMPTY; + + if (exofs_empty_dir(inode)) { + err = exofs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + return err; +} + +static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct exofs_dir_entry *dir_de = NULL; + struct page *old_page; + struct exofs_dir_entry *old_de; + int err = -ENOENT; + + old_de = exofs_find_entry(old_dir, old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = exofs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct exofs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !exofs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = exofs_find_entry(new_dir, new_dentry, &new_page); + if (!new_de) + goto out_dir; + inode_inc_link_count(old_inode); + err = exofs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + if (err) + goto out_dir; + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= EXOFS_LINK_MAX) + goto out_dir; + } + inode_inc_link_count(old_inode); + err = exofs_add_link(new_dentry, old_inode); + if (err) { + inode_dec_link_count(old_inode); + goto out_dir; + } + if (dir_de) + inode_inc_link_count(new_dir); + } + + old_inode->i_ctime = CURRENT_TIME; + + exofs_delete_entry(old_de, old_page); + inode_dec_link_count(old_inode); + + if (dir_de) { + err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + if (err) + goto out_dir; + } + return 0; + + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +const struct inode_operations exofs_dir_inode_operations = { + .create = exofs_create, + .lookup = exofs_lookup, + .link = exofs_link, + .unlink = exofs_unlink, + .symlink = exofs_symlink, + .mkdir = exofs_mkdir, + .rmdir = exofs_rmdir, + .mknod = exofs_mknod, + .rename = exofs_rename, + .setattr = exofs_setattr, +}; + +const struct inode_operations exofs_special_inode_operations = { + .setattr = exofs_setattr, +}; diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c new file mode 100644 index 00000000000..b249ae97fb1 --- /dev/null +++ b/fs/exofs/osd.c @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <scsi/scsi_device.h> +#include <scsi/osd_sense.h> + +#include "exofs.h" + +int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid) +{ + struct osd_sense_info osi; + int ret = osd_req_decode_sense(or, &osi); + + if (ret) { /* translate to Linux codes */ + if (osi.additional_code == scsi_invalid_field_in_cdb) { + if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE) + ret = -EFAULT; + if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID) + ret = -ENOENT; + else + ret = -EINVAL; + } else if (osi.additional_code == osd_quota_error) + ret = -ENOSPC; + else + ret = -EIO; + } + + /* FIXME: should be include in osd_sense_info */ + if (in_resid) + *in_resid = or->in.req ? or->in.req->data_len : 0; + + if (out_resid) + *out_resid = or->out.req ? or->out.req->data_len : 0; + + return ret; +} + +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) +{ + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); +} + +/* + * Perform a synchronous OSD operation. + */ +int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential) +{ + int ret; + + or->timeout = timeout; + ret = osd_finalize_request(or, 0, credential, NULL); + if (ret) { + EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); + return ret; + } + + ret = osd_execute_request(or); + + if (ret) + EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); + /* osd_req_decode_sense(or, ret); */ + return ret; +} + +/* + * Perform an asynchronous OSD operation. + */ +int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done, + void *caller_context, u8 *cred) +{ + int ret; + + ret = osd_finalize_request(or, 0, cred, NULL); + if (ret) { + EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); + return ret; + } + + ret = osd_execute_request_async(or, async_done, caller_context); + + if (ret) + EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret); + return ret; +} + +int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr) +{ + struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ + void *iter = NULL; + int nelem; + + do { + nelem = 1; + osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter); + if ((cur_attr.attr_page == attr->attr_page) && + (cur_attr.attr_id == attr->attr_id)) { + attr->len = cur_attr.len; + attr->val_ptr = cur_attr.val_ptr; + return 0; + } + } while (iter); + + return -EIO; +} + +int osd_req_read_kern(struct osd_request *or, + const struct osd_obj_id *obj, u64 offset, void* buff, u64 len) +{ + struct request_queue *req_q = or->osd_dev->scsi_device->request_queue; + struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL); + + if (!bio) + return -ENOMEM; + + osd_req_read(or, obj, bio, offset); + return 0; +} + +int osd_req_write_kern(struct osd_request *or, + const struct osd_obj_id *obj, u64 offset, void* buff, u64 len) +{ + struct request_queue *req_q = or->osd_dev->scsi_device->request_queue; + struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL); + + if (!bio) + return -ENOMEM; + + osd_req_write(or, obj, bio, offset); + return 0; +} diff --git a/fs/exofs/super.c b/fs/exofs/super.c new file mode 100644 index 00000000000..9f1985e857e --- /dev/null +++ b/fs/exofs/super.c @@ -0,0 +1,584 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/string.h> +#include <linux/parser.h> +#include <linux/vfs.h> +#include <linux/random.h> +#include <linux/exportfs.h> + +#include "exofs.h" + +/****************************************************************************** + * MOUNT OPTIONS + *****************************************************************************/ + +/* + * struct to hold what we get from mount options + */ +struct exofs_mountopt { + const char *dev_name; + uint64_t pid; + int timeout; +}; + +/* + * exofs-specific mount-time options. + */ +enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err }; + +/* + * Our mount-time options. These should ideally be 64-bit unsigned, but the + * kernel's parsing functions do not currently support that. 32-bit should be + * sufficient for most applications now. + */ +static match_table_t tokens = { + {Opt_pid, "pid=%u"}, + {Opt_to, "to=%u"}, + {Opt_err, NULL} +}; + +/* + * The main option parsing method. Also makes sure that all of the mandatory + * mount options were set. + */ +static int parse_options(char *options, struct exofs_mountopt *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + bool s_pid = false; + + EXOFS_DBGMSG("parse_options %s\n", options); + /* defaults */ + memset(opts, 0, sizeof(*opts)); + opts->timeout = BLK_DEFAULT_SG_TIMEOUT; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + char str[32]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_pid: + if (0 == match_strlcpy(str, &args[0], sizeof(str))) + return -EINVAL; + opts->pid = simple_strtoull(str, NULL, 0); + if (opts->pid < EXOFS_MIN_PID) { + EXOFS_ERR("Partition ID must be >= %u", + EXOFS_MIN_PID); + return -EINVAL; + } + s_pid = 1; + break; + case Opt_to: + if (match_int(&args[0], &option)) + return -EINVAL; + if (option <= 0) { + EXOFS_ERR("Timout must be > 0"); + return -EINVAL; + } + opts->timeout = option * HZ; + break; + } + } + + if (!s_pid) { + EXOFS_ERR("Need to specify the following options:\n"); + EXOFS_ERR(" -o pid=pid_no_to_use\n"); + return -EINVAL; + } + + return 0; +} + +/****************************************************************************** + * INODE CACHE + *****************************************************************************/ + +/* + * Our inode cache. Isn't it pretty? + */ +static struct kmem_cache *exofs_inode_cachep; + +/* + * Allocate an inode in the cache + */ +static struct inode *exofs_alloc_inode(struct super_block *sb) +{ + struct exofs_i_info *oi; + + oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL); + if (!oi) + return NULL; + + oi->vfs_inode.i_version = 1; + return &oi->vfs_inode; +} + +/* + * Remove an inode from the cache + */ +static void exofs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); +} + +/* + * Initialize the inode + */ +static void exofs_init_once(void *foo) +{ + struct exofs_i_info *oi = foo; + + inode_init_once(&oi->vfs_inode); +} + +/* + * Create and initialize the inode cache + */ +static int init_inodecache(void) +{ + exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", + sizeof(struct exofs_i_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + exofs_init_once); + if (exofs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +/* + * Destroy the inode cache + */ +static void destroy_inodecache(void) +{ + kmem_cache_destroy(exofs_inode_cachep); +} + +/****************************************************************************** + * SUPERBLOCK FUNCTIONS + *****************************************************************************/ +static const struct super_operations exofs_sops; +static const struct export_operations exofs_export_ops; + +/* + * Write the superblock to the OSD + */ +static void exofs_write_super(struct super_block *sb) +{ + struct exofs_sb_info *sbi; + struct exofs_fscb *fscb; + struct osd_request *or; + struct osd_obj_id obj; + int ret; + + fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL); + if (!fscb) { + EXOFS_ERR("exofs_write_super: memory allocation failed.\n"); + return; + } + + lock_kernel(); + sbi = sb->s_fs_info; + fscb->s_nextid = cpu_to_le64(sbi->s_nextid); + fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); + fscb->s_magic = cpu_to_le16(sb->s_magic); + fscb->s_newfs = 0; + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("exofs_write_super: osd_start_request failed.\n"); + goto out; + } + + obj.partition = sbi->s_pid; + obj.id = EXOFS_SUPER_ID; + ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb)); + if (unlikely(ret)) { + EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n"); + goto out; + } + + ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred); + if (unlikely(ret)) { + EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n"); + goto out; + } + sb->s_dirt = 0; + +out: + if (or) + osd_end_request(or); + unlock_kernel(); + kfree(fscb); +} + +/* + * This function is called when the vfs is freeing the superblock. We just + * need to free our own part. + */ +static void exofs_put_super(struct super_block *sb) +{ + int num_pend; + struct exofs_sb_info *sbi = sb->s_fs_info; + + /* make sure there are no pending commands */ + for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; + num_pend = atomic_read(&sbi->s_curr_pending)) { + wait_queue_head_t wq; + init_waitqueue_head(&wq); + wait_event_timeout(wq, + (atomic_read(&sbi->s_curr_pending) == 0), + msecs_to_jiffies(100)); + } + + osduld_put_device(sbi->s_dev); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +/* + * Read the superblock from the OSD and fill in the fields + */ +static int exofs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *root; + struct exofs_mountopt *opts = data; + struct exofs_sb_info *sbi; /*extended info */ + struct exofs_fscb fscb; /*on-disk superblock info */ + struct osd_request *or = NULL; + struct osd_obj_id obj; + int ret; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + /* use mount options to fill superblock */ + sbi->s_dev = osduld_path_lookup(opts->dev_name); + if (IS_ERR(sbi->s_dev)) { + ret = PTR_ERR(sbi->s_dev); + sbi->s_dev = NULL; + goto free_sbi; + } + + sbi->s_pid = opts->pid; + sbi->s_timeout = opts->timeout; + + /* fill in some other data by hand */ + memset(sb->s_id, 0, sizeof(sb->s_id)); + strcpy(sb->s_id, "exofs"); + sb->s_blocksize = EXOFS_BLKSIZE; + sb->s_blocksize_bits = EXOFS_BLKSHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; + atomic_set(&sbi->s_curr_pending, 0); + sb->s_bdev = NULL; + sb->s_dev = 0; + + /* read data from on-disk superblock object */ + obj.partition = sbi->s_pid; + obj.id = EXOFS_SUPER_ID; + exofs_make_credential(sbi->s_cred, &obj); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + if (!silent) + EXOFS_ERR( + "exofs_fill_super: osd_start_request failed.\n"); + ret = -ENOMEM; + goto free_sbi; + } + ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb)); + if (unlikely(ret)) { + if (!silent) + EXOFS_ERR( + "exofs_fill_super: osd_req_read_kern failed.\n"); + ret = -ENOMEM; + goto free_sbi; + } + + ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred); + if (unlikely(ret)) { + if (!silent) + EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n"); + ret = -EIO; + goto free_sbi; + } + + sb->s_magic = le16_to_cpu(fscb.s_magic); + sbi->s_nextid = le64_to_cpu(fscb.s_nextid); + sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); + + /* make sure what we read from the object store is correct */ + if (sb->s_magic != EXOFS_SUPER_MAGIC) { + if (!silent) + EXOFS_ERR("ERROR: Bad magic value\n"); + ret = -EINVAL; + goto free_sbi; + } + + /* start generation numbers from a random point */ + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + /* set up operation vectors */ + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); + ret = PTR_ERR(root); + goto free_sbi; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + EXOFS_ERR("ERROR: get root inode failed\n"); + ret = -ENOMEM; + goto free_sbi; + } + + if (!S_ISDIR(root->i_mode)) { + dput(sb->s_root); + sb->s_root = NULL; + EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n", + root->i_mode); + ret = -EINVAL; + goto free_sbi; + } + + ret = 0; +out: + if (or) + osd_end_request(or); + return ret; + +free_sbi: + osduld_put_device(sbi->s_dev); /* NULL safe */ + kfree(sbi); + goto out; +} + +/* + * Set up the superblock (calls exofs_fill_super eventually) + */ +static int exofs_get_sb(struct file_system_type *type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + struct exofs_mountopt opts; + int ret; + + ret = parse_options(data, &opts); + if (ret) + return ret; + + opts.dev_name = dev_name; + return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt); +} + +/* + * Return information about the file system state in the buffer. This is used + * by the 'df' command, for example. + */ +static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct osd_obj_id obj = {sbi->s_pid, 0}; + struct osd_attr attrs[] = { + ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, + OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), + ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION, + OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)), + }; + uint64_t capacity = ULLONG_MAX; + uint64_t used = ULLONG_MAX; + struct osd_request *or; + uint8_t cred_a[OSD_CAP_LEN]; + int ret; + + /* get used/capacity attributes */ + exofs_make_credential(cred_a, &obj); + + or = osd_start_request(sbi->s_dev, GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n"); + return -ENOMEM; + } + + osd_req_get_attributes(or, &obj); + osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs)); + ret = exofs_sync_op(or, sbi->s_timeout, cred_a); + if (unlikely(ret)) + goto out; + + ret = extract_attr_from_req(or, &attrs[0]); + if (likely(!ret)) + capacity = get_unaligned_be64(attrs[0].val_ptr); + else + EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); + + ret = extract_attr_from_req(or, &attrs[1]); + if (likely(!ret)) + used = get_unaligned_be64(attrs[1].val_ptr); + else + EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n"); + + /* fill in the stats buffer */ + buf->f_type = EXOFS_SUPER_MAGIC; + buf->f_bsize = EXOFS_BLKSIZE; + buf->f_blocks = (capacity >> EXOFS_BLKSHIFT); + buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT); + buf->f_bavail = buf->f_bfree; + buf->f_files = sbi->s_numfiles; + buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; + buf->f_namelen = EXOFS_NAME_LEN; + +out: + osd_end_request(or); + return ret; +} + +static const struct super_operations exofs_sops = { + .alloc_inode = exofs_alloc_inode, + .destroy_inode = exofs_destroy_inode, + .write_inode = exofs_write_inode, + .delete_inode = exofs_delete_inode, + .put_super = exofs_put_super, + .write_super = exofs_write_super, + .statfs = exofs_statfs, +}; + +/****************************************************************************** + * EXPORT OPERATIONS + *****************************************************************************/ + +struct dentry *exofs_get_parent(struct dentry *child) +{ + unsigned long ino = exofs_parent_ino(child); + + if (!ino) + return NULL; + + return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino)); +} + +static struct inode *exofs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + inode = exofs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *exofs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + exofs_nfs_get_inode); +} + +static struct dentry *exofs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + exofs_nfs_get_inode); +} + +static const struct export_operations exofs_export_ops = { + .fh_to_dentry = exofs_fh_to_dentry, + .fh_to_parent = exofs_fh_to_parent, + .get_parent = exofs_get_parent, +}; + +/****************************************************************************** + * INSMOD/RMMOD + *****************************************************************************/ + +/* + * struct that describes this file system + */ +static struct file_system_type exofs_type = { + .owner = THIS_MODULE, + .name = "exofs", + .get_sb = exofs_get_sb, + .kill_sb = generic_shutdown_super, +}; + +static int __init init_exofs(void) +{ + int err; + + err = init_inodecache(); + if (err) + goto out; + + err = register_filesystem(&exofs_type); + if (err) + goto out_d; + + return 0; +out_d: + destroy_inodecache(); +out: + return err; +} + +static void __exit exit_exofs(void) +{ + unregister_filesystem(&exofs_type); + destroy_inodecache(); +} + +MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>"); +MODULE_DESCRIPTION("exofs"); +MODULE_LICENSE("GPL"); + +module_init(init_exofs) +module_exit(exit_exofs) diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c new file mode 100644 index 00000000000..36e2d7bc7f7 --- /dev/null +++ b/fs/exofs/symlink.c @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) + * Copyright (C) 2005, 2006 + * International Business Machines + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/namei.h> + +#include "exofs.h" + +static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct exofs_i_info *oi = exofs_i(dentry->d_inode); + + nd_set_link(nd, (char *)oi->i_data); + return NULL; +} + +const struct inode_operations exofs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; + +const struct inode_operations exofs_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = exofs_follow_link, +}; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index ae8c4f850b2..d46e38cb85c 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index b60bb241880..d81ef2fdb08 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 5853f4440af..3d724a95882 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = ext3_readdir, /* we take BKL. needed?*/ - .ioctl = ext3_ioctl, /* BKL held */ + .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, #endif diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 3be1e0689c9..521f8238b2f 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -112,7 +112,7 @@ const struct file_operations ext3_file_operations = { .write = do_sync_write, .aio_read = generic_file_aio_read, .aio_write = ext3_file_write, - .ioctl = ext3_ioctl, + .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, #endif diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 4a09ff16987..d3ef6566b01 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1149,12 +1149,15 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; - int ret, needed_blocks = ext3_writepage_trans_blocks(inode); + int ret; handle_t *handle; int retries = 0; struct page *page; pgoff_t index; unsigned from, to; + /* Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason */ + int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); @@ -1184,15 +1187,20 @@ retry: } write_begin_failed: if (ret) { - ext3_journal_stop(handle); - unlock_page(page); - page_cache_release(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before truncate + * finishes. */ if (pos + len > inode->i_size) + ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) vmtruncate(inode, inode->i_size); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) @@ -1211,6 +1219,18 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) return err; } +/* For ordered writepage and write_end functions */ +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +{ + /* + * Write could have mapped the buffer but it didn't copy the data in + * yet. So avoid filing such buffer into a transaction. + */ + if (buffer_mapped(bh) && buffer_uptodate(bh)) + return ext3_journal_dirty_data(handle, bh); + return 0; +} + /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { @@ -1221,26 +1241,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) } /* - * Generic write_end handler for ordered and writeback ext3 journal modes. - * We can't use generic_write_end, because that unlocks the page and we need to - * unlock the page after ext3_journal_stop, but ext3_journal_stop must run - * after block_write_end. + * This is nasty and subtle: ext3_write_begin() could have allocated blocks + * for the whole page but later we failed to copy the data in. Update inode + * size according to what we managed to copy. The rest is going to be + * truncated in write_end function. */ -static int ext3_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) { - struct inode *inode = file->f_mapping->host; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); + /* What matters to us is i_disksize. We don't write i_size anywhere */ + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + if (pos + copied > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = pos + copied; mark_inode_dirty(inode); } - - return copied; } /* @@ -1260,35 +1274,29 @@ static int ext3_ordered_write_end(struct file *file, unsigned from, to; int ret = 0, ret2; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + copied; ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, ext3_journal_dirty_data); + from, to, NULL, journal_dirty_data_fn); - if (ret == 0) { - /* - * generic_write_end() will run mark_inode_dirty() if i_size - * changes. So let's piggyback the i_disksize mark_inode_dirty - * into that. - */ - loff_t new_i_size; - - new_i_size = pos + copied; - if (new_i_size > EXT3_I(inode)->i_disksize) - EXT3_I(inode)->i_disksize = new_i_size; - ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (ret2 < 0) - ret = ret2; - } + if (ret == 0) + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size) + ext3_orphan_add(handle, inode); ret2 = ext3_journal_stop(handle); if (!ret) ret = ret2; unlock_page(page); page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); return ret ? ret : copied; } @@ -1299,25 +1307,22 @@ static int ext3_writeback_write_end(struct file *file, { handle_t *handle = ext3_journal_current_handle(); struct inode *inode = file->f_mapping->host; - int ret = 0, ret2; - loff_t new_i_size; - - new_i_size = pos + copied; - if (new_i_size > EXT3_I(inode)->i_disksize) - EXT3_I(inode)->i_disksize = new_i_size; - - ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (ret2 < 0) - ret = ret2; + int ret; - ret2 = ext3_journal_stop(handle); - if (!ret) - ret = ret2; + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size) + ext3_orphan_add(handle, inode); + ret = ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); return ret ? ret : copied; } @@ -1338,15 +1343,23 @@ static int ext3_journalled_write_end(struct file *file, if (copied < len) { if (!PageUptodate(page)) copied = 0; - page_zero_new_buffers(page, from+copied, to); + page_zero_new_buffers(page, from + copied, to); + to = from + copied; } ret = walk_page_buffers(handle, page_buffers(page), from, to, &partial, write_end_fn); if (!partial) SetPageUptodate(page); - if (pos+copied > inode->i_size) - i_size_write(inode, pos+copied); + + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size) + ext3_orphan_add(handle, inode); EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; if (inode->i_size > EXT3_I(inode)->i_disksize) { EXT3_I(inode)->i_disksize = inode->i_size; @@ -1361,6 +1374,8 @@ static int ext3_journalled_write_end(struct file *file, unlock_page(page); page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); return ret ? ret : copied; } @@ -1428,17 +1443,11 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) -{ - if (buffer_mapped(bh)) - return ext3_journal_dirty_data(handle, bh); - return 0; -} - static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) { return !buffer_mapped(bh); } + /* * Note that we always start a transaction even if we're not journalling * data. This is to preserve ordering: any hole instantiation within diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 5e86ce9a86e..88974814783 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -15,12 +15,11 @@ #include <linux/mount.h> #include <linux/time.h> #include <linux/compat.h> -#include <linux/smp_lock.h> #include <asm/uaccess.h> -int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, - unsigned long arg) +long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_dentry->d_inode; struct ext3_inode_info *ei = EXT3_I(inode); unsigned int flags; unsigned short rsv_window_size; @@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned int oldflags; unsigned int jflag; + if (!is_owner_or_cap(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); if (err) return err; - if (!is_owner_or_cap(inode)) { - err = -EACCES; - goto flags_out; - } - - if (get_user(flags, (int __user *) arg)) { - err = -EFAULT; - goto flags_out; - } - flags = ext3_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + err = -EPERM; + if (IS_NOQUOTA(inode)) goto flags_out; - } + oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + if (!capable(CAP_LINUX_IMMUTABLE)) goto flags_out; - } } /* @@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. */ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + if (!capable(CAP_SYS_RESOURCE)) goto flags_out; - } } - handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { - mutex_unlock(&inode->i_mutex); err = PTR_ERR(handle); goto flags_out; } @@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext3_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext3_journal_stop(handle); - if (err) { - mutex_unlock(&inode->i_mutex); - return err; - } + if (err) + goto flags_out; if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) err = ext3_change_inode_journal_flag(inode, jflag); - mutex_unlock(&inode->i_mutex); flags_out: + mutex_unlock(&inode->i_mutex); mnt_drop_write(filp->f_path.mnt); return err; } @@ -140,6 +125,7 @@ flags_out: if (!is_owner_or_cap(inode)) return -EPERM; + err = mnt_want_write(filp->f_path.mnt); if (err) return err; @@ -147,6 +133,7 @@ flags_out: err = -EFAULT; goto setversion_out; } + handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -299,9 +286,6 @@ group_add_out: #ifdef CONFIG_COMPAT long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; - /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT3_IOC32_GETFLAGS: @@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) default: return -ENOIOCTLCMD; } - lock_kernel(); - ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); - unlock_kernel(); - return ret; + return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index e2fc63cbba8..6ddaa0a42b2 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(struct qstr *entry, struct dx_frame *frame, int *err); static void dx_release (struct dx_frame *frames); -static int dx_make_map (struct ext3_dir_entry_2 *de, int size, +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, struct dx_map_entry *offsets, int count); -static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize); static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); static int ext3_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, @@ -708,14 +708,14 @@ errout: * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ -static int dx_make_map (struct ext3_dir_entry_2 *de, int size, - struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; - while ((char *) de < base + size) + while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { ext3fs_dirhash(de->name, de->name_len, &h); @@ -1047,8 +1047,16 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str return ERR_PTR(-EIO); } inode = ext3_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (unlikely(IS_ERR(inode))) { + if (PTR_ERR(inode) == -ESTALE) { + ext3_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + ino); + return ERR_PTR(-EIO); + } else { + return ERR_CAST(inode); + } + } } return d_splice_alias(inode, dentry); } @@ -1120,13 +1128,14 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ -static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize) { - struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; + struct ext3_dir_entry_2 *next, *to, *prev; + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base; unsigned rec_len = 0; prev = to = de; - while ((char*)de < base + size) { + while ((char *)de < base + blocksize) { next = ext3_next_entry(de); if (de->inode && de->name_len) { rec_len = EXT3_DIR_REC_LEN(de->name_len); diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 694ed6fadcc..647e0d65a28 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 38f40d55899..53c72ad8587 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, } static int ext4_group_used_meta_blocks(struct super_block *sb, - ext4_group_t block_group) + ext4_group_t block_group, + struct ext4_group_desc *gdp) { ext4_fsblk_t tmp; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb, int used_blocks = sbi->s_itb_per_group + 2; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { - struct ext4_group_desc *gdp; - struct buffer_head *bh; - - gdp = ext4_get_group_desc(sb, block_group, &bh); if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) used_blocks--; @@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); } - return free_blocks - ext4_group_used_meta_blocks(sb, block_group); + return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); } @@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + atomic_add(blocks_freed, + &sbi->s_flex_groups[flex_group].free_blocks); } /* * request to reload the buddy with the diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 2df2e40b01a..b64789929a6 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, unsigned int offset) { const char *error_msg = NULL; - const int rlen = ext4_rec_len_from_disk(de->rec_len); + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); if (rlen < EXT4_DIR_REC_LEN(1)) error_msg = "rec_len is smaller than minimal"; @@ -178,10 +179,11 @@ revalidate: * least that it is non-zero. A * failure will be detected in the * dirent test below. */ - if (ext4_rec_len_from_disk(de->rec_len) - < EXT4_DIR_REC_LEN(1)) + if (ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) break; - i += ext4_rec_len_from_disk(de->rec_len); + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); } offset = i; filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) @@ -203,7 +205,8 @@ revalidate: ret = stored; goto out; } - offset += ext4_rec_len_from_disk(de->rec_len); + offset += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); if (le32_to_cpu(de->inode)) { /* We might block in the next section * if the data destination is @@ -225,7 +228,8 @@ revalidate: goto revalidate; stored++; } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len); + filp->f_pos += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); } offset = 0; brelse(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6083bb38057..d0f15ef56de 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -33,14 +33,6 @@ #undef EXT4FS_DEBUG /* - * Define EXT4_RESERVATION to reserve data blocks for expanding files - */ -#define EXT4_DEFAULT_RESERVE_BLOCKS 8 -/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ -#define EXT4_MAX_RESERVE_BLOCKS 1027 -#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0 - -/* * Debug code */ #ifdef EXT4FS_DEBUG @@ -54,8 +46,6 @@ #define ext4_debug(f, a...) do {} while (0) #endif -#define EXT4_MULTIBLOCK_ALLOCATOR 1 - /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 1 /* blocks already reserved */ @@ -180,8 +170,9 @@ struct ext4_group_desc */ struct flex_groups { - __u32 free_inodes; - __u32 free_blocks; + atomic_t free_inodes; + atomic_t free_blocks; + atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ @@ -249,6 +240,30 @@ struct flex_groups { #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ #define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ + EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + /* * Inode dynamic state flags */ @@ -256,6 +271,7 @@ struct flex_groups { #define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ +#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { @@ -303,7 +319,9 @@ struct ext4_new_group_data { #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) #define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) /* * ioctl commands in 32 bit emulation @@ -531,7 +549,7 @@ do { \ #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ -#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ #define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ @@ -666,7 +684,8 @@ struct ext4_super_block { __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_reserved_char_pad2; __le16 s_reserved_pad; - __u32 s_reserved[162]; /* Padding to the end of the block */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __u32 s_reserved[160]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -814,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ /* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* * Structure of a directory entry */ #define EXT4_NAME_LEN 255 @@ -865,24 +890,6 @@ struct ext4_dir_entry_2 { ~EXT4_DIR_ROUND) #define EXT4_MAX_REC_LEN ((1<<16)-1) -static inline unsigned ext4_rec_len_from_disk(__le16 dlen) -{ - unsigned len = le16_to_cpu(dlen); - - if (len == EXT4_MAX_REC_LEN || len == 0) - return 1 << 16; - return len; -} - -static inline __le16 ext4_rec_len_to_disk(unsigned len) -{ - if (len == (1 << 16)) - return cpu_to_le16(EXT4_MAX_REC_LEN); - else if (len > (1 << 16)) - BUG(); - return cpu_to_le16(len); -} - /* * Hash Tree Directory indexing * (c) Daniel Phillips, 2001 @@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, extern struct proc_dir_entry *ext4_proc_root; -#ifdef CONFIG_PROC_FS -extern const struct file_operations ext4_ui_proc_fops; - -#define EXT4_PROC_HANDLER(name, var) \ -do { \ - proc = proc_create_data(name, mode, sbi->s_proc, \ - &ext4_ui_proc_fops, &sbi->s_##var); \ - if (proc == NULL) { \ - printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ - goto err_out; \ - } \ -} while (0) -#else -#define EXT4_PROC_HANDLER(name, var) -#endif - /* * Function prototypes */ @@ -1092,13 +1083,14 @@ extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t ext4_get_reserved_space(struct inode *inode); /* ioctl.c */ @@ -1107,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); + /* namei.c */ +extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize); +extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize); extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 18cb67b2cbb..f0c3ec85bd4 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index e69acc16f5c..4ce2187123a 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; -#define rsv_start rsv_window._rsv_start -#define rsv_end rsv_window._rsv_end - /* * storage for cached extent */ @@ -125,6 +122,9 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + /* ialloc */ + ext4_group_t i_last_alloc_group; + /* allocation reservation info for delalloc */ unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 039b6ea1a04..57b71fefbcc 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -62,12 +62,10 @@ struct ext4_sb_info { struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyblocks_counter; - struct blockgroup_lock s_blockgroup_lock; + struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; - - /* root of the per fs reservation window tree */ - spinlock_t s_rsv_window_lock; - struct rb_root s_rsv_window_root; + struct kobject s_kobj; + struct completion s_kobj_unregister; /* Journaling */ struct inode *s_journal_inode; @@ -146,6 +144,10 @@ struct ext4_sb_info { /* locality groups */ struct ext4_locality_group *s_locality_groups; + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; }; @@ -153,7 +155,7 @@ struct ext4_sb_info { static inline spinlock_t * sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) { - return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); } #endif /* _EXT4_SB */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e0aa4fe4f59..ac77d8b8251 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); int depth; if (path) { @@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); @@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth) return max; } -static int __ext4_ext_check_header(const char *function, struct inode *inode, +static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) +{ + ext4_fsblk_t block = ext_pblock(ext); + int len = ext4_ext_get_actual_len(ext); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + if (unlikely(block < le32_to_cpu(es->s_first_data_block) || + ((block + len) > ext4_blocks_count(es)))) + return 0; + else + return 1; +} + +static int ext4_valid_extent_idx(struct inode *inode, + struct ext4_extent_idx *ext_idx) +{ + ext4_fsblk_t block = idx_pblock(ext_idx); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + if (unlikely(block < le32_to_cpu(es->s_first_data_block) || + (block > ext4_blocks_count(es)))) + return 0; + else + return 1; +} + +static int ext4_valid_extent_entries(struct inode *inode, + struct ext4_extent_header *eh, + int depth) +{ + struct ext4_extent *ext; + struct ext4_extent_idx *ext_idx; + unsigned short entries; + if (eh->eh_entries == 0) + return 1; + + entries = le16_to_cpu(eh->eh_entries); + + if (depth == 0) { + /* leaf entries */ + ext = EXT_FIRST_EXTENT(eh); + while (entries) { + if (!ext4_valid_extent(inode, ext)) + return 0; + ext++; + entries--; + } + } else { + ext_idx = EXT_FIRST_INDEX(eh); + while (entries) { + if (!ext4_valid_extent_idx(inode, ext_idx)) + return 0; + ext_idx++; + entries--; + } + } + return 1; +} + +static int __ext4_ext_check(const char *function, struct inode *inode, struct ext4_extent_header *eh, int depth) { @@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode, error_msg = "invalid eh_entries"; goto corrupted; } + if (!ext4_valid_extent_entries(inode, eh, depth)) { + error_msg = "invalid extent entries"; + goto corrupted; + } return 0; corrupted: ext4_error(inode->i_sb, function, - "bad header in inode #%lu: %s - magic %x, " + "bad header/extent in inode #%lu: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), @@ -342,8 +426,13 @@ corrupted: return -EIO; } -#define ext4_ext_check_header(inode, eh, depth) \ - __ext4_ext_check_header(__func__, inode, eh, depth) +#define ext4_ext_check(inode, eh, depth) \ + __ext4_ext_check(__func__, inode, eh, depth) + +int ext4_ext_check_inode(struct inode *inode) +{ + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); +} #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_inode_hdr(inode); depth = ext_depth(inode); - if (ext4_ext_check_header(inode, eh, depth)) - return ERR_PTR(-EIO); - /* account possible depth increase */ if (!path) { @@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, i = depth; /* walk through the tree */ while (i) { + int need_to_validate = 0; + ext_debug("depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = sb_bread(inode->i_sb, path[ppos].p_block); - if (!bh) + bh = sb_getblk(inode->i_sb, path[ppos].p_block); + if (unlikely(!bh)) goto err; - + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto err; + } + /* validate the extent entries */ + need_to_validate = 1; + } eh = ext_block_hdr(bh); ppos++; BUG_ON(ppos > depth); @@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_hdr = eh; i--; - if (ext4_ext_check_header(inode, eh, i)) + if (need_to_validate && ext4_ext_check(inode, eh, i)) goto err; } @@ -1181,7 +1276,7 @@ got_index: return -EIO; eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -1194,7 +1289,7 @@ got_index: if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -2137,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); - if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { + if (ext4_ext_check(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; } @@ -2191,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) err = -EIO; break; } - if (ext4_ext_check_header(inode, ext_block_hdr(bh), + if (ext4_ext_check(inode, ext_block_hdr(bh), depth - i - 1)) { err = -EIO; break; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f731cb545a0..588af8c7724 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -33,9 +33,14 @@ */ static int ext4_release_file(struct inode *inode, struct file *filp) { + if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { + ext4_alloc_da_blocks(inode); + EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; + } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1)) + (atomic_read(&inode->i_writecount) == 1) && + !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index fb51b40e3e8..47b84e8df56 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; - ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", @@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (is_directory) { count = ext4_used_dirs_count(sb, gdp) - 1; ext4_used_dirs_set(sb, gdp, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f; + + f = ext4_flex_group(sbi, block_group); + atomic_dec(&sbi->s_flex_groups[f].free_inodes); + } + } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); @@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) percpu_counter_dec(&sbi->s_dirs_counter); if (sbi->s_log_groups_per_flex) { - flex_group = ext4_flex_group(sbi, block_group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_inodes++; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + ext4_group_t f; + + f = ext4_flex_group(sbi, block_group); + atomic_inc(&sbi->s_flex_groups[f].free_inodes); } } BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); @@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, sbi->s_log_groups_per_flex; find_close_to_parent: - flexbg_free_blocks = flex_group[best_flex].free_blocks; + flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; - if (flex_group[best_flex].free_inodes && + if (atomic_read(&flex_group[best_flex].free_inodes) && flex_freeb_ratio > free_block_ratio) goto found_flexbg; @@ -375,24 +381,24 @@ find_close_to_parent: if (i == parent_fbg_group || i == parent_fbg_group - 1) continue; - flexbg_free_blocks = flex_group[i].free_blocks; + flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; if (flex_freeb_ratio > free_block_ratio && - flex_group[i].free_inodes) { + (atomic_read(&flex_group[i].free_inodes))) { best_flex = i; goto found_flexbg; } - if (flex_group[best_flex].free_inodes == 0 || - (flex_group[i].free_blocks > - flex_group[best_flex].free_blocks && - flex_group[i].free_inodes)) + if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || + ((atomic_read(&flex_group[i].free_blocks) > + atomic_read(&flex_group[best_flex].free_blocks)) && + atomic_read(&flex_group[i].free_inodes))) best_flex = i; } - if (!flex_group[best_flex].free_inodes || - !flex_group[best_flex].free_blocks) + if (!atomic_read(&flex_group[best_flex].free_inodes) || + !atomic_read(&flex_group[best_flex].free_blocks)) return -1; found_flexbg: @@ -410,6 +416,42 @@ out: return 0; } +struct orlov_stats { + __u32 free_inodes; + __u32 free_blocks; + __u32 used_dirs; +}; + +/* + * Helper function for Orlov's allocator; returns critical information + * for a particular block group or flex_bg. If flex_size is 1, then g + * is a block group number; otherwise it is flex_bg number. + */ +void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) +{ + struct ext4_group_desc *desc; + struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; + + if (flex_size > 1) { + stats->free_inodes = atomic_read(&flex_group[g].free_inodes); + stats->free_blocks = atomic_read(&flex_group[g].free_blocks); + stats->used_dirs = atomic_read(&flex_group[g].used_dirs); + return; + } + + desc = ext4_get_group_desc(sb, g, NULL); + if (desc) { + stats->free_inodes = ext4_free_inodes_count(sb, desc); + stats->free_blocks = ext4_free_blks_count(sb, desc); + stats->used_dirs = ext4_used_dirs_count(sb, desc); + } else { + stats->free_inodes = 0; + stats->free_blocks = 0; + stats->used_dirs = 0; + } +} + /* * Orlov's allocator for directories. * @@ -425,35 +467,34 @@ out: * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or - * it's already running too large debt (max_debt). * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). - * - * Debt is incremented each time we allocate a directory and decremented - * when we allocate an inode, within 0--255. */ -#define INODE_COST 64 -#define BLOCK_COST 256 - static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group) + ext4_group_t *group, int mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; ext4_group_t ngroups = sbi->s_groups_count; int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext4_fsblk_t freeb, avefreeb; - ext4_fsblk_t blocks_per_dir; unsigned int ndirs; - int max_debt, max_dirs, min_inodes; + int max_dirs, min_inodes; ext4_grpblk_t min_blocks; - ext4_group_t i; + ext4_group_t i, grp, g; struct ext4_group_desc *desc; + struct orlov_stats stats; + int flex_size = ext4_flex_bg_size(sbi); + + if (flex_size > 1) { + ngroups = (ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + parent_group >>= sbi->s_log_groups_per_flex; + } freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; @@ -462,71 +503,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, do_div(avefreeb, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if ((parent == sb->s_root->d_inode) || - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { + if (S_ISDIR(mode) && + ((parent == sb->s_root->d_inode) || + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { int best_ndir = inodes_per_group; - ext4_group_t grp; int ret = -1; get_random_bytes(&grp, sizeof(grp)); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { - grp = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, grp, NULL); - if (!desc || !ext4_free_inodes_count(sb, desc)) + g = (parent_group + i) % ngroups; + get_orlov_stats(sb, g, flex_size, &stats); + if (!stats.free_inodes) continue; - if (ext4_used_dirs_count(sb, desc) >= best_ndir) + if (stats.used_dirs >= best_ndir) continue; - if (ext4_free_inodes_count(sb, desc) < avefreei) + if (stats.free_inodes < avefreei) continue; - if (ext4_free_blks_count(sb, desc) < avefreeb) + if (stats.free_blocks < avefreeb) continue; - *group = grp; + grp = g; ret = 0; - best_ndir = ext4_used_dirs_count(sb, desc); + best_ndir = stats.used_dirs; + } + if (ret) + goto fallback; + found_flex_bg: + if (flex_size == 1) { + *group = grp; + return 0; + } + + /* + * We pack inodes at the beginning of the flexgroup's + * inode tables. Block allocation decisions will do + * something similar, although regular files will + * start at 2nd block group of the flexgroup. See + * ext4_ext_find_goal() and ext4_find_near(). + */ + grp *= flex_size; + for (i = 0; i < flex_size; i++) { + if (grp+i >= sbi->s_groups_count) + break; + desc = ext4_get_group_desc(sb, grp+i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = grp+i; + return 0; + } } - if (ret == 0) - return ret; goto fallback; } - blocks_per_dir = ext4_blocks_count(es) - freeb; - do_div(blocks_per_dir, ndirs); - max_dirs = ndirs / ngroups + inodes_per_group / 16; - min_inodes = avefreei - inodes_per_group / 4; - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; - - max_debt = EXT4_BLOCKS_PER_GROUP(sb); - max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); - if (max_debt * INODE_COST > inodes_per_group) - max_debt = inodes_per_group / INODE_COST; - if (max_debt > 255) - max_debt = 255; - if (max_debt == 0) - max_debt = 1; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; + + /* + * Start looking in the flex group where we last allocated an + * inode for this parent directory + */ + if (EXT4_I(parent)->i_last_alloc_group != ~0) { + parent_group = EXT4_I(parent)->i_last_alloc_group; + if (flex_size > 1) + parent_group >>= sbi->s_log_groups_per_flex; + } for (i = 0; i < ngroups; i++) { - *group = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, *group, NULL); - if (!desc || !ext4_free_inodes_count(sb, desc)) - continue; - if (ext4_used_dirs_count(sb, desc) >= max_dirs) + grp = (parent_group + i) % ngroups; + get_orlov_stats(sb, grp, flex_size, &stats); + if (stats.used_dirs >= max_dirs) continue; - if (ext4_free_inodes_count(sb, desc) < min_inodes) + if (stats.free_inodes < min_inodes) continue; - if (ext4_free_blks_count(sb, desc) < min_blocks) + if (stats.free_blocks < min_blocks) continue; - return 0; + goto found_flex_bg; } fallback: + ngroups = sbi->s_groups_count; + avefreei = freei / ngroups; + parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { - *group = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, *group, NULL); + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_inodes_count(sb, desc) >= avefreei) + ext4_free_inodes_count(sb, desc) >= avefreei) { + *group = grp; return 0; + } } if (avefreei) { @@ -542,12 +609,51 @@ fallback: } static int find_group_other(struct super_block *sb, struct inode *parent, - ext4_group_t *group) + ext4_group_t *group, int mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *desc; - ext4_group_t i; + ext4_group_t i, last; + int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); + + /* + * Try to place the inode is the same flex group as its + * parent. If we can't find space, use the Orlov algorithm to + * find another flex group, and store that information in the + * parent directory's inode information so that use that flex + * group for future allocations. + */ + if (flex_size > 1) { + int retry = 0; + + try_again: + parent_group &= ~(flex_size-1); + last = parent_group + flex_size; + if (last > ngroups) + last = ngroups; + for (i = parent_group; i < last; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = i; + return 0; + } + } + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { + retry = 1; + parent_group = EXT4_I(parent)->i_last_alloc_group; + goto try_again; + } + /* + * If this didn't work, use the Orlov search algorithm + * to find a new flex group; we pass in the mode to + * avoid the topdir algorithms. + */ + *group = parent_group + flex_size; + if (*group > ngroups) + *group = 0; + return find_group_orlov(sb, parent, group, mode); + } /* * Try to place the inode in its parent directory @@ -665,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb, if (S_ISDIR(mode)) { count = ext4_used_dirs_count(sb, gdp) + 1; ext4_used_dirs_set(sb, gdp, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].free_inodes); + } } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: @@ -716,10 +827,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) sbi = EXT4_SB(sb); es = sbi->s_es; - if (sbi->s_log_groups_per_flex) { + if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { - ret2 = find_group_other(sb, dir, &group); + ret2 = find_group_other(sb, dir, &group, mode); if (ret2 == 0 && once) once = 0; printk(KERN_NOTICE "ext4: find_group_flex " @@ -733,11 +844,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (test_opt(sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); else - ret2 = find_group_orlov(sb, dir, &group); + ret2 = find_group_orlov(sb, dir, &group, mode); } else - ret2 = find_group_other(sb, dir, &group); + ret2 = find_group_other(sb, dir, &group, mode); got_group: + EXT4_I(dir)->i_last_alloc_group = group; err = -ENOSPC; if (ret2 == -1) goto out; @@ -858,9 +970,7 @@ got: if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_inodes--; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } inode->i_uid = current_fsuid(); @@ -885,19 +995,16 @@ got: ei->i_disksize = 0; /* - * Don't inherit extent flag from directory. We set extent flag on - * newly created directory and file only if -o extent mount option is - * specified + * Don't inherit extent flag from directory, amongst others. We set + * extent flag on newly created directory and file only if -o extent + * mount option is specified */ - ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL); - /* dirsync only applies to directories */ - if (!S_ISDIR(mode)) - ei->i_flags &= ~EXT4_DIRSYNC_FL; + ei->i_flags = + ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; + ei->i_last_alloc_group = ~0; ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 71d3ecd5db7..a2e7952bc5f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode, return n; } +static int __ext4_check_blockref(const char *function, struct inode *inode, + unsigned int *p, unsigned int max) { + + unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); + unsigned int *bref = p; + while (bref < p+max) { + if (unlikely(*bref >= maxblocks)) { + ext4_error(inode->i_sb, function, + "block reference %u >= max (%u) " + "in inode #%lu, offset=%d", + *bref, maxblocks, + inode->i_ino, (int)(bref-p)); + return -EIO; + } + bref++; + } + return 0; +} + + +#define ext4_check_indirect_blockref(inode, bh) \ + __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_check_inode_blockref(inode) \ + __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question @@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, if (!p->key) goto no_block; while (--depth) { - bh = sb_bread(sb, le32_to_cpu(p->key)); - if (!bh) + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) goto failure; + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) @@ -459,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { @@ -474,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ - bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); @@ -1052,9 +1108,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) /* * free those over-booking quota for metadata blocks */ - if (mdb_free) vfs_dq_release_reservation_block(inode, mdb_free); + + /* + * If we have done all the pending block allocations and if + * there aren't any writers on the inode, we can discard the + * inode's preallocations. + */ + if (!total && (atomic_read(&inode->i_writecount) == 0)) + ext4_discard_preallocations(inode); } /* @@ -1688,9 +1751,10 @@ static void ext4_da_page_release_reservation(struct page *page, struct mpage_da_data { struct inode *inode; - struct buffer_head lbh; /* extent of blocks */ + sector_t b_blocknr; /* start block number of extent */ + size_t b_size; /* size of extent */ + unsigned long b_state; /* state of the extent */ unsigned long first_page, next_page; /* extent of pages */ - get_block_t *get_block; struct writeback_control *wbc; int io_done; int pages_written; @@ -1704,7 +1768,6 @@ struct mpage_da_data { * @mpd->inode: inode * @mpd->first_page: first page of the extent * @mpd->next_page: page after the last page of the extent - * @mpd->get_block: the filesystem's block mapper function * * By the time mpage_da_submit_io() is called we expect all blocks * to be allocated. this may be wrong if allocation failed. @@ -1724,7 +1787,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) /* * We need to start from the first_page to the next_page - 1 * to make sure we also write the mapped dirty buffer_heads. - * If we look at mpd->lbh.b_blocknr we would only be looking + * If we look at mpd->b_blocknr we would only be looking * at the currently mapped buffer_heads. */ index = mpd->first_page; @@ -1914,68 +1977,111 @@ static void ext4_print_free_blocks(struct inode *inode) return; } +#define EXT4_DELALLOC_RSVED 1 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0, EXT4_DELALLOC_RSVED); + if (ret <= 0) + return ret; + + bh_result->b_size = (ret << inode->i_blkbits); + + if (ext4_should_order_data(inode)) { + int retval; + retval = ext4_jbd2_file_inode(handle, inode); + if (retval) + /* + * Failed to add inode for ordered mode. Don't + * update file size + */ + return retval; + } + + /* + * Update on-disk size along with block allocation we don't + * use 'extend_disksize' as size may change within already + * allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, disksize); + ret = ext4_mark_inode_dirty(handle, inode); + return ret; + } + return 0; +} + /* * mpage_da_map_blocks - go through given space * - * @mpd->lbh - bh describing space - * @mpd->get_block - the filesystem's block mapper function + * @mpd - bh describing space * * The function skips space we know is already mapped to disk blocks. * */ -static int mpage_da_map_blocks(struct mpage_da_data *mpd) +static int mpage_da_map_blocks(struct mpage_da_data *mpd) { int err = 0; struct buffer_head new; - struct buffer_head *lbh = &mpd->lbh; sector_t next; /* * We consider only non-mapped and non-allocated blocks */ - if (buffer_mapped(lbh) && !buffer_delay(lbh)) + if ((mpd->b_state & (1 << BH_Mapped)) && + !(mpd->b_state & (1 << BH_Delay))) return 0; - new.b_state = lbh->b_state; + new.b_state = mpd->b_state; new.b_blocknr = 0; - new.b_size = lbh->b_size; - next = lbh->b_blocknr; + new.b_size = mpd->b_size; + next = mpd->b_blocknr; /* * If we didn't accumulate anything * to write simply return */ if (!new.b_size) return 0; - err = mpd->get_block(mpd->inode, next, &new, 1); - if (err) { - /* If get block returns with error - * we simply return. Later writepage - * will redirty the page and writepages - * will find the dirty page again + err = ext4_da_get_block_write(mpd->inode, next, &new, 1); + if (err) { + /* + * If get block returns with error we simply + * return. Later writepage will redirty the page and + * writepages will find the dirty page again */ if (err == -EAGAIN) return 0; if (err == -ENOSPC && - ext4_count_free_blocks(mpd->inode->i_sb)) { + ext4_count_free_blocks(mpd->inode->i_sb)) { mpd->retval = err; return 0; } /* - * get block failure will cause us - * to loop in writepages. Because - * a_ops->writepage won't be able to - * make progress. The page will be redirtied - * by writepage and writepages will again - * try to write the same. + * get block failure will cause us to loop in + * writepages, because a_ops->writepage won't be able + * to make progress. The page will be redirtied by + * writepage and writepages will again try to write + * the same. */ printk(KERN_EMERG "%s block allocation failed for inode %lu " "at logical offset %llu with max blocks " "%zd with error %d\n", __func__, mpd->inode->i_ino, (unsigned long long)next, - lbh->b_size >> mpd->inode->i_blkbits, err); + mpd->b_size >> mpd->inode->i_blkbits, err); printk(KERN_EMERG "This should not happen.!! " "Data will be lost\n"); if (err == -ENOSPC) { @@ -1983,7 +2089,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) } /* invlaidate all the pages */ ext4_da_block_invalidatepages(mpd, next, - lbh->b_size >> mpd->inode->i_blkbits); + mpd->b_size >> mpd->inode->i_blkbits); return err; } BUG_ON(new.b_size == 0); @@ -1995,7 +2101,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * If blocks are delayed marked, we need to * put actual blocknr and drop delayed bit */ - if (buffer_delay(lbh) || buffer_unwritten(lbh)) + if ((mpd->b_state & (1 << BH_Delay)) || + (mpd->b_state & (1 << BH_Unwritten))) mpage_put_bnr_to_bhs(mpd, next, &new); return 0; @@ -2014,12 +2121,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * the function is used to collect contig. blocks in same state */ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, - sector_t logical, struct buffer_head *bh) + sector_t logical, size_t b_size, + unsigned long b_state) { sector_t next; - size_t b_size = bh->b_size; - struct buffer_head *lbh = &mpd->lbh; - int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; + int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; /* check if thereserved journal credits might overflow */ if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { @@ -2046,19 +2152,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, /* * First block in the extent */ - if (lbh->b_size == 0) { - lbh->b_blocknr = logical; - lbh->b_size = b_size; - lbh->b_state = bh->b_state & BH_FLAGS; + if (mpd->b_size == 0) { + mpd->b_blocknr = logical; + mpd->b_size = b_size; + mpd->b_state = b_state & BH_FLAGS; return; } - next = lbh->b_blocknr + nrblocks; + next = mpd->b_blocknr + nrblocks; /* * Can we merge the block to our big extent? */ - if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { - lbh->b_size += b_size; + if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { + mpd->b_size += b_size; return; } @@ -2087,7 +2193,7 @@ static int __mpage_da_writepage(struct page *page, { struct mpage_da_data *mpd = data; struct inode *inode = mpd->inode; - struct buffer_head *bh, *head, fake; + struct buffer_head *bh, *head; sector_t logical; if (mpd->io_done) { @@ -2129,9 +2235,9 @@ static int __mpage_da_writepage(struct page *page, /* * ... and blocks */ - mpd->lbh.b_size = 0; - mpd->lbh.b_state = 0; - mpd->lbh.b_blocknr = 0; + mpd->b_size = 0; + mpd->b_state = 0; + mpd->b_blocknr = 0; } mpd->next_page = page->index + 1; @@ -2139,16 +2245,8 @@ static int __mpage_da_writepage(struct page *page, (PAGE_CACHE_SHIFT - inode->i_blkbits); if (!page_has_buffers(page)) { - /* - * There is no attached buffer heads yet (mmap?) - * we treat the page asfull of dirty blocks - */ - bh = &fake; - bh->b_size = PAGE_CACHE_SIZE; - bh->b_state = 0; - set_buffer_dirty(bh); - set_buffer_uptodate(bh); - mpage_add_bh_to_extent(mpd, logical, bh); + mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE, + (1 << BH_Dirty) | (1 << BH_Uptodate)); if (mpd->io_done) return MPAGE_DA_EXTENT_TAIL; } else { @@ -2166,8 +2264,10 @@ static int __mpage_da_writepage(struct page *page, * with the page in ext4_da_writepage */ if (buffer_dirty(bh) && - (!buffer_mapped(bh) || buffer_delay(bh))) { - mpage_add_bh_to_extent(mpd, logical, bh); + (!buffer_mapped(bh) || buffer_delay(bh))) { + mpage_add_bh_to_extent(mpd, logical, + bh->b_size, + bh->b_state); if (mpd->io_done) return MPAGE_DA_EXTENT_TAIL; } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { @@ -2179,9 +2279,8 @@ static int __mpage_da_writepage(struct page *page, * unmapped buffer_head later we need to * use the b_state flag of that buffer_head. */ - if (mpd->lbh.b_size == 0) - mpd->lbh.b_state = - bh->b_state & BH_FLAGS; + if (mpd->b_size == 0) + mpd->b_state = bh->b_state & BH_FLAGS; } logical++; } while ((bh = bh->b_this_page) != head); @@ -2191,51 +2290,6 @@ static int __mpage_da_writepage(struct page *page, } /* - * mpage_da_writepages - walk the list of dirty pages of the given - * address space, allocates non-allocated blocks, maps newly-allocated - * blocks to existing bhs and issue IO them - * - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @get_block: the filesystem's block mapper function. - * - * This is a library function, which implements the writepages() - * address_space_operation. - */ -static int mpage_da_writepages(struct address_space *mapping, - struct writeback_control *wbc, - struct mpage_da_data *mpd) -{ - int ret; - - if (!mpd->get_block) - return generic_writepages(mapping, wbc); - - mpd->lbh.b_size = 0; - mpd->lbh.b_state = 0; - mpd->lbh.b_blocknr = 0; - mpd->first_page = 0; - mpd->next_page = 0; - mpd->io_done = 0; - mpd->pages_written = 0; - mpd->retval = 0; - - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); - /* - * Handle last extent of pages - */ - if (!mpd->io_done && mpd->next_page != mpd->first_page) { - if (mpage_da_map_blocks(mpd) == 0) - mpage_da_submit_io(mpd); - - mpd->io_done = 1; - ret = MPAGE_DA_EXTENT_TAIL; - } - wbc->nr_to_write -= mpd->pages_written; - return ret; -} - -/* * this is a special callback for ->write_begin() only * it's intention is to return mapped block or reserve space */ @@ -2274,51 +2328,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; } -#define EXT4_DELALLOC_RSVED 1 -static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - loff_t disksize = EXT4_I(inode)->i_disksize; - handle_t *handle = NULL; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0, EXT4_DELALLOC_RSVED); - if (ret > 0) { - - bh_result->b_size = (ret << inode->i_blkbits); - - if (ext4_should_order_data(inode)) { - int retval; - retval = ext4_jbd2_file_inode(handle, inode); - if (retval) - /* - * Failed to add inode for ordered - * mode. Don't update file size - */ - return retval; - } - - /* - * Update on-disk size along with block allocation - * we don't use 'extend_disksize' as size may change - * within already allocated block -bzzz - */ - disksize = ((loff_t) iblock + ret) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, disksize); - ret = ext4_mark_inode_dirty(handle, inode); - return ret; - } - ret = 0; - } - return ret; -} static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) { @@ -2569,8 +2578,38 @@ retry: dump_stack(); goto out_writepages; } - mpd.get_block = ext4_da_get_block_write; - ret = mpage_da_writepages(mapping, wbc, &mpd); + + /* + * Now call __mpage_da_writepage to find the next + * contiguous region of logical blocks that need + * blocks to be allocated by ext4. We don't actually + * submit the blocks for I/O here, even though + * write_cache_pages thinks it will, and will set the + * pages as clean for write before calling + * __mpage_da_writepage(). + */ + mpd.b_size = 0; + mpd.b_state = 0; + mpd.b_blocknr = 0; + mpd.first_page = 0; + mpd.next_page = 0; + mpd.io_done = 0; + mpd.pages_written = 0; + mpd.retval = 0; + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, + &mpd); + /* + * If we have a contigous extent of pages and we + * haven't done the I/O yet, map the blocks and submit + * them for I/O. + */ + if (!mpd.io_done && mpd.next_page != mpd.first_page) { + if (mpage_da_map_blocks(&mpd) == 0) + mpage_da_submit_io(&mpd); + mpd.io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; + } + wbc->nr_to_write -= mpd.pages_written; ext4_journal_stop(handle); @@ -2846,6 +2885,48 @@ out: return; } +/* + * Force all delayed allocation blocks to be allocated for a given inode. + */ +int ext4_alloc_da_blocks(struct inode *inode) +{ + if (!EXT4_I(inode)->i_reserved_data_blocks && + !EXT4_I(inode)->i_reserved_meta_blocks) + return 0; + + /* + * We do something simple for now. The filemap_flush() will + * also start triggering a write of the data blocks, which is + * not strictly speaking necessary (and for users of + * laptop_mode, not even desirable). However, to do otherwise + * would require replicating code paths in: + * + * ext4_da_writepages() -> + * write_cache_pages() ---> (via passed in callback function) + * __mpage_da_writepage() --> + * mpage_add_bh_to_extent() + * mpage_da_map_blocks() + * + * The problem is that write_cache_pages(), located in + * mm/page-writeback.c, marks pages clean in preparation for + * doing I/O, which is not desirable if we're not planning on + * doing I/O at all. + * + * We could call write_cache_pages(), and then redirty all of + * the pages by calling redirty_page_for_writeback() but that + * would be ugly in the extreme. So instead we would need to + * replicate parts of the code in the above functions, + * simplifying them becuase we wouldn't actually intend to + * write out the pages, but rather only collect contiguous + * logical block extents, call the multi-block allocator, and + * then update the buffer heads with the block allocations. + * + * For now, though, we'll cheat by calling filemap_flush(), + * which will map the blocks, and start the I/O, but not + * actually wait for the I/O to complete. + */ + return filemap_flush(inode->i_mapping); +} /* * bmap() is special. It gets used by applications such as lilo and by @@ -3868,6 +3949,9 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { ext4_ext_truncate(inode); return; @@ -4110,12 +4194,7 @@ make_io: unsigned num; table = ext4_inode_table(sb, gdp); - /* Make sure s_inode_readahead_blks is a power of 2 */ - while (EXT4_SB(sb)->s_inode_readahead_blks & - (EXT4_SB(sb)->s_inode_readahead_blks-1)) - EXT4_SB(sb)->s_inode_readahead_blks = - (EXT4_SB(sb)->s_inode_readahead_blks & - (EXT4_SB(sb)->s_inode_readahead_blks-1)); + /* s_inode_readahead_blks is always a power of 2 */ b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); if (table > b) b = table; @@ -4287,6 +4366,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_disksize = inode->i_size; inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; + ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! @@ -4329,6 +4409,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } + if (ei->i_flags & EXT4_EXTENTS_FL) { + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part of inode */ + ret = ext4_check_inode_blockref(inode); + } + if (ret) { + brelse(bh); + goto bad_inode; + } + if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; @@ -4345,7 +4439,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } - } else { + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, @@ -4353,6 +4448,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else { + brelse(bh); + ret = -EIO; + ext4_error(inode->i_sb, __func__, + "bogus i_mode (%o) for inode=%lu", + inode->i_mode, inode->i_ino); + goto bad_inode; } brelse(iloc.bh); ext4_set_inode_flags(inode); @@ -5146,8 +5248,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; loff_t size; unsigned long len; int ret = -EINVAL; @@ -5199,6 +5302,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) goto out_unlock; ret = 0; out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; up_read(&inode->i_alloc_sem); return ret; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 42dc83fb247..91e75f7a9e7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) return err; - if (!S_ISDIR(inode->i_mode)) - flags &= ~EXT4_DIRSYNC_FL; + flags = ext4_mask_flags(inode->i_mode, flags); err = -EPERM; mutex_lock(&inode->i_mutex); @@ -263,6 +262,20 @@ setversion_out: return err; } + case EXT4_IOC_ALLOC_DA_BLKS: + { + int err; + if (!is_owner_or_cap(inode)) + return -EACCES; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + err = ext4_alloc_da_blocks(inode); + mnt_drop_write(filp->f_path.mnt); + return err; + } + default: return -ENOTTY; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b038188bd03..f871677a798 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -46,22 +46,23 @@ * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * - * During initialization phase of the allocator we decide to use the group - * preallocation or inode preallocation depending on the size file. The - * size of the file could be the resulting file size we would have after - * allocation or the current file size which ever is larger. If the size is - * less that sbi->s_mb_stream_request we select the group - * preallocation. The default value of s_mb_stream_request is 16 - * blocks. This can also be tuned via - * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms - * of number of blocks. + * During initialization phase of the allocator we decide to use the + * group preallocation or inode preallocation depending on the size of + * the file. The size of the file could be the resulting file size we + * would have after allocation, or the current file size, which ever + * is larger. If the size is less than sbi->s_mb_stream_request we + * select to use the group preallocation. The default value of + * s_mb_stream_request is 16 blocks. This can also be tuned via + * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in + * terms of number of blocks. * * The main motivation for having small file use group preallocation is to - * ensure that we have small file closer in the disk. + * ensure that we have small files closer together on the disk. * - * First stage the allocator looks at the inode prealloc list - * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for - * this particular inode. The inode prealloc space is represented as: + * First stage the allocator looks at the inode prealloc list, + * ext4_inode_info->i_prealloc_list, which contains list of prealloc + * spaces for this particular inode. The inode prealloc space is + * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space @@ -121,29 +122,29 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to + * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is * 512 blocks. This can be tuned via - * /proc/fs/ext4/<partition/group_prealloc. The value is represented in + * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the * stripe value (sbi->s_stripe) * - * The regular allocator(using the buddy cache) support few tunables. + * The regular allocator(using the buddy cache) supports few tunables. * - * /proc/fs/ext4/<partition>/min_to_scan - * /proc/fs/ext4/<partition>/max_to_scan - * /proc/fs/ext4/<partition>/order2_req + * /sys/fs/ext4/<partition>/mb_min_to_scan + * /sys/fs/ext4/<partition>/mb_max_to_scan + * /sys/fs/ext4/<partition>/mb_order2_req * - * The regular allocator use buddy scan only if the request len is power of + * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via - * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to + * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contigous block in - * stripe size. This should result in better allocation on RAID setup. If - * not we search in the specific group using bitmap for best extents. The - * tunable min_to_scan and max_to_scan controll the behaviour here. + * stripe size. This should result in better allocation on RAID setups. If + * not, we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best - * extent and max_to_scanindicate how long the mballoc __can__ look for a + * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it @@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static int ext4_mb_init_per_dev_proc(struct super_block *sb); -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); @@ -1726,6 +1725,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, { unsigned free, fragments; unsigned i, bits; + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_desc *desc; struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); @@ -1747,6 +1747,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) return 0; + /* Avoid using the first bg of a flexgroup for data files */ + if ((ac->ac_flags & EXT4_MB_HINT_DATA) && + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && + ((group % flex_size) == 0)) + return 0; + bits = ac->ac_sb->s_blocksize_bits + 1; for (i = ac->ac_2order; i <= bits; i++) if (grp->bb_counters[i] > 0) @@ -1971,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs - * You can tune it via /proc/fs/ext4/<partition>/order2_req + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req */ if (i >= sbi->s_mb_order2_reqs) { /* @@ -2693,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { - kfree(sbi->s_mb_maxs); + kfree(sbi->s_mb_offsets); return -ENOMEM; } @@ -2746,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } - ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); if (sbi->s_journal) @@ -2829,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb) free_percpu(sbi->s_locality_groups); ext4_mb_history_release(sb); - ext4_mb_destroy_per_dev_proc(sb); return 0; } @@ -2890,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug("freed %u blocks in %u structures\n", count, count2); } -#define EXT4_MB_STATS_NAME "stats" -#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" -#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" -#define EXT4_MB_ORDER2_REQ "order2_req" -#define EXT4_MB_STREAM_REQ "stream_req" -#define EXT4_MB_GROUP_PREALLOC "group_prealloc" - -static int ext4_mb_init_per_dev_proc(struct super_block *sb) -{ -#ifdef CONFIG_PROC_FS - mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct proc_dir_entry *proc; - - if (sbi->s_proc == NULL) - return -EINVAL; - - EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats); - EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan); - EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan); - EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs); - EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request); - EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc); - return 0; - -err_out: - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); - return -ENOMEM; -#else - return 0; -#endif -} - -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) -{ -#ifdef CONFIG_PROC_FS - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_proc == NULL) - return -EINVAL; - - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); -#endif - return 0; -} - int __init init_ext4_mballoc(void) { ext4_pspace_cachep = @@ -3096,9 +3044,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + atomic_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_blocks); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -3116,7 +3063,7 @@ out_err: * here we normalize request for locality group * Group request are normalized to s_strip size if we set the same via mount * option. If not we set it to s_mb_group_prealloc which can be configured via - * /proc/fs/ext4/<partition>/group_prealloc + * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? */ @@ -3608,8 +3555,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; - /* If linear, pa_pstart may be in the next group when pa is used up */ - if (pa->pa_linear) + /* + * If doing group-based preallocation, pa_pstart may be in the + * next group when pa is used up + */ + if (pa->pa_type == MB_GROUP_PA) grp_blk--; ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); @@ -3704,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_linear = 0; + pa->pa_type = MB_INODE_PA; mb_debug("new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); @@ -3767,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_linear = 1; + pa->pa_type = MB_GROUP_PA; mb_debug("new group pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); @@ -4021,7 +3971,7 @@ repeat: list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); - if (pa->pa_linear) + if (pa->pa_type == MB_GROUP_PA) ext4_mb_release_group_pa(&e4b, pa, ac); else ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); @@ -4121,7 +4071,7 @@ repeat: spin_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { - BUG_ON(pa->pa_linear != 0); + BUG_ON(pa->pa_type != MB_INODE_PA); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); err = ext4_mb_load_buddy(sb, group, &e4b); @@ -4232,7 +4182,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) * file is determined by the current size or the resulting size after * allocation which ever is larger * - * One can tune this size via /proc/fs/ext4/<partition>/stream_req + * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { @@ -4373,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, continue; } /* only lg prealloc space */ - BUG_ON(!pa->pa_linear); + BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ pa->pa_deleted = 1; @@ -4442,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) pa_inode_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { - spin_unlock(&pa->pa_lock); + spin_unlock(&tmp_pa->pa_lock); continue; } if (!added && pa->pa_free < tmp_pa->pa_free) { @@ -4479,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { - if (pa->pa_linear) { + if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); pa->pa_pstart += ac->ac_b_ex.fe_len; @@ -4499,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) * doesn't grow big. We need to release * alloc_semp before calling ext4_mb_add_n_trim() */ - if (pa->pa_linear && likely(pa->pa_free)) { + if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { spin_lock(pa->pa_obj_lock); list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); @@ -4936,9 +4886,7 @@ do_more: if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_blocks += count; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); } ext4_mb_release_desc(&e4b); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 10a2921baf1..dd9e6cd5f6c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -132,12 +132,15 @@ struct ext4_prealloc_space { ext4_lblk_t pa_lstart; /* log. block */ unsigned short pa_len; /* len of preallocated chunk */ unsigned short pa_free; /* how many blocks are free */ - unsigned short pa_linear; /* consumed in one direction - * strictly, for grp prealloc */ + unsigned short pa_type; /* pa type. inode or group */ spinlock_t *pa_obj_lock; struct inode *pa_inode; /* hack, for history only */ }; +enum { + MB_INODE_PA = 0, + MB_GROUP_PA = 1 +}; struct ext4_free_extent { ext4_lblk_t fe_logical; @@ -247,7 +250,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 83410244d3e..22098e1cd08 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name, struct dx_frame *frame, int *err); static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct ext4_dir_entry_2 *de, int size, +static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, - struct dx_map_entry *offsets, int count); -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); + struct dx_map_entry *offsets, int count, unsigned blocksize); +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, @@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); +unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +} + +__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +} + /* * p is at least 6 bytes before the end of page */ static inline struct ext4_dir_entry_2 * -ext4_next_entry(struct ext4_dir_entry_2 *p) +ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) { return (struct ext4_dir_entry_2 *)((char *)p + - ext4_rec_len_from_disk(p->rec_len)); + ext4_rec_len_from_disk(p->rec_len, blocksize)); } /* @@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent space += EXT4_DIR_REC_LEN(de->name_len); names++; } - de = ext4_next_entry(de); + de = ext4_next_entry(de, size); } printk("(%i)\n", names); return (struct stats) { names, space, 1 }; @@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de)) { + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) +((char *)de - bh->b_data))) { @@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; - de = ext4_next_entry(de); + de = ext4_next_entry(de, dir->i_sb->s_blocksize); if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) goto errout; count++; @@ -713,15 +737,15 @@ errout: * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ -static int dx_make_map (struct ext4_dir_entry_2 *de, int size, - struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail) { int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; - while ((char *) de < base + size) - { + while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { ext4fs_dirhash(de->name, de->name_len, &h); map_tail--; @@ -732,7 +756,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size, cond_resched(); } /* XXX: do we need to check rec_len == 0 case? -Chris */ - de = ext4_next_entry(de); + de = ext4_next_entry(de, blocksize); } return count; } @@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh, return 1; } /* prevent looping on a bad block */ - de_len = ext4_rec_len_from_disk(de->rec_len); + de_len = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); if (de_len <= 0) return -1; offset += de_len; @@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de)) { + for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) { int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + ((char *) de - bh->b_data); @@ -1052,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (unlikely(IS_ERR(inode))) { + if (PTR_ERR(inode) == -ESTALE) { + ext4_error(dir->i_sb, __func__, + "deleted inode referenced: %u", + ino); + return ERR_PTR(-EIO); + } else { + return ERR_CAST(inode); + } + } } return d_splice_alias(inode, dentry); } @@ -1109,7 +1142,8 @@ static inline void ext4_set_de_type(struct super_block *sb, * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * -dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, + unsigned blocksize) { unsigned rec_len = 0; @@ -1118,7 +1152,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) rec_len = EXT4_DIR_REC_LEN(de->name_len); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = - ext4_rec_len_to_disk(rec_len); + ext4_rec_len_to_disk(rec_len, blocksize); de->inode = 0; map++; to += rec_len; @@ -1130,19 +1164,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; prev = to = de; - while ((char*)de < base + size) { - next = ext4_next_entry(de); + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); if (de->inode && de->name_len) { rec_len = EXT4_DIR_REC_LEN(de->name_len); if (de > to) memmove(to, de, rec_len); - to->rec_len = ext4_rec_len_to_disk(rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); prev = to; to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); } @@ -1215,10 +1249,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, hash2, split, count-split)); /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split); + de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); de = dx_pack_dirents(data1, blocksize); - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); - de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); + de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, + blocksize); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); @@ -1268,6 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned int offset = 0; + unsigned int blocksize = dir->i_sb->s_blocksize; unsigned short reclen; int nlen, rlen, err; char *top; @@ -1275,7 +1312,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + dir->i_sb->s_blocksize - reclen; + top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { if (!ext4_check_dir_entry("ext4_add_entry", dir, de, bh, offset)) { @@ -1287,7 +1324,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, return -EEXIST; } nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); if ((de->inode? rlen - nlen: rlen) >= reclen) break; de = (struct ext4_dir_entry_2 *)((char *)de + rlen); @@ -1306,11 +1343,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, /* By now the buffer is marked for journaling */ nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); if (de->inode) { struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); - de->rec_len = ext4_rec_len_to_disk(nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); + de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); de = de1; } de->file_type = EXT4_FT_UNKNOWN; @@ -1380,7 +1417,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* The 0th block becomes the root, move the dirents out */ fde = &root->dotdot; de = (struct ext4_dir_entry_2 *)((char *)fde + - ext4_rec_len_from_disk(fde->rec_len)); + ext4_rec_len_from_disk(fde->rec_len, blocksize)); if ((char *) de >= (((char *) root) + blocksize)) { ext4_error(dir->i_sb, __func__, "invalid rec_len for '..' in inode %lu", @@ -1402,12 +1439,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, memcpy (data1, de, len); de = (struct ext4_dir_entry_2 *) data1; top = data1 + len; - while ((char *)(de2 = ext4_next_entry(de)) < top) + while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) de = de2; - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), + blocksize); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; @@ -1488,7 +1527,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; - de->rec_len = ext4_rec_len_to_disk(blocksize); + de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); return add_dirent_to_buf(handle, dentry, inode, de, bh); } @@ -1551,7 +1590,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; - node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, + sb->s_blocksize); node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, frame->bh); @@ -1639,6 +1679,7 @@ static int ext4_delete_entry(handle_t *handle, struct buffer_head *bh) { struct ext4_dir_entry_2 *de, *pde; + unsigned int blocksize = dir->i_sb->s_blocksize; int i; i = 0; @@ -1652,8 +1693,11 @@ static int ext4_delete_entry(handle_t *handle, ext4_journal_get_write_access(handle, bh); if (pde) pde->rec_len = ext4_rec_len_to_disk( - ext4_rec_len_from_disk(pde->rec_len) + - ext4_rec_len_from_disk(de->rec_len)); + ext4_rec_len_from_disk(pde->rec_len, + blocksize) + + ext4_rec_len_from_disk(de->rec_len, + blocksize), + blocksize); else de->inode = 0; dir->i_version++; @@ -1661,9 +1705,9 @@ static int ext4_delete_entry(handle_t *handle, ext4_handle_dirty_metadata(handle, dir, bh); return 0; } - i += ext4_rec_len_from_disk(de->rec_len); + i += ext4_rec_len_from_disk(de->rec_len, blocksize); pde = de; - de = ext4_next_entry(de); + de = ext4_next_entry(de, blocksize); } return -ENOENT; } @@ -1793,6 +1837,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct inode *inode; struct buffer_head *dir_block; struct ext4_dir_entry_2 *de; + unsigned int blocksize = dir->i_sb->s_blocksize; int err, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) @@ -1824,13 +1869,14 @@ retry: de = (struct ext4_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + blocksize); strcpy(de->name, "."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); - de = ext4_next_entry(de); + de = ext4_next_entry(de, blocksize); de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - - EXT4_DIR_REC_LEN(1)); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), + blocksize); de->name_len = 2; strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); @@ -1885,7 +1931,7 @@ static int empty_dir(struct inode *inode) return 1; } de = (struct ext4_dir_entry_2 *) bh->b_data; - de1 = ext4_next_entry(de); + de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || !le32_to_cpu(de1->inode) || strcmp(".", de->name) || @@ -1896,9 +1942,9 @@ static int empty_dir(struct inode *inode) brelse(bh); return 1; } - offset = ext4_rec_len_from_disk(de->rec_len) + - ext4_rec_len_from_disk(de1->rec_len); - de = ext4_next_entry(de1); + offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + + ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); + de = ext4_next_entry(de1, sb->s_blocksize); while (offset < inode->i_size) { if (!bh || (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { @@ -1927,8 +1973,8 @@ static int empty_dir(struct inode *inode) brelse(bh); return 0; } - offset += ext4_rec_len_from_disk(de->rec_len); - de = ext4_next_entry(de); + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + de = ext4_next_entry(de, sb->s_blocksize); } brelse(bh); return 1; @@ -2297,8 +2343,8 @@ retry: return err; } -#define PARENT_INO(buffer) \ - (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) +#define PARENT_INO(buffer, size) \ + (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) /* * Anybody can rename anything with this: the permission checks are left to the @@ -2311,7 +2357,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; - int retval; + int retval, force_da_alloc = 0; old_bh = new_bh = dir_bh = NULL; @@ -2358,7 +2404,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); if (!dir_bh) goto end_rename; - if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + if (le32_to_cpu(PARENT_INO(dir_bh->b_data, + old_dir->i_sb->s_blocksize)) != old_dir->i_ino) goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && @@ -2430,7 +2477,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (dir_bh) { BUFFER_TRACE(dir_bh, "get_write_access"); ext4_journal_get_write_access(handle, dir_bh); - PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); + PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = + cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); ext4_handle_dirty_metadata(handle, old_dir, dir_bh); ext4_dec_count(handle, old_dir); @@ -2449,6 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_mark_inode_dirty(handle, new_inode); if (!new_inode->i_nlink) ext4_orphan_add(handle, new_inode); + if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) + force_da_alloc = 1; } retval = 0; @@ -2457,6 +2507,8 @@ end_rename: brelse(old_bh); brelse(new_bh); ext4_journal_stop(handle); + if (retval == 0 && force_da_alloc) + ext4_alloc_da_blocks(old_inode); return retval; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c06886abd65..546c7dd869e 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, input->group); - sbi->s_flex_groups[flex_group].free_blocks += - input->free_blocks_count; - sbi->s_flex_groups[flex_group].free_inodes += - EXT4_INODES_PER_GROUP(sb); + atomic_add(input->free_blocks_count, + &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_INODES_PER_GROUP(sb), + &sbi->s_flex_groups[flex_group].free_inodes); } ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f7371a6a923..9987bba99db 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -35,6 +35,7 @@ #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/ctype.h> #include <linux/marker.h> #include <linux/log2.h> #include <linux/crc16.h> @@ -48,6 +49,7 @@ #include "group.h" struct proc_dir_entry *ext4_proc_root; +static struct kset *ext4_kset; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -577,9 +579,9 @@ static void ext4_put_super(struct super_block *sb) ext4_commit_super(sb, es, 1); } if (sbi->s_proc) { - remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } + kobject_del(&sbi->s_kobj); for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); @@ -615,6 +617,17 @@ static void ext4_put_super(struct super_block *sb) ext4_blkdev_remove(sbi); } sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the + * superblock, we need to actually destroy the kobject. + */ + unlock_kernel(); + unlock_super(sb); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + lock_super(sb); + lock_kernel(); + kfree(sbi->s_blockgroup_lock); kfree(sbi); return; } @@ -803,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) seq_puts(seq, ",noacl"); #endif - if (!test_opt(sb, RESERVATION)) - seq_puts(seq, ",noreservation"); if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); @@ -855,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); + if (test_opt(sb, NO_AUTO_DA_ALLOC)) + seq_puts(seq, ",noauto_da_alloc"); + ext4_show_quota_options(seq, sb); return 0; } @@ -1004,7 +1018,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, + Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, @@ -1012,8 +1026,8 @@ enum { Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota, Opt_i_version, + Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks, Opt_journal_ioprio }; @@ -1039,8 +1053,6 @@ static const match_table_t tokens = { {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, - {Opt_reservation, "reservation"}, - {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, @@ -1068,6 +1080,8 @@ static const match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, @@ -1075,6 +1089,9 @@ static const match_table_t tokens = { {Opt_nodelalloc, "nodelalloc"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_journal_ioprio, "journal_ioprio=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, {Opt_err, NULL}, }; @@ -1207,12 +1224,6 @@ static int parse_options(char *options, struct super_block *sb, "not supported\n"); break; #endif - case Opt_reservation: - set_opt(sbi->s_mount_opt, RESERVATION); - break; - case Opt_noreservation: - clear_opt(sbi->s_mount_opt, RESERVATION); - break; case Opt_journal_update: /* @@@ FIXME */ /* Eventually we will want to be able to create @@ -1415,9 +1426,14 @@ set_qf_format: case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_nobarrier: + clear_opt(sbi->s_mount_opt, BARRIER); + break; case Opt_barrier: - if (match_int(&args[0], &option)) - return 0; + if (match_int(&args[0], &option)) { + set_opt(sbi->s_mount_opt, BARRIER); + break; + } if (option) set_opt(sbi->s_mount_opt, BARRIER); else @@ -1463,6 +1479,11 @@ set_qf_format: return 0; if (option < 0 || option > (1 << 30)) return 0; + if (option & (option - 1)) { + printk(KERN_ERR "EXT4-fs: inode_readahead_blks" + " must be a power of 2\n"); + return 0; + } sbi->s_inode_readahead_blks = option; break; case Opt_journal_ioprio: @@ -1473,6 +1494,19 @@ set_qf_format: *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, option); break; + case Opt_noauto_da_alloc: + set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + break; + case Opt_auto_da_alloc: + if (match_int(&args[0], &option)) { + clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); + break; + } + if (option) + clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); + else + set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -1612,10 +1646,12 @@ static int ext4_fill_flex_info(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, &bh); flex_group = ext4_flex_group(sbi, i); - sbi->s_flex_groups[flex_group].free_inodes += - ext4_free_inodes_count(sb, gdp); - sbi->s_flex_groups[flex_group].free_blocks += - ext4_free_blks_count(sb, gdp); + atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, + ext4_free_inodes_count(sb, gdp)); + atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, + ext4_free_blks_count(sb, gdp)); + atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, + ext4_used_dirs_count(sb, gdp)); } return 1; @@ -1991,6 +2027,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) return 0; } +/* sysfs supprt */ + +struct ext4_attr { + struct attribute attr; + ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); + ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, + const char *, size_t); + int offset; +}; + +static int parse_strtoul(const char *buf, + unsigned long max, unsigned long *value) +{ + char *endp; + + while (*buf && isspace(*buf)) + buf++; + *value = simple_strtoul(buf, &endp, 0); + while (*endp && isspace(*endp)) + endp++; + if (*endp || *value > max) + return -EINVAL; + + return 0; +} + +static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); +} + +static ssize_t session_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + return snprintf(buf, PAGE_SIZE, "%lu\n", + (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + sbi->s_sectors_written_start) >> 1); +} + +static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + return snprintf(buf, PAGE_SIZE, "%llu\n", + sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); +} + +static ssize_t inode_readahead_blks_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + + if (parse_strtoul(buf, 0x40000000, &t)) + return -EINVAL; + + /* inode_readahead_blks must be a power of 2 */ + if (t & (t-1)) + return -EINVAL; + + sbi->s_inode_readahead_blks = t; + return count; +} + +static ssize_t sbi_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t sbi_ui_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned long t; + + if (parse_strtoul(buf, 0xffffffff, &t)) + return -EINVAL; + *ui = t; + return count; +} + +#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .offset = offsetof(struct ext4_sb_info, _elname), \ +} +#define EXT4_ATTR(name, mode, show, store) \ +static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) + +#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) +#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) +#define EXT4_RW_ATTR_SBI_UI(name, elname) \ + EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) +#define ATTR_LIST(name) &ext4_attr_##name.attr + +EXT4_RO_ATTR(delayed_allocation_blocks); +EXT4_RO_ATTR(session_write_kbytes); +EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, + inode_readahead_blks_store, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); +EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); +EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + +static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), + NULL, +}; + +static ssize_t ext4_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void ext4_sb_release(struct kobject *kobj) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + + +static struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, +}; + +static struct kobj_type ext4_ktype = { + .default_attrs = ext4_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_sb_release, +}; + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2021,12 +2232,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; sbi->s_resuid = EXT4_DEF_RESUID; sbi->s_resgid = EXT4_DEF_RESGID; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; + sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, + sectors[1]); unlock_kernel(); @@ -2064,6 +2284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) goto cantfind_ext4; + sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); @@ -2101,7 +2322,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, BARRIER); /* @@ -2325,14 +2545,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_PROC_FS if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); - - if (sbi->s_proc) - proc_create_data("inode_readahead_blks", 0644, sbi->s_proc, - &ext4_ui_proc_fops, - &sbi->s_inode_readahead_blks); #endif - bgl_lock_init(&sbi->s_blockgroup_lock); + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); @@ -2564,6 +2779,16 @@ no_journal: goto failed_mount4; } + sbi->s_kobj.kset = ext4_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, + "%s", sb->s_id); + if (err) { + ext4_mb_release(sb); + ext4_ext_release(sb); + goto failed_mount4; + }; + /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock @@ -2618,7 +2843,6 @@ failed_mount2: kfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { - remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA @@ -2913,6 +3137,10 @@ static int ext4_commit_super(struct super_block *sb, set_buffer_uptodate(sbh); } es->s_wtime = cpu_to_le32(get_seconds()); + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); ext4_free_blocks_count_set(es, percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeblocks_counter)); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( @@ -3647,45 +3875,6 @@ static int ext4_get_sb(struct file_system_type *fs_type, return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); } -#ifdef CONFIG_PROC_FS -static int ext4_ui_proc_show(struct seq_file *m, void *v) -{ - unsigned int *p = m->private; - - seq_printf(m, "%u\n", *p); - return 0; -} - -static int ext4_ui_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, ext4_ui_proc_show, PDE(inode)->data); -} - -static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = PDE(file->f_path.dentry->d_inode)->data; - char str[32]; - - if (cnt >= sizeof(str)) - return -EINVAL; - if (copy_from_user(str, buf, cnt)) - return -EFAULT; - - *p = simple_strtoul(str, NULL, 0); - return cnt; -} - -const struct file_operations ext4_ui_proc_fops = { - .owner = THIS_MODULE, - .open = ext4_ui_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = ext4_ui_proc_write, -}; -#endif - static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", @@ -3719,6 +3908,9 @@ static int __init init_ext4_fs(void) { int err; + ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); + if (!ext4_kset) + return -ENOMEM; ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = init_ext4_mballoc(); if (err) @@ -3760,6 +3952,7 @@ static void __exit exit_ext4_fs(void) exit_ext4_xattr(); exit_ext4_mballoc(); remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index de0004fe6e0..296785a0dec 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -523,7 +523,9 @@ static int fat_remount(struct super_block *sb, int *flags, char *data) static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); + struct super_block *sb = dentry->d_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); /* If the count of free cluster is still unknown, counts it here. */ if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { @@ -537,6 +539,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = sbi->max_cluster - FAT_START_ENT; buf->f_bfree = sbi->free_clusters; buf->f_bavail = sbi->free_clusters; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = sbi->options.isvfat ? 260 : 12; return 0; @@ -930,7 +934,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->fs_uid = current_uid(); opts->fs_gid = current_gid(); - opts->fs_fmask = opts->fs_dmask = current->fs->umask; + opts->fs_fmask = current_umask(); opts->allow_utime = -1; opts->codepage = fat_default_codepage; opts->iocharset = fat_default_iocharset; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e3fe9918faa..eed48063990 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -196,7 +196,7 @@ static void redirty_tail(struct inode *inode) struct inode *tail_inode; tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); - if (!time_after_eq(inode->dirtied_when, + if (time_before(inode->dirtied_when, tail_inode->dirtied_when)) inode->dirtied_when = jiffies; } @@ -220,6 +220,21 @@ static void inode_sync_complete(struct inode *inode) wake_up_bit(&inode->i_state, __I_SYNC); } +static bool inode_dirtied_after(struct inode *inode, unsigned long t) +{ + bool ret = time_after(inode->dirtied_when, t); +#ifndef CONFIG_64BIT + /* + * For inodes being constantly redirtied, dirtied_when can get stuck. + * It _appears_ to be in the future, but is actually in distant past. + * This test is necessary to prevent such wrapped-around relative times + * from permanently stopping the whole pdflush writeback. + */ + ret = ret && time_before_eq(inode->dirtied_when, jiffies); +#endif + return ret; +} + /* * Move expired dirty inodes from @delaying_queue to @dispatch_queue. */ @@ -231,7 +246,7 @@ static void move_expired_inodes(struct list_head *delaying_queue, struct inode *inode = list_entry(delaying_queue->prev, struct inode, i_list); if (older_than_this && - time_after(inode->dirtied_when, *older_than_this)) + inode_dirtied_after(inode, *older_than_this)) break; list_move(&inode->i_list, dispatch_queue); } @@ -492,8 +507,11 @@ void generic_sync_sb_inodes(struct super_block *sb, continue; /* blockdev has wrong queue */ } - /* Was this inode dirtied after sync_sb_inodes was called? */ - if (time_after(inode->dirtied_when, start)) + /* + * Was this inode dirtied after sync_sb_inodes was called? + * This keeps sync from extra jobs and livelock. + */ + if (inode_dirtied_after(inode, start)) break; /* Is another pdflush already flushing this queue? */ @@ -538,7 +556,8 @@ void generic_sync_sb_inodes(struct super_block *sb, list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { struct address_space *mapping; - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + if (inode->i_state & + (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; mapping = inode->i_mapping; if (mapping->nrpages == 0) diff --git a/fs/fs_struct.c b/fs/fs_struct.c new file mode 100644 index 00000000000..eee059052db --- /dev/null +++ b/fs/fs_struct.c @@ -0,0 +1,177 @@ +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/path.h> +#include <linux/slab.h> +#include <linux/fs_struct.h> + +/* + * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_root(struct fs_struct *fs, struct path *path) +{ + struct path old_root; + + write_lock(&fs->lock); + old_root = fs->root; + fs->root = *path; + path_get(path); + write_unlock(&fs->lock); + if (old_root.dentry) + path_put(&old_root); +} + +/* + * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_pwd(struct fs_struct *fs, struct path *path) +{ + struct path old_pwd; + + write_lock(&fs->lock); + old_pwd = fs->pwd; + fs->pwd = *path; + path_get(path); + write_unlock(&fs->lock); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + +void chroot_fs_refs(struct path *old_root, struct path *new_root) +{ + struct task_struct *g, *p; + struct fs_struct *fs; + int count = 0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { + write_lock(&fs->lock); + if (fs->root.dentry == old_root->dentry + && fs->root.mnt == old_root->mnt) { + path_get(new_root); + fs->root = *new_root; + count++; + } + if (fs->pwd.dentry == old_root->dentry + && fs->pwd.mnt == old_root->mnt) { + path_get(new_root); + fs->pwd = *new_root; + count++; + } + write_unlock(&fs->lock); + } + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + while (count--) + path_put(old_root); +} + +void free_fs_struct(struct fs_struct *fs) +{ + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); +} + +void exit_fs(struct task_struct *tsk) +{ + struct fs_struct *fs = tsk->fs; + + if (fs) { + int kill; + task_lock(tsk); + write_lock(&fs->lock); + tsk->fs = NULL; + kill = !--fs->users; + write_unlock(&fs->lock); + task_unlock(tsk); + if (kill) + free_fs_struct(fs); + } +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + fs->users = 1; + fs->in_exec = 0; + rwlock_init(&fs->lock); + fs->umask = old->umask; + read_lock(&old->lock); + fs->root = old->root; + path_get(&old->root); + fs->pwd = old->pwd; + path_get(&old->pwd); + read_unlock(&old->lock); + } + return fs; +} + +int unshare_fs_struct(void) +{ + struct fs_struct *fs = current->fs; + struct fs_struct *new_fs = copy_fs_struct(fs); + int kill; + + if (!new_fs) + return -ENOMEM; + + task_lock(current); + write_lock(&fs->lock); + kill = !--fs->users; + current->fs = new_fs; + write_unlock(&fs->lock); + task_unlock(current); + + if (kill) + free_fs_struct(fs); + + return 0; +} +EXPORT_SYMBOL_GPL(unshare_fs_struct); + +int current_umask(void) +{ + return current->fs->umask; +} +EXPORT_SYMBOL(current_umask); + +/* to be mentioned only in INIT_TASK */ +struct fs_struct init_fs = { + .users = 1, + .lock = __RW_LOCK_UNLOCKED(init_fs.lock), + .umask = 0022, +}; + +void daemonize_fs_struct(void) +{ + struct fs_struct *fs = current->fs; + + if (fs) { + int kill; + + task_lock(current); + + write_lock(&init_fs.lock); + init_fs.users++; + write_unlock(&init_fs.lock); + + write_lock(&fs->lock); + current->fs = &init_fs; + kill = !--fs->users; + write_unlock(&fs->lock); + + task_unlock(current); + if (kill) + free_fs_struct(fs); + } +} diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig new file mode 100644 index 00000000000..9bbb8ce7bea --- /dev/null +++ b/fs/fscache/Kconfig @@ -0,0 +1,56 @@ + +config FSCACHE + tristate "General filesystem local caching manager" + depends on EXPERIMENTAL + select SLOW_WORK + help + This option enables a generic filesystem caching manager that can be + used by various network and other filesystems to cache data locally. + Different sorts of caches can be plugged in, depending on the + resources available. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_STATS + bool "Gather statistical information on local caching" + depends on FSCACHE && PROC_FS + help + This option causes statistical information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/stats + + The gathering of statistics adds a certain amount of overhead to + execution as there are a quite a few stats gathered, and on a + multi-CPU system these may be on cachelines that keep bouncing + between CPUs. On the other hand, the stats are very useful for + debugging purposes. Saying 'Y' here is recommended. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_HISTOGRAM + bool "Gather latency information on local caching" + depends on FSCACHE && PROC_FS + help + This option causes latency information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/histogram + + The generation of this histogram adds a certain amount of overhead to + execution as there are a number of points at which data is gathered, + and on a multi-CPU system these may be on cachelines that keep + bouncing between CPUs. On the other hand, the histogram may be + useful for debugging purposes. Saying 'N' here is recommended. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_DEBUG + bool "Debug FS-Cache" + depends on FSCACHE + help + This permits debugging to be dynamically enabled in the local caching + management module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/fscache/parameter/debug. + + See Documentation/filesystems/caching/fscache.txt for more information. diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile new file mode 100644 index 00000000000..91571b95aac --- /dev/null +++ b/fs/fscache/Makefile @@ -0,0 +1,19 @@ +# +# Makefile for general filesystem caching code +# + +fscache-y := \ + cache.o \ + cookie.o \ + fsdef.o \ + main.o \ + netfs.o \ + object.o \ + operation.o \ + page.o + +fscache-$(CONFIG_PROC_FS) += proc.o +fscache-$(CONFIG_FSCACHE_STATS) += stats.o +fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o + +obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c new file mode 100644 index 00000000000..e21985bbb1f --- /dev/null +++ b/fs/fscache/cache.c @@ -0,0 +1,415 @@ +/* FS-Cache cache handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include <linux/module.h> +#include <linux/slab.h> +#include "internal.h" + +LIST_HEAD(fscache_cache_list); +DECLARE_RWSEM(fscache_addremove_sem); +DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq); +EXPORT_SYMBOL(fscache_cache_cleared_wq); + +static LIST_HEAD(fscache_cache_tag_list); + +/* + * look up a cache tag + */ +struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name) +{ + struct fscache_cache_tag *tag, *xtag; + + /* firstly check for the existence of the tag under read lock */ + down_read(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_read(&fscache_addremove_sem); + return tag; + } + } + + up_read(&fscache_addremove_sem); + + /* the tag does not exist - create a candidate */ + xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL); + if (!xtag) + /* return a dummy tag if out of memory */ + return ERR_PTR(-ENOMEM); + + atomic_set(&xtag->usage, 1); + strcpy(xtag->name, name); + + /* write lock, search again and add if still not present */ + down_write(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_write(&fscache_addremove_sem); + kfree(xtag); + return tag; + } + } + + list_add_tail(&xtag->link, &fscache_cache_tag_list); + up_write(&fscache_addremove_sem); + return xtag; +} + +/* + * release a reference to a cache tag + */ +void __fscache_release_cache_tag(struct fscache_cache_tag *tag) +{ + if (tag != ERR_PTR(-ENOMEM)) { + down_write(&fscache_addremove_sem); + + if (atomic_dec_and_test(&tag->usage)) + list_del_init(&tag->link); + else + tag = NULL; + + up_write(&fscache_addremove_sem); + + kfree(tag); + } +} + +/* + * select a cache in which to store an object + * - the cache addremove semaphore must be at least read-locked by the caller + * - the object will never be an index + */ +struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *cookie) +{ + struct fscache_cache_tag *tag; + struct fscache_object *object; + struct fscache_cache *cache; + + _enter(""); + + if (list_empty(&fscache_cache_list)) { + _leave(" = NULL [no cache]"); + return NULL; + } + + /* we check the parent to determine the cache to use */ + spin_lock(&cookie->lock); + + /* the first in the parent's backing list should be the preferred + * cache */ + if (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + cache = object->cache; + if (object->state >= FSCACHE_OBJECT_DYING || + test_bit(FSCACHE_IOERROR, &cache->flags)) + cache = NULL; + + spin_unlock(&cookie->lock); + _leave(" = %p [parent]", cache); + return cache; + } + + /* the parent is unbacked */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + /* cookie not an index and is unbacked */ + spin_unlock(&cookie->lock); + _leave(" = NULL [cookie ub,ni]"); + return NULL; + } + + spin_unlock(&cookie->lock); + + if (!cookie->def->select_cache) + goto no_preference; + + /* ask the netfs for its preference */ + tag = cookie->def->select_cache(cookie->parent->netfs_data, + cookie->netfs_data); + if (!tag) + goto no_preference; + + if (tag == ERR_PTR(-ENOMEM)) { + _leave(" = NULL [nomem tag]"); + return NULL; + } + + if (!tag->cache) { + _leave(" = NULL [unbacked tag]"); + return NULL; + } + + if (test_bit(FSCACHE_IOERROR, &tag->cache->flags)) + return NULL; + + _leave(" = %p [specific]", tag->cache); + return tag->cache; + +no_preference: + /* netfs has no preference - just select first cache */ + cache = list_entry(fscache_cache_list.next, + struct fscache_cache, link); + _leave(" = %p [first]", cache); + return cache; +} + +/** + * fscache_init_cache - Initialise a cache record + * @cache: The cache record to be initialised + * @ops: The cache operations to be installed in that record + * @idfmt: Format string to define identifier + * @...: sprintf-style arguments + * + * Initialise a record of a cache and fill in the name. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_init_cache(struct fscache_cache *cache, + const struct fscache_cache_ops *ops, + const char *idfmt, + ...) +{ + va_list va; + + memset(cache, 0, sizeof(*cache)); + + cache->ops = ops; + + va_start(va, idfmt); + vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va); + va_end(va); + + INIT_WORK(&cache->op_gc, fscache_operation_gc); + INIT_LIST_HEAD(&cache->link); + INIT_LIST_HEAD(&cache->object_list); + INIT_LIST_HEAD(&cache->op_gc_list); + spin_lock_init(&cache->object_list_lock); + spin_lock_init(&cache->op_gc_list_lock); +} +EXPORT_SYMBOL(fscache_init_cache); + +/** + * fscache_add_cache - Declare a cache as being open for business + * @cache: The record describing the cache + * @ifsdef: The record of the cache object describing the top-level index + * @tagname: The tag describing this cache + * + * Add a cache to the system, making it available for netfs's to use. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +int fscache_add_cache(struct fscache_cache *cache, + struct fscache_object *ifsdef, + const char *tagname) +{ + struct fscache_cache_tag *tag; + + BUG_ON(!cache->ops); + BUG_ON(!ifsdef); + + cache->flags = 0; + ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); + ifsdef->state = FSCACHE_OBJECT_ACTIVE; + + if (!tagname) + tagname = cache->identifier; + + BUG_ON(!tagname[0]); + + _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname); + + /* we use the cache tag to uniquely identify caches */ + tag = __fscache_lookup_cache_tag(tagname); + if (IS_ERR(tag)) + goto nomem; + + if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags)) + goto tag_in_use; + + cache->kobj = kobject_create_and_add(tagname, fscache_root); + if (!cache->kobj) + goto error; + + ifsdef->cookie = &fscache_fsdef_index; + ifsdef->cache = cache; + cache->fsdef = ifsdef; + + down_write(&fscache_addremove_sem); + + tag->cache = cache; + cache->tag = tag; + + /* add the cache to the list */ + list_add(&cache->link, &fscache_cache_list); + + /* add the cache's netfs definition index object to the cache's + * list */ + spin_lock(&cache->object_list_lock); + list_add_tail(&ifsdef->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + + /* add the cache's netfs definition index object to the top level index + * cookie as a known backing object */ + spin_lock(&fscache_fsdef_index.lock); + + hlist_add_head(&ifsdef->cookie_link, + &fscache_fsdef_index.backing_objects); + + atomic_inc(&fscache_fsdef_index.usage); + + /* done */ + spin_unlock(&fscache_fsdef_index.lock); + up_write(&fscache_addremove_sem); + + printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n", + cache->tag->name, cache->ops->name); + kobject_uevent(cache->kobj, KOBJ_ADD); + + _leave(" = 0 [%s]", cache->identifier); + return 0; + +tag_in_use: + printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname); + __fscache_release_cache_tag(tag); + _leave(" = -EXIST"); + return -EEXIST; + +error: + __fscache_release_cache_tag(tag); + _leave(" = -EINVAL"); + return -EINVAL; + +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(fscache_add_cache); + +/** + * fscache_io_error - Note a cache I/O error + * @cache: The record describing the cache + * + * Note that an I/O error occurred in a cache and that it should no longer be + * used for anything. This also reports the error into the kernel log. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_io_error(struct fscache_cache *cache) +{ + set_bit(FSCACHE_IOERROR, &cache->flags); + + printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n", + cache->ops->name); +} +EXPORT_SYMBOL(fscache_io_error); + +/* + * request withdrawal of all the objects in a cache + * - all the objects being withdrawn are moved onto the supplied list + */ +static void fscache_withdraw_all_objects(struct fscache_cache *cache, + struct list_head *dying_objects) +{ + struct fscache_object *object; + + spin_lock(&cache->object_list_lock); + + while (!list_empty(&cache->object_list)) { + object = list_entry(cache->object_list.next, + struct fscache_object, cache_link); + list_move_tail(&object->cache_link, dying_objects); + + _debug("withdraw %p", object->cookie); + + spin_lock(&object->lock); + spin_unlock(&cache->object_list_lock); + fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW); + spin_unlock(&object->lock); + + cond_resched(); + spin_lock(&cache->object_list_lock); + } + + spin_unlock(&cache->object_list_lock); +} + +/** + * fscache_withdraw_cache - Withdraw a cache from the active service + * @cache: The record describing the cache + * + * Withdraw a cache from service, unbinding all its cache objects from the + * netfs cookies they're currently representing. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_withdraw_cache(struct fscache_cache *cache) +{ + LIST_HEAD(dying_objects); + + _enter(""); + + printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n", + cache->tag->name); + + /* make the cache unavailable for cookie acquisition */ + if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) + BUG(); + + down_write(&fscache_addremove_sem); + list_del_init(&cache->link); + cache->tag->cache = NULL; + up_write(&fscache_addremove_sem); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disk */ + cache->ops->sync_cache(cache); + + /* dissociate all the netfs pages backed by this cache from the block + * mappings in the cache */ + cache->ops->dissociate_pages(cache); + + /* we now have to destroy all the active objects pertaining to this + * cache - which we do by passing them off to thread pool to be + * disposed of */ + _debug("destroy"); + + fscache_withdraw_all_objects(cache, &dying_objects); + + /* wait for all extant objects to finish their outstanding operations + * and go away */ + _debug("wait for finish"); + wait_event(fscache_cache_cleared_wq, + atomic_read(&cache->object_count) == 0); + _debug("wait for clearance"); + wait_event(fscache_cache_cleared_wq, + list_empty(&cache->object_list)); + _debug("cleared"); + ASSERT(list_empty(&dying_objects)); + + kobject_put(cache->kobj); + + clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags); + fscache_release_cache_tag(cache->tag); + cache->tag = NULL; + + _leave(""); +} +EXPORT_SYMBOL(fscache_withdraw_cache); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c new file mode 100644 index 00000000000..72fd18f6c71 --- /dev/null +++ b/fs/fscache/cookie.c @@ -0,0 +1,500 @@ +/* netfs cookie management + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/netfs-api.txt for more information on + * the netfs API. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include <linux/module.h> +#include <linux/slab.h> +#include "internal.h" + +struct kmem_cache *fscache_cookie_jar; + +static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); + +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie); +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie); +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object); + +/* + * initialise an cookie jar slab element prior to any use + */ +void fscache_cookie_init_once(void *_cookie) +{ + struct fscache_cookie *cookie = _cookie; + + memset(cookie, 0, sizeof(*cookie)); + spin_lock_init(&cookie->lock); + INIT_HLIST_HEAD(&cookie->backing_objects); +} + +/* + * request a cookie to represent an object (index, datafile, xattr, etc) + * - parent specifies the parent object + * - the top level index cookie for each netfs is stored in the fscache_netfs + * struct upon registration + * - def points to the definition + * - the netfs_data will be passed to the functions pointed to in *def + * - all attached caches will be searched to see if they contain this object + * - index objects aren't stored on disk until there's a dependent file that + * needs storing + * - other objects are stored in a selected cache immediately, and all the + * indices forming the path to it are instantiated if necessary + * - we never let on to the netfs about errors + * - we may set a negative cookie pointer, but that's okay + */ +struct fscache_cookie *__fscache_acquire_cookie( + struct fscache_cookie *parent, + const struct fscache_cookie_def *def, + void *netfs_data) +{ + struct fscache_cookie *cookie; + + BUG_ON(!def); + + _enter("{%s},{%s},%p", + parent ? (char *) parent->def->name : "<no-parent>", + def->name, netfs_data); + + fscache_stat(&fscache_n_acquires); + + /* if there's no parent cookie, then we don't create one here either */ + if (!parent) { + fscache_stat(&fscache_n_acquires_null); + _leave(" [no parent]"); + return NULL; + } + + /* validate the definition */ + BUG_ON(!def->get_key); + BUG_ON(!def->name[0]); + + BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && + parent->def->type != FSCACHE_COOKIE_TYPE_INDEX); + + /* allocate and initialise a cookie */ + cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); + if (!cookie) { + fscache_stat(&fscache_n_acquires_oom); + _leave(" [ENOMEM]"); + return NULL; + } + + atomic_set(&cookie->usage, 1); + atomic_set(&cookie->n_children, 0); + + atomic_inc(&parent->usage); + atomic_inc(&parent->n_children); + + cookie->def = def; + cookie->parent = parent; + cookie->netfs_data = netfs_data; + cookie->flags = 0; + + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS); + + switch (cookie->def->type) { + case FSCACHE_COOKIE_TYPE_INDEX: + fscache_stat(&fscache_n_cookie_index); + break; + case FSCACHE_COOKIE_TYPE_DATAFILE: + fscache_stat(&fscache_n_cookie_data); + break; + default: + fscache_stat(&fscache_n_cookie_special); + break; + } + + /* if the object is an index then we need do nothing more here - we + * create indices on disk when we need them as an index may exist in + * multiple caches */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie) < 0) { + atomic_dec(&parent->n_children); + __fscache_cookie_put(cookie); + fscache_stat(&fscache_n_acquires_nobufs); + _leave(" = NULL"); + return NULL; + } + } + + fscache_stat(&fscache_n_acquires_ok); + _leave(" = %p", cookie); + return cookie; +} +EXPORT_SYMBOL(__fscache_acquire_cookie); + +/* + * acquire a non-index cookie + * - this must make sure the index chain is instantiated and instantiate the + * object representation too + */ +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct fscache_cache *cache; + uint64_t i_size; + int ret; + + _enter(""); + + cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE; + + /* now we need to see whether the backing objects for this cookie yet + * exist, if not there'll be nothing to search */ + down_read(&fscache_addremove_sem); + + if (list_empty(&fscache_cache_list)) { + up_read(&fscache_addremove_sem); + _leave(" = 0 [no caches]"); + return 0; + } + + /* select a cache in which to store the object */ + cache = fscache_select_cache_for_object(cookie->parent); + if (!cache) { + up_read(&fscache_addremove_sem); + fscache_stat(&fscache_n_acquires_no_cache); + _leave(" = -ENOMEDIUM [no cache]"); + return -ENOMEDIUM; + } + + _debug("cache %s", cache->tag->name); + + cookie->flags = + (1 << FSCACHE_COOKIE_LOOKING_UP) | + (1 << FSCACHE_COOKIE_CREATING) | + (1 << FSCACHE_COOKIE_NO_DATA_YET); + + /* ask the cache to allocate objects for this cookie and its parent + * chain */ + ret = fscache_alloc_object(cache, cookie); + if (ret < 0) { + up_read(&fscache_addremove_sem); + _leave(" = %d", ret); + return ret; + } + + /* pass on how big the object we're caching is supposed to be */ + cookie->def->get_attr(cookie->netfs_data, &i_size); + + spin_lock(&cookie->lock); + if (hlist_empty(&cookie->backing_objects)) { + spin_unlock(&cookie->lock); + goto unavailable; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + fscache_set_store_limit(object, i_size); + + /* initiate the process of looking up all the objects in the chain + * (done by fscache_initialise_object()) */ + fscache_enqueue_object(object); + + spin_unlock(&cookie->lock); + + /* we may be required to wait for lookup to complete at this point */ + if (!fscache_defer_lookup) { + _debug("non-deferred lookup %p", &cookie->flags); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("complete"); + if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) + goto unavailable; + } + + up_read(&fscache_addremove_sem); + _leave(" = 0 [deferred]"); + return 0; + +unavailable: + up_read(&fscache_addremove_sem); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} + +/* + * recursively allocate cache object records for a cookie/cache combination + * - caller must be holding the addremove sem + */ +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_n; + int ret; + + _enter("%p,%p{%s}", cache, cookie, cookie->def->name); + + spin_lock(&cookie->lock); + hlist_for_each_entry(object, _n, &cookie->backing_objects, + cookie_link) { + if (object->cache == cache) + goto object_already_extant; + } + spin_unlock(&cookie->lock); + + /* ask the cache to allocate an object (we may end up with duplicate + * objects at this stage, but we sort that out later) */ + object = cache->ops->alloc_object(cache, cookie); + if (IS_ERR(object)) { + fscache_stat(&fscache_n_object_no_alloc); + ret = PTR_ERR(object); + goto error; + } + + fscache_stat(&fscache_n_object_alloc); + + object->debug_id = atomic_inc_return(&fscache_object_debug_id); + + _debug("ALLOC OBJ%x: %s {%lx}", + object->debug_id, cookie->def->name, object->events); + + ret = fscache_alloc_object(cache, cookie->parent); + if (ret < 0) + goto error_put; + + /* only attach if we managed to allocate all we needed, otherwise + * discard the object we just allocated and instead use the one + * attached to the cookie */ + if (fscache_attach_object(cookie, object) < 0) + cache->ops->put_object(object); + + _leave(" = 0"); + return 0; + +object_already_extant: + ret = -ENOBUFS; + if (object->state >= FSCACHE_OBJECT_DYING) { + spin_unlock(&cookie->lock); + goto error; + } + spin_unlock(&cookie->lock); + _leave(" = 0 [found]"); + return 0; + +error_put: + cache->ops->put_object(object); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * attach a cache object to a cookie + */ +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object) +{ + struct fscache_object *p; + struct fscache_cache *cache = object->cache; + struct hlist_node *_n; + int ret; + + _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); + + spin_lock(&cookie->lock); + + /* there may be multiple initial creations of this object, but we only + * want one */ + ret = -EEXIST; + hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) + ret = -ENOBUFS; + goto cant_attach_object; + } + } + + /* pin the parent object */ + spin_lock_nested(&cookie->parent->lock, 1); + hlist_for_each_entry(p, _n, &cookie->parent->backing_objects, + cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) { + ret = -ENOBUFS; + spin_unlock(&cookie->parent->lock); + goto cant_attach_object; + } + object->parent = p; + spin_lock(&p->lock); + p->n_children++; + spin_unlock(&p->lock); + break; + } + } + spin_unlock(&cookie->parent->lock); + + /* attach to the cache's object list */ + if (list_empty(&object->cache_link)) { + spin_lock(&cache->object_list_lock); + list_add(&object->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + } + + /* attach to the cookie */ + object->cookie = cookie; + atomic_inc(&cookie->usage); + hlist_add_head(&object->cookie_link, &cookie->backing_objects); + ret = 0; + +cant_attach_object: + spin_unlock(&cookie->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * update the index entries backing a cookie + */ +void __fscache_update_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_p; + + fscache_stat(&fscache_n_updates); + + if (!cookie) { + fscache_stat(&fscache_n_updates_null); + _leave(" [no cookie]"); + return; + } + + _enter("{%s}", cookie->def->name); + + BUG_ON(!cookie->def->get_aux); + + spin_lock(&cookie->lock); + + /* update the index entry on disk in each cache backing this cookie */ + hlist_for_each_entry(object, _p, + &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + } + + spin_unlock(&cookie->lock); + _leave(""); +} +EXPORT_SYMBOL(__fscache_update_cookie); + +/* + * release a cookie back to the cache + * - the object will be marked as recyclable on disk if retire is true + * - all dependents of this cookie must have already been unregistered + * (indices/files/pages) + */ +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +{ + struct fscache_cache *cache; + struct fscache_object *object; + unsigned long event; + + fscache_stat(&fscache_n_relinquishes); + + if (!cookie) { + fscache_stat(&fscache_n_relinquishes_null); + _leave(" [no cookie]"); + return; + } + + _enter("%p{%s,%p},%d", + cookie, cookie->def->name, cookie->netfs_data, retire); + + if (atomic_read(&cookie->n_children) != 0) { + printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", + cookie->def->name); + BUG(); + } + + /* wait for the cookie to finish being instantiated (or to fail) */ + if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) { + fscache_stat(&fscache_n_relinquishes_waitcrt); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } + + event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; + + /* detach pointers back to the netfs */ + spin_lock(&cookie->lock); + + cookie->netfs_data = NULL; + cookie->def = NULL; + + /* break links with all the active objects */ + while (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, + cookie_link); + + _debug("RELEASE OBJ%x", object->debug_id); + + /* detach each cache object from the object cookie */ + spin_lock(&object->lock); + hlist_del_init(&object->cookie_link); + + cache = object->cache; + object->cookie = NULL; + fscache_raise_event(object, event); + spin_unlock(&object->lock); + + if (atomic_dec_and_test(&cookie->usage)) + /* the cookie refcount shouldn't be reduced to 0 yet */ + BUG(); + } + + spin_unlock(&cookie->lock); + + if (cookie->parent) { + ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); + ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); + atomic_dec(&cookie->parent->n_children); + } + + /* finally dispose of the cookie */ + ASSERTCMP(atomic_read(&cookie->usage), >, 0); + fscache_cookie_put(cookie); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_relinquish_cookie); + +/* + * destroy a cookie + */ +void __fscache_cookie_put(struct fscache_cookie *cookie) +{ + struct fscache_cookie *parent; + + _enter("%p", cookie); + + for (;;) { + _debug("FREE COOKIE %p", cookie); + parent = cookie->parent; + BUG_ON(!hlist_empty(&cookie->backing_objects)); + kmem_cache_free(fscache_cookie_jar, cookie); + + if (!parent) + break; + + cookie = parent; + BUG_ON(atomic_read(&cookie->usage) <= 0); + if (!atomic_dec_and_test(&cookie->usage)) + break; + } + + _leave(""); +} diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c new file mode 100644 index 00000000000..f5b4baee735 --- /dev/null +++ b/fs/fscache/fsdef.c @@ -0,0 +1,144 @@ +/* Filesystem index definition + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include <linux/module.h> +#include "internal.h" + +static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax); + +static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax); + +static +enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen); + +/* + * The root index is owned by FS-Cache itself. + * + * When a netfs requests caching facilities, FS-Cache will, if one doesn't + * already exist, create an entry in the root index with the key being the name + * of the netfs ("AFS" for example), and the auxiliary data holding the index + * structure version supplied by the netfs: + * + * FSDEF + * | + * +-----------+ + * | | + * NFS AFS + * [v=1] [v=1] + * + * If an entry with the appropriate name does already exist, the version is + * compared. If the version is different, the entire subtree from that entry + * will be discarded and a new entry created. + * + * The new entry will be an index, and a cookie referring to it will be passed + * to the netfs. This is then the root handle by which the netfs accesses the + * cache. It can create whatever objects it likes in that index, including + * further indices. + */ +static struct fscache_cookie_def fscache_fsdef_index_def = { + .name = ".FS-Cache", + .type = FSCACHE_COOKIE_TYPE_INDEX, +}; + +struct fscache_cookie fscache_fsdef_index = { + .usage = ATOMIC_INIT(1), + .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), + .backing_objects = HLIST_HEAD_INIT, + .def = &fscache_fsdef_index_def, +}; +EXPORT_SYMBOL(fscache_fsdef_index); + +/* + * Definition of an entry in the root index. Each entry is an index, keyed to + * a specific netfs and only applicable to a particular version of the index + * structure used by that netfs. + */ +struct fscache_cookie_def fscache_fsdef_netfs_def = { + .name = "FSDEF.netfs", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = fscache_fsdef_netfs_get_key, + .get_aux = fscache_fsdef_netfs_get_aux, + .check_aux = fscache_fsdef_netfs_check_aux, +}; + +/* + * get the key data for an FSDEF index record - this is the name of the netfs + * for which this entry is created + */ +static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct fscache_netfs *netfs = cookie_netfs_data; + unsigned klen; + + _enter("{%s.%u},", netfs->name, netfs->version); + + klen = strlen(netfs->name); + if (klen > bufmax) + return 0; + + memcpy(buffer, netfs->name, klen); + return klen; +} + +/* + * get the auxiliary data for an FSDEF index record - this is the index + * structure version number of the netfs for which this version is created + */ +static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct fscache_netfs *netfs = cookie_netfs_data; + unsigned dlen; + + _enter("{%s.%u},", netfs->name, netfs->version); + + dlen = sizeof(uint32_t); + if (dlen > bufmax) + return 0; + + memcpy(buffer, &netfs->version, dlen); + return dlen; +} + +/* + * check that the index structure version number stored in the auxiliary data + * matches the one the netfs gave us + */ +static enum fscache_checkaux fscache_fsdef_netfs_check_aux( + void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct fscache_netfs *netfs = cookie_netfs_data; + uint32_t version; + + _enter("{%s},,%hu", netfs->name, datalen); + + if (datalen != sizeof(version)) { + _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version)); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + memcpy(&version, data, sizeof(version)); + if (version != netfs->version) { + _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; +} diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c new file mode 100644 index 00000000000..bad496748a5 --- /dev/null +++ b/fs/fscache/histogram.c @@ -0,0 +1,109 @@ +/* FS-Cache latency histogram + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "internal.h" + +atomic_t fscache_obj_instantiate_histogram[HZ]; +atomic_t fscache_objs_histogram[HZ]; +atomic_t fscache_ops_histogram[HZ]; +atomic_t fscache_retrieval_delay_histogram[HZ]; +atomic_t fscache_retrieval_histogram[HZ]; + +/* + * display the time-taken histogram + */ +static int fscache_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned n[5], t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS " + " RETRV DLY RETRIEVLS\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= =========" + " ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]); + n[1] = atomic_read(&fscache_ops_histogram[index]); + n[2] = atomic_read(&fscache_objs_histogram[index]); + n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]); + n[4] = atomic_read(&fscache_retrieval_histogram[index]); + if (!(n[0] | n[1] | n[2] | n[3] | n[4])) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n", + index, t, n[0], n[1], n[2], n[3], n[4]); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? + NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void fscache_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations fscache_histogram_ops = { + .start = fscache_histogram_start, + .stop = fscache_histogram_stop, + .next = fscache_histogram_next, + .show = fscache_histogram_show, +}; + +/* + * open "/proc/fs/fscache/histogram" to provide latency data + */ +static int fscache_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fscache_histogram_ops); +} + +const struct file_operations fscache_histogram_fops = { + .owner = THIS_MODULE, + .open = fscache_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h new file mode 100644 index 00000000000..e0cbd16f6dc --- /dev/null +++ b/fs/fscache/internal.h @@ -0,0 +1,380 @@ +/* Internal definitions for FS-Cache + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Lock order, in the order in which multiple locks should be obtained: + * - fscache_addremove_sem + * - cookie->lock + * - cookie->parent->lock + * - cache->object_list_lock + * - object->lock + * - object->parent->lock + * - fscache_thread_lock + * + */ + +#include <linux/fscache-cache.h> +#include <linux/sched.h> + +#define FSCACHE_MIN_THREADS 4 +#define FSCACHE_MAX_THREADS 32 + +/* + * fsc-cache.c + */ +extern struct list_head fscache_cache_list; +extern struct rw_semaphore fscache_addremove_sem; + +extern struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *); + +/* + * fsc-cookie.c + */ +extern struct kmem_cache *fscache_cookie_jar; + +extern void fscache_cookie_init_once(void *); +extern void __fscache_cookie_put(struct fscache_cookie *); + +/* + * fsc-fsdef.c + */ +extern struct fscache_cookie fscache_fsdef_index; +extern struct fscache_cookie_def fscache_fsdef_netfs_def; + +/* + * fsc-histogram.c + */ +#ifdef CONFIG_FSCACHE_HISTOGRAM +extern atomic_t fscache_obj_instantiate_histogram[HZ]; +extern atomic_t fscache_objs_histogram[HZ]; +extern atomic_t fscache_ops_histogram[HZ]; +extern atomic_t fscache_retrieval_delay_histogram[HZ]; +extern atomic_t fscache_retrieval_histogram[HZ]; + +static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +extern const struct file_operations fscache_histogram_fops; + +#else +#define fscache_hist(hist, start_jif) do {} while (0) +#endif + +/* + * fsc-main.c + */ +extern unsigned fscache_defer_lookup; +extern unsigned fscache_defer_create; +extern unsigned fscache_debug; +extern struct kobject *fscache_root; + +extern int fscache_wait_bit(void *); +extern int fscache_wait_bit_interruptible(void *); + +/* + * fsc-object.c + */ +extern void fscache_withdrawing_object(struct fscache_cache *, + struct fscache_object *); +extern void fscache_enqueue_object(struct fscache_object *); + +/* + * fsc-operation.c + */ +extern int fscache_submit_exclusive_op(struct fscache_object *, + struct fscache_operation *); +extern int fscache_submit_op(struct fscache_object *, + struct fscache_operation *); +extern void fscache_abort_object(struct fscache_object *); +extern void fscache_start_operations(struct fscache_object *); +extern void fscache_operation_gc(struct work_struct *); + +/* + * fsc-proc.c + */ +#ifdef CONFIG_PROC_FS +extern int __init fscache_proc_init(void); +extern void fscache_proc_cleanup(void); +#else +#define fscache_proc_init() (0) +#define fscache_proc_cleanup() do {} while (0) +#endif + +/* + * fsc-stats.c + */ +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS]; +extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS]; + +extern atomic_t fscache_n_op_pend; +extern atomic_t fscache_n_op_run; +extern atomic_t fscache_n_op_enqueue; +extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_release; +extern atomic_t fscache_n_op_gc; + +extern atomic_t fscache_n_attr_changed; +extern atomic_t fscache_n_attr_changed_ok; +extern atomic_t fscache_n_attr_changed_nobufs; +extern atomic_t fscache_n_attr_changed_nomem; +extern atomic_t fscache_n_attr_changed_calls; + +extern atomic_t fscache_n_allocs; +extern atomic_t fscache_n_allocs_ok; +extern atomic_t fscache_n_allocs_wait; +extern atomic_t fscache_n_allocs_nobufs; +extern atomic_t fscache_n_alloc_ops; +extern atomic_t fscache_n_alloc_op_waits; + +extern atomic_t fscache_n_retrievals; +extern atomic_t fscache_n_retrievals_ok; +extern atomic_t fscache_n_retrievals_wait; +extern atomic_t fscache_n_retrievals_nodata; +extern atomic_t fscache_n_retrievals_nobufs; +extern atomic_t fscache_n_retrievals_intr; +extern atomic_t fscache_n_retrievals_nomem; +extern atomic_t fscache_n_retrieval_ops; +extern atomic_t fscache_n_retrieval_op_waits; + +extern atomic_t fscache_n_stores; +extern atomic_t fscache_n_stores_ok; +extern atomic_t fscache_n_stores_again; +extern atomic_t fscache_n_stores_nobufs; +extern atomic_t fscache_n_stores_oom; +extern atomic_t fscache_n_store_ops; +extern atomic_t fscache_n_store_calls; + +extern atomic_t fscache_n_marks; +extern atomic_t fscache_n_uncaches; + +extern atomic_t fscache_n_acquires; +extern atomic_t fscache_n_acquires_null; +extern atomic_t fscache_n_acquires_no_cache; +extern atomic_t fscache_n_acquires_ok; +extern atomic_t fscache_n_acquires_nobufs; +extern atomic_t fscache_n_acquires_oom; + +extern atomic_t fscache_n_updates; +extern atomic_t fscache_n_updates_null; +extern atomic_t fscache_n_updates_run; + +extern atomic_t fscache_n_relinquishes; +extern atomic_t fscache_n_relinquishes_null; +extern atomic_t fscache_n_relinquishes_waitcrt; + +extern atomic_t fscache_n_cookie_index; +extern atomic_t fscache_n_cookie_data; +extern atomic_t fscache_n_cookie_special; + +extern atomic_t fscache_n_object_alloc; +extern atomic_t fscache_n_object_no_alloc; +extern atomic_t fscache_n_object_lookups; +extern atomic_t fscache_n_object_lookups_negative; +extern atomic_t fscache_n_object_lookups_positive; +extern atomic_t fscache_n_object_created; +extern atomic_t fscache_n_object_avail; +extern atomic_t fscache_n_object_dead; + +extern atomic_t fscache_n_checkaux_none; +extern atomic_t fscache_n_checkaux_okay; +extern atomic_t fscache_n_checkaux_update; +extern atomic_t fscache_n_checkaux_obsolete; + +static inline void fscache_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +extern const struct file_operations fscache_stats_fops; +#else + +#define fscache_stat(stat) do {} while (0) +#endif + +/* + * raise an event on an object + * - if the event is not masked for that object, then the object is + * queued for attention by the thread pool. + */ +static inline void fscache_raise_event(struct fscache_object *object, + unsigned event) +{ + if (!test_and_set_bit(event, &object->events) && + test_bit(event, &object->event_mask)) + fscache_enqueue_object(object); +} + +/* + * drop a reference to a cookie + */ +static inline void fscache_cookie_put(struct fscache_cookie *cookie) +{ + BUG_ON(atomic_read(&cookie->usage) <= 0); + if (atomic_dec_and_test(&cookie->usage)) + __fscache_cookie_put(cookie); +} + +/* + * get an extra reference to a netfs retrieval context + */ +static inline +void *fscache_get_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->get_context) + cookie->def->get_context(cookie->netfs_data, context); + return context; +} + +/* + * release a reference to a netfs retrieval context + */ +static inline +void fscache_put_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->put_context) + cookie->def->put_context(cookie->netfs_data, context); +} + +/*****************************************************************************/ +/* + * debug tracing + */ +#define dbgprintk(FMT, ...) \ + printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + +/* make sure we maintain the format strings, even when debugging is disabled */ +static inline __attribute__((format(printf, 1, 2))) +void _dbprintk(const char *fmt, ...) +{ +} + +#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) + +#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) + +#ifdef __KDEBUG +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_FSCACHE_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (__do_kdebug(ENTER)) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (__do_kdebug(LEAVE)) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (__do_kdebug(DEBUG)) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) +#endif + +/* + * determine whether a particular optional debugging point should be logged + * - we need to go through three steps to persuade cpp to correctly join the + * shorthand in FSCACHE_DEBUG_LEVEL with its prefix + */ +#define ____do_kdebug(LEVEL, POINT) \ + unlikely((fscache_debug & \ + (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3)))) +#define ___do_kdebug(LEVEL, POINT) \ + ____do_kdebug(LEVEL, POINT) +#define __do_kdebug(POINT) \ + ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT) + +#define FSCACHE_DEBUG_CACHE 0 +#define FSCACHE_DEBUG_COOKIE 1 +#define FSCACHE_DEBUG_PAGE 2 +#define FSCACHE_DEBUG_OPERATION 3 + +#define FSCACHE_POINT_ENTER 1 +#define FSCACHE_POINT_LEAVE 2 +#define FSCACHE_POINT_DEBUG 4 + +#ifndef FSCACHE_DEBUG_LEVEL +#define FSCACHE_DEBUG_LEVEL CACHE +#endif + +/* + * assertions + */ +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif /* assert or not */ diff --git a/fs/fscache/main.c b/fs/fscache/main.c new file mode 100644 index 00000000000..4de41b59749 --- /dev/null +++ b/fs/fscache/main.c @@ -0,0 +1,124 @@ +/* General filesystem local caching manager + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/completion.h> +#include <linux/slab.h> +#include "internal.h" + +MODULE_DESCRIPTION("FS Cache Manager"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +unsigned fscache_defer_lookup = 1; +module_param_named(defer_lookup, fscache_defer_lookup, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_defer_lookup, + "Defer cookie lookup to background thread"); + +unsigned fscache_defer_create = 1; +module_param_named(defer_create, fscache_defer_create, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_defer_create, + "Defer cookie creation to background thread"); + +unsigned fscache_debug; +module_param_named(debug, fscache_debug, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_debug, + "FS-Cache debugging mask"); + +struct kobject *fscache_root; + +/* + * initialise the fs caching module + */ +static int __init fscache_init(void) +{ + int ret; + + ret = slow_work_register_user(); + if (ret < 0) + goto error_slow_work; + + ret = fscache_proc_init(); + if (ret < 0) + goto error_proc; + + fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", + sizeof(struct fscache_cookie), + 0, + 0, + fscache_cookie_init_once); + if (!fscache_cookie_jar) { + printk(KERN_NOTICE + "FS-Cache: Failed to allocate a cookie jar\n"); + ret = -ENOMEM; + goto error_cookie_jar; + } + + fscache_root = kobject_create_and_add("fscache", kernel_kobj); + if (!fscache_root) + goto error_kobj; + + printk(KERN_NOTICE "FS-Cache: Loaded\n"); + return 0; + +error_kobj: + kmem_cache_destroy(fscache_cookie_jar); +error_cookie_jar: + fscache_proc_cleanup(); +error_proc: + slow_work_unregister_user(); +error_slow_work: + return ret; +} + +fs_initcall(fscache_init); + +/* + * clean up on module removal + */ +static void __exit fscache_exit(void) +{ + _enter(""); + + kobject_put(fscache_root); + kmem_cache_destroy(fscache_cookie_jar); + fscache_proc_cleanup(); + slow_work_unregister_user(); + printk(KERN_NOTICE "FS-Cache: Unloaded\n"); +} + +module_exit(fscache_exit); + +/* + * wait_on_bit() sleep function for uninterruptible waiting + */ +int fscache_wait_bit(void *flags) +{ + schedule(); + return 0; +} +EXPORT_SYMBOL(fscache_wait_bit); + +/* + * wait_on_bit() sleep function for interruptible waiting + */ +int fscache_wait_bit_interruptible(void *flags) +{ + schedule(); + return signal_pending(current); +} +EXPORT_SYMBOL(fscache_wait_bit_interruptible); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c new file mode 100644 index 00000000000..e028b8eb1c4 --- /dev/null +++ b/fs/fscache/netfs.c @@ -0,0 +1,103 @@ +/* FS-Cache netfs (client) registration + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include <linux/module.h> +#include <linux/slab.h> +#include "internal.h" + +static LIST_HEAD(fscache_netfs_list); + +/* + * register a network filesystem for caching + */ +int __fscache_register_netfs(struct fscache_netfs *netfs) +{ + struct fscache_netfs *ptr; + int ret; + + _enter("{%s}", netfs->name); + + INIT_LIST_HEAD(&netfs->link); + + /* allocate a cookie for the primary index */ + netfs->primary_index = + kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); + + if (!netfs->primary_index) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* initialise the primary index cookie */ + atomic_set(&netfs->primary_index->usage, 1); + atomic_set(&netfs->primary_index->n_children, 0); + + netfs->primary_index->def = &fscache_fsdef_netfs_def; + netfs->primary_index->parent = &fscache_fsdef_index; + netfs->primary_index->netfs_data = netfs; + + atomic_inc(&netfs->primary_index->parent->usage); + atomic_inc(&netfs->primary_index->parent->n_children); + + spin_lock_init(&netfs->primary_index->lock); + INIT_HLIST_HEAD(&netfs->primary_index->backing_objects); + + /* check the netfs type is not already present */ + down_write(&fscache_addremove_sem); + + ret = -EEXIST; + list_for_each_entry(ptr, &fscache_netfs_list, link) { + if (strcmp(ptr->name, netfs->name) == 0) + goto already_registered; + } + + list_add(&netfs->link, &fscache_netfs_list); + ret = 0; + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n", + netfs->name); + +already_registered: + up_write(&fscache_addremove_sem); + + if (ret < 0) { + netfs->primary_index->parent = NULL; + __fscache_cookie_put(netfs->primary_index); + netfs->primary_index = NULL; + } + + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(__fscache_register_netfs); + +/* + * unregister a network filesystem from the cache + * - all cookies must have been released first + */ +void __fscache_unregister_netfs(struct fscache_netfs *netfs) +{ + _enter("{%s.%u}", netfs->name, netfs->version); + + down_write(&fscache_addremove_sem); + + list_del(&netfs->link); + fscache_relinquish_cookie(netfs->primary_index, 0); + + up_write(&fscache_addremove_sem); + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n", + netfs->name); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_unregister_netfs); diff --git a/fs/fscache/object.c b/fs/fscache/object.c new file mode 100644 index 00000000000..392a41b1b79 --- /dev/null +++ b/fs/fscache/object.c @@ -0,0 +1,810 @@ +/* FS-Cache object state machine handler + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/object.txt for a description of the + * object state machine and the in-kernel representations. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include <linux/module.h> +#include "internal.h" + +const char *fscache_object_states[] = { + [FSCACHE_OBJECT_INIT] = "OBJECT_INIT", + [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP", + [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", + [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE", + [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE", + [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING", + [FSCACHE_OBJECT_DYING] = "OBJECT_DYING", + [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING", + [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT", + [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING", + [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING", + [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING", + [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD", +}; +EXPORT_SYMBOL(fscache_object_states); + +static void fscache_object_slow_work_put_ref(struct slow_work *); +static int fscache_object_slow_work_get_ref(struct slow_work *); +static void fscache_object_slow_work_execute(struct slow_work *); +static void fscache_initialise_object(struct fscache_object *); +static void fscache_lookup_object(struct fscache_object *); +static void fscache_object_available(struct fscache_object *); +static void fscache_release_object(struct fscache_object *); +static void fscache_withdraw_object(struct fscache_object *); +static void fscache_enqueue_dependents(struct fscache_object *); +static void fscache_dequeue_object(struct fscache_object *); + +const struct slow_work_ops fscache_object_slow_work_ops = { + .get_ref = fscache_object_slow_work_get_ref, + .put_ref = fscache_object_slow_work_put_ref, + .execute = fscache_object_slow_work_execute, +}; +EXPORT_SYMBOL(fscache_object_slow_work_ops); + +/* + * we need to notify the parent when an op completes that we had outstanding + * upon it + */ +static inline void fscache_done_parent_op(struct fscache_object *object) +{ + struct fscache_object *parent = object->parent; + + _enter("OBJ%x {OBJ%x,%x}", + object->debug_id, parent->debug_id, parent->n_ops); + + spin_lock_nested(&parent->lock, 1); + parent->n_ops--; + parent->n_obj_ops--; + if (parent->n_ops == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); +} + +/* + * process events that have been sent to an object's state machine + * - initiates parent lookup + * - does object lookup + * - does object creation + * - does object recycling and retirement + * - does object withdrawal + */ +static void fscache_object_state_machine(struct fscache_object *object) +{ + enum fscache_object_state new_state; + + ASSERT(object != NULL); + + _enter("{OBJ%x,%s,%lx}", + object->debug_id, fscache_object_states[object->state], + object->events); + + switch (object->state) { + /* wait for the parent object to become ready */ + case FSCACHE_OBJECT_INIT: + object->event_mask = + ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); + fscache_initialise_object(object); + goto done; + + /* look up the object metadata on disk */ + case FSCACHE_OBJECT_LOOKING_UP: + fscache_lookup_object(object); + goto lookup_transit; + + /* create the object metadata on disk */ + case FSCACHE_OBJECT_CREATING: + fscache_lookup_object(object); + goto lookup_transit; + + /* handle an object becoming available; start pending + * operations and queue dependent operations for processing */ + case FSCACHE_OBJECT_AVAILABLE: + fscache_object_available(object); + goto active_transit; + + /* normal running state */ + case FSCACHE_OBJECT_ACTIVE: + goto active_transit; + + /* update the object metadata on disk */ + case FSCACHE_OBJECT_UPDATING: + clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); + fscache_stat(&fscache_n_updates_run); + object->cache->ops->update_object(object); + goto active_transit; + + /* handle an object dying during lookup or creation */ + case FSCACHE_OBJECT_LC_DYING: + object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); + object->cache->ops->lookup_complete(object); + + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DYING; + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + &object->cookie->flags)) + wake_up_bit(&object->cookie->flags, + FSCACHE_COOKIE_CREATING); + spin_unlock(&object->lock); + + fscache_done_parent_op(object); + + /* wait for completion of all active operations on this object + * and the death of all child objects of this object */ + case FSCACHE_OBJECT_DYING: + dying: + clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events); + spin_lock(&object->lock); + _debug("dying OBJ%x {%d,%d}", + object->debug_id, object->n_ops, object->n_children); + if (object->n_ops == 0 && object->n_children == 0) { + object->event_mask &= + ~(1 << FSCACHE_OBJECT_EV_CLEARED); + object->event_mask |= + (1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR); + } else { + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR)); + object->event_mask |= + 1 << FSCACHE_OBJECT_EV_CLEARED; + } + spin_unlock(&object->lock); + fscache_enqueue_dependents(object); + goto terminal_transit; + + /* handle an abort during initialisation */ + case FSCACHE_OBJECT_ABORT_INIT: + _debug("handle abort init %lx", object->events); + object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); + + spin_lock(&object->lock); + fscache_dequeue_object(object); + + object->state = FSCACHE_OBJECT_DYING; + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + &object->cookie->flags)) + wake_up_bit(&object->cookie->flags, + FSCACHE_COOKIE_CREATING); + spin_unlock(&object->lock); + goto dying; + + /* handle the netfs releasing an object and possibly marking it + * obsolete too */ + case FSCACHE_OBJECT_RELEASING: + case FSCACHE_OBJECT_RECYCLING: + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR)); + fscache_release_object(object); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DEAD; + spin_unlock(&object->lock); + fscache_stat(&fscache_n_object_dead); + goto terminal_transit; + + /* handle the parent cache of this object being withdrawn from + * active service */ + case FSCACHE_OBJECT_WITHDRAWING: + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR)); + fscache_withdraw_object(object); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DEAD; + spin_unlock(&object->lock); + fscache_stat(&fscache_n_object_dead); + goto terminal_transit; + + /* complain about the object being woken up once it is + * deceased */ + case FSCACHE_OBJECT_DEAD: + printk(KERN_ERR "FS-Cache:" + " Unexpected event in dead state %lx\n", + object->events & object->event_mask); + BUG(); + + default: + printk(KERN_ERR "FS-Cache: Unknown object state %u\n", + object->state); + BUG(); + } + + /* determine the transition from a lookup state */ +lookup_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + case FSCACHE_OBJECT_EV_RETIRE: + case FSCACHE_OBJECT_EV_RELEASE: + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_LC_DYING; + goto change_state; + case FSCACHE_OBJECT_EV_REQUEUE: + goto done; + case -1: + goto done; /* sleep until event */ + default: + goto unsupported_event; + } + + /* determine the transition from an active state */ +active_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + case FSCACHE_OBJECT_EV_RETIRE: + case FSCACHE_OBJECT_EV_RELEASE: + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_DYING; + goto change_state; + case FSCACHE_OBJECT_EV_UPDATE: + new_state = FSCACHE_OBJECT_UPDATING; + goto change_state; + case -1: + new_state = FSCACHE_OBJECT_ACTIVE; + goto change_state; /* sleep until event */ + default: + goto unsupported_event; + } + + /* determine the transition from a terminal state */ +terminal_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + new_state = FSCACHE_OBJECT_WITHDRAWING; + goto change_state; + case FSCACHE_OBJECT_EV_RETIRE: + new_state = FSCACHE_OBJECT_RECYCLING; + goto change_state; + case FSCACHE_OBJECT_EV_RELEASE: + new_state = FSCACHE_OBJECT_RELEASING; + goto change_state; + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_WITHDRAWING; + goto change_state; + case FSCACHE_OBJECT_EV_CLEARED: + new_state = FSCACHE_OBJECT_DYING; + goto change_state; + case -1: + goto done; /* sleep until event */ + default: + goto unsupported_event; + } + +change_state: + spin_lock(&object->lock); + object->state = new_state; + spin_unlock(&object->lock); + +done: + _leave(" [->%s]", fscache_object_states[object->state]); + return; + +unsupported_event: + printk(KERN_ERR "FS-Cache:" + " Unsupported event %lx [mask %lx] in state %s\n", + object->events, object->event_mask, + fscache_object_states[object->state]); + BUG(); +} + +/* + * execute an object + */ +static void fscache_object_slow_work_execute(struct slow_work *work) +{ + struct fscache_object *object = + container_of(work, struct fscache_object, work); + unsigned long start; + + _enter("{OBJ%x}", object->debug_id); + + clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + + start = jiffies; + fscache_object_state_machine(object); + fscache_hist(fscache_objs_histogram, start); + if (object->events & object->event_mask) + fscache_enqueue_object(object); +} + +/* + * initialise an object + * - check the specified object's parent to see if we can make use of it + * immediately to do a creation + * - we may need to start the process of creating a parent and we need to wait + * for the parent's lookup and creation to complete if it's not there yet + * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the + * leaf-most cookies of the object and all its children + */ +static void fscache_initialise_object(struct fscache_object *object) +{ + struct fscache_object *parent; + + _enter(""); + ASSERT(object->cookie != NULL); + ASSERT(object->cookie->parent != NULL); + ASSERT(list_empty(&object->work.link)); + + if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_WITHDRAW))) { + _debug("abort init %lx", object->events); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_ABORT_INIT; + spin_unlock(&object->lock); + return; + } + + spin_lock(&object->cookie->lock); + spin_lock_nested(&object->cookie->parent->lock, 1); + + parent = object->parent; + if (!parent) { + _debug("no parent"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + } else { + spin_lock(&object->lock); + spin_lock_nested(&parent->lock, 1); + _debug("parent %s", fscache_object_states[parent->state]); + + if (parent->state >= FSCACHE_OBJECT_DYING) { + _debug("bad parent"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) { + _debug("wait"); + + /* we may get woken up in this state by child objects + * binding on to us, so we need to make sure we don't + * add ourself to the list multiple times */ + if (list_empty(&object->dep_link)) { + object->cache->ops->grab_object(object); + list_add(&object->dep_link, + &parent->dependents); + + /* fscache_acquire_non_index_cookie() uses this + * to wake the chain up */ + if (parent->state == FSCACHE_OBJECT_INIT) + fscache_enqueue_object(parent); + } + } else { + _debug("go"); + parent->n_ops++; + parent->n_obj_ops++; + object->lookup_jif = jiffies; + object->state = FSCACHE_OBJECT_LOOKING_UP; + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } + + spin_unlock(&parent->lock); + spin_unlock(&object->lock); + } + + spin_unlock(&object->cookie->parent->lock); + spin_unlock(&object->cookie->lock); + _leave(""); +} + +/* + * look an object up in the cache from which it was allocated + * - we hold an "access lock" on the parent object, so the parent object cannot + * be withdrawn by either party till we've finished + * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the + * leaf-most cookies of the object and all its children + */ +static void fscache_lookup_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + struct fscache_object *parent; + + _enter(""); + + parent = object->parent; + ASSERT(parent != NULL); + ASSERTCMP(parent->n_ops, >, 0); + ASSERTCMP(parent->n_obj_ops, >, 0); + + /* make sure the parent is still available */ + ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE); + + if (parent->state >= FSCACHE_OBJECT_DYING || + test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + _debug("unavailable"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + _leave(""); + return; + } + + _debug("LOOKUP \"%s/%s\" in \"%s\"", + parent->cookie->def->name, cookie->def->name, + object->cache->tag->name); + + fscache_stat(&fscache_n_object_lookups); + object->cache->ops->lookup_object(object); + + if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) + set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + + _leave(""); +} + +/** + * fscache_object_lookup_negative - Note negative cookie lookup + * @object: Object pointing to cookie to mark + * + * Note negative lookup, permitting those waiting to read data from an already + * existing backing object to continue as there's no data for them to read. + */ +void fscache_object_lookup_negative(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x,%s}", + object->debug_id, fscache_object_states[object->state]); + + spin_lock(&object->lock); + if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + fscache_stat(&fscache_n_object_lookups_negative); + + /* transit here to allow write requests to begin stacking up + * and read requests to begin returning ENODATA */ + object->state = FSCACHE_OBJECT_CREATING; + spin_unlock(&object->lock); + + set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags); + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + _debug("wake up lookup %p", &cookie->flags); + smp_mb__before_clear_bit(); + clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } else { + ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); + spin_unlock(&object->lock); + } + + _leave(""); +} +EXPORT_SYMBOL(fscache_object_lookup_negative); + +/** + * fscache_obtained_object - Note successful object lookup or creation + * @object: Object pointing to cookie to mark + * + * Note successful lookup and/or creation, permitting those waiting to write + * data to a backing object to continue. + * + * Note that after calling this, an object's cookie may be relinquished by the + * netfs, and so must be accessed with object lock held. + */ +void fscache_obtained_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x,%s}", + object->debug_id, fscache_object_states[object->state]); + + /* if we were still looking up, then we must have a positive lookup + * result, in which case there may be data available */ + spin_lock(&object->lock); + if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + fscache_stat(&fscache_n_object_lookups_positive); + + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + object->state = FSCACHE_OBJECT_AVAILABLE; + spin_unlock(&object->lock); + + smp_mb__before_clear_bit(); + clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } else { + ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); + fscache_stat(&fscache_n_object_created); + + object->state = FSCACHE_OBJECT_AVAILABLE; + spin_unlock(&object->lock); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + smp_wmb(); + } + + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING); + + _leave(""); +} +EXPORT_SYMBOL(fscache_obtained_object); + +/* + * handle an object that has just become available + */ +static void fscache_object_available(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + spin_lock(&object->lock); + + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) + wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); + + fscache_done_parent_op(object); + if (object->n_in_progress == 0) { + if (object->n_ops > 0) { + ASSERTCMP(object->n_ops, >=, object->n_obj_ops); + ASSERTIF(object->n_ops > object->n_obj_ops, + !list_empty(&object->pending_ops)); + fscache_start_operations(object); + } else { + ASSERT(list_empty(&object->pending_ops)); + } + } + spin_unlock(&object->lock); + + object->cache->ops->lookup_complete(object); + fscache_enqueue_dependents(object); + + fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); + fscache_stat(&fscache_n_object_avail); + + _leave(""); +} + +/* + * drop an object's attachments + */ +static void fscache_drop_object(struct fscache_object *object) +{ + struct fscache_object *parent = object->parent; + struct fscache_cache *cache = object->cache; + + _enter("{OBJ%x,%d}", object->debug_id, object->n_children); + + spin_lock(&cache->object_list_lock); + list_del_init(&object->cache_link); + spin_unlock(&cache->object_list_lock); + + cache->ops->drop_object(object); + + if (parent) { + _debug("release parent OBJ%x {%d}", + parent->debug_id, parent->n_children); + + spin_lock(&parent->lock); + parent->n_children--; + if (parent->n_children == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); + object->parent = NULL; + } + + /* this just shifts the object release to the slow work processor */ + object->cache->ops->put_object(object); + + _leave(""); +} + +/* + * release or recycle an object that the netfs has discarded + */ +static void fscache_release_object(struct fscache_object *object) +{ + _enter(""); + + fscache_drop_object(object); +} + +/* + * withdraw an object from active service + */ +static void fscache_withdraw_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie; + bool detached; + + _enter(""); + + spin_lock(&object->lock); + cookie = object->cookie; + if (cookie) { + /* need to get the cookie lock before the object lock, starting + * from the object pointer */ + atomic_inc(&cookie->usage); + spin_unlock(&object->lock); + + detached = false; + spin_lock(&cookie->lock); + spin_lock(&object->lock); + + if (object->cookie == cookie) { + hlist_del_init(&object->cookie_link); + object->cookie = NULL; + detached = true; + } + spin_unlock(&cookie->lock); + fscache_cookie_put(cookie); + if (detached) + fscache_cookie_put(cookie); + } + + spin_unlock(&object->lock); + + fscache_drop_object(object); +} + +/* + * withdraw an object from active service at the behest of the cache + * - need break the links to a cached object cookie + * - called under two situations: + * (1) recycler decides to reclaim an in-use object + * (2) a cache is unmounted + * - have to take care as the cookie can be being relinquished by the netfs + * simultaneously + * - the object is pinned by the caller holding a refcount on it + */ +void fscache_withdrawing_object(struct fscache_cache *cache, + struct fscache_object *object) +{ + bool enqueue = false; + + _enter(",OBJ%x", object->debug_id); + + spin_lock(&object->lock); + if (object->state < FSCACHE_OBJECT_WITHDRAWING) { + object->state = FSCACHE_OBJECT_WITHDRAWING; + enqueue = true; + } + spin_unlock(&object->lock); + + if (enqueue) + fscache_enqueue_object(object); + + _leave(""); +} + +/* + * allow the slow work item processor to get a ref on an object + */ +static int fscache_object_slow_work_get_ref(struct slow_work *work) +{ + struct fscache_object *object = + container_of(work, struct fscache_object, work); + + return object->cache->ops->grab_object(object) ? 0 : -EAGAIN; +} + +/* + * allow the slow work item processor to discard a ref on a work item + */ +static void fscache_object_slow_work_put_ref(struct slow_work *work) +{ + struct fscache_object *object = + container_of(work, struct fscache_object, work); + + return object->cache->ops->put_object(object); +} + +/* + * enqueue an object for metadata-type processing + */ +void fscache_enqueue_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + slow_work_enqueue(&object->work); +} + +/* + * enqueue the dependents of an object for metadata-type processing + * - the caller must hold the object's lock + * - this may cause an already locked object to wind up being processed again + */ +static void fscache_enqueue_dependents(struct fscache_object *object) +{ + struct fscache_object *dep; + + _enter("{OBJ%x}", object->debug_id); + + if (list_empty(&object->dependents)) + return; + + spin_lock(&object->lock); + + while (!list_empty(&object->dependents)) { + dep = list_entry(object->dependents.next, + struct fscache_object, dep_link); + list_del_init(&dep->dep_link); + + + /* sort onto appropriate lists */ + fscache_enqueue_object(dep); + dep->cache->ops->put_object(dep); + + if (!list_empty(&object->dependents)) + cond_resched_lock(&object->lock); + } + + spin_unlock(&object->lock); +} + +/* + * remove an object from whatever queue it's waiting on + * - the caller must hold object->lock + */ +void fscache_dequeue_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + if (!list_empty(&object->dep_link)) { + spin_lock(&object->parent->lock); + list_del_init(&object->dep_link); + spin_unlock(&object->parent->lock); + } + + _leave(""); +} + +/** + * fscache_check_aux - Ask the netfs whether an object on disk is still valid + * @object: The object to ask about + * @data: The auxiliary data for the object + * @datalen: The size of the auxiliary data + * + * This function consults the netfs about the coherency state of an object + */ +enum fscache_checkaux fscache_check_aux(struct fscache_object *object, + const void *data, uint16_t datalen) +{ + enum fscache_checkaux result; + + if (!object->cookie->def->check_aux) { + fscache_stat(&fscache_n_checkaux_none); + return FSCACHE_CHECKAUX_OKAY; + } + + result = object->cookie->def->check_aux(object->cookie->netfs_data, + data, datalen); + switch (result) { + /* entry okay as is */ + case FSCACHE_CHECKAUX_OKAY: + fscache_stat(&fscache_n_checkaux_okay); + break; + + /* entry requires update */ + case FSCACHE_CHECKAUX_NEEDS_UPDATE: + fscache_stat(&fscache_n_checkaux_update); + break; + + /* entry requires deletion */ + case FSCACHE_CHECKAUX_OBSOLETE: + fscache_stat(&fscache_n_checkaux_obsolete); + break; + + default: + BUG(); + } + + return result; +} +EXPORT_SYMBOL(fscache_check_aux); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c new file mode 100644 index 00000000000..e7f8d53b8b6 --- /dev/null +++ b/fs/fscache/operation.c @@ -0,0 +1,459 @@ +/* FS-Cache worker operation management routines + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/operations.txt + */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include <linux/module.h> +#include "internal.h" + +atomic_t fscache_op_debug_id; +EXPORT_SYMBOL(fscache_op_debug_id); + +/** + * fscache_enqueue_operation - Enqueue an operation for processing + * @op: The operation to enqueue + * + * Enqueue an operation for processing by the FS-Cache thread pool. + * + * This will get its own ref on the object. + */ +void fscache_enqueue_operation(struct fscache_operation *op) +{ + _enter("{OBJ%x OP%x,%u}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERT(op->processor != NULL); + ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); + ASSERTCMP(atomic_read(&op->usage), >, 0); + + if (list_empty(&op->pend_link)) { + switch (op->flags & FSCACHE_OP_TYPE) { + case FSCACHE_OP_FAST: + _debug("queue fast"); + atomic_inc(&op->usage); + if (!schedule_work(&op->fast_work)) + fscache_put_operation(op); + break; + case FSCACHE_OP_SLOW: + _debug("queue slow"); + slow_work_enqueue(&op->slow_work); + break; + case FSCACHE_OP_MYTHREAD: + _debug("queue for caller's attention"); + break; + default: + printk(KERN_ERR "FS-Cache: Unexpected op type %lx", + op->flags); + BUG(); + break; + } + fscache_stat(&fscache_n_op_enqueue); + } +} +EXPORT_SYMBOL(fscache_enqueue_operation); + +/* + * start an op running + */ +static void fscache_run_op(struct fscache_object *object, + struct fscache_operation *op) +{ + object->n_in_progress++; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + if (op->processor) + fscache_enqueue_operation(op); + fscache_stat(&fscache_n_op_run); +} + +/* + * submit an exclusive operation for an object + * - other ops are excluded from running simultaneously with this one + * - this gets any extra refs it needs on an op + */ +int fscache_submit_exclusive_op(struct fscache_object *object, + struct fscache_operation *op) +{ + int ret; + + _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); + + spin_lock(&object->lock); + ASSERTCMP(object->n_ops, >=, object->n_in_progress); + ASSERTCMP(object->n_ops, >=, object->n_exclusive); + + ret = -ENOBUFS; + if (fscache_object_is_active(object)) { + op->object = object; + object->n_ops++; + object->n_exclusive++; /* reads and writes must wait */ + + if (object->n_ops > 0) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + } else if (!list_empty(&object->pending_ops)) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + fscache_start_operations(object); + } else { + ASSERTCMP(object->n_in_progress, ==, 0); + fscache_run_op(object, op); + } + + /* need to issue a new write op after this */ + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + ret = 0; + } else if (object->state == FSCACHE_OBJECT_CREATING) { + op->object = object; + object->n_ops++; + object->n_exclusive++; /* reads and writes must wait */ + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + ret = 0; + } else { + /* not allowed to submit ops in any other state */ + BUG(); + } + + spin_unlock(&object->lock); + return ret; +} + +/* + * report an unexpected submission + */ +static void fscache_report_unexpected_submission(struct fscache_object *object, + struct fscache_operation *op, + unsigned long ostate) +{ + static bool once_only; + struct fscache_operation *p; + unsigned n; + + if (once_only) + return; + once_only = true; + + kdebug("unexpected submission OP%x [OBJ%x %s]", + op->debug_id, object->debug_id, + fscache_object_states[object->state]); + kdebug("objstate=%s [%s]", + fscache_object_states[object->state], + fscache_object_states[ostate]); + kdebug("objflags=%lx", object->flags); + kdebug("objevent=%lx [%lx]", object->events, object->event_mask); + kdebug("ops=%u inp=%u exc=%u", + object->n_ops, object->n_in_progress, object->n_exclusive); + + if (!list_empty(&object->pending_ops)) { + n = 0; + list_for_each_entry(p, &object->pending_ops, pend_link) { + ASSERTCMP(p->object, ==, object); + kdebug("%p %p", op->processor, op->release); + n++; + } + + kdebug("n=%u", n); + } + + dump_stack(); +} + +/* + * submit an operation for an object + * - objects may be submitted only in the following states: + * - during object creation (write ops may be submitted) + * - whilst the object is active + * - after an I/O error incurred in one of the two above states (op rejected) + * - this gets any extra refs it needs on an op + */ +int fscache_submit_op(struct fscache_object *object, + struct fscache_operation *op) +{ + unsigned long ostate; + int ret; + + _enter("{OBJ%x OP%x},{%u}", + object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERTCMP(atomic_read(&op->usage), >, 0); + + spin_lock(&object->lock); + ASSERTCMP(object->n_ops, >=, object->n_in_progress); + ASSERTCMP(object->n_ops, >=, object->n_exclusive); + + ostate = object->state; + smp_rmb(); + + if (fscache_object_is_active(object)) { + op->object = object; + object->n_ops++; + + if (object->n_exclusive > 0) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + } else if (!list_empty(&object->pending_ops)) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + fscache_start_operations(object); + } else { + ASSERTCMP(object->n_exclusive, ==, 0); + fscache_run_op(object, op); + } + ret = 0; + } else if (object->state == FSCACHE_OBJECT_CREATING) { + op->object = object; + object->n_ops++; + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + ret = 0; + } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + fscache_report_unexpected_submission(object, op, ostate); + ASSERT(!fscache_object_is_active(object)); + ret = -ENOBUFS; + } else { + ret = -ENOBUFS; + } + + spin_unlock(&object->lock); + return ret; +} + +/* + * queue an object for withdrawal on error, aborting all following asynchronous + * operations + */ +void fscache_abort_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); +} + +/* + * jump start the operation processing on an object + * - caller must hold object->lock + */ +void fscache_start_operations(struct fscache_object *object) +{ + struct fscache_operation *op; + bool stop = false; + + while (!list_empty(&object->pending_ops) && !stop) { + op = list_entry(object->pending_ops.next, + struct fscache_operation, pend_link); + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + if (object->n_in_progress > 0) + break; + stop = true; + } + list_del_init(&op->pend_link); + object->n_in_progress++; + + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + if (op->processor) + fscache_enqueue_operation(op); + + /* the pending queue was holding a ref on the object */ + fscache_put_operation(op); + } + + ASSERTCMP(object->n_in_progress, <=, object->n_ops); + + _debug("woke %d ops on OBJ%x", + object->n_in_progress, object->debug_id); +} + +/* + * release an operation + * - queues pending ops if this is the last in-progress op + */ +void fscache_put_operation(struct fscache_operation *op) +{ + struct fscache_object *object; + struct fscache_cache *cache; + + _enter("{OBJ%x OP%x,%d}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERTCMP(atomic_read(&op->usage), >, 0); + + if (!atomic_dec_and_test(&op->usage)) + return; + + _debug("PUT OP"); + if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) + BUG(); + + fscache_stat(&fscache_n_op_release); + + if (op->release) { + op->release(op); + op->release = NULL; + } + + object = op->object; + + /* now... we may get called with the object spinlock held, so we + * complete the cleanup here only if we can immediately acquire the + * lock, and defer it otherwise */ + if (!spin_trylock(&object->lock)) { + _debug("defer put"); + fscache_stat(&fscache_n_op_deferred_release); + + cache = object->cache; + spin_lock(&cache->op_gc_list_lock); + list_add_tail(&op->pend_link, &cache->op_gc_list); + spin_unlock(&cache->op_gc_list_lock); + schedule_work(&cache->op_gc); + _leave(" [defer]"); + return; + } + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + ASSERTCMP(object->n_exclusive, >, 0); + object->n_exclusive--; + } + + ASSERTCMP(object->n_in_progress, >, 0); + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); + + spin_unlock(&object->lock); + + kfree(op); + _leave(" [done]"); +} +EXPORT_SYMBOL(fscache_put_operation); + +/* + * garbage collect operations that have had their release deferred + */ +void fscache_operation_gc(struct work_struct *work) +{ + struct fscache_operation *op; + struct fscache_object *object; + struct fscache_cache *cache = + container_of(work, struct fscache_cache, op_gc); + int count = 0; + + _enter(""); + + do { + spin_lock(&cache->op_gc_list_lock); + if (list_empty(&cache->op_gc_list)) { + spin_unlock(&cache->op_gc_list_lock); + break; + } + + op = list_entry(cache->op_gc_list.next, + struct fscache_operation, pend_link); + list_del(&op->pend_link); + spin_unlock(&cache->op_gc_list_lock); + + object = op->object; + + _debug("GC DEFERRED REL OBJ%x OP%x", + object->debug_id, op->debug_id); + fscache_stat(&fscache_n_op_gc); + + ASSERTCMP(atomic_read(&op->usage), ==, 0); + + spin_lock(&object->lock); + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + ASSERTCMP(object->n_exclusive, >, 0); + object->n_exclusive--; + } + + ASSERTCMP(object->n_in_progress, >, 0); + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); + + spin_unlock(&object->lock); + + } while (count++ < 20); + + if (!list_empty(&cache->op_gc_list)) + schedule_work(&cache->op_gc); + + _leave(""); +} + +/* + * allow the slow work item processor to get a ref on an operation + */ +static int fscache_op_get_ref(struct slow_work *work) +{ + struct fscache_operation *op = + container_of(work, struct fscache_operation, slow_work); + + atomic_inc(&op->usage); + return 0; +} + +/* + * allow the slow work item processor to discard a ref on an operation + */ +static void fscache_op_put_ref(struct slow_work *work) +{ + struct fscache_operation *op = + container_of(work, struct fscache_operation, slow_work); + + fscache_put_operation(op); +} + +/* + * execute an operation using the slow thread pool to provide processing context + * - the caller holds a ref to this object, so we don't need to hold one + */ +static void fscache_op_execute(struct slow_work *work) +{ + struct fscache_operation *op = + container_of(work, struct fscache_operation, slow_work); + unsigned long start; + + _enter("{OBJ%x OP%x,%d}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERT(op->processor != NULL); + start = jiffies; + op->processor(op); + fscache_hist(fscache_ops_histogram, start); + + _leave(""); +} + +const struct slow_work_ops fscache_op_slow_work_ops = { + .get_ref = fscache_op_get_ref, + .put_ref = fscache_op_put_ref, + .execute = fscache_op_execute, +}; diff --git a/fs/fscache/page.c b/fs/fscache/page.c new file mode 100644 index 00000000000..2568e0eb644 --- /dev/null +++ b/fs/fscache/page.c @@ -0,0 +1,816 @@ +/* Cache page management and data I/O routines + * + * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL PAGE +#include <linux/module.h> +#include <linux/fscache-cache.h> +#include <linux/buffer_head.h> +#include <linux/pagevec.h> +#include "internal.h" + +/* + * check to see if a page is being written to the cache + */ +bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page) +{ + void *val; + + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + rcu_read_unlock(); + + return val != NULL; +} +EXPORT_SYMBOL(__fscache_check_page_write); + +/* + * wait for a page to finish being written to the cache + */ +void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page) +{ + wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + + wait_event(*wq, !__fscache_check_page_write(cookie, page)); +} +EXPORT_SYMBOL(__fscache_wait_on_page_write); + +/* + * note that a page has finished being written to the cache + */ +static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page) +{ + struct page *xpage; + + spin_lock(&cookie->lock); + xpage = radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->lock); + ASSERT(xpage != NULL); + + wake_up_bit(&cookie->flags, 0); +} + +/* + * actually apply the changed attributes to a cache object + */ +static void fscache_attr_changed_op(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + + _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); + + fscache_stat(&fscache_n_attr_changed_calls); + + if (fscache_object_is_active(object) && + object->cache->ops->attr_changed(object) < 0) + fscache_abort_object(object); + + _leave(""); +} + +/* + * notification that the attributes on an object have changed + */ +int __fscache_attr_changed(struct fscache_cookie *cookie) +{ + struct fscache_operation *op; + struct fscache_object *object; + + _enter("%p", cookie); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + + fscache_stat(&fscache_n_attr_changed); + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { + fscache_stat(&fscache_n_attr_changed_nomem); + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + fscache_operation_init(op, NULL); + fscache_operation_init_slow(op, fscache_attr_changed_op); + op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE); + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_exclusive_op(object, op) < 0) + goto nobufs; + spin_unlock(&cookie->lock); + fscache_stat(&fscache_n_attr_changed_ok); + fscache_put_operation(op); + _leave(" = 0"); + return 0; + +nobufs: + spin_unlock(&cookie->lock); + kfree(op); + fscache_stat(&fscache_n_attr_changed_nobufs); + _leave(" = %d", -ENOBUFS); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_attr_changed); + +/* + * handle secondary execution given to a retrieval op on behalf of the + * cache + */ +static void fscache_retrieval_work(struct work_struct *work) +{ + struct fscache_retrieval *op = + container_of(work, struct fscache_retrieval, op.fast_work); + unsigned long start; + + _enter("{OP%x}", op->op.debug_id); + + start = jiffies; + op->op.processor(&op->op); + fscache_hist(fscache_ops_histogram, start); + fscache_put_operation(&op->op); +} + +/* + * release a retrieval op reference + */ +static void fscache_release_retrieval_op(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + _enter("{OP%x}", op->op.debug_id); + + fscache_hist(fscache_retrieval_histogram, op->start_time); + if (op->context) + fscache_put_context(op->op.object->cookie, op->context); + + _leave(""); +} + +/* + * allocate a retrieval op + */ +static struct fscache_retrieval *fscache_alloc_retrieval( + struct address_space *mapping, + fscache_rw_complete_t end_io_func, + void *context) +{ + struct fscache_retrieval *op; + + /* allocate a retrieval operation and attempt to submit it */ + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) { + fscache_stat(&fscache_n_retrievals_nomem); + return NULL; + } + + fscache_operation_init(&op->op, fscache_release_retrieval_op); + op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); + op->mapping = mapping; + op->end_io_func = end_io_func; + op->context = context; + op->start_time = jiffies; + INIT_WORK(&op->op.fast_work, fscache_retrieval_work); + INIT_LIST_HEAD(&op->to_do); + return op; +} + +/* + * wait for a deferred lookup to complete + */ +static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) +{ + unsigned long jif; + + _enter(""); + + if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) { + _leave(" = 0 [imm]"); + return 0; + } + + fscache_stat(&fscache_n_retrievals_wait); + + jif = jiffies; + if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) != 0) { + fscache_stat(&fscache_n_retrievals_intr); + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; + } + + ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)); + + smp_rmb(); + fscache_hist(fscache_retrieval_delay_histogram, jif); + _leave(" = 0 [dly]"); + return 0; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * -ENODATA - no data available in the backing object for this block + * 0 - dispatched a read - it'll call end_io_func() when finished + */ +int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, end_io_func, context); + if (!op) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_retrieval_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + ret = object->cache->ops->allocate_page(op, page, gfp); + if (ret == 0) + ret = -ENODATA; + } else { + ret = object->cache->ops->read_or_alloc_page(op, page, gfp); + } + + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_page); + +/* + * read a list of page from the cache or allocate a block in which to store + * them + * - we return: + * -ENOMEM - out of memory, some pages may be being read + * -ERESTARTSYS - interrupted, some pages may be being read + * -ENOBUFS - no backing object or space available in which to cache any + * pages not being read + * -ENODATA - no data available in the backing object for some or all of + * the pages + * 0 - dispatched a read on all pages + * + * end_io_func() will be called for each page read from the cache as it is + * finishes being read + * + * any pages for which a read is dispatched will be removed from pages and + * nr_pages + */ +int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + fscache_pages_retrieval_func_t func; + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,,%d,,,", cookie, *nr_pages); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(*nr_pages, >, 0); + ASSERT(!list_empty(pages)); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(mapping, end_io_func, context); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_retrieval_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) + func = object->cache->ops->allocate_pages; + else + func = object->cache->ops->read_or_alloc_pages; + ret = func(op, pages, nr_pages, gfp); + + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_pages); + +/* + * allocate a block in the cache on which to store a page + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * 0 - block allocated + */ +int __fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_allocs); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, NULL, NULL); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_alloc_ops); + + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_alloc_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + ret = object->cache->ops->allocate_page(op, page, gfp); + + if (ret < 0) + fscache_stat(&fscache_n_allocs_nobufs); + else + fscache_stat(&fscache_n_allocs_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_allocs_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_alloc_page); + +/* + * release a write op reference + */ +static void fscache_release_write_op(struct fscache_operation *_op) +{ + _enter("{OP%x}", _op->debug_id); +} + +/* + * perform the background storage of a page into the cache + */ +static void fscache_write_op(struct fscache_operation *_op) +{ + struct fscache_storage *op = + container_of(_op, struct fscache_storage, op); + struct fscache_object *object = op->op.object; + struct fscache_cookie *cookie = object->cookie; + struct page *page; + unsigned n; + void *results[1]; + int ret; + + _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); + + spin_lock(&cookie->lock); + spin_lock(&object->lock); + + if (!fscache_object_is_active(object)) { + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + _leave(""); + return; + } + + fscache_stat(&fscache_n_store_calls); + + /* find a page to store */ + page = NULL; + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1, + FSCACHE_COOKIE_PENDING_TAG); + if (n != 1) + goto superseded; + page = results[0]; + _debug("gang %d [%lx]", n, page->index); + if (page->index > op->store_limit) + goto superseded; + + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + + if (page) { + ret = object->cache->ops->write_page(op, page); + fscache_end_page_write(cookie, page); + page_cache_release(page); + if (ret < 0) + fscache_abort_object(object); + else + fscache_enqueue_operation(&op->op); + } + + _leave(""); + return; + +superseded: + /* this writer is going away and there aren't any more things to + * write */ + _debug("cease"); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + _leave(""); +} + +/* + * request a page be stored in the cache + * - returns: + * -ENOMEM - out of memory, nothing done + * -ENOBUFS - no backing object available in which to cache the page + * 0 - dispatched a write - it'll call end_io_func() when finished + * + * if the cookie still has a backing object at this point, that object can be + * in one of a few states with respect to storage processing: + * + * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is + * set) + * + * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred + * fill op) + * + * (b) writes deferred till post-creation (mark page for writing and + * return immediately) + * + * (2) negative lookup, object created, initial fill being made from netfs + * (FSCACHE_COOKIE_INITIAL_FILL is set) + * + * (a) fill point not yet reached this page (mark page for writing and + * return) + * + * (b) fill point passed this page (queue op to store this page) + * + * (3) object extant (queue op to store this page) + * + * any other state is invalid + */ +int __fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_storage *op; + struct fscache_object *object; + int ret; + + _enter("%p,%x,", cookie, (u32) page->flags); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERT(PageFsCache(page)); + + fscache_stat(&fscache_n_stores); + + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) + goto nomem; + + fscache_operation_init(&op->op, fscache_release_write_op); + fscache_operation_init_slow(&op->op, fscache_write_op); + op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); + + ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + if (ret < 0) + goto nomem_free; + + ret = -ENOBUFS; + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) + goto nobufs; + + /* add the page to the pending-storage radix tree on the backing + * object */ + spin_lock(&object->lock); + + _debug("store limit %llx", (unsigned long long) object->store_limit); + + ret = radix_tree_insert(&cookie->stores, page->index, page); + if (ret < 0) { + if (ret == -EEXIST) + goto already_queued; + _debug("insert failed %d", ret); + goto nobufs_unlock_obj; + } + + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + page_cache_get(page); + + /* we only want one writer at a time, but we do need to queue new + * writers after exclusive ops */ + if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) + goto already_pending; + + spin_unlock(&object->lock); + + op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); + op->store_limit = object->store_limit; + + if (fscache_submit_op(object, &op->op) < 0) + goto submit_failed; + + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + fscache_stat(&fscache_n_store_ops); + fscache_stat(&fscache_n_stores_ok); + + /* the slow work queue now carries its own ref on the object */ + fscache_put_operation(&op->op); + _leave(" = 0"); + return 0; + +already_queued: + fscache_stat(&fscache_n_stores_again); +already_pending: + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_ok); + _leave(" = 0"); + return 0; + +submit_failed: + radix_tree_delete(&cookie->stores, page->index); + page_cache_release(page); + ret = -ENOBUFS; + goto nobufs; + +nobufs_unlock_obj: + spin_unlock(&object->lock); +nobufs: + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; + +nomem_free: + kfree(op); +nomem: + fscache_stat(&fscache_n_stores_oom); + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(__fscache_write_page); + +/* + * remove a page from the cache + */ +void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) +{ + struct fscache_object *object; + + _enter(",%p", page); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + fscache_stat(&fscache_n_uncaches); + + /* cache withdrawal may beat us to it */ + if (!PageFsCache(page)) + goto done; + + /* get the object */ + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) { + ClearPageFsCache(page); + goto done_unlock; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + /* there might now be stuff on disk we could read */ + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + /* only invoke the cache backend if we managed to mark the page + * uncached here; this deals with synchronisation vs withdrawal */ + if (TestClearPageFsCache(page) && + object->cache->ops->uncache_page) { + /* the cache backend releases the cookie lock */ + object->cache->ops->uncache_page(object, page); + goto done; + } + +done_unlock: + spin_unlock(&cookie->lock); +done: + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_page); + +/** + * fscache_mark_pages_cached - Mark pages as being cached + * @op: The retrieval op pages are being marked for + * @pagevec: The pages to be marked + * + * Mark a bunch of netfs pages as being cached. After this is called, + * the netfs must call fscache_uncache_page() to remove the mark. + */ +void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec) +{ + struct fscache_cookie *cookie = op->op.object->cookie; + unsigned long loop; + +#ifdef CONFIG_FSCACHE_STATS + atomic_add(pagevec->nr, &fscache_n_marks); +#endif + + for (loop = 0; loop < pagevec->nr; loop++) { + struct page *page = pagevec->pages[loop]; + + _debug("- mark %p{%lx}", page, page->index); + if (TestSetPageFsCache(page)) { + static bool once_only; + if (!once_only) { + once_only = true; + printk(KERN_WARNING "FS-Cache:" + " Cookie type %s marked page %lx" + " multiple times\n", + cookie->def->name, page->index); + } + } + } + + if (cookie->def->mark_pages_cached) + cookie->def->mark_pages_cached(cookie->netfs_data, + op->mapping, pagevec); + pagevec_reinit(pagevec); +} +EXPORT_SYMBOL(fscache_mark_pages_cached); diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c new file mode 100644 index 00000000000..beeab44bc31 --- /dev/null +++ b/fs/fscache/proc.c @@ -0,0 +1,68 @@ +/* FS-Cache statistics viewing interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "internal.h" + +/* + * initialise the /proc/fs/fscache/ directory + */ +int __init fscache_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/fscache", NULL)) + goto error_dir; + +#ifdef CONFIG_FSCACHE_STATS + if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL, + &fscache_stats_fops)) + goto error_stats; +#endif + +#ifdef CONFIG_FSCACHE_HISTOGRAM + if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL, + &fscache_histogram_fops)) + goto error_histogram; +#endif + + _leave(" = 0"); + return 0; + +#ifdef CONFIG_FSCACHE_HISTOGRAM +error_histogram: +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +error_stats: +#endif + remove_proc_entry("fs/fscache", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/fscache/ directory + */ +void fscache_proc_cleanup(void) +{ +#ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +#endif + remove_proc_entry("fs/fscache", NULL); +} diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c new file mode 100644 index 00000000000..65deb99e756 --- /dev/null +++ b/fs/fscache/stats.c @@ -0,0 +1,212 @@ +/* FS-Cache statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "internal.h" + +/* + * operation counters + */ +atomic_t fscache_n_op_pend; +atomic_t fscache_n_op_run; +atomic_t fscache_n_op_enqueue; +atomic_t fscache_n_op_requeue; +atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_release; +atomic_t fscache_n_op_gc; + +atomic_t fscache_n_attr_changed; +atomic_t fscache_n_attr_changed_ok; +atomic_t fscache_n_attr_changed_nobufs; +atomic_t fscache_n_attr_changed_nomem; +atomic_t fscache_n_attr_changed_calls; + +atomic_t fscache_n_allocs; +atomic_t fscache_n_allocs_ok; +atomic_t fscache_n_allocs_wait; +atomic_t fscache_n_allocs_nobufs; +atomic_t fscache_n_alloc_ops; +atomic_t fscache_n_alloc_op_waits; + +atomic_t fscache_n_retrievals; +atomic_t fscache_n_retrievals_ok; +atomic_t fscache_n_retrievals_wait; +atomic_t fscache_n_retrievals_nodata; +atomic_t fscache_n_retrievals_nobufs; +atomic_t fscache_n_retrievals_intr; +atomic_t fscache_n_retrievals_nomem; +atomic_t fscache_n_retrieval_ops; +atomic_t fscache_n_retrieval_op_waits; + +atomic_t fscache_n_stores; +atomic_t fscache_n_stores_ok; +atomic_t fscache_n_stores_again; +atomic_t fscache_n_stores_nobufs; +atomic_t fscache_n_stores_oom; +atomic_t fscache_n_store_ops; +atomic_t fscache_n_store_calls; + +atomic_t fscache_n_marks; +atomic_t fscache_n_uncaches; + +atomic_t fscache_n_acquires; +atomic_t fscache_n_acquires_null; +atomic_t fscache_n_acquires_no_cache; +atomic_t fscache_n_acquires_ok; +atomic_t fscache_n_acquires_nobufs; +atomic_t fscache_n_acquires_oom; + +atomic_t fscache_n_updates; +atomic_t fscache_n_updates_null; +atomic_t fscache_n_updates_run; + +atomic_t fscache_n_relinquishes; +atomic_t fscache_n_relinquishes_null; +atomic_t fscache_n_relinquishes_waitcrt; + +atomic_t fscache_n_cookie_index; +atomic_t fscache_n_cookie_data; +atomic_t fscache_n_cookie_special; + +atomic_t fscache_n_object_alloc; +atomic_t fscache_n_object_no_alloc; +atomic_t fscache_n_object_lookups; +atomic_t fscache_n_object_lookups_negative; +atomic_t fscache_n_object_lookups_positive; +atomic_t fscache_n_object_created; +atomic_t fscache_n_object_avail; +atomic_t fscache_n_object_dead; + +atomic_t fscache_n_checkaux_none; +atomic_t fscache_n_checkaux_okay; +atomic_t fscache_n_checkaux_update; +atomic_t fscache_n_checkaux_obsolete; + +/* + * display the general statistics + */ +static int fscache_stats_show(struct seq_file *m, void *v) +{ + seq_puts(m, "FS-Cache statistics\n"); + + seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n", + atomic_read(&fscache_n_cookie_index), + atomic_read(&fscache_n_cookie_data), + atomic_read(&fscache_n_cookie_special)); + + seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n", + atomic_read(&fscache_n_object_alloc), + atomic_read(&fscache_n_object_no_alloc), + atomic_read(&fscache_n_object_avail), + atomic_read(&fscache_n_object_dead)); + seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n", + atomic_read(&fscache_n_checkaux_none), + atomic_read(&fscache_n_checkaux_okay), + atomic_read(&fscache_n_checkaux_update), + atomic_read(&fscache_n_checkaux_obsolete)); + + seq_printf(m, "Pages : mrk=%u unc=%u\n", + atomic_read(&fscache_n_marks), + atomic_read(&fscache_n_uncaches)); + + seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u" + " oom=%u\n", + atomic_read(&fscache_n_acquires), + atomic_read(&fscache_n_acquires_null), + atomic_read(&fscache_n_acquires_no_cache), + atomic_read(&fscache_n_acquires_ok), + atomic_read(&fscache_n_acquires_nobufs), + atomic_read(&fscache_n_acquires_oom)); + + seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n", + atomic_read(&fscache_n_object_lookups), + atomic_read(&fscache_n_object_lookups_negative), + atomic_read(&fscache_n_object_lookups_positive), + atomic_read(&fscache_n_object_created)); + + seq_printf(m, "Updates: n=%u nul=%u run=%u\n", + atomic_read(&fscache_n_updates), + atomic_read(&fscache_n_updates_null), + atomic_read(&fscache_n_updates_run)); + + seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n", + atomic_read(&fscache_n_relinquishes), + atomic_read(&fscache_n_relinquishes_null), + atomic_read(&fscache_n_relinquishes_waitcrt)); + + seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", + atomic_read(&fscache_n_attr_changed), + atomic_read(&fscache_n_attr_changed_ok), + atomic_read(&fscache_n_attr_changed_nobufs), + atomic_read(&fscache_n_attr_changed_nomem), + atomic_read(&fscache_n_attr_changed_calls)); + + seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n", + atomic_read(&fscache_n_allocs), + atomic_read(&fscache_n_allocs_ok), + atomic_read(&fscache_n_allocs_wait), + atomic_read(&fscache_n_allocs_nobufs)); + seq_printf(m, "Allocs : ops=%u owt=%u\n", + atomic_read(&fscache_n_alloc_ops), + atomic_read(&fscache_n_alloc_op_waits)); + + seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" + " int=%u oom=%u\n", + atomic_read(&fscache_n_retrievals), + atomic_read(&fscache_n_retrievals_ok), + atomic_read(&fscache_n_retrievals_wait), + atomic_read(&fscache_n_retrievals_nodata), + atomic_read(&fscache_n_retrievals_nobufs), + atomic_read(&fscache_n_retrievals_intr), + atomic_read(&fscache_n_retrievals_nomem)); + seq_printf(m, "Retrvls: ops=%u owt=%u\n", + atomic_read(&fscache_n_retrieval_ops), + atomic_read(&fscache_n_retrieval_op_waits)); + + seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", + atomic_read(&fscache_n_stores), + atomic_read(&fscache_n_stores_ok), + atomic_read(&fscache_n_stores_again), + atomic_read(&fscache_n_stores_nobufs), + atomic_read(&fscache_n_stores_oom)); + seq_printf(m, "Stores : ops=%u run=%u\n", + atomic_read(&fscache_n_store_ops), + atomic_read(&fscache_n_store_calls)); + + seq_printf(m, "Ops : pend=%u run=%u enq=%u\n", + atomic_read(&fscache_n_op_pend), + atomic_read(&fscache_n_op_run), + atomic_read(&fscache_n_op_enqueue)); + seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_deferred_release), + atomic_read(&fscache_n_op_release), + atomic_read(&fscache_n_op_gc)); + return 0; +} + +/* + * open "/proc/fs/fscache/stats" allowing provision of a statistical summary + */ +static int fscache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, fscache_stats_show, NULL); +} + +const struct file_operations fscache_stats_fops = { + .owner = THIS_MODULE, + .open = fscache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 821d10f719b..4e340fedf76 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1234,8 +1234,9 @@ static void fuse_vma_close(struct vm_area_struct *vma) * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER */ -static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; /* * Don't use page->mapping as it may become NULL from a * concurrent truncate. diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 995d63b2e74..e0b53aa7bbe 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, mode_t mode = inode->i_mode; int error; - inode->i_mode = mode & ~current->fs->umask; + inode->i_mode = mode & ~current_umask(); if (!S_ISLNK(inode->i_mode)) acl = ops->getacl(dir, ACL_TYPE_DEFAULT); if (acl) { diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 43764f4fa76..fa881bdc3d8 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) if (error) return error; if (!acl) { - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (mode != ip->i_inode.i_mode) error = munge_mode(ip, mode); return error; diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 3b9e8de3500..70b9b854894 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page) * blocks allocated on disk to back that page. */ -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -412,6 +413,8 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/hfs/super.c b/fs/hfs/super.c index c8b5acf4b0b..a36bb749926 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -82,6 +82,7 @@ static void hfs_put_super(struct super_block *sb) static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = HFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -90,6 +91,8 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = HFS_SB(sb)->fs_ablocks; buf->f_ffree = HFS_SB(sb)->free_ablocks; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = HFS_NAMELEN; return 0; diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index bab7f8d1bdf..3fcbb0e1f6f 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) opts->creator = HFSPLUS_DEF_CR_TYPE; opts->type = HFSPLUS_DEF_CR_TYPE; - opts->umask = current->fs->umask; + opts->umask = current_umask(); opts->uid = current_uid(); opts->gid = current_gid(); opts->part = -1; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index eb74531a0a8..f2a64020f42 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -223,6 +223,7 @@ static void hfsplus_put_super(struct super_block *sb) static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = HFSPLUS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -231,6 +232,8 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = 0xFFFFFFFF; buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = HFSPLUS_MAX_STRLEN; return 0; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 0d049b8919c..fecf402d7b8 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -136,6 +136,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); lock_kernel(); /*if (sbi->sb_n_free == -1) {*/ @@ -149,6 +150,8 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = sbi->sb_n_free; buf->f_files = sbi->sb_dirband_size / 4; buf->f_ffree = sbi->sb_n_free_dnodes; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = 254; unlock_kernel(); @@ -477,7 +480,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) uid = current_uid(); gid = current_gid(); - umask = current->fs->umask; + umask = current_umask(); lowercase = 0; conv = CONV_BINARY; eas = 2; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index b278f7f5202..a5089a6dd67 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -280,7 +280,12 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, "errno = %d\n", err); return err; } - count = hppfs_read_file(hppfs->host_fd, buf, count); + err = hppfs_read_file(hppfs->host_fd, buf, count); + if (err < 0) { + printk(KERN_ERR "hppfs_read: read failed: %d\n", err); + return err; + } + count = err; if (count > 0) *ppos += count; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9b800d97a68..23a3c76711e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -943,14 +943,13 @@ static struct vfsmount *hugetlbfs_vfsmount; static int can_do_hugetlb_shm(void) { - return likely(capable(CAP_IPC_LOCK) || - in_group_p(sysctl_hugetlb_shm_group) || - can_do_mlock()); + return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) { int error = -ENOMEM; + int unlock_shm = 0; struct file *file; struct inode *inode; struct dentry *dentry, *root; @@ -960,11 +959,14 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); - if (!can_do_hugetlb_shm()) - return ERR_PTR(-EPERM); - - if (!user_shm_lock(size, user)) - return ERR_PTR(-ENOMEM); + if (!can_do_hugetlb_shm()) { + if (user_shm_lock(size, user)) { + unlock_shm = 1; + WARN_ONCE(1, + "Using mlock ulimits for SHM_HUGETLB deprecated\n"); + } else + return ERR_PTR(-EPERM); + } root = hugetlbfs_vfsmount->mnt_root; quick_string.name = name; @@ -1004,7 +1006,8 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: - user_shm_unlock(size, user); + if (unlock_shm) + user_shm_unlock(size, user); return ERR_PTR(error); } diff --git a/fs/internal.h b/fs/internal.h index 53af885f173..b4dac4fb6b6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -11,6 +11,7 @@ struct super_block; struct linux_binprm; +struct path; /* * block_dev.c @@ -43,7 +44,7 @@ extern void __init chrdev_init(void); /* * exec.c */ -extern void check_unsafe_exec(struct linux_binprm *); +extern int check_unsafe_exec(struct linux_binprm *); /* * namespace.c @@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern void __init mnt_init(void); + +/* + * fs_struct.c + */ +extern void chroot_fs_refs(struct path *, struct path *); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 13d2eddd069..b4cbe9603c7 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -923,6 +923,7 @@ out_freesbi: static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = ISOFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -932,6 +933,8 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_files = ISOFS_SB(sb)->s_ninodes; buf->f_ffree = 0; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = NAME_MAX; return 0; } diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index e79c07812af..737f7246a4b 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -637,6 +637,8 @@ struct journal_head *journal_get_descriptor_buffer(journal_t *journal) return NULL; bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); set_buffer_uptodate(bh); @@ -733,9 +735,7 @@ journal_t * journal_init_dev(struct block_device *bdev, if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); - kfree(journal); - journal = NULL; - goto out; + goto out_err; } journal->j_dev = bdev; journal->j_fs_dev = fs_dev; @@ -743,11 +743,19 @@ journal_t * journal_init_dev(struct block_device *bdev, journal->j_maxlen = len; bh = __getblk(journal->j_dev, start, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; -out: + return journal; +out_err: + kfree(journal); + return NULL; } /** @@ -787,8 +795,7 @@ journal_t * journal_init_inode (struct inode *inode) if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); - kfree(journal); - return NULL; + goto out_err; } err = journal_bmap(journal, 0, &blocknr); @@ -796,16 +803,23 @@ journal_t * journal_init_inode (struct inode *inode) if (err) { printk(KERN_ERR "%s: Cannnot locate journal superblock\n", __func__); - kfree(journal); - return NULL; + goto out_err; } bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; +out_err: + kfree(journal); + return NULL; } /* diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 62804e57a44..4ea72377c7a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + if (commit_transaction->t_synchronous_commit) + write_op = WRITE_SYNC; stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, @@ -680,7 +683,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE, bh); + submit_bh(write_op, bh); } cond_resched(); stats.u.run.rs_blocks_logged += bufs; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 257ff262576..bbe6d592d8b 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -55,6 +55,25 @@ * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment noone else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. */ #ifndef __KERNEL__ @@ -401,8 +420,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. - * - * The caller must have the journal locked. */ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { @@ -480,10 +497,7 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. - * - * Called with the journal lock held. */ - void jbd2_journal_write_revoke_records(journal_t *journal, transaction_t *transaction) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 28ce21d8598..996ffda06bf 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle) } } + if (handle->h_sync) + transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index d98713777a1..77ccf8cb082 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) return PTR_ERR(acl); if (!acl) { - *i_mode &= ~current->fs->umask; + *i_mode &= ~current_umask(); } else { if (S_ISDIR(*i_mode)) jffs2_iset_acl(inode, &f->i_acl_default, acl); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a166c1669e8..06ca1b8d205 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) cleanup: posix_acl_release(acl); } else - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | inode->i_mode; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index aedc47a264c..1f3b0fc0d35 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -139,55 +139,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) return 0; } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap, - struct in6_addr *addr_mapped) -{ - const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; - - switch (sap->sa_family) { - case AF_INET6: - return &((const struct sockaddr_in6 *)sap)->sin6_addr; - case AF_INET: - ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped); - return addr_mapped; - } - - return NULL; -} - -/* - * If lockd is using a PF_INET6 listener, all incoming requests appear - * to come from AF_INET6 remotes. The address of AF_INET remotes are - * mapped to AF_INET6 automatically by the network layer. In case the - * user passed an AF_INET server address at mount time, ensure both - * addresses are AF_INET6 before comparing them. - */ -static int nlmclnt_cmp_addr(const struct nlm_host *host, - const struct sockaddr *sap) -{ - const struct in6_addr *addr1; - const struct in6_addr *addr2; - struct in6_addr addr1_mapped; - struct in6_addr addr2_mapped; - - addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped); - if (likely(addr1 != NULL)) { - addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped); - if (likely(addr2 != NULL)) - return ipv6_addr_equal(addr1, addr2); - } - - return 0; -} -#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ -static int nlmclnt_cmp_addr(const struct nlm_host *host, - const struct sockaddr *sap) -{ - return nlm_cmp_addr(nlm_addr(host), sap); -} -#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ - /* * The server lockd has called us back to tell us the lock was granted */ @@ -215,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlmclnt_cmp_addr(block->b_host, addr)) + if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 5e2c4d5ac82..6d5d4a4169e 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -16,6 +16,8 @@ #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> +#include <asm/unaligned.h> + #define NLMDBG_FACILITY NLMDBG_MONITOR #define NSM_PROGRAM 100024 #define NSM_VERSION 1 @@ -274,10 +276,12 @@ static void nsm_init_private(struct nsm_handle *nsm) { u64 *p = (u64 *)&nsm->sm_priv.data; struct timespec ts; + s64 ns; ktime_get_ts(&ts); - *p++ = timespec_to_ns(&ts); - *p = (unsigned long)nsm; + ns = timespec_to_ns(&ts); + put_unaligned(ns, p); + put_unaligned((unsigned long)nsm, p + 1); } static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 64f1c31b585..abf83881f68 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -53,17 +53,6 @@ static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; /* - * If the kernel has IPv6 support available, always listen for - * both AF_INET and AF_INET6 requests. - */ -#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \ - defined(CONFIG_SUNRPC_REGISTER_V4) -static const sa_family_t nlmsvc_family = AF_INET6; -#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ -static const sa_family_t nlmsvc_family = AF_INET; -#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ - -/* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 */ @@ -204,19 +193,30 @@ lockd(void *vrqstp) return 0; } -static int create_lockd_listener(struct svc_serv *serv, char *name, - unsigned short port) +static int create_lockd_listener(struct svc_serv *serv, const char *name, + const int family, const unsigned short port) { struct svc_xprt *xprt; - xprt = svc_find_xprt(serv, name, 0, 0); + xprt = svc_find_xprt(serv, name, family, 0); if (xprt == NULL) - return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); - + return svc_create_xprt(serv, name, family, port, + SVC_SOCK_DEFAULTS); svc_xprt_put(xprt); return 0; } +static int create_lockd_family(struct svc_serv *serv, const int family) +{ + int err; + + err = create_lockd_listener(serv, "udp", family, nlm_udpport); + if (err < 0) + return err; + + return create_lockd_listener(serv, "tcp", family, nlm_tcpport); +} + /* * Ensure there are active UDP and TCP listeners for lockd. * @@ -232,13 +232,15 @@ static int make_socks(struct svc_serv *serv) static int warned; int err; - err = create_lockd_listener(serv, "udp", nlm_udpport); + err = create_lockd_family(serv, PF_INET); if (err < 0) goto out_err; - err = create_lockd_listener(serv, "tcp", nlm_tcpport); - if (err < 0) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + err = create_lockd_family(serv, PF_INET6); + if (err < 0 && err != -EAFNOSUPPORT) goto out_err; +#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */ warned = 0; return 0; @@ -274,7 +276,7 @@ int lockd_up(void) "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 618865b3128..daad3c2740d 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -321,15 +321,20 @@ out: static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct minix_sb_info *sbi = minix_sb(dentry->d_sb); - buf->f_type = dentry->d_sb->s_magic; - buf->f_bsize = dentry->d_sb->s_blocksize; + struct super_block *sb = dentry->d_sb; + struct minix_sb_info *sbi = minix_sb(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; buf->f_bfree = minix_count_free_blocks(sbi); buf->f_bavail = buf->f_bfree; buf->f_files = sbi->s_ninodes; buf->f_ffree = minix_count_free_inodes(sbi); buf->f_namelen = sbi->s_namelen; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index 16c3ef37eae..680ba60863f 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) bio_put(bio); } -struct bio *mpage_bio_submit(int rw, struct bio *bio) +static struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io_read; if (rw == WRITE) @@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio) submit_bio(rw, bio); return NULL; } -EXPORT_SYMBOL(mpage_bio_submit); static struct bio * mpage_alloc(struct block_device *bdev, @@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage); * just allocate full-size (16-page) BIOs. */ -int __mpage_writepage(struct page *page, struct writeback_control *wbc, +struct mpage_data { + struct bio *bio; + sector_t last_block_in_bio; + get_block_t *get_block; + unsigned use_writepage; +}; + +static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { struct mpage_data *mpd = data; @@ -648,7 +654,6 @@ out: mpd->bio = bio; return ret; } -EXPORT_SYMBOL(__mpage_writepage); /** * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them diff --git a/fs/namei.c b/fs/namei.c index d040ce11785..b8433ebfae0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -32,6 +32,7 @@ #include <linux/file.h> #include <linux/fcntl.h> #include <linux/device_cgroup.h> +#include <linux/fs_struct.h> #include <asm/uaccess.h> #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) @@ -1578,7 +1579,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path, struct dentry *dir = nd->path.dentry; if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = security_path_mknod(&nd->path, path->dentry, mode, 0); if (error) goto out_unlock; @@ -1989,7 +1990,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, goto out_unlock; } if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = may_mknod(mode); if (error) goto out_dput; @@ -2067,7 +2068,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) goto out_unlock; if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; @@ -2897,10 +2898,3 @@ EXPORT_SYMBOL(vfs_symlink); EXPORT_SYMBOL(vfs_unlink); EXPORT_SYMBOL(dentry_unhash); EXPORT_SYMBOL(generic_readlink); - -/* to be mentioned only in INIT_TASK */ -struct fs_struct init_fs = { - .count = ATOMIC_INIT(1), - .lock = __RW_LOCK_UNLOCKED(init_fs.lock), - .umask = 0022, -}; diff --git a/fs/namespace.c b/fs/namespace.c index 0a42e0e9602..c6f54e4c429 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,7 @@ #include <linux/ramfs.h> #include <linux/log2.h> #include <linux/idr.h> +#include <linux/fs_struct.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include "pnode.h" @@ -2093,66 +2094,6 @@ out1: } /* - * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. - */ -void set_fs_root(struct fs_struct *fs, struct path *path) -{ - struct path old_root; - - write_lock(&fs->lock); - old_root = fs->root; - fs->root = *path; - path_get(path); - write_unlock(&fs->lock); - if (old_root.dentry) - path_put(&old_root); -} - -/* - * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. - */ -void set_fs_pwd(struct fs_struct *fs, struct path *path) -{ - struct path old_pwd; - - write_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - write_unlock(&fs->lock); - - if (old_pwd.dentry) - path_put(&old_pwd); -} - -static void chroot_fs_refs(struct path *old_root, struct path *new_root) -{ - struct task_struct *g, *p; - struct fs_struct *fs; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - fs = p->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(p); - if (fs->root.dentry == old_root->dentry - && fs->root.mnt == old_root->mnt) - set_fs_root(fs, new_root); - if (fs->pwd.dentry == old_root->dentry - && fs->pwd.mnt == old_root->mnt) - set_fs_pwd(fs, new_root); - put_fs_struct(fs); - } else - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); -} - -/* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, * makes new_root as the new root file system of the current process, and sets diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 36fe20d6eba..e67f3ec0773 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -84,3 +84,11 @@ config ROOT_NFS <file:Documentation/filesystems/nfsroot.txt>. Most people say N here. + +config NFS_FSCACHE + bool "Provide NFS client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y + help + Say Y here if you want NFS data to be cached locally on disc through + the general filesystem cache manager diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index ac6170c594a..845159814de 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -15,3 +15,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o nfs-$(CONFIG_SYSCTL) += sysctl.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 3e634f2a108..a886e692ddd 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -38,19 +38,10 @@ static struct svc_program nfs4_callback_program; unsigned int nfs_callback_set_tcpport; unsigned short nfs_callback_tcpport; +unsigned short nfs_callback_tcpport6; static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; -/* - * If the kernel has IPv6 support available, always listen for - * both AF_INET and AF_INET6 requests. - */ -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static const sa_family_t nfs_callback_family = AF_INET6; -#else -static const sa_family_t nfs_callback_family = AF_INET; -#endif - static int param_set_port(const char *val, struct kernel_param *kp) { char *endp; @@ -116,19 +107,29 @@ int nfs_callback_up(void) mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, - nfs_callback_family, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); ret = -ENOMEM; if (!serv) goto out_err; - ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, - SVC_SOCK_ANONYMOUS); + ret = svc_create_xprt(serv, "tcp", PF_INET, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_err; nfs_callback_tcpport = ret; dprintk("NFS: Callback listener port = %u (af %u)\n", - nfs_callback_tcpport, nfs_callback_family); + nfs_callback_tcpport, PF_INET); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + ret = svc_create_xprt(serv, "tcp", PF_INET6, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret > 0) { + nfs_callback_tcpport6 = ret; + dprintk("NFS: Callback listener port = %u (af %u)\n", + nfs_callback_tcpport6, PF_INET6); + } else if (ret != -EAFNOSUPPORT) + goto out_err; +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); if (IS_ERR(nfs_callback_info.rqst)) { diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index bb25d2135ff..e110e286a26 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -72,5 +72,6 @@ extern void nfs_callback_down(void); extern unsigned int nfs_callback_set_tcpport; extern unsigned short nfs_callback_tcpport; +extern unsigned short nfs_callback_tcpport6; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 2277421656e..75c9cd2aa11 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -45,6 +45,7 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ if (!IS_ERR(cred)) clp->cl_machine_cred = cred; + nfs_fscache_get_client_cookie(clp); + return clp; error_3: @@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp) nfs4_shutdown_client(clp); + nfs_fscache_release_client_cookie(clp); + /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); @@ -224,38 +229,6 @@ void nfs_put_client(struct nfs_client *clp) } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped) -{ - switch (sa->sa_family) { - default: - return NULL; - case AF_INET6: - return &((const struct sockaddr_in6 *)sa)->sin6_addr; - break; - case AF_INET: - ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr, - addr_mapped); - return addr_mapped; - } -} - -static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct in6_addr *addr1; - const struct in6_addr *addr2; - struct in6_addr addr1_mapped; - struct in6_addr addr2_mapped; - - addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped); - if (likely(addr1 != NULL)) { - addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped); - if (likely(addr2 != NULL)) - return ipv6_addr_equal(addr1, addr2); - } - return 0; -} - /* * Test if two ip6 socket addresses refer to the same socket by * comparing relevant fields. The padding bytes specifically, are not @@ -267,38 +240,21 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, * * The caller should ensure both socket addresses are AF_INET6. */ -static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, - const struct sockaddr *sa2) +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) { - const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1; - const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2; + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - if (!ipv6_addr_equal(&saddr1->sin6_addr, - &saddr1->sin6_addr)) - return 0; - if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - saddr1->sin6_scope_id != saddr2->sin6_scope_id) + if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + sin1->sin6_scope_id != sin2->sin6_scope_id) return 0; - return saddr1->sin6_port == saddr2->sin6_port; -} -#else -static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, - const struct sockaddr_in *sa2) -{ - return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr; -} -static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET)) - return 0; - return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, - (const struct sockaddr_in *)sa2); + return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr); } - -static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1, - const struct sockaddr * sa2) +#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) { return 0; } @@ -311,20 +267,57 @@ static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1, * * The caller should ensure both socket addresses are AF_INET. */ +static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; + + return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; +} + +static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; + + return nfs_sockaddr_match_ipaddr6(sa1, sa2) && + (sin1->sin6_port == sin2->sin6_port); +} + static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, const struct sockaddr *sa2) { - const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1; - const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2; + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) + return nfs_sockaddr_match_ipaddr4(sa1, sa2) && + (sin1->sin_port == sin2->sin_port); +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, excluding the port number. + */ +static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) return 0; - return saddr1->sin_port == saddr2->sin_port; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_match_ipaddr4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_match_ipaddr6(sa1, sa2); + } + return 0; } /* * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields. + * by comparing (only) relevant fields, including the port number. */ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2) @@ -772,6 +765,7 @@ static int nfs_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; + server->options = data->options; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -1160,6 +1154,7 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; server->caps |= NFS_CAP_ATOMIC_OPEN; + server->options = data->options; /* Get a client record */ error = nfs4_set_client(server, @@ -1571,7 +1566,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) /* display header on line 1 */ if (v == &nfs_volume_list) { - seq_puts(m, "NV SERVER PORT DEV FSID\n"); + seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); return 0; } /* display one transport per line on subsequent lines */ @@ -1585,12 +1580,13 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - seq_printf(m, "v%u %s %s %-7s %-17s\n", + seq_printf(m, "v%u %s %s %-7s %-17s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), dev, - fsid); + fsid, + nfs_server_fscache_state(server)); return 0; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 78bf72fc1db..370b190a09d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else if (atomic_read(&new_dentry->d_count) > 1) /* dentry still busy? */ goto out; - } else - nfs_drop_nlink(new_inode); + } go_ahead: /* @@ -1638,10 +1637,8 @@ go_ahead: } nfs_inode_return_delegation(old_inode); - if (new_inode != NULL) { + if (new_inode != NULL) nfs_inode_return_delegation(new_inode); - d_delete(new_dentry); - } error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); @@ -1650,6 +1647,8 @@ out: if (rehash) d_rehash(rehash); if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); d_move(old_dentry, new_dentry); nfs_set_verifier(new_dentry, nfs_save_change_attribute(new_dir)); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 90f292b520d..3523b895eb4 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -35,6 +35,7 @@ #include "delegation.h" #include "internal.h" #include "iostat.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_FILE @@ -64,11 +65,7 @@ const struct file_operations nfs_file_operations = { .write = do_sync_write, .aio_read = nfs_file_read, .aio_write = nfs_file_write, -#ifdef CONFIG_MMU .mmap = nfs_file_mmap, -#else - .mmap = generic_file_mmap, -#endif .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, @@ -141,9 +138,6 @@ nfs_file_release(struct inode *inode, struct file *filp) dentry->d_parent->d_name.name, dentry->d_name.name); - /* Ensure that dirty pages are flushed out with the right creds */ - if (filp->f_mode & FMODE_WRITE) - nfs_wb_all(dentry->d_inode); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); } @@ -235,7 +229,6 @@ nfs_file_flush(struct file *file, fl_owner_t id) struct nfs_open_context *ctx = nfs_file_open_context(file); struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; - int status; dprintk("NFS: flush(%s/%s)\n", dentry->d_parent->d_name.name, @@ -245,11 +238,8 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; nfs_inc_stats(inode, NFSIOS_VFSFLUSH); - /* Ensure that data+attribute caches are up to date after close() */ - status = nfs_do_fsync(ctx, inode); - if (!status) - nfs_revalidate_inode(NFS_SERVER(inode), inode); - return status; + /* Flush writes to the server and return any errors */ + return nfs_do_fsync(ctx, inode); } static ssize_t @@ -304,11 +294,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) dprintk("NFS: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); - status = nfs_revalidate_mapping(inode, file->f_mapping); + /* Note: generic_file_mmap() returns ENOSYS on nommu systems + * so we call that before revalidating the mapping + */ + status = generic_file_mmap(file, vma); if (!status) { vma->vm_ops = &nfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; - file_accessed(file); + status = nfs_revalidate_mapping(inode, file->f_mapping); } return status; } @@ -354,6 +346,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); + /* + * Prevent starvation issues if someone is doing a consistency + * sync-to-disk + */ + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + return ret; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; @@ -409,6 +410,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, return copied; } +/* + * Partially or wholly invalidate a page + * - Release the private state associated with a page if undergoing complete + * page invalidation + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + */ static void nfs_invalidate_page(struct page *page, unsigned long offset) { dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); @@ -417,23 +425,43 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) return; /* Cancel any unstarted writes on this page */ nfs_wb_page_cancel(page->mapping->host, page); + + nfs_fscache_invalidate_page(page, page->mapping->host); } +/* + * Attempt to release the private state associated with a page + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + * - Return true (may release page) or false (may not) + */ static int nfs_release_page(struct page *page, gfp_t gfp) { dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* If PagePrivate() is set, then the page is not freeable */ - return 0; + if (PagePrivate(page)) + return 0; + return nfs_fscache_release_page(page, gfp); } +/* + * Attempt to clear the private state associated with a page when an error + * occurs that requires the cached contents of an inode to be written back or + * destroyed + * - Called if either PG_private or fscache is set on the page + * - Caller holds page lock + * - Return 0 if successful, -error otherwise + */ static int nfs_launder_page(struct page *page) { struct inode *inode = page->mapping->host; + struct nfs_inode *nfsi = NFS_I(inode); dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", inode->i_ino, (long long)page_offset(page)); + nfs_fscache_wait_on_page_write(nfsi, page); return nfs_wb_page(inode, page); } @@ -451,8 +479,14 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, }; -static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +/* + * Notification that a PTE pointing to an NFS page is about to be made + * writable, implying that someone is about to modify the page through a + * shared-writable mapping + */ +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct file *filp = vma->vm_file; struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; @@ -464,6 +498,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) filp->f_mapping->host->i_ino, (long long)page_offset(page)); + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + lock_page(page); mapping = page->mapping; if (mapping != dentry->d_inode->i_mapping) @@ -483,6 +520,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ret = pagelen; out_unlock: unlock_page(page); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c new file mode 100644 index 00000000000..5b1006480bc --- /dev/null +++ b/fs/nfs/fscache-index.c @@ -0,0 +1,337 @@ +/* NFS FS-Cache index structure definition + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/in6.h> + +#include "internal.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +/* + * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks + * the cookie for the top-level index object for NFS into here. The top-level + * index can than have other cache objects inserted into it. + */ +struct fscache_netfs nfs_fscache_netfs = { + .name = "nfs", + .version = 0, +}; + +/* + * Register NFS for caching + */ +int nfs_fscache_register(void) +{ + return fscache_register_netfs(&nfs_fscache_netfs); +} + +/* + * Unregister NFS for caching + */ +void nfs_fscache_unregister(void) +{ + fscache_unregister_netfs(&nfs_fscache_netfs); +} + +/* + * Layout of the key for an NFS server cache object. + */ +struct nfs_server_key { + uint16_t nfsversion; /* NFS protocol version */ + uint16_t family; /* address family */ + uint16_t port; /* IP port */ + union { + struct in_addr ipv4_addr; /* IPv4 address */ + struct in6_addr ipv6_addr; /* IPv6 address */ + } addr[0]; +}; + +/* + * Generate a key to describe a server in the main NFS index + * - We return the length of the key, or 0 if we can't generate one + */ +static uint16_t nfs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_client *clp = cookie_netfs_data; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; + const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; + struct nfs_server_key *key = buffer; + uint16_t len = sizeof(struct nfs_server_key); + + key->nfsversion = clp->rpc_ops->version; + key->family = clp->cl_addr.ss_family; + + memset(key, 0, len); + + switch (clp->cl_addr.ss_family) { + case AF_INET: + key->port = sin->sin_port; + key->addr[0].ipv4_addr = sin->sin_addr; + len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->port = sin6->sin6_port; + key->addr[0].ipv6_addr = sin6->sin6_addr; + len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + printk(KERN_WARNING "NFS: Unknown network family '%d'\n", + clp->cl_addr.ss_family); + len = 0; + break; + } + + return len; +} + +/* + * Define the server object for FS-Cache. This is used to describe a server + * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and + * server address parameters. + */ +const struct fscache_cookie_def nfs_fscache_server_index_def = { + .name = "NFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_server_get_key, +}; + +/* + * Generate a key to describe a superblock key in the main NFS index + */ +static uint16_t nfs_super_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_fscache_key *key; + const struct nfs_server *nfss = cookie_netfs_data; + uint16_t len; + + key = nfss->fscache_key; + len = sizeof(key->key) + key->key.uniq_len; + if (len > bufmax) { + len = 0; + } else { + memcpy(buffer, &key->key, sizeof(key->key)); + memcpy(buffer + sizeof(key->key), + key->key.uniquifier, key->key.uniq_len); + } + + return len; +} + +/* + * Define the superblock object for FS-Cache. This is used to describe a + * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS + * parameters that might cause a separate superblock. + */ +const struct fscache_cookie_def nfs_fscache_super_index_def = { + .name = "NFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_super_get_key, +}; + +/* + * Definition of the auxiliary data attached to NFS inode storage objects + * within the cache. + * + * The contents of this struct are recorded in the on-disk local cache in the + * auxiliary data attached to the data storage object backing an inode. This + * permits coherency to be managed when a new inode binds to an already extant + * cache object. + */ +struct nfs_fscache_inode_auxdata { + struct timespec mtime; + struct timespec ctime; + loff_t size; + u64 change_attr; +}; + +/* + * Generate a key to describe an NFS inode in an NFS server's index + */ +static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + uint16_t nsize; + + /* use the inode's NFS filehandle as the key */ + nsize = nfsi->fh.size; + memcpy(buffer, nfsi->fh.data, nsize); + return nsize; +} + +/* + * Get certain file attributes from the netfs data + * - This function can be absent for an index + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + + *size = nfsi->vfs_inode.i_size; +} + +/* + * Get the auxiliary data from netfs data + * - This function can be absent if the index carries no state data + * - Should store the auxiliary data in the buffer + * - Should return the amount of amount stored + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct nfs_fscache_inode_auxdata auxdata; + const struct nfs_inode *nfsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->change_attr; + + if (bufmax > sizeof(auxdata)) + bufmax = sizeof(auxdata); + + memcpy(buffer, &auxdata, bufmax); + return bufmax; +} + +/* + * Consult the netfs about the state of an object + * - This function can be absent if the index carries no state data + * - The netfs data from the cookie being used as the target is + * presented, as is the auxiliary data + */ +static +enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_inode *nfsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->change_attr; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Indication from FS-Cache that the cookie is no longer cached + * - This function is called when the backing store currently caching a cookie + * is removed + * - The netfs should use this to clean up any markers indicating cached pages + * - This is mandatory for any object that may have data + */ +static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct nfs_inode *nfsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); + + for (;;) { + /* grab a bunch of pages to unmark */ + nr_pages = pagevec_lookup(&pvec, + nfsi->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +/* + * Get an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + * - The read context is passed back to NFS in the event that a data read on the + * cache fails with EIO - in which case the server must be contacted to + * retrieve the data, which requires the read context for security. + */ +static void nfs_fh_get_context(void *cookie_netfs_data, void *context) +{ + get_nfs_open_context(context); +} + +/* + * Release an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + */ +static void nfs_fh_put_context(void *cookie_netfs_data, void *context) +{ + if (context) + put_nfs_open_context(context); +} + +/* + * Define the inode object for FS-Cache. This is used to describe an inode + * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for + * an inode. + * + * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime + * held in the cache auxiliary data for the data storage object with those in + * the inode struct in memory. + */ +const struct fscache_cookie_def nfs_fscache_inode_object_def = { + .name = "NFS.fh", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = nfs_fscache_inode_get_key, + .get_attr = nfs_fscache_inode_get_attr, + .get_aux = nfs_fscache_inode_get_aux, + .check_aux = nfs_fscache_inode_check_aux, + .now_uncached = nfs_fscache_inode_now_uncached, + .get_context = nfs_fh_get_context, + .put_context = nfs_fh_put_context, +}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c new file mode 100644 index 00000000000..379be678cb7 --- /dev/null +++ b/fs/nfs/fscache.c @@ -0,0 +1,523 @@ +/* NFS filesystem cache interface + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/in6.h> +#include <linux/seq_file.h> + +#include "internal.h" +#include "iostat.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +static struct rb_root nfs_fscache_keys = RB_ROOT; +static DEFINE_SPINLOCK(nfs_fscache_keys_lock); + +/* + * Get the per-client index cookie for an NFS client if the appropriate mount + * flag was set + * - We always try and get an index cookie for the client, but get filehandle + * cookies on a per-superblock basis, depending on the mount flags + */ +void nfs_fscache_get_client_cookie(struct nfs_client *clp) +{ + /* create a cache index for looking up filehandles */ + clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, + &nfs_fscache_server_index_def, + clp); + dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", + clp, clp->fscache); +} + +/* + * Dispose of a per-client cookie + */ +void nfs_fscache_release_client_cookie(struct nfs_client *clp) +{ + dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", + clp, clp->fscache); + + fscache_relinquish_cookie(clp->fscache, 0); + clp->fscache = NULL; +} + +/* + * Get the cache cookie for an NFS superblock. We have to handle + * uniquification here because the cache doesn't do it for us. + */ +void nfs_fscache_get_super_cookie(struct super_block *sb, + struct nfs_parsed_mount_data *data) +{ + struct nfs_fscache_key *key, *xkey; + struct nfs_server *nfss = NFS_SB(sb); + struct rb_node **p, *parent; + const char *uniq = data->fscache_uniq ?: ""; + int diff, ulen; + + ulen = strlen(uniq); + key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + if (!key) + return; + + key->nfs_client = nfss->nfs_client; + key->key.super.s_flags = sb->s_flags & NFS_MS_MASK; + key->key.nfs_server.flags = nfss->flags; + key->key.nfs_server.rsize = nfss->rsize; + key->key.nfs_server.wsize = nfss->wsize; + key->key.nfs_server.acregmin = nfss->acregmin; + key->key.nfs_server.acregmax = nfss->acregmax; + key->key.nfs_server.acdirmin = nfss->acdirmin; + key->key.nfs_server.acdirmax = nfss->acdirmax; + key->key.nfs_server.fsid = nfss->fsid; + key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; + + key->key.uniq_len = ulen; + memcpy(key->key.uniquifier, uniq, ulen); + + spin_lock(&nfs_fscache_keys_lock); + p = &nfs_fscache_keys.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xkey = rb_entry(parent, struct nfs_fscache_key, node); + + if (key->nfs_client < xkey->nfs_client) + goto go_left; + if (key->nfs_client > xkey->nfs_client) + goto go_right; + + diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + + if (key->key.uniq_len == 0) + goto non_unique; + diff = memcmp(key->key.uniquifier, + xkey->key.uniquifier, + key->key.uniq_len); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + goto non_unique; + + go_left: + p = &(*p)->rb_left; + continue; + go_right: + p = &(*p)->rb_right; + } + + rb_link_node(&key->node, parent, p); + rb_insert_color(&key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + nfss->fscache_key = key; + + /* create a cache index for looking up filehandles */ + nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, + &nfs_fscache_super_index_def, + nfss); + dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + return; + +non_unique: + spin_unlock(&nfs_fscache_keys_lock); + kfree(key); + nfss->fscache_key = NULL; + nfss->fscache = NULL; + printk(KERN_WARNING "NFS:" + " Cache request denied due to non-unique superblock keys\n"); +} + +/* + * release a per-superblock cookie + */ +void nfs_fscache_release_super_cookie(struct super_block *sb) +{ + struct nfs_server *nfss = NFS_SB(sb); + + dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + + fscache_relinquish_cookie(nfss->fscache, 0); + nfss->fscache = NULL; + + if (nfss->fscache_key) { + spin_lock(&nfs_fscache_keys_lock); + rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + kfree(nfss->fscache_key); + nfss->fscache_key = NULL; + } +} + +/* + * Initialise the per-inode cache cookie pointer for an NFS inode. + */ +void nfs_fscache_init_inode_cookie(struct inode *inode) +{ + NFS_I(inode)->fscache = NULL; + if (S_ISREG(inode->i_mode)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); +} + +/* + * Get the per-inode cache cookie for an NFS inode. + */ +static void nfs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfsi->fscache || !NFS_FSCACHE(inode)) + return; + + if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { + nfsi->fscache = fscache_acquire_cookie( + NFS_SB(sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", + sb, nfsi, nfsi->fscache); + } +} + +/* + * Release a per-inode cookie. + */ +void nfs_fscache_release_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 0); + nfsi->fscache = NULL; +} + +/* + * Retire a per-inode cookie, destroying the data attached to it. + */ +void nfs_fscache_zap_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 1); + nfsi->fscache = NULL; +} + +/* + * Turn off the cache with regard to a per-inode cookie if opened for writing, + * invalidating all the pages in the page cache relating to the associated + * inode to clear the per-page caching. + */ +static void nfs_fscache_disable_inode_cookie(struct inode *inode) +{ + clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + + if (NFS_I(inode)->fscache) { + dfprintk(FSCACHE, + "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); + + /* Need to invalidate any mapped pages that were read in before + * turning off the cache. + */ + if (inode->i_mapping && inode->i_mapping->nrpages) + invalidate_inode_pages2(inode->i_mapping); + + nfs_fscache_zap_inode_cookie(inode); + } +} + +/* + * wait_on_bit() sleep function for uninterruptible waiting + */ +static int nfs_fscache_wait_bit(void *flags) +{ + schedule(); + return 0; +} + +/* + * Lock against someone else trying to also acquire or relinquish a cookie + */ +static inline void nfs_fscache_inode_lock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) + wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, + nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); +} + +/* + * Unlock cookie management lock + */ +static inline void nfs_fscache_inode_unlock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + smp_mb__before_clear_bit(); + clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); +} + +/* + * Decide if we should enable or disable local caching for this inode. + * - For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * - May be invoked multiple times in parallel by parallel nfs_open() functions. + */ +void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if (NFS_FSCACHE(inode)) { + nfs_fscache_inode_lock(inode); + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + nfs_fscache_disable_inode_cookie(inode); + else + nfs_fscache_enable_inode_cookie(inode); + nfs_fscache_inode_unlock(inode); + } +} + +/* + * Replace a per-inode cookie due to revalidation detecting a file having + * changed on the server. + */ +void nfs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *nfss = NFS_SERVER(inode); + struct fscache_cookie *old = nfsi->fscache; + + nfs_fscache_inode_lock(inode); + if (nfsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(nfsi->fscache, 1); + + nfsi->fscache = fscache_acquire_cookie( + nfss->nfs_client->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, + "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", + nfss, nfsi, old, nfsi->fscache); + } + nfs_fscache_inode_unlock(inode); +} + +/* + * Release the caching state associated with a page, if the page isn't busy + * interacting with the cache. + * - Returns true (can release page) or false (page busy). + */ +int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + struct nfs_inode *nfsi = NFS_I(page->mapping->host); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); + + if (fscache_check_page_write(cookie, page)) { + if (!(gfp & __GFP_WAIT)) + return 0; + fscache_wait_on_page_write(cookie, page); + } + + if (PageFsCache(page)) { + dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", + cookie, page, nfsi); + + fscache_uncache_page(cookie, page); + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + } + + return 1; +} + +/* + * Release the caching state associated with a page if undergoing complete page + * invalidation. + */ +void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); + + dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", + cookie, page, nfsi); + + fscache_wait_on_page_write(cookie, page); + + BUG_ON(!PageLocked(page)); + fscache_uncache_page(cookie, page); + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); +} + +/* + * Handle completion of a page being read from the cache. + * - Called in process (keventd) context. + */ +static void nfs_readpage_from_fscache_complete(struct page *page, + void *context, + int error) +{ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", + page, context, error); + + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) { + SetPageUptodate(page); + unlock_page(page); + } else { + error = nfs_readpage_async(context, page->mapping->host, page); + if (error) + unlock_page(page); + } +} + +/* + * Retrieve a page from fscache + */ +int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, struct page *page) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", + NFS_I(inode)->fscache, page, page->index, page->flags, inode); + + ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache, + page, + nfs_readpage_from_fscache_complete, + ctx, + GFP_KERNEL); + + switch (ret) { + case 0: /* read BIO submitted (page in fscache) */ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache: BIO submitted\n"); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1); + return ret; + + case -ENOBUFS: /* inode not in cache */ + case -ENODATA: /* page not in cache */ + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + dfprintk(FSCACHE, + "NFS: readpage_from_fscache %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + } + return ret; +} + +/* + * Retrieve a set of pages from fscache + */ +int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret, npages = *nr_pages; + + dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", + NFS_I(inode)->fscache, npages, inode); + + ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache, + mapping, pages, nr_pages, + nfs_readpage_from_fscache_complete, + ctx, + mapping_gfp_mask(mapping)); + if (*nr_pages < npages) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, + npages); + if (*nr_pages > 0) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, + *nr_pages); + + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + BUG_ON(!list_empty(pages)); + BUG_ON(*nr_pages != 0); + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: submitted\n"); + + return ret; + + case -ENOBUFS: /* some pages aren't cached and can't be */ + case -ENODATA: /* some pages aren't cached */ + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: no page: %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: ret %d\n", ret); + } + + return ret; +} + +/* + * Store a newly fetched page in fscache + * - PG_fscache must be set on the page + */ +void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", + NFS_I(inode)->fscache, page, page->index, page->flags, sync); + + ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL); + dfprintk(FSCACHE, + "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", + page, page->index, page->flags, ret); + + if (ret != 0) { + fscache_uncache_page(NFS_I(inode)->fscache, page); + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + } else { + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1); + } +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h new file mode 100644 index 00000000000..6e809bb0ff0 --- /dev/null +++ b/fs/nfs/fscache.h @@ -0,0 +1,220 @@ +/* NFS filesystem cache interface definitions + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _NFS_FSCACHE_H +#define _NFS_FSCACHE_H + +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/nfs4_mount.h> +#include <linux/fscache.h> + +#ifdef CONFIG_NFS_FSCACHE + +/* + * set of NFS FS-Cache objects that form a superblock key + */ +struct nfs_fscache_key { + struct rb_node node; + struct nfs_client *nfs_client; /* the server */ + + /* the elements of the unique key - as used by nfs_compare_super() and + * nfs_compare_mount_options() to distinguish superblocks */ + struct { + struct { + unsigned long s_flags; /* various flags + * (& NFS_MS_MASK) */ + } super; + + struct { + struct nfs_fsid fsid; + int flags; + unsigned int rsize; /* read size */ + unsigned int wsize; /* write size */ + unsigned int acregmin; /* attr cache timeouts */ + unsigned int acregmax; + unsigned int acdirmin; + unsigned int acdirmax; + } nfs_server; + + struct { + rpc_authflavor_t au_flavor; + } rpc_auth; + + /* uniquifier - can be used if nfs_server.flags includes + * NFS_MOUNT_UNSHARED */ + u8 uniq_len; + char uniquifier[0]; + } key; +}; + +/* + * fscache-index.c + */ +extern struct fscache_netfs nfs_fscache_netfs; +extern const struct fscache_cookie_def nfs_fscache_server_index_def; +extern const struct fscache_cookie_def nfs_fscache_super_index_def; +extern const struct fscache_cookie_def nfs_fscache_inode_object_def; + +extern int nfs_fscache_register(void); +extern void nfs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void nfs_fscache_get_client_cookie(struct nfs_client *); +extern void nfs_fscache_release_client_cookie(struct nfs_client *); + +extern void nfs_fscache_get_super_cookie(struct super_block *, + struct nfs_parsed_mount_data *); +extern void nfs_fscache_release_super_cookie(struct super_block *); + +extern void nfs_fscache_init_inode_cookie(struct inode *); +extern void nfs_fscache_release_inode_cookie(struct inode *); +extern void nfs_fscache_zap_inode_cookie(struct inode *); +extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void nfs_fscache_reset_inode_cookie(struct inode *); + +extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); +extern int nfs_fscache_release_page(struct page *, gfp_t); + +extern int __nfs_readpage_from_fscache(struct nfs_open_context *, + struct inode *, struct page *); +extern int __nfs_readpages_from_fscache(struct nfs_open_context *, + struct inode *, struct address_space *, + struct list_head *, unsigned *); +extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int); + +/* + * wait for a page to complete writing to the cache + */ +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) +{ + if (PageFsCache(page)) + fscache_wait_on_page_write(nfsi->fscache, page); +} + +/* + * release the caching state associated with a page if undergoing complete page + * invalidation + */ +static inline void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __nfs_fscache_invalidate_page(page, inode); +} + +/* + * Retrieve a page from an inode data storage object. + */ +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpage_from_fscache(ctx, inode, page); + return -ENOBUFS; +} + +/* + * Retrieve a set of pages from an inode data storage object. + */ +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpages_from_fscache(ctx, inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +/* + * Store a page newly fetched from the server in an inode data storage object + * in the cache. + */ +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, + int sync) +{ + if (PageFsCache(page)) + __nfs_readpage_to_fscache(inode, page, sync); +} + +/* + * indicate the client caching state as readable text + */ +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + if (server->fscache && (server->options & NFS_OPTION_FSCACHE)) + return "yes"; + return "no "; +} + + +#else /* CONFIG_NFS_FSCACHE */ +static inline int nfs_fscache_register(void) { return 0; } +static inline void nfs_fscache_unregister(void) {} + +static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} +static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} + +static inline void nfs_fscache_get_super_cookie( + struct super_block *sb, + struct nfs_parsed_mount_data *data) +{ +} +static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} + +static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} + +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* True: may release page */ +} +static inline void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) {} + +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + return -ENOBUFS; +} +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, int sync) {} + +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + return "no "; +} + +#endif /* CONFIG_NFS_FSCACHE */ +#endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index b7c9b2df1f2..46177cb8706 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server, return ret; } - if (fattr.type != NFDIR) { + if (!S_ISDIR(fattr.mode)) { printk(KERN_ERR "nfs4_get_root:" " getroot encountered non-directory\n"); return -ENOTDIR; @@ -213,7 +213,7 @@ eat_dot_dir: return ret; } - if (fattr.type != NFDIR) { + if (!S_ISDIR(fattr.mode)) { printk(KERN_ERR "nfs4_get_root:" " lookupfh encountered non-directory\n"); return -ENOTDIR; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0c381686171..64f87194d39 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -46,6 +46,7 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -66,6 +67,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) } /** + * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +int nfs_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + +/** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid * @@ -109,6 +122,7 @@ void nfs_clear_inode(struct inode *inode) BUG_ON(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); + nfs_fscache_release_inode_cookie(inode); } /** @@ -249,13 +263,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) struct inode *inode = ERR_PTR(-ENOENT); unsigned long hash; - if ((fattr->valid & NFS_ATTR_FATTR) == 0) + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) goto out_no_inode; - - if (!fattr->nlink) { - printk("NFS: Buggy server - nlink == 0!\n"); + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; - } hash = nfs_fattr_to_ino_t(fattr); @@ -291,7 +302,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) && fattr->size <= NFS_LIMIT_READDIRPLUS) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ - if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + if ((fattr->valid & NFS_ATTR_FATTR_FSID) + && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else @@ -304,30 +316,49 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else init_special_inode(inode, inode->i_mode, fattr->rdev); + memset(&inode->i_atime, 0, sizeof(inode->i_atime)); + memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); + memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + nfsi->change_attr = 0; + inode->i_size = 0; + inode->i_nlink = 0; + inode->i_uid = -2; + inode->i_gid = -2; + inode->i_blocks = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; - inode->i_atime = fattr->atime; - inode->i_mtime = fattr->mtime; - inode->i_ctime = fattr->ctime; - if (fattr->valid & NFS_ATTR_FATTR_V4) + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + inode->i_atime = fattr->atime; + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode->i_mtime = fattr->mtime; + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = fattr->ctime; + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) nfsi->change_attr = fattr->change_attr; - inode->i_size = nfs_size_to_loff_t(fattr->size); - inode->i_nlink = fattr->nlink; - inode->i_uid = fattr->uid; - inode->i_gid = fattr->gid; - if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_SIZE) + inode->i_size = nfs_size_to_loff_t(fattr->size); + if (fattr->valid & NFS_ATTR_FATTR_NLINK) + inode->i_nlink = fattr->nlink; + if (fattr->valid & NFS_ATTR_FATTR_OWNER) + inode->i_uid = fattr->uid; + if (fattr->valid & NFS_ATTR_FATTR_GROUP) + inode->i_gid = fattr->gid; + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else { - inode->i_blocks = fattr->du.nfs2.blocks; } nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->access_cache = RB_ROOT; + nfs_fscache_init_inode_cookie(inode); + unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); @@ -514,6 +545,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) return err; } +/** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context + * @is_sync: is this a synchronous close + * + * always ensure that the attributes are up to date if we're mounted + * with close-to-open semantics + */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode; + struct nfs_server *server; + + if (!(ctx->mode & FMODE_WRITE)) + return; + if (!is_sync) + return; + inode = ctx->path.dentry->d_inode; + if (!list_empty(&NFS_I(inode)->open_files)) + return; + server = NFS_SERVER(inode); + if (server->flags & NFS_MOUNT_NOCTO) + return; + nfs_revalidate_inode(server, inode); +} + static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) { struct nfs_open_context *ctx; @@ -540,24 +597,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) return ctx; } -static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) +static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { - struct inode *inode; - - if (ctx == NULL) - return; + struct inode *inode = ctx->path.dentry->d_inode; - inode = ctx->path.dentry->d_inode; if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); - if (ctx->state != NULL) { - if (wait) - nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); - else - nfs4_close_state(&ctx->path, ctx->state, ctx->mode); - } + NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) put_rpccred(ctx->cred); path_put(&ctx->path); @@ -642,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp) ctx->mode = filp->f_mode; nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); + nfs_fscache_set_inode_cookie(inode, filp); return 0; } @@ -670,9 +719,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (NFS_STALE(inode)) goto out; - if (NFS_STALE(inode)) - goto out; - nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); if (status != 0) { @@ -745,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); spin_unlock(&inode->i_lock); nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); + nfs_fscache_reset_inode_cookie(inode); dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); return 0; @@ -815,25 +862,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); - if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && - nfsi->change_attr == fattr->pre_change_attr) { + if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) + && (fattr->valid & NFS_ATTR_FATTR_CHANGE) + && nfsi->change_attr == fattr->pre_change_attr) { nfsi->change_attr = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; } /* If we have atomic WCC data, we may update some attributes */ - if ((fattr->valid & NFS_ATTR_WCC) != 0) { - if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) + if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) + && (fattr->valid & NFS_ATTR_FATTR_CTIME) + && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + + if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) + && (fattr->valid & NFS_ATTR_FATTR_MTIME) + && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; - } - if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && - nfsi->npages == 0) - i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } + if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) + && (fattr->valid & NFS_ATTR_FATTR_SIZE) + && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) + && nfsi->npages == 0) + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } /** @@ -853,35 +906,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat /* Has the inode gone and changed behind our back? */ - if (nfsi->fileid != fattr->fileid - || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + return -EIO; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - } - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && nfsi->change_attr != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ - if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->npages == 0) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->npages == 0) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } /* Have any file permissions changed? */ - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) - || inode->i_uid != fattr->uid - || inode->i_gid != fattr->gid) + if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Has the link count changed? */ - if (inode->i_nlink != fattr->nlink) + if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_ATTR; - if (!timespec_equal(&inode->i_atime, &fattr->atime)) + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) @@ -893,11 +950,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { + if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) + return 0; return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; } static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return 0; return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); } @@ -975,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) spin_lock(&inode->i_lock); status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); + return status; } @@ -1033,20 +1095,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { - fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); + fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE + | NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME); goto out_noforce; } - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && - (fattr->valid & NFS_ATTR_WCC_V4) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { fattr->pre_change_attr = NFS_I(inode)->change_attr; - fattr->valid |= NFS_ATTR_WCC_V4; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } - if ((fattr->valid & NFS_ATTR_FATTR) != 0 && - (fattr->valid & NFS_ATTR_WCC) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->valid |= NFS_ATTR_FATTR_PRECTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); + fattr->valid |= NFS_ATTR_FATTR_PREMTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { fattr->pre_size = i_size_read(inode); - fattr->valid |= NFS_ATTR_WCC; + fattr->valid |= NFS_ATTR_FATTR_PRESIZE; } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr); @@ -1078,18 +1151,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) __func__, inode->i_sb->s_id, inode->i_ino, atomic_read(&inode->i_count), fattr->valid); - if (nfsi->fileid != fattr->fileid) + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) goto out_fileid; /* * Make sure the inode's type hasn't changed. */ - if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) goto out_changed; server = NFS_SERVER(inode); /* Update the fsid? */ - if (S_ISDIR(inode->i_mode) && + if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) server->fsid = fattr->fsid; @@ -1099,14 +1172,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); /* More cache consistency checks */ - if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { + if (nfsi->change_attr != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + nfsi->change_attr = fattr->change_attr; + } + } + + if (fattr->valid & NFS_ATTR_FATTR_MTIME) { /* NFSv2/v3: Check if the mtime agrees */ if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { dprintk("NFS: mtime change on server for file %s/%ld\n", @@ -1114,59 +1200,80 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } + } + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { /* If ctime has changed we should definitely clear access+acl caches */ - if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) + if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - } else if (nfsi->change_attr != fattr->change_attr) { - dprintk("NFS: change_attr change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); + /* and probably clear data for a directory too as utimes can cause + * havoc with our cache. + */ + if (S_ISDIR(inode->i_mode)) { + invalid |= NFS_INO_INVALID_DATA; + nfs_force_lookup_revalidate(inode); + } + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + } } /* Check if our cached file size is stale */ - new_isize = nfs_size_to_loff_t(fattr->size); - cur_isize = i_size_read(inode); - if (new_isize != cur_isize) { - /* Do we perhaps have any outstanding writes, or has - * the file grown beyond our last write? */ - if (nfsi->npages == 0 || new_isize > cur_isize) { - i_size_write(inode, new_isize); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes, or has + * the file grown beyond our last write? */ + if (nfsi->npages == 0 || new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + } + dprintk("NFS: isize change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); } - dprintk("NFS: isize change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); } - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - nfsi->change_attr = fattr->change_attr; - - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || - inode->i_uid != fattr->uid || - inode->i_gid != fattr->gid) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - if (inode->i_nlink != fattr->nlink) - invalid |= NFS_INO_INVALID_ATTR; + if (fattr->valid & NFS_ATTR_FATTR_MODE) { + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_mode = fattr->mode; + } + } + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { + if (inode->i_uid != fattr->uid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_uid = fattr->uid; + } + } + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { + if (inode->i_gid != fattr->gid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_gid = fattr->gid; + } + } - inode->i_mode = fattr->mode; - inode->i_nlink = fattr->nlink; - inode->i_uid = fattr->uid; - inode->i_gid = fattr->gid; + if (fattr->valid & NFS_ATTR_FATTR_NLINK) { + if (inode->i_nlink != fattr->nlink) { + invalid |= NFS_INO_INVALID_ATTR; + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + inode->i_nlink = fattr->nlink; + } + } - if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else { - inode->i_blocks = fattr->du.nfs2.blocks; } + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { @@ -1274,7 +1381,6 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); - nfsi->ncommit = 0; nfsi->npages = 0; atomic_set(&nfsi->silly_count, 1); INIT_HLIST_HEAD(&nfsi->silly_list); @@ -1337,6 +1443,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_fscache_register(); + if (err < 0) + goto out7; + err = nfsiod_start(); if (err) goto out6; @@ -1389,6 +1499,8 @@ out4: out5: nfsiod_stop(); out6: + nfs_fscache_unregister(); +out7: return err; } @@ -1399,6 +1511,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_readpagecache(); nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); + nfs_fscache_unregister(); #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 340ede8f608..e4d6a8348ad 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -5,6 +5,8 @@ #include <linux/mount.h> #include <linux/security.h> +#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) + struct nfs_string; /* Maximum number of readahead requests @@ -37,10 +39,12 @@ struct nfs_parsed_mount_data { int acregmin, acregmax, acdirmin, acdirmax; int namlen; + unsigned int options; unsigned int bsize; unsigned int auth_flavor_len; rpc_authflavor_t auth_flavors[1]; char *client_address; + char *fscache_uniq; struct { struct sockaddr_storage address; @@ -152,6 +156,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; #endif +/* proc.c */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + /* dir.c */ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); @@ -165,6 +172,7 @@ extern void nfs_clear_inode(struct inode *); extern void nfs4_clear_inode(struct inode *); #endif void nfs_zap_acl_cache(struct inode *inode); +extern int nfs_wait_bit_killable(void *word); /* super.c */ void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index a3695281003..a2ab2529b5c 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -16,6 +16,9 @@ struct nfs_iostats { unsigned long long bytes[__NFSIOS_BYTESMAX]; +#ifdef CONFIG_NFS_FSCACHE + unsigned long long fscache[__NFSIOS_FSCACHEMAX]; +#endif unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; @@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode, nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } +#ifdef CONFIG_NFS_FSCACHE +static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, + unsigned long addend) +{ + struct nfs_iostats *iostats; + int cpu; + + cpu = get_cpu(); + iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); + iostats->fscache[stat] += addend; + put_cpu_no_resched(); +} +#endif + static inline struct nfs_iostats *nfs_alloc_iostats(void) { return alloc_percpu(struct nfs_iostats); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 28bab67d151..c862c9340f9 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep) static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { - u32 rdev; - fattr->type = (enum nfs_ftype) ntohl(*p++); + u32 rdev, type; + type = ntohl(*p++); fattr->mode = ntohl(*p++); fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); @@ -136,10 +136,9 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time(p, &fattr->atime); p = xdr_decode_time(p, &fattr->mtime); p = xdr_decode_time(p, &fattr->ctime); - fattr->valid |= NFS_ATTR_FATTR; + fattr->valid |= NFS_ATTR_FATTR_V2; fattr->rdev = new_decode_dev(rdev); - if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { - fattr->type = NFFIFO; + if (type == NFCHR && rdev == NFS2_FIFO_DEV) { fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; fattr->rdev = 0; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c55be7a7679..d0cc5ce0edf 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.create.verifier[1] = current->pid; } - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); for (;;) { status = nfs3_do_create(dir, dentry, data); @@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) dprintk("NFS call mkdir %s\n", dentry->d_name.name); - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); data = nfs3_alloc_createdata(); if (data == NULL) @@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, MAJOR(rdev), MINOR(rdev)); - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); data = nfs3_alloc_createdata(); if (data == NULL) @@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .commit_done = nfs3_commit_done, .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, + .close_context = nfs_close_context, }; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 6cdeacffde4..e6a1932c711 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -91,19 +91,15 @@ /* * Map file type to S_IFMT bits */ -static struct { - unsigned int mode; - unsigned int nfs2type; -} nfs_type2fmt[] = { - { 0, NFNON }, - { S_IFREG, NFREG }, - { S_IFDIR, NFDIR }, - { S_IFBLK, NFBLK }, - { S_IFCHR, NFCHR }, - { S_IFLNK, NFLNK }, - { S_IFSOCK, NFSOCK }, - { S_IFIFO, NFFIFO }, - { 0, NFBAD } +static const umode_t nfs_type2fmt[] = { + [NF3BAD] = 0, + [NF3REG] = S_IFREG, + [NF3DIR] = S_IFDIR, + [NF3BLK] = S_IFBLK, + [NF3CHR] = S_IFCHR, + [NF3LNK] = S_IFLNK, + [NF3SOCK] = S_IFSOCK, + [NF3FIFO] = S_IFIFO, }; /* @@ -148,13 +144,12 @@ static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { unsigned int type, major, minor; - int fmode; + umode_t fmode; type = ntohl(*p++); - if (type >= NF3BAD) - type = NF3BAD; - fmode = nfs_type2fmt[type].mode; - fattr->type = nfs_type2fmt[type].nfs2type; + if (type > NF3FIFO) + type = NF3NON; + fmode = nfs_type2fmt[type]; fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); @@ -177,7 +172,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time3(p, &fattr->ctime); /* Update the mode bits */ - fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); + fattr->valid |= NFS_ATTR_FATTR_V3; return p; } @@ -233,7 +228,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_hyper(p, &fattr->pre_size); p = xdr_decode_time3(p, &fattr->pre_mtime); p = xdr_decode_time3(p, &fattr->pre_ctime); - fattr->valid |= NFS_ATTR_WCC; + fattr->valid |= NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME; return p; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8dde84b988d..a4d24268029 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start, KM_USER0); } -static int nfs4_wait_bit_killable(void *word) -{ - if (fatal_signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - static int nfs4_wait_clnt_recover(struct nfs_client *clp) { int res; @@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs4_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, TASK_KILLABLE); return res; } @@ -1439,7 +1431,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) if (calldata->arg.seqid == NULL) goto out_free_calldata; calldata->arg.fmode = 0; - calldata->arg.bitmask = server->attr_bitmask; + calldata->arg.bitmask = server->cache_consistency_bitmask; calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; @@ -1509,7 +1501,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) attr.ia_mode = nd->intent.open.create_mode; attr.ia_valid = ATTR_MODE; if (!IS_POSIXACL(dir)) - attr.ia_mode &= ~current->fs->umask; + attr.ia_mode &= ~current_umask(); } else { attr.ia_valid = 0; BUG_ON(nd->intent.open.flags & O_CREAT); @@ -1580,6 +1572,15 @@ out_drop: return 0; } +void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) +{ + if (ctx->state == NULL) + return; + if (is_sync) + nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); + else + nfs4_close_state(&ctx->path, ctx->state, ctx->mode); +} static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -1600,6 +1601,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); + server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; + server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->acl_bitmask = res.acl_bitmask; } return status; @@ -2079,7 +2083,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; - args->bitmask = server->attr_bitmask; + args->bitmask = server->cache_consistency_bitmask; res->server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; } @@ -2323,7 +2327,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .pages = &page, .pgbase = 0, .count = count, - .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, + .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask, }; struct nfs4_readdir_res res; struct rpc_message msg = { @@ -2552,7 +2556,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->attr_bitmask; + data->args.bitmask = server->cache_consistency_bitmask; data->res.server = server; data->timestamp = jiffies; @@ -2575,7 +2579,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->attr_bitmask; + data->args.bitmask = server->cache_consistency_bitmask; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; } @@ -3678,6 +3682,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) return len; } +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) +{ + if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) && + (fattr->valid & NFS_ATTR_FATTR_FSID) && + (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) + return; + + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page) { @@ -3704,6 +3721,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, fs_locations->server = server; fs_locations->nlocations = 0; status = rpc_call_sync(server->client, &msg, 0); + nfs_fixup_referral_attributes(&fs_locations->fattr); dprintk("%s: returned status = %d\n", __func__, status); return status; } @@ -3767,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .commit_done = nfs4_commit_done, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, + .close_context = nfs4_close_context, }; /* diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2022fe47966..0298e909559 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list); static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) { - int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, - nfs_callback_tcpport, cred); + unsigned short port; + int status; + + port = nfs_callback_tcpport; + if (clp->cl_addr.ss_family == AF_INET6) + port = nfs_callback_tcpport6; + + status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); if (status == 0) status = nfs4_proc_setclientid_confirm(clp, cred); if (status == 0) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d1e4c8f8a0a..1690f0e44b9 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int); decode_lookup_maxsz + \ decode_fs_locations_maxsz) -static struct { - unsigned int mode; - unsigned int nfs2type; -} nfs_type2fmt[] = { - { 0, NFNON }, - { S_IFREG, NFREG }, - { S_IFDIR, NFDIR }, - { S_IFBLK, NFBLK }, - { S_IFCHR, NFCHR }, - { S_IFLNK, NFLNK }, - { S_IFSOCK, NFSOCK }, - { S_IFIFO, NFFIFO }, - { 0, NFNON }, - { 0, NFNON }, +static const umode_t nfs_type2fmt[] = { + [NF4BAD] = 0, + [NF4REG] = S_IFREG, + [NF4DIR] = S_IFDIR, + [NF4BLK] = S_IFBLK, + [NF4CHR] = S_IFCHR, + [NF4LNK] = S_IFLNK, + [NF4SOCK] = S_IFSOCK, + [NF4FIFO] = S_IFIFO, + [NF4ATTRDIR] = 0, + [NF4NAMEDATTR] = 0, }; struct compound_hdr { @@ -2160,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3 static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) { __be32 *p; + int ret = 0; *type = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) @@ -2172,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * return -EIO; } bitmap[0] &= ~FATTR4_WORD0_TYPE; + ret = NFS_ATTR_FATTR_TYPE; } - dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); - return 0; + dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); + return ret; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) { __be32 *p; + int ret = 0; *change = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) @@ -2188,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t READ_BUF(8); READ64(*change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; + ret = NFS_ATTR_FATTR_CHANGE; } dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); - return 0; + return ret; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) { __be32 *p; + int ret = 0; *size = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) @@ -2205,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * READ_BUF(8); READ64(*size); bitmap[0] &= ~FATTR4_WORD0_SIZE; + ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); - return 0; + return ret; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2245,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) { __be32 *p; + int ret = 0; fsid->major = 0; fsid->minor = 0; @@ -2255,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs READ64(fsid->major); READ64(fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; + ret = NFS_ATTR_FATTR_FSID; } dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, (unsigned long long)fsid->major, (unsigned long long)fsid->minor); - return 0; + return ret; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2297,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; + int ret = 0; *fileid = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) @@ -2305,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t READ_BUF(8); READ64(*fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; + ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); - return 0; + return ret; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; + int ret = 0; *fileid = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) @@ -2321,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma READ_BUF(8); READ64(*fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); - return 0; + return ret; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2479,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) res->nlocations++; } + if (res->nlocations != 0) + status = NFS_ATTR_FATTR_V4_REFERRAL; out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; @@ -2580,26 +2591,30 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return status; } -static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) +static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) { + uint32_t tmp; __be32 *p; + int ret = 0; *mode = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { READ_BUF(4); - READ32(*mode); - *mode &= ~S_IFMT; + READ32(tmp); + *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; + ret = NFS_ATTR_FATTR_MODE; } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); - return 0; + return ret; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) { __be32 *p; + int ret = 0; *nlink = 1; if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) @@ -2608,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t READ_BUF(4); READ32(*nlink); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; + ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); - return 0; + return ret; } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) { uint32_t len; __be32 *p; + int ret = 0; *uid = -2; if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) @@ -2626,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf READ32(len); READ_BUF(len); if (len < XDR_MAX_NETOBJ) { - if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0) + if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) + ret = NFS_ATTR_FATTR_OWNER; + else dprintk("%s: nfs_map_name_to_uid failed!\n", __func__); } else @@ -2635,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf bitmap[1] &= ~FATTR4_WORD1_OWNER; } dprintk("%s: uid=%d\n", __func__, (int)*uid); - return 0; + return ret; } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) { uint32_t len; __be32 *p; + int ret = 0; *gid = -2; if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) @@ -2651,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf READ32(len); READ_BUF(len); if (len < XDR_MAX_NETOBJ) { - if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0) + if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) + ret = NFS_ATTR_FATTR_GROUP; + else dprintk("%s: nfs_map_group_to_gid failed!\n", __func__); } else @@ -2660,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; } dprintk("%s: gid=%d\n", __func__, (int)*gid); - return 0; + return ret; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) { uint32_t major = 0, minor = 0; __be32 *p; + int ret = 0; *rdev = MKDEV(0,0); if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) @@ -2681,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde if (MAJOR(tmp) == major && MINOR(tmp) == minor) *rdev = tmp; bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; + ret = NFS_ATTR_FATTR_RDEV; } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); - return 0; + return ret; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2740,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) { __be32 *p; + int ret = 0; *used = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) @@ -2748,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint READ_BUF(8); READ64(*used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; + ret = NFS_ATTR_FATTR_SPACE_USED; } dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); - return 0; + return ret; } static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) @@ -2778,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_ATIME; bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; } dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); @@ -2794,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_CTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; } dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); @@ -2810,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_MTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; } dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); @@ -2994,63 +3026,116 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons uint32_t attrlen, bitmap[2] = {0}, type; - int status, fmode = 0; + int status; + umode_t fmode = 0; uint64_t fileid; - if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) - goto xdr_error; - if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + status = decode_op_hdr(xdr, OP_GETATTR); + if (status < 0) goto xdr_error; - fattr->bitmap[0] = bitmap[0]; - fattr->bitmap[1] = bitmap[1]; + status = decode_attr_bitmap(xdr, bitmap); + if (status < 0) + goto xdr_error; - if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + status = decode_attr_length(xdr, &attrlen, &savep); + if (status < 0) goto xdr_error; - if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) + status = decode_attr_type(xdr, bitmap, &type); + if (status < 0) goto xdr_error; - fattr->type = nfs_type2fmt[type].nfs2type; - fmode = nfs_type2fmt[type].mode; + fattr->mode = 0; + if (status != 0) { + fattr->mode |= nfs_type2fmt[type]; + fattr->valid |= status; + } - if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) + status = decode_attr_change(xdr, bitmap, &fattr->change_attr); + if (status < 0) goto xdr_error; - if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) + fattr->valid |= status; + + status = decode_attr_size(xdr, bitmap, &fattr->size); + if (status < 0) goto xdr_error; - if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) + fattr->valid |= status; + + status = decode_attr_fsid(xdr, bitmap, &fattr->fsid); + if (status < 0) goto xdr_error; - if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) + fattr->valid |= status; + + status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); + if (status < 0) goto xdr_error; - if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, + fattr->valid |= status; + + status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, struct nfs4_fs_locations, - fattr))) != 0) + fattr)); + if (status < 0) goto xdr_error; - if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) + fattr->valid |= status; + + status = decode_attr_mode(xdr, bitmap, &fmode); + if (status < 0) goto xdr_error; - fattr->mode |= fmode; - if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) + if (status != 0) { + fattr->mode |= fmode; + fattr->valid |= status; + } + + status = decode_attr_nlink(xdr, bitmap, &fattr->nlink); + if (status < 0) goto xdr_error; - if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0) + fattr->valid |= status; + + status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); + if (status < 0) goto xdr_error; - if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0) + fattr->valid |= status; + + status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); + if (status < 0) goto xdr_error; - if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0) + fattr->valid |= status; + + status = decode_attr_rdev(xdr, bitmap, &fattr->rdev); + if (status < 0) goto xdr_error; - if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0) + fattr->valid |= status; + + status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used); + if (status < 0) goto xdr_error; - if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0) + fattr->valid |= status; + + status = decode_attr_time_access(xdr, bitmap, &fattr->atime); + if (status < 0) goto xdr_error; - if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0) + fattr->valid |= status; + + status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); + if (status < 0) goto xdr_error; - if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) + fattr->valid |= status; + + status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime); + if (status < 0) goto xdr_error; - if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0) + fattr->valid |= status; + + status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid); + if (status < 0) goto xdr_error; - if (fattr->fileid == 0 && fileid != 0) + if (status != 0 && !(fattr->valid & status)) { fattr->fileid = fileid; - if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) - fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; + fattr->valid |= status; + } + + status = verify_attr_len(xdr, savep, attrlen); xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; @@ -4078,9 +4163,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se status = decode_setattr(&xdr, res); if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); - if (status == NFS4ERR_DELAY) - status = 0; + decode_getfattr(&xdr, res->fattr, res->server); out: return status; } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7f079209d70..e2975939126 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req) kref_put(&req->wb_kref, nfs_free_request); } -static int nfs_wait_bit_killable(void *word) -{ - int ret = 0; - - if (fatal_signal_pending(current)) - ret = -ERESTARTSYS; - else - schedule(); - return ret; -} - /** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 193465210d7..7be72d90d49 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .commit_setup = nfs_proc_commit_setup, .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, + .close_context = nfs_close_context, }; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f856004bb7f..4ace3c50a8e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -24,6 +24,7 @@ #include "internal.h" #include "iostat.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -111,8 +112,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } -static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, - struct page *page) +int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, + struct page *page) { LIST_HEAD(one_request); struct nfs_page *new; @@ -139,6 +140,11 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, static void nfs_readpage_release(struct nfs_page *req) { + struct inode *d_inode = req->wb_context->path.dentry->d_inode; + + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + unlock_page(req->wb_page); dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", @@ -510,8 +516,15 @@ int nfs_readpage(struct file *file, struct page *page) } else ctx = get_nfs_open_context(nfs_file_open_context(file)); + if (!IS_SYNC(inode)) { + error = nfs_readpage_from_fscache(ctx, inode, page); + if (error == 0) + goto out; + } + error = nfs_readpage_async(ctx, inode, page); +out: put_nfs_open_context(ctx); return error; out_unlock: @@ -584,6 +597,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, return -EBADF; } else desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); + + /* attempt to read as many of the pages as possible from the cache + * - this returns -ENOBUFS immediately if the cookie is negative + */ + ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping, + pages, &nr_pages); + if (ret == 0) + goto read_complete; /* all pages were read */ + if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); else @@ -594,6 +616,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, nfs_pageio_complete(&pgio); npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; nfs_add_stats(inode, NFSIOS_READPAGES, npages); +read_complete: put_nfs_open_context(desc.ctx); out: return ret; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d6686f4786d..82eaadbff40 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -60,6 +60,7 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -76,6 +77,7 @@ enum { Opt_rdirplus, Opt_nordirplus, Opt_sharecache, Opt_nosharecache, Opt_resvport, Opt_noresvport, + Opt_fscache, Opt_nofscache, /* Mount options that take integer arguments */ Opt_port, @@ -93,6 +95,7 @@ enum { Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, Opt_lookupcache, + Opt_fscache_uniq, /* Special mount options */ Opt_userspace, Opt_deprecated, Opt_sloppy, @@ -132,6 +135,9 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_nosharecache, "nosharecache" }, { Opt_resvport, "resvport" }, { Opt_noresvport, "noresvport" }, + { Opt_fscache, "fsc" }, + { Opt_fscache_uniq, "fsc=%s" }, + { Opt_nofscache, "nofsc" }, { Opt_port, "port=%u" }, { Opt_rsize, "rsize=%u" }, @@ -563,6 +569,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (clp->rpc_ops->version == 4) seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); #endif + if (nfss->options & NFS_OPTION_FSCACHE) + seq_printf(m, ",fsc"); } /* @@ -641,6 +649,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) totals.events[i] += stats->events[i]; for (i = 0; i < __NFSIOS_BYTESMAX; i++) totals.bytes[i] += stats->bytes[i]; +#ifdef CONFIG_NFS_FSCACHE + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + totals.fscache[i] += stats->fscache[i]; +#endif preempt_enable(); } @@ -651,6 +663,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, "\n\tbytes:\t"); for (i = 0; i < __NFSIOS_BYTESMAX; i++) seq_printf(m, "%Lu ", totals.bytes[i]); +#ifdef CONFIG_NFS_FSCACHE + if (nfss->options & NFS_OPTION_FSCACHE) { + seq_printf(m, "\n\tfsc:\t"); + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); + } +#endif seq_printf(m, "\n"); rpc_print_iostats(m, nfss->client); @@ -1018,6 +1037,7 @@ static int nfs_parse_mount_options(char *raw, case Opt_rdma: mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; + xprt_load_transport(p); break; case Opt_acl: mnt->flags &= ~NFS_MOUNT_NOACL; @@ -1043,6 +1063,24 @@ static int nfs_parse_mount_options(char *raw, case Opt_noresvport: mnt->flags |= NFS_MOUNT_NORESVPORT; break; + case Opt_fscache: + mnt->options |= NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + case Opt_nofscache: + mnt->options &= ~NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + case Opt_fscache_uniq: + string = match_strdup(args); + if (!string) + goto out_nomem; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = string; + mnt->options |= NFS_OPTION_FSCACHE; + break; /* * options that take numeric values @@ -1205,12 +1243,14 @@ static int nfs_parse_mount_options(char *raw, /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; + xprt_load_transport(string); break; default: errors++; dfprintk(MOUNT, "NFS: unrecognized " "transport protocol\n"); } + kfree(string); break; case Opt_mountproto: string = match_strdup(args); @@ -1218,7 +1258,6 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; token = match_token(string, nfs_xprt_protocol_tokens, args); - kfree(string); switch (token) { case Opt_xprt_udp: @@ -1868,8 +1907,6 @@ static void nfs_clone_super(struct super_block *sb, nfs_initialise_sb(sb); } -#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) - static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) { const struct nfs_server *a = s->s_fs_info; @@ -2034,6 +2071,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ nfs_fill_super(s, data); + nfs_fscache_get_super_cookie(s, data); } mntroot = nfs_get_root(s, mntfh); @@ -2054,6 +2092,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, out: kfree(data->nfs_server.hostname); kfree(data->mount_server.hostname); + kfree(data->fscache_uniq); security_free_mnt_opts(&data->lsm_opts); out_free_fh: kfree(mntfh); @@ -2081,6 +2120,7 @@ static void nfs_kill_super(struct super_block *s) bdi_unregister(&server->backing_dev_info); kill_anon_super(s); + nfs_fscache_release_super_cookie(s); nfs_free_server(server); } @@ -2388,6 +2428,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ nfs4_fill_super(s); + nfs_fscache_get_super_cookie(s, data); } mntroot = nfs4_get_root(s, mntfh); @@ -2409,6 +2450,7 @@ out: kfree(data->client_address); kfree(data->nfs_server.export_path); kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); security_free_mnt_opts(&data->lsm_opts); out_free_fh: kfree(mntfh); @@ -2435,6 +2477,7 @@ static void nfs4_kill_super(struct super_block *sb) kill_anon_super(sb); nfs4_renewd_prepare_shutdown(server); + nfs_fscache_release_super_cookie(sb); nfs_free_server(server); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9f9845859fc..e560a78995a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; + unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; + /* Stop dirtying of new pages while we sync */ + err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (err) + goto out_err; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); + + clear_bit_unlock(NFS_INO_FLUSHING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_FLUSHING); + if (err < 0) - return err; - if (pgio.pg_error < 0) - return pgio.pg_error; + goto out_err; + err = pgio.pg_error; + if (err < 0) + goto out_err; return 0; +out_err: + return err; } /* @@ -404,7 +419,6 @@ nfs_mark_request_commit(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - nfsi->ncommit++; set_bit(PG_CLEAN, &(req)->wb_flags); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, @@ -524,6 +538,12 @@ static void nfs_cancel_commit_list(struct list_head *head) } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static int +nfs_need_commit(struct nfs_inode *nfsi) +{ + return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); +} + /* * nfs_scan_commit - Scan an inode for commit requests * @inode: NFS inode to scan @@ -538,16 +558,18 @@ static int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); - int res = 0; - if (nfsi->ncommit != 0) { - res = nfs_scan_list(nfsi, dst, idx_start, npages, - NFS_PAGE_TAG_COMMIT); - nfsi->ncommit -= res; - } - return res; + if (!nfs_need_commit(nfsi)) + return 0; + + return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); } #else +static inline int nfs_need_commit(struct nfs_inode *nfsi) +{ + return 0; +} + static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { return 0; @@ -820,7 +842,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; - if (!NFS_I(inode)->ncommit) + if (!nfs_need_commit(NFS_I(inode))) data->args.stable = NFS_FILE_SYNC; } @@ -1425,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how) { struct writeback_control wbc = { .bdi = mapping->backing_dev_info, - .sync_mode = WB_SYNC_NONE, + .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = 0, .range_end = LLONG_MAX, .for_writepages = 1, }; - int ret; - ret = __nfs_write_mapping(mapping, &wbc, how); - if (ret < 0) - return ret; - wbc.sync_mode = WB_SYNC_ALL; return __nfs_write_mapping(mapping, &wbc, how); } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3d93b2064ce..a4ed8644d69 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -938,10 +938,12 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) char transport[16]; int port; if (sscanf(buf, "%15s %4d", transport, &port) == 2) { + if (port < 1 || port > 65535) + return -EINVAL; err = nfsd_create_serv(); if (!err) { err = svc_create_xprt(nfsd_serv, - transport, port, + transport, PF_INET, port, SVC_SOCK_ANONYMOUS); if (err == -ENOENT) /* Give a reasonable perror msg for @@ -960,7 +962,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) char transport[16]; int port; if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { - if (port == 0) + if (port < 1 || port > 65535) return -EINVAL; if (nfsd_serv) { xprt = svc_find_xprt(nfsd_serv, transport, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 07e4f5d7baa..7c09852be71 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -229,7 +229,6 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; @@ -244,7 +243,7 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = svc_create_xprt(nfsd_serv, "udp", port, + error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; @@ -253,7 +252,7 @@ static int nfsd_init_socks(int port) if (error < 0) return error; - error = svc_create_xprt(nfsd_serv, "tcp", port, + error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; @@ -404,7 +403,6 @@ static int nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; - struct fs_struct *fsp; int err, preverr = 0; /* Lock module and set up kernel thread */ @@ -413,13 +411,11 @@ nfsd(void *vrqstp) /* At this point, the thread shares current->fs * with the init process. We need to create files with a * umask of 0 instead of init's umask. */ - fsp = copy_fs_struct(current->fs); - if (!fsp) { + if (unshare_fs_struct() < 0) { printk("Unable to start nfsd thread: out of memory\n"); goto out; } - exit_fs(current); - current->fs = fsp; + current->fs->umask = 0; /* diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 34314b33dbd..5a9e34475e3 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -32,8 +32,8 @@ /** * The little endian Unicode string $I30 as a global constant. */ -ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'), - const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 }; +ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), + cpu_to_le16('3'), cpu_to_le16('0'), 0 }; /** * ntfs_lookup_inode_by_name - find an inode in a directory given its name diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 86bef156cf0..82c5085559c 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1975,8 +1975,7 @@ int ntfs_read_inode_mount(struct inode *vi) goto em_put_err_out; next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + le16_to_cpu(al_entry->length)); - if (le32_to_cpu(al_entry->type) > - const_le32_to_cpu(AT_DATA)) + if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA)) goto em_put_err_out; if (AT_DATA != al_entry->type) continue; diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index 1e383328ece..50931b1ce4b 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -31,19 +31,8 @@ #include "types.h" -/* - * Constant endianness conversion defines. - */ -#define const_le16_to_cpu(x) __constant_le16_to_cpu(x) -#define const_le32_to_cpu(x) __constant_le32_to_cpu(x) -#define const_le64_to_cpu(x) __constant_le64_to_cpu(x) - -#define const_cpu_to_le16(x) __constant_cpu_to_le16(x) -#define const_cpu_to_le32(x) __constant_cpu_to_le32(x) -#define const_cpu_to_le64(x) __constant_cpu_to_le64(x) - /* The NTFS oem_id "NTFS " */ -#define magicNTFS const_cpu_to_le64(0x202020205346544eULL) +#define magicNTFS cpu_to_le64(0x202020205346544eULL) /* * Location of bootsector on partition: @@ -114,25 +103,25 @@ typedef struct { */ enum { /* Found in $MFT/$DATA. */ - magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */ - magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */ - magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ + magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */ + magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ + magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ /* Found in $LogFile/$DATA. */ - magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */ - magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */ + magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ + magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ - magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ + magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ /* Found in all ntfs record containing records. */ - magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector + magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector transfer was detected. */ /* * Found in $LogFile/$DATA when a page is full of 0xff bytes and is * thus not initialized. Page must be initialized before using it. */ - magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */ + magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ }; typedef le32 NTFS_RECORD_TYPE; @@ -258,8 +247,8 @@ typedef enum { * information about the mft record in which they are present. */ enum { - MFT_RECORD_IN_USE = const_cpu_to_le16(0x0001), - MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002), + MFT_RECORD_IN_USE = cpu_to_le16(0x0001), + MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), } __attribute__ ((__packed__)); typedef le16 MFT_RECORD_FLAGS; @@ -309,7 +298,7 @@ typedef le16 MFT_RECORD_FLAGS; * Note: The _LE versions will return a CPU endian formatted value! */ #define MFT_REF_MASK_CPU 0x0000ffffffffffffULL -#define MFT_REF_MASK_LE const_cpu_to_le64(MFT_REF_MASK_CPU) +#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) typedef u64 MFT_REF; typedef le64 leMFT_REF; @@ -477,25 +466,25 @@ typedef struct { * a revealing choice of symbol I do not know what is... (-; */ enum { - AT_UNUSED = const_cpu_to_le32( 0), - AT_STANDARD_INFORMATION = const_cpu_to_le32( 0x10), - AT_ATTRIBUTE_LIST = const_cpu_to_le32( 0x20), - AT_FILE_NAME = const_cpu_to_le32( 0x30), - AT_OBJECT_ID = const_cpu_to_le32( 0x40), - AT_SECURITY_DESCRIPTOR = const_cpu_to_le32( 0x50), - AT_VOLUME_NAME = const_cpu_to_le32( 0x60), - AT_VOLUME_INFORMATION = const_cpu_to_le32( 0x70), - AT_DATA = const_cpu_to_le32( 0x80), - AT_INDEX_ROOT = const_cpu_to_le32( 0x90), - AT_INDEX_ALLOCATION = const_cpu_to_le32( 0xa0), - AT_BITMAP = const_cpu_to_le32( 0xb0), - AT_REPARSE_POINT = const_cpu_to_le32( 0xc0), - AT_EA_INFORMATION = const_cpu_to_le32( 0xd0), - AT_EA = const_cpu_to_le32( 0xe0), - AT_PROPERTY_SET = const_cpu_to_le32( 0xf0), - AT_LOGGED_UTILITY_STREAM = const_cpu_to_le32( 0x100), - AT_FIRST_USER_DEFINED_ATTRIBUTE = const_cpu_to_le32( 0x1000), - AT_END = const_cpu_to_le32(0xffffffff) + AT_UNUSED = cpu_to_le32( 0), + AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), + AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), + AT_FILE_NAME = cpu_to_le32( 0x30), + AT_OBJECT_ID = cpu_to_le32( 0x40), + AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), + AT_VOLUME_NAME = cpu_to_le32( 0x60), + AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), + AT_DATA = cpu_to_le32( 0x80), + AT_INDEX_ROOT = cpu_to_le32( 0x90), + AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), + AT_BITMAP = cpu_to_le32( 0xb0), + AT_REPARSE_POINT = cpu_to_le32( 0xc0), + AT_EA_INFORMATION = cpu_to_le32( 0xd0), + AT_EA = cpu_to_le32( 0xe0), + AT_PROPERTY_SET = cpu_to_le32( 0xf0), + AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), + AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), + AT_END = cpu_to_le32(0xffffffff) }; typedef le32 ATTR_TYPE; @@ -539,13 +528,13 @@ typedef le32 ATTR_TYPE; * equal then the second le32 values would be compared, etc. */ enum { - COLLATION_BINARY = const_cpu_to_le32(0x00), - COLLATION_FILE_NAME = const_cpu_to_le32(0x01), - COLLATION_UNICODE_STRING = const_cpu_to_le32(0x02), - COLLATION_NTOFS_ULONG = const_cpu_to_le32(0x10), - COLLATION_NTOFS_SID = const_cpu_to_le32(0x11), - COLLATION_NTOFS_SECURITY_HASH = const_cpu_to_le32(0x12), - COLLATION_NTOFS_ULONGS = const_cpu_to_le32(0x13), + COLLATION_BINARY = cpu_to_le32(0x00), + COLLATION_FILE_NAME = cpu_to_le32(0x01), + COLLATION_UNICODE_STRING = cpu_to_le32(0x02), + COLLATION_NTOFS_ULONG = cpu_to_le32(0x10), + COLLATION_NTOFS_SID = cpu_to_le32(0x11), + COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12), + COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), }; typedef le32 COLLATION_RULE; @@ -559,25 +548,25 @@ typedef le32 COLLATION_RULE; * NT4. */ enum { - ATTR_DEF_INDEXABLE = const_cpu_to_le32(0x02), /* Attribute can be + ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be indexed. */ - ATTR_DEF_MULTIPLE = const_cpu_to_le32(0x04), /* Attribute type + ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type can be present multiple times in the mft records of an inode. */ - ATTR_DEF_NOT_ZERO = const_cpu_to_le32(0x08), /* Attribute value + ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value must contain at least one non-zero byte. */ - ATTR_DEF_INDEXED_UNIQUE = const_cpu_to_le32(0x10), /* Attribute must be + ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be indexed and the attribute value must be unique for the attribute type in all of the mft records of an inode. */ - ATTR_DEF_NAMED_UNIQUE = const_cpu_to_le32(0x20), /* Attribute must be + ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be named and the name must be unique for the attribute type in all of the mft records of an inode. */ - ATTR_DEF_RESIDENT = const_cpu_to_le32(0x40), /* Attribute must be + ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be resident. */ - ATTR_DEF_ALWAYS_LOG = const_cpu_to_le32(0x80), /* Always log + ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log modifications to this attribute, regardless of whether it is resident or non-resident. Without this, only log @@ -614,12 +603,12 @@ typedef struct { * Attribute flags (16-bit). */ enum { - ATTR_IS_COMPRESSED = const_cpu_to_le16(0x0001), - ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method + ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), + ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method mask. Also, first illegal value. */ - ATTR_IS_ENCRYPTED = const_cpu_to_le16(0x4000), - ATTR_IS_SPARSE = const_cpu_to_le16(0x8000), + ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), + ATTR_IS_SPARSE = cpu_to_le16(0x8000), } __attribute__ ((__packed__)); typedef le16 ATTR_FLAGS; @@ -811,32 +800,32 @@ typedef ATTR_RECORD ATTR_REC; * flags appear in all of the above. */ enum { - FILE_ATTR_READONLY = const_cpu_to_le32(0x00000001), - FILE_ATTR_HIDDEN = const_cpu_to_le32(0x00000002), - FILE_ATTR_SYSTEM = const_cpu_to_le32(0x00000004), - /* Old DOS volid. Unused in NT. = const_cpu_to_le32(0x00000008), */ + FILE_ATTR_READONLY = cpu_to_le32(0x00000001), + FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002), + FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004), + /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ - FILE_ATTR_DIRECTORY = const_cpu_to_le32(0x00000010), + FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is reserved for the DOS SUBDIRECTORY flag. */ - FILE_ATTR_ARCHIVE = const_cpu_to_le32(0x00000020), - FILE_ATTR_DEVICE = const_cpu_to_le32(0x00000040), - FILE_ATTR_NORMAL = const_cpu_to_le32(0x00000080), + FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), + FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), + FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), - FILE_ATTR_TEMPORARY = const_cpu_to_le32(0x00000100), - FILE_ATTR_SPARSE_FILE = const_cpu_to_le32(0x00000200), - FILE_ATTR_REPARSE_POINT = const_cpu_to_le32(0x00000400), - FILE_ATTR_COMPRESSED = const_cpu_to_le32(0x00000800), + FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100), + FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200), + FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400), + FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800), - FILE_ATTR_OFFLINE = const_cpu_to_le32(0x00001000), - FILE_ATTR_NOT_CONTENT_INDEXED = const_cpu_to_le32(0x00002000), - FILE_ATTR_ENCRYPTED = const_cpu_to_le32(0x00004000), + FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000), + FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), + FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), - FILE_ATTR_VALID_FLAGS = const_cpu_to_le32(0x00007fb7), + FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the FILE_ATTR_DEVICE and preserves everything else. This mask is used to obtain all flags that are valid for reading. */ - FILE_ATTR_VALID_SET_FLAGS = const_cpu_to_le32(0x000031a7), + FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask @@ -846,11 +835,11 @@ enum { * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION * attribute of an mft record. */ - FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000), + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), /* Note, this is a copy of the corresponding bit from the mft record, telling us whether this is a directory or not, i.e. whether it has an index root attribute or not. */ - FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000), + FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), /* Note, this is a copy of the corresponding bit from the mft record, telling us whether this file has a view index present (eg. object id index, quota index, one of the security indexes or the encrypting @@ -1446,42 +1435,42 @@ enum { /* Specific rights for files and directories are as follows: */ /* Right to read data from the file. (FILE) */ - FILE_READ_DATA = const_cpu_to_le32(0x00000001), + FILE_READ_DATA = cpu_to_le32(0x00000001), /* Right to list contents of a directory. (DIRECTORY) */ - FILE_LIST_DIRECTORY = const_cpu_to_le32(0x00000001), + FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001), /* Right to write data to the file. (FILE) */ - FILE_WRITE_DATA = const_cpu_to_le32(0x00000002), + FILE_WRITE_DATA = cpu_to_le32(0x00000002), /* Right to create a file in the directory. (DIRECTORY) */ - FILE_ADD_FILE = const_cpu_to_le32(0x00000002), + FILE_ADD_FILE = cpu_to_le32(0x00000002), /* Right to append data to the file. (FILE) */ - FILE_APPEND_DATA = const_cpu_to_le32(0x00000004), + FILE_APPEND_DATA = cpu_to_le32(0x00000004), /* Right to create a subdirectory. (DIRECTORY) */ - FILE_ADD_SUBDIRECTORY = const_cpu_to_le32(0x00000004), + FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004), /* Right to read extended attributes. (FILE/DIRECTORY) */ - FILE_READ_EA = const_cpu_to_le32(0x00000008), + FILE_READ_EA = cpu_to_le32(0x00000008), /* Right to write extended attributes. (FILE/DIRECTORY) */ - FILE_WRITE_EA = const_cpu_to_le32(0x00000010), + FILE_WRITE_EA = cpu_to_le32(0x00000010), /* Right to execute a file. (FILE) */ - FILE_EXECUTE = const_cpu_to_le32(0x00000020), + FILE_EXECUTE = cpu_to_le32(0x00000020), /* Right to traverse the directory. (DIRECTORY) */ - FILE_TRAVERSE = const_cpu_to_le32(0x00000020), + FILE_TRAVERSE = cpu_to_le32(0x00000020), /* * Right to delete a directory and all the files it contains (its * children), even if the files are read-only. (DIRECTORY) */ - FILE_DELETE_CHILD = const_cpu_to_le32(0x00000040), + FILE_DELETE_CHILD = cpu_to_le32(0x00000040), /* Right to read file attributes. (FILE/DIRECTORY) */ - FILE_READ_ATTRIBUTES = const_cpu_to_le32(0x00000080), + FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080), /* Right to change file attributes. (FILE/DIRECTORY) */ - FILE_WRITE_ATTRIBUTES = const_cpu_to_le32(0x00000100), + FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100), /* * The standard rights (bits 16 to 23). These are independent of the @@ -1489,27 +1478,27 @@ enum { */ /* Right to delete the object. */ - DELETE = const_cpu_to_le32(0x00010000), + DELETE = cpu_to_le32(0x00010000), /* * Right to read the information in the object's security descriptor, * not including the information in the SACL, i.e. right to read the * security descriptor and owner. */ - READ_CONTROL = const_cpu_to_le32(0x00020000), + READ_CONTROL = cpu_to_le32(0x00020000), /* Right to modify the DACL in the object's security descriptor. */ - WRITE_DAC = const_cpu_to_le32(0x00040000), + WRITE_DAC = cpu_to_le32(0x00040000), /* Right to change the owner in the object's security descriptor. */ - WRITE_OWNER = const_cpu_to_le32(0x00080000), + WRITE_OWNER = cpu_to_le32(0x00080000), /* * Right to use the object for synchronization. Enables a process to * wait until the object is in the signalled state. Some object types * do not support this access right. */ - SYNCHRONIZE = const_cpu_to_le32(0x00100000), + SYNCHRONIZE = cpu_to_le32(0x00100000), /* * The following STANDARD_RIGHTS_* are combinations of the above for @@ -1517,25 +1506,25 @@ enum { */ /* These are currently defined to READ_CONTROL. */ - STANDARD_RIGHTS_READ = const_cpu_to_le32(0x00020000), - STANDARD_RIGHTS_WRITE = const_cpu_to_le32(0x00020000), - STANDARD_RIGHTS_EXECUTE = const_cpu_to_le32(0x00020000), + STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000), /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ - STANDARD_RIGHTS_REQUIRED = const_cpu_to_le32(0x000f0000), + STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000), /* * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and * SYNCHRONIZE access. */ - STANDARD_RIGHTS_ALL = const_cpu_to_le32(0x001f0000), + STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000), /* * The access system ACL and maximum allowed access types (bits 24 to * 25, bits 26 to 27 are reserved). */ - ACCESS_SYSTEM_SECURITY = const_cpu_to_le32(0x01000000), - MAXIMUM_ALLOWED = const_cpu_to_le32(0x02000000), + ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000), + MAXIMUM_ALLOWED = cpu_to_le32(0x02000000), /* * The generic rights (bits 28 to 31). These map onto the standard and @@ -1543,10 +1532,10 @@ enum { */ /* Read, write, and execute access. */ - GENERIC_ALL = const_cpu_to_le32(0x10000000), + GENERIC_ALL = cpu_to_le32(0x10000000), /* Execute access. */ - GENERIC_EXECUTE = const_cpu_to_le32(0x20000000), + GENERIC_EXECUTE = cpu_to_le32(0x20000000), /* * Write access. For files, this maps onto: @@ -1555,7 +1544,7 @@ enum { * For directories, the mapping has the same numerical value. See * above for the descriptions of the rights granted. */ - GENERIC_WRITE = const_cpu_to_le32(0x40000000), + GENERIC_WRITE = cpu_to_le32(0x40000000), /* * Read access. For files, this maps onto: @@ -1564,7 +1553,7 @@ enum { * For directories, the mapping has the same numberical value. See * above for the descriptions of the rights granted. */ - GENERIC_READ = const_cpu_to_le32(0x80000000), + GENERIC_READ = cpu_to_le32(0x80000000), }; typedef le32 ACCESS_MASK; @@ -1604,8 +1593,8 @@ typedef struct { * The object ACE flags (32-bit). */ enum { - ACE_OBJECT_TYPE_PRESENT = const_cpu_to_le32(1), - ACE_INHERITED_OBJECT_TYPE_PRESENT = const_cpu_to_le32(2), + ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1), + ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2), }; typedef le32 OBJECT_ACE_FLAGS; @@ -1706,23 +1695,23 @@ typedef enum { * expressed as offsets from the beginning of the security descriptor. */ enum { - SE_OWNER_DEFAULTED = const_cpu_to_le16(0x0001), - SE_GROUP_DEFAULTED = const_cpu_to_le16(0x0002), - SE_DACL_PRESENT = const_cpu_to_le16(0x0004), - SE_DACL_DEFAULTED = const_cpu_to_le16(0x0008), - - SE_SACL_PRESENT = const_cpu_to_le16(0x0010), - SE_SACL_DEFAULTED = const_cpu_to_le16(0x0020), - - SE_DACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0100), - SE_SACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0200), - SE_DACL_AUTO_INHERITED = const_cpu_to_le16(0x0400), - SE_SACL_AUTO_INHERITED = const_cpu_to_le16(0x0800), - - SE_DACL_PROTECTED = const_cpu_to_le16(0x1000), - SE_SACL_PROTECTED = const_cpu_to_le16(0x2000), - SE_RM_CONTROL_VALID = const_cpu_to_le16(0x4000), - SE_SELF_RELATIVE = const_cpu_to_le16(0x8000) + SE_OWNER_DEFAULTED = cpu_to_le16(0x0001), + SE_GROUP_DEFAULTED = cpu_to_le16(0x0002), + SE_DACL_PRESENT = cpu_to_le16(0x0004), + SE_DACL_DEFAULTED = cpu_to_le16(0x0008), + + SE_SACL_PRESENT = cpu_to_le16(0x0010), + SE_SACL_DEFAULTED = cpu_to_le16(0x0020), + + SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100), + SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200), + SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400), + SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800), + + SE_DACL_PROTECTED = cpu_to_le16(0x1000), + SE_SACL_PROTECTED = cpu_to_le16(0x2000), + SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), + SE_SELF_RELATIVE = cpu_to_le16(0x8000) } __attribute__ ((__packed__)); typedef le16 SECURITY_DESCRIPTOR_CONTROL; @@ -1910,21 +1899,21 @@ typedef struct { * Possible flags for the volume (16-bit). */ enum { - VOLUME_IS_DIRTY = const_cpu_to_le16(0x0001), - VOLUME_RESIZE_LOG_FILE = const_cpu_to_le16(0x0002), - VOLUME_UPGRADE_ON_MOUNT = const_cpu_to_le16(0x0004), - VOLUME_MOUNTED_ON_NT4 = const_cpu_to_le16(0x0008), + VOLUME_IS_DIRTY = cpu_to_le16(0x0001), + VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002), + VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004), + VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008), - VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010), - VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020), + VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010), + VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020), - VOLUME_CHKDSK_UNDERWAY = const_cpu_to_le16(0x4000), - VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000), + VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000), + VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000), - VOLUME_FLAGS_MASK = const_cpu_to_le16(0xc03f), + VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f), /* To make our life easier when checking if we must mount read-only. */ - VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0xc027), + VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), } __attribute__ ((__packed__)); typedef le16 VOLUME_FLAGS; @@ -2109,26 +2098,26 @@ typedef struct { * The user quota flags. Names explain meaning. */ enum { - QUOTA_FLAG_DEFAULT_LIMITS = const_cpu_to_le32(0x00000001), - QUOTA_FLAG_LIMIT_REACHED = const_cpu_to_le32(0x00000002), - QUOTA_FLAG_ID_DELETED = const_cpu_to_le32(0x00000004), + QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001), + QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002), + QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004), - QUOTA_FLAG_USER_MASK = const_cpu_to_le32(0x00000007), + QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007), /* This is a bit mask for the user quota flags. */ /* * These flags are only present in the quota defaults index entry, i.e. * in the entry where owner_id = QUOTA_DEFAULTS_ID. */ - QUOTA_FLAG_TRACKING_ENABLED = const_cpu_to_le32(0x00000010), - QUOTA_FLAG_ENFORCEMENT_ENABLED = const_cpu_to_le32(0x00000020), - QUOTA_FLAG_TRACKING_REQUESTED = const_cpu_to_le32(0x00000040), - QUOTA_FLAG_LOG_THRESHOLD = const_cpu_to_le32(0x00000080), - - QUOTA_FLAG_LOG_LIMIT = const_cpu_to_le32(0x00000100), - QUOTA_FLAG_OUT_OF_DATE = const_cpu_to_le32(0x00000200), - QUOTA_FLAG_CORRUPT = const_cpu_to_le32(0x00000400), - QUOTA_FLAG_PENDING_DELETES = const_cpu_to_le32(0x00000800), + QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010), + QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020), + QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040), + QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080), + + QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100), + QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200), + QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400), + QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), }; typedef le32 QUOTA_FLAGS; @@ -2172,9 +2161,9 @@ typedef struct { * Predefined owner_id values (32-bit). */ enum { - QUOTA_INVALID_ID = const_cpu_to_le32(0x00000000), - QUOTA_DEFAULTS_ID = const_cpu_to_le32(0x00000001), - QUOTA_FIRST_USER_ID = const_cpu_to_le32(0x00000100), + QUOTA_INVALID_ID = cpu_to_le32(0x00000000), + QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001), + QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100), }; /* @@ -2189,14 +2178,14 @@ typedef enum { * Index entry flags (16-bit). */ enum { - INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a + INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a sub-node, i.e. a reference to an index block in form of a virtual cluster number (see below). */ - INDEX_ENTRY_END = const_cpu_to_le16(2), /* This signifies the last + INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last entry in an index block. The index entry does not represent a file but it can point to a sub-node. */ - INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force + INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16-bit. */ } __attribute__ ((__packed__)); @@ -2334,26 +2323,26 @@ typedef struct { * These are the predefined reparse point tags: */ enum { - IO_REPARSE_TAG_IS_ALIAS = const_cpu_to_le32(0x20000000), - IO_REPARSE_TAG_IS_HIGH_LATENCY = const_cpu_to_le32(0x40000000), - IO_REPARSE_TAG_IS_MICROSOFT = const_cpu_to_le32(0x80000000), + IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000), + IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000), + IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000), - IO_REPARSE_TAG_RESERVED_ZERO = const_cpu_to_le32(0x00000000), - IO_REPARSE_TAG_RESERVED_ONE = const_cpu_to_le32(0x00000001), - IO_REPARSE_TAG_RESERVED_RANGE = const_cpu_to_le32(0x00000001), + IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000), + IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001), + IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001), - IO_REPARSE_TAG_NSS = const_cpu_to_le32(0x68000005), - IO_REPARSE_TAG_NSS_RECOVER = const_cpu_to_le32(0x68000006), - IO_REPARSE_TAG_SIS = const_cpu_to_le32(0x68000007), - IO_REPARSE_TAG_DFS = const_cpu_to_le32(0x68000008), + IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005), + IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006), + IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007), + IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008), - IO_REPARSE_TAG_MOUNT_POINT = const_cpu_to_le32(0x88000003), + IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003), - IO_REPARSE_TAG_HSM = const_cpu_to_le32(0xa8000004), + IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004), - IO_REPARSE_TAG_SYMBOLIC_LINK = const_cpu_to_le32(0xe8000000), + IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000), - IO_REPARSE_TAG_VALID_VALUES = const_cpu_to_le32(0xe000ffff), + IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff), }; /* diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h index 9468e1c45ae..b5a6f08bd35 100644 --- a/fs/ntfs/logfile.h +++ b/fs/ntfs/logfile.h @@ -104,7 +104,7 @@ typedef struct { * in this particular client array. Also inside the client records themselves, * this means that there are no client records preceding or following this one. */ -#define LOGFILE_NO_CLIENT const_cpu_to_le16(0xffff) +#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff) #define LOGFILE_NO_CLIENT_CPU 0xffff /* @@ -112,8 +112,8 @@ typedef struct { * information about the log file in which they are present. */ enum { - RESTART_VOLUME_IS_CLEAN = const_cpu_to_le16(0x0002), - RESTART_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ + RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002), + RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ } __attribute__ ((__packed__)); typedef le16 RESTART_AREA_FLAGS; diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 17d32ca6bc3..23bf68453d7 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -2839,7 +2839,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m) */ /* Mark the mft record as not in use. */ - m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE)); + m->flags &= ~MFT_RECORD_IN_USE; /* Increment the sequence number, skipping zero, if it is not zero. */ old_seq_no = m->sequence_number; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 4a46743b507..f76951dcd4a 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -618,7 +618,7 @@ static bool is_boot_sector_ntfs(const struct super_block *sb, * many BIOSes will refuse to boot from a bootsector if the magic is * incorrect, so we emit a warning. */ - if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55)) + if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55)) ntfs_warning(sb, "Invalid end of sector marker."); return true; not_ntfs: @@ -1242,13 +1242,13 @@ static int check_windows_hibernation_status(ntfs_volume *vol) u32 *kaddr, *kend; ntfs_name *name = NULL; int ret = 1; - static const ntfschar hiberfil[13] = { const_cpu_to_le16('h'), - const_cpu_to_le16('i'), const_cpu_to_le16('b'), - const_cpu_to_le16('e'), const_cpu_to_le16('r'), - const_cpu_to_le16('f'), const_cpu_to_le16('i'), - const_cpu_to_le16('l'), const_cpu_to_le16('.'), - const_cpu_to_le16('s'), const_cpu_to_le16('y'), - const_cpu_to_le16('s'), 0 }; + static const ntfschar hiberfil[13] = { cpu_to_le16('h'), + cpu_to_le16('i'), cpu_to_le16('b'), + cpu_to_le16('e'), cpu_to_le16('r'), + cpu_to_le16('f'), cpu_to_le16('i'), + cpu_to_le16('l'), cpu_to_le16('.'), + cpu_to_le16('s'), cpu_to_le16('y'), + cpu_to_le16('s'), 0 }; ntfs_debug("Entering."); /* @@ -1296,7 +1296,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol) goto iput_out; } kaddr = (u32*)page_address(page); - if (*(le32*)kaddr == const_cpu_to_le32(0x72626968)/*'hibr'*/) { + if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " "hibernated on the volume. This is the " "system volume."); @@ -1337,12 +1337,12 @@ static bool load_and_init_quota(ntfs_volume *vol) MFT_REF mref; struct inode *tmp_ino; ntfs_name *name = NULL; - static const ntfschar Quota[7] = { const_cpu_to_le16('$'), - const_cpu_to_le16('Q'), const_cpu_to_le16('u'), - const_cpu_to_le16('o'), const_cpu_to_le16('t'), - const_cpu_to_le16('a'), 0 }; - static ntfschar Q[3] = { const_cpu_to_le16('$'), - const_cpu_to_le16('Q'), 0 }; + static const ntfschar Quota[7] = { cpu_to_le16('$'), + cpu_to_le16('Q'), cpu_to_le16('u'), + cpu_to_le16('o'), cpu_to_le16('t'), + cpu_to_le16('a'), 0 }; + static ntfschar Q[3] = { cpu_to_le16('$'), + cpu_to_le16('Q'), 0 }; ntfs_debug("Entering."); /* @@ -1416,16 +1416,16 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol) struct page *page; ntfs_name *name = NULL; USN_HEADER *uh; - static const ntfschar UsnJrnl[9] = { const_cpu_to_le16('$'), - const_cpu_to_le16('U'), const_cpu_to_le16('s'), - const_cpu_to_le16('n'), const_cpu_to_le16('J'), - const_cpu_to_le16('r'), const_cpu_to_le16('n'), - const_cpu_to_le16('l'), 0 }; - static ntfschar Max[5] = { const_cpu_to_le16('$'), - const_cpu_to_le16('M'), const_cpu_to_le16('a'), - const_cpu_to_le16('x'), 0 }; - static ntfschar J[3] = { const_cpu_to_le16('$'), - const_cpu_to_le16('J'), 0 }; + static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'), + cpu_to_le16('U'), cpu_to_le16('s'), + cpu_to_le16('n'), cpu_to_le16('J'), + cpu_to_le16('r'), cpu_to_le16('n'), + cpu_to_le16('l'), 0 }; + static ntfschar Max[5] = { cpu_to_le16('$'), + cpu_to_le16('M'), cpu_to_le16('a'), + cpu_to_le16('x'), 0 }; + static ntfschar J[3] = { cpu_to_le16('$'), + cpu_to_le16('J'), 0 }; ntfs_debug("Entering."); /* diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h index 4087fbdac32..00d8e6bd7c3 100644 --- a/fs/ntfs/usnjrnl.h +++ b/fs/ntfs/usnjrnl.h @@ -116,27 +116,27 @@ typedef struct { * documentation: http://www.linux-ntfs.org/ */ enum { - USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), - USN_REASON_DATA_EXTEND = const_cpu_to_le32(0x00000002), - USN_REASON_DATA_TRUNCATION = const_cpu_to_le32(0x00000004), - USN_REASON_NAMED_DATA_OVERWRITE = const_cpu_to_le32(0x00000010), - USN_REASON_NAMED_DATA_EXTEND = const_cpu_to_le32(0x00000020), - USN_REASON_NAMED_DATA_TRUNCATION= const_cpu_to_le32(0x00000040), - USN_REASON_FILE_CREATE = const_cpu_to_le32(0x00000100), - USN_REASON_FILE_DELETE = const_cpu_to_le32(0x00000200), - USN_REASON_EA_CHANGE = const_cpu_to_le32(0x00000400), - USN_REASON_SECURITY_CHANGE = const_cpu_to_le32(0x00000800), - USN_REASON_RENAME_OLD_NAME = const_cpu_to_le32(0x00001000), - USN_REASON_RENAME_NEW_NAME = const_cpu_to_le32(0x00002000), - USN_REASON_INDEXABLE_CHANGE = const_cpu_to_le32(0x00004000), - USN_REASON_BASIC_INFO_CHANGE = const_cpu_to_le32(0x00008000), - USN_REASON_HARD_LINK_CHANGE = const_cpu_to_le32(0x00010000), - USN_REASON_COMPRESSION_CHANGE = const_cpu_to_le32(0x00020000), - USN_REASON_ENCRYPTION_CHANGE = const_cpu_to_le32(0x00040000), - USN_REASON_OBJECT_ID_CHANGE = const_cpu_to_le32(0x00080000), - USN_REASON_REPARSE_POINT_CHANGE = const_cpu_to_le32(0x00100000), - USN_REASON_STREAM_CHANGE = const_cpu_to_le32(0x00200000), - USN_REASON_CLOSE = const_cpu_to_le32(0x80000000), + USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001), + USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002), + USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004), + USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010), + USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020), + USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040), + USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100), + USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200), + USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400), + USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800), + USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000), + USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000), + USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000), + USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000), + USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000), + USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000), + USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000), + USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000), + USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000), + USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000), + USN_REASON_CLOSE = cpu_to_le32(0x80000000), }; typedef le32 USN_REASON_FLAGS; @@ -148,9 +148,9 @@ typedef le32 USN_REASON_FLAGS; * http://www.linux-ntfs.org/ */ enum { - USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), - USN_SOURCE_AUXILIARY_DATA = const_cpu_to_le32(0x00000002), - USN_SOURCE_REPLICATION_MANAGEMENT = const_cpu_to_le32(0x00000004), + USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001), + USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002), + USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004), }; typedef le32 USN_SOURCE_INFO_FLAGS; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 12dfb44c22e..fbeaec76210 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle, return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index eea1d24713e..b606496b72e 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -154,8 +154,9 @@ out: return ret; } -static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct buffer_head *di_bh = NULL; sigset_t blocked, oldset; @@ -196,7 +197,8 @@ out: ret2 = ocfs2_vm_op_unblock_sigs(&oldset); if (ret2 < 0) mlog_errno(ret2); - + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 633e9dc972b..379ae5fb441 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -262,14 +262,19 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct omfs_sb_info *sbi = OMFS_SB(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); + buf->f_type = OMFS_MAGIC; buf->f_bsize = sbi->s_blocksize; buf->f_blocks = sbi->s_num_blocks; buf->f_files = sbi->s_num_blocks; buf->f_namelen = OMFS_NAMELEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_bfree = buf->f_bavail = buf->f_ffree = omfs_count_free(s); + return 0; } @@ -421,7 +426,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_uid = current_uid(); sbi->s_gid = current_gid(); - sbi->s_dmask = sbi->s_fmask = current->fs->umask; + sbi->s_dmask = sbi->s_fmask = current_umask(); if (!parse_options((char *) data, sbi)) goto end; diff --git a/fs/open.c b/fs/open.c index 75b61677daa..377eb25b6ab 100644 --- a/fs/open.c +++ b/fs/open.c @@ -29,6 +29,7 @@ #include <linux/rcupdate.h> #include <linux/audit.h> #include <linux/falloc.h> +#include <linux/fs_struct.h> int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { diff --git a/fs/proc/base.c b/fs/proc/base.c index e0afd326b68..f71559784bf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -80,6 +80,7 @@ #include <linux/oom.h> #include <linux/elf.h> #include <linux/pid_namespace.h> +#include <linux/fs_struct.h> #include "internal.h" /* NOTE: diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 43d23948384..74ea974f5ca 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram-i.freehigh), #endif #ifndef CONFIG_MMU - K((unsigned long) atomic_read(&mmap_pages_allocated)), + K((unsigned long) atomic_long_read(&mmap_pages_allocated)), #endif K(i.totalswap), K(i.freeswap), diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 4a9e0f65ae6..83adcc86943 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -144,16 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver) { struct proc_dir_entry *ent; - if (!driver->ops->read_proc || !driver->driver_name || - driver->proc_entry) + if (!driver->driver_name || driver->proc_entry || + !driver->ops->proc_fops) return; - ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); - if (!ent) - return; - ent->read_proc = driver->ops->read_proc; - ent->data = driver; - + ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_fops, driver); driver->proc_entry = ent; } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 343ea1216bc..863464d5519 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -2,6 +2,7 @@ #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/fs_struct.h> #include <linux/mount.h> #include <linux/ptrace.h> #include <linux/seq_file.h> @@ -49,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) else bytes += kobjsize(mm); - if (current->fs && atomic_read(¤t->fs->count) > 1) + if (current->fs && current->fs->users > 1) sbytes += kobjsize(current->fs); else bytes += kobjsize(current->fs); @@ -136,14 +137,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } seq_printf(m, - "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + (unsigned long long) vma->vm_pgoff << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2aad1044b84..fe1f0f31d11 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -282,6 +282,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock ) static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); lock_kernel(); @@ -291,6 +292,8 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = qnx4_count_free_blocks(sb); buf->f_bavail = buf->f_bfree; buf->f_namelen = QNX4_NAME_MAX; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); unlock_kernel(); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 2ca967a5ef7..607c579e5ec 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -823,7 +823,7 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; if (!atomic_read(&inode->i_writecount)) continue; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 995ef1d6686..ebb2c417912 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -59,7 +59,6 @@ const struct inode_operations ramfs_file_inode_operations = { */ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) { - struct pagevec lru_pvec; unsigned long npages, xpages, loop, limit; struct page *pages; unsigned order; @@ -102,24 +101,20 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) memset(data, 0, newsize); /* attach all the pages to the inode's address space */ - pagevec_init(&lru_pvec, 0); for (loop = 0; loop < npages; loop++) { struct page *page = pages + loop; - ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL); + ret = add_to_page_cache_lru(page, inode->i_mapping, loop, + GFP_KERNEL); if (ret < 0) goto add_error; - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add_file(&lru_pvec); - /* prevent the page from being discarded on memory pressure */ SetPageDirty(page); unlock_page(page); } - pagevec_lru_add_file(&lru_pvec); return 0; fsize_exceeded: @@ -128,10 +123,8 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) return -EFBIG; add_error: - pagevec_lru_add_file(&lru_pvec); - page_cache_release(pages + loop); - for (loop++; loop < npages; loop++) - __free_page(pages + loop); + while (loop < npages) + __free_page(pages + loop++); return ret; } diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index b7e6ac706b8..a404fb88e45 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -33,12 +33,15 @@ #include <linux/backing-dev.h> #include <linux/ramfs.h> #include <linux/sched.h> +#include <linux/parser.h> #include <asm/uaccess.h> #include "internal.h" /* some random number */ #define RAMFS_MAGIC 0x858458f6 +#define RAMFS_DEFAULT_MODE 0755 + static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; @@ -158,12 +161,75 @@ static const struct inode_operations ramfs_dir_inode_operations = { static const struct super_operations ramfs_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, + .show_options = generic_show_options, +}; + +struct ramfs_mount_opts { + umode_t mode; +}; + +enum { + Opt_mode, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_mode, "mode=%o"}, + {Opt_err, NULL} +}; + +struct ramfs_fs_info { + struct ramfs_mount_opts mount_opts; }; +static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option; + int token; + char *p; + + opts->mode = RAMFS_DEFAULT_MODE; + + while ((p = strsep(&data, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + default: + printk(KERN_ERR "ramfs: bad mount option: %s\n", p); + return -EINVAL; + } + } + + return 0; +} + static int ramfs_fill_super(struct super_block * sb, void * data, int silent) { - struct inode * inode; - struct dentry * root; + struct ramfs_fs_info *fsi; + struct inode *inode = NULL; + struct dentry *root; + int err; + + save_mount_options(sb, data); + + fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); + if (!fsi) { + err = -ENOMEM; + goto fail; + } + sb->s_fs_info = fsi; + + err = ramfs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_CACHE_SIZE; @@ -171,17 +237,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) sb->s_magic = RAMFS_MAGIC; sb->s_op = &ramfs_ops; sb->s_time_gran = 1; - inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0); - if (!inode) - return -ENOMEM; + inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); + if (!inode) { + err = -ENOMEM; + goto fail; + } root = d_alloc_root(inode); if (!root) { - iput(inode); - return -ENOMEM; + err = -ENOMEM; + goto fail; } sb->s_root = root; return 0; +fail: + kfree(fsi); + iput(inode); + return err; } int ramfs_get_sb(struct file_system_type *fs_type, @@ -197,10 +269,16 @@ static int rootfs_get_sb(struct file_system_type *fs_type, mnt); } +static void ramfs_kill_sb(struct super_block *sb) +{ + kfree(sb->s_fs_info); + kill_litter_super(sb); +} + static struct file_system_type ramfs_fs_type = { .name = "ramfs", .get_sb = ramfs_get_sb, - .kill_sb = kill_litter_super, + .kill_sb = ramfs_kill_sb, }; static struct file_system_type rootfs_fs_type = { .name = "rootfs", diff --git a/fs/read_write.c b/fs/read_write.c index 400fe81c973..6d5d8ff238a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -731,6 +731,56 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, return ret; } +SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, u32, pos_high, u32, pos_low) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = vfs_readv(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, u32, pos_high, u32, pos_low) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = vfs_writev(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 949b8c6addc..513f431038f 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,5 +1,6 @@ config REISERFS_FS tristate "Reiserfs support" + select CRC32 help Stores not just filenames but the files themselves in a balanced tree. Uses journalling. diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 972250c6289..0ae6486d904 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -27,6 +27,7 @@ #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/namei.h> +#include <linux/crc32.h> struct file_system_type reiserfs_fs_type; @@ -1904,6 +1905,10 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = dentry->d_sb->s_blocksize; /* changed to accommodate gcc folks. */ buf->f_type = REISERFS_SUPER_MAGIC; + buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2); + buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2, + sizeof(rs->s_uuid)/2); + return 0; } diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d423416d93d..c303c426fe2 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, } else { apply_umask: /* no ACL, apply umask */ - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } return err; diff --git a/fs/splice.c b/fs/splice.c index 4ed0ba44a96..dd727d43e5b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, */ wait_on_page_writeback(page); - if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) goto out_unlock; /* diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 681ec0d8379..ffa6edcd2d0 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -301,6 +301,7 @@ failure: static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; + u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev); TRACE("Entered squashfs_statfs\n"); @@ -311,6 +312,8 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = msblk->inodes; buf->f_ffree = 0; buf->f_namelen = SQUASHFS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); return 0; } diff --git a/fs/super.c b/fs/super.c index 2ba481518ba..77cb4ec919b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -287,6 +287,7 @@ int fsync_super(struct super_block *sb) __fsync_super(sb); return sync_blockdev(sb->s_bdev); } +EXPORT_SYMBOL_GPL(fsync_super); /** * generic_shutdown_super - common helper for ->kill_sb() diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 07703d3ff4a..93e0c0281d4 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -234,7 +234,7 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } -static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; struct bin_buffer *bb = file->private_data; @@ -242,15 +242,15 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page) int ret; if (!bb->vm_ops) - return -EINVAL; + return VM_FAULT_SIGBUS; if (!bb->vm_ops->page_mkwrite) return 0; if (!sysfs_get_active_two(attr_sd)) - return -EINVAL; + return VM_FAULT_SIGBUS; - ret = bb->vm_ops->page_mkwrite(vma, page); + ret = bb->vm_ops->page_mkwrite(vma, vmf); sysfs_put_active_two(attr_sd); return ret; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 3d81bf58dae..da20b48d350 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -90,6 +90,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct sysv_sb_info *sbi = SYSV_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; @@ -98,6 +99,8 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sbi->s_ninodes; buf->f_ffree = sysv_count_free_inodes(sb); buf->f_namelen = SYSV_NAMELEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); return 0; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 93b6de51f26..0ff89fe71e5 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) * mmap()d file has taken write protection fault and is being made * writable. UBIFS must ensure page is budgeted for. */ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); @@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); if (unlikely(c->ro_media)) - return -EROFS; + return VM_FAULT_SIGBUS; /* -EROFS */ /* * We have not locked @page so far so we may budget for changing the @@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) if (err == -ENOSPC) ubifs_warn("out of space for mmapped file " "(inode number %lu)", inode->i_ino); - return err; + return VM_FAULT_SIGBUS; } lock_page(page); @@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) out_unlock: unlock_page(page); ubifs_release_budget(c, &req); + if (err) + err = VM_FAULT_SIGBUS; return err; } diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 2bb788a2acb..e48e9a3af76 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -87,12 +87,12 @@ static int read_block_bitmap(struct super_block *sb, { struct buffer_head *bh = NULL; int retval = 0; - kernel_lb_addr loc; + struct kernel_lb_addr loc; loc.logicalBlockNum = bitmap->s_extPosition; loc.partitionReferenceNum = UDF_SB(sb)->s_partition; - bh = udf_tread(sb, udf_get_lb_pblock(sb, loc, block)); + bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block)); if (!bh) retval = -EIO; @@ -140,27 +140,29 @@ static inline int load_block_bitmap(struct super_block *sb, return slot; } -static bool udf_add_free_space(struct udf_sb_info *sbi, - u16 partition, u32 cnt) +static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt) { + struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDesc *lvid; - if (sbi->s_lvid_bh == NULL) - return false; + if (!sbi->s_lvid_bh) + return; lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; le32_add_cpu(&lvid->freeSpaceTable[partition], cnt); - return true; + udf_updated_lvid(sb); } static void udf_bitmap_free_blocks(struct super_block *sb, struct inode *inode, struct udf_bitmap *bitmap, - kernel_lb_addr bloc, uint32_t offset, + struct kernel_lb_addr *bloc, + uint32_t offset, uint32_t count) { struct udf_sb_info *sbi = UDF_SB(sb); struct buffer_head *bh = NULL; + struct udf_part_map *partmap; unsigned long block; unsigned long block_group; unsigned long bit; @@ -169,17 +171,17 @@ static void udf_bitmap_free_blocks(struct super_block *sb, unsigned long overflow; mutex_lock(&sbi->s_alloc_mutex); - if (bloc.logicalBlockNum < 0 || - (bloc.logicalBlockNum + count) > - sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; + if (bloc->logicalBlockNum < 0 || + (bloc->logicalBlockNum + count) > + partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, - sbi->s_partmaps[bloc.partitionReferenceNum]. - s_partition_len); + bloc->logicalBlockNum, 0, bloc->logicalBlockNum, + count, partmap->s_partition_len); goto error_return; } - block = bloc.logicalBlockNum + offset + + block = bloc->logicalBlockNum + offset + (sizeof(struct spaceBitmapDesc) << 3); do { @@ -207,7 +209,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, } else { if (inode) vfs_dq_free_block(inode, 1); - udf_add_free_space(sbi, sbi->s_partition, 1); + udf_add_free_space(sb, sbi->s_partition, 1); } } mark_buffer_dirty(bh); @@ -218,9 +220,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb, } while (overflow); error_return: - sb->s_dirt = 1; - if (sbi->s_lvid_bh) - mark_buffer_dirty(sbi->s_lvid_bh); mutex_unlock(&sbi->s_alloc_mutex); } @@ -277,9 +276,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, } while (block_count > 0); out: - if (udf_add_free_space(sbi, partition, -alloc_count)) - mark_buffer_dirty(sbi->s_lvid_bh); - sb->s_dirt = 1; + udf_add_free_space(sb, partition, -alloc_count); mutex_unlock(&sbi->s_alloc_mutex); return alloc_count; } @@ -409,9 +406,7 @@ got_block: mark_buffer_dirty(bh); - if (udf_add_free_space(sbi, partition, -1)) - mark_buffer_dirty(sbi->s_lvid_bh); - sb->s_dirt = 1; + udf_add_free_space(sb, partition, -1); mutex_unlock(&sbi->s_alloc_mutex); *err = 0; return newblock; @@ -425,26 +420,28 @@ error_return: static void udf_table_free_blocks(struct super_block *sb, struct inode *inode, struct inode *table, - kernel_lb_addr bloc, uint32_t offset, + struct kernel_lb_addr *bloc, + uint32_t offset, uint32_t count) { struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *partmap; uint32_t start, end; uint32_t elen; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; struct extent_position oepos, epos; int8_t etype; int i; struct udf_inode_info *iinfo; mutex_lock(&sbi->s_alloc_mutex); - if (bloc.logicalBlockNum < 0 || - (bloc.logicalBlockNum + count) > - sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; + if (bloc->logicalBlockNum < 0 || + (bloc->logicalBlockNum + count) > + partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, - sbi->s_partmaps[bloc.partitionReferenceNum]. - s_partition_len); + partmap->s_partition_len); goto error_return; } @@ -453,11 +450,10 @@ static void udf_table_free_blocks(struct super_block *sb, could occure, but.. oh well */ if (inode) vfs_dq_free_block(inode, count); - if (udf_add_free_space(sbi, sbi->s_partition, count)) - mark_buffer_dirty(sbi->s_lvid_bh); + udf_add_free_space(sb, sbi->s_partition, count); - start = bloc.logicalBlockNum + offset; - end = bloc.logicalBlockNum + offset + count - 1; + start = bloc->logicalBlockNum + offset; + end = bloc->logicalBlockNum + offset + count - 1; epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry); elen = 0; @@ -483,7 +479,7 @@ static void udf_table_free_blocks(struct super_block *sb, start += count; count = 0; } - udf_write_aext(table, &oepos, eloc, elen, 1); + udf_write_aext(table, &oepos, &eloc, elen, 1); } else if (eloc.logicalBlockNum == (end + 1)) { if ((0x3FFFFFFF - elen) < (count << sb->s_blocksize_bits)) { @@ -502,7 +498,7 @@ static void udf_table_free_blocks(struct super_block *sb, end -= count; count = 0; } - udf_write_aext(table, &oepos, eloc, elen, 1); + udf_write_aext(table, &oepos, &eloc, elen, 1); } if (epos.bh != oepos.bh) { @@ -532,8 +528,8 @@ static void udf_table_free_blocks(struct super_block *sb, */ int adsize; - short_ad *sad = NULL; - long_ad *lad = NULL; + struct short_ad *sad = NULL; + struct long_ad *lad = NULL; struct allocExtDesc *aed; eloc.logicalBlockNum = start; @@ -541,9 +537,9 @@ static void udf_table_free_blocks(struct super_block *sb, (count << sb->s_blocksize_bits); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else { brelse(oepos.bh); brelse(epos.bh); @@ -563,7 +559,7 @@ static void udf_table_free_blocks(struct super_block *sb, elen -= sb->s_blocksize; epos.bh = udf_tread(sb, - udf_get_lb_pblock(sb, epos.block, 0)); + udf_get_lb_pblock(sb, &epos.block, 0)); if (!epos.bh) { brelse(oepos.bh); goto error_return; @@ -601,15 +597,15 @@ static void udf_table_free_blocks(struct super_block *sb, if (sbi->s_udfrev >= 0x0200) udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 3, 1, epos.block.logicalBlockNum, - sizeof(tag)); + sizeof(struct tag)); else udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 2, 1, epos.block.logicalBlockNum, - sizeof(tag)); + sizeof(struct tag)); switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: - sad = (short_ad *)sptr; + sad = (struct short_ad *)sptr; sad->extLength = cpu_to_le32( EXT_NEXT_EXTENT_ALLOCDECS | sb->s_blocksize); @@ -617,7 +613,7 @@ static void udf_table_free_blocks(struct super_block *sb, cpu_to_le32(epos.block.logicalBlockNum); break; case ICBTAG_FLAG_AD_LONG: - lad = (long_ad *)sptr; + lad = (struct long_ad *)sptr; lad->extLength = cpu_to_le32( EXT_NEXT_EXTENT_ALLOCDECS | sb->s_blocksize); @@ -635,7 +631,7 @@ static void udf_table_free_blocks(struct super_block *sb, /* It's possible that stealing the block emptied the extent */ if (elen) { - udf_write_aext(table, &epos, eloc, elen, 1); + udf_write_aext(table, &epos, &eloc, elen, 1); if (!epos.bh) { iinfo->i_lenAlloc += adsize; @@ -653,7 +649,6 @@ static void udf_table_free_blocks(struct super_block *sb, brelse(oepos.bh); error_return: - sb->s_dirt = 1; mutex_unlock(&sbi->s_alloc_mutex); return; } @@ -666,7 +661,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, struct udf_sb_info *sbi = UDF_SB(sb); int alloc_count = 0; uint32_t elen, adsize; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; struct extent_position epos; int8_t etype = -1; struct udf_inode_info *iinfo; @@ -677,9 +672,9 @@ static int udf_table_prealloc_blocks(struct super_block *sb, iinfo = UDF_I(table); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else return 0; @@ -707,7 +702,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, alloc_count = block_count; eloc.logicalBlockNum += alloc_count; elen -= (alloc_count << sb->s_blocksize_bits); - udf_write_aext(table, &epos, eloc, + udf_write_aext(table, &epos, &eloc, (etype << 30) | elen, 1); } else udf_delete_aext(table, epos, eloc, @@ -718,10 +713,8 @@ static int udf_table_prealloc_blocks(struct super_block *sb, brelse(epos.bh); - if (alloc_count && udf_add_free_space(sbi, partition, -alloc_count)) { - mark_buffer_dirty(sbi->s_lvid_bh); - sb->s_dirt = 1; - } + if (alloc_count) + udf_add_free_space(sb, partition, -alloc_count); mutex_unlock(&sbi->s_alloc_mutex); return alloc_count; } @@ -735,7 +728,7 @@ static int udf_table_new_block(struct super_block *sb, uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF; uint32_t newblock = 0, adsize; uint32_t elen, goal_elen = 0; - kernel_lb_addr eloc, uninitialized_var(goal_eloc); + struct kernel_lb_addr eloc, uninitialized_var(goal_eloc); struct extent_position epos, goal_epos; int8_t etype; struct udf_inode_info *iinfo = UDF_I(table); @@ -743,9 +736,9 @@ static int udf_table_new_block(struct super_block *sb, *err = -ENOSPC; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else return newblock; @@ -814,46 +807,37 @@ static int udf_table_new_block(struct super_block *sb, } if (goal_elen) - udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1); + udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); else udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); brelse(goal_epos.bh); - if (udf_add_free_space(sbi, partition, -1)) - mark_buffer_dirty(sbi->s_lvid_bh); + udf_add_free_space(sb, partition, -1); - sb->s_dirt = 1; mutex_unlock(&sbi->s_alloc_mutex); *err = 0; return newblock; } -inline void udf_free_blocks(struct super_block *sb, - struct inode *inode, - kernel_lb_addr bloc, uint32_t offset, - uint32_t count) +void udf_free_blocks(struct super_block *sb, struct inode *inode, + struct kernel_lb_addr *bloc, uint32_t offset, + uint32_t count) { - uint16_t partition = bloc.partitionReferenceNum; + uint16_t partition = bloc->partitionReferenceNum; struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { - return udf_bitmap_free_blocks(sb, inode, - map->s_uspace.s_bitmap, - bloc, offset, count); + udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap, + bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { - return udf_table_free_blocks(sb, inode, - map->s_uspace.s_table, - bloc, offset, count); + udf_table_free_blocks(sb, inode, map->s_uspace.s_table, + bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { - return udf_bitmap_free_blocks(sb, inode, - map->s_fspace.s_bitmap, - bloc, offset, count); + udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap, + bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { - return udf_table_free_blocks(sb, inode, - map->s_fspace.s_table, - bloc, offset, count); - } else { - return; + udf_table_free_blocks(sb, inode, map->s_fspace.s_table, + bloc, offset, count); } } diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 62dc270c69d..2efd4d5291b 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -51,7 +51,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, uint8_t lfi; loff_t size = udf_ext0_offset(dir) + dir->i_size; struct buffer_head *tmp, *bha[16]; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; int i, num, ret = 0; @@ -80,13 +80,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, ret = -ENOENT; goto out; } - block = udf_get_lb_pblock(dir->i_sb, eloc, offset); + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(short_ad); + epos.offset -= sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(long_ad); + epos.offset -= sizeof(struct long_ad); } else { offset = 0; } @@ -101,7 +101,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) i = (elen >> dir->i_sb->s_blocksize_bits) - offset; for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(dir->i_sb, eloc, offset + i); + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i); tmp = udf_tgetblk(dir->i_sb, block); if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) bha[num++] = tmp; @@ -161,9 +161,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, memcpy(fname, "..", flen); dt_type = DT_DIR; } else { - kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); + struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); - iblock = udf_get_lb_pblock(dir->i_sb, tloc, 0); + iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); dt_type = DT_UNKNOWN; } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 2820f8fcf4c..1d2c570704c 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -20,7 +20,7 @@ #if 0 static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, - uint8_t ad_size, kernel_lb_addr fe_loc, + uint8_t ad_size, struct kernel_lb_addr fe_loc, int *pos, int *offset, struct buffer_head **bh, int *error) { @@ -75,7 +75,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, struct udf_fileident_bh *fibh, struct fileIdentDesc *cfi, struct extent_position *epos, - kernel_lb_addr *eloc, uint32_t *elen, + struct kernel_lb_addr *eloc, uint32_t *elen, sector_t *offset) { struct fileIdentDesc *fi; @@ -111,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, (EXT_RECORDED_ALLOCATED >> 30)) return NULL; - block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); (*offset)++; @@ -131,7 +131,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, if (i + *offset > (*elen >> blocksize_bits)) i = (*elen >> blocksize_bits)-*offset; for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(dir->i_sb, *eloc, + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset + i); tmp = udf_tgetblk(dir->i_sb, block); if (tmp && !buffer_uptodate(tmp) && @@ -169,7 +169,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, (EXT_RECORDED_ALLOCATED >> 30)) return NULL; - block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); (*offset)++; @@ -249,9 +249,9 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) } #if 0 -static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) +static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) { - extent_ad *ext; + struct extent_ad *ext; struct fileEntry *fe; uint8_t *ptr; @@ -274,54 +274,54 @@ static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) ptr += *offset; - ext = (extent_ad *)ptr; + ext = (struct extent_ad *)ptr; - *offset = *offset + sizeof(extent_ad); + *offset = *offset + sizeof(struct extent_ad); return ext; } #endif -short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, +struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) { - short_ad *sa; + struct short_ad *sa; if ((!ptr) || (!offset)) { printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); return NULL; } - if ((*offset + sizeof(short_ad)) > maxoffset) + if ((*offset + sizeof(struct short_ad)) > maxoffset) return NULL; else { - sa = (short_ad *)ptr; + sa = (struct short_ad *)ptr; if (sa->extLength == 0) return NULL; } if (inc) - *offset += sizeof(short_ad); + *offset += sizeof(struct short_ad); return sa; } -long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) +struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) { - long_ad *la; + struct long_ad *la; if ((!ptr) || (!offset)) { printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); return NULL; } - if ((*offset + sizeof(long_ad)) > maxoffset) + if ((*offset + sizeof(struct long_ad)) > maxoffset) return NULL; else { - la = (long_ad *)ptr; + la = (struct long_ad *)ptr; if (la->extLength == 0) return NULL; } if (inc) - *offset += sizeof(long_ad); + *offset += sizeof(struct long_ad); return la; } diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index a0974df82b3..4792b771aa8 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -38,10 +38,10 @@ #define _ECMA_167_H 1 /* Character set specification (ECMA 167r3 1/7.2.1) */ -typedef struct { +struct charspec { uint8_t charSetType; uint8_t charSetInfo[63]; -} __attribute__ ((packed)) charspec; +} __attribute__ ((packed)); /* Character Set Type (ECMA 167r3 1/7.2.1.1) */ #define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */ @@ -57,7 +57,7 @@ typedef struct { typedef uint8_t dstring; /* Timestamp (ECMA 167r3 1/7.3) */ -typedef struct { +struct timestamp { __le16 typeAndTimezone; __le16 year; uint8_t month; @@ -68,7 +68,7 @@ typedef struct { uint8_t centiseconds; uint8_t hundredsOfMicroseconds; uint8_t microseconds; -} __attribute__ ((packed)) timestamp; +} __attribute__ ((packed)); /* Type and Time Zone (ECMA 167r3 1/7.3.1) */ #define TIMESTAMP_TYPE_MASK 0xF000 @@ -78,11 +78,11 @@ typedef struct { #define TIMESTAMP_TIMEZONE_MASK 0x0FFF /* Entity identifier (ECMA 167r3 1/7.4) */ -typedef struct { +struct regid { uint8_t flags; uint8_t ident[23]; uint8_t identSuffix[8]; -} __attribute__ ((packed)) regid; +} __attribute__ ((packed)); /* Flags (ECMA 167r3 1/7.4.1) */ #define ENTITYID_FLAGS_DIRTY 0x00 @@ -126,38 +126,38 @@ struct terminatingExtendedAreaDesc { /* Boot Descriptor (ECMA 167r3 2/9.4) */ struct bootDesc { - uint8_t structType; - uint8_t stdIdent[VSD_STD_ID_LEN]; - uint8_t structVersion; - uint8_t reserved1; - regid archType; - regid bootIdent; - __le32 bootExtLocation; - __le32 bootExtLength; - __le64 loadAddress; - __le64 startAddress; - timestamp descCreationDateAndTime; - __le16 flags; - uint8_t reserved2[32]; - uint8_t bootUse[1906]; + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t reserved1; + struct regid archType; + struct regid bootIdent; + __le32 bootExtLocation; + __le32 bootExtLength; + __le64 loadAddress; + __le64 startAddress; + struct timestamp descCreationDateAndTime; + __le16 flags; + uint8_t reserved2[32]; + uint8_t bootUse[1906]; } __attribute__ ((packed)); /* Flags (ECMA 167r3 2/9.4.12) */ #define BOOT_FLAGS_ERASE 0x01 /* Extent Descriptor (ECMA 167r3 3/7.1) */ -typedef struct { +struct extent_ad { __le32 extLength; __le32 extLocation; -} __attribute__ ((packed)) extent_ad; +} __attribute__ ((packed)); -typedef struct { +struct kernel_extent_ad { uint32_t extLength; uint32_t extLocation; -} kernel_extent_ad; +}; /* Descriptor Tag (ECMA 167r3 3/7.2) */ -typedef struct { +struct tag { __le16 tagIdent; __le16 descVersion; uint8_t tagChecksum; @@ -166,7 +166,7 @@ typedef struct { __le16 descCRC; __le16 descCRCLength; __le32 tagLocation; -} __attribute__ ((packed)) tag; +} __attribute__ ((packed)); /* Tag Identifier (ECMA 167r3 3/7.2.1) */ #define TAG_IDENT_PVD 0x0001 @@ -190,28 +190,28 @@ struct NSRDesc { /* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ struct primaryVolDesc { - tag descTag; - __le32 volDescSeqNum; - __le32 primaryVolDescNum; - dstring volIdent[32]; - __le16 volSeqNum; - __le16 maxVolSeqNum; - __le16 interchangeLvl; - __le16 maxInterchangeLvl; - __le32 charSetList; - __le32 maxCharSetList; - dstring volSetIdent[128]; - charspec descCharSet; - charspec explanatoryCharSet; - extent_ad volAbstract; - extent_ad volCopyright; - regid appIdent; - timestamp recordingDateAndTime; - regid impIdent; - uint8_t impUse[64]; - __le32 predecessorVolDescSeqLocation; - __le16 flags; - uint8_t reserved[22]; + struct tag descTag; + __le32 volDescSeqNum; + __le32 primaryVolDescNum; + dstring volIdent[32]; + __le16 volSeqNum; + __le16 maxVolSeqNum; + __le16 interchangeLvl; + __le16 maxInterchangeLvl; + __le32 charSetList; + __le32 maxCharSetList; + dstring volSetIdent[128]; + struct charspec descCharSet; + struct charspec explanatoryCharSet; + struct extent_ad volAbstract; + struct extent_ad volCopyright; + struct regid appIdent; + struct timestamp recordingDateAndTime; + struct regid impIdent; + uint8_t impUse[64]; + __le32 predecessorVolDescSeqLocation; + __le16 flags; + uint8_t reserved[22]; } __attribute__ ((packed)); /* Flags (ECMA 167r3 3/10.1.21) */ @@ -219,40 +219,40 @@ struct primaryVolDesc { /* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */ struct anchorVolDescPtr { - tag descTag; - extent_ad mainVolDescSeqExt; - extent_ad reserveVolDescSeqExt; - uint8_t reserved[480]; + struct tag descTag; + struct extent_ad mainVolDescSeqExt; + struct extent_ad reserveVolDescSeqExt; + uint8_t reserved[480]; } __attribute__ ((packed)); /* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */ struct volDescPtr { - tag descTag; - __le32 volDescSeqNum; - extent_ad nextVolDescSeqExt; - uint8_t reserved[484]; + struct tag descTag; + __le32 volDescSeqNum; + struct extent_ad nextVolDescSeqExt; + uint8_t reserved[484]; } __attribute__ ((packed)); /* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */ struct impUseVolDesc { - tag descTag; + struct tag descTag; __le32 volDescSeqNum; - regid impIdent; + struct regid impIdent; uint8_t impUse[460]; } __attribute__ ((packed)); /* Partition Descriptor (ECMA 167r3 3/10.5) */ struct partitionDesc { - tag descTag; + struct tag descTag; __le32 volDescSeqNum; __le16 partitionFlags; __le16 partitionNumber; - regid partitionContents; + struct regid partitionContents; uint8_t partitionContentsUse[128]; __le32 accessType; __le32 partitionStartingLocation; __le32 partitionLength; - regid impIdent; + struct regid impIdent; uint8_t impUse[128]; uint8_t reserved[156]; } __attribute__ ((packed)); @@ -278,19 +278,19 @@ struct partitionDesc { /* Logical Volume Descriptor (ECMA 167r3 3/10.6) */ struct logicalVolDesc { - tag descTag; - __le32 volDescSeqNum; - charspec descCharSet; - dstring logicalVolIdent[128]; - __le32 logicalBlockSize; - regid domainIdent; - uint8_t logicalVolContentsUse[16]; - __le32 mapTableLength; - __le32 numPartitionMaps; - regid impIdent; - uint8_t impUse[128]; - extent_ad integritySeqExt; - uint8_t partitionMaps[0]; + struct tag descTag; + __le32 volDescSeqNum; + struct charspec descCharSet; + dstring logicalVolIdent[128]; + __le32 logicalBlockSize; + struct regid domainIdent; + uint8_t logicalVolContentsUse[16]; + __le32 mapTableLength; + __le32 numPartitionMaps; + struct regid impIdent; + uint8_t impUse[128]; + struct extent_ad integritySeqExt; + uint8_t partitionMaps[0]; } __attribute__ ((packed)); /* Generic Partition Map (ECMA 167r3 3/10.7.1) */ @@ -322,30 +322,30 @@ struct genericPartitionMap2 { /* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */ struct unallocSpaceDesc { - tag descTag; - __le32 volDescSeqNum; - __le32 numAllocDescs; - extent_ad allocDescs[0]; + struct tag descTag; + __le32 volDescSeqNum; + __le32 numAllocDescs; + struct extent_ad allocDescs[0]; } __attribute__ ((packed)); /* Terminating Descriptor (ECMA 167r3 3/10.9) */ struct terminatingDesc { - tag descTag; + struct tag descTag; uint8_t reserved[496]; } __attribute__ ((packed)); /* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */ struct logicalVolIntegrityDesc { - tag descTag; - timestamp recordingDateAndTime; - __le32 integrityType; - extent_ad nextIntegrityExt; - uint8_t logicalVolContentsUse[32]; - __le32 numOfPartitions; - __le32 lengthOfImpUse; - __le32 freeSpaceTable[0]; - __le32 sizeTable[0]; - uint8_t impUse[0]; + struct tag descTag; + struct timestamp recordingDateAndTime; + __le32 integrityType; + struct extent_ad nextIntegrityExt; + uint8_t logicalVolContentsUse[32]; + __le32 numOfPartitions; + __le32 lengthOfImpUse; + __le32 freeSpaceTable[0]; + __le32 sizeTable[0]; + uint8_t impUse[0]; } __attribute__ ((packed)); /* Integrity Type (ECMA 167r3 3/10.10.3) */ @@ -353,50 +353,50 @@ struct logicalVolIntegrityDesc { #define LVID_INTEGRITY_TYPE_CLOSE 0x00000001 /* Recorded Address (ECMA 167r3 4/7.1) */ -typedef struct { +struct lb_addr { __le32 logicalBlockNum; __le16 partitionReferenceNum; -} __attribute__ ((packed)) lb_addr; +} __attribute__ ((packed)); /* ... and its in-core analog */ -typedef struct { +struct kernel_lb_addr { uint32_t logicalBlockNum; uint16_t partitionReferenceNum; -} kernel_lb_addr; +}; /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ -typedef struct { +struct short_ad { __le32 extLength; __le32 extPosition; -} __attribute__ ((packed)) short_ad; +} __attribute__ ((packed)); /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ -typedef struct { +struct long_ad { __le32 extLength; - lb_addr extLocation; + struct lb_addr extLocation; uint8_t impUse[6]; -} __attribute__ ((packed)) long_ad; +} __attribute__ ((packed)); -typedef struct { - uint32_t extLength; - kernel_lb_addr extLocation; - uint8_t impUse[6]; -} kernel_long_ad; +struct kernel_long_ad { + uint32_t extLength; + struct kernel_lb_addr extLocation; + uint8_t impUse[6]; +}; /* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */ -typedef struct { +struct ext_ad { __le32 extLength; __le32 recordedLength; __le32 informationLength; - lb_addr extLocation; -} __attribute__ ((packed)) ext_ad; + struct lb_addr extLocation; +} __attribute__ ((packed)); -typedef struct { - uint32_t extLength; - uint32_t recordedLength; - uint32_t informationLength; - kernel_lb_addr extLocation; -} kernel_ext_ad; +struct kernel_ext_ad { + uint32_t extLength; + uint32_t recordedLength; + uint32_t informationLength; + struct kernel_lb_addr extLocation; +}; /* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */ @@ -415,44 +415,44 @@ typedef struct { /* File Set Descriptor (ECMA 167r3 4/14.1) */ struct fileSetDesc { - tag descTag; - timestamp recordingDateAndTime; - __le16 interchangeLvl; - __le16 maxInterchangeLvl; - __le32 charSetList; - __le32 maxCharSetList; - __le32 fileSetNum; - __le32 fileSetDescNum; - charspec logicalVolIdentCharSet; - dstring logicalVolIdent[128]; - charspec fileSetCharSet; - dstring fileSetIdent[32]; - dstring copyrightFileIdent[32]; - dstring abstractFileIdent[32]; - long_ad rootDirectoryICB; - regid domainIdent; - long_ad nextExt; - long_ad streamDirectoryICB; - uint8_t reserved[32]; + struct tag descTag; + struct timestamp recordingDateAndTime; + __le16 interchangeLvl; + __le16 maxInterchangeLvl; + __le32 charSetList; + __le32 maxCharSetList; + __le32 fileSetNum; + __le32 fileSetDescNum; + struct charspec logicalVolIdentCharSet; + dstring logicalVolIdent[128]; + struct charspec fileSetCharSet; + dstring fileSetIdent[32]; + dstring copyrightFileIdent[32]; + dstring abstractFileIdent[32]; + struct long_ad rootDirectoryICB; + struct regid domainIdent; + struct long_ad nextExt; + struct long_ad streamDirectoryICB; + uint8_t reserved[32]; } __attribute__ ((packed)); /* Partition Header Descriptor (ECMA 167r3 4/14.3) */ struct partitionHeaderDesc { - short_ad unallocSpaceTable; - short_ad unallocSpaceBitmap; - short_ad partitionIntegrityTable; - short_ad freedSpaceTable; - short_ad freedSpaceBitmap; + struct short_ad unallocSpaceTable; + struct short_ad unallocSpaceBitmap; + struct short_ad partitionIntegrityTable; + struct short_ad freedSpaceTable; + struct short_ad freedSpaceBitmap; uint8_t reserved[88]; } __attribute__ ((packed)); /* File Identifier Descriptor (ECMA 167r3 4/14.4) */ struct fileIdentDesc { - tag descTag; + struct tag descTag; __le16 fileVersionNum; uint8_t fileCharacteristics; uint8_t lengthFileIdent; - long_ad icb; + struct long_ad icb; __le16 lengthOfImpUse; uint8_t impUse[0]; uint8_t fileIdent[0]; @@ -468,22 +468,22 @@ struct fileIdentDesc { /* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */ struct allocExtDesc { - tag descTag; + struct tag descTag; __le32 previousAllocExtLocation; __le32 lengthAllocDescs; } __attribute__ ((packed)); /* ICB Tag (ECMA 167r3 4/14.6) */ -typedef struct { +struct icbtag { __le32 priorRecordedNumDirectEntries; __le16 strategyType; __le16 strategyParameter; __le16 numEntries; uint8_t reserved; uint8_t fileType; - lb_addr parentICBLocation; + struct lb_addr parentICBLocation; __le16 flags; -} __attribute__ ((packed)) icbtag; +} __attribute__ ((packed)); /* Strategy Type (ECMA 167r3 4/14.6.2) */ #define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000 @@ -528,41 +528,41 @@ typedef struct { /* Indirect Entry (ECMA 167r3 4/14.7) */ struct indirectEntry { - tag descTag; - icbtag icbTag; - long_ad indirectICB; + struct tag descTag; + struct icbtag icbTag; + struct long_ad indirectICB; } __attribute__ ((packed)); /* Terminal Entry (ECMA 167r3 4/14.8) */ struct terminalEntry { - tag descTag; - icbtag icbTag; + struct tag descTag; + struct icbtag icbTag; } __attribute__ ((packed)); /* File Entry (ECMA 167r3 4/14.9) */ struct fileEntry { - tag descTag; - icbtag icbTag; - __le32 uid; - __le32 gid; - __le32 permissions; - __le16 fileLinkCount; - uint8_t recordFormat; - uint8_t recordDisplayAttr; - __le32 recordLength; - __le64 informationLength; - __le64 logicalBlocksRecorded; - timestamp accessTime; - timestamp modificationTime; - timestamp attrTime; - __le32 checkpoint; - long_ad extendedAttrICB; - regid impIdent; - __le64 uniqueID; - __le32 lengthExtendedAttr; - __le32 lengthAllocDescs; - uint8_t extendedAttr[0]; - uint8_t allocDescs[0]; + struct tag descTag; + struct icbtag icbTag; + __le32 uid; + __le32 gid; + __le32 permissions; + __le16 fileLinkCount; + uint8_t recordFormat; + uint8_t recordDisplayAttr; + __le32 recordLength; + __le64 informationLength; + __le64 logicalBlocksRecorded; + struct timestamp accessTime; + struct timestamp modificationTime; + struct timestamp attrTime; + __le32 checkpoint; + struct long_ad extendedAttrICB; + struct regid impIdent; + __le64 uniqueID; + __le32 lengthExtendedAttr; + __le32 lengthAllocDescs; + uint8_t extendedAttr[0]; + uint8_t allocDescs[0]; } __attribute__ ((packed)); /* Permissions (ECMA 167r3 4/14.9.5) */ @@ -604,7 +604,7 @@ struct fileEntry { /* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */ struct extendedAttrHeaderDesc { - tag descTag; + struct tag descTag; __le32 impAttrLocation; __le32 appAttrLocation; } __attribute__ ((packed)); @@ -687,7 +687,7 @@ struct impUseExtAttr { uint8_t reserved[3]; __le32 attrLength; __le32 impUseLength; - regid impIdent; + struct regid impIdent; uint8_t impUse[0]; } __attribute__ ((packed)); @@ -698,7 +698,7 @@ struct appUseExtAttr { uint8_t reserved[3]; __le32 attrLength; __le32 appUseLength; - regid appIdent; + struct regid appIdent; uint8_t appUse[0]; } __attribute__ ((packed)); @@ -712,15 +712,15 @@ struct appUseExtAttr { /* Unallocated Space Entry (ECMA 167r3 4/14.11) */ struct unallocSpaceEntry { - tag descTag; - icbtag icbTag; + struct tag descTag; + struct icbtag icbTag; __le32 lengthAllocDescs; uint8_t allocDescs[0]; } __attribute__ ((packed)); /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */ struct spaceBitmapDesc { - tag descTag; + struct tag descTag; __le32 numOfBits; __le32 numOfBytes; uint8_t bitmap[0]; @@ -728,13 +728,13 @@ struct spaceBitmapDesc { /* Partition Integrity Entry (ECMA 167r3 4/14.13) */ struct partitionIntegrityEntry { - tag descTag; - icbtag icbTag; - timestamp recordingDateAndTime; - uint8_t integrityType; - uint8_t reserved[175]; - regid impIdent; - uint8_t impUse[256]; + struct tag descTag; + struct icbtag icbTag; + struct timestamp recordingDateAndTime; + uint8_t integrityType; + uint8_t reserved[175]; + struct regid impIdent; + uint8_t impUse[256]; } __attribute__ ((packed)); /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ @@ -765,32 +765,32 @@ struct pathComponent { /* File Entry (ECMA 167r3 4/14.17) */ struct extendedFileEntry { - tag descTag; - icbtag icbTag; - __le32 uid; - __le32 gid; - __le32 permissions; - __le16 fileLinkCount; - uint8_t recordFormat; - uint8_t recordDisplayAttr; - __le32 recordLength; - __le64 informationLength; - __le64 objectSize; - __le64 logicalBlocksRecorded; - timestamp accessTime; - timestamp modificationTime; - timestamp createTime; - timestamp attrTime; - __le32 checkpoint; - __le32 reserved; - long_ad extendedAttrICB; - long_ad streamDirectoryICB; - regid impIdent; - __le64 uniqueID; - __le32 lengthExtendedAttr; - __le32 lengthAllocDescs; - uint8_t extendedAttr[0]; - uint8_t allocDescs[0]; + struct tag descTag; + struct icbtag icbTag; + __le32 uid; + __le32 gid; + __le32 permissions; + __le16 fileLinkCount; + uint8_t recordFormat; + uint8_t recordDisplayAttr; + __le32 recordLength; + __le64 informationLength; + __le64 objectSize; + __le64 logicalBlocksRecorded; + struct timestamp accessTime; + struct timestamp modificationTime; + struct timestamp createTime; + struct timestamp attrTime; + __le32 checkpoint; + __le32 reserved; + struct long_ad extendedAttrICB; + struct long_ad streamDirectoryICB; + struct regid impIdent; + __le64 uniqueID; + __le32 lengthExtendedAttr; + __le32 lengthAllocDescs; + uint8_t extendedAttr[0]; + uint8_t allocDescs[0]; } __attribute__ ((packed)); #endif /* _ECMA_167_H */ diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 47dbe5613f9..c10fa39f97e 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -49,12 +49,11 @@ void udf_free_inode(struct inode *inode) le32_add_cpu(&lvidiu->numDirs, -1); else le32_add_cpu(&lvidiu->numFiles, -1); - - mark_buffer_dirty(sbi->s_lvid_bh); + udf_updated_lvid(sb); } mutex_unlock(&sbi->s_alloc_mutex); - udf_free_blocks(sb, NULL, UDF_I(inode)->i_location, 0, 1); + udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); } struct inode *udf_new_inode(struct inode *dir, int mode, int *err) @@ -122,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) if (!(++uniqueID & 0x00000000FFFFFFFFUL)) uniqueID += 16; lvhd->uniqueID = cpu_to_le64(uniqueID); - mark_buffer_dirty(sbi->s_lvid_bh); + udf_updated_lvid(sb); } mutex_unlock(&sbi->s_alloc_mutex); inode->i_mode = mode; @@ -138,7 +137,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) iinfo->i_location.logicalBlockNum = block; iinfo->i_location.partitionReferenceNum = dinfo->i_location.partitionReferenceNum; - inode->i_ino = udf_get_lb_pblock(sb, iinfo->i_location, 0); + inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0); inode->i_blocks = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenAlloc = 0; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 30ebde490f7..e7533f78563 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -55,15 +55,15 @@ static int udf_alloc_i_data(struct inode *inode, size_t size); static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, sector_t *, int *); static int8_t udf_insert_aext(struct inode *, struct extent_position, - kernel_lb_addr, uint32_t); + struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, int, - kernel_long_ad[EXTENT_MERGE_SIZE], int *); + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); static void udf_prealloc_extents(struct inode *, int, int, - kernel_long_ad[EXTENT_MERGE_SIZE], int *); + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); static void udf_merge_extents(struct inode *, - kernel_long_ad[EXTENT_MERGE_SIZE], int *); + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); static void udf_update_extents(struct inode *, - kernel_long_ad[EXTENT_MERGE_SIZE], int, int, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int, struct extent_position *); static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); @@ -200,7 +200,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, { int newblock; struct buffer_head *dbh = NULL; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; uint8_t alloctype; struct extent_position epos; @@ -281,7 +281,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, epos.bh = NULL; epos.block = iinfo->i_location; epos.offset = udf_file_entry_alloc_offset(inode); - udf_add_aext(inode, &epos, eloc, elen, 0); + udf_add_aext(inode, &epos, &eloc, elen, 0); /* UniqueID stuff */ brelse(epos.bh); @@ -359,12 +359,12 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block, /* Extend the file by 'blocks' blocks, return the number of extents added */ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, - kernel_long_ad *last_ext, sector_t blocks) + struct kernel_long_ad *last_ext, sector_t blocks) { sector_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; - kernel_lb_addr prealloc_loc = {}; + struct kernel_lb_addr prealloc_loc = {}; int prealloc_len = 0; struct udf_inode_info *iinfo; @@ -411,11 +411,11 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, } if (fake) { - udf_add_aext(inode, last_pos, last_ext->extLocation, + udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); count++; } else - udf_write_aext(inode, last_pos, last_ext->extLocation, + udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); /* Managed to do everything necessary? */ @@ -432,7 +432,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, /* Create enough extents to cover the whole hole */ while (blocks > add) { blocks -= add; - if (udf_add_aext(inode, last_pos, last_ext->extLocation, + if (udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1) == -1) return -1; count++; @@ -440,7 +440,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, if (blocks) { last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | (blocks << sb->s_blocksize_bits); - if (udf_add_aext(inode, last_pos, last_ext->extLocation, + if (udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1) == -1) return -1; count++; @@ -449,7 +449,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, out: /* Do we have some preallocated blocks saved? */ if (prealloc_len) { - if (udf_add_aext(inode, last_pos, prealloc_loc, + if (udf_add_aext(inode, last_pos, &prealloc_loc, prealloc_len, 1) == -1) return -1; last_ext->extLocation = prealloc_loc; @@ -459,9 +459,9 @@ out: /* last_pos should point to the last written extent... */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - last_pos->offset -= sizeof(short_ad); + last_pos->offset -= sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - last_pos->offset -= sizeof(long_ad); + last_pos->offset -= sizeof(struct long_ad); else return -1; @@ -473,11 +473,11 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, { static sector_t last_block; struct buffer_head *result = NULL; - kernel_long_ad laarr[EXTENT_MERGE_SIZE]; + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; int count = 0, startnum = 0, endnum = 0; uint32_t elen = 0, tmpelen; - kernel_lb_addr eloc, tmpeloc; + struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; uint32_t newblocknum, newblock; @@ -550,12 +550,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, elen = EXT_RECORDED_ALLOCATED | ((elen + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); - etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1); + etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } brelse(prev_epos.bh); brelse(cur_epos.bh); brelse(next_epos.bh); - newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset); + newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); *phys = newblock; return NULL; } @@ -572,7 +572,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, } else { /* Create a fake extent when there's not one */ memset(&laarr[0].extLocation, 0x00, - sizeof(kernel_lb_addr)); + sizeof(struct kernel_lb_addr)); laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; /* Will udf_extend_file() create real extent from a fake one? */ @@ -602,7 +602,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | inode->i_sb->s_blocksize; memset(&laarr[c].extLocation, 0x00, - sizeof(kernel_lb_addr)); + sizeof(struct kernel_lb_addr)); count++; endnum++; } @@ -699,7 +699,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, static void udf_split_extents(struct inode *inode, int *c, int offset, int newblocknum, - kernel_long_ad laarr[EXTENT_MERGE_SIZE], + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], int *endnum) { unsigned long blocksize = inode->i_sb->s_blocksize; @@ -726,7 +726,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset, if (offset) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, - laarr[curr].extLocation, + &laarr[curr].extLocation, 0, offset); laarr[curr].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | @@ -763,7 +763,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset, } static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, - kernel_long_ad laarr[EXTENT_MERGE_SIZE], + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], int *endnum) { int start, length = 0, currlength = 0, i; @@ -817,7 +817,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, inode->i_sb->s_blocksize_bits); else { memmove(&laarr[c + 2], &laarr[c + 1], - sizeof(long_ad) * (*endnum - (c + 1))); + sizeof(struct long_ad) * (*endnum - (c + 1))); (*endnum)++; laarr[c + 1].extLocation.logicalBlockNum = next; laarr[c + 1].extLocation.partitionReferenceNum = @@ -846,7 +846,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, if (*endnum > (i + 1)) memmove(&laarr[i], &laarr[i + 1], - sizeof(long_ad) * + sizeof(struct long_ad) * (*endnum - (i + 1))); i--; (*endnum)--; @@ -859,7 +859,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, } static void udf_merge_extents(struct inode *inode, - kernel_long_ad laarr[EXTENT_MERGE_SIZE], + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], int *endnum) { int i; @@ -867,8 +867,8 @@ static void udf_merge_extents(struct inode *inode, unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; for (i = 0; i < (*endnum - 1); i++) { - kernel_long_ad *li /*l[i]*/ = &laarr[i]; - kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; + struct kernel_long_ad *li /*l[i]*/ = &laarr[i]; + struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; if (((li->extLength >> 30) == (lip1->extLength >> 30)) && (((li->extLength >> 30) == @@ -902,7 +902,7 @@ static void udf_merge_extents(struct inode *inode, blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], - sizeof(long_ad) * + sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; @@ -911,7 +911,7 @@ static void udf_merge_extents(struct inode *inode, (EXT_NOT_RECORDED_ALLOCATED >> 30)) && ((lip1->extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) { - udf_free_blocks(inode->i_sb, inode, li->extLocation, 0, + udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); @@ -937,7 +937,7 @@ static void udf_merge_extents(struct inode *inode, blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], - sizeof(long_ad) * + sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; @@ -945,7 +945,7 @@ static void udf_merge_extents(struct inode *inode, } else if ((li->extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, - li->extLocation, 0, + &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); @@ -959,12 +959,12 @@ static void udf_merge_extents(struct inode *inode, } static void udf_update_extents(struct inode *inode, - kernel_long_ad laarr[EXTENT_MERGE_SIZE], + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], int startnum, int endnum, struct extent_position *epos) { int start = 0, i; - kernel_lb_addr tmploc; + struct kernel_lb_addr tmploc; uint32_t tmplen; if (startnum > endnum) { @@ -983,7 +983,7 @@ static void udf_update_extents(struct inode *inode, for (i = start; i < endnum; i++) { udf_next_aext(inode, epos, &tmploc, &tmplen, 0); - udf_write_aext(inode, epos, laarr[i].extLocation, + udf_write_aext(inode, epos, &laarr[i].extLocation, laarr[i].extLength, 1); } } @@ -1076,7 +1076,7 @@ static void __udf_read_inode(struct inode *inode) * i_nlink = 1 * i_op = NULL; */ - bh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 0, &ident); + bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); if (!bh) { printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", inode->i_ino); @@ -1098,24 +1098,24 @@ static void __udf_read_inode(struct inode *inode) if (fe->icbTag.strategyType == cpu_to_le16(4096)) { struct buffer_head *ibh; - ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1, + ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { struct buffer_head *nbh = NULL; - kernel_lb_addr loc; + struct kernel_lb_addr loc; struct indirectEntry *ie; ie = (struct indirectEntry *)ibh->b_data; loc = lelb_to_cpu(ie->indirectICB.extLocation); if (ie->indirectICB.extLength && - (nbh = udf_read_ptagged(inode->i_sb, loc, 0, + (nbh = udf_read_ptagged(inode->i_sb, &loc, 0, &ident))) { if (ident == TAG_IDENT_FE || ident == TAG_IDENT_EFE) { memcpy(&iinfo->i_location, &loc, - sizeof(kernel_lb_addr)); + sizeof(struct kernel_lb_addr)); brelse(bh); brelse(ibh); brelse(nbh); @@ -1222,8 +1222,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) inode->i_size = le64_to_cpu(fe->informationLength); iinfo->i_lenExtents = inode->i_size; - inode->i_mode = udf_convert_permissions(fe); - inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask; + if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && + sbi->s_fmode != UDF_INVALID_MODE) + inode->i_mode = sbi->s_fmode; + else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY && + sbi->s_dmode != UDF_INVALID_MODE) + inode->i_mode = sbi->s_dmode; + else + inode->i_mode = udf_convert_permissions(fe); + inode->i_mode &= ~sbi->s_umask; if (iinfo->i_efe == 0) { inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << @@ -1396,7 +1403,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) bh = udf_tread(inode->i_sb, udf_get_lb_pblock(inode->i_sb, - iinfo->i_location, 0)); + &iinfo->i_location, 0)); if (!bh) { udf_debug("bread failure\n"); return -EIO; @@ -1416,13 +1423,13 @@ static int udf_update_inode(struct inode *inode, int do_sync) iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); crclen = sizeof(struct unallocSpaceEntry) + - iinfo->i_lenAlloc - sizeof(tag); + iinfo->i_lenAlloc - sizeof(struct tag); use->descTag.tagLocation = cpu_to_le32( iinfo->i_location. logicalBlockNum); use->descTag.descCRCLength = cpu_to_le16(crclen); use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + - sizeof(tag), + sizeof(struct tag), crclen)); use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); @@ -1459,23 +1466,23 @@ static int udf_update_inode(struct inode *inode, int do_sync) fe->informationLength = cpu_to_le64(inode->i_size); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - regid *eid; + struct regid *eid; struct deviceSpec *dsea = (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); if (!dsea) { dsea = (struct deviceSpec *) udf_add_extendedattr(inode, sizeof(struct deviceSpec) + - sizeof(regid), 12, 0x3); + sizeof(struct regid), 12, 0x3); dsea->attrType = cpu_to_le32(12); dsea->attrSubtype = 1; dsea->attrLength = cpu_to_le32( sizeof(struct deviceSpec) + - sizeof(regid)); - dsea->impUseLength = cpu_to_le32(sizeof(regid)); + sizeof(struct regid)); + dsea->impUseLength = cpu_to_le32(sizeof(struct regid)); } - eid = (regid *)dsea->impUse; - memset(eid, 0, sizeof(regid)); + eid = (struct regid *)dsea->impUse; + memset(eid, 0, sizeof(struct regid)); strcpy(eid->ident, UDF_ID_DEVELOPER); eid->identSuffix[0] = UDF_OS_CLASS_UNIX; eid->identSuffix[1] = UDF_OS_ID_LINUX; @@ -1494,7 +1501,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); - memset(&(fe->impIdent), 0, sizeof(regid)); + memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; @@ -1533,7 +1540,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); - memset(&(efe->impIdent), 0, sizeof(regid)); + memset(&(efe->impIdent), 0, sizeof(struct regid)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; @@ -1584,9 +1591,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) fe->descTag.tagLocation = cpu_to_le32( iinfo->i_location.logicalBlockNum); crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - - sizeof(tag); + sizeof(struct tag); fe->descTag.descCRCLength = cpu_to_le16(crclen); - fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag), + fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); @@ -1606,7 +1613,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) return err; } -struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino) +struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) { unsigned long block = udf_get_lb_pblock(sb, ino, 0); struct inode *inode = iget_locked(sb, block); @@ -1615,7 +1622,7 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino) return NULL; if (inode->i_state & I_NEW) { - memcpy(&UDF_I(inode)->i_location, &ino, sizeof(kernel_lb_addr)); + memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); __udf_read_inode(inode); unlock_new_inode(inode); } @@ -1623,10 +1630,10 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino) if (is_bad_inode(inode)) goto out_iput; - if (ino.logicalBlockNum >= UDF_SB(sb)-> - s_partmaps[ino.partitionReferenceNum].s_partition_len) { + if (ino->logicalBlockNum >= UDF_SB(sb)-> + s_partmaps[ino->partitionReferenceNum].s_partition_len) { udf_debug("block=%d, partition=%d out of range\n", - ino.logicalBlockNum, ino.partitionReferenceNum); + ino->logicalBlockNum, ino->partitionReferenceNum); make_bad_inode(inode); goto out_iput; } @@ -1639,11 +1646,11 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino) } int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, - kernel_lb_addr eloc, uint32_t elen, int inc) + struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; - short_ad *sad = NULL; - long_ad *lad = NULL; + struct short_ad *sad = NULL; + struct long_ad *lad = NULL; struct allocExtDesc *aed; int8_t etype; uint8_t *ptr; @@ -1657,9 +1664,9 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, ptr = epos->bh->b_data + epos->offset; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else return -1; @@ -1667,7 +1674,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, char *sptr, *dptr; struct buffer_head *nbh; int err, loffset; - kernel_lb_addr obloc = epos->block; + struct kernel_lb_addr obloc = epos->block; epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL, obloc.partitionReferenceNum, @@ -1675,7 +1682,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, if (!epos->block.logicalBlockNum) return -1; nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, - epos->block, + &epos->block, 0)); if (!nbh) return -1; @@ -1712,20 +1719,20 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, } if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200) udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1, - epos->block.logicalBlockNum, sizeof(tag)); + epos->block.logicalBlockNum, sizeof(struct tag)); else udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1, - epos->block.logicalBlockNum, sizeof(tag)); + epos->block.logicalBlockNum, sizeof(struct tag)); switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: - sad = (short_ad *)sptr; + sad = (struct short_ad *)sptr; sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | inode->i_sb->s_blocksize); sad->extPosition = cpu_to_le32(epos->block.logicalBlockNum); break; case ICBTAG_FLAG_AD_LONG: - lad = (long_ad *)sptr; + lad = (struct long_ad *)sptr; lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | inode->i_sb->s_blocksize); lad->extLocation = cpu_to_lelb(epos->block); @@ -1769,12 +1776,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, } int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, - kernel_lb_addr eloc, uint32_t elen, int inc) + struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; uint8_t *ptr; - short_ad *sad; - long_ad *lad; + struct short_ad *sad; + struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) @@ -1786,17 +1793,17 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: - sad = (short_ad *)ptr; + sad = (struct short_ad *)ptr; sad->extLength = cpu_to_le32(elen); - sad->extPosition = cpu_to_le32(eloc.logicalBlockNum); - adsize = sizeof(short_ad); + sad->extPosition = cpu_to_le32(eloc->logicalBlockNum); + adsize = sizeof(struct short_ad); break; case ICBTAG_FLAG_AD_LONG: - lad = (long_ad *)ptr; + lad = (struct long_ad *)ptr; lad->extLength = cpu_to_le32(elen); - lad->extLocation = cpu_to_lelb(eloc); + lad->extLocation = cpu_to_lelb(*eloc); memset(lad->impUse, 0x00, sizeof(lad->impUse)); - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); break; default: return -1; @@ -1823,7 +1830,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, } int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, - kernel_lb_addr *eloc, uint32_t *elen, int inc) + struct kernel_lb_addr *eloc, uint32_t *elen, int inc) { int8_t etype; @@ -1833,7 +1840,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, epos->block = *eloc; epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); - block = udf_get_lb_pblock(inode->i_sb, epos->block, 0); + block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); epos->bh = udf_tread(inode->i_sb, block); if (!epos->bh) { udf_debug("reading block %d failed!\n", block); @@ -1845,13 +1852,13 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, } int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, - kernel_lb_addr *eloc, uint32_t *elen, int inc) + struct kernel_lb_addr *eloc, uint32_t *elen, int inc) { int alen; int8_t etype; uint8_t *ptr; - short_ad *sad; - long_ad *lad; + struct short_ad *sad; + struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) { @@ -1900,9 +1907,9 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, } static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, - kernel_lb_addr neloc, uint32_t nelen) + struct kernel_lb_addr neloc, uint32_t nelen) { - kernel_lb_addr oeloc; + struct kernel_lb_addr oeloc; uint32_t oelen; int8_t etype; @@ -1910,18 +1917,18 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, get_bh(epos.bh); while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) { - udf_write_aext(inode, &epos, neloc, nelen, 1); + udf_write_aext(inode, &epos, &neloc, nelen, 1); neloc = oeloc; nelen = (etype << 30) | oelen; } - udf_add_aext(inode, &epos, neloc, nelen, 1); + udf_add_aext(inode, &epos, &neloc, nelen, 1); brelse(epos.bh); return (nelen >> 30); } int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, - kernel_lb_addr eloc, uint32_t elen) + struct kernel_lb_addr eloc, uint32_t elen) { struct extent_position oepos; int adsize; @@ -1936,9 +1943,9 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else adsize = 0; @@ -1947,7 +1954,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, return -1; while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { - udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1); + udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1); if (oepos.bh != epos.bh) { oepos.block = epos.block; brelse(oepos.bh); @@ -1956,13 +1963,13 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, oepos.offset = epos.offset - adsize; } } - memset(&eloc, 0x00, sizeof(kernel_lb_addr)); + memset(&eloc, 0x00, sizeof(struct kernel_lb_addr)); elen = 0; if (epos.bh != oepos.bh) { - udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1); - udf_write_aext(inode, &oepos, eloc, elen, 1); - udf_write_aext(inode, &oepos, eloc, elen, 1); + udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= (adsize * 2); mark_inode_dirty(inode); @@ -1979,7 +1986,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, mark_buffer_dirty_inode(oepos.bh, inode); } } else { - udf_write_aext(inode, &oepos, eloc, elen, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= adsize; mark_inode_dirty(inode); @@ -2004,7 +2011,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, } int8_t inode_bmap(struct inode *inode, sector_t block, - struct extent_position *pos, kernel_lb_addr *eloc, + struct extent_position *pos, struct kernel_lb_addr *eloc, uint32_t *elen, sector_t *offset) { unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; @@ -2036,7 +2043,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block, long udf_block_map(struct inode *inode, sector_t block) { - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; struct extent_position epos = {}; @@ -2046,7 +2053,7 @@ long udf_block_map(struct inode *inode, sector_t block) if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) - ret = udf_get_lb_pblock(inode->i_sb, eloc, offset); + ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset); else ret = 0; diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 84bf0fd4a4f..9215700c00a 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -134,10 +134,10 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, } } /* rewrite CRC + checksum of eahd */ - crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag); + crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag); eahd->descTag.descCRCLength = cpu_to_le16(crclen); eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd + - sizeof(tag), crclen)); + sizeof(struct tag), crclen)); eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag); iinfo->i_lenEAttr += size; return (struct genericFormat *)&ea[offset]; @@ -202,7 +202,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, uint32_t location, uint16_t *ident) { - tag *tag_p; + struct tag *tag_p; struct buffer_head *bh = NULL; /* Read the block */ @@ -216,7 +216,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, return NULL; } - tag_p = (tag *)(bh->b_data); + tag_p = (struct tag *)(bh->b_data); *ident = le16_to_cpu(tag_p->tagIdent); @@ -241,9 +241,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, } /* Verify the descriptor CRC */ - if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize || + if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize || le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, - bh->b_data + sizeof(tag), + bh->b_data + sizeof(struct tag), le16_to_cpu(tag_p->descCRCLength))) return bh; @@ -255,27 +255,28 @@ error_out: return NULL; } -struct buffer_head *udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc, +struct buffer_head *udf_read_ptagged(struct super_block *sb, + struct kernel_lb_addr *loc, uint32_t offset, uint16_t *ident) { return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset), - loc.logicalBlockNum + offset, ident); + loc->logicalBlockNum + offset, ident); } void udf_update_tag(char *data, int length) { - tag *tptr = (tag *)data; - length -= sizeof(tag); + struct tag *tptr = (struct tag *)data; + length -= sizeof(struct tag); tptr->descCRCLength = cpu_to_le16(length); - tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length)); + tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length)); tptr->tagChecksum = udf_tag_checksum(tptr); } void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, uint32_t loc, int length) { - tag *tptr = (tag *)data; + struct tag *tptr = (struct tag *)data; tptr->tagIdent = cpu_to_le16(ident); tptr->descVersion = cpu_to_le16(version); tptr->tagSerialNum = cpu_to_le16(snum); @@ -283,12 +284,12 @@ void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, udf_update_tag(data, length); } -u8 udf_tag_checksum(const tag *t) +u8 udf_tag_checksum(const struct tag *t) { u8 *data = (u8 *)t; u8 checksum = 0; int i; - for (i = 0; i < sizeof(tag); ++i) + for (i = 0; i < sizeof(struct tag); ++i) if (i != 4) /* position of checksum */ checksum += data[i]; return checksum; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index f84bfaa8d94..6a29fa34c47 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -47,7 +47,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh, uint8_t *impuse, uint8_t *fileident) { - uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(tag); + uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag); uint16_t crc; int offset; uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse); @@ -99,18 +99,18 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, memset(fibh->ebh->b_data, 0x00, padlen + offset); } - crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag), - sizeof(struct fileIdentDesc) - sizeof(tag)); + crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag), + sizeof(struct fileIdentDesc) - sizeof(struct tag)); if (fibh->sbh == fibh->ebh) { crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, - crclen + sizeof(tag) - + crclen + sizeof(struct tag) - sizeof(struct fileIdentDesc)); } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { crc = crc_itu_t(crc, fibh->ebh->b_data + sizeof(struct fileIdentDesc) + fibh->soffset, - crclen + sizeof(tag) - + crclen + sizeof(struct tag) - sizeof(struct fileIdentDesc)); } else { crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, @@ -154,7 +154,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, uint8_t lfi; uint16_t liu; loff_t size; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; struct extent_position epos = {}; @@ -171,12 +171,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) goto out_err; - block = udf_get_lb_pblock(dir->i_sb, eloc, offset); + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(short_ad); + epos.offset -= sizeof(struct short_ad); else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(long_ad); + epos.offset -= sizeof(struct long_ad); } else offset = 0; @@ -268,7 +268,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, #ifdef UDF_RECOVERY /* temporary shorthand for specifying files by inode number */ if (!strncmp(dentry->d_name.name, ".B=", 3)) { - kernel_lb_addr lb = { + struct kernel_lb_addr lb = { .logicalBlockNum = 0, .partitionReferenceNum = simple_strtoul(dentry->d_name.name + 3, @@ -283,11 +283,14 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, #endif /* UDF_RECOVERY */ if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) { + struct kernel_lb_addr loc; + if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); - inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation)); + loc = lelb_to_cpu(cfi.icb.extLocation); + inode = udf_iget(dir->i_sb, &loc); if (!inode) { unlock_kernel(); return ERR_PTR(-EACCES); @@ -313,7 +316,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, uint8_t lfi; uint16_t liu; int block; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen = 0; sector_t offset; struct extent_position epos = {}; @@ -351,16 +354,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { block = udf_get_lb_pblock(dir->i_sb, - dinfo->i_location, 0); + &dinfo->i_location, 0); fibh->soffset = fibh->eoffset = sb->s_blocksize; goto add; } - block = udf_get_lb_pblock(dir->i_sb, eloc, offset); + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(short_ad); + epos.offset -= sizeof(struct short_ad); else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(long_ad); + epos.offset -= sizeof(struct long_ad); } else offset = 0; @@ -409,10 +412,10 @@ add: if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) { elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(short_ad); + epos.offset -= sizeof(struct short_ad); else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(long_ad); - udf_write_aext(dir, &epos, eloc, elen, 1); + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); } f_pos += nfidlen; @@ -494,10 +497,10 @@ add: memset(cfi, 0, sizeof(struct fileIdentDesc)); if (UDF_SB(sb)->s_udfrev >= 0x0200) udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block, - sizeof(tag)); + sizeof(struct tag)); else udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block, - sizeof(tag)); + sizeof(struct tag)); cfi->fileVersionNum = cpu_to_le16(1); cfi->lengthFileIdent = namelen; cfi->lengthOfImpUse = cpu_to_le16(0); @@ -530,7 +533,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED; if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) - memset(&(cfi->icb), 0x00, sizeof(long_ad)); + memset(&(cfi->icb), 0x00, sizeof(struct long_ad)); return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); } @@ -710,7 +713,7 @@ static int empty_dir(struct inode *dir) loff_t f_pos; loff_t size = udf_ext0_offset(dir) + dir->i_size; int block; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; struct extent_position epos = {}; @@ -724,12 +727,12 @@ static int empty_dir(struct inode *dir) else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) { - block = udf_get_lb_pblock(dir->i_sb, eloc, offset); + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(short_ad); + epos.offset -= sizeof(struct short_ad); else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(long_ad); + epos.offset -= sizeof(struct long_ad); } else offset = 0; @@ -778,7 +781,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct udf_fileident_bh fibh; struct fileIdentDesc *fi, cfi; - kernel_lb_addr tloc; + struct kernel_lb_addr tloc; retval = -ENOENT; lock_kernel(); @@ -788,7 +791,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) retval = -EIO; tloc = lelb_to_cpu(cfi.icb.extLocation); - if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino) + if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_rmdir; retval = -ENOTEMPTY; if (!empty_dir(inode)) @@ -824,7 +827,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) struct udf_fileident_bh fibh; struct fileIdentDesc *fi; struct fileIdentDesc cfi; - kernel_lb_addr tloc; + struct kernel_lb_addr tloc; retval = -ENOENT; lock_kernel(); @@ -834,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) retval = -EIO; tloc = lelb_to_cpu(cfi.icb.extLocation); - if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino) + if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_unlink; if (!inode->i_nlink) { @@ -897,7 +900,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &page_symlink_inode_operations; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t bsize; block = udf_new_block(inode->i_sb, inode, @@ -913,7 +916,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, iinfo->i_location.partitionReferenceNum; bsize = inode->i_sb->s_blocksize; iinfo->i_lenExtents = bsize; - udf_add_aext(inode, &epos, eloc, bsize, 0); + udf_add_aext(inode, &epos, &eloc, bsize, 0); brelse(epos.bh); block = udf_get_pblock(inode->i_sb, block, @@ -1108,7 +1111,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct fileIdentDesc ocfi, ncfi; struct buffer_head *dir_bh = NULL; int retval = -ENOENT; - kernel_lb_addr tloc; + struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); lock_kernel(); @@ -1119,7 +1122,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, brelse(ofibh.sbh); } tloc = lelb_to_cpu(ocfi.icb.extLocation); - if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0) + if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) goto end_rename; @@ -1158,7 +1161,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, if (!dir_fi) goto end_rename; tloc = lelb_to_cpu(dir_fi->icb.extLocation); - if (udf_get_lb_pblock(old_inode->i_sb, tloc, 0) != + if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != old_dir->i_ino) goto end_rename; @@ -1187,7 +1190,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, */ ncfi.fileVersionNum = ocfi.fileVersionNum; ncfi.fileCharacteristics = ocfi.fileCharacteristics; - memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(long_ad)); + memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad)); udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); /* The old fid may have moved - find it again */ @@ -1242,6 +1245,7 @@ end_rename: static struct dentry *udf_get_parent(struct dentry *child) { + struct kernel_lb_addr tloc; struct inode *inode = NULL; struct qstr dotdot = {.name = "..", .len = 2}; struct fileIdentDesc cfi; @@ -1255,8 +1259,8 @@ static struct dentry *udf_get_parent(struct dentry *child) brelse(fibh.ebh); brelse(fibh.sbh); - inode = udf_iget(child->d_inode->i_sb, - lelb_to_cpu(cfi.icb.extLocation)); + tloc = lelb_to_cpu(cfi.icb.extLocation); + inode = udf_iget(child->d_inode->i_sb, &tloc); if (!inode) goto out_unlock; unlock_kernel(); @@ -1272,14 +1276,14 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, u16 partref, __u32 generation) { struct inode *inode; - kernel_lb_addr loc; + struct kernel_lb_addr loc; if (block == 0) return ERR_PTR(-ESTALE); loc.logicalBlockNum = block; loc.partitionReferenceNum = partref; - inode = udf_iget(sb, loc); + inode = udf_iget(sb, &loc); if (inode == NULL) return ERR_PTR(-ENOMEM); @@ -1318,7 +1322,7 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, { int len = *lenp; struct inode *inode = de->d_inode; - kernel_lb_addr location = UDF_I(inode)->i_location; + struct kernel_lb_addr location = UDF_I(inode)->i_location; struct fid *fid = (struct fid *)fh; int type = FILEID_UDF_WITHOUT_PARENT; diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h index 65ff47902bd..fbff74654df 100644 --- a/fs/udf/osta_udf.h +++ b/fs/udf/osta_udf.h @@ -85,7 +85,7 @@ struct appIdentSuffix { /* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ /* Implementation Use (UDF 2.50 2.2.6.4) */ struct logicalVolIntegrityDescImpUse { - regid impIdent; + struct regid impIdent; __le32 numFiles; __le32 numDirs; __le16 minUDFReadRev; @@ -97,12 +97,12 @@ struct logicalVolIntegrityDescImpUse { /* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ /* Implementation Use (UDF 2.50 2.2.7.2) */ struct impUseVolDescImpUse { - charspec LVICharset; + struct charspec LVICharset; dstring logicalVolIdent[128]; dstring LVInfo1[36]; dstring LVInfo2[36]; dstring LVInfo3[36]; - regid impIdent; + struct regid impIdent; uint8_t impUse[128]; } __attribute__ ((packed)); @@ -110,7 +110,7 @@ struct udfPartitionMap2 { uint8_t partitionMapType; uint8_t partitionMapLength; uint8_t reserved1[2]; - regid partIdent; + struct regid partIdent; __le16 volSeqNum; __le16 partitionNum; } __attribute__ ((packed)); @@ -120,7 +120,7 @@ struct virtualPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; uint8_t reserved1[2]; - regid partIdent; + struct regid partIdent; __le16 volSeqNum; __le16 partitionNum; uint8_t reserved2[24]; @@ -131,7 +131,7 @@ struct sparablePartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; uint8_t reserved1[2]; - regid partIdent; + struct regid partIdent; __le16 volSeqNum; __le16 partitionNum; __le16 packetLength; @@ -146,7 +146,7 @@ struct metadataPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; uint8_t reserved1[2]; - regid partIdent; + struct regid partIdent; __le16 volSeqNum; __le16 partitionNum; __le32 metadataFileLoc; @@ -161,7 +161,7 @@ struct metadataPartitionMap { /* Virtual Allocation Table (UDF 1.5 2.2.10) */ struct virtualAllocationTable15 { __le32 VirtualSector[0]; - regid vatIdent; + struct regid vatIdent; __le32 previousVATICBLoc; } __attribute__ ((packed)); @@ -192,8 +192,8 @@ struct sparingEntry { } __attribute__ ((packed)); struct sparingTable { - tag descTag; - regid sparingIdent; + struct tag descTag; + struct regid sparingIdent; __le16 reallocationTableLen; __le16 reserved; __le32 sequenceNum; @@ -206,7 +206,7 @@ struct sparingTable { #define ICBTAG_FILE_TYPE_MIRROR 0xFB #define ICBTAG_FILE_TYPE_BITMAP 0xFC -/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ +/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ struct allocDescImpUse { __le16 flags; uint8_t impUse[4]; diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 96dfd207c3d..4b540ee632d 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -273,7 +273,7 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, { struct super_block *sb = inode->i_sb; struct udf_part_map *map; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; sector_t ext_offset; struct extent_position epos = {}; diff --git a/fs/udf/super.c b/fs/udf/super.c index e25e7010627..72348cc855a 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -81,16 +81,13 @@ static char error_buf[1024]; /* These are the "meat" - everything else is stuffing */ static int udf_fill_super(struct super_block *, void *, int); static void udf_put_super(struct super_block *); -static void udf_write_super(struct super_block *); +static int udf_sync_fs(struct super_block *, int); static int udf_remount_fs(struct super_block *, int *, char *); -static int udf_check_valid(struct super_block *, int, int); -static int udf_vrs(struct super_block *sb, int silent); -static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad); -static void udf_find_anchor(struct super_block *); -static int udf_find_fileset(struct super_block *, kernel_lb_addr *, - kernel_lb_addr *); +static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad); +static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *, + struct kernel_lb_addr *); static void udf_load_fileset(struct super_block *, struct buffer_head *, - kernel_lb_addr *); + struct kernel_lb_addr *); static void udf_open_lvid(struct super_block *); static void udf_close_lvid(struct super_block *); static unsigned int udf_count_free(struct super_block *); @@ -181,7 +178,7 @@ static const struct super_operations udf_sb_ops = { .delete_inode = udf_delete_inode, .clear_inode = udf_clear_inode, .put_super = udf_put_super, - .write_super = udf_write_super, + .sync_fs = udf_sync_fs, .statfs = udf_statfs, .remount_fs = udf_remount_fs, .show_options = udf_show_options, @@ -201,6 +198,8 @@ struct udf_options { mode_t umask; gid_t gid; uid_t uid; + mode_t fmode; + mode_t dmode; struct nls_table *nls_map; }; @@ -258,7 +257,7 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) seq_puts(seq, ",nostrict"); - if (sb->s_blocksize != UDF_DEFAULT_BLOCKSIZE) + if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET)) seq_printf(seq, ",bs=%lu", sb->s_blocksize); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) seq_puts(seq, ",unhide"); @@ -282,18 +281,16 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) seq_printf(seq, ",gid=%u", sbi->s_gid); if (sbi->s_umask != 0) seq_printf(seq, ",umask=%o", sbi->s_umask); + if (sbi->s_fmode != UDF_INVALID_MODE) + seq_printf(seq, ",mode=%o", sbi->s_fmode); + if (sbi->s_dmode != UDF_INVALID_MODE) + seq_printf(seq, ",dmode=%o", sbi->s_dmode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) seq_printf(seq, ",session=%u", sbi->s_session); if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) seq_printf(seq, ",lastblock=%u", sbi->s_last_block); - /* - * s_anchor[2] could be zeroed out in case there is no anchor - * in the specified block, but then the "anchor=N" option - * originally given by the user wasn't effective, so it's OK - * if we don't show it. - */ - if (sbi->s_anchor[2] != 0) - seq_printf(seq, ",anchor=%u", sbi->s_anchor[2]); + if (sbi->s_anchor != 0) + seq_printf(seq, ",anchor=%u", sbi->s_anchor); /* * volume, partition, fileset and rootdir seem to be ignored * currently @@ -317,6 +314,8 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) * * gid= Set the default group. * umask= Set the default umask. + * mode= Set the default file permissions. + * dmode= Set the default directory permissions. * uid= Set the default user. * bs= Set the block size. * unhide Show otherwise hidden files. @@ -366,7 +365,8 @@ enum { Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, Opt_rootdir, Opt_utf8, Opt_iocharset, - Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore + Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore, + Opt_fmode, Opt_dmode }; static const match_table_t tokens = { @@ -395,6 +395,8 @@ static const match_table_t tokens = { {Opt_rootdir, "rootdir=%u"}, {Opt_utf8, "utf8"}, {Opt_iocharset, "iocharset=%s"}, + {Opt_fmode, "mode=%o"}, + {Opt_dmode, "dmode=%o"}, {Opt_err, NULL} }; @@ -405,7 +407,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt, int option; uopt->novrs = 0; - uopt->blocksize = UDF_DEFAULT_BLOCKSIZE; uopt->partition = 0xFFFF; uopt->session = 0xFFFFFFFF; uopt->lastblock = 0; @@ -428,10 +429,12 @@ static int udf_parse_options(char *options, struct udf_options *uopt, switch (token) { case Opt_novrs: uopt->novrs = 1; + break; case Opt_bs: if (match_int(&args[0], &option)) return 0; uopt->blocksize = option; + uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET); break; case Opt_unhide: uopt->flags |= (1 << UDF_FLAG_UNHIDE); @@ -531,6 +534,16 @@ static int udf_parse_options(char *options, struct udf_options *uopt, case Opt_gforget: uopt->flags |= (1 << UDF_FLAG_GID_FORGET); break; + case Opt_fmode: + if (match_octal(args, &option)) + return 0; + uopt->fmode = option & 0777; + break; + case Opt_dmode: + if (match_octal(args, &option)) + return 0; + uopt->dmode = option & 0777; + break; default: printk(KERN_ERR "udf: bad mount option \"%s\" " "or missing value\n", p); @@ -540,17 +553,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt, return 1; } -static void udf_write_super(struct super_block *sb) -{ - lock_kernel(); - - if (!(sb->s_flags & MS_RDONLY)) - udf_open_lvid(sb); - sb->s_dirt = 0; - - unlock_kernel(); -} - static int udf_remount_fs(struct super_block *sb, int *flags, char *options) { struct udf_options uopt; @@ -560,6 +562,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) uopt.uid = sbi->s_uid; uopt.gid = sbi->s_gid; uopt.umask = sbi->s_umask; + uopt.fmode = sbi->s_fmode; + uopt.dmode = sbi->s_dmode; if (!udf_parse_options(options, &uopt, true)) return -EINVAL; @@ -568,6 +572,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sbi->s_uid = uopt.uid; sbi->s_gid = uopt.gid; sbi->s_umask = uopt.umask; + sbi->s_fmode = uopt.fmode; + sbi->s_dmode = uopt.dmode; if (sbi->s_lvid_bh) { int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); @@ -585,22 +591,19 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) return 0; } -static int udf_vrs(struct super_block *sb, int silent) +/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ +/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ +static loff_t udf_check_vsd(struct super_block *sb) { struct volStructDesc *vsd = NULL; loff_t sector = 32768; int sectorsize; struct buffer_head *bh = NULL; - int iso9660 = 0; int nsr02 = 0; int nsr03 = 0; struct udf_sb_info *sbi; - /* Block size must be a multiple of 512 */ - if (sb->s_blocksize & 511) - return 0; sbi = UDF_SB(sb); - if (sb->s_blocksize < sizeof(struct volStructDesc)) sectorsize = sizeof(struct volStructDesc); else @@ -627,7 +630,6 @@ static int udf_vrs(struct super_block *sb, int silent) break; } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, VSD_STD_ID_LEN)) { - iso9660 = sector; switch (vsd->structType) { case 0: udf_debug("ISO9660 Boot Record found\n"); @@ -679,139 +681,9 @@ static int udf_vrs(struct super_block *sb, int silent) return 0; } -/* - * Check whether there is an anchor block in the given block - */ -static int udf_check_anchor_block(struct super_block *sb, sector_t block) -{ - struct buffer_head *bh; - uint16_t ident; - - if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && - udf_fixed_to_variable(block) >= - sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) - return 0; - - bh = udf_read_tagged(sb, block, block, &ident); - if (!bh) - return 0; - brelse(bh); - - return ident == TAG_IDENT_AVDP; -} - -/* Search for an anchor volume descriptor pointer */ -static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock) -{ - sector_t last[6]; - int i; - struct udf_sb_info *sbi = UDF_SB(sb); - - last[0] = lastblock; - last[1] = last[0] - 1; - last[2] = last[0] + 1; - last[3] = last[0] - 2; - last[4] = last[0] - 150; - last[5] = last[0] - 152; - - /* according to spec, anchor is in either: - * block 256 - * lastblock-256 - * lastblock - * however, if the disc isn't closed, it could be 512 */ - - for (i = 0; i < ARRAY_SIZE(last); i++) { - if (last[i] < 0) - continue; - if (last[i] >= sb->s_bdev->bd_inode->i_size >> - sb->s_blocksize_bits) - continue; - - if (udf_check_anchor_block(sb, last[i])) { - sbi->s_anchor[0] = last[i]; - sbi->s_anchor[1] = last[i] - 256; - return last[i]; - } - - if (last[i] < 256) - continue; - - if (udf_check_anchor_block(sb, last[i] - 256)) { - sbi->s_anchor[1] = last[i] - 256; - return last[i]; - } - } - - if (udf_check_anchor_block(sb, sbi->s_session + 256)) { - sbi->s_anchor[0] = sbi->s_session + 256; - return last[0]; - } - if (udf_check_anchor_block(sb, sbi->s_session + 512)) { - sbi->s_anchor[0] = sbi->s_session + 512; - return last[0]; - } - return 0; -} - -/* - * Find an anchor volume descriptor. The function expects sbi->s_lastblock to - * be the last block on the media. - * - * Return 1 if not found, 0 if ok - * - */ -static void udf_find_anchor(struct super_block *sb) -{ - sector_t lastblock; - struct buffer_head *bh = NULL; - uint16_t ident; - int i; - struct udf_sb_info *sbi = UDF_SB(sb); - - lastblock = udf_scan_anchors(sb, sbi->s_last_block); - if (lastblock) - goto check_anchor; - - /* No anchor found? Try VARCONV conversion of block numbers */ - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); - /* Firstly, we try to not convert number of the last block */ - lastblock = udf_scan_anchors(sb, - udf_variable_to_fixed(sbi->s_last_block)); - if (lastblock) - goto check_anchor; - - /* Secondly, we try with converted number of the last block */ - lastblock = udf_scan_anchors(sb, sbi->s_last_block); - if (!lastblock) { - /* VARCONV didn't help. Clear it. */ - UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); - } - -check_anchor: - /* - * Check located anchors and the anchor block supplied via - * mount options - */ - for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { - if (!sbi->s_anchor[i]) - continue; - bh = udf_read_tagged(sb, sbi->s_anchor[i], - sbi->s_anchor[i], &ident); - if (!bh) - sbi->s_anchor[i] = 0; - else { - brelse(bh); - if (ident != TAG_IDENT_AVDP) - sbi->s_anchor[i] = 0; - } - } - - sbi->s_last_block = lastblock; -} - static int udf_find_fileset(struct super_block *sb, - kernel_lb_addr *fileset, - kernel_lb_addr *root) + struct kernel_lb_addr *fileset, + struct kernel_lb_addr *root) { struct buffer_head *bh = NULL; long lastblock; @@ -820,7 +692,7 @@ static int udf_find_fileset(struct super_block *sb, if (fileset->logicalBlockNum != 0xFFFFFFFF || fileset->partitionReferenceNum != 0xFFFF) { - bh = udf_read_ptagged(sb, *fileset, 0, &ident); + bh = udf_read_ptagged(sb, fileset, 0, &ident); if (!bh) { return 1; @@ -834,7 +706,7 @@ static int udf_find_fileset(struct super_block *sb, sbi = UDF_SB(sb); if (!bh) { /* Search backwards through the partitions */ - kernel_lb_addr newfileset; + struct kernel_lb_addr newfileset; /* --> cvg: FIXME - is it reasonable? */ return 1; @@ -850,7 +722,7 @@ static int udf_find_fileset(struct super_block *sb, newfileset.logicalBlockNum = 0; do { - bh = udf_read_ptagged(sb, newfileset, 0, + bh = udf_read_ptagged(sb, &newfileset, 0, &ident); if (!bh) { newfileset.logicalBlockNum++; @@ -902,14 +774,23 @@ static int udf_find_fileset(struct super_block *sb, static int udf_load_pvoldesc(struct super_block *sb, sector_t block) { struct primaryVolDesc *pvoldesc; - struct ustr instr; - struct ustr outstr; + struct ustr *instr, *outstr; struct buffer_head *bh; uint16_t ident; + int ret = 1; + + instr = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!instr) + return 1; + + outstr = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!outstr) + goto out1; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) - return 1; + goto out2; + BUG_ON(ident != TAG_IDENT_PVD); pvoldesc = (struct primaryVolDesc *)bh->b_data; @@ -917,7 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, pvoldesc->recordingDateAndTime)) { #ifdef UDFFS_DEBUG - timestamp *ts = &pvoldesc->recordingDateAndTime; + struct timestamp *ts = &pvoldesc->recordingDateAndTime; udf_debug("recording time %04u/%02u/%02u" " %02u:%02u (%x)\n", le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, @@ -925,20 +806,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) #endif } - if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32)) - if (udf_CS0toUTF8(&outstr, &instr)) { - strncpy(UDF_SB(sb)->s_volume_ident, outstr.u_name, - outstr.u_len > 31 ? 31 : outstr.u_len); + if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) + if (udf_CS0toUTF8(outstr, instr)) { + strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, + outstr->u_len > 31 ? 31 : outstr->u_len); udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident); } - if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128)) - if (udf_CS0toUTF8(&outstr, &instr)) - udf_debug("volSetIdent[] = '%s'\n", outstr.u_name); + if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) + if (udf_CS0toUTF8(outstr, instr)) + udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); brelse(bh); - return 0; + ret = 0; +out2: + kfree(outstr); +out1: + kfree(instr); + return ret; } static int udf_load_metadata_files(struct super_block *sb, int partition) @@ -946,7 +832,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; struct udf_meta_data *mdata; - kernel_lb_addr addr; + struct kernel_lb_addr addr; int fe_error = 0; map = &sbi->s_partmaps[partition]; @@ -959,7 +845,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Metadata file location: block = %d part = %d\n", addr.logicalBlockNum, addr.partitionReferenceNum); - mdata->s_metadata_fe = udf_iget(sb, addr); + mdata->s_metadata_fe = udf_iget(sb, &addr); if (mdata->s_metadata_fe == NULL) { udf_warning(sb, __func__, "metadata inode efe not found, " @@ -981,7 +867,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Mirror metadata file location: block = %d part = %d\n", addr.logicalBlockNum, addr.partitionReferenceNum); - mdata->s_mirror_fe = udf_iget(sb, addr); + mdata->s_mirror_fe = udf_iget(sb, &addr); if (mdata->s_mirror_fe == NULL) { if (fe_error) { @@ -1013,7 +899,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Bitmap file location: block = %d part = %d\n", addr.logicalBlockNum, addr.partitionReferenceNum); - mdata->s_bitmap_fe = udf_iget(sb, addr); + mdata->s_bitmap_fe = udf_iget(sb, &addr); if (mdata->s_bitmap_fe == NULL) { if (sb->s_flags & MS_RDONLY) @@ -1037,7 +923,7 @@ error_exit: } static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, - kernel_lb_addr *root) + struct kernel_lb_addr *root) { struct fileSetDesc *fset; @@ -1119,13 +1005,13 @@ static int udf_fill_partdesc_info(struct super_block *sb, phd = (struct partitionHeaderDesc *)p->partitionContentsUse; if (phd->unallocSpaceTable.extLength) { - kernel_lb_addr loc = { + struct kernel_lb_addr loc = { .logicalBlockNum = le32_to_cpu( phd->unallocSpaceTable.extPosition), .partitionReferenceNum = p_index, }; - map->s_uspace.s_table = udf_iget(sb, loc); + map->s_uspace.s_table = udf_iget(sb, &loc); if (!map->s_uspace.s_table) { udf_debug("cannot load unallocSpaceTable (part %d)\n", p_index); @@ -1154,13 +1040,13 @@ static int udf_fill_partdesc_info(struct super_block *sb, udf_debug("partitionIntegrityTable (part %d)\n", p_index); if (phd->freedSpaceTable.extLength) { - kernel_lb_addr loc = { + struct kernel_lb_addr loc = { .logicalBlockNum = le32_to_cpu( phd->freedSpaceTable.extPosition), .partitionReferenceNum = p_index, }; - map->s_fspace.s_table = udf_iget(sb, loc); + map->s_fspace.s_table = udf_iget(sb, &loc); if (!map->s_fspace.s_table) { udf_debug("cannot load freedSpaceTable (part %d)\n", p_index); @@ -1192,7 +1078,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map = &sbi->s_partmaps[p_index]; - kernel_lb_addr ino; + struct kernel_lb_addr ino; struct buffer_head *bh = NULL; struct udf_inode_info *vati; uint32_t pos; @@ -1201,7 +1087,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) /* VAT file entry is in the last recorded block */ ino.partitionReferenceNum = type1_index; ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; - sbi->s_vat_inode = udf_iget(sb, ino); + sbi->s_vat_inode = udf_iget(sb, &ino); if (!sbi->s_vat_inode) return 1; @@ -1322,7 +1208,7 @@ out_bh: } static int udf_load_logicalvol(struct super_block *sb, sector_t block, - kernel_lb_addr *fileset) + struct kernel_lb_addr *fileset) { struct logicalVolDesc *lvd; int i, j, offset; @@ -1471,7 +1357,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, } if (fileset) { - long_ad *la = (long_ad *)&(lvd->logicalVolContentsUse[0]); + struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); *fileset = lelb_to_cpu(la->extLocation); udf_debug("FileSet found in LogicalVolDesc at block=%d, " @@ -1490,7 +1376,7 @@ out_bh: * udf_load_logicalvolint * */ -static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc) +static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc) { struct buffer_head *bh = NULL; uint16_t ident; @@ -1533,7 +1419,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc) * Written, tested, and released. */ static noinline int udf_process_sequence(struct super_block *sb, long block, - long lastblock, kernel_lb_addr *fileset) + long lastblock, struct kernel_lb_addr *fileset) { struct buffer_head *bh = NULL; struct udf_vds_record vds[VDS_POS_LENGTH]; @@ -1655,85 +1541,199 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, return 0; } +static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, + struct kernel_lb_addr *fileset) +{ + struct anchorVolDescPtr *anchor; + long main_s, main_e, reserve_s, reserve_e; + struct udf_sb_info *sbi; + + sbi = UDF_SB(sb); + anchor = (struct anchorVolDescPtr *)bh->b_data; + + /* Locate the main sequence */ + main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); + main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); + main_e = main_e >> sb->s_blocksize_bits; + main_e += main_s; + + /* Locate the reserve sequence */ + reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation); + reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength); + reserve_e = reserve_e >> sb->s_blocksize_bits; + reserve_e += reserve_s; + + /* Process the main & reserve sequences */ + /* responsible for finding the PartitionDesc(s) */ + if (!udf_process_sequence(sb, main_s, main_e, fileset)) + return 1; + return !udf_process_sequence(sb, reserve_s, reserve_e, fileset); +} + /* - * udf_check_valid() + * Check whether there is an anchor block in the given block and + * load Volume Descriptor Sequence if so. */ -static int udf_check_valid(struct super_block *sb, int novrs, int silent) +static int udf_check_anchor_block(struct super_block *sb, sector_t block, + struct kernel_lb_addr *fileset) { - long block; - struct udf_sb_info *sbi = UDF_SB(sb); + struct buffer_head *bh; + uint16_t ident; + int ret; - if (novrs) { - udf_debug("Validity check skipped because of novrs option\n"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && + udf_fixed_to_variable(block) >= + sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) + return 0; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return 0; + if (ident != TAG_IDENT_AVDP) { + brelse(bh); return 0; } - /* Check that it is NSR02 compliant */ - /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ - block = udf_vrs(sb, silent); - if (block == -1) - udf_debug("Failed to read byte 32768. Assuming open " - "disc. Skipping validity check\n"); - if (block && !sbi->s_last_block) - sbi->s_last_block = udf_get_last_block(sb); - return !block; + ret = udf_load_sequence(sb, bh, fileset); + brelse(bh); + return ret; } -static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset) +/* Search for an anchor volume descriptor pointer */ +static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock, + struct kernel_lb_addr *fileset) { - struct anchorVolDescPtr *anchor; - uint16_t ident; - struct buffer_head *bh; - long main_s, main_e, reserve_s, reserve_e; + sector_t last[6]; int i; - struct udf_sb_info *sbi; - - if (!sb) - return 1; - sbi = UDF_SB(sb); + struct udf_sb_info *sbi = UDF_SB(sb); + int last_count = 0; - for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { - if (!sbi->s_anchor[i]) + /* First try user provided anchor */ + if (sbi->s_anchor) { + if (udf_check_anchor_block(sb, sbi->s_anchor, fileset)) + return lastblock; + } + /* + * according to spec, anchor is in either: + * block 256 + * lastblock-256 + * lastblock + * however, if the disc isn't closed, it could be 512. + */ + if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset)) + return lastblock; + /* + * The trouble is which block is the last one. Drives often misreport + * this so we try various possibilities. + */ + last[last_count++] = lastblock; + if (lastblock >= 1) + last[last_count++] = lastblock - 1; + last[last_count++] = lastblock + 1; + if (lastblock >= 2) + last[last_count++] = lastblock - 2; + if (lastblock >= 150) + last[last_count++] = lastblock - 150; + if (lastblock >= 152) + last[last_count++] = lastblock - 152; + + for (i = 0; i < last_count; i++) { + if (last[i] >= sb->s_bdev->bd_inode->i_size >> + sb->s_blocksize_bits) continue; - - bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i], - &ident); - if (!bh) + if (udf_check_anchor_block(sb, last[i], fileset)) + return last[i]; + if (last[i] < 256) continue; + if (udf_check_anchor_block(sb, last[i] - 256, fileset)) + return last[i]; + } - anchor = (struct anchorVolDescPtr *)bh->b_data; + /* Finally try block 512 in case media is open */ + if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset)) + return last[0]; + return 0; +} - /* Locate the main sequence */ - main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); - main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); - main_e = main_e >> sb->s_blocksize_bits; - main_e += main_s; +/* + * Find an anchor volume descriptor and load Volume Descriptor Sequence from + * area specified by it. The function expects sbi->s_lastblock to be the last + * block on the media. + * + * Return 1 if ok, 0 if not found. + * + */ +static int udf_find_anchor(struct super_block *sb, + struct kernel_lb_addr *fileset) +{ + sector_t lastblock; + struct udf_sb_info *sbi = UDF_SB(sb); - /* Locate the reserve sequence */ - reserve_s = le32_to_cpu( - anchor->reserveVolDescSeqExt.extLocation); - reserve_e = le32_to_cpu( - anchor->reserveVolDescSeqExt.extLength); - reserve_e = reserve_e >> sb->s_blocksize_bits; - reserve_e += reserve_s; + lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); + if (lastblock) + goto out; - brelse(bh); + /* No anchor found? Try VARCONV conversion of block numbers */ + UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + /* Firstly, we try to not convert number of the last block */ + lastblock = udf_scan_anchors(sb, + udf_variable_to_fixed(sbi->s_last_block), + fileset); + if (lastblock) + goto out; - /* Process the main & reserve sequences */ - /* responsible for finding the PartitionDesc(s) */ - if (!(udf_process_sequence(sb, main_s, main_e, - fileset) && - udf_process_sequence(sb, reserve_s, reserve_e, - fileset))) - break; + /* Secondly, we try with converted number of the last block */ + lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); + if (!lastblock) { + /* VARCONV didn't help. Clear it. */ + UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); + return 0; } +out: + sbi->s_last_block = lastblock; + return 1; +} - if (i == ARRAY_SIZE(sbi->s_anchor)) { - udf_debug("No Anchor block found\n"); - return 1; +/* + * Check Volume Structure Descriptor, find Anchor block and load Volume + * Descriptor Sequence + */ +static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, + int silent, struct kernel_lb_addr *fileset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + loff_t nsr_off; + + if (!sb_set_blocksize(sb, uopt->blocksize)) { + if (!silent) + printk(KERN_WARNING "UDF-fs: Bad block size\n"); + return 0; + } + sbi->s_last_block = uopt->lastblock; + if (!uopt->novrs) { + /* Check that it is NSR02 compliant */ + nsr_off = udf_check_vsd(sb); + if (!nsr_off) { + if (!silent) + printk(KERN_WARNING "UDF-fs: No VRS found\n"); + return 0; + } + if (nsr_off == -1) + udf_debug("Failed to read byte 32768. Assuming open " + "disc. Skipping validity check\n"); + if (!sbi->s_last_block) + sbi->s_last_block = udf_get_last_block(sb); + } else { + udf_debug("Validity check skipped because of novrs option\n"); } - udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]); - return 0; + /* Look for anchor block and load Volume Descriptor Sequence */ + sbi->s_anchor = uopt->anchor; + if (!udf_find_anchor(sb, fileset)) { + if (!silent) + printk(KERN_WARNING "UDF-fs: No anchor found\n"); + return 0; + } + return 1; } static void udf_open_lvid(struct super_block *sb) @@ -1742,9 +1742,9 @@ static void udf_open_lvid(struct super_block *sb) struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; struct logicalVolIntegrityDescImpUse *lvidiu; + if (!bh) return; - lvid = (struct logicalVolIntegrityDesc *)bh->b_data; lvidiu = udf_sb_lvidiu(sbi); @@ -1752,14 +1752,15 @@ static void udf_open_lvid(struct super_block *sb) lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME); - lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN; + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); lvid->descTag.descCRC = cpu_to_le16( - crc_itu_t(0, (char *)lvid + sizeof(tag), + crc_itu_t(0, (char *)lvid + sizeof(struct tag), le16_to_cpu(lvid->descTag.descCRCLength))); lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); mark_buffer_dirty(bh); + sbi->s_lvid_dirty = 0; } static void udf_close_lvid(struct super_block *sb) @@ -1773,10 +1774,6 @@ static void udf_close_lvid(struct super_block *sb) return; lvid = (struct logicalVolIntegrityDesc *)bh->b_data; - - if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN) - return; - lvidiu = udf_sb_lvidiu(sbi); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; @@ -1790,11 +1787,12 @@ static void udf_close_lvid(struct super_block *sb) lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); lvid->descTag.descCRC = cpu_to_le16( - crc_itu_t(0, (char *)lvid + sizeof(tag), + crc_itu_t(0, (char *)lvid + sizeof(struct tag), le16_to_cpu(lvid->descTag.descCRCLength))); lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); mark_buffer_dirty(bh); + sbi->s_lvid_dirty = 0; } static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) @@ -1846,15 +1844,18 @@ static void udf_free_partition(struct udf_part_map *map) static int udf_fill_super(struct super_block *sb, void *options, int silent) { int i; + int ret; struct inode *inode = NULL; struct udf_options uopt; - kernel_lb_addr rootdir, fileset; + struct kernel_lb_addr rootdir, fileset; struct udf_sb_info *sbi; uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); uopt.uid = -1; uopt.gid = -1; uopt.umask = 0; + uopt.fmode = UDF_INVALID_MODE; + uopt.dmode = UDF_INVALID_MODE; sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); if (!sbi) @@ -1892,15 +1893,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sbi->s_uid = uopt.uid; sbi->s_gid = uopt.gid; sbi->s_umask = uopt.umask; + sbi->s_fmode = uopt.fmode; + sbi->s_dmode = uopt.dmode; sbi->s_nls_map = uopt.nls_map; - /* Set the block size for all transfers */ - if (!sb_min_blocksize(sb, uopt.blocksize)) { - udf_debug("Bad block size (%d)\n", uopt.blocksize); - printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize); - goto error_out; - } - if (uopt.session == 0xFFFFFFFF) sbi->s_session = udf_get_last_session(sb); else @@ -1908,18 +1904,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) udf_debug("Multi-session=%d\n", sbi->s_session); - sbi->s_last_block = uopt.lastblock; - sbi->s_anchor[0] = sbi->s_anchor[1] = 0; - sbi->s_anchor[2] = uopt.anchor; - - if (udf_check_valid(sb, uopt.novrs, silent)) { - /* read volume recognition sequences */ - printk(KERN_WARNING "UDF-fs: No VRS found\n"); - goto error_out; - } - - udf_find_anchor(sb); - /* Fill in the rest of the superblock */ sb->s_op = &udf_sb_ops; sb->s_export_op = &udf_export_ops; @@ -1928,7 +1912,21 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sb->s_magic = UDF_SUPER_MAGIC; sb->s_time_gran = 1000; - if (udf_load_sequence(sb, &fileset)) { + if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + } else { + uopt.blocksize = bdev_hardsect_size(sb->s_bdev); + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { + if (!silent) + printk(KERN_NOTICE + "UDF-fs: Rescanning with blocksize " + "%d\n", UDF_DEFAULT_BLOCKSIZE); + uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + } + } + if (!ret) { printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); goto error_out; } @@ -1978,7 +1976,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) } if (!silent) { - timestamp ts; + struct timestamp ts; udf_time_to_disk_stamp(&ts, sbi->s_record_time); udf_info("UDF: Mounting volume '%s', " "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", @@ -1991,7 +1989,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* Assign the root inode */ /* assign inodes by physical block number */ /* perhaps it's not extensible enough, but for now ... */ - inode = udf_iget(sb, rootdir); + inode = udf_iget(sb, &rootdir); if (!inode) { printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " "partition=%d\n", @@ -2081,11 +2079,31 @@ static void udf_put_super(struct super_block *sb) sb->s_fs_info = NULL; } +static int udf_sync_fs(struct super_block *sb, int wait) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + + mutex_lock(&sbi->s_alloc_mutex); + if (sbi->s_lvid_dirty) { + /* + * Blockdevice will be synced later so we don't have to submit + * the buffer for IO + */ + mark_buffer_dirty(sbi->s_lvid_bh); + sb->s_dirt = 0; + sbi->s_lvid_dirty = 0; + } + mutex_unlock(&sbi->s_alloc_mutex); + + return 0; +} + static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDescImpUse *lvidiu; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); if (sbi->s_lvid_bh != NULL) lvidiu = udf_sb_lvidiu(sbi); @@ -2101,8 +2119,9 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) le32_to_cpu(lvidiu->numDirs)) : 0) + buf->f_bfree; buf->f_ffree = buf->f_bfree; - /* __kernel_fsid_t f_fsid */ buf->f_namelen = UDF_NAME_LEN - 2; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); return 0; } @@ -2114,7 +2133,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, unsigned int accum = 0; int index; int block = 0, newblock; - kernel_lb_addr loc; + struct kernel_lb_addr loc; uint32_t bytes; uint8_t *ptr; uint16_t ident; @@ -2124,7 +2143,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, loc.logicalBlockNum = bitmap->s_extPosition; loc.partitionReferenceNum = UDF_SB(sb)->s_partition; - bh = udf_read_ptagged(sb, loc, 0, &ident); + bh = udf_read_ptagged(sb, &loc, 0, &ident); if (!bh) { printk(KERN_ERR "udf: udf_count_free failed\n"); @@ -2147,7 +2166,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, bytes -= cur_bytes; if (bytes) { brelse(bh); - newblock = udf_get_lb_pblock(sb, loc, ++block); + newblock = udf_get_lb_pblock(sb, &loc, ++block); bh = udf_tread(sb, newblock); if (!bh) { udf_debug("read failed\n"); @@ -2170,7 +2189,7 @@ static unsigned int udf_count_free_table(struct super_block *sb, { unsigned int accum = 0; uint32_t elen; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; int8_t etype; struct extent_position epos; diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 65e19b4f942..225527cdc88 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -28,10 +28,10 @@ #include "udf_sb.h" static void extent_trunc(struct inode *inode, struct extent_position *epos, - kernel_lb_addr eloc, int8_t etype, uint32_t elen, + struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen, uint32_t nelen) { - kernel_lb_addr neloc = {}; + struct kernel_lb_addr neloc = {}; int last_block = (elen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> @@ -43,12 +43,12 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos, last_block); etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); } else - neloc = eloc; + neloc = *eloc; nelen = (etype << 30) | nelen; } if (elen != nelen) { - udf_write_aext(inode, epos, neloc, nelen, 0); + udf_write_aext(inode, epos, &neloc, nelen, 0); if (last_block - first_block > 0) { if (etype == (EXT_RECORDED_ALLOCATED >> 30)) mark_inode_dirty(inode); @@ -68,7 +68,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos, void udf_truncate_tail_extent(struct inode *inode) { struct extent_position epos = {}; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen, nelen; uint64_t lbcount = 0; int8_t etype = -1, netype; @@ -83,9 +83,9 @@ void udf_truncate_tail_extent(struct inode *inode) return; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else BUG(); @@ -106,7 +106,7 @@ void udf_truncate_tail_extent(struct inode *inode) (unsigned)elen); nelen = elen - (lbcount - inode->i_size); epos.offset -= adsize; - extent_trunc(inode, &epos, eloc, etype, elen, nelen); + extent_trunc(inode, &epos, &eloc, etype, elen, nelen); epos.offset += adsize; if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) printk(KERN_ERR "udf_truncate_tail_extent(): " @@ -124,7 +124,7 @@ void udf_truncate_tail_extent(struct inode *inode) void udf_discard_prealloc(struct inode *inode) { struct extent_position epos = { NULL, 0, {0, 0} }; - kernel_lb_addr eloc; + struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; int8_t etype = -1, netype; @@ -136,9 +136,9 @@ void udf_discard_prealloc(struct inode *inode) return; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else adsize = 0; @@ -152,7 +152,7 @@ void udf_discard_prealloc(struct inode *inode) if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { epos.offset -= adsize; lbcount -= elen; - extent_trunc(inode, &epos, eloc, etype, elen, 0); + extent_trunc(inode, &epos, &eloc, etype, elen, 0); if (!epos.bh) { iinfo->i_lenAlloc = epos.offset - @@ -200,7 +200,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode, void udf_truncate_extents(struct inode *inode) { struct extent_position epos; - kernel_lb_addr eloc, neloc = {}; + struct kernel_lb_addr eloc, neloc = {}; uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; int8_t etype; struct super_block *sb = inode->i_sb; @@ -210,9 +210,9 @@ void udf_truncate_extents(struct inode *inode) struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(short_ad); + adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(long_ad); + adsize = sizeof(struct long_ad); else BUG(); @@ -221,7 +221,7 @@ void udf_truncate_extents(struct inode *inode) (inode->i_size & (sb->s_blocksize - 1)); if (etype != -1) { epos.offset -= adsize; - extent_trunc(inode, &epos, eloc, etype, elen, byte_offset); + extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); epos.offset += adsize; if (byte_offset) lenalloc = epos.offset; @@ -236,12 +236,12 @@ void udf_truncate_extents(struct inode *inode) while ((etype = udf_current_aext(inode, &epos, &eloc, &elen, 0)) != -1) { if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { - udf_write_aext(inode, &epos, neloc, nelen, 0); + udf_write_aext(inode, &epos, &neloc, nelen, 0); if (indirect_ext_len) { /* We managed to free all extents in the * indirect extent - free it too */ BUG_ON(!epos.bh); - udf_free_blocks(sb, inode, epos.block, + udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; @@ -253,7 +253,7 @@ void udf_truncate_extents(struct inode *inode) epos.offset = sizeof(struct allocExtDesc); epos.block = eloc; epos.bh = udf_tread(sb, - udf_get_lb_pblock(sb, eloc, 0)); + udf_get_lb_pblock(sb, &eloc, 0)); if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> @@ -261,7 +261,7 @@ void udf_truncate_extents(struct inode *inode) else indirect_ext_len = 1; } else { - extent_trunc(inode, &epos, eloc, etype, + extent_trunc(inode, &epos, &eloc, etype, elen, 0); epos.offset += adsize; } @@ -269,7 +269,7 @@ void udf_truncate_extents(struct inode *inode) if (indirect_ext_len) { BUG_ON(!epos.bh); - udf_free_blocks(sb, inode, epos.block, 0, + udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; @@ -278,7 +278,7 @@ void udf_truncate_extents(struct inode *inode) udf_update_alloc_ext_desc(inode, &epos, lenalloc); } else if (inode->i_size) { if (byte_offset) { - kernel_long_ad extent; + struct kernel_long_ad extent; /* * OK, there is not extent covering inode->i_size and diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index 4f86b1d98a5..e58d1de4107 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -4,7 +4,7 @@ struct udf_inode_info { struct timespec i_crtime; /* Physical address of inode */ - kernel_lb_addr i_location; + struct kernel_lb_addr i_location; __u64 i_unique; __u32 i_lenEAttr; __u32 i_lenAlloc; @@ -17,8 +17,8 @@ struct udf_inode_info { unsigned i_strat4096 : 1; unsigned reserved : 26; union { - short_ad *i_sad; - long_ad *i_lad; + struct short_ad *i_sad; + struct long_ad *i_lad; __u8 *i_data; } i_ext; struct inode vfs_inode; diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 1c1c514a972..d113b72c276 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -30,6 +30,7 @@ #define UDF_FLAG_GID_SET 16 #define UDF_FLAG_SESSION_SET 17 #define UDF_FLAG_LASTBLOCK_SET 18 +#define UDF_FLAG_BLOCKSIZE_SET 19 #define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 #define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 @@ -48,6 +49,8 @@ #define UDF_SPARABLE_MAP15 0x1522U #define UDF_METADATA_MAP25 0x2511U +#define UDF_INVALID_MODE ((mode_t)-1) + #pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ struct udf_meta_data { @@ -114,7 +117,7 @@ struct udf_sb_info { /* Sector headers */ __s32 s_session; - __u32 s_anchor[3]; + __u32 s_anchor; __u32 s_last_block; struct buffer_head *s_lvid_bh; @@ -123,6 +126,8 @@ struct udf_sb_info { mode_t s_umask; gid_t s_gid; uid_t s_uid; + mode_t s_fmode; + mode_t s_dmode; /* Root Info */ struct timespec s_record_time; @@ -143,6 +148,8 @@ struct udf_sb_info { struct inode *s_vat_inode; struct mutex s_alloc_mutex; + /* Protected by s_alloc_mutex */ + unsigned int s_lvid_dirty; }; static inline struct udf_sb_info *UDF_SB(struct super_block *sb) diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 8ec865de5f1..cac51b77a5d 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -62,10 +62,8 @@ static inline size_t udf_ext0_offset(struct inode *inode) return 0; } -#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset)) - /* computes tag checksum */ -u8 udf_tag_checksum(const tag *t); +u8 udf_tag_checksum(const struct tag *t); struct dentry; struct inode; @@ -95,7 +93,7 @@ struct udf_vds_record { }; struct generic_desc { - tag descTag; + struct tag descTag; __le32 volDescSeqNum; }; @@ -108,11 +106,22 @@ struct ustr { struct extent_position { struct buffer_head *bh; uint32_t offset; - kernel_lb_addr block; + struct kernel_lb_addr block; }; /* super.c */ extern void udf_warning(struct super_block *, const char *, const char *, ...); +static inline void udf_updated_lvid(struct super_block *sb) +{ + struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; + + BUG_ON(!bh); + WARN_ON_ONCE(((struct logicalVolIntegrityDesc *) + bh->b_data)->integrityType != + cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN)); + sb->s_dirt = 1; + UDF_SB(sb)->s_lvid_dirty = 1; +} /* namei.c */ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, @@ -124,7 +133,7 @@ extern int udf_ioctl(struct inode *, struct file *, unsigned int, unsigned long); /* inode.c */ -extern struct inode *udf_iget(struct super_block *, kernel_lb_addr); +extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); extern int udf_sync_inode(struct inode *); extern void udf_expand_file_adinicb(struct inode *, int, int *); extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); @@ -136,19 +145,19 @@ extern void udf_clear_inode(struct inode *); extern int udf_write_inode(struct inode *, int); extern long udf_block_map(struct inode *, sector_t); extern int udf_extend_file(struct inode *, struct extent_position *, - kernel_long_ad *, sector_t); + struct kernel_long_ad *, sector_t); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, - kernel_lb_addr *, uint32_t *, sector_t *); + struct kernel_lb_addr *, uint32_t *, sector_t *); extern int8_t udf_add_aext(struct inode *, struct extent_position *, - kernel_lb_addr, uint32_t, int); + struct kernel_lb_addr *, uint32_t, int); extern int8_t udf_write_aext(struct inode *, struct extent_position *, - kernel_lb_addr, uint32_t, int); + struct kernel_lb_addr *, uint32_t, int); extern int8_t udf_delete_aext(struct inode *, struct extent_position, - kernel_lb_addr, uint32_t); + struct kernel_lb_addr, uint32_t); extern int8_t udf_next_aext(struct inode *, struct extent_position *, - kernel_lb_addr *, uint32_t *, int); + struct kernel_lb_addr *, uint32_t *, int); extern int8_t udf_current_aext(struct inode *, struct extent_position *, - kernel_lb_addr *, uint32_t *, int); + struct kernel_lb_addr *, uint32_t *, int); /* misc.c */ extern struct buffer_head *udf_tgetblk(struct super_block *, int); @@ -160,7 +169,7 @@ extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t, extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t, uint32_t, uint16_t *); extern struct buffer_head *udf_read_ptagged(struct super_block *, - kernel_lb_addr, uint32_t, + struct kernel_lb_addr *, uint32_t, uint16_t *); extern void udf_update_tag(char *, int); extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int); @@ -182,6 +191,14 @@ extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t, uint32_t); extern int udf_relocate_blocks(struct super_block *, long, long *); +static inline uint32_t +udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc, + uint32_t offset) +{ + return udf_get_pblock(sb, loc->logicalBlockNum, + loc->partitionReferenceNum, offset); +} + /* unicode.c */ extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int); extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, @@ -200,7 +217,7 @@ extern void udf_truncate_extents(struct inode *); /* balloc.c */ extern void udf_free_blocks(struct super_block *, struct inode *, - kernel_lb_addr, uint32_t, uint32_t); + struct kernel_lb_addr *, uint32_t, uint32_t); extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, uint32_t, uint32_t); extern int udf_new_block(struct super_block *, struct inode *, uint16_t, @@ -214,16 +231,16 @@ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, struct udf_fileident_bh *, struct fileIdentDesc *, struct extent_position *, - kernel_lb_addr *, uint32_t *, + struct kernel_lb_addr *, uint32_t *, sector_t *); extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset); -extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); -extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); +extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); +extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); /* udftime.c */ extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest, - timestamp src); -extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src); + struct timestamp src); +extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src); #endif /* __UDF_DECL_H */ diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h index 489f52fb428..6a9f3a9cc42 100644 --- a/fs/udf/udfend.h +++ b/fs/udf/udfend.h @@ -4,9 +4,9 @@ #include <asm/byteorder.h> #include <linux/string.h> -static inline kernel_lb_addr lelb_to_cpu(lb_addr in) +static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in) { - kernel_lb_addr out; + struct kernel_lb_addr out; out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum); out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum); @@ -14,9 +14,9 @@ static inline kernel_lb_addr lelb_to_cpu(lb_addr in) return out; } -static inline lb_addr cpu_to_lelb(kernel_lb_addr in) +static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in) { - lb_addr out; + struct lb_addr out; out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum); out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum); @@ -24,9 +24,9 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in) return out; } -static inline short_ad lesa_to_cpu(short_ad in) +static inline struct short_ad lesa_to_cpu(struct short_ad in) { - short_ad out; + struct short_ad out; out.extLength = le32_to_cpu(in.extLength); out.extPosition = le32_to_cpu(in.extPosition); @@ -34,9 +34,9 @@ static inline short_ad lesa_to_cpu(short_ad in) return out; } -static inline short_ad cpu_to_lesa(short_ad in) +static inline struct short_ad cpu_to_lesa(struct short_ad in) { - short_ad out; + struct short_ad out; out.extLength = cpu_to_le32(in.extLength); out.extPosition = cpu_to_le32(in.extPosition); @@ -44,9 +44,9 @@ static inline short_ad cpu_to_lesa(short_ad in) return out; } -static inline kernel_long_ad lela_to_cpu(long_ad in) +static inline struct kernel_long_ad lela_to_cpu(struct long_ad in) { - kernel_long_ad out; + struct kernel_long_ad out; out.extLength = le32_to_cpu(in.extLength); out.extLocation = lelb_to_cpu(in.extLocation); @@ -54,9 +54,9 @@ static inline kernel_long_ad lela_to_cpu(long_ad in) return out; } -static inline long_ad cpu_to_lela(kernel_long_ad in) +static inline struct long_ad cpu_to_lela(struct kernel_long_ad in) { - long_ad out; + struct long_ad out; out.extLength = cpu_to_le32(in.extLength); out.extLocation = cpu_to_lelb(in.extLocation); @@ -64,9 +64,9 @@ static inline long_ad cpu_to_lela(kernel_long_ad in) return out; } -static inline kernel_extent_ad leea_to_cpu(extent_ad in) +static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in) { - kernel_extent_ad out; + struct kernel_extent_ad out; out.extLength = le32_to_cpu(in.extLength); out.extLocation = le32_to_cpu(in.extLocation); diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index 5f811655c9b..b8c828c4d20 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -85,7 +85,8 @@ extern struct timezone sys_tz; #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) -struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src) +struct timespec * +udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src) { int yday; u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone); @@ -116,7 +117,8 @@ struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src) return dest; } -timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts) +struct timestamp * +udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts) { long int days, rem, y; const unsigned short int *ip; diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 9fdf8c93c58..cefa8c8913e 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -254,7 +254,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, { const uint8_t *ocu; uint8_t cmp_id, ocu_len; - int i; + int i, len; ocu_len = ocu_i->u_len; @@ -279,8 +279,13 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, if (cmp_id == 16) c = (c << 8) | ocu[i++]; - utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], - UDF_NAME_LEN - utf_o->u_len); + len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len], + UDF_NAME_LEN - utf_o->u_len); + /* Valid character? */ + if (len >= 0) + utf_o->u_len += len; + else + utf_o->u_name[utf_o->u_len++] = '?'; } utf_o->u_cmpID = 8; @@ -290,7 +295,8 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length) { - unsigned len, i, max_val; + int len; + unsigned i, max_val; uint16_t uni_char; int u_len; @@ -302,8 +308,13 @@ try_again: u_len = 0U; for (i = 0U; i < uni->u_len; i++) { len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); - if (len <= 0) + if (!len) continue; + /* Invalid character, deal with it */ + if (len < 0) { + len = 1; + uni_char = '?'; + } if (uni_char > max_val) { max_val = 0xffffU; @@ -324,34 +335,43 @@ try_again: int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, int flen) { - struct ustr filename, unifilename; - int len; + struct ustr *filename, *unifilename; + int len = 0; - if (udf_build_ustr_exact(&unifilename, sname, flen)) + filename = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!filename) return 0; + unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!unifilename) + goto out1; + + if (udf_build_ustr_exact(unifilename, sname, flen)) + goto out2; + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { - if (!udf_CS0toUTF8(&filename, &unifilename)) { + if (!udf_CS0toUTF8(filename, unifilename)) { udf_debug("Failed in udf_get_filename: sname = %s\n", sname); - return 0; + goto out2; } } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { - if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, - &unifilename)) { + if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, + unifilename)) { udf_debug("Failed in udf_get_filename: sname = %s\n", sname); - return 0; + goto out2; } } else - return 0; - - len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, - unifilename.u_name, unifilename.u_len); - if (len) - return len; - - return 0; + goto out2; + + len = udf_translate_to_linux(dname, filename->u_name, filename->u_len, + unifilename->u_name, unifilename->u_len); +out2: + kfree(unifilename); +out1: + kfree(filename); + return len; } int udf_put_filename(struct super_block *sb, const uint8_t *sname, diff --git a/fs/ufs/super.c b/fs/ufs/super.c index e1c1fc5ee23..60359291761 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1268,6 +1268,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct ufs_super_block_first *usb1; struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); lock_kernel(); @@ -1290,6 +1291,8 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; buf->f_files = uspi->s_ncg * uspi->s_ipg; buf->f_namelen = UFS_MAXNAMLEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); unlock_kernel(); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index c3dc491fff8..60f107e47fe 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -33,6 +33,7 @@ xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ xfs_qm_syscalls.o \ xfs_qm_bhv.o \ xfs_qm.o) +xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o ifeq ($(CONFIG_XFS_QUOTA),y) xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h deleted file mode 100644 index 2a88d56c4dc..00000000000 --- a/fs/xfs/linux-2.6/mutex.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_MUTEX_H__ -#define __XFS_SUPPORT_MUTEX_H__ - -#include <linux/mutex.h> - -typedef struct mutex mutex_t; - -#endif /* __XFS_SUPPORT_MUTEX_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index de3a198f771..c13f67300fe 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1623,4 +1623,5 @@ const struct address_space_operations xfs_address_space_operations = { .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index e14c4e3aea0..f4e25544157 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -234,9 +234,9 @@ xfs_file_mmap( STATIC int xfs_vm_page_mkwrite( struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { - return block_page_mkwrite(vma, page, xfs_get_blocks); + return block_page_mkwrite(vma, vmf, xfs_get_blocks); } const struct file_operations xfs_file_operations = { diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 4bd112313f3..d0b499418a7 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -34,6 +34,7 @@ #include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_ioctl.h" #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_rtalloc.h" @@ -78,92 +79,74 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; + struct file *file = NULL; + struct path path; + int error; + struct xfs_inode *ip; - memset((char *)&handle, 0, sizeof(handle)); - - switch (cmd) { - case XFS_IOC_PATH_TO_FSHANDLE: - case XFS_IOC_PATH_TO_HANDLE: { - struct path path; - int error = user_lpath((const char __user *)hreq->path, &path); + if (cmd == XFS_IOC_FD_TO_HANDLE) { + file = fget(hreq->fd); + if (!file) + return -EBADF; + inode = file->f_path.dentry->d_inode; + } else { + error = user_lpath((const char __user *)hreq->path, &path); if (error) return error; - - ASSERT(path.dentry); - ASSERT(path.dentry->d_inode); - inode = igrab(path.dentry->d_inode); - path_put(&path); - break; + inode = path.dentry->d_inode; } + ip = XFS_I(inode); - case XFS_IOC_FD_TO_HANDLE: { - struct file *file; - - file = fget(hreq->fd); - if (!file) - return -EBADF; + /* + * We can only generate handles for inodes residing on a XFS filesystem, + * and only for regular files, directories or symbolic links. + */ + error = -EINVAL; + if (inode->i_sb->s_magic != XFS_SB_MAGIC) + goto out_put; - ASSERT(file->f_path.dentry); - ASSERT(file->f_path.dentry->d_inode); - inode = igrab(file->f_path.dentry->d_inode); - fput(file); - break; - } + error = -EBADF; + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + goto out_put; - default: - ASSERT(0); - return -XFS_ERROR(EINVAL); - } - if (inode->i_sb->s_magic != XFS_SB_MAGIC) { - /* we're not in XFS anymore, Toto */ - iput(inode); - return -XFS_ERROR(EINVAL); - } + memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - break; - default: - iput(inode); - return -XFS_ERROR(EBADF); - } - - /* now we can grab the fsid */ - memcpy(&handle.ha_fsid, XFS_I(inode)->i_mount->m_fixedfsid, - sizeof(xfs_fsid_t)); - hsize = sizeof(xfs_fsid_t); - - if (cmd != XFS_IOC_PATH_TO_FSHANDLE) { - xfs_inode_t *ip = XFS_I(inode); + if (cmd == XFS_IOC_PATH_TO_FSHANDLE) { + /* + * This handle only contains an fsid, zero the rest. + */ + memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); + hsize = sizeof(xfs_fsid_t); + } else { int lock_mode; - /* need to get access to the xfs_inode to read the generation */ lock_mode = xfs_ilock_map_shared(ip); - - /* fill in fid section of handle from inode */ handle.ha_fid.fid_len = sizeof(xfs_fid_t) - sizeof(handle.ha_fid.fid_len); handle.ha_fid.fid_pad = 0; handle.ha_fid.fid_gen = ip->i_d.di_gen; handle.ha_fid.fid_ino = ip->i_ino; - xfs_iunlock_map_shared(ip, lock_mode); hsize = XFS_HSIZE(handle); } - /* now copy our handle into the user buffer & write out the size */ + error = -EFAULT; if (copy_to_user(hreq->ohandle, &handle, hsize) || - copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) { - iput(inode); - return -XFS_ERROR(EFAULT); - } + copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) + goto out_put; - iput(inode); - return 0; + error = 0; + + out_put: + if (cmd == XFS_IOC_FD_TO_HANDLE) + fput(file); + else + path_put(&path); + return error; } /* diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 7aa53fefc67..6075382336d 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -211,8 +211,13 @@ xfs_vn_mknod( * Irix uses Missed'em'V split, but doesn't want to see * the upper 5 bits of (14bit) major. */ - if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) - return -EINVAL; + if (S_ISCHR(mode) || S_ISBLK(mode)) { + if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) + return -EINVAL; + rdev = sysv_encode_dev(rdev); + } else { + rdev = 0; + } if (test_default_acl && test_default_acl(dir)) { if (!_ACL_ALLOC(default_acl)) { @@ -224,28 +229,11 @@ xfs_vn_mknod( } } - xfs_dentry_to_name(&name, dentry); - if (IS_POSIXACL(dir) && !default_acl) - mode &= ~current->fs->umask; - - switch (mode & S_IFMT) { - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: - rdev = sysv_encode_dev(rdev); - case S_IFREG: - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); - break; - case S_IFDIR: - error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL); - break; - default: - error = EINVAL; - break; - } + mode &= ~current_umask(); + xfs_dentry_to_name(&name, dentry); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); if (unlikely(error)) goto out_free_acl; @@ -416,7 +404,7 @@ xfs_vn_symlink( mode_t mode; mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); + (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); xfs_dentry_to_name(&name, dentry); error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); @@ -553,9 +541,6 @@ xfs_vn_getattr( stat->uid = ip->i_d.di_uid; stat->gid = ip->i_d.di_gid; stat->ino = ip->i_ino; -#if XFS_BIG_INUMS - stat->ino += mp->m_inoadd; -#endif stat->atime = inode->i_atime; stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 507492d6dcc..f65a53f8752 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -38,7 +38,6 @@ #include <kmem.h> #include <mrlock.h> #include <sv.h> -#include <mutex.h> #include <time.h> #include <support/ktrace.h> @@ -51,6 +50,7 @@ #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/mutex.h> #include <linux/file.h> #include <linux/swap.h> #include <linux/errno.h> @@ -147,17 +147,6 @@ #define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) -/* - * IRIX (BSD) quotactl makes use of separate commands for user/group, - * whereas on Linux the syscall encodes this information into the cmd - * field (see the QCMD macro in quota.h). These macros help keep the - * code portable - they are not visible from the syscall interface. - */ -#define Q_XSETGQLIM XQM_CMD(8) /* set groups disk limits */ -#define Q_XGETGQUOTA XQM_CMD(9) /* get groups disk limits */ -#define Q_XSETPQLIM XQM_CMD(10) /* set projects disk limits */ -#define Q_XGETPQUOTA XQM_CMD(11) /* get projects disk limits */ - #define dfltprid 0 #define MAXPATHLEN 1024 diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c new file mode 100644 index 00000000000..94d9a633d3d --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2008, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_dmapi.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "quota/xfs_qm.h" +#include <linux/quota.h> + + +STATIC int +xfs_quota_type(int type) +{ + switch (type) { + case USRQUOTA: + return XFS_DQ_USER; + case GRPQUOTA: + return XFS_DQ_GROUP; + default: + return XFS_DQ_PROJ; + } +} + +STATIC int +xfs_fs_quota_sync( + struct super_block *sb, + int type) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + return -xfs_sync_inodes(mp, SYNC_DELWRI); +} + +STATIC int +xfs_fs_get_xstate( + struct super_block *sb, + struct fs_quota_stat *fqs) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + return -xfs_qm_scall_getqstat(mp, fqs); +} + +STATIC int +xfs_fs_set_xstate( + struct super_block *sb, + unsigned int uflags, + int op) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (uflags & XFS_QUOTA_UDQ_ACCT) + flags |= XFS_UQUOTA_ACCT; + if (uflags & XFS_QUOTA_PDQ_ACCT) + flags |= XFS_PQUOTA_ACCT; + if (uflags & XFS_QUOTA_GDQ_ACCT) + flags |= XFS_GQUOTA_ACCT; + if (uflags & XFS_QUOTA_UDQ_ENFD) + flags |= XFS_UQUOTA_ENFD; + if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) + flags |= XFS_OQUOTA_ENFD; + + switch (op) { + case Q_XQUOTAON: + return -xfs_qm_scall_quotaon(mp, flags); + case Q_XQUOTAOFF: + if (!XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + return -xfs_qm_scall_quotaoff(mp, flags); + case Q_XQUOTARM: + if (XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + return -xfs_qm_scall_trunc_qfiles(mp, flags); + } + + return -EINVAL; +} + +STATIC int +xfs_fs_get_xquota( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq); +} + +STATIC int +xfs_fs_set_xquota( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); +} + +struct quotactl_ops xfs_quotactl_operations = { + .quota_sync = xfs_fs_quota_sync, + .get_xstate = xfs_fs_get_xstate, + .set_xstate = xfs_fs_set_xstate, + .get_xquota = xfs_fs_get_xquota, + .set_xquota = xfs_fs_set_xquota, +}; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 32ae5028e96..bb685269f83 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -68,7 +68,6 @@ #include <linux/freezer.h> #include <linux/parser.h> -static struct quotactl_ops xfs_quotactl_operations; static struct super_operations xfs_super_operations; static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; @@ -79,7 +78,6 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ #define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ #define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ -#define MNTOPT_INO64 "ino64" /* force inodes into 64-bit range */ #define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ #define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ #define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ @@ -180,7 +178,7 @@ xfs_parseargs( int dswidth = 0; int iosize = 0; int dmapi_implies_ikeep = 1; - uchar_t iosizelog = 0; + __uint8_t iosizelog = 0; /* * Copy binary VFS mount flags we are interested in. @@ -291,16 +289,6 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { mp->m_flags |= XFS_MOUNT_NORECOVERY; - } else if (!strcmp(this_char, MNTOPT_INO64)) { -#if XFS_BIG_INUMS - mp->m_flags |= XFS_MOUNT_INO64; - mp->m_inoadd = XFS_INO64_OFFSET; -#else - cmn_err(CE_WARN, - "XFS: %s option not allowed on this system", - this_char); - return EINVAL; -#endif } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { mp->m_flags |= XFS_MOUNT_NOALIGN; } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { @@ -529,7 +517,6 @@ xfs_showargs( /* the few simple ones we can get from the mount struct */ { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, - { XFS_MOUNT_INO64, "," MNTOPT_INO64 }, { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, @@ -634,7 +621,7 @@ xfs_max_file_offset( return (((__uint64_t)pagefactor) << bitshift) - 1; } -int +STATIC int xfs_blkdev_get( xfs_mount_t *mp, const char *name, @@ -651,7 +638,7 @@ xfs_blkdev_get( return -error; } -void +STATIC void xfs_blkdev_put( struct block_device *bdev) { @@ -872,7 +859,7 @@ xfsaild_wakeup( wake_up_process(ailp->xa_task); } -int +STATIC int xfsaild( void *data) { @@ -990,26 +977,57 @@ xfs_fs_write_inode( int sync) { struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; int error = 0; - int flags = 0; xfs_itrace_entry(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + if (sync) { error = xfs_wait_on_pages(ip, 0, -1); if (error) - goto out_error; - flags |= FLUSH_SYNC; + goto out; } - error = xfs_inode_flush(ip, flags); -out_error: + /* + * Bypass inodes which have already been cleaned by + * the inode flush clustering code inside xfs_iflush + */ + if (xfs_inode_clean(ip)) + goto out; + + /* + * We make this non-blocking if the inode is contended, return + * EAGAIN to indicate to the caller that they did not succeed. + * This prevents the flush path from blocking on inodes inside + * another operation right now, they get caught later by xfs_sync. + */ + if (sync) { + xfs_ilock(ip, XFS_ILOCK_SHARED); + xfs_iflock(ip); + + error = xfs_iflush(ip, XFS_IFLUSH_SYNC); + } else { + error = EAGAIN; + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + goto out; + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) + goto out_unlock; + + error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK); + } + + out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + out: /* * if we failed to write out the inode then mark * it dirty again so we'll try again later. */ if (error) xfs_mark_inode_dirty_sync(ip); - return -error; } @@ -1169,18 +1187,12 @@ xfs_fs_statfs( statp->f_bfree = statp->f_bavail = sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); fakeinos = statp->f_bfree << sbp->sb_inopblog; -#if XFS_BIG_INUMS - fakeinos += mp->m_inoadd; -#endif statp->f_files = MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); if (mp->m_maxicount) -#if XFS_BIG_INUMS - if (!mp->m_inoadd) -#endif - statp->f_files = min_t(typeof(statp->f_files), - statp->f_files, - mp->m_maxicount); + statp->f_files = min_t(typeof(statp->f_files), + statp->f_files, + mp->m_maxicount); statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); spin_unlock(&mp->m_sb_lock); @@ -1302,57 +1314,6 @@ xfs_fs_show_options( return -xfs_showargs(XFS_M(mnt->mnt_sb), m); } -STATIC int -xfs_fs_quotasync( - struct super_block *sb, - int type) -{ - return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XQUOTASYNC, 0, NULL); -} - -STATIC int -xfs_fs_getxstate( - struct super_block *sb, - struct fs_quota_stat *fqs) -{ - return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XGETQSTAT, 0, (caddr_t)fqs); -} - -STATIC int -xfs_fs_setxstate( - struct super_block *sb, - unsigned int flags, - int op) -{ - return -XFS_QM_QUOTACTL(XFS_M(sb), op, 0, (caddr_t)&flags); -} - -STATIC int -xfs_fs_getxquota( - struct super_block *sb, - int type, - qid_t id, - struct fs_disk_quota *fdq) -{ - return -XFS_QM_QUOTACTL(XFS_M(sb), - (type == USRQUOTA) ? Q_XGETQUOTA : - ((type == GRPQUOTA) ? Q_XGETGQUOTA : - Q_XGETPQUOTA), id, (caddr_t)fdq); -} - -STATIC int -xfs_fs_setxquota( - struct super_block *sb, - int type, - qid_t id, - struct fs_disk_quota *fdq) -{ - return -XFS_QM_QUOTACTL(XFS_M(sb), - (type == USRQUOTA) ? Q_XSETQLIM : - ((type == GRPQUOTA) ? Q_XSETGQLIM : - Q_XSETPQLIM), id, (caddr_t)fdq); -} - /* * This function fills in xfs_mount_t fields based on mount args. * Note: the superblock _has_ now been read in. @@ -1435,7 +1396,9 @@ xfs_fs_fill_super( sb_min_blocksize(sb, BBSIZE); sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; +#ifdef CONFIG_XFS_QUOTA sb->s_qcop = &xfs_quotactl_operations; +#endif sb->s_op = &xfs_super_operations; error = xfs_dmops_get(mp); @@ -1578,14 +1541,6 @@ static struct super_operations xfs_super_operations = { .show_options = xfs_fs_show_options, }; -static struct quotactl_ops xfs_quotactl_operations = { - .quota_sync = xfs_fs_quotasync, - .get_xstate = xfs_fs_getxstate, - .set_xstate = xfs_fs_setxstate, - .get_xquota = xfs_fs_getxquota, - .set_xquota = xfs_fs_setxquota, -}; - static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index d5d776d4cd6..5a2ea3a2178 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -93,6 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern const struct export_operations xfs_export_operations; extern struct xattr_handler *xfs_xattr_handlers[]; +extern struct quotactl_ops xfs_quotactl_operations; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 5f6de1efe1f..04f058c848a 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -19,6 +19,7 @@ #define XFS_SYNC_H 1 struct xfs_mount; +struct xfs_perag; typedef struct bhv_vfs_sync_work { struct list_head w_list; diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index f65983a230d..ad7fbead4c9 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -41,11 +41,6 @@ struct attrlist_cursor_kern; #define IO_INVIS 0x00020 /* don't update inode timestamps */ /* - * Flags for xfs_inode_flush - */ -#define FLUSH_SYNC 1 /* wait for flush to complete */ - -/* * Flush/Invalidate options for vop_toss/flush/flushinval_pages. */ #define FI_NONE 0 /* none */ @@ -55,33 +50,6 @@ struct attrlist_cursor_kern; the operation completes. */ /* - * Dealing with bad inodes - */ -static inline int VN_BAD(struct inode *vp) -{ - return is_bad_inode(vp); -} - -/* - * Extracting atime values in various formats - */ -static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime) -{ - bs_atime->tv_sec = vp->i_atime.tv_sec; - bs_atime->tv_nsec = vp->i_atime.tv_nsec; -} - -static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts) -{ - *ts = vp->i_atime; -} - -static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt) -{ - *tt = vp->i_atime.tv_sec; -} - -/* * Some useful predicates. */ #define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 6543c0b2975..e4babcc6342 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -804,7 +804,7 @@ xfs_qm_dqlookup( uint flist_locked; xfs_dquot_t *d; - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + ASSERT(mutex_is_locked(&qh->qh_lock)); flist_locked = B_FALSE; @@ -877,7 +877,7 @@ xfs_qm_dqlookup( /* * move the dquot to the front of the hashchain */ - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + ASSERT(mutex_is_locked(&qh->qh_lock)); if (dqp->HL_PREVP != &qh->qh_next) { xfs_dqtrace_entry(dqp, "DQLOOKUP: HASH MOVETOFRONT"); @@ -892,13 +892,13 @@ xfs_qm_dqlookup( } xfs_dqtrace_entry(dqp, "LOOKUP END"); *O_dqpp = dqp; - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + ASSERT(mutex_is_locked(&qh->qh_lock)); return (0); } } *O_dqpp = NULL; - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + ASSERT(mutex_is_locked(&qh->qh_lock)); return (1); } @@ -956,7 +956,7 @@ xfs_qm_dqget( ASSERT(ip->i_gdquot == NULL); } #endif - XFS_DQ_HASH_LOCK(h); + mutex_lock(&h->qh_lock); /* * Look in the cache (hashtable). @@ -971,7 +971,7 @@ xfs_qm_dqget( */ ASSERT(*O_dqpp); ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); - XFS_DQ_HASH_UNLOCK(h); + mutex_unlock(&h->qh_lock); xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)"); return (0); /* success */ } @@ -991,7 +991,7 @@ xfs_qm_dqget( * we don't keep the lock across a disk read */ version = h->qh_version; - XFS_DQ_HASH_UNLOCK(h); + mutex_unlock(&h->qh_lock); /* * Allocate the dquot on the kernel heap, and read the ondisk @@ -1056,7 +1056,7 @@ xfs_qm_dqget( /* * Hashlock comes after ilock in lock order */ - XFS_DQ_HASH_LOCK(h); + mutex_lock(&h->qh_lock); if (version != h->qh_version) { xfs_dquot_t *tmpdqp; /* @@ -1072,7 +1072,7 @@ xfs_qm_dqget( * and start over. */ xfs_qm_dqput(tmpdqp); - XFS_DQ_HASH_UNLOCK(h); + mutex_unlock(&h->qh_lock); xfs_qm_dqdestroy(dqp); XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); goto again; @@ -1083,7 +1083,7 @@ xfs_qm_dqget( * Put the dquot at the beginning of the hash-chain and mp's list * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock .. */ - ASSERT(XFS_DQ_IS_HASH_LOCKED(h)); + ASSERT(mutex_is_locked(&h->qh_lock)); dqp->q_hash = h; XQM_HASHLIST_INSERT(h, dqp); @@ -1102,7 +1102,7 @@ xfs_qm_dqget( XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); xfs_qm_mplist_unlock(mp); - XFS_DQ_HASH_UNLOCK(h); + mutex_unlock(&h->qh_lock); dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); xfs_dqtrace_entry(dqp, "DQGET DONE"); @@ -1440,7 +1440,7 @@ xfs_qm_dqpurge( xfs_mount_t *mp = dqp->q_mount; ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); - ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash)); + ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); xfs_dqlock(dqp); /* @@ -1453,7 +1453,7 @@ xfs_qm_dqpurge( */ if (dqp->q_nrefs != 0) { xfs_dqunlock(dqp); - XFS_DQ_HASH_UNLOCK(dqp->q_hash); + mutex_unlock(&dqp->q_hash->qh_lock); return (1); } @@ -1517,7 +1517,7 @@ xfs_qm_dqpurge( memset(&dqp->q_core, 0, sizeof(dqp->q_core)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - XFS_DQ_HASH_UNLOCK(thishash); + mutex_unlock(&thishash->qh_lock); return (0); } diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index d443e93b433..de0f402ddb4 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -34,7 +34,7 @@ */ typedef struct xfs_dqhash { struct xfs_dquot *qh_next; - mutex_t qh_lock; + struct mutex qh_lock; uint qh_version; /* ever increasing version */ uint qh_nelems; /* number of dquots on the list */ } xfs_dqhash_t; @@ -81,7 +81,7 @@ typedef struct xfs_dquot { xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ - mutex_t q_qlock; /* quota lock */ + struct mutex q_qlock; /* quota lock */ struct completion q_flush; /* flush completion queue */ atomic_t q_pincount; /* dquot pin count */ wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ @@ -109,19 +109,6 @@ enum { #define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) -#ifdef DEBUG -static inline int -XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp) -{ - if (mutex_trylock(&dqp->q_qlock)) { - mutex_unlock(&dqp->q_qlock); - return 0; - } - return 1; -} -#endif - - /* * Manage the q_flush completion queue embedded in the dquot. This completion * queue synchronizes processes attempting to flush the in-core dquot back to @@ -142,6 +129,7 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) complete(&dqp->q_flush); } +#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 7a2beb64314..5b6695049e0 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -55,7 +55,7 @@ * quota functionality, including maintaining the freelist and hash * tables of dquots. */ -mutex_t xfs_Gqm_lock; +struct mutex xfs_Gqm_lock; struct xfs_qm *xfs_Gqm; uint ndquot; @@ -69,8 +69,6 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); STATIC void xfs_qm_freelist_init(xfs_frlist_t *); STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *); -STATIC int xfs_qm_mplist_nowait(xfs_mount_t *); -STATIC int xfs_qm_dqhashlock_nowait(xfs_dquot_t *); STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); @@ -82,7 +80,7 @@ static struct shrinker xfs_qm_shaker = { }; #ifdef DEBUG -extern mutex_t qcheck_lock; +extern struct mutex qcheck_lock; #endif #ifdef QUOTADEBUG @@ -219,7 +217,7 @@ xfs_qm_hold_quotafs_ref( * the structure could disappear between the entry to this routine and * a HOLD operation if not locked. */ - XFS_QM_LOCK(xfs_Gqm); + mutex_lock(&xfs_Gqm_lock); if (xfs_Gqm == NULL) xfs_Gqm = xfs_Gqm_init(); @@ -228,8 +226,8 @@ xfs_qm_hold_quotafs_ref( * debugging and statistical purposes, but ... * Just take a reference and get out. */ - XFS_QM_HOLD(xfs_Gqm); - XFS_QM_UNLOCK(xfs_Gqm); + xfs_Gqm->qm_nrefs++; + mutex_unlock(&xfs_Gqm_lock); return 0; } @@ -277,13 +275,12 @@ xfs_qm_rele_quotafs_ref( * Destroy the entire XQM. If somebody mounts with quotaon, this'll * be restarted. */ - XFS_QM_LOCK(xfs_Gqm); - XFS_QM_RELE(xfs_Gqm); - if (xfs_Gqm->qm_nrefs == 0) { + mutex_lock(&xfs_Gqm_lock); + if (--xfs_Gqm->qm_nrefs == 0) { xfs_qm_destroy(xfs_Gqm); xfs_Gqm = NULL; } - XFS_QM_UNLOCK(xfs_Gqm); + mutex_unlock(&xfs_Gqm_lock); } /* @@ -577,10 +574,10 @@ xfs_qm_dqpurge_int( continue; } - if (! xfs_qm_dqhashlock_nowait(dqp)) { + if (!mutex_trylock(&dqp->q_hash->qh_lock)) { nrecl = XFS_QI_MPLRECLAIMS(mp); xfs_qm_mplist_unlock(mp); - XFS_DQ_HASH_LOCK(dqp->q_hash); + mutex_lock(&dqp->q_hash->qh_lock); xfs_qm_mplist_lock(mp); /* @@ -590,7 +587,7 @@ xfs_qm_dqpurge_int( * this point, but somebody might be taking things off. */ if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { - XFS_DQ_HASH_UNLOCK(dqp->q_hash); + mutex_unlock(&dqp->q_hash->qh_lock); goto again; } } @@ -632,7 +629,6 @@ xfs_qm_dqattach_one( xfs_dqid_t id, uint type, uint doalloc, - uint dolock, xfs_dquot_t *udqhint, /* hint */ xfs_dquot_t **IO_idqpp) { @@ -641,16 +637,16 @@ xfs_qm_dqattach_one( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); error = 0; + /* * See if we already have it in the inode itself. IO_idqpp is * &i_udquot or &i_gdquot. This made the code look weird, but * made the logic a lot simpler. */ - if ((dqp = *IO_idqpp)) { - if (dolock) - xfs_dqlock(dqp); + dqp = *IO_idqpp; + if (dqp) { xfs_dqtrace_entry(dqp, "DQATTACH: found in ip"); - goto done; + return 0; } /* @@ -659,38 +655,38 @@ xfs_qm_dqattach_one( * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside * the user dquot. */ - ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); - if (udqhint && !dolock) + if (udqhint) { + ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); xfs_dqlock(udqhint); - /* - * No need to take dqlock to look at the id. - * The ID can't change until it gets reclaimed, and it won't - * be reclaimed as long as we have a ref from inode and we hold - * the ilock. - */ - if (udqhint && - (dqp = udqhint->q_gdquot) && - (be32_to_cpu(dqp->q_core.d_id) == id)) { - ASSERT(XFS_DQ_IS_LOCKED(udqhint)); - xfs_dqlock(dqp); - XFS_DQHOLD(dqp); - ASSERT(*IO_idqpp == NULL); - *IO_idqpp = dqp; - if (!dolock) { + /* + * No need to take dqlock to look at the id. + * + * The ID can't change until it gets reclaimed, and it won't + * be reclaimed as long as we have a ref from inode and we + * hold the ilock. + */ + dqp = udqhint->q_gdquot; + if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { + xfs_dqlock(dqp); + XFS_DQHOLD(dqp); + ASSERT(*IO_idqpp == NULL); + *IO_idqpp = dqp; + xfs_dqunlock(dqp); xfs_dqunlock(udqhint); + return 0; } - goto done; - } - /* - * We can't hold a dquot lock when we call the dqget code. - * We'll deadlock in no time, because of (not conforming to) - * lock ordering - the inodelock comes before any dquot lock, - * and we may drop and reacquire the ilock in xfs_qm_dqget(). - */ - if (udqhint) + + /* + * We can't hold a dquot lock when we call the dqget code. + * We'll deadlock in no time, because of (not conforming to) + * lock ordering - the inodelock comes before any dquot lock, + * and we may drop and reacquire the ilock in xfs_qm_dqget(). + */ xfs_dqunlock(udqhint); + } + /* * Find the dquot from somewhere. This bumps the * reference count of dquot and returns it locked. @@ -698,48 +694,19 @@ xfs_qm_dqattach_one( * disk and we didn't ask it to allocate; * ESRCH if quotas got turned off suddenly. */ - if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type, - doalloc|XFS_QMOPT_DOWARN, &dqp))) { - if (udqhint && dolock) - xfs_dqlock(udqhint); - goto done; - } + error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp); + if (error) + return error; xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget"); + /* * dqget may have dropped and re-acquired the ilock, but it guarantees * that the dquot returned is the one that should go in the inode. */ *IO_idqpp = dqp; - ASSERT(dqp); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - if (! dolock) { - xfs_dqunlock(dqp); - goto done; - } - if (! udqhint) - goto done; - - ASSERT(udqhint); - ASSERT(dolock); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - if (! xfs_qm_dqlock_nowait(udqhint)) { - xfs_dqunlock(dqp); - xfs_dqlock(udqhint); - xfs_dqlock(dqp); - } - done: -#ifdef QUOTADEBUG - if (udqhint) { - if (dolock) - ASSERT(XFS_DQ_IS_LOCKED(udqhint)); - } - if (! error) { - if (dolock) - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - } -#endif - return error; + xfs_dqunlock(dqp); + return 0; } @@ -754,24 +721,15 @@ xfs_qm_dqattach_one( STATIC void xfs_qm_dqattach_grouphint( xfs_dquot_t *udq, - xfs_dquot_t *gdq, - uint locked) + xfs_dquot_t *gdq) { xfs_dquot_t *tmp; -#ifdef QUOTADEBUG - if (locked) { - ASSERT(XFS_DQ_IS_LOCKED(udq)); - ASSERT(XFS_DQ_IS_LOCKED(gdq)); - } -#endif - if (! locked) - xfs_dqlock(udq); + xfs_dqlock(udq); if ((tmp = udq->q_gdquot)) { if (tmp == gdq) { - if (! locked) - xfs_dqunlock(udq); + xfs_dqunlock(udq); return; } @@ -781,8 +739,6 @@ xfs_qm_dqattach_grouphint( * because the freelist lock comes before dqlocks. */ xfs_dqunlock(udq); - if (locked) - xfs_dqunlock(gdq); /* * we took a hard reference once upon a time in dqget, * so give it back when the udquot no longer points at it @@ -795,9 +751,7 @@ xfs_qm_dqattach_grouphint( } else { ASSERT(XFS_DQ_IS_LOCKED(udq)); - if (! locked) { - xfs_dqlock(gdq); - } + xfs_dqlock(gdq); } ASSERT(XFS_DQ_IS_LOCKED(udq)); @@ -810,10 +764,9 @@ xfs_qm_dqattach_grouphint( XFS_DQHOLD(gdq); udq->q_gdquot = gdq; } - if (! locked) { - xfs_dqunlock(gdq); - xfs_dqunlock(udq); - } + + xfs_dqunlock(gdq); + xfs_dqunlock(udq); } @@ -821,8 +774,6 @@ xfs_qm_dqattach_grouphint( * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON * into account. * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. - * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty - * much made this code a complete mess, but it has been pretty useful. * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL. * Inode may get unlocked and relocked in here, and the caller must deal with * the consequences. @@ -851,7 +802,6 @@ xfs_qm_dqattach( if (XFS_IS_UQUOTA_ON(mp)) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, flags & XFS_QMOPT_DQALLOC, - flags & XFS_QMOPT_DQLOCK, NULL, &ip->i_udquot); if (error) goto done; @@ -863,11 +813,9 @@ xfs_qm_dqattach( error = XFS_IS_GQUOTA_ON(mp) ? xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, flags & XFS_QMOPT_DQALLOC, - flags & XFS_QMOPT_DQLOCK, ip->i_udquot, &ip->i_gdquot) : xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, flags & XFS_QMOPT_DQALLOC, - flags & XFS_QMOPT_DQLOCK, ip->i_udquot, &ip->i_gdquot); /* * Don't worry about the udquot that we may have @@ -898,22 +846,13 @@ xfs_qm_dqattach( /* * Attach i_gdquot to the gdquot hint inside the i_udquot. */ - xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot, - flags & XFS_QMOPT_DQLOCK); + xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); } done: #ifdef QUOTADEBUG if (! error) { - if (ip->i_udquot) { - if (flags & XFS_QMOPT_DQLOCK) - ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot)); - } - if (ip->i_gdquot) { - if (flags & XFS_QMOPT_DQLOCK) - ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot)); - } if (XFS_IS_UQUOTA_ON(mp)) ASSERT(ip->i_udquot); if (XFS_IS_OQUOTA_ON(mp)) @@ -2086,7 +2025,7 @@ xfs_qm_shake_freelist( * a dqlookup process that holds the hashlock that is * waiting for the freelist lock. */ - if (! xfs_qm_dqhashlock_nowait(dqp)) { + if (!mutex_trylock(&dqp->q_hash->qh_lock)) { xfs_dqfunlock(dqp); xfs_dqunlock(dqp); dqp = dqp->dq_flnext; @@ -2103,7 +2042,7 @@ xfs_qm_shake_freelist( /* XXX put a sentinel so that we can come back here */ xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - XFS_DQ_HASH_UNLOCK(hash); + mutex_unlock(&hash->qh_lock); xfs_qm_freelist_unlock(xfs_Gqm); if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) return nreclaimed; @@ -2120,7 +2059,7 @@ xfs_qm_shake_freelist( XQM_HASHLIST_REMOVE(hash, dqp); xfs_dqfunlock(dqp); xfs_qm_mplist_unlock(dqp->q_mount); - XFS_DQ_HASH_UNLOCK(hash); + mutex_unlock(&hash->qh_lock); off_freelist: XQM_FREELIST_REMOVE(dqp); @@ -2262,7 +2201,7 @@ xfs_qm_dqreclaim_one(void) continue; } - if (! xfs_qm_dqhashlock_nowait(dqp)) + if (!mutex_trylock(&dqp->q_hash->qh_lock)) goto mplistunlock; ASSERT(dqp->q_nrefs == 0); @@ -2271,7 +2210,7 @@ xfs_qm_dqreclaim_one(void) XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); XQM_FREELIST_REMOVE(dqp); dqpout = dqp; - XFS_DQ_HASH_UNLOCK(dqp->q_hash); + mutex_unlock(&dqp->q_hash->qh_lock); mplistunlock: xfs_qm_mplist_unlock(dqp->q_mount); xfs_dqfunlock(dqp); @@ -2774,34 +2713,3 @@ xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq) { xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq); } - -STATIC int -xfs_qm_dqhashlock_nowait( - xfs_dquot_t *dqp) -{ - int locked; - - locked = mutex_trylock(&((dqp)->q_hash->qh_lock)); - return locked; -} - -int -xfs_qm_freelist_lock_nowait( - xfs_qm_t *xqm) -{ - int locked; - - locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock)); - return locked; -} - -STATIC int -xfs_qm_mplist_nowait( - xfs_mount_t *mp) -{ - int locked; - - ASSERT(mp->m_quotainfo); - locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp))); - return locked; -} diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index ddf09166387..a371954cae1 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -27,7 +27,7 @@ struct xfs_qm; struct xfs_inode; extern uint ndquot; -extern mutex_t xfs_Gqm_lock; +extern struct mutex xfs_Gqm_lock; extern struct xfs_qm *xfs_Gqm; extern kmem_zone_t *qm_dqzone; extern kmem_zone_t *qm_dqtrxzone; @@ -79,7 +79,7 @@ typedef xfs_dqhash_t xfs_dqlist_t; typedef struct xfs_frlist { struct xfs_dquot *qh_next; struct xfs_dquot *qh_prev; - mutex_t qh_lock; + struct mutex qh_lock; uint qh_version; uint qh_nelems; } xfs_frlist_t; @@ -115,7 +115,7 @@ typedef struct xfs_quotainfo { xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ - mutex_t qi_quotaofflock;/* to serialize quotaoff */ + struct mutex qi_quotaofflock;/* to serialize quotaoff */ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ uint qi_dqperchunk; /* # ondisk dqs in above chunk */ xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ @@ -158,11 +158,6 @@ typedef struct xfs_dquot_acct { #define XFS_QM_IWARNLIMIT 5 #define XFS_QM_RTBWARNLIMIT 5 -#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock)) -#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock)) -#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++) -#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) - extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); extern void xfs_qm_mount_quotas(xfs_mount_t *); extern int xfs_qm_quotacheck(xfs_mount_t *); @@ -178,6 +173,16 @@ extern void xfs_qm_dqdetach(xfs_inode_t *); extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); +/* quota ops */ +extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); +extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, + fs_disk_quota_t *); +extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, + fs_disk_quota_t *); +extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); +extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); +extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); + /* vop stuff */ extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *, uid_t, gid_t, prid_t, uint, @@ -194,11 +199,6 @@ extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *, /* list stuff */ extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); extern void xfs_qm_freelist_unlink(xfs_dquot_t *); -extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *); - -/* system call interface */ -extern int xfs_qm_quotactl(struct xfs_mount *, int, int, - xfs_caddr_t); #ifdef DEBUG extern int xfs_qm_internalqcheck(xfs_mount_t *); diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index bc6c5cca3e1..63037c689a4 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -235,7 +235,6 @@ struct xfs_qmops xfs_qmcore_xfs = { .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve, .xfs_dqstatvfs = xfs_qm_statvfs, .xfs_dqsync = xfs_qm_sync, - .xfs_quotactl = xfs_qm_quotactl, .xfs_dqtrxops = &xfs_trans_dquot_ops, }; EXPORT_SYMBOL(xfs_qmcore_xfs); diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 68139b38aed..c7b66f6506c 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -57,135 +57,16 @@ # define qdprintk(s, args...) do { } while (0) #endif -STATIC int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); -STATIC int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, - fs_disk_quota_t *); -STATIC int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); -STATIC int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, - fs_disk_quota_t *); -STATIC int xfs_qm_scall_quotaon(xfs_mount_t *, uint); -STATIC int xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t); STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, uint); -STATIC uint xfs_qm_import_flags(uint); STATIC uint xfs_qm_export_flags(uint); -STATIC uint xfs_qm_import_qtype_flags(uint); STATIC uint xfs_qm_export_qtype_flags(uint); STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *, fs_disk_quota_t *); /* - * The main distribution switch of all XFS quotactl system calls. - */ -int -xfs_qm_quotactl( - xfs_mount_t *mp, - int cmd, - int id, - xfs_caddr_t addr) -{ - int error; - - ASSERT(addr != NULL || cmd == Q_XQUOTASYNC); - - /* - * The following commands are valid even when quotaoff. - */ - switch (cmd) { - case Q_XQUOTARM: - /* - * Truncate quota files. quota must be off. - */ - if (XFS_IS_QUOTA_ON(mp)) - return XFS_ERROR(EINVAL); - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - return (xfs_qm_scall_trunc_qfiles(mp, - xfs_qm_import_qtype_flags(*(uint *)addr))); - - case Q_XGETQSTAT: - /* - * Get quota status information. - */ - return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr)); - - case Q_XQUOTAON: - /* - * QUOTAON - enabling quota enforcement. - * Quota accounting must be turned on at mount time. - */ - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - return (xfs_qm_scall_quotaon(mp, - xfs_qm_import_flags(*(uint *)addr))); - - case Q_XQUOTAOFF: - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - break; - - case Q_XQUOTASYNC: - return xfs_sync_inodes(mp, SYNC_DELWRI); - - default: - break; - } - - if (! XFS_IS_QUOTA_ON(mp)) - return XFS_ERROR(ESRCH); - - switch (cmd) { - case Q_XQUOTAOFF: - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_quotaoff(mp, - xfs_qm_import_flags(*(uint *)addr), - B_FALSE); - break; - - case Q_XGETQUOTA: - error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER, - (fs_disk_quota_t *)addr); - break; - case Q_XGETGQUOTA: - error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, - (fs_disk_quota_t *)addr); - break; - case Q_XGETPQUOTA: - error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ, - (fs_disk_quota_t *)addr); - break; - - case Q_XSETQLIM: - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER, - (fs_disk_quota_t *)addr); - break; - case Q_XSETGQLIM: - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, - (fs_disk_quota_t *)addr); - break; - case Q_XSETPQLIM: - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ, - (fs_disk_quota_t *)addr); - break; - - default: - error = XFS_ERROR(EINVAL); - break; - } - - return (error); -} - -/* * Turn off quota accounting and/or enforcement for all udquots and/or * gdquots. Called only at unmount time. * @@ -193,11 +74,10 @@ xfs_qm_quotactl( * incore, and modifies the ondisk dquot directly. Therefore, for example, * it is an error to call this twice, without purging the cache. */ -STATIC int +int xfs_qm_scall_quotaoff( xfs_mount_t *mp, - uint flags, - boolean_t force) + uint flags) { uint dqtype; int error; @@ -205,8 +85,6 @@ xfs_qm_scall_quotaoff( xfs_qoff_logitem_t *qoffstart; int nculprits; - if (!force && !capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); /* * No file system can have quotas enabled on disk but not in core. * Note that quota utilities (like quotaoff) _expect_ @@ -375,7 +253,7 @@ out_error: return (error); } -STATIC int +int xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) @@ -383,8 +261,6 @@ xfs_qm_scall_trunc_qfiles( int error = 0, error2 = 0; xfs_inode_t *qip; - if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); return XFS_ERROR(EINVAL); @@ -416,7 +292,7 @@ xfs_qm_scall_trunc_qfiles( * effect immediately. * (Switching on quota accounting must be done at mount time.) */ -STATIC int +int xfs_qm_scall_quotaon( xfs_mount_t *mp, uint flags) @@ -426,9 +302,6 @@ xfs_qm_scall_quotaon( uint accflags; __int64_t sbflags; - if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); - flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); /* * Switching on quota accounting must be done at mount time. @@ -517,7 +390,7 @@ xfs_qm_scall_quotaon( /* * Return quota status information, such as uquota-off, enforcements, etc. */ -STATIC int +int xfs_qm_scall_getqstat( xfs_mount_t *mp, fs_quota_stat_t *out) @@ -582,7 +455,7 @@ xfs_qm_scall_getqstat( /* * Adjust quota limits, and start/stop timers accordingly. */ -STATIC int +int xfs_qm_scall_setqlim( xfs_mount_t *mp, xfs_dqid_t id, @@ -595,9 +468,6 @@ xfs_qm_scall_setqlim( int error; xfs_qcnt_t hard, soft; - if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); - if ((newlim->d_fieldmask & (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) return (0); @@ -742,7 +612,7 @@ xfs_qm_scall_setqlim( return error; } -STATIC int +int xfs_qm_scall_getquota( xfs_mount_t *mp, xfs_dqid_t id, @@ -935,30 +805,6 @@ xfs_qm_export_dquot( } STATIC uint -xfs_qm_import_qtype_flags( - uint uflags) -{ - uint oflags = 0; - - /* - * Can't be more than one, or none. - */ - if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) == - (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) || - ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) == - (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) || - ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) == - (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) || - ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0)) - return (0); - - oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0; - oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0; - oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0; - return oflags; -} - -STATIC uint xfs_qm_export_qtype_flags( uint flags) { @@ -979,26 +825,6 @@ xfs_qm_export_qtype_flags( } STATIC uint -xfs_qm_import_flags( - uint uflags) -{ - uint flags = 0; - - if (uflags & XFS_QUOTA_UDQ_ACCT) - flags |= XFS_UQUOTA_ACCT; - if (uflags & XFS_QUOTA_PDQ_ACCT) - flags |= XFS_PQUOTA_ACCT; - if (uflags & XFS_QUOTA_GDQ_ACCT) - flags |= XFS_GQUOTA_ACCT; - if (uflags & XFS_QUOTA_UDQ_ENFD) - flags |= XFS_UQUOTA_ENFD; - if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) - flags |= XFS_OQUOTA_ENFD; - return (flags); -} - - -STATIC uint xfs_qm_export_flags( uint flags) { @@ -1134,7 +960,7 @@ xfs_dqhash_t *qmtest_udqtab; xfs_dqhash_t *qmtest_gdqtab; int qmtest_hashmask; int qmtest_nfails; -mutex_t qcheck_lock; +struct mutex qcheck_lock; #define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ (__psunsigned_t)(id)) & \ diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index c4fcea600bc..8286b2842b6 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -42,34 +42,24 @@ #define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) #define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) -#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock) #define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) #define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) -#define XQMLCK(h) (mutex_lock(&((h)->qh_lock))) -#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock))) -#ifdef DEBUG -struct xfs_dqhash; -static inline int XQMISLCKD(struct xfs_dqhash *h) -{ - if (mutex_trylock(&h->qh_lock)) { - mutex_unlock(&h->qh_lock); - return 0; - } - return 1; -} -#endif - -#define XFS_DQ_HASH_LOCK(h) XQMLCK(h) -#define XFS_DQ_HASH_UNLOCK(h) XQMUNLCK(h) -#define XFS_DQ_IS_HASH_LOCKED(h) XQMISLCKD(h) - -#define xfs_qm_mplist_lock(mp) XQMLCK(&(XFS_QI_MPL_LIST(mp))) -#define xfs_qm_mplist_unlock(mp) XQMUNLCK(&(XFS_QI_MPL_LIST(mp))) -#define XFS_QM_IS_MPLIST_LOCKED(mp) XQMISLCKD(&(XFS_QI_MPL_LIST(mp))) - -#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist)) -#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist)) +#define xfs_qm_mplist_lock(mp) \ + mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock)) +#define xfs_qm_mplist_nowait(mp) \ + mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock)) +#define xfs_qm_mplist_unlock(mp) \ + mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock)) +#define XFS_QM_IS_MPLIST_LOCKED(mp) \ + mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock)) + +#define xfs_qm_freelist_lock(qm) \ + mutex_lock(&((qm)->qm_dqfreelist.qh_lock)) +#define xfs_qm_freelist_lock_nowait(qm) \ + mutex_trylock(&((qm)->qm_dqfreelist.qh_lock)) +#define xfs_qm_freelist_unlock(qm) \ + mutex_unlock(&((qm)->qm_dqfreelist.qh_lock)) /* * Hash into a bucket in the dquot hash table, based on <mp, id>. diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 99611381e74..447173bcf96 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -624,10 +624,9 @@ xfs_trans_dqresv( xfs_qcnt_t *resbcountp; xfs_quotainfo_t *q = mp->m_quotainfo; - if (! (flags & XFS_QMOPT_DQLOCK)) { - xfs_dqlock(dqp); - } - ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + xfs_dqlock(dqp); + if (flags & XFS_TRANS_DQ_RES_BLKS) { hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); if (!hardlimit) @@ -740,10 +739,8 @@ xfs_trans_dqresv( ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); error_return: - if (! (flags & XFS_QMOPT_DQLOCK)) { - xfs_dqunlock(dqp); - } - return (error); + xfs_dqunlock(dqp); + return error; } @@ -753,8 +750,7 @@ error_return: * grp/prj quotas is important, because this follows a both-or-nothing * approach. * - * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked. - * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. + * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index ae548296542..3f3610a7ee0 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -24,6 +24,7 @@ #include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" +#include "xfs_error.h" static char message[1024]; /* keep it off the stack */ static DEFINE_SPINLOCK(xfs_err_lock); diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c index 5830c040ea7..b83f76b6d41 100644 --- a/fs/xfs/support/uuid.c +++ b/fs/xfs/support/uuid.c @@ -17,10 +17,6 @@ */ #include <xfs.h> -static DEFINE_MUTEX(uuid_monitor); -static int uuid_table_size; -static uuid_t *uuid_table; - /* IRIX interpretation of an uuid_t */ typedef struct { __be32 uu_timelow; @@ -46,12 +42,6 @@ uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) fsid[1] = be32_to_cpu(uup->uu_timelow); } -void -uuid_create_nil(uuid_t *uuid) -{ - memset(uuid, 0, sizeof(*uuid)); -} - int uuid_is_nil(uuid_t *uuid) { @@ -71,64 +61,3 @@ uuid_equal(uuid_t *uuid1, uuid_t *uuid2) { return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; } - -/* - * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom - * 64-bit words. NOTE: This function can not be changed EVER. Although - * brain-dead, some applications depend on this 64-bit value remaining - * persistent. Specifically, DMI vendors store the value as a persistent - * filehandle. - */ -__uint64_t -uuid_hash64(uuid_t *uuid) -{ - __uint64_t *sp = (__uint64_t *)uuid; - - return sp[0] + sp[1]; -} - -int -uuid_table_insert(uuid_t *uuid) -{ - int i, hole; - - mutex_lock(&uuid_monitor); - for (i = 0, hole = -1; i < uuid_table_size; i++) { - if (uuid_is_nil(&uuid_table[i])) { - hole = i; - continue; - } - if (uuid_equal(uuid, &uuid_table[i])) { - mutex_unlock(&uuid_monitor); - return 0; - } - } - if (hole < 0) { - uuid_table = kmem_realloc(uuid_table, - (uuid_table_size + 1) * sizeof(*uuid_table), - uuid_table_size * sizeof(*uuid_table), - KM_SLEEP); - hole = uuid_table_size++; - } - uuid_table[hole] = *uuid; - mutex_unlock(&uuid_monitor); - return 1; -} - -void -uuid_table_remove(uuid_t *uuid) -{ - int i; - - mutex_lock(&uuid_monitor); - for (i = 0; i < uuid_table_size; i++) { - if (uuid_is_nil(&uuid_table[i])) - continue; - if (!uuid_equal(uuid, &uuid_table[i])) - continue; - uuid_create_nil(&uuid_table[i]); - break; - } - ASSERT(i < uuid_table_size); - mutex_unlock(&uuid_monitor); -} diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h index cff5b607d44..4732d71262c 100644 --- a/fs/xfs/support/uuid.h +++ b/fs/xfs/support/uuid.h @@ -22,12 +22,8 @@ typedef struct { unsigned char __u_bits[16]; } uuid_t; -extern void uuid_create_nil(uuid_t *uuid); extern int uuid_is_nil(uuid_t *uuid); extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); -extern __uint64_t uuid_hash64(uuid_t *uuid); -extern int uuid_table_insert(uuid_t *uuid); -extern void uuid_table_remove(uuid_t *uuid); #endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 143d63ecb20..c8641f713ca 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -223,8 +223,8 @@ typedef struct xfs_perag be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) #define XFS_MIN_FREELIST_PAG(pag,mp) \ (XFS_MIN_FREELIST_RAW( \ - (uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ - (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) #define XFS_AGB_TO_FSB(mp,agno,agbno) \ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 028e44e58ea..2cf944eb796 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1872,6 +1872,25 @@ xfs_alloc_compute_maxlevels( } /* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + xfs_extlen_t need, delta = 0; + + need = XFS_MIN_FREELIST_PAG(pag, mp); + if (need > pag->pagf_flcount) + delta = need - pag->pagf_flcount; + + if (pag->pagf_longest > delta) + return pag->pagf_longest - delta; + return pag->pagf_flcount > 0 || pag->pagf_longest > 0; +} + +/* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ @@ -1923,15 +1942,12 @@ xfs_alloc_fix_freelist( } if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - need = XFS_MIN_FREELIST_PAG(pag, mp); - delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; /* * If it looks like there isn't a long enough extent, or enough * total blocks, reject it. */ - longest = (pag->pagf_longest > delta) ? - (pag->pagf_longest - delta) : - (pag->pagf_flcount > 0 || pag->pagf_longest > 0); + need = XFS_MIN_FREELIST_PAG(pag, mp); + longest = xfs_alloc_longest_free_extent(mp, pag); if ((args->minlen + args->alignment + args->minalignslop - 1) > longest || ((int)(pag->pagf_freeblks + pag->pagf_flcount - diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 588172796f7..e704caee10d 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -100,6 +100,12 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ +/* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag); #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 6c323f8a4cd..afdc8911637 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -155,7 +155,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) * minimum offset only needs to be the space required for * the btree root. */ - if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > mp->m_attroffset) + if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > + xfs_default_attroffset(dp)) dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); break; @@ -298,6 +299,26 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) } /* + * After the last attribute is removed revert to original inode format, + * making all literal area available to the data fork once more. + */ +STATIC void +xfs_attr_fork_reset( + struct xfs_inode *ip, + struct xfs_trans *tp) +{ + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + ip->i_d.di_forkoff = 0; + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + + ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_afp == NULL); + + ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* * Remove an attribute from the shortform attribute list structure. */ int @@ -344,22 +365,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) */ totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && - !(args->op_flags & XFS_DA_OP_ADDNAME) && - (mp->m_flags & XFS_MOUNT_ATTR2) && - (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { - /* - * Last attribute now removed, revert to original - * inode format making all literal area available - * to the data fork once more. - */ - xfs_idestroy_fork(dp, XFS_ATTR_FORK); - dp->i_d.di_forkoff = 0; - dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ASSERT(dp->i_d.di_anextents == 0); - ASSERT(dp->i_afp == NULL); - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + (mp->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + !(args->op_flags & XFS_DA_OP_ADDNAME)) { + xfs_attr_fork_reset(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); @@ -786,20 +795,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); - - /* - * Last attribute was removed, revert to original - * inode format making all literal area available - * to the data fork once more. - */ - xfs_idestroy_fork(dp, XFS_ATTR_FORK); - dp->i_d.di_forkoff = 0; - dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ASSERT(dp->i_d.di_anextents == 0); - ASSERT(dp->i_afp == NULL); - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + xfs_attr_fork_reset(dp, args->trans); goto out; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c852cd65aae..3a6ed426327 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2479,7 +2479,7 @@ xfs_bmap_adjacent( fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); /* * If allocating at eof, and there's a previous real block, - * try to use it's last block as our starting point. + * try to use its last block as our starting point. */ if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && !isnullstartblock(ap->prevp->br_startblock) && @@ -2712,9 +2712,6 @@ xfs_bmap_btalloc( xfs_agnumber_t startag; xfs_alloc_arg_t args; xfs_extlen_t blen; - xfs_extlen_t delta; - xfs_extlen_t longest; - xfs_extlen_t need; xfs_extlen_t nextminlen = 0; xfs_perag_t *pag; int nullfb; /* true if ap->firstblock isn't set */ @@ -2796,13 +2793,8 @@ xfs_bmap_btalloc( * See xfs_alloc_fix_freelist... */ if (pag->pagf_init) { - need = XFS_MIN_FREELIST_PAG(pag, mp); - delta = need > pag->pagf_flcount ? - need - pag->pagf_flcount : 0; - longest = (pag->pagf_longest > delta) ? - (pag->pagf_longest - delta) : - (pag->pagf_flcount > 0 || - pag->pagf_longest > 0); + xfs_extlen_t longest; + longest = xfs_alloc_longest_free_extent(mp, pag); if (blen < longest) blen = longest; } else @@ -3577,6 +3569,27 @@ xfs_bmap_extents_to_btree( } /* + * Calculate the default attribute fork offset for newly created inodes. + */ +uint +xfs_default_attroffset( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + uint offset; + + if (mp->m_sb.sb_inodesize == 256) { + offset = XFS_LITINO(mp) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); + } else { + offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); + } + + ASSERT(offset < XFS_LITINO(mp)); + return offset; +} + +/* * Helper routine to reset inode di_forkoff field when switching * attribute fork from local to extent format - we reset it where * possible to make space available for inline data fork extents. @@ -3588,15 +3601,18 @@ xfs_bmap_forkoff_reset( int whichfork) { if (whichfork == XFS_ATTR_FORK && - (ip->i_d.di_format != XFS_DINODE_FMT_DEV) && - (ip->i_d.di_format != XFS_DINODE_FMT_UUID) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && - ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) { - ip->i_d.di_forkoff = mp->m_attroffset >> 3; - ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / - (uint)sizeof(xfs_bmbt_rec_t); - ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) / - (uint)sizeof(xfs_bmbt_rec_t); + ip->i_d.di_format != XFS_DINODE_FMT_DEV && + ip->i_d.di_format != XFS_DINODE_FMT_UUID && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; + + if (dfl_forkoff > ip->i_d.di_forkoff) { + ip->i_d.di_forkoff = dfl_forkoff; + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t); + } } } @@ -4065,7 +4081,7 @@ xfs_bmap_add_attrfork( case XFS_DINODE_FMT_BTREE: ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); if (!ip->i_d.di_forkoff) - ip->i_d.di_forkoff = mp->m_attroffset >> 3; + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; else if (mp->m_flags & XFS_MOUNT_ATTR2) version = 2; break; @@ -4212,12 +4228,12 @@ xfs_bmap_compute_maxlevels( * (a signed 16-bit number, xfs_aextnum_t). * * Note that we can no longer assume that if we are in ATTR1 that - * the fork offset of all the inodes will be (m_attroffset >> 3) - * because we could have mounted with ATTR2 and then mounted back - * with ATTR1, keeping the di_forkoff's fixed but probably at - * various positions. Therefore, for both ATTR1 and ATTR2 - * we have to assume the worst case scenario of a minimum size - * available. + * the fork offset of all the inodes will be + * (xfs_default_attroffset(ip) >> 3) because we could have mounted + * with ATTR2 and then mounted back with ATTR1, keeping the + * di_forkoff's fixed but probably at various positions. Therefore, + * for both ATTR1 and ATTR2 we have to assume the worst case scenario + * of a minimum size available. */ if (whichfork == XFS_DATA_FORK) { maxleafents = MAXEXTNUM; @@ -4804,7 +4820,7 @@ xfs_bmapi( xfs_extlen_t minlen; /* min allocation size */ xfs_mount_t *mp; /* xfs mount structure */ int n; /* current extent index */ - int nallocs; /* number of extents alloc\'d */ + int nallocs; /* number of extents alloc'd */ xfs_extnum_t nextents; /* number of extents in file */ xfs_fileoff_t obno; /* old block number (offset) */ xfs_bmbt_irec_t prev; /* previous file extent record */ @@ -6204,7 +6220,7 @@ xfs_bmap_get_bp( return(bp); } -void +STATIC void xfs_check_block( struct xfs_btree_block *block, xfs_mount_t *mp, @@ -6494,7 +6510,7 @@ xfs_bmap_count_tree( block = XFS_BUF_TO_BLOCK(bp); if (--level) { - /* Not at node above leafs, count this level of nodes */ + /* Not at node above leaves, count this level of nodes */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); while (nextbno != NULLFSBLOCK) { if ((error = xfs_btree_read_bufl(mp, tp, nextbno, diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index be2979d88d3..1b8ff9256bd 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -125,7 +125,7 @@ typedef struct xfs_bmalloca { struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ xfs_extlen_t alen; /* i/o length asked/allocated */ xfs_extlen_t total; /* total blocks needed for xaction */ - xfs_extlen_t minlen; /* mininum allocation size (blocks) */ + xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ char eof; /* set if allocating past last extent */ char wasdel; /* replacing a delayed allocation */ @@ -338,6 +338,10 @@ xfs_check_nostate_extents( xfs_extnum_t idx, xfs_extnum_t num); +uint +xfs_default_attroffset( + struct xfs_inode *ip); + #ifdef __KERNEL__ /* diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e73c332eb23..e9df9957482 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -1883,7 +1883,7 @@ xfs_btree_lshift( /* * We add one entry to the left side and remove one for the right side. - * Accout for it here, the changes will be updated on disk and logged + * Account for it here, the changes will be updated on disk and logged * later. */ lrecs++; @@ -3535,7 +3535,7 @@ xfs_btree_delrec( XFS_BTREE_STATS_INC(cur, join); /* - * Fix up the the number of records and right block pointer in the + * Fix up the number of records and right block pointer in the * surviving block, and log it. */ xfs_btree_set_numrecs(left, lrecs + rrecs); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 789fffdf8b2..4f852b735b9 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -41,7 +41,7 @@ extern kmem_zone_t *xfs_btree_cur_zone; /* * Generic btree header. * - * This is a comination of the actual format used on disk for short and long + * This is a combination of the actual format used on disk for short and long * format btrees. The first three fields are shared by both format, but * the pointers are different and should be used with care. * diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index c45f74ff1a5..9ff6e57a507 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1503,7 +1503,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * This is implemented with some source-level loop unrolling. */ xfs_dahash_t -xfs_da_hashname(const uchar_t *name, int namelen) +xfs_da_hashname(const __uint8_t *name, int namelen) { xfs_dahash_t hash; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 70b710c1792..8c536167bf7 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -91,9 +91,9 @@ enum xfs_dacmp { * Structure to ease passing around component names. */ typedef struct xfs_da_args { - const uchar_t *name; /* string (maybe not NULL terminated) */ + const __uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ - uchar_t *value; /* set of bytes (maybe contain NULLs) */ + __uint8_t *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ int flags; /* argument flags (eg: ATTR_NOCREATE) */ xfs_dahash_t hashval; /* hash value of name */ @@ -185,7 +185,7 @@ typedef struct xfs_da_state { unsigned char inleaf; /* insert into 1->lf, 0->splf */ unsigned char extravalid; /* T/F: extrablk is in use */ unsigned char extraafter; /* T/F: extrablk is after new */ - xfs_da_state_blk_t extrablk; /* for double-splits on leafs */ + xfs_da_state_blk_t extrablk; /* for double-splits on leaves */ /* for dirv2 extrablk is data */ } xfs_da_state_t; @@ -251,7 +251,7 @@ xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, xfs_dabuf_t *dead_buf); -uint xfs_da_hashname(const uchar_t *name_string, int name_length); +uint xfs_da_hashname(const __uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, const char *name, int len); @@ -268,5 +268,6 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf); extern struct kmem_zone *xfs_da_state_zone; extern struct kmem_zone *xfs_dabuf_zone; +extern const struct xfs_nameops xfs_default_nameops; #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index f8278cfcc1d..e6d839bddbf 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -79,6 +79,12 @@ xfs_swapext( goto out_put_target_file; } + if (IS_SWAPFILE(file->f_path.dentry->d_inode) || + IS_SWAPFILE(target_file->f_path.dentry->d_inode)) { + error = XFS_ERROR(EINVAL); + goto out_put_target_file; + } + ip = XFS_I(file->f_path.dentry->d_inode); tip = XFS_I(target_file->f_path.dentry->d_inode); @@ -118,19 +124,17 @@ xfs_swap_extents( xfs_bstat_t *sbp = &sxp->sx_stat; xfs_ifork_t *tempifp, *ifp, *tifp; int ilf_fields, tilf_fields; - static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; int error = 0; int aforkblks = 0; int taforkblks = 0; __uint64_t tmp; - char locked = 0; mp = ip->i_mount; tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); - goto error0; + goto out; } sbp = &sxp->sx_stat; @@ -143,25 +147,24 @@ xfs_swap_extents( */ xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - locked = 1; /* Verify that both files have the same format */ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_unlock; } /* Verify both files are either real-time or non-realtime */ if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_unlock; } /* Should never get a local format */ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_unlock; } if (VN_CACHED(VFS_I(tip)) != 0) { @@ -169,13 +172,13 @@ xfs_swap_extents( error = xfs_flushinval_pages(tip, 0, -1, FI_REMAPF_LOCKED); if (error) - goto error0; + goto out_unlock; } /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_unlock; } /* Verify all data are being swapped */ @@ -183,7 +186,7 @@ xfs_swap_extents( sxp->sx_length != ip->i_d.di_size || sxp->sx_length != tip->i_d.di_size) { error = XFS_ERROR(EFAULT); - goto error0; + goto out_unlock; } /* @@ -193,7 +196,7 @@ xfs_swap_extents( */ if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_unlock; } /* @@ -208,7 +211,7 @@ xfs_swap_extents( (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { error = XFS_ERROR(EBUSY); - goto error0; + goto out_unlock; } /* We need to fail if the file is memory mapped. Once we have tossed @@ -219,7 +222,7 @@ xfs_swap_extents( */ if (VN_MAPPED(VFS_I(ip))) { error = XFS_ERROR(EBUSY); - goto error0; + goto out_unlock; } xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -242,8 +245,7 @@ xfs_swap_extents( xfs_iunlock(ip, XFS_IOLOCK_EXCL); xfs_iunlock(tip, XFS_IOLOCK_EXCL); xfs_trans_cancel(tp, 0); - locked = 0; - goto error0; + goto out; } xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); @@ -253,19 +255,15 @@ xfs_swap_extents( if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); - if (error) { - xfs_trans_cancel(tp, 0); - goto error0; - } + if (error) + goto out_trans_cancel; } if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &taforkblks); - if (error) { - xfs_trans_cancel(tp, 0); - goto error0; - } + if (error) + goto out_trans_cancel; } /* @@ -332,10 +330,10 @@ xfs_swap_extents( IHOLD(ip); - xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); IHOLD(tip); - xfs_trans_ijoin(tp, tip, lock_flags); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_log_inode(tp, ip, ilf_fields); xfs_trans_log_inode(tp, tip, tilf_fields); @@ -344,19 +342,19 @@ xfs_swap_extents( * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - } error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); - locked = 0; - error0: - if (locked) { - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); - } - if (tempifp != NULL) - kmem_free(tempifp); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out: + kmem_free(tempifp); return error; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + goto out_unlock; } diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index 162e8726df5..e5b153b2e6a 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -103,7 +103,9 @@ typedef enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_LITINO(mp) ((mp)->m_litino) +#define XFS_LITINO(mp) \ + ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode))) + #define XFS_BROOT_SIZE_ADJ \ (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 1afb12278b8..c657bec6d95 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -46,8 +46,6 @@ struct xfs_name xfs_name_dotdot = {"..", 2}; -extern const struct xfs_nameops xfs_default_nameops; - /* * ASCII case-insensitive (ie. A-Z) support for directories that was * used in IRIX. diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index e1f0a06aaf0..ab52e9e1c1e 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -448,7 +448,6 @@ xfs_dir2_block_getdents( xfs_mount_t *mp; /* filesystem mount point */ char *ptr; /* current data entry */ int wantoff; /* starting block offset */ - xfs_ino_t ino; xfs_off_t cook; mp = dp->i_mount; @@ -509,16 +508,12 @@ xfs_dir2_block_getdents( cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, (char *)dep - (char *)block); - ino = be64_to_cpu(dep->inumber); -#if XFS_BIG_INUMS - ino += mp->m_inoadd; -#endif /* * If it didn't fit, set the final offset to here & return. */ if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, - ino, DT_UNKNOWN)) { + be64_to_cpu(dep->inumber), DT_UNKNOWN)) { *offset = cook & 0x7fffffff; xfs_da_brelse(NULL, bp); return 0; diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h index b816e025273..efbc290c7fe 100644 --- a/fs/xfs/xfs_dir2_data.h +++ b/fs/xfs/xfs_dir2_data.h @@ -38,7 +38,7 @@ struct xfs_trans; /* * Directory address space divided into sections, - * spaces separated by 32gb. + * spaces separated by 32GB. */ #define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) #define XFS_DIR2_DATA_SPACE 0 diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index ef805a374ee..fa913e45944 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -549,7 +549,7 @@ xfs_dir2_leaf_addname( * Check the internal consistency of a leaf1 block. * Pop an assert if something is wrong. */ -void +STATIC void xfs_dir2_leaf_check( xfs_inode_t *dp, /* incore directory inode */ xfs_dabuf_t *bp) /* leaf's buffer */ @@ -780,7 +780,6 @@ xfs_dir2_leaf_getdents( int ra_index; /* *map index for read-ahead */ int ra_offset; /* map entry offset for ra */ int ra_want; /* readahead count wanted */ - xfs_ino_t ino; /* * If the offset is at or past the largest allowed value, @@ -1076,24 +1075,12 @@ xfs_dir2_leaf_getdents( continue; } - /* - * Copy the entry into the putargs, and try formatting it. - */ dep = (xfs_dir2_data_entry_t *)ptr; - length = xfs_dir2_data_entsize(dep->namelen); - ino = be64_to_cpu(dep->inumber); -#if XFS_BIG_INUMS - ino += mp->m_inoadd; -#endif - - /* - * Won't fit. Return to caller. - */ if (filldir(dirent, dep->name, dep->namelen, xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, - ino, DT_UNKNOWN)) + be64_to_cpu(dep->inumber), DT_UNKNOWN)) break; /* diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index fa6c3a5ddbc..5a81ccd1045 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -1104,7 +1104,7 @@ xfs_dir2_leafn_remove( } xfs_dir2_leafn_check(dp, bp); /* - * Return indication of whether this leaf block is emtpy enough + * Return indication of whether this leaf block is empty enough * to justify trying to join it with a neighbor. */ *rval = diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index a8a8a6efad5..e89734e8464 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -748,11 +748,7 @@ xfs_dir2_sf_getdents( * Put . entry unless we're starting past it. */ if (*offset <= dot_offset) { - ino = dp->i_ino; -#if XFS_BIG_INUMS - ino += mp->m_inoadd; -#endif - if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) { + if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) { *offset = dot_offset & 0x7fffffff; return 0; } @@ -763,9 +759,6 @@ xfs_dir2_sf_getdents( */ if (*offset <= dotdot_offset) { ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); -#if XFS_BIG_INUMS - ino += mp->m_inoadd; -#endif if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { *offset = dotdot_offset & 0x7fffffff; return 0; @@ -786,10 +779,6 @@ xfs_dir2_sf_getdents( } ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); -#if XFS_BIG_INUMS - ino += mp->m_inoadd; -#endif - if (filldir(dirent, sfep->name, sfep->namelen, off & 0x7fffffff, ino, DT_UNKNOWN)) { *offset = off & 0x7fffffff; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 2f049f63e85..0d22c56fdf6 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -33,12 +33,10 @@ typedef struct xfs_extent { * conversion routine. */ -#ifndef HAVE_FORMAT32 typedef struct xfs_extent_32 { __uint64_t ext_start; __uint32_t ext_len; } __attribute__((packed)) xfs_extent_32_t; -#endif typedef struct xfs_extent_64 { __uint64_t ext_start; @@ -59,7 +57,6 @@ typedef struct xfs_efi_log_format { xfs_extent_t efi_extents[1]; /* array of extents to free */ } xfs_efi_log_format_t; -#ifndef HAVE_FORMAT32 typedef struct xfs_efi_log_format_32 { __uint16_t efi_type; /* efi log item type */ __uint16_t efi_size; /* size of this item */ @@ -67,7 +64,6 @@ typedef struct xfs_efi_log_format_32 { __uint64_t efi_id; /* efi identifier */ xfs_extent_32_t efi_extents[1]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; -#endif typedef struct xfs_efi_log_format_64 { __uint16_t efi_type; /* efi log item type */ @@ -90,7 +86,6 @@ typedef struct xfs_efd_log_format { xfs_extent_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_t; -#ifndef HAVE_FORMAT32 typedef struct xfs_efd_log_format_32 { __uint16_t efd_type; /* efd log item type */ __uint16_t efd_size; /* size of this item */ @@ -98,7 +93,6 @@ typedef struct xfs_efd_log_format_32 { __uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_32_t efd_extents[1]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; -#endif typedef struct xfs_efd_log_format_64 { __uint16_t efd_type; /* efd log item type */ diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index f3bb75da384..6c87c8f304e 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -140,7 +140,7 @@ _xfs_filestream_pick_ag( xfs_extlen_t minlen) { int err, trylock, nscan; - xfs_extlen_t delta, longest, need, free, minfree, maxfree = 0; + xfs_extlen_t longest, free, minfree, maxfree = 0; xfs_agnumber_t ag, max_ag = NULLAGNUMBER; struct xfs_perag *pag; @@ -186,12 +186,7 @@ _xfs_filestream_pick_ag( goto next_ag; } - need = XFS_MIN_FREELIST_PAG(pag, mp); - delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; - longest = (pag->pagf_longest > delta) ? - (pag->pagf_longest - delta) : - (pag->pagf_flcount > 0 || pag->pagf_longest > 0); - + longest = xfs_alloc_longest_free_extent(mp, pag); if (((minlen && longest >= minlen) || (!minlen && pag->pagf_freeblks >= minfree)) && (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 680d0e0ec93..8379e3bca26 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -576,7 +576,7 @@ out: if (fdblks_delta) { /* * If we are putting blocks back here, m_resblks_avail is - * already at it's max so this will put it in the free pool. + * already at its max so this will put it in the free pool. * * If we need space, we'll either succeed in getting it * from the free block count or we'll get an enospc. If diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index ab016e5ae7b..3120a3a5e20 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -230,7 +230,7 @@ xfs_ialloc_ag_alloc( args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; /* Allow space for the inode btree to split. */ - args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; + args.minleft = args.mp->m_in_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; } else @@ -270,7 +270,7 @@ xfs_ialloc_ag_alloc( /* * Allow space for the inode btree to split. */ - args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; + args.minleft = args.mp->m_in_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -349,7 +349,7 @@ xfs_ialloc_ag_alloc( * Initialize all inodes in this buffer and then log them. * * XXX: It would be much better if we had just one transaction to - * log a whole cluster of inodes instead of all the indivdual + * log a whole cluster of inodes instead of all the individual * transactions causing a lot of log traffic. */ xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); @@ -943,7 +943,7 @@ nextag: ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); - XFS_INOBT_CLR_FREE(&rec, offset); + rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) @@ -1105,11 +1105,11 @@ xfs_difree( */ off = agino - rec.ir_startino; ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); - ASSERT(!XFS_INOBT_IS_FREE(&rec, off)); + ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off))); /* * Mark the inode free & increment the count. */ - XFS_INOBT_SET_FREE(&rec, off); + rec.ir_free |= XFS_INOBT_MASK(off); rec.ir_freecount++; /* diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 99f2408e8d8..c282a9af539 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -164,7 +164,7 @@ xfs_inobt_init_rec_from_cur( } /* - * intial value of ptr for lookup + * initial value of ptr for lookup */ STATIC void xfs_inobt_init_ptr_from_cur( diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 5580e255ff0..f782ad0c476 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -32,14 +32,14 @@ struct xfs_mount; #define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ typedef __uint64_t xfs_inofree_t; -#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) +#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) #define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) -#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) +#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) +#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { - return (((n) >= XFS_INODES_PER_CHUNK ? \ - (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i); + return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; } /* @@ -69,20 +69,6 @@ typedef struct xfs_inobt_key { typedef __be32 xfs_inobt_ptr_t; /* - * Bit manipulations for ir_free. - */ -#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) -#define XFS_INOBT_IS_FREE(rp,i) \ - (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0) -#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i)) -#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i)) - -/* - * Maximum number of inode btree levels. - */ -#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels) - -/* * block numbers in the AG. */ #define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1f175fa34b2..f879c1bc4b9 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -122,7 +122,7 @@ typedef struct xfs_ictimestamp { /* * NOTE: This structure must be kept identical to struct xfs_dinode - * in xfs_dinode.h except for the endianess annotations. + * in xfs_dinode.h except for the endianness annotations. */ typedef struct xfs_icdinode { __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 9957d0602d5..a52ac125f05 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -40,7 +40,6 @@ typedef struct xfs_inode_log_format { __int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_t; -#ifndef HAVE_FORMAT32 typedef struct xfs_inode_log_format_32 { __uint16_t ilf_type; /* inode log item type */ __uint16_t ilf_size; /* size of this item */ @@ -56,7 +55,6 @@ typedef struct xfs_inode_log_format_32 { __int32_t ilf_len; /* len of inode buffer */ __int32_t ilf_boffset; /* off of inode in buffer */ } __attribute__((packed)) xfs_inode_log_format_32_t; -#endif typedef struct xfs_inode_log_format_64 { __uint16_t ilf_type; /* inode log item type */ diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index ee1a0c134cc..a1cc1322fc0 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -63,7 +63,7 @@ typedef enum { */ typedef struct xfs_iomap { - xfs_daddr_t iomap_bn; /* first 512b blk of mapping */ + xfs_daddr_t iomap_bn; /* first 512B blk of mapping */ xfs_buftarg_t *iomap_target; xfs_off_t iomap_offset; /* offset of mapping, bytes */ xfs_off_t iomap_bsize; /* size of mapping, bytes */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index cf98a805ec9..aeb2d2221c7 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -83,7 +83,12 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime); + /* + * We are reading the atime from the Linux inode because the + * dinode might not be uptodate. + */ + buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec; + buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec; buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; @@ -579,7 +584,7 @@ xfs_bulkstat( * first inode of the cluster. * * Careful with clustidx. There can be - * multple clusters per chunk, a single + * multiple clusters per chunk, a single * cluster per chunk or a cluster that has * inodes represented from several different * chunks (if blocksize is large). diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f4726f702a9..f76c6d7cea2 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -574,7 +574,7 @@ xfs_log_mount( error = xfs_trans_ail_init(mp); if (error) { cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); - goto error; + goto out_free_log; } mp->m_log->l_ailp = mp->m_ail; @@ -594,20 +594,22 @@ xfs_log_mount( mp->m_flags |= XFS_MOUNT_RDONLY; if (error) { cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); - goto error; + goto out_destroy_ail; } } /* Normal transactions can now occur */ mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; - /* End mounting message in xfs_log_mount_finish */ return 0; -error: - xfs_log_unmount_dealloc(mp); + +out_destroy_ail: + xfs_trans_ail_destroy(mp); +out_free_log: + xlog_dealloc_log(mp->m_log); out: return error; -} /* xfs_log_mount */ +} /* * Finish the recovery of the file system. This is separate from @@ -633,19 +635,6 @@ xfs_log_mount_finish(xfs_mount_t *mp) } /* - * Unmount processing for the log. - */ -int -xfs_log_unmount(xfs_mount_t *mp) -{ - int error; - - error = xfs_log_unmount_write(mp); - xfs_log_unmount_dealloc(mp); - return error; -} - -/* * Final log writes as part of unmount. * * Mark the filesystem clean as unmount happens. Note that during relocation @@ -795,7 +784,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) * and deallocate the log as the aild references the log. */ void -xfs_log_unmount_dealloc(xfs_mount_t *mp) +xfs_log_unmount(xfs_mount_t *mp) { xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); @@ -1109,7 +1098,7 @@ xlog_bdstrat_cb(struct xfs_buf *bp) /* * Return size of each in-core log record buffer. * - * All machines get 8 x 32KB buffers by default, unless tuned otherwise. + * All machines get 8 x 32kB buffers by default, unless tuned otherwise. * * If the filesystem blocksize is too large, we may need to choose a * larger size since the directory code currently logs entire blocks. @@ -1139,8 +1128,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp, } if (xfs_sb_version_haslogv2(&mp->m_sb)) { - /* # headers = size / 32K - * one header holds cycles from 32K of data + /* # headers = size / 32k + * one header holds cycles from 32k of data */ xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; @@ -1156,7 +1145,7 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp, goto done; } - /* All machines use 32KB buffers by default. */ + /* All machines use 32kB buffers by default. */ log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; @@ -1164,32 +1153,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp, log->l_iclog_hsize = BBSIZE; log->l_iclog_heads = 1; - /* - * For 16KB, we use 3 32KB buffers. For 32KB block sizes, we use - * 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers. - */ - if (mp->m_sb.sb_blocksize >= 16*1024) { - log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; - log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; - if (mp->m_logbufs <= 0) { - switch (mp->m_sb.sb_blocksize) { - case 16*1024: /* 16 KB */ - log->l_iclog_bufs = 3; - break; - case 32*1024: /* 32 KB */ - log->l_iclog_bufs = 4; - break; - case 64*1024: /* 64 KB */ - log->l_iclog_bufs = 8; - break; - default: - xlog_panic("XFS: Invalid blocksize"); - break; - } - } - } - -done: /* are we being asked to make the sizes selected above visible? */ +done: + /* are we being asked to make the sizes selected above visible? */ if (mp->m_logbufs == 0) mp->m_logbufs = log->l_iclog_bufs; if (mp->m_logbsize == 0) @@ -3214,7 +3179,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) */ /* - * Free a used ticket when it's refcount falls to zero. + * Free a used ticket when its refcount falls to zero. */ void xfs_log_ticket_put( diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 8a3e84e900a..d0c9baa50b1 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -170,9 +170,8 @@ int xfs_log_write(struct xfs_mount *mp, int nentries, xfs_log_ticket_t ticket, xfs_lsn_t *start_lsn); -int xfs_log_unmount(struct xfs_mount *mp); int xfs_log_unmount_write(struct xfs_mount *mp); -void xfs_log_unmount_dealloc(struct xfs_mount *mp); +void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); int xfs_log_need_covered(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 654167be0ef..bcad5f4c1fd 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -359,7 +359,7 @@ typedef struct xlog_in_core { int ic_size; int ic_offset; int ic_bwritecnt; - ushort_t ic_state; + unsigned short ic_state; char *ic_datap; /* pointer to iclog data */ #ifdef XFS_LOG_TRACE struct ktrace *ic_trace; @@ -455,7 +455,6 @@ extern void xlog_recover_process_iunlinks(xlog_t *log); extern struct xfs_buf *xlog_get_bp(xlog_t *, int); extern void xlog_put_bp(struct xfs_buf *); -extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); extern kmem_zone_t *xfs_log_ticket_zone; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 61af610d79b..7ba450116d4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -94,12 +94,30 @@ xlog_put_bp( xfs_buf_free(bp); } +STATIC xfs_caddr_t +xlog_align( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp) +{ + xfs_caddr_t ptr; + + if (!log->l_sectbb_log) + return XFS_BUF_PTR(bp); + + ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); + ASSERT(XFS_BUF_SIZE(bp) >= + BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); + return ptr; +} + /* * nbblks should be uint, but oh well. Just want to catch that 32-bit length. */ -int -xlog_bread( +STATIC int +xlog_bread_noalign( xlog_t *log, xfs_daddr_t blk_no, int nbblks, @@ -137,6 +155,24 @@ xlog_bread( return error; } +STATIC int +xlog_bread( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp, + xfs_caddr_t *offset) +{ + int error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + if (error) + return error; + + *offset = xlog_align(log, blk_no, nbblks, bp); + return 0; +} + /* * Write out the buffer at the given block for the given number of blocks. * The buffer is kept locked across the write and is returned locked. @@ -180,24 +216,6 @@ xlog_bwrite( return error; } -STATIC xfs_caddr_t -xlog_align( - xlog_t *log, - xfs_daddr_t blk_no, - int nbblks, - xfs_buf_t *bp) -{ - xfs_caddr_t ptr; - - if (!log->l_sectbb_log) - return XFS_BUF_PTR(bp); - - ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); - ASSERT(XFS_BUF_SIZE(bp) >= - BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); - return ptr; -} - #ifdef DEBUG /* * dump debug superblock and log record information @@ -211,11 +229,11 @@ xlog_header_check_dump( cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); for (b = 0; b < 16; b++) - cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); + cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]); cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); cmn_err(CE_DEBUG, " log : uuid = "); for (b = 0; b < 16; b++) - cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]); + cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]); cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); } #else @@ -321,9 +339,9 @@ xlog_find_cycle_start( mid_blk = BLK_AVG(first_blk, *last_blk); while (mid_blk != first_blk && mid_blk != *last_blk) { - if ((error = xlog_bread(log, mid_blk, 1, bp))) + error = xlog_bread(log, mid_blk, 1, bp, &offset); + if (error) return error; - offset = xlog_align(log, mid_blk, 1, bp); mid_cycle = xlog_get_cycle(offset); if (mid_cycle == cycle) { *last_blk = mid_blk; @@ -379,10 +397,10 @@ xlog_find_verify_cycle( bcount = min(bufblks, (start_blk + nbblks - i)); - if ((error = xlog_bread(log, i, bcount, bp))) + error = xlog_bread(log, i, bcount, bp, &buf); + if (error) goto out; - buf = xlog_align(log, i, bcount, bp); for (j = 0; j < bcount; j++) { cycle = xlog_get_cycle(buf); if (cycle == stop_on_cycle_no) { @@ -436,9 +454,9 @@ xlog_find_verify_log_record( return ENOMEM; smallmem = 1; } else { - if ((error = xlog_bread(log, start_blk, num_blks, bp))) + error = xlog_bread(log, start_blk, num_blks, bp, &offset); + if (error) goto out; - offset = xlog_align(log, start_blk, num_blks, bp); offset += ((num_blks - 1) << BBSHIFT); } @@ -453,9 +471,9 @@ xlog_find_verify_log_record( } if (smallmem) { - if ((error = xlog_bread(log, i, 1, bp))) + error = xlog_bread(log, i, 1, bp, &offset); + if (error) goto out; - offset = xlog_align(log, i, 1, bp); } head = (xlog_rec_header_t *)offset; @@ -559,15 +577,18 @@ xlog_find_head( bp = xlog_get_bp(log, 1); if (!bp) return ENOMEM; - if ((error = xlog_bread(log, 0, 1, bp))) + + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, 0, 1, bp); + first_half_cycle = xlog_get_cycle(offset); last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ - if ((error = xlog_bread(log, last_blk, 1, bp))) + error = xlog_bread(log, last_blk, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, last_blk, 1, bp); + last_half_cycle = xlog_get_cycle(offset); ASSERT(last_half_cycle != 0); @@ -817,9 +838,10 @@ xlog_find_tail( if (!bp) return ENOMEM; if (*head_blk == 0) { /* special case */ - if ((error = xlog_bread(log, 0, 1, bp))) + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) goto bread_err; - offset = xlog_align(log, 0, 1, bp); + if (xlog_get_cycle(offset) == 0) { *tail_blk = 0; /* leave all other log inited values alone */ @@ -832,9 +854,10 @@ xlog_find_tail( */ ASSERT(*head_blk < INT_MAX); for (i = (int)(*head_blk) - 1; i >= 0; i--) { - if ((error = xlog_bread(log, i, 1, bp))) + error = xlog_bread(log, i, 1, bp, &offset); + if (error) goto bread_err; - offset = xlog_align(log, i, 1, bp); + if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { found = 1; break; @@ -848,9 +871,10 @@ xlog_find_tail( */ if (!found) { for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { - if ((error = xlog_bread(log, i, 1, bp))) + error = xlog_bread(log, i, 1, bp, &offset); + if (error) goto bread_err; - offset = xlog_align(log, i, 1, bp); + if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { found = 2; @@ -922,10 +946,10 @@ xlog_find_tail( if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = (i + hblks) % log->l_logBBsize; - if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { + error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + if (error) goto bread_err; - } - offset = xlog_align(log, umount_data_blk, 1, bp); + op_head = (xlog_op_header_t *)offset; if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { /* @@ -1017,9 +1041,10 @@ xlog_find_zeroed( bp = xlog_get_bp(log, 1); if (!bp) return ENOMEM; - if ((error = xlog_bread(log, 0, 1, bp))) + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, 0, 1, bp); + first_cycle = xlog_get_cycle(offset); if (first_cycle == 0) { /* completely zeroed log */ *blk_no = 0; @@ -1028,9 +1053,10 @@ xlog_find_zeroed( } /* check partially zeroed log */ - if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) + error = xlog_bread(log, log_bbnum-1, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, log_bbnum-1, 1, bp); + last_cycle = xlog_get_cycle(offset); if (last_cycle != 0) { /* log completely written to */ xlog_put_bp(bp); @@ -1152,10 +1178,10 @@ xlog_write_log_records( */ balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); if (balign != start_block) { - if ((error = xlog_bread(log, start_block, 1, bp))) { - xlog_put_bp(bp); - return error; - } + error = xlog_bread_noalign(log, start_block, 1, bp); + if (error) + goto out_put_bp; + j = start_block - balign; } @@ -1175,10 +1201,14 @@ xlog_write_log_records( balign = BBTOB(ealign - start_block); error = XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb)); - if (!error) - error = xlog_bread(log, ealign, sectbb, bp); - if (!error) - error = XFS_BUF_SET_PTR(bp, offset, bufblks); + if (error) + break; + + error = xlog_bread_noalign(log, ealign, sectbb, bp); + if (error) + break; + + error = XFS_BUF_SET_PTR(bp, offset, bufblks); if (error) break; } @@ -1195,6 +1225,8 @@ xlog_write_log_records( start_block += endcount; j = 0; } + + out_put_bp: xlog_put_bp(bp); return error; } @@ -2511,16 +2543,10 @@ xlog_recover_do_inode_trans( } write_inode_buffer: - if (ITEM_TYPE(item) == XFS_LI_INODE) { - ASSERT(bp->b_mount == NULL || bp->b_mount == mp); - bp->b_mount = mp; - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); - xfs_bdwrite(mp, bp); - } else { - XFS_BUF_STALE(bp); - error = xfs_bwrite(mp, bp); - } - + ASSERT(bp->b_mount == NULL || bp->b_mount == mp); + bp->b_mount = mp; + XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + xfs_bdwrite(mp, bp); error: if (need_free) kmem_free(in_f); @@ -2769,51 +2795,48 @@ xlog_recover_do_trans( int error = 0; xlog_recover_item_t *item, *first_item; - if ((error = xlog_recover_reorder_trans(trans))) + error = xlog_recover_reorder_trans(trans); + if (error) return error; + first_item = item = trans->r_itemq; do { - /* - * we don't need to worry about the block number being - * truncated in > 1 TB buffers because in user-land, - * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so - * the blknos will get through the user-mode buffer - * cache properly. The only bad case is o32 kernels - * where xfs_daddr_t is 32-bits but mount will warn us - * off a > 1 TB filesystem before we get here. - */ - if ((ITEM_TYPE(item) == XFS_LI_BUF)) { - if ((error = xlog_recover_do_buffer_trans(log, item, - pass))) - break; - } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) { - if ((error = xlog_recover_do_inode_trans(log, item, - pass))) - break; - } else if (ITEM_TYPE(item) == XFS_LI_EFI) { - if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn, - pass))) - break; - } else if (ITEM_TYPE(item) == XFS_LI_EFD) { + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + error = xlog_recover_do_buffer_trans(log, item, pass); + break; + case XFS_LI_INODE: + error = xlog_recover_do_inode_trans(log, item, pass); + break; + case XFS_LI_EFI: + error = xlog_recover_do_efi_trans(log, item, + trans->r_lsn, pass); + break; + case XFS_LI_EFD: xlog_recover_do_efd_trans(log, item, pass); - } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) { - if ((error = xlog_recover_do_dquot_trans(log, item, - pass))) - break; - } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) { - if ((error = xlog_recover_do_quotaoff_trans(log, item, - pass))) - break; - } else { - xlog_warn("XFS: xlog_recover_do_trans"); + error = 0; + break; + case XFS_LI_DQUOT: + error = xlog_recover_do_dquot_trans(log, item, pass); + break; + case XFS_LI_QUOTAOFF: + error = xlog_recover_do_quotaoff_trans(log, item, + pass); + break; + default: + xlog_warn( + "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item)); ASSERT(0); error = XFS_ERROR(EIO); break; } + + if (error) + return error; item = item->ri_next; } while (first_item != item); - return error; + return 0; } /* @@ -3490,9 +3513,11 @@ xlog_do_recovery_pass( hbp = xlog_get_bp(log, 1); if (!hbp) return ENOMEM; - if ((error = xlog_bread(log, tail_blk, 1, hbp))) + + error = xlog_bread(log, tail_blk, 1, hbp, &offset); + if (error) goto bread_err1; - offset = xlog_align(log, tail_blk, 1, hbp); + rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, tail_blk); if (error) @@ -3526,9 +3551,10 @@ xlog_do_recovery_pass( memset(rhash, 0, sizeof(rhash)); if (tail_blk <= head_blk) { for (blk_no = tail_blk; blk_no < head_blk; ) { - if ((error = xlog_bread(log, blk_no, hblks, hbp))) + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, hblks, hbp); + rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, blk_no); if (error) @@ -3536,10 +3562,11 @@ xlog_do_recovery_pass( /* blocks in data section */ bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); - error = xlog_bread(log, blk_no + hblks, bblks, dbp); + error = xlog_bread(log, blk_no + hblks, bblks, dbp, + &offset); if (error) goto bread_err2; - offset = xlog_align(log, blk_no + hblks, bblks, dbp); + xlog_unpack_data(rhead, offset, log); if ((error = xlog_recover_process_data(log, rhash, rhead, offset, pass))) @@ -3562,10 +3589,10 @@ xlog_do_recovery_pass( wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { /* Read header in one read */ - error = xlog_bread(log, blk_no, hblks, hbp); + error = xlog_bread(log, blk_no, hblks, hbp, + &offset); if (error) goto bread_err2; - offset = xlog_align(log, blk_no, hblks, hbp); } else { /* This LR is split across physical log end */ if (blk_no != log->l_logBBsize) { @@ -3573,12 +3600,13 @@ xlog_do_recovery_pass( ASSERT(blk_no <= INT_MAX); split_hblks = log->l_logBBsize - (int)blk_no; ASSERT(split_hblks > 0); - if ((error = xlog_bread(log, blk_no, - split_hblks, hbp))) + error = xlog_bread(log, blk_no, + split_hblks, hbp, + &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, - split_hblks, hbp); } + /* * Note: this black magic still works with * large sector sizes (non-512) only because: @@ -3596,14 +3624,19 @@ xlog_do_recovery_pass( error = XFS_BUF_SET_PTR(hbp, bufaddr + BBTOB(split_hblks), BBTOB(hblks - split_hblks)); - if (!error) - error = xlog_bread(log, 0, - wrapped_hblks, hbp); - if (!error) - error = XFS_BUF_SET_PTR(hbp, bufaddr, + if (error) + goto bread_err2; + + error = xlog_bread_noalign(log, 0, + wrapped_hblks, hbp); + if (error) + goto bread_err2; + + error = XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks)); if (error) goto bread_err2; + if (!offset) offset = xlog_align(log, 0, wrapped_hblks, hbp); @@ -3619,10 +3652,10 @@ xlog_do_recovery_pass( /* Read in data for log record */ if (blk_no + bblks <= log->l_logBBsize) { - error = xlog_bread(log, blk_no, bblks, dbp); + error = xlog_bread(log, blk_no, bblks, dbp, + &offset); if (error) goto bread_err2; - offset = xlog_align(log, blk_no, bblks, dbp); } else { /* This log record is split across the * physical end of log */ @@ -3636,12 +3669,13 @@ xlog_do_recovery_pass( split_bblks = log->l_logBBsize - (int)blk_no; ASSERT(split_bblks > 0); - if ((error = xlog_bread(log, blk_no, - split_bblks, dbp))) + error = xlog_bread(log, blk_no, + split_bblks, dbp, + &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, - split_bblks, dbp); } + /* * Note: this black magic still works with * large sector sizes (non-512) only because: @@ -3658,15 +3692,19 @@ xlog_do_recovery_pass( error = XFS_BUF_SET_PTR(dbp, bufaddr + BBTOB(split_bblks), BBTOB(bblks - split_bblks)); - if (!error) - error = xlog_bread(log, wrapped_hblks, - bblks - split_bblks, - dbp); - if (!error) - error = XFS_BUF_SET_PTR(dbp, bufaddr, - h_size); if (error) goto bread_err2; + + error = xlog_bread_noalign(log, wrapped_hblks, + bblks - split_bblks, + dbp); + if (error) + goto bread_err2; + + error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size); + if (error) + goto bread_err2; + if (!offset) offset = xlog_align(log, wrapped_hblks, bblks - split_bblks, dbp); @@ -3683,17 +3721,21 @@ xlog_do_recovery_pass( /* read first part of physical log */ while (blk_no < head_blk) { - if ((error = xlog_bread(log, blk_no, hblks, hbp))) + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, hblks, hbp); + rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, blk_no); if (error) goto bread_err2; + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); - if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) + error = xlog_bread(log, blk_no+hblks, bblks, dbp, + &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no+hblks, bblks, dbp); + xlog_unpack_data(rhead, offset, log); if ((error = xlog_recover_process_data(log, rhash, rhead, offset, pass))) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 35300250e86..b101990df02 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -45,7 +45,6 @@ #include "xfs_fsops.h" #include "xfs_utils.h" -STATIC int xfs_uuid_mount(xfs_mount_t *); STATIC void xfs_unmountfs_wait(xfs_mount_t *); @@ -121,6 +120,84 @@ static const struct { { sizeof(xfs_sb_t), 0 } }; +static DEFINE_MUTEX(xfs_uuid_table_mutex); +static int xfs_uuid_table_size; +static uuid_t *xfs_uuid_table; + +/* + * See if the UUID is unique among mounted XFS filesystems. + * Mount fails if UUID is nil or a FS with the same UUID is already mounted. + */ +STATIC int +xfs_uuid_mount( + struct xfs_mount *mp) +{ + uuid_t *uuid = &mp->m_sb.sb_uuid; + int hole, i; + + if (mp->m_flags & XFS_MOUNT_NOUUID) + return 0; + + if (uuid_is_nil(uuid)) { + cmn_err(CE_WARN, + "XFS: Filesystem %s has nil UUID - can't mount", + mp->m_fsname); + return XFS_ERROR(EINVAL); + } + + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) { + hole = i; + continue; + } + if (uuid_equal(uuid, &xfs_uuid_table[i])) + goto out_duplicate; + } + + if (hole < 0) { + xfs_uuid_table = kmem_realloc(xfs_uuid_table, + (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), + xfs_uuid_table_size * sizeof(*xfs_uuid_table), + KM_SLEEP); + hole = xfs_uuid_table_size++; + } + xfs_uuid_table[hole] = *uuid; + mutex_unlock(&xfs_uuid_table_mutex); + + return 0; + + out_duplicate: + mutex_unlock(&xfs_uuid_table_mutex); + cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount", + mp->m_fsname); + return XFS_ERROR(EINVAL); +} + +STATIC void +xfs_uuid_unmount( + struct xfs_mount *mp) +{ + uuid_t *uuid = &mp->m_sb.sb_uuid; + int i; + + if (mp->m_flags & XFS_MOUNT_NOUUID) + return; + + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) + continue; + if (!uuid_equal(uuid, &xfs_uuid_table[i])) + continue; + memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); + break; + } + ASSERT(i < xfs_uuid_table_size); + mutex_unlock(&xfs_uuid_table_mutex); +} + + /* * Free up the resources associated with a mount structure. Assume that * the structure was initially zeroed, so we can tell which fields got @@ -256,6 +333,22 @@ xfs_mount_validate_sb( return XFS_ERROR(ENOSYS); } + /* + * Currently only very few inode sizes are supported. + */ + switch (sbp->sb_inodesize) { + case 256: + case 512: + case 1024: + case 2048: + break; + default: + xfs_fs_mount_cmn_err(flags, + "inode size of %d bytes not supported", + sbp->sb_inodesize); + return XFS_ERROR(ENOSYS); + } + if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { xfs_fs_mount_cmn_err(flags, @@ -574,32 +667,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; - mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode); mp->m_blockmask = sbp->sb_blocksize - 1; mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; mp->m_blockwmask = mp->m_blockwsize - 1; - /* - * Setup for attributes, in case they get created. - * This value is for inodes getting attributes for the first time, - * the per-inode value is for old attribute values. - */ - ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048); - switch (sbp->sb_inodesize) { - case 256: - mp->m_attroffset = XFS_LITINO(mp) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - break; - case 512: - case 1024: - case 2048: - mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - break; - default: - ASSERT(0); - } - ASSERT(mp->m_attroffset < XFS_LITINO(mp)); - mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; @@ -645,7 +716,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) for (index = 0; index < agcount; index++) { /* * read the agf, then the agi. This gets us - * all the inforamtion we need and populates the + * all the information we need and populates the * per-ag structures for us. */ error = xfs_alloc_pagf_init(mp, NULL, index, 0); @@ -886,8 +957,6 @@ xfs_check_sizes(xfs_mount_t *mp) } /* - * xfs_mountfs - * * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct * - if we're a 32-bit kernel, do a size check on the superblock @@ -905,7 +974,6 @@ xfs_mountfs( xfs_inode_t *rip; __uint64_t resblks; uint quotamount, quotaflags; - int uuid_mounted = 0; int error = 0; xfs_mount_common(mp, sbp); @@ -960,7 +1028,7 @@ xfs_mountfs( */ error = xfs_update_alignment(mp); if (error) - goto error1; + goto out; xfs_alloc_compute_maxlevels(mp); xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); @@ -971,19 +1039,9 @@ xfs_mountfs( mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); - /* - * XFS uses the uuid from the superblock as the unique - * identifier for fsid. We can not use the uuid from the volume - * since a single partition filesystem is identical to a single - * partition volume/filesystem. - */ - if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) { - if (xfs_uuid_mount(mp)) { - error = XFS_ERROR(EINVAL); - goto error1; - } - uuid_mounted=1; - } + error = xfs_uuid_mount(mp); + if (error) + goto out; /* * Set the minimum read and write sizes @@ -1007,7 +1065,7 @@ xfs_mountfs( */ error = xfs_check_sizes(mp); if (error) - goto error1; + goto out_remove_uuid; /* * Initialize realtime fields in the mount structure @@ -1015,7 +1073,7 @@ xfs_mountfs( error = xfs_rtmount_init(mp); if (error) { cmn_err(CE_WARN, "XFS: RT mount failed"); - goto error1; + goto out_remove_uuid; } /* @@ -1045,26 +1103,26 @@ xfs_mountfs( mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_MAYFAIL); if (!mp->m_perag) - goto error1; + goto out_remove_uuid; mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); + if (!sbp->sb_logblocks) { + cmn_err(CE_WARN, "XFS: no log defined"); + XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto out_free_perag; + } + /* * log's mount-time initialization. Perform 1st part recovery if needed */ - if (likely(sbp->sb_logblocks > 0)) { /* check for volume case */ - error = xfs_log_mount(mp, mp->m_logdev_targp, - XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), - XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); - if (error) { - cmn_err(CE_WARN, "XFS: log mount failed"); - goto error2; - } - } else { /* No log has been defined */ - cmn_err(CE_WARN, "XFS: no log defined"); - XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp); - error = XFS_ERROR(EFSCORRUPTED); - goto error2; + error = xfs_log_mount(mp, mp->m_logdev_targp, + XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), + XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); + if (error) { + cmn_err(CE_WARN, "XFS: log mount failed"); + goto out_free_perag; } /* @@ -1086,15 +1144,14 @@ xfs_mountfs( * If we are currently making the filesystem, the initialisation will * fail as the perag data is in an undefined state. */ - if (xfs_sb_version_haslazysbcount(&mp->m_sb) && !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && !mp->m_sb.sb_inprogress) { error = xfs_initialize_perag_data(mp, sbp->sb_agcount); - if (error) { - goto error2; - } + if (error) + goto out_free_perag; } + /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. @@ -1102,7 +1159,7 @@ xfs_mountfs( error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0); if (error) { cmn_err(CE_WARN, "XFS: failed to read root inode"); - goto error3; + goto out_log_dealloc; } ASSERT(rip != NULL); @@ -1116,7 +1173,7 @@ xfs_mountfs( XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); - goto error4; + goto out_rele_rip; } mp->m_rootip = rip; /* save it */ @@ -1131,7 +1188,7 @@ xfs_mountfs( * Free up the root inode. */ cmn_err(CE_WARN, "XFS: failed to read RT inodes"); - goto error4; + goto out_rele_rip; } /* @@ -1143,7 +1200,7 @@ xfs_mountfs( error = xfs_mount_log_sb(mp, mp->m_update_flags); if (error) { cmn_err(CE_WARN, "XFS: failed to write sb changes"); - goto error4; + goto out_rtunmount; } } @@ -1152,7 +1209,7 @@ xfs_mountfs( */ error = XFS_QM_INIT(mp, "amount, "aflags); if (error) - goto error4; + goto out_rtunmount; /* * Finish recovering the file system. This part needed to be @@ -1162,7 +1219,7 @@ xfs_mountfs( error = xfs_log_mount_finish(mp); if (error) { cmn_err(CE_WARN, "XFS: log mount finish failed"); - goto error4; + goto out_rtunmount; } /* @@ -1170,7 +1227,7 @@ xfs_mountfs( */ error = XFS_QM_MOUNT(mp, quotamount, quotaflags); if (error) - goto error4; + goto out_rtunmount; /* * Now we are mounted, reserve a small amount of unused space for @@ -1194,18 +1251,17 @@ xfs_mountfs( return 0; - error4: - /* - * Free up the root inode. - */ + out_rtunmount: + xfs_rtunmount_inodes(mp); + out_rele_rip: IRELE(rip); - error3: - xfs_log_unmount_dealloc(mp); - error2: + out_log_dealloc: + xfs_log_unmount(mp); + out_free_perag: xfs_free_perag(mp); - error1: - if (uuid_mounted) - uuid_table_remove(&mp->m_sb.sb_uuid); + out_remove_uuid: + xfs_uuid_unmount(mp); + out: return error; } @@ -1226,15 +1282,12 @@ xfs_unmountfs( */ XFS_QM_UNMOUNT(mp); - if (mp->m_rbmip) - IRELE(mp->m_rbmip); - if (mp->m_rsumip) - IRELE(mp->m_rsumip); + xfs_rtunmount_inodes(mp); IRELE(mp->m_rootip); /* * We can potentially deadlock here if we have an inode cluster - * that has been freed has it's buffer still pinned in memory because + * that has been freed has its buffer still pinned in memory because * the transaction is still sitting in a iclog. The stale inodes * on that buffer will have their flush locks held until the * transaction hits the disk and the callbacks run. the inode @@ -1266,7 +1319,7 @@ xfs_unmountfs( * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for * lazy superblock counting because it trusts the incore superblock - * counters to be aboslutely correct on clean unmount. + * counters to be absolutely correct on clean unmount. * * We don't bother correcting this elsewhere for lazy superblock * counting because on mount of an unclean filesystem we reconstruct the @@ -1288,10 +1341,9 @@ xfs_unmountfs( "Freespace may not be correct on next mount."); xfs_unmountfs_writesb(mp); xfs_unmountfs_wait(mp); /* wait for async bufs */ - xfs_log_unmount(mp); /* Done! No more fs ops. */ - - if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) - uuid_table_remove(&mp->m_sb.sb_uuid); + xfs_log_unmount_write(mp); + xfs_log_unmount(mp); + xfs_uuid_unmount(mp); #if defined(DEBUG) xfs_errortag_clearall(mp, 0); @@ -1793,29 +1845,6 @@ xfs_freesb( } /* - * See if the UUID is unique among mounted XFS filesystems. - * Mount fails if UUID is nil or a FS with the same UUID is already mounted. - */ -STATIC int -xfs_uuid_mount( - xfs_mount_t *mp) -{ - if (uuid_is_nil(&mp->m_sb.sb_uuid)) { - cmn_err(CE_WARN, - "XFS: Filesystem %s has nil UUID - can't mount", - mp->m_fsname); - return -1; - } - if (!uuid_table_insert(&mp->m_sb.sb_uuid)) { - cmn_err(CE_WARN, - "XFS: Filesystem %s has duplicate UUID - can't mount", - mp->m_fsname); - return -1; - } - return 0; -} - -/* * Used to log changes to the superblock unit and width fields which could * be altered by the mount options, as well as any potential sb_features2 * fixup. Only the first superblock is updated. @@ -1868,7 +1897,7 @@ xfs_mount_log_sb( * we disable the per-cpu counter and go through the slow path. * * The slow path is the current xfs_mod_incore_sb() function. This means that - * when we disable a per-cpu counter, we need to drain it's resources back to + * when we disable a per-cpu counter, we need to drain its resources back to * the global superblock. We do this after disabling the counter to prevent * more threads from queueing up on the counter. * diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f5e9937f9bd..7af44adffc8 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -136,7 +136,6 @@ typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot *, struct xfs_dquot *, uint); typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *); typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags); -typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t); typedef struct xfs_qmops { xfs_qminit_t xfs_qminit; @@ -154,7 +153,6 @@ typedef struct xfs_qmops { xfs_dqvopchownresv_t xfs_dqvopchownresv; xfs_dqstatvfs_t xfs_dqstatvfs; xfs_dqsync_t xfs_dqsync; - xfs_quotactl_t xfs_quotactl; struct xfs_dqtrxops *xfs_dqtrxops; } xfs_qmops_t; @@ -188,8 +186,6 @@ typedef struct xfs_qmops { (*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp) #define XFS_QM_DQSYNC(mp, flags) \ (*(mp)->m_qm_ops->xfs_dqsync)(mp, flags) -#define XFS_QM_QUOTACTL(mp, cmd, id, addr) \ - (*(mp)->m_qm_ops->xfs_quotactl)(mp, cmd, id, addr) #ifdef HAVE_PERCPU_SB @@ -273,19 +269,17 @@ typedef struct xfs_mount { uint m_inobt_mnr[2]; /* min inobt btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ - uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ + uint m_in_maxlevels; /* max inobt btree levels. */ struct xfs_perag *m_perag; /* per-ag accounting info */ struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ __uint64_t m_flags; /* global mount flags */ - uint m_attroffset; /* inode attribute offset */ uint m_dir_node_ents; /* #entries in a dir danode */ uint m_attr_node_ents; /* #entries in attr danode */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ - int m_litino; /* size of inode union area */ int m_inoalign_mask;/* mask sb_inoalignmt if used */ uint m_qflags; /* quota status flags */ xfs_trans_reservations_t m_reservations;/* precomputed res values */ @@ -293,9 +287,6 @@ typedef struct xfs_mount { __uint64_t m_maxioffset; /* maximum inode offset */ __uint64_t m_resblks; /* total reserved blocks */ __uint64_t m_resblks_avail;/* available reserved blocks */ -#if XFS_BIG_INUMS - xfs_ino_t m_inoadd; /* add value for ino64_offset */ -#endif int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ int m_sinoalign; /* stripe unit inode alignment */ @@ -337,7 +328,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ -#define XFS_MOUNT_INO64 (1ULL << 1) #define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem @@ -389,8 +379,8 @@ typedef struct xfs_mount { * Synchronous read and write sizes. This should be * better for NFSv2 wsync filesystems. */ -#define XFS_WSYNC_READIO_LOG 15 /* 32K */ -#define XFS_WSYNC_WRITEIO_LOG 14 /* 16K */ +#define XFS_WSYNC_READIO_LOG 15 /* 32k */ +#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */ /* * Allow large block sizes to be reported to userspace programs if the @@ -500,9 +490,6 @@ typedef struct xfs_mod_sb { int64_t msb_delta; /* Change to make to specified field */ } xfs_mod_sb_t; -#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) -#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) - extern int xfs_log_sbcount(xfs_mount_t *, uint); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c index 27f80581520..e101790ea8e 100644 --- a/fs/xfs/xfs_qmops.c +++ b/fs/xfs/xfs_qmops.c @@ -126,7 +126,6 @@ static struct xfs_qmops xfs_qmcore_stub = { .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr, .xfs_dqstatvfs = (xfs_dqstatvfs_t) fs_noval, .xfs_dqsync = (xfs_dqsync_t) fs_noerr, - .xfs_quotactl = (xfs_quotactl_t) fs_nosys, }; int diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 48965ecaa15..f5d1202dde2 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -18,6 +18,8 @@ #ifndef __XFS_QUOTA_H__ #define __XFS_QUOTA_H__ +struct xfs_trans; + /* * The ondisk form of a dquot structure. */ @@ -185,7 +187,6 @@ typedef struct xfs_qoff_logformat { * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) */ -#define XFS_QMOPT_DQLOCK 0x0000001 /* dqlock */ #define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index c5bb86f3ec0..385f6dceba5 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -2288,6 +2288,16 @@ xfs_rtmount_inodes( return 0; } +void +xfs_rtunmount_inodes( + struct xfs_mount *mp) +{ + if (mp->m_rbmip) + IRELE(mp->m_rbmip); + if (mp->m_rsumip) + IRELE(mp->m_rsumip); +} + /* * Pick an extent for allocation at the start of a new realtime file. * Use the sequence number stored in the atime field of the bitmap inode. diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 8d8dcd21571..b2d67adb6a0 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -23,8 +23,8 @@ struct xfs_trans; /* Min and max rt extent sizes, specified in bytes */ #define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ -#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64KB */ -#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4KB */ +#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ +#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ /* * Constants for bit manipulations. @@ -108,6 +108,9 @@ xfs_rtfree_extent( int /* error */ xfs_rtmount_init( struct xfs_mount *mp); /* file system mount structure */ +void +xfs_rtunmount_inodes( + struct xfs_mount *mp); /* * Get the bitmap and summary inodes into the mount structure @@ -146,6 +149,7 @@ xfs_growfs_rt( # define xfs_growfs_rt(mp,in) (ENOSYS) # define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtunmount_inodes(m) #endif /* CONFIG_XFS_RT */ #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index d6fe4a88d79..775249a54f6 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -292,7 +292,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) * In a write transaction we can allocate a maximum of 2 * extents. This gives: * the inode getting the new extents: inode size - * the inode\'s bmap btree: max depth * block size + * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size @@ -321,7 +321,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) /* * In truncating a file we free up to two extents at once. We can modify: * the inode being truncated: inode size - * the inode\'s bmap btree: (max depth + 1) * block size + * the inode's bmap btree: (max depth + 1) * block size * And the bmap_finish transaction can free the blocks and bmap blocks: * the agf for each of the ags: 4 * sector size * the agfl for each of the ags: 4 * sector size @@ -343,7 +343,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \ (128 * 5) + \ XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) #define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) @@ -431,8 +431,8 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) * the new inode: inode size * the inode btree entry: 1 block * the directory btree: (max depth + v2) * dir block size - * the directory inode\'s bmap btree: (max depth + v2) * block size - * the blocks for the symlink: 1 KB + * the directory inode's bmap btree: (max depth + v2) * block size + * the blocks for the symlink: 1 kB * Or in the first xact we allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize @@ -449,9 +449,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \ (2 * (mp)->m_sb.sb_sectsize + \ XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ - XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ + XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \ XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) #define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) @@ -463,7 +463,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) * the inode btree entry: block size * the superblock for the nlink flag: sector size * the directory btree: (max depth + v2) * dir block size - * the directory inode\'s bmap btree: (max depth + v2) * block size + * the directory inode's bmap btree: (max depth + v2) * block size * Or in the first xact we allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize * the superblock for the nlink flag: sector size @@ -481,9 +481,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \ (3 * (mp)->m_sb.sb_sectsize + \ XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ - XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ + XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \ XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) #define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) @@ -513,7 +513,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \ (128 * 5) + \ XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) @@ -637,7 +637,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) /* * Removing the attribute fork of a file * the inode being truncated: inode size - * the inode\'s bmap btree: max depth * block size + * the inode's bmap btree: max depth * block size * And the bmap_finish transaction can free the blocks and bmap blocks: * the agf for each of the ags: 4 * sector size * the agfl for each of the ags: 4 * sector size diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 2d47f10f8be..f31271c30de 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -79,7 +79,7 @@ xfs_trans_ail_tail( * the push is run asynchronously in a separate thread, so we return the tail * of the log right now instead of the tail after the push. This means we will * either continue right away, or we will sleep waiting on the async thread to - * do it's work. + * do its work. * * We do this unlocked - we only need to know whether there is anything in the * AIL at the time we are called. We don't need to access the contents of @@ -160,7 +160,7 @@ xfs_trans_ail_cursor_next( /* * Now that the traversal is complete, we need to remove the cursor * from the list of traversing cursors. Avoid removing the embedded - * push cursor, but use the fact it is alway present to make the + * push cursor, but use the fact it is always present to make the * list deletion simple. */ void diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c index e110bf57d7f..eb3fc57f9ee 100644 --- a/fs/xfs/xfs_trans_item.c +++ b/fs/xfs/xfs_trans_item.c @@ -22,7 +22,7 @@ #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" -/* XXX: from here down needed until struct xfs_trans has it's own ailp */ +/* XXX: from here down needed until struct xfs_trans has its own ailp */ #include "xfs_bit.h" #include "xfs_buf_item.h" #include "xfs_sb.h" diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h index 4ea2e5074bd..7d2c920dfb9 100644 --- a/fs/xfs/xfs_trans_space.h +++ b/fs/xfs/xfs_trans_space.h @@ -47,7 +47,7 @@ #define XFS_DIRREMOVE_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ - (XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1) + (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1) /* * Space reservation values for various transactions. diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index b2f724502f1..d725428c9df 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -21,14 +21,6 @@ #ifdef __KERNEL__ /* - * POSIX Extensions - */ -typedef unsigned char uchar_t; -typedef unsigned short ushort_t; -typedef unsigned int uint_t; -typedef unsigned long ulong_t; - -/* * Additional type declarations for XFS */ typedef signed char __int8_t; diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index fcc2285d03e..79b9e5ea535 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -374,7 +374,7 @@ xfs_truncate_file( /* * Follow the normal truncate locking protocol. Since we - * hold the inode in the transaction, we know that it's number + * hold the inode in the transaction, we know that its number * of references will stay constant. */ xfs_ilock(ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 0e55c5d7db5..7394c7af5de 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1136,7 +1136,7 @@ xfs_inactive( * If the inode is already free, then there can be nothing * to clean up here. */ - if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) { + if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) { ASSERT(ip->i_df.if_real_bytes == 0); ASSERT(ip->i_df.if_broot_bytes == 0); return VN_INACTIVE_CACHE; @@ -1387,23 +1387,28 @@ xfs_create( xfs_inode_t **ipp, cred_t *credp) { - xfs_mount_t *mp = dp->i_mount; - xfs_inode_t *ip; - xfs_trans_t *tp; + int is_dir = S_ISDIR(mode); + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; boolean_t unlock_dp_on_error = B_FALSE; - int dm_event_sent = 0; uint cancel_flags; int committed; xfs_prid_t prid; - struct xfs_dquot *udqp, *gdqp; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; uint resblks; + uint log_res; + uint log_count; - ASSERT(!*ipp); xfs_itrace_entry(dp); + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, dp, DM_RIGHT_NULL, NULL, @@ -1412,84 +1417,97 @@ xfs_create( if (error) return error; - dm_event_sent = 1; } - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - /* Return through std_return after this point. */ - - udqp = gdqp = NULL; if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) prid = dp->i_d.di_projid; else - prid = (xfs_prid_t)dfltprid; + prid = dfltprid; /* * Make sure that we have allocated dquot(s) on disk. */ error = XFS_QM_DQVOPALLOC(mp, dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); if (error) goto std_return; - ip = NULL; + if (is_dir) { + rdev = 0; + resblks = XFS_MKDIR_SPACE_RES(mp, name->len); + log_res = XFS_MKDIR_LOG_RES(mp); + log_count = XFS_MKDIR_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); + } else { + resblks = XFS_CREATE_SPACE_RES(mp, name->len); + log_res = XFS_CREATE_LOG_RES(mp); + log_count = XFS_CREATE_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); + } - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_CREATE_SPACE_RES(mp, name->len); + /* * Initially assume that the file does not exist and * reserve the resources for that case. If that is not * the case we'll drop the one we have and get a more * appropriate transaction later. */ - error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); + error = xfs_trans_reserve(tp, resblks, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); if (error == ENOSPC) { resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); } if (error) { cancel_flags = 0; - goto error_return; + goto out_trans_cancel; } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = B_TRUE; - xfs_bmap_init(&free_list, &first_block); + /* + * Check for directory link count overflow. + */ + if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) { + error = XFS_ERROR(EMLINK); + goto out_trans_cancel; + } - ASSERT(ip == NULL); + xfs_bmap_init(&free_list, &first_block); /* * Reserve disk quota and the inode. */ error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); if (error) - goto error_return; + goto out_trans_cancel; error = xfs_dir_canenter(tp, dp, name, resblks); if (error) - goto error_return; - error = xfs_dir_ialloc(&tp, dp, mode, 1, - rdev, credp, prid, resblks > 0, - &ip, &committed); + goto out_trans_cancel; + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp, + prid, resblks > 0, &ip, &committed); if (error) { if (error == ENOSPC) - goto error_return; - goto abort_return; + goto out_trans_cancel; + goto out_trans_abort; } - xfs_itrace_ref(ip); /* * At this point, we've gotten a newly allocated inode. * It is locked (and joined to the transaction). */ - + xfs_itrace_ref(ip); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* @@ -1508,19 +1526,28 @@ xfs_create( resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { ASSERT(error != ENOSPC); - goto abort_return; + goto out_trans_abort; } xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + if (is_dir) { + error = xfs_dir_init(tp, ip, dp); + if (error) + goto out_bmap_cancel; + + error = xfs_bumplink(tp, dp); + if (error) + goto out_bmap_cancel; + } + /* * If this is a synchronous mount, make sure that the * create transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - } /* * Attach the dquot(s) to the inodes and modify them incore. @@ -1537,16 +1564,13 @@ xfs_create( IHOLD(ip); error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - goto abort_rele; - } + if (error) + goto out_abort_rele; error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) { IRELE(ip); - tp = NULL; - goto error_return; + goto out_dqrele; } XFS_QM_DQRELE(mp, udqp); @@ -1555,26 +1579,22 @@ xfs_create( *ipp = ip; /* Fallthrough to std_return with error = 0 */ - -std_return: - if ((*ipp || (error != 0 && dm_event_sent != 0)) && - DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, - dp, DM_RIGHT_NULL, - *ipp ? ip : NULL, - DM_RIGHT_NULL, name->name, NULL, - mode, error, 0); + std_return: + if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { + XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL, + ip, DM_RIGHT_NULL, name->name, NULL, mode, + error, 0); } + return error; - abort_return: + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_abort: cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ - - error_return: - if (tp != NULL) - xfs_trans_cancel(tp, cancel_flags); - + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_dqrele: XFS_QM_DQRELE(mp, udqp); XFS_QM_DQRELE(mp, gdqp); @@ -1583,20 +1603,18 @@ std_return: goto std_return; - abort_rele: + out_abort_rele: /* * Wait until after the current transaction is aborted to * release the inode. This prevents recursive transactions * and deadlocks from xfs_inactive. */ + xfs_bmap_cancel(&free_list); cancel_flags |= XFS_TRANS_ABORT; xfs_trans_cancel(tp, cancel_flags); IRELE(ip); - - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - - goto std_return; + unlock_dp_on_error = B_FALSE; + goto out_dqrele; } #ifdef DEBUG @@ -2004,8 +2022,10 @@ xfs_link( /* Return through std_return after this point. */ error = XFS_QM_DQATTACH(mp, sip, 0); - if (!error && sip != tdp) - error = XFS_QM_DQATTACH(mp, tdp, 0); + if (error) + goto std_return; + + error = XFS_QM_DQATTACH(mp, tdp, 0); if (error) goto std_return; @@ -2110,209 +2130,6 @@ std_return: goto std_return; } - -int -xfs_mkdir( - xfs_inode_t *dp, - struct xfs_name *dir_name, - mode_t mode, - xfs_inode_t **ipp, - cred_t *credp) -{ - xfs_mount_t *mp = dp->i_mount; - xfs_inode_t *cdp; /* inode of created dir */ - xfs_trans_t *tp; - int cancel_flags; - int error; - int committed; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - boolean_t unlock_dp_on_error = B_FALSE; - boolean_t created = B_FALSE; - int dm_event_sent = 0; - xfs_prid_t prid; - struct xfs_dquot *udqp, *gdqp; - uint resblks; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - tp = NULL; - - if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, - dp, DM_RIGHT_NULL, NULL, - DM_RIGHT_NULL, dir_name->name, NULL, - mode, 0, 0); - if (error) - return error; - dm_event_sent = 1; - } - - /* Return through std_return after this point. */ - - xfs_itrace_entry(dp); - - mp = dp->i_mount; - udqp = gdqp = NULL; - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = dp->i_d.di_projid; - else - prid = (xfs_prid_t)dfltprid; - - /* - * Make sure that we have allocated dquot(s) on disk. - */ - error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); - if (error) - goto std_return; - - tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len); - error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT); - if (error == ENOSPC) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_MKDIR_LOG_COUNT); - } - if (error) { - cancel_flags = 0; - goto error_return; - } - - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = B_TRUE; - - /* - * Check for directory link count overflow. - */ - if (dp->i_d.di_nlink >= XFS_MAXLINK) { - error = XFS_ERROR(EMLINK); - goto error_return; - } - - /* - * Reserve disk quota and the inode. - */ - error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); - if (error) - goto error_return; - - error = xfs_dir_canenter(tp, dp, dir_name, resblks); - if (error) - goto error_return; - /* - * create the directory inode. - */ - error = xfs_dir_ialloc(&tp, dp, mode, 2, - 0, credp, prid, resblks > 0, - &cdp, NULL); - if (error) { - if (error == ENOSPC) - goto error_return; - goto abort_return; - } - xfs_itrace_ref(cdp); - - /* - * Now we add the directory inode to the transaction. - * We waited until now since xfs_dir_ialloc might start - * a new transaction. Had we joined the transaction - * earlier, the locks might have gotten released. An error - * from here on will result in the transaction cancel - * unlocking dp so don't do it explicitly in the error path. - */ - IHOLD(dp); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = B_FALSE; - - xfs_bmap_init(&free_list, &first_block); - - error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino, - &first_block, &free_list, resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); - if (error) { - ASSERT(error != ENOSPC); - goto error1; - } - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - error = xfs_dir_init(tp, cdp, dp); - if (error) - goto error2; - - error = xfs_bumplink(tp, dp); - if (error) - goto error2; - - created = B_TRUE; - - *ipp = cdp; - IHOLD(cdp); - - /* - * Attach the dquots to the new inode and modify the icount incore. - */ - XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp); - - /* - * If this is a synchronous mount, make sure that the - * mkdir transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - IRELE(cdp); - goto error2; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - if (error) { - IRELE(cdp); - } - - /* Fall through to std_return with error = 0 or errno from - * xfs_trans_commit. */ - -std_return: - if ((created || (error != 0 && dm_event_sent != 0)) && - DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, - dp, DM_RIGHT_NULL, - created ? cdp : NULL, - DM_RIGHT_NULL, - dir_name->name, NULL, - mode, error, 0); - } - return error; - - error2: - error1: - xfs_bmap_cancel(&free_list); - abort_return: - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_trans_cancel(tp, cancel_flags); - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - - if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); - - goto std_return; -} - int xfs_symlink( xfs_inode_t *dp, @@ -2587,51 +2404,6 @@ std_return: } int -xfs_inode_flush( - xfs_inode_t *ip, - int flags) -{ - xfs_mount_t *mp = ip->i_mount; - int error = 0; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - /* - * Bypass inodes which have already been cleaned by - * the inode flush clustering code inside xfs_iflush - */ - if (xfs_inode_clean(ip)) - return 0; - - /* - * We make this non-blocking if the inode is contended, - * return EAGAIN to indicate to the caller that they - * did not succeed. This prevents the flush path from - * blocking on inodes inside another operation right - * now, they get caught later by xfs_sync. - */ - if (flags & FLUSH_SYNC) { - xfs_ilock(ip, XFS_ILOCK_SHARED); - xfs_iflock(ip); - } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return EAGAIN; - } - } else { - return EAGAIN; - } - - error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC - : XFS_IFLUSH_ASYNC_NOBLOCK); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - return error; -} - - -int xfs_set_dmattrs( xfs_inode_t *ip, u_int evmask, @@ -2676,7 +2448,7 @@ xfs_reclaim( ASSERT(!VN_MAPPED(VFS_I(ip))); /* bad inode, get out here ASAP */ - if (VN_BAD(VFS_I(ip))) { + if (is_bad_inode(VFS_I(ip))) { xfs_ireclaim(ip); return 0; } @@ -3090,7 +2862,7 @@ xfs_free_file_space( /* * Need to zero the stuff we're not freeing, on disk. - * If its a realtime file & can't use unwritten extents then we + * If it's a realtime file & can't use unwritten extents then we * actually need to zero the extent edges. Otherwise xfs_bunmapi * will take care of it for us. */ diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 76df328c61b..04373c6c61f 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -31,14 +31,11 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); -int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name, - mode_t mode, struct xfs_inode **ipp, cred_t *credp); int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, mode_t mode, struct xfs_inode **ipp, cred_t *credp); -int xfs_inode_flush(struct xfs_inode *ip, int flags); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); int xfs_reclaim(struct xfs_inode *ip); int xfs_change_file_space(struct xfs_inode *ip, int cmd, |