diff options
36 files changed, 1042 insertions, 966 deletions
diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs-rdma.txt index d0ec45ae4e7..44bd766f2e5 100644 --- a/Documentation/filesystems/nfs-rdma.txt +++ b/Documentation/filesystems/nfs-rdma.txt @@ -5,7 +5,7 @@ ################################################################################ Author: NetApp and Open Grid Computing - Date: April 15, 2008 + Date: May 29, 2008 Table of Contents ~~~~~~~~~~~~~~~~~ @@ -60,16 +60,18 @@ Installation The procedures described in this document have been tested with distributions from Red Hat's Fedora Project (http://fedora.redhat.com/). - - Install nfs-utils-1.1.1 or greater on the client + - Install nfs-utils-1.1.2 or greater on the client - An NFS/RDMA mount point can only be obtained by using the mount.nfs - command in nfs-utils-1.1.1 or greater. To see which version of mount.nfs - you are using, type: + An NFS/RDMA mount point can be obtained by using the mount.nfs command in + nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils + version with support for NFS/RDMA mounts, but for various reasons we + recommend using nfs-utils-1.1.2 or greater). To see which version of + mount.nfs you are using, type: - > /sbin/mount.nfs -V + $ /sbin/mount.nfs -V - If the version is less than 1.1.1 or the command does not exist, - then you will need to install the latest version of nfs-utils. + If the version is less than 1.1.2 or the command does not exist, + you should install the latest version of nfs-utils. Download the latest package from: @@ -77,22 +79,33 @@ Installation Uncompress the package and follow the installation instructions. - If you will not be using GSS and NFSv4, the installation process - can be simplified by disabling these features when running configure: + If you will not need the idmapper and gssd executables (you do not need + these to create an NFS/RDMA enabled mount command), the installation + process can be simplified by disabling these features when running + configure: - > ./configure --disable-gss --disable-nfsv4 + $ ./configure --disable-gss --disable-nfsv4 - For more information on this see the package's README and INSTALL files. + To build nfs-utils you will need the tcp_wrappers package installed. For + more information on this see the package's README and INSTALL files. After building the nfs-utils package, there will be a mount.nfs binary in the utils/mount directory. This binary can be used to initiate NFS v2, v3, - or v4 mounts. To initiate a v4 mount, the binary must be called mount.nfs4. - The standard technique is to create a symlink called mount.nfs4 to mount.nfs. + or v4 mounts. To initiate a v4 mount, the binary must be called + mount.nfs4. The standard technique is to create a symlink called + mount.nfs4 to mount.nfs. - NOTE: mount.nfs and therefore nfs-utils-1.1.1 or greater is only needed + This mount.nfs binary should be installed at /sbin/mount.nfs as follows: + + $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs + + In this location, mount.nfs will be invoked automatically for NFS mounts + by the system mount commmand. + + NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed on the NFS client machine. You do not need this specific version of nfs-utils on the server. Furthermore, only the mount.nfs command from - nfs-utils-1.1.1 is needed on the client. + nfs-utils-1.1.2 is needed on the client. - Install a Linux kernel with NFS/RDMA @@ -156,8 +169,8 @@ Check RDMA and NFS Setup this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel card: - > modprobe ib_mthca - > modprobe ib_ipoib + $ modprobe ib_mthca + $ modprobe ib_ipoib If you are using InfiniBand, make sure there is a Subnet Manager (SM) running on the network. If your IB switch has an embedded SM, you can @@ -166,7 +179,7 @@ Check RDMA and NFS Setup If an SM is running on your network, you should see the following: - > cat /sys/class/infiniband/driverX/ports/1/state + $ cat /sys/class/infiniband/driverX/ports/1/state 4: ACTIVE where driverX is mthca0, ipath5, ehca3, etc. @@ -174,10 +187,10 @@ Check RDMA and NFS Setup To further test the InfiniBand software stack, use IPoIB (this assumes you have two IB hosts named host1 and host2): - host1> ifconfig ib0 a.b.c.x - host2> ifconfig ib0 a.b.c.y - host1> ping a.b.c.y - host2> ping a.b.c.x + host1$ ifconfig ib0 a.b.c.x + host2$ ifconfig ib0 a.b.c.y + host1$ ping a.b.c.y + host2$ ping a.b.c.x For other device types, follow the appropriate procedures. @@ -202,11 +215,11 @@ NFS/RDMA Setup /vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash) /vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash) - The IP address(es) is(are) the client's IPoIB address for an InfiniBand HCA or the - cleint's iWARP address(es) for an RNIC. + The IP address(es) is(are) the client's IPoIB address for an InfiniBand + HCA or the cleint's iWARP address(es) for an RNIC. - NOTE: The "insecure" option must be used because the NFS/RDMA client does not - use a reserved port. + NOTE: The "insecure" option must be used because the NFS/RDMA client does + not use a reserved port. Each time a machine boots: @@ -214,43 +227,45 @@ NFS/RDMA Setup For InfiniBand using a Mellanox adapter: - > modprobe ib_mthca - > modprobe ib_ipoib - > ifconfig ib0 a.b.c.d + $ modprobe ib_mthca + $ modprobe ib_ipoib + $ ifconfig ib0 a.b.c.d NOTE: use unique addresses for the client and server - Start the NFS server - If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in kernel config), - load the RDMA transport module: + If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in + kernel config), load the RDMA transport module: - > modprobe svcrdma + $ modprobe svcrdma - Regardless of how the server was built (module or built-in), start the server: + Regardless of how the server was built (module or built-in), start the + server: - > /etc/init.d/nfs start + $ /etc/init.d/nfs start or - > service nfs start + $ service nfs start Instruct the server to listen on the RDMA transport: - > echo rdma 2050 > /proc/fs/nfsd/portlist + $ echo rdma 2050 > /proc/fs/nfsd/portlist - On the client system - If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in kernel config), - load the RDMA client module: + If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in + kernel config), load the RDMA client module: - > modprobe xprtrdma.ko + $ modprobe xprtrdma.ko - Regardless of how the client was built (module or built-in), issue the mount.nfs command: + Regardless of how the client was built (module or built-in), use this + command to mount the NFS/RDMA server: - > /path/to/your/mount.nfs <IPoIB-server-name-or-address>:/<export> /mnt -i -o rdma,port=2050 + $ mount -o rdma,port=2050 <IPoIB-server-name-or-address>:/<export> /mnt - To verify that the mount is using RDMA, run "cat /proc/mounts" and check the - "proto" field for the given mount. + To verify that the mount is using RDMA, run "cat /proc/mounts" and check + the "proto" field for the given mount. Congratulations! You're using NFS/RDMA! diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2169af4d545..5bd9bf0fa9d 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -50,7 +50,7 @@ EXPORT_SYMBOL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct task_struct *nlmsvc_task; -static struct svc_serv *nlmsvc_serv; +static struct svc_rqst *nlmsvc_rqst; int nlmsvc_grace_period; unsigned long nlmsvc_timeout; @@ -194,20 +194,11 @@ lockd(void *vrqstp) svc_process(rqstp); } - flush_signals(current); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); - unlock_kernel(); - - nlmsvc_task = NULL; - nlmsvc_serv = NULL; - - /* Exit the RPC thread */ - svc_exit_thread(rqstp); - return 0; } @@ -254,16 +245,15 @@ int lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ { struct svc_serv *serv; - struct svc_rqst *rqstp; int error = 0; mutex_lock(&nlmsvc_mutex); /* * Check whether we're already up and running. */ - if (nlmsvc_serv) { + if (nlmsvc_rqst) { if (proto) - error = make_socks(nlmsvc_serv, proto); + error = make_socks(nlmsvc_rqst->rq_server, proto); goto out; } @@ -288,9 +278,10 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ /* * Create the kernel thread and wait for it to start. */ - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); - if (IS_ERR(rqstp)) { - error = PTR_ERR(rqstp); + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(nlmsvc_rqst)) { + error = PTR_ERR(nlmsvc_rqst); + nlmsvc_rqst = NULL; printk(KERN_WARNING "lockd_up: svc_rqst allocation failed, error=%d\n", error); @@ -298,16 +289,15 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ } svc_sock_update_bufs(serv); - nlmsvc_serv = rqstp->rq_server; - nlmsvc_task = kthread_run(lockd, rqstp, serv->sv_name); + nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); if (IS_ERR(nlmsvc_task)) { error = PTR_ERR(nlmsvc_task); + svc_exit_thread(nlmsvc_rqst); nlmsvc_task = NULL; - nlmsvc_serv = NULL; + nlmsvc_rqst = NULL; printk(KERN_WARNING "lockd_up: kthread_run failed, error=%d\n", error); - svc_exit_thread(rqstp); goto destroy_and_out; } @@ -346,6 +336,9 @@ lockd_down(void) BUG(); } kthread_stop(nlmsvc_task); + svc_exit_thread(nlmsvc_rqst); + nlmsvc_task = NULL; + nlmsvc_rqst = NULL; out: mutex_unlock(&nlmsvc_mutex); } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 2e27176ff42..39944463933 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -58,8 +58,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return 0; no_locks: - if (host) - nlm_release_host(host); + nlm_release_host(host); if (error) return error; return nlm_lck_denied_nolocks; @@ -100,7 +99,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Now check for conflicting locks */ - resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie); + resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -146,7 +145,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, #endif /* Now try to lock the file */ - resp->status = nlmsvc_lock(rqstp, file, &argp->lock, + resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 56a08ab9a4c..821b9acdfb6 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -129,9 +129,9 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b) { - if(a->len != b->len) + if (a->len != b->len) return 0; - if(memcmp(a->data,b->data,a->len)) + if (memcmp(a->data, b->data, a->len)) return 0; return 1; } @@ -180,6 +180,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, struct nlm_block *block; struct nlm_rqst *call = NULL; + nlm_get_host(host); call = nlm_alloc_call(host); if (call == NULL) return NULL; @@ -358,10 +359,10 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) */ __be32 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, - struct nlm_lock *lock, int wait, struct nlm_cookie *cookie) + struct nlm_host *host, struct nlm_lock *lock, int wait, + struct nlm_cookie *cookie) { struct nlm_block *block = NULL; - struct nlm_host *host; int error; __be32 ret; @@ -373,11 +374,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - /* Create host handle for callback */ - host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len); - if (host == NULL) - return nlm_lck_denied_nolocks; - /* Lock file against concurrent access */ mutex_lock(&file->f_mutex); /* Get existing block (in case client is busy-waiting) @@ -385,8 +381,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, */ block = nlmsvc_lookup_block(file, lock); if (block == NULL) { - block = nlmsvc_create_block(rqstp, nlm_get_host(host), file, - lock, cookie); + block = nlmsvc_create_block(rqstp, host, file, lock, cookie); ret = nlm_lck_denied_nolocks; if (block == NULL) goto out; @@ -417,7 +412,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, lock->fl.fl_flags &= ~FL_SLEEP; dprintk("lockd: vfs_lock_file returned %d\n", error); - switch(error) { + switch (error) { case 0: ret = nlm_granted; goto out; @@ -450,7 +445,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, out: mutex_unlock(&file->f_mutex); nlmsvc_release_block(block); - nlm_release_host(host); dprintk("lockd: nlmsvc_lock returned %u\n", ret); return ret; } @@ -460,8 +454,8 @@ out: */ __be32 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, - struct nlm_lock *lock, struct nlm_lock *conflock, - struct nlm_cookie *cookie) + struct nlm_host *host, struct nlm_lock *lock, + struct nlm_lock *conflock, struct nlm_cookie *cookie) { struct nlm_block *block = NULL; int error; @@ -479,16 +473,9 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, if (block == NULL) { struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - struct nlm_host *host; if (conf == NULL) return nlm_granted; - /* Create host handle for callback */ - host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len); - if (host == NULL) { - kfree(conf); - return nlm_lck_denied_nolocks; - } block = nlmsvc_create_block(rqstp, host, file, lock, cookie); if (block == NULL) { kfree(conf); @@ -897,7 +884,7 @@ nlmsvc_retry_blocked(void) if (block->b_when == NLM_NEVER) break; - if (time_after(block->b_when,jiffies)) { + if (time_after(block->b_when, jiffies)) { timeout = block->b_when - jiffies; break; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index ce6952b50a7..76019d2ff72 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -87,8 +87,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return 0; no_locks: - if (host) - nlm_release_host(host); + nlm_release_host(host); if (error) return error; return nlm_lck_denied_nolocks; @@ -129,7 +128,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Now check for conflicting locks */ - resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie)); + resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie)); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -176,7 +175,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, #endif /* Now try to lock the file */ - resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock, + resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie)); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index d1c48b539df..198b4e55b37 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -373,13 +373,16 @@ nlmsvc_free_host_resources(struct nlm_host *host) } } -/* - * Remove all locks held for clients +/** + * nlmsvc_invalidate_all - remove all locks held for clients + * + * Release all locks held by NFS clients. + * */ void nlmsvc_invalidate_all(void) { - /* Release all locks held by NFS clients. + /* * Previously, the code would call * nlmsvc_free_host_resources for each client in * turn, which is about as inefficient as it gets. @@ -396,6 +399,12 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file) return sb == file->f_file->f_path.mnt->mnt_sb; } +/** + * nlmsvc_unlock_all_by_sb - release locks held on this file system + * @sb: super block + * + * Release all locks held by clients accessing this file system. + */ int nlmsvc_unlock_all_by_sb(struct super_block *sb) { @@ -409,17 +418,22 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); static int nlmsvc_match_ip(void *datap, struct nlm_host *host) { - __be32 *server_addr = datap; - - return host->h_saddr.sin_addr.s_addr == *server_addr; + return nlm_cmp_addr(&host->h_saddr, datap); } +/** + * nlmsvc_unlock_all_by_ip - release local locks by IP address + * @server_addr: server's IP address as seen by clients + * + * Release all locks held by clients accessing this host + * via the passed in IP address. + */ int -nlmsvc_unlock_all_by_ip(__be32 server_addr) +nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr) { int ret; - ret = nlm_traverse_files(&server_addr, nlmsvc_match_ip, NULL); - return ret ? -EIO : 0; + ret = nlm_traverse_files(server_addr, nlmsvc_match_ip, NULL); + return ret ? -EIO : 0; } EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_ip); diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 9e4a568a501..6b6225ac492 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -35,7 +35,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) fh.fh_export = NULL; exp_readlock(); - nfserr = nfsd_open(rqstp, &fh, S_IFREG, MAY_LOCK, filp); + nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); fh_put(&fh); rqstp->rq_client = NULL; exp_readunlock(); diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 1c3b7654e96..4e3219e8411 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -40,7 +40,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); - if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP))) + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) RETURN_STATUS(nfserr); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) @@ -107,7 +108,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); if (!nfserr) { nfserr = nfserrno( nfsd_set_posix_acl( @@ -134,7 +135,7 @@ static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp, dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - return fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); } /* diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index b647f2f872d..9981dbb377a 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -36,7 +36,8 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); - if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP))) + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) RETURN_STATUS(nfserr); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) @@ -101,7 +102,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); if (!nfserr) { nfserr = nfserrno( nfsd_set_posix_acl( diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index c721a1e6e9d..4d617ea28cf 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -63,7 +63,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); if (nfserr) RETURN_STATUS(nfserr); @@ -242,7 +242,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, attr = &argp->attrs; /* Get the directory inode */ - nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_CREATE); + nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE); if (nfserr) RETURN_STATUS(nfserr); @@ -558,7 +558,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; - nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); /* Check special features of the file system. May request * different read/write sizes for file systems known to have @@ -597,7 +597,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->p_case_insensitive = 0; resp->p_case_preserving = 1; - nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); if (nfserr == 0) { struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index c309c881bd4..eef1629806f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -71,11 +71,11 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs return nfserr_inval; if (open->op_share_access & NFS4_SHARE_ACCESS_READ) - accmode |= MAY_READ; + accmode |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - accmode |= (MAY_WRITE | MAY_TRUNC); + accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC); if (open->op_share_deny & NFS4_SHARE_DENY_WRITE) - accmode |= MAY_WRITE; + accmode |= NFSD_MAY_WRITE; status = fh_verify(rqstp, current_fh, S_IFREG, accmode); @@ -126,7 +126,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size); if (!created) - status = do_open_permission(rqstp, current_fh, open, MAY_NOP); + status = do_open_permission(rqstp, current_fh, open, + NFSD_MAY_NOP); out: fh_put(&resfh); @@ -157,7 +158,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && (open->op_iattr.ia_size == 0); - status = do_open_permission(rqstp, current_fh, open, MAY_OWNER_OVERRIDE); + status = do_open_permission(rqstp, current_fh, open, + NFSD_MAY_OWNER_OVERRIDE); return status; } @@ -186,7 +188,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, rp->rp_openfh_len); - status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) dprintk("nfsd4_open: replay failed" " restoring previous filehandle\n"); @@ -285,7 +287,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, putfh->pf_fhlen); - return fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); } static __be32 @@ -363,7 +365,8 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fh_init(&resfh, NFS4_FHSIZE); - status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, MAY_CREATE); + status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, + NFSD_MAY_CREATE); if (status == nfserr_symlink) status = nfserr_notdir; if (status) @@ -445,7 +448,7 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) return status; @@ -730,7 +733,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int count; __be32 status; - status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) return status; @@ -843,10 +846,13 @@ struct nfsd4_operation { #define ALLOWED_WITHOUT_FH 1 /* GETATTR and ops not listed as returning NFS4ERR_MOVED: */ #define ALLOWED_ON_ABSENT_FS 2 + char *op_name; }; static struct nfsd4_operation nfsd4_ops[]; +static inline char *nfsd4_op_name(unsigned opnum); + /* * COMPOUND call. */ @@ -888,7 +894,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; - dprintk("nfsv4 compound op #%d: %d\n", resp->opcnt, op->opnum); + dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", + resp->opcnt, args->opcnt, op->opnum, + nfsd4_op_name(op->opnum)); /* * The XDR decode routines may have pre-set op->status; @@ -952,126 +960,170 @@ encode_op: out: nfsd4_release_compoundargs(args); cstate_free(cstate); + dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, + .op_name = "OP_ACCESS", }, [OP_CLOSE] = { .op_func = (nfsd4op_func)nfsd4_close, + .op_name = "OP_CLOSE", }, [OP_COMMIT] = { .op_func = (nfsd4op_func)nfsd4_commit, + .op_name = "OP_COMMIT", }, [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, + .op_name = "OP_CREATE", }, [OP_DELEGRETURN] = { .op_func = (nfsd4op_func)nfsd4_delegreturn, + .op_name = "OP_DELEGRETURN", }, [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, .op_flags = ALLOWED_ON_ABSENT_FS, + .op_name = "OP_GETATTR", }, [OP_GETFH] = { .op_func = (nfsd4op_func)nfsd4_getfh, + .op_name = "OP_GETFH", }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, + .op_name = "OP_LINK", }, [OP_LOCK] = { .op_func = (nfsd4op_func)nfsd4_lock, + .op_name = "OP_LOCK", }, [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, + .op_name = "OP_LOCKT", }, [OP_LOCKU] = { .op_func = (nfsd4op_func)nfsd4_locku, + .op_name = "OP_LOCKU", }, [OP_LOOKUP] = { .op_func = (nfsd4op_func)nfsd4_lookup, + .op_name = "OP_LOOKUP", }, [OP_LOOKUPP] = { .op_func = (nfsd4op_func)nfsd4_lookupp, + .op_name = "OP_LOOKUPP", }, [OP_NVERIFY] = { .op_func = (nfsd4op_func)nfsd4_nverify, + .op_name = "OP_NVERIFY", }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, + .op_name = "OP_OPEN", }, [OP_OPEN_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_open_confirm, + .op_name = "OP_OPEN_CONFIRM", }, [OP_OPEN_DOWNGRADE] = { .op_func = (nfsd4op_func)nfsd4_open_downgrade, + .op_name = "OP_OPEN_DOWNGRADE", }, [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_PUTFH", }, [OP_PUTPUBFH] = { - /* unsupported; just for future reference: */ + /* unsupported, just for future reference: */ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_PUTPUBFH", }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_PUTROOTFH", }, [OP_READ] = { .op_func = (nfsd4op_func)nfsd4_read, + .op_name = "OP_READ", }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, + .op_name = "OP_READDIR", }, [OP_READLINK] = { .op_func = (nfsd4op_func)nfsd4_readlink, + .op_name = "OP_READLINK", }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, + .op_name = "OP_REMOVE", }, [OP_RENAME] = { + .op_name = "OP_RENAME", .op_func = (nfsd4op_func)nfsd4_rename, }, [OP_RENEW] = { .op_func = (nfsd4op_func)nfsd4_renew, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RENEW", }, [OP_RESTOREFH] = { .op_func = (nfsd4op_func)nfsd4_restorefh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RESTOREFH", }, [OP_SAVEFH] = { .op_func = (nfsd4op_func)nfsd4_savefh, + .op_name = "OP_SAVEFH", }, [OP_SECINFO] = { .op_func = (nfsd4op_func)nfsd4_secinfo, + .op_name = "OP_SECINFO", }, [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, + .op_name = "OP_SETATTR", }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_SETCLIENTID", }, [OP_SETCLIENTID_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_SETCLIENTID_CONFIRM", }, [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, + .op_name = "OP_VERIFY", }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, + .op_name = "OP_WRITE", }, [OP_RELEASE_LOCKOWNER] = { .op_func = (nfsd4op_func)nfsd4_release_lockowner, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RELEASE_LOCKOWNER", }, }; +static inline char * +nfsd4_op_name(unsigned opnum) +{ + if (opnum < ARRAY_SIZE(nfsd4_ops)) + return nfsd4_ops[opnum].op_name; + return "unknown_operation"; +} + #define nfs4svc_decode_voidargs NULL #define nfs4svc_release_void NULL #define nfsd4_voidres nfsd4_voidargs diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8799b870818..1578d7a2667 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1173,6 +1173,24 @@ static inline int deny_valid(u32 x) return x <= NFS4_SHARE_DENY_BOTH; } +/* + * We store the NONE, READ, WRITE, and BOTH bits separately in the + * st_{access,deny}_bmap field of the stateid, in order to track not + * only what share bits are currently in force, but also what + * combinations of share bits previous opens have used. This allows us + * to enforce the recommendation of rfc 3530 14.2.19 that the server + * return an error if the client attempt to downgrade to a combination + * of share bits not explicable by closing some of its previous opens. + * + * XXX: This enforcement is actually incomplete, since we don't keep + * track of access/deny bit combinations; so, e.g., we allow: + * + * OPEN allow read, deny write + * OPEN allow both, deny none + * DOWNGRADE allow read, deny none + * + * which we should reject. + */ static void set_access(unsigned int *access, unsigned long bmap) { int i; @@ -1570,6 +1588,10 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta int err = get_write_access(inode); if (err) return nfserrno(err); + err = mnt_want_write(cur_fh->fh_export->ex_path.mnt); + if (err) + return nfserrno(err); + file_take_write(filp); } status = nfsd4_truncate(rqstp, cur_fh, open); if (status) { @@ -1579,8 +1601,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta } /* remember the open */ filp->f_mode |= open->op_share_access; - set_bit(open->op_share_access, &stp->st_access_bmap); - set_bit(open->op_share_deny, &stp->st_deny_bmap); + __set_bit(open->op_share_access, &stp->st_access_bmap); + __set_bit(open->op_share_deny, &stp->st_deny_bmap); return nfs_ok; } @@ -1722,9 +1744,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf /* Stateid was not found, this is a new OPEN */ int flags = 0; if (open->op_share_access & NFS4_SHARE_ACCESS_READ) - flags |= MAY_READ; + flags |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - flags |= MAY_WRITE; + flags |= NFSD_MAY_WRITE; status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags); if (status) goto out; @@ -2610,7 +2632,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; if ((status = fh_verify(rqstp, &cstate->current_fh, - S_IFREG, MAY_LOCK))) { + S_IFREG, NFSD_MAY_LOCK))) { dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } @@ -3249,12 +3271,14 @@ nfs4_state_shutdown(void) nfs4_unlock_state(); } +/* + * user_recovery_dirname is protected by the nfsd_mutex since it's only + * accessed when nfsd is starting. + */ static void nfs4_set_recdir(char *recdir) { - nfs4_lock_state(); strcpy(user_recovery_dirname, recdir); - nfs4_unlock_state(); } /* @@ -3278,6 +3302,12 @@ nfs4_reset_recoverydir(char *recdir) return status; } +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} + /* * Called when leasetime is changed. * @@ -3286,11 +3316,12 @@ nfs4_reset_recoverydir(char *recdir) * we start to register any changes in lease time. If the administrator * really wants to change the lease time *now*, they can go ahead and bring * nfsd down and then back up again after changing the lease time. + * + * user_lease_time is protected by nfsd_mutex since it's only really accessed + * when nfsd is starting */ void nfs4_reset_lease(time_t leasetime) { - lock_kernel(); user_lease_time = leasetime; - unlock_kernel(); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c513bbdf2d3..14ba4d9b285 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -986,10 +986,74 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel } static __be32 +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +{ + return nfs_ok; +} + +static __be32 +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +{ + return nfserr_opnotsupp; +} + +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); + +static nfsd4_dec nfsd4_dec_ops[] = { + [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] = (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, + [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, +}; + +struct nfsd4_minorversion_ops { + nfsd4_dec *decoders; + int nops; +}; + +static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { + [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, +}; + +static __be32 nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { DECODE_HEAD; struct nfsd4_op *op; + struct nfsd4_minorversion_ops *ops; int i; /* @@ -1019,6 +1083,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } } + if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion)) + argp->opcnt = 0; + + ops = &nfsd4_minorversion[argp->minorversion]; for (i = 0; i < argp->opcnt; i++) { op = &argp->ops[i]; op->replay = NULL; @@ -1056,120 +1124,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } op->opnum = ntohl(*argp->p++); - switch (op->opnum) { - case 2: /* Reserved operation */ - op->opnum = OP_ILLEGAL; - if (argp->minorversion == 0) - op->status = nfserr_op_illegal; - else - op->status = nfserr_minor_vers_mismatch; - break; - case OP_ACCESS: - op->status = nfsd4_decode_access(argp, &op->u.access); - break; - case OP_CLOSE: - op->status = nfsd4_decode_close(argp, &op->u.close); - break; - case OP_COMMIT: - op->status = nfsd4_decode_commit(argp, &op->u.commit); - break; - case OP_CREATE: - op->status = nfsd4_decode_create(argp, &op->u.create); - break; - case OP_DELEGRETURN: - op->status = nfsd4_decode_delegreturn(argp, &op->u.delegreturn); - break; - case OP_GETATTR: - op->status = nfsd4_decode_getattr(argp, &op->u.getattr); - break; - case OP_GETFH: - op->status = nfs_ok; - break; - case OP_LINK: - op->status = nfsd4_decode_link(argp, &op->u.link); - break; - case OP_LOCK: - op->status = nfsd4_decode_lock(argp, &op->u.lock); - break; - case OP_LOCKT: - op->status = nfsd4_decode_lockt(argp, &op->u.lockt); - break; - case OP_LOCKU: - op->status = nfsd4_decode_locku(argp, &op->u.locku); - break; - case OP_LOOKUP: - op->status = nfsd4_decode_lookup(argp, &op->u.lookup); - break; - case OP_LOOKUPP: - op->status = nfs_ok; - break; - case OP_NVERIFY: - op->status = nfsd4_decode_verify(argp, &op->u.nverify); - break; - case OP_OPEN: - op->status = nfsd4_decode_open(argp, &op->u.open); - break; - case OP_OPEN_CONFIRM: - op->status = nfsd4_decode_open_confirm(argp, &op->u.open_confirm); - break; - case OP_OPEN_DOWNGRADE: - op->status = nfsd4_decode_open_downgrade(argp, &op->u.open_downgrade); - break; - case OP_PUTFH: - op->status = nfsd4_decode_putfh(argp, &op->u.putfh); - break; - case OP_PUTROOTFH: - op->status = nfs_ok; - break; - case OP_READ: - op->status = nfsd4_decode_read(argp, &op->u.read); - break; - case OP_READDIR: - op->status = nfsd4_decode_readdir(argp, &op->u.readdir); - break; - case OP_READLINK: - op->status = nfs_ok; - break; - case OP_REMOVE: - op->status = nfsd4_decode_remove(argp, &op->u.remove); - break; - case OP_RENAME: - op->status = nfsd4_decode_rename(argp, &op->u.rename); - break; - case OP_RESTOREFH: - op->status = nfs_ok; - break; - case OP_RENEW: - op->status = nfsd4_decode_renew(argp, &op->u.renew); - break; - case OP_SAVEFH: - op->status = nfs_ok; - break; - case OP_SECINFO: - op->status = nfsd4_decode_secinfo(argp, &op->u.secinfo); - break; - case OP_SETATTR: - op->status = nfsd4_decode_setattr(argp, &op->u.setattr); - break; - case OP_SETCLIENTID: - op->status = nfsd4_decode_setclientid(argp, &op->u.setclientid); - break; - case OP_SETCLIENTID_CONFIRM: - op->status = nfsd4_decode_setclientid_confirm(argp, &op->u.setclientid_confirm); - break; - case OP_VERIFY: - op->status = nfsd4_decode_verify(argp, &op->u.verify); - break; - case OP_WRITE: - op->status = nfsd4_decode_write(argp, &op->u.write); - break; - case OP_RELEASE_LOCKOWNER: - op->status = nfsd4_decode_release_lockowner(argp, &op->u.release_lockowner); - break; - default: + if (op->opnum >= OP_ACCESS && op->opnum < ops->nops) + op->status = ops->decoders[op->opnum](argp, &op->u); + else { op->opnum = OP_ILLEGAL; op->status = nfserr_op_illegal; - break; } if (op->status) { @@ -1201,11 +1160,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) *p++ = htonl((u32)((n) >> 32)); \ *p++ = htonl((u32)(n)); \ } while (0) -#define WRITEMEM(ptr,nbytes) do { \ +#define WRITEMEM(ptr,nbytes) do { if (nbytes > 0) { \ *(p + XDR_QUADLEN(nbytes) -1) = 0; \ memcpy(p, ptr, nbytes); \ p += XDR_QUADLEN(nbytes); \ -} while (0) +}} while (0) #define WRITECINFO(c) do { \ *p++ = htonl(c.atomic); \ *p++ = htonl(c.before_ctime_sec); \ @@ -1991,7 +1950,7 @@ fail: return -EINVAL; } -static void +static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { ENCODE_HEAD; @@ -2002,9 +1961,10 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITE32(access->ac_resp_access); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { ENCODE_SEQID_OP_HEAD; @@ -2016,10 +1976,11 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c ADJUST_ARGS(); } ENCODE_SEQID_OP_TAIL(close->cl_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) { ENCODE_HEAD; @@ -2029,9 +1990,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITEMEM(commit->co_verf.data, 8); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) { ENCODE_HEAD; @@ -2044,6 +2006,7 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITE32(create->cr_bmval[1]); ADJUST_ARGS(); } + return nfserr; } static __be32 @@ -2064,9 +2027,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 return nfserr; } -static void -nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp) +static __be32 +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) { + struct svc_fh *fhp = *fhpp; unsigned int len; ENCODE_HEAD; @@ -2077,6 +2041,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh WRITEMEM(&fhp->fh_handle.fh_base, len); ADJUST_ARGS(); } + return nfserr; } /* @@ -2104,7 +2069,7 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie ADJUST_ARGS(); } -static void +static __be32 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) { ENCODE_SEQID_OP_HEAD; @@ -2118,16 +2083,18 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo nfsd4_encode_lock_denied(resp, &lock->lk_denied); ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); + return nfserr; } -static void +static __be32 nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) { if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lockt->lt_denied); + return nfserr; } -static void +static __be32 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) { ENCODE_SEQID_OP_HEAD; @@ -2140,10 +2107,11 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l } ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) { ENCODE_HEAD; @@ -2153,10 +2121,11 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li WRITECINFO(link->li_cinfo); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { ENCODE_SEQID_OP_HEAD; @@ -2219,9 +2188,10 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op /* XXX save filehandle here */ out: ENCODE_SEQID_OP_TAIL(open->op_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { ENCODE_SEQID_OP_HEAD; @@ -2234,9 +2204,10 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct } ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { ENCODE_SEQID_OP_HEAD; @@ -2249,6 +2220,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc } ENCODE_SEQID_OP_TAIL(od->od_stateowner); + return nfserr; } static __be32 @@ -2443,7 +2415,7 @@ err_no_verf: return nfserr; } -static void +static __be32 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) { ENCODE_HEAD; @@ -2453,9 +2425,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITECINFO(remove->rm_cinfo); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) { ENCODE_HEAD; @@ -2466,9 +2439,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITECINFO(rename->rn_tinfo); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo *secinfo) { @@ -2532,13 +2506,14 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, out: if (exp) exp_put(exp); + return nfserr; } /* * The SETATTR encode routine is special -- it always encodes a bitmap, * regardless of the error status. */ -static void +static __be32 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) { ENCODE_HEAD; @@ -2555,9 +2530,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 WRITE32(setattr->sa_bmval[1]); } ADJUST_ARGS(); + return nfserr; } -static void +static __be32 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) { ENCODE_HEAD; @@ -2574,9 +2550,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n WRITE32(0); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) { ENCODE_HEAD; @@ -2588,8 +2565,56 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w WRITEMEM(write->wr_verifier.data, 8); ADJUST_ARGS(); } + return nfserr; } +static __be32 +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +{ + return nfserr; +} + +typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); + +static nfsd4_enc nfsd4_enc_ops[] = { + [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, + [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, + [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, + [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, + [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, + [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, + [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, + [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, + [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, + [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, + [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, + [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, + [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_READ] = (nfsd4_enc)nfsd4_encode_read, + [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, + [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, + [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, + [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, + [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, + [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, + [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, + [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, +}; + void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { @@ -2601,101 +2626,12 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) statp = p++; /* to be backfilled at the end */ ADJUST_ARGS(); - switch (op->opnum) { - case OP_ACCESS: - nfsd4_encode_access(resp, op->status, &op->u.access); - break; - case OP_CLOSE: - nfsd4_encode_close(resp, op->status, &op->u.close); - break; - case OP_COMMIT: - nfsd4_encode_commit(resp, op->status, &op->u.commit); - break; - case OP_CREATE: - nfsd4_encode_create(resp, op->status, &op->u.create); - break; - case OP_DELEGRETURN: - break; - case OP_GETATTR: - op->status = nfsd4_encode_getattr(resp, op->status, &op->u.getattr); - break; - case OP_GETFH: - nfsd4_encode_getfh(resp, op->status, op->u.getfh); - break; - case OP_LINK: - nfsd4_encode_link(resp, op->status, &op->u.link); - break; - case OP_LOCK: - nfsd4_encode_lock(resp, op->status, &op->u.lock); - break; - case OP_LOCKT: - nfsd4_encode_lockt(resp, op->status, &op->u.lockt); - break; - case OP_LOCKU: - nfsd4_encode_locku(resp, op->status, &op->u.locku); - break; - case OP_LOOKUP: - break; - case OP_LOOKUPP: - break; - case OP_NVERIFY: - break; - case OP_OPEN: - nfsd4_encode_open(resp, op->status, &op->u.open); - break; - case OP_OPEN_CONFIRM: - nfsd4_encode_open_confirm(resp, op->status, &op->u.open_confirm); - break; - case OP_OPEN_DOWNGRADE: - nfsd4_encode_open_downgrade(resp, op->status, &op->u.open_downgrade); - break; - case OP_PUTFH: - break; - case OP_PUTROOTFH: - break; - case OP_READ: - op->status = nfsd4_encode_read(resp, op->status, &op->u.read); - break; - case OP_READDIR: - op->status = nfsd4_encode_readdir(resp, op->status, &op->u.readdir); - break; - case OP_READLINK: - op->status = nfsd4_encode_readlink(resp, op->status, &op->u.readlink); - break; - case OP_REMOVE: - nfsd4_encode_remove(resp, op->status, &op->u.remove); - break; - case OP_RENAME: - nfsd4_encode_rename(resp, op->status, &op->u.rename); - break; - case OP_RENEW: - break; - case OP_RESTOREFH: - break; - case OP_SAVEFH: - break; - case OP_SECINFO: - nfsd4_encode_secinfo(resp, op->status, &op->u.secinfo); - break; - case OP_SETATTR: - nfsd4_encode_setattr(resp, op->status, &op->u.setattr); - break; - case OP_SETCLIENTID: - nfsd4_encode_setclientid(resp, op->status, &op->u.setclientid); - break; - case OP_SETCLIENTID_CONFIRM: - break; - case OP_VERIFY: - break; - case OP_WRITE: - nfsd4_encode_write(resp, op->status, &op->u.write); - break; - case OP_RELEASE_LOCKOWNER: - break; - default: - break; - } - + if (op->opnum == OP_ILLEGAL) + goto status; + BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || + !nfsd4_enc_ops[op->opnum]); + op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); +status: /* * Note: We write the status directly, instead of using WRITE32(), * since it is already in network byte order. diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5ac00c4fee9..1955a2702e6 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -310,9 +310,12 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) { - __be32 server_ip; - char *fo_path, c; + struct sockaddr_in sin = { + .sin_family = AF_INET, + }; int b1, b2, b3, b4; + char c; + char *fo_path; /* sanity check */ if (size == 0) @@ -326,11 +329,13 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) return -EINVAL; /* get ipv4 address */ - if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) + if (sscanf(fo_path, NIPQUAD_FMT "%c", &b1, &b2, &b3, &b4, &c) != 4) return -EINVAL; - server_ip = htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4); + if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255) + return -EINVAL; + sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - return nlmsvc_unlock_all_by_ip(server_ip); + return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); } static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) @@ -450,22 +455,26 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) int i; int rv; int len; - int npools = nfsd_nrpools(); + int npools; int *nthreads; + mutex_lock(&nfsd_mutex); + npools = nfsd_nrpools(); if (npools == 0) { /* * NFS is shut down. The admin can start it by * writing to the threads file but NOT the pool_threads * file, sorry. Report zero threads. */ + mutex_unlock(&nfsd_mutex); strcpy(buf, "0\n"); return strlen(buf); } nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL); + rv = -ENOMEM; if (nthreads == NULL) - return -ENOMEM; + goto out_free; if (size > 0) { for (i = 0; i < npools; i++) { @@ -496,14 +505,16 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) mesg += len; } + mutex_unlock(&nfsd_mutex); return (mesg-buf); out_free: kfree(nthreads); + mutex_unlock(&nfsd_mutex); return rv; } -static ssize_t write_versions(struct file *file, char *buf, size_t size) +static ssize_t __write_versions(struct file *file, char *buf, size_t size) { /* * Format: @@ -566,14 +577,23 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) return len; } -static ssize_t write_ports(struct file *file, char *buf, size_t size) +static ssize_t write_versions(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_versions(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +static ssize_t __write_ports(struct file *file, char *buf, size_t size) { if (size == 0) { int len = 0; - lock_kernel(); + if (nfsd_serv) len = svc_xprt_names(nfsd_serv, buf, 0); - unlock_kernel(); return len; } /* Either a single 'fd' number is written, in which @@ -603,9 +623,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) /* Decrease the count, but don't shutdown the * the service */ - lock_kernel(); nfsd_serv->sv_nrthreads--; - unlock_kernel(); } return err < 0 ? err : 0; } @@ -614,10 +632,8 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) int len = 0; if (!toclose) return -ENOMEM; - lock_kernel(); if (nfsd_serv) len = svc_sock_names(buf, nfsd_serv, toclose); - unlock_kernel(); if (len >= 0) lockd_down(); kfree(toclose); @@ -655,7 +671,6 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { if (port == 0) return -EINVAL; - lock_kernel(); if (nfsd_serv) { xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); @@ -666,13 +681,23 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) } else err = -ENOTCONN; } - unlock_kernel(); return err < 0 ? err : 0; } } return -EINVAL; } +static ssize_t write_ports(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_ports(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + + int nfsd_max_blksize; static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) @@ -691,13 +716,13 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) if (bsize > NFSSVC_MAXBLKSIZE) bsize = NFSSVC_MAXBLKSIZE; bsize &= ~(1024-1); - lock_kernel(); + mutex_lock(&nfsd_mutex); if (nfsd_serv && nfsd_serv->sv_nrthreads) { - unlock_kernel(); + mutex_unlock(&nfsd_mutex); return -EBUSY; } nfsd_max_blksize = bsize; - unlock_kernel(); + mutex_unlock(&nfsd_mutex); } return sprintf(buf, "%d\n", nfsd_max_blksize); } @@ -705,16 +730,17 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) #ifdef CONFIG_NFSD_V4 extern time_t nfs4_leasetime(void); -static ssize_t write_leasetime(struct file *file, char *buf, size_t size) +static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) { /* if size > 10 seconds, call * nfs4_reset_lease() then write out the new lease (seconds) as reply */ char *mesg = buf; - int rv; + int rv, lease; if (size > 0) { - int lease; + if (nfsd_serv) + return -EBUSY; rv = get_int(&mesg, &lease); if (rv) return rv; @@ -726,24 +752,52 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) return strlen(buf); } -static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) +static ssize_t write_leasetime(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_leasetime(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +extern char *nfs4_recoverydir(void); + +static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) { char *mesg = buf; char *recdir; int len, status; - if (size == 0 || size > PATH_MAX || buf[size-1] != '\n') - return -EINVAL; - buf[size-1] = 0; + if (size > 0) { + if (nfsd_serv) + return -EBUSY; + if (size > PATH_MAX || buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; - recdir = mesg; - len = qword_get(&mesg, recdir, size); - if (len <= 0) - return -EINVAL; + recdir = mesg; + len = qword_get(&mesg, recdir, size); + if (len <= 0) + return -EINVAL; - status = nfs4_reset_recoverydir(recdir); + status = nfs4_reset_recoverydir(recdir); + } + sprintf(buf, "%s\n", nfs4_recoverydir()); return strlen(buf); } + +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_recoverydir(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + #endif /*----------------------------------------------------------------------------*/ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 100ae564116..f45451eb1e3 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -176,9 +176,24 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); - error = nfsd_setuser_and_check_port(rqstp, exp); - if (error) - goto out; + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { + /* Elevate privileges so that the lack of 'r' or 'x' + * permission on some parent directory will + * not stop exportfs_decode_fh from being able + * to reconnect a directory into the dentry cache. + * The same problem can affect "SUBTREECHECK" exports, + * but as nfsd_acceptable depends on correct + * access control settings being in effect, we cannot + * fix that case easily. + */ + current->cap_effective = + cap_raise_nfsd_set(current->cap_effective, + current->cap_permitted); + } else { + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + } /* * Look up the dentry using the NFS file handle. @@ -215,6 +230,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) goto out; } + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) { + dput(dentry); + goto out; + } + } + if (S_ISDIR(dentry->d_inode->i_mode) && (dentry->d_flags & DCACHE_DISCONNECTED)) { printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", @@ -279,7 +302,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) if (error) goto out; - if (!(access & MAY_LOCK)) { + if (!(access & NFSD_MAY_LOCK)) { /* * pseudoflavor restrictions are not enforced on NLM, * which clients virtually always use auth_sys for, diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6cfc96a1248..0766f95d236 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -65,7 +65,7 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); return nfsd_return_attrs(nfserr, resp); } @@ -215,11 +215,11 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, SVCFH_fmt(dirfhp), argp->len, argp->name); /* First verify the parent file handle */ - nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_EXEC); + nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); if (nfserr) goto done; /* must fh_put dirfhp even on error */ - /* Check for MAY_WRITE in nfsd_create if necessary */ + /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ nfserr = nfserr_acces; if (!argp->len) @@ -281,7 +281,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, nfserr = nfsd_permission(rqstp, newfhp->fh_export, newfhp->fh_dentry, - MAY_WRITE|MAY_LOCAL_ACCESS); + NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS); if (nfserr && nfserr != nfserr_rofs) goto out_unlock; } @@ -614,6 +614,7 @@ nfserrno (int errno) #endif { nfserr_stale, -ESTALE }, { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, { nfserr_dropit, -EAGAIN }, { nfserr_dropit, -ENOMEM }, { nfserr_badname, -ESRCH }, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 941041f4b13..80292ff5e92 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -21,6 +21,7 @@ #include <linux/smp_lock.h> #include <linux/freezer.h> #include <linux/fs_struct.h> +#include <linux/kthread.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> @@ -36,28 +37,38 @@ #define NFSDDBG_FACILITY NFSDDBG_SVC -/* these signals will be delivered to an nfsd thread - * when handling a request - */ -#define ALLOWED_SIGS (sigmask(SIGKILL)) -/* these signals will be delivered to an nfsd thread - * when not handling a request. i.e. when waiting - */ -#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT)) -/* if the last thread dies with SIGHUP, then the exports table is - * left unchanged ( like 2.4-{0-9} ). Any other signal will clear - * the exports table (like 2.2). - */ -#define SIG_NOCLEAN SIGHUP - extern struct svc_program nfsd_program; -static void nfsd(struct svc_rqst *rqstp); +static int nfsd(void *vrqstp); struct timeval nfssvc_boot; - struct svc_serv *nfsd_serv; static atomic_t nfsd_busy; static unsigned long nfsd_last_call; static DEFINE_SPINLOCK(nfsd_call_lock); +/* + * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members + * of the svc_serv struct. In particular, ->sv_nrthreads but also to some + * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt + * + * If (out side the lock) nfsd_serv is non-NULL, then it must point to a + * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number + * of nfsd threads must exist and each must listed in ->sp_all_threads in each + * entry of ->sv_pools[]. + * + * Transitions of the thread count between zero and non-zero are of particular + * interest since the svc_serv needs to be created and initialized at that + * point, or freed. + * + * Finally, the nfsd_mutex also protects some of the global variables that are + * accessed when nfsd starts and that are settable via the write_* routines in + * nfsctl.c. In particular: + * + * user_recovery_dirname + * user_lease_time + * nfsd_versions + */ +DEFINE_MUTEX(nfsd_mutex); +struct svc_serv *nfsd_serv; + #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static struct svc_version * nfsd_acl_version[] = { @@ -145,13 +156,14 @@ int nfsd_vers(int vers, enum vers_op change) int nfsd_nrthreads(void) { - if (nfsd_serv == NULL) - return 0; - else - return nfsd_serv->sv_nrthreads; + int rv = 0; + mutex_lock(&nfsd_mutex); + if (nfsd_serv) + rv = nfsd_serv->sv_nrthreads; + mutex_unlock(&nfsd_mutex); + return rv; } -static int killsig; /* signal that was used to kill last nfsd */ static void nfsd_last_thread(struct svc_serv *serv) { /* When last nfsd thread exits we need to do some clean-up */ @@ -162,11 +174,9 @@ static void nfsd_last_thread(struct svc_serv *serv) nfsd_racache_shutdown(); nfs4_state_shutdown(); - printk(KERN_WARNING "nfsd: last server has exited\n"); - if (killsig != SIG_NOCLEAN) { - printk(KERN_WARNING "nfsd: unexporting all filesystems\n"); - nfsd_export_flush(); - } + printk(KERN_WARNING "nfsd: last server has exited, flushing export " + "cache\n"); + nfsd_export_flush(); } void nfsd_reset_versions(void) @@ -190,13 +200,14 @@ void nfsd_reset_versions(void) } } + int nfsd_create_serv(void) { int err = 0; - lock_kernel(); + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nfsd_serv) { svc_get(nfsd_serv); - unlock_kernel(); return 0; } if (nfsd_max_blksize == 0) { @@ -217,13 +228,11 @@ int nfsd_create_serv(void) } atomic_set(&nfsd_busy, 0); - nfsd_serv = svc_create_pooled(&nfsd_program, - nfsd_max_blksize, - nfsd_last_thread, - nfsd, SIG_NOCLEAN, THIS_MODULE); + nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; - unlock_kernel(); + do_gettimeofday(&nfssvc_boot); /* record boot time */ return err; } @@ -282,6 +291,8 @@ int nfsd_set_nrthreads(int n, int *nthreads) int tot = 0; int err = 0; + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + if (nfsd_serv == NULL || n <= 0) return 0; @@ -316,7 +327,6 @@ int nfsd_set_nrthreads(int n, int *nthreads) nthreads[0] = 1; /* apply the new numbers */ - lock_kernel(); svc_get(nfsd_serv); for (i = 0; i < n; i++) { err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i], @@ -325,7 +335,6 @@ int nfsd_set_nrthreads(int n, int *nthreads) break; } svc_destroy(nfsd_serv); - unlock_kernel(); return err; } @@ -334,8 +343,8 @@ int nfsd_svc(unsigned short port, int nrservs) { int error; - - lock_kernel(); + + mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); error = -EINVAL; if (nrservs <= 0) @@ -363,7 +372,7 @@ nfsd_svc(unsigned short port, int nrservs) failure: svc_destroy(nfsd_serv); /* Release server */ out: - unlock_kernel(); + mutex_unlock(&nfsd_mutex); return error; } @@ -391,18 +400,17 @@ update_thread_usage(int busy_threads) /* * This is the NFS server kernel thread */ -static void -nfsd(struct svc_rqst *rqstp) +static int +nfsd(void *vrqstp) { + struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; struct fs_struct *fsp; - int err; - sigset_t shutdown_mask, allowed_mask; + int err, preverr = 0; /* Lock module and set up kernel thread */ - lock_kernel(); - daemonize("nfsd"); + mutex_lock(&nfsd_mutex); - /* After daemonize() this kernel thread shares current->fs + /* At this point, the thread shares current->fs * with the init process. We need to create files with a * umask of 0 instead of init's umask. */ fsp = copy_fs_struct(current->fs); @@ -414,14 +422,17 @@ nfsd(struct svc_rqst *rqstp) current->fs = fsp; current->fs->umask = 0; - siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS); - siginitsetinv(&allowed_mask, ALLOWED_SIGS); + /* + * thread is spawned with all signals set to SIG_IGN, re-enable + * the ones that will bring down the thread + */ + allow_signal(SIGKILL); + allow_signal(SIGHUP); + allow_signal(SIGINT); + allow_signal(SIGQUIT); nfsdstats.th_cnt++; - - rqstp->rq_task = current; - - unlock_kernel(); + mutex_unlock(&nfsd_mutex); /* * We want less throttling in balance_dirty_pages() so that nfs to @@ -435,26 +446,30 @@ nfsd(struct svc_rqst *rqstp) * The main request loop */ for (;;) { - /* Block all but the shutdown signals */ - sigprocmask(SIG_SETMASK, &shutdown_mask, NULL); - /* * Find a socket with data available and call its * recvfrom routine. */ while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) ; - if (err < 0) + if (err == -EINTR) break; + else if (err < 0) { + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, -err); + preverr = err; + } + schedule_timeout_uninterruptible(HZ); + continue; + } + update_thread_usage(atomic_read(&nfsd_busy)); atomic_inc(&nfsd_busy); /* Lock the export hash tables for reading. */ exp_readlock(); - /* Process request with signals blocked. */ - sigprocmask(SIG_SETMASK, &allowed_mask, NULL); - svc_process(rqstp); /* Unlock export hash tables */ @@ -463,22 +478,10 @@ nfsd(struct svc_rqst *rqstp) atomic_dec(&nfsd_busy); } - if (err != -EINTR) { - printk(KERN_WARNING "nfsd: terminating on error %d\n", -err); - } else { - unsigned int signo; - - for (signo = 1; signo <= _NSIG; signo++) - if (sigismember(¤t->pending.signal, signo) && - !sigismember(¤t->blocked, signo)) - break; - killsig = signo; - } /* Clear signals before calling svc_exit_thread() */ flush_signals(current); - lock_kernel(); - + mutex_lock(&nfsd_mutex); nfsdstats.th_cnt --; out: @@ -486,8 +489,9 @@ out: svc_exit_thread(rqstp); /* Release module */ - unlock_kernel(); + mutex_unlock(&nfsd_mutex); module_put_and_exit(0); + return 0; } static __be32 map_new_errors(u32 vers, __be32 nfserr) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a3a291f771f..0f4481e0502 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -144,7 +144,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); /* Obtain dentry and export. */ - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); if (err) return err; @@ -262,14 +262,14 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, { struct dentry *dentry; struct inode *inode; - int accmode = MAY_SATTR; + int accmode = NFSD_MAY_SATTR; int ftype = 0; __be32 err; int host_err; int size_change = 0; if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) - accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE; + accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; if (iap->ia_valid & ATTR_SIZE) ftype = S_IFREG; @@ -331,7 +331,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, */ if (iap->ia_valid & ATTR_SIZE) { if (iap->ia_size < inode->i_size) { - err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); + err = nfsd_permission(rqstp, fhp->fh_export, dentry, + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; } @@ -462,7 +463,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int flags = 0; /* Get inode */ - error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); + error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); if (error) return error; @@ -563,20 +564,20 @@ struct accessmap { int how; }; static struct accessmap nfs3_regaccess[] = { - { NFS3_ACCESS_READ, MAY_READ }, - { NFS3_ACCESS_EXECUTE, MAY_EXEC }, - { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_TRUNC }, - { NFS3_ACCESS_EXTEND, MAY_WRITE }, + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE }, { 0, 0 } }; static struct accessmap nfs3_diraccess[] = { - { NFS3_ACCESS_READ, MAY_READ }, - { NFS3_ACCESS_LOOKUP, MAY_EXEC }, - { NFS3_ACCESS_MODIFY, MAY_EXEC|MAY_WRITE|MAY_TRUNC }, - { NFS3_ACCESS_EXTEND, MAY_EXEC|MAY_WRITE }, - { NFS3_ACCESS_DELETE, MAY_REMOVE }, + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_LOOKUP, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC}, + { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE }, + { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE }, { 0, 0 } }; @@ -589,10 +590,10 @@ static struct accessmap nfs3_anyaccess[] = { * mainly at mode bits, and we make sure to ignore read-only * filesystem checks */ - { NFS3_ACCESS_READ, MAY_READ }, - { NFS3_ACCESS_EXECUTE, MAY_EXEC }, - { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_LOCAL_ACCESS }, - { NFS3_ACCESS_EXTEND, MAY_WRITE|MAY_LOCAL_ACCESS }, + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, { 0, 0 } }; @@ -606,7 +607,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor u32 query, result = 0, sresult = 0; __be32 error; - error = fh_verify(rqstp, fhp, 0, MAY_NOP); + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); if (error) goto out; @@ -678,7 +679,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * and (hopefully) checked permission - so allow OWNER_OVERRIDE * in case a chmod has now revoked permission. */ - err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE); + err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; @@ -689,7 +690,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * or any access when mandatory locking enabled */ err = nfserr_perm; - if (IS_APPEND(inode) && (access & MAY_WRITE)) + if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE)) goto out; /* * We must ignore files (but only files) which might have mandatory @@ -706,14 +707,14 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * Check to see if there are any leases on this file. * This may block while leases are broken. */ - host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0)); + host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0)); if (host_err == -EWOULDBLOCK) host_err = -ETIMEDOUT; if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; - if (access & MAY_WRITE) { - if (access & MAY_READ) + if (access & NFSD_MAY_WRITE) { + if (access & NFSD_MAY_READ) flags = O_RDWR|O_LARGEFILE; else flags = O_WRONLY|O_LARGEFILE; @@ -1069,12 +1070,12 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (file) { err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - MAY_READ|MAY_OWNER_OVERRIDE); + NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); } else { - err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file); + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (err) goto out; err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); @@ -1098,13 +1099,13 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (file) { err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - MAY_WRITE|MAY_OWNER_OVERRIDE); + NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); } else { - err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file); + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); if (err) goto out; @@ -1136,7 +1137,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, if ((u64)count > ~(u64)offset) return nfserr_inval; - if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0) + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + if (err) return err; if (EX_ISSYNC(fhp->fh_export)) { if (file->f_op && file->f_op->fsync) { @@ -1197,7 +1199,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (isdotent(fname, flen)) goto out; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1248,36 +1250,34 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode = 0; iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; + err = nfserr_inval; + if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) { + printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", + type); + goto out; + } + + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + /* * Get the dir op function pointer. */ err = 0; switch (type) { case S_IFREG: - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); - if (host_err) - goto out_nfserr; host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); break; case S_IFDIR: - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); - if (host_err) - goto out_nfserr; host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); - if (host_err) - goto out_nfserr; host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); break; - default: - printk("nfsd: bad file type %o in nfsd_create\n", type); - host_err = -EINVAL; - goto out_nfserr; } if (host_err < 0) { mnt_drop_write(fhp->fh_export->ex_path.mnt); @@ -1289,7 +1289,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, write_inode_now(dchild->d_inode, 1); } - err2 = nfsd_create_setattr(rqstp, resfhp, iap); if (err2) err = err2; @@ -1334,7 +1333,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; if (!(iap->ia_valid & ATTR_MODE)) iap->ia_mode = 0; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1471,7 +1470,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) __be32 err; int host_err; - err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP); + err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP); if (err) goto out; @@ -1526,7 +1525,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (isdotent(fname, flen)) goto out; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; fh_lock(fhp); @@ -1591,10 +1590,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, __be32 err; int host_err; - err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; - err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP); + err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); if (err) goto out; @@ -1661,10 +1660,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, __be32 err; int host_err; - err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) goto out; - err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1768,7 +1767,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, err = nfserr_acces; if (!flen || isdotent(fname, flen)) goto out; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) goto out; @@ -1834,7 +1833,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, struct file *file; loff_t offset = *offsetp; - err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file); + err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file); if (err) goto out; @@ -1875,7 +1874,7 @@ out: __be32 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) { - __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP); + __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); if (!err && vfs_statfs(fhp->fh_dentry,stat)) err = nfserr_io; return err; @@ -1896,18 +1895,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct inode *inode = dentry->d_inode; int err; - if (acc == MAY_NOP) + if (acc == NFSD_MAY_NOP) return 0; #if 0 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", acc, - (acc & MAY_READ)? " read" : "", - (acc & MAY_WRITE)? " write" : "", - (acc & MAY_EXEC)? " exec" : "", - (acc & MAY_SATTR)? " sattr" : "", - (acc & MAY_TRUNC)? " trunc" : "", - (acc & MAY_LOCK)? " lock" : "", - (acc & MAY_OWNER_OVERRIDE)? " owneroverride" : "", + (acc & NFSD_MAY_READ)? " read" : "", + (acc & NFSD_MAY_WRITE)? " write" : "", + (acc & NFSD_MAY_EXEC)? " exec" : "", + (acc & NFSD_MAY_SATTR)? " sattr" : "", + (acc & NFSD_MAY_TRUNC)? " trunc" : "", + (acc & NFSD_MAY_LOCK)? " lock" : "", + (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "", inode->i_mode, IS_IMMUTABLE(inode)? " immut" : "", IS_APPEND(inode)? " append" : "", @@ -1920,18 +1919,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, * system. But if it is IRIX doing check on write-access for a * device special file, we ignore rofs. */ - if (!(acc & MAY_LOCAL_ACCESS)) - if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { + if (!(acc & NFSD_MAY_LOCAL_ACCESS)) + if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) { if (exp_rdonly(rqstp, exp) || __mnt_is_readonly(exp->ex_path.mnt)) return nfserr_rofs; - if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) + if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode)) return nfserr_perm; } - if ((acc & MAY_TRUNC) && IS_APPEND(inode)) + if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode)) return nfserr_perm; - if (acc & MAY_LOCK) { + if (acc & NFSD_MAY_LOCK) { /* If we cannot rely on authentication in NLM requests, * just allow locks, otherwise require read permission, or * ownership @@ -1939,7 +1938,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, if (exp->ex_flags & NFSEXP_NOAUTHNLM) return 0; else - acc = MAY_READ | MAY_OWNER_OVERRIDE; + acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE; } /* * The file owner always gets access permission for accesses that @@ -1955,15 +1954,16 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, * We must trust the client to do permission checking - using "ACCESS" * with NFSv3. */ - if ((acc & MAY_OWNER_OVERRIDE) && + if ((acc & NFSD_MAY_OWNER_OVERRIDE) && inode->i_uid == current->fsuid) return 0; + /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && - acc == (MAY_READ | MAY_OWNER_OVERRIDE)) + acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) err = permission(inode, MAY_EXEC, NULL); return err? nfserrno(err) : 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index c6455dadb21..9c2ac5c0ef5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -918,12 +918,12 @@ struct file_lock { struct list_head fl_link; /* doubly linked list of all locks */ struct list_head fl_block; /* circular list of blocked processes */ fl_owner_t fl_owner; + unsigned char fl_flags; + unsigned char fl_type; unsigned int fl_pid; struct pid *fl_nspid; wait_queue_head_t fl_wait; struct file *fl_file; - unsigned char fl_flags; - unsigned char fl_type; loff_t fl_start; loff_t fl_end; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 102d928f720..dbb87ab282e 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -200,10 +200,12 @@ typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref); * Server-side lock handling */ __be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, - struct nlm_lock *, int, struct nlm_cookie *); + struct nlm_host *, struct nlm_lock *, int, + struct nlm_cookie *); __be32 nlmsvc_unlock(struct nlm_file *, struct nlm_lock *); __be32 nlmsvc_testlock(struct svc_rqst *, struct nlm_file *, - struct nlm_lock *, struct nlm_lock *, struct nlm_cookie *); + struct nlm_host *, struct nlm_lock *, + struct nlm_lock *, struct nlm_cookie *); __be32 nlmsvc_cancel_blocked(struct nlm_file *, struct nlm_lock *); unsigned long nlmsvc_retry_blocked(void); void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, @@ -224,7 +226,7 @@ void nlmsvc_invalidate_all(void); * Cluster failover support */ int nlmsvc_unlock_all_by_sb(struct super_block *sb); -int nlmsvc_unlock_all_by_ip(__be32 server_addr); +int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 8726491de15..ea036676948 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -65,9 +65,6 @@ #define NFS4_ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x00000010 #define NFS4_ACE_FAILED_ACCESS_ACE_FLAG 0x00000020 #define NFS4_ACE_IDENTIFIER_GROUP 0x00000040 -#define NFS4_ACE_OWNER 0x00000080 -#define NFS4_ACE_GROUP 0x00000100 -#define NFS4_ACE_EVERYONE 0x00000200 #define NFS4_ACE_READ_DATA 0x00000001 #define NFS4_ACE_LIST_DIRECTORY 0x00000001 diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 41d30c9c9de..a2861d95ecc 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -28,20 +28,20 @@ #define NFSD_SUPPORTED_MINOR_VERSION 0 /* - * Special flags for nfsd_permission. These must be different from MAY_READ, - * MAY_WRITE, and MAY_EXEC. + * Flags for nfsd_permission */ -#define MAY_NOP 0 -#define MAY_SATTR 8 -#define MAY_TRUNC 16 -#define MAY_LOCK 32 -#define MAY_OWNER_OVERRIDE 64 -#define MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ -#if (MAY_SATTR | MAY_TRUNC | MAY_LOCK | MAY_OWNER_OVERRIDE | MAY_LOCAL_ACCESS) & (MAY_READ | MAY_WRITE | MAY_EXEC) -# error "please use a different value for MAY_SATTR or MAY_TRUNC or MAY_LOCK or MAY_LOCAL_ACCESS or MAY_OWNER_OVERRIDE." -#endif -#define MAY_CREATE (MAY_EXEC|MAY_WRITE) -#define MAY_REMOVE (MAY_EXEC|MAY_WRITE|MAY_TRUNC) +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ +#define NFSD_MAY_READ 4 /* == MAY_READ */ +#define NFSD_MAY_SATTR 8 +#define NFSD_MAY_TRUNC 16 +#define NFSD_MAY_LOCK 32 +#define NFSD_MAY_OWNER_OVERRIDE 64 +#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ + +#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) +#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) /* * Callback function for readdir @@ -54,6 +54,7 @@ typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); extern struct svc_program nfsd_program; extern struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; +extern struct mutex nfsd_mutex; extern struct svc_serv *nfsd_serv; extern struct seq_operations nfs_exports_op; diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index db348f74937..d0fe2e37845 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -98,8 +98,6 @@ struct nfs4_callback { u32 cb_ident; /* RPC client info */ atomic_t cb_set; /* successful CB_NULL call */ - struct rpc_program cb_program; - struct rpc_stat cb_stat; struct rpc_clnt * cb_client; }; diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h index a10f1fb0bf7..e7bbdba474d 100644 --- a/include/linux/sunrpc/gss_krb5.h +++ b/include/linux/sunrpc/gss_krb5.h @@ -51,6 +51,9 @@ struct krb5_ctx { extern spinlock_t krb5_seq_lock; +/* The length of the Kerberos GSS token header */ +#define GSS_KRB5_TOK_HDR_LEN (16) + #define KG_TOK_MIC_MSG 0x0101 #define KG_TOK_WRAP_MSG 0x0201 diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4b54c5fdcfd..dc69068d94c 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -22,7 +22,7 @@ /* * This is the RPC server thread function prototype */ -typedef void (*svc_thread_fn)(struct svc_rqst *); +typedef int (*svc_thread_fn)(void *); /* * @@ -80,7 +80,6 @@ struct svc_serv { struct module * sv_module; /* optional module to count when * adding threads */ svc_thread_fn sv_function; /* main function for threads */ - int sv_kill_signal; /* signal to kill threads */ }; /* @@ -388,8 +387,8 @@ struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - void (*shutdown)(struct svc_serv*), - svc_thread_fn, int sig, struct module *); + void (*shutdown)(struct svc_serv*), svc_thread_fn, + struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 05eb4664d0d..ef2e3a20bf3 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -72,7 +72,7 @@ extern atomic_t rdma_stat_sq_prod; */ struct svc_rdma_op_ctxt { struct svc_rdma_op_ctxt *read_hdr; - struct list_head free_list; + int hdr_count; struct xdr_buf arg; struct list_head dto_q; enum ib_wr_opcode wr_op; @@ -86,6 +86,31 @@ struct svc_rdma_op_ctxt { struct page *pages[RPCSVC_MAXPAGES]; }; +/* + * NFS_ requests are mapped on the client side by the chunk lists in + * the RPCRDMA header. During the fetching of the RPC from the client + * and the writing of the reply to the client, the memory in the + * client and the memory in the server must be mapped as contiguous + * vaddr/len for access by the hardware. These data strucures keep + * these mappings. + * + * For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the + * 'sge' in the svc_rdma_req_map maps the server side RPC reply and the + * 'ch' field maps the read-list of the RPCRDMA header to the 'sge' + * mapping of the reply. + */ +struct svc_rdma_chunk_sge { + int start; /* sge no for this chunk */ + int count; /* sge count for this chunk */ +}; +struct svc_rdma_req_map { + unsigned long count; + union { + struct kvec sge[RPCSVC_MAXPAGES]; + struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES]; + }; +}; + #define RDMACTXT_F_LAST_CTXT 2 struct svcxprt_rdma { @@ -93,7 +118,6 @@ struct svcxprt_rdma { struct rdma_cm_id *sc_cm_id; /* RDMA connection id */ struct list_head sc_accept_q; /* Conn. waiting accept */ int sc_ord; /* RDMA read limit */ - wait_queue_head_t sc_read_wait; int sc_max_sge; int sc_sq_depth; /* Depth of SQ */ @@ -104,12 +128,8 @@ struct svcxprt_rdma { struct ib_pd *sc_pd; + atomic_t sc_dma_used; atomic_t sc_ctxt_used; - struct list_head sc_ctxt_free; - int sc_ctxt_cnt; - int sc_ctxt_bump; - int sc_ctxt_max; - spinlock_t sc_ctxt_lock; struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; struct ib_qp *sc_qp; @@ -173,6 +193,8 @@ extern int svc_rdma_post_recv(struct svcxprt_rdma *); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); +extern struct svc_rdma_req_map *svc_rdma_get_req_map(void); +extern void svc_rdma_put_req_map(struct svc_rdma_req_map *); extern void svc_sq_reap(struct svcxprt_rdma *); extern void svc_rq_reap(struct svcxprt_rdma *); extern struct svc_xprt_class svc_rdma_class; diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index f3431a7e33d..4de8bcf26fa 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -5,12 +5,12 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o auth_rpcgss-objs := auth_gss.o gss_generic_token.o \ - gss_mech_switch.o svcauth_gss.o gss_krb5_crypto.o + gss_mech_switch.o svcauth_gss.o obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ - gss_krb5_seqnum.o gss_krb5_wrap.o + gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 1d52308ca32..c93fca20455 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -83,8 +83,6 @@ out: return ret; } -EXPORT_SYMBOL(krb5_encrypt); - u32 krb5_decrypt( struct crypto_blkcipher *tfm, @@ -118,8 +116,6 @@ out: return ret; } -EXPORT_SYMBOL(krb5_decrypt); - static int checksummer(struct scatterlist *sg, void *data) { @@ -161,8 +157,6 @@ out: return err ? GSS_S_FAILURE : 0; } -EXPORT_SYMBOL(make_checksum); - struct encryptor_desc { u8 iv[8]; /* XXX hard-coded blocksize */ struct blkcipher_desc desc; @@ -262,8 +256,6 @@ gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf, return ret; } -EXPORT_SYMBOL(gss_encrypt_xdr_buf); - struct decryptor_desc { u8 iv[8]; /* XXX hard-coded blocksize */ struct blkcipher_desc desc; @@ -334,5 +326,3 @@ gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf, return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); } - -EXPORT_SYMBOL(gss_decrypt_xdr_buf); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 5f1d36dfbcf..b8f42ef7178 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -78,7 +78,7 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; char cksumdata[16]; struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; - unsigned char *ptr, *krb5_hdr, *msg_start; + unsigned char *ptr, *msg_start; s32 now; u32 seq_send; @@ -87,36 +87,36 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, now = get_seconds(); - token->len = g_token_size(&ctx->mech_used, 24); + token->len = g_token_size(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8); ptr = token->data; - g_make_token_header(&ctx->mech_used, 24, &ptr); + g_make_token_header(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8, &ptr); - *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff); - *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff); + /* ptr now at header described in rfc 1964, section 1.2.1: */ + ptr[0] = (unsigned char) ((KG_TOK_MIC_MSG >> 8) & 0xff); + ptr[1] = (unsigned char) (KG_TOK_MIC_MSG & 0xff); - /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ - krb5_hdr = ptr - 2; - msg_start = krb5_hdr + 24; + msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8; - *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5); - memset(krb5_hdr + 4, 0xff, 4); + *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); + memset(ptr + 4, 0xff, 4); - if (make_checksum("md5", krb5_hdr, 8, text, 0, &md5cksum)) + if (make_checksum("md5", ptr, 8, text, 0, &md5cksum)) return GSS_S_FAILURE; if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, md5cksum.len)) return GSS_S_FAILURE; - memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8); + memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); spin_lock(&krb5_seq_lock); seq_send = ctx->seq_send++; spin_unlock(&krb5_seq_lock); if (krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, - seq_send, krb5_hdr + 16, krb5_hdr + 8)) + seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, + ptr + 8)) return GSS_S_FAILURE; return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index d91a5d00480..066ec73c84d 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -92,30 +92,30 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, read_token->len)) return GSS_S_DEFECTIVE_TOKEN; - if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) || - (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) ) + if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) || + (ptr[1] != (KG_TOK_MIC_MSG & 0xff))) return GSS_S_DEFECTIVE_TOKEN; /* XXX sanity-check bodysize?? */ - signalg = ptr[0] + (ptr[1] << 8); + signalg = ptr[2] + (ptr[3] << 8); if (signalg != SGN_ALG_DES_MAC_MD5) return GSS_S_DEFECTIVE_TOKEN; - sealalg = ptr[2] + (ptr[3] << 8); + sealalg = ptr[4] + (ptr[5] << 8); if (sealalg != SEAL_ALG_NONE) return GSS_S_DEFECTIVE_TOKEN; - if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) return GSS_S_DEFECTIVE_TOKEN; - if (make_checksum("md5", ptr - 2, 8, message_buffer, 0, &md5cksum)) + if (make_checksum("md5", ptr, 8, message_buffer, 0, &md5cksum)) return GSS_S_FAILURE; if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, 16)) return GSS_S_FAILURE; - if (memcmp(md5cksum.data + 8, ptr + 14, 8)) + if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) return GSS_S_BAD_SIG; /* it got through unscathed. Make sure the context is unexpired */ @@ -127,7 +127,7 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, /* do sequencing checks */ - if (krb5_get_seq_num(ctx->seq, ptr + 14, ptr + 6, &direction, &seqnum)) + if (krb5_get_seq_num(ctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, &direction, &seqnum)) return GSS_S_FAILURE; if ((ctx->initiate && direction != 0xff) || diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index b00b1b42630..ae8e69b59c4 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -87,8 +87,8 @@ out: return 0; } -static inline void -make_confounder(char *p, int blocksize) +static void +make_confounder(char *p, u32 conflen) { static u64 i = 0; u64 *q = (u64 *)p; @@ -102,8 +102,22 @@ make_confounder(char *p, int blocksize) * uniqueness would mean worrying about atomicity and rollover, and I * don't care enough. */ - BUG_ON(blocksize != 8); - *q = i++; + /* initialize to random value */ + if (i == 0) { + i = random32(); + i = (i << 32) | random32(); + } + + switch (conflen) { + case 16: + *q++ = i++; + /* fall through */ + case 8: + *q++ = i++; + break; + default: + BUG(); + } } /* Assumptions: the head and tail of inbuf are ours to play with. @@ -122,7 +136,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, char cksumdata[16]; struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; int blocksize = 0, plainlen; - unsigned char *ptr, *krb5_hdr, *msg_start; + unsigned char *ptr, *msg_start; s32 now; int headlen; struct page **tmp_pages; @@ -149,26 +163,26 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, buf->len += headlen; BUG_ON((buf->len - offset - headlen) % blocksize); - g_make_token_header(&kctx->mech_used, 24 + plainlen, &ptr); + g_make_token_header(&kctx->mech_used, + GSS_KRB5_TOK_HDR_LEN + 8 + plainlen, &ptr); - *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff); - *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff); + /* ptr now at header described in rfc 1964, section 1.2.1: */ + ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff); + ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff); - /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ - krb5_hdr = ptr - 2; - msg_start = krb5_hdr + 24; + msg_start = ptr + 24; - *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5); - memset(krb5_hdr + 4, 0xff, 4); - *(__be16 *)(krb5_hdr + 4) = htons(SEAL_ALG_DES); + *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); + memset(ptr + 4, 0xff, 4); + *(__be16 *)(ptr + 4) = htons(SEAL_ALG_DES); make_confounder(msg_start, blocksize); /* XXXJBF: UGH!: */ tmp_pages = buf->pages; buf->pages = pages; - if (make_checksum("md5", krb5_hdr, 8, buf, + if (make_checksum("md5", ptr, 8, buf, offset + headlen - blocksize, &md5cksum)) return GSS_S_FAILURE; buf->pages = tmp_pages; @@ -176,7 +190,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, md5cksum.data, md5cksum.len)) return GSS_S_FAILURE; - memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8); + memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); spin_lock(&krb5_seq_lock); seq_send = kctx->seq_send++; @@ -185,7 +199,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, /* XXX would probably be more efficient to compute checksum * and encrypt at the same time: */ if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, - seq_send, krb5_hdr + 16, krb5_hdr + 8))) + seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))) return GSS_S_FAILURE; if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, @@ -219,38 +233,38 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) buf->len - offset)) return GSS_S_DEFECTIVE_TOKEN; - if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) || - (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) ) + if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) || + (ptr[1] != (KG_TOK_WRAP_MSG & 0xff))) return GSS_S_DEFECTIVE_TOKEN; /* XXX sanity-check bodysize?? */ /* get the sign and seal algorithms */ - signalg = ptr[0] + (ptr[1] << 8); + signalg = ptr[2] + (ptr[3] << 8); if (signalg != SGN_ALG_DES_MAC_MD5) return GSS_S_DEFECTIVE_TOKEN; - sealalg = ptr[2] + (ptr[3] << 8); + sealalg = ptr[4] + (ptr[5] << 8); if (sealalg != SEAL_ALG_DES) return GSS_S_DEFECTIVE_TOKEN; - if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) return GSS_S_DEFECTIVE_TOKEN; if (gss_decrypt_xdr_buf(kctx->enc, buf, - ptr + 22 - (unsigned char *)buf->head[0].iov_base)) + ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base)) return GSS_S_DEFECTIVE_TOKEN; - if (make_checksum("md5", ptr - 2, 8, buf, - ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum)) + if (make_checksum("md5", ptr, 8, buf, + ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base, &md5cksum)) return GSS_S_FAILURE; if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, md5cksum.data, md5cksum.len)) return GSS_S_FAILURE; - if (memcmp(md5cksum.data + 8, ptr + 14, 8)) + if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) return GSS_S_BAD_SIG; /* it got through unscathed. Make sure the context is unexpired */ @@ -262,8 +276,8 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) /* do sequencing checks */ - if (krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction, - &seqnum)) + if (krb5_get_seq_num(kctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, + &direction, &seqnum)) return GSS_S_BAD_SIG; if ((kctx->initiate && direction != 0xff) || @@ -274,7 +288,7 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) * better to copy and encrypt at the same time. */ blocksize = crypto_blkcipher_blocksize(kctx->enc); - data_start = ptr + 22 + blocksize; + data_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8 + blocksize; orig_start = buf->head[0].iov_base + offset; data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; memmove(orig_start, data_start, data_len); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 01c7e311b90..5a32cb7c4bb 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -18,6 +18,7 @@ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/module.h> +#include <linux/kthread.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/xdr.h> @@ -291,15 +292,14 @@ svc_pool_map_put(void) /* - * Set the current thread's cpus_allowed mask so that it + * Set the given thread's cpus_allowed mask so that it * will only run on cpus in the given pool. - * - * Returns 1 and fills in oldmask iff a cpumask was applied. */ -static inline int -svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) +static inline void +svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) { struct svc_pool_map *m = &svc_pool_map; + unsigned int node = m->pool_to[pidx]; /* * The caller checks for sv_nrpools > 1, which @@ -307,26 +307,17 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) */ BUG_ON(m->count == 0); - switch (m->mode) - { - default: - return 0; + switch (m->mode) { case SVC_POOL_PERCPU: { - unsigned int cpu = m->pool_to[pidx]; - - *oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - return 1; + set_cpus_allowed_ptr(task, &cpumask_of_cpu(node)); + break; } case SVC_POOL_PERNODE: { - unsigned int node = m->pool_to[pidx]; node_to_cpumask_ptr(nodecpumask, node); - - *oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, nodecpumask); - return 1; + set_cpus_allowed_ptr(task, nodecpumask); + break; } } } @@ -443,7 +434,7 @@ EXPORT_SYMBOL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, void (*shutdown)(struct svc_serv *serv), - svc_thread_fn func, int sig, struct module *mod) + svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); @@ -452,7 +443,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, if (serv != NULL) { serv->sv_function = func; - serv->sv_kill_signal = sig; serv->sv_module = mod; } @@ -461,7 +451,8 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, EXPORT_SYMBOL(svc_create_pooled); /* - * Destroy an RPC service. Should be called with the BKL held + * Destroy an RPC service. Should be called with appropriate locking to + * protect the sv_nrthreads, sv_permsocks and sv_tempsocks. */ void svc_destroy(struct svc_serv *serv) @@ -578,46 +569,6 @@ out_enomem: EXPORT_SYMBOL(svc_prepare_thread); /* - * Create a thread in the given pool. Caller must hold BKL. - * On a NUMA or SMP machine, with a multi-pool serv, the thread - * will be restricted to run on the cpus belonging to the pool. - */ -static int -__svc_create_thread(svc_thread_fn func, struct svc_serv *serv, - struct svc_pool *pool) -{ - struct svc_rqst *rqstp; - int error = -ENOMEM; - int have_oldmask = 0; - cpumask_t uninitialized_var(oldmask); - - rqstp = svc_prepare_thread(serv, pool); - if (IS_ERR(rqstp)) { - error = PTR_ERR(rqstp); - goto out; - } - - if (serv->sv_nrpools > 1) - have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); - - error = kernel_thread((int (*)(void *)) func, rqstp, 0); - - if (have_oldmask) - set_cpus_allowed(current, oldmask); - - if (error < 0) - goto out_thread; - svc_sock_update_bufs(serv); - error = 0; -out: - return error; - -out_thread: - svc_exit_thread(rqstp); - goto out; -} - -/* * Choose a pool in which to create a new thread, for svc_set_num_threads */ static inline struct svc_pool * @@ -674,7 +625,7 @@ found_pool: * of threads the given number. If `pool' is non-NULL, applies * only to threads in that pool, otherwise round-robins between * all pools. Must be called with a svc_get() reference and - * the BKL held. + * the BKL or another lock to protect access to svc_serv fields. * * Destroying threads relies on the service threads filling in * rqstp->rq_task, which only the nfs ones do. Assumes the serv @@ -686,7 +637,9 @@ found_pool: int svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { - struct task_struct *victim; + struct svc_rqst *rqstp; + struct task_struct *task; + struct svc_pool *chosen_pool; int error = 0; unsigned int state = serv->sv_nrthreads-1; @@ -702,18 +655,34 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) /* create new threads */ while (nrservs > 0) { nrservs--; + chosen_pool = choose_pool(serv, pool, &state); + + rqstp = svc_prepare_thread(serv, chosen_pool); + if (IS_ERR(rqstp)) { + error = PTR_ERR(rqstp); + break; + } + __module_get(serv->sv_module); - error = __svc_create_thread(serv->sv_function, serv, - choose_pool(serv, pool, &state)); - if (error < 0) { + task = kthread_create(serv->sv_function, rqstp, serv->sv_name); + if (IS_ERR(task)) { + error = PTR_ERR(task); module_put(serv->sv_module); + svc_exit_thread(rqstp); break; } + + rqstp->rq_task = task; + if (serv->sv_nrpools > 1) + svc_pool_map_set_cpumask(task, chosen_pool->sp_id); + + svc_sock_update_bufs(serv); + wake_up_process(task); } /* destroy old threads */ while (nrservs < 0 && - (victim = choose_victim(serv, pool, &state)) != NULL) { - send_sig(serv->sv_kill_signal, victim, 1); + (task = choose_victim(serv, pool, &state)) != NULL) { + send_sig(SIGINT, task, 1); nrservs++; } @@ -722,7 +691,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) EXPORT_SYMBOL(svc_set_num_threads); /* - * Called from a server thread as it's exiting. Caller must hold BKL. + * Called from a server thread as it's exiting. Caller must hold the BKL or + * the "service mutex", whichever is appropriate for the service. */ void svc_exit_thread(struct svc_rqst *rqstp) diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 88c0ca20bb1..87101177825 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -69,6 +69,10 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; +/* Temporary NFS request map and context caches */ +struct kmem_cache *svc_rdma_map_cachep; +struct kmem_cache *svc_rdma_ctxt_cachep; + /* * This function implements reading and resetting an atomic_t stat * variable through read/write to a proc file. Any write to the file @@ -236,11 +240,14 @@ static ctl_table svcrdma_root_table[] = { void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); + flush_scheduled_work(); if (svcrdma_table_header) { unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; } svc_unreg_xprt_class(&svc_rdma_class); + kmem_cache_destroy(svc_rdma_map_cachep); + kmem_cache_destroy(svc_rdma_ctxt_cachep); } int svc_rdma_init(void) @@ -255,9 +262,37 @@ int svc_rdma_init(void) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); + /* Create the temporary map cache */ + svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache", + sizeof(struct svc_rdma_req_map), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!svc_rdma_map_cachep) { + printk(KERN_INFO "Could not allocate map cache.\n"); + goto err0; + } + + /* Create the temporary context cache */ + svc_rdma_ctxt_cachep = + kmem_cache_create("svc_rdma_ctxt_cache", + sizeof(struct svc_rdma_op_ctxt), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!svc_rdma_ctxt_cachep) { + printk(KERN_INFO "Could not allocate WR ctxt cache.\n"); + goto err1; + } + /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); return 0; + err1: + kmem_cache_destroy(svc_rdma_map_cachep); + err0: + unregister_sysctl_table(svcrdma_table_header); + return -ENOMEM; } MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); MODULE_DESCRIPTION("SVC RDMA Transport"); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 06ab4841537..b4b17f44cb2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -112,11 +112,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -struct chunk_sge { - int start; /* sge no for this chunk */ - int count; /* sge count for this chunk */ -}; - /* Encode a read-chunk-list as an array of IB SGE * * Assumptions: @@ -134,8 +129,8 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, struct rpcrdma_msg *rmsgp, - struct ib_sge *sge, - struct chunk_sge *ch_sge_ary, + struct svc_rdma_req_map *rpl_map, + struct svc_rdma_req_map *chl_map, int ch_count, int byte_count) { @@ -156,22 +151,18 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, head->arg.head[0] = rqstp->rq_arg.head[0]; head->arg.tail[0] = rqstp->rq_arg.tail[0]; head->arg.pages = &head->pages[head->count]; - head->sge[0].length = head->count; /* save count of hdr pages */ + head->hdr_count = head->count; /* save count of hdr pages */ head->arg.page_base = 0; head->arg.page_len = ch_bytes; head->arg.len = rqstp->rq_arg.len + ch_bytes; head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; head->count++; - ch_sge_ary[0].start = 0; + chl_map->ch[0].start = 0; while (byte_count) { + rpl_map->sge[sge_no].iov_base = + page_address(rqstp->rq_arg.pages[page_no]) + page_off; sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); - sge[sge_no].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - rqstp->rq_arg.pages[page_no], - page_off, sge_bytes, - DMA_FROM_DEVICE); - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + rpl_map->sge[sge_no].iov_len = sge_bytes; /* * Don't bump head->count here because the same page * may be used by multiple SGE. @@ -187,11 +178,11 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, * SGE, move to the next SGE */ if (ch_bytes == 0) { - ch_sge_ary[ch_no].count = - sge_no - ch_sge_ary[ch_no].start; + chl_map->ch[ch_no].count = + sge_no - chl_map->ch[ch_no].start; ch_no++; ch++; - ch_sge_ary[ch_no].start = sge_no; + chl_map->ch[ch_no].start = sge_no; ch_bytes = ch->rc_target.rs_length; /* If bytes remaining account for next chunk */ if (byte_count) { @@ -220,18 +211,25 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, return sge_no; } -static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt, - struct ib_sge *sge, +static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt, + struct kvec *vec, u64 *sgl_offset, int count) { int i; ctxt->count = count; + ctxt->direction = DMA_FROM_DEVICE; for (i = 0; i < count; i++) { - ctxt->sge[i].addr = sge[i].addr; - ctxt->sge[i].length = sge[i].length; - *sgl_offset = *sgl_offset + sge[i].length; + atomic_inc(&xprt->sc_dma_used); + ctxt->sge[i].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + vec[i].iov_base, vec[i].iov_len, + DMA_FROM_DEVICE); + ctxt->sge[i].length = vec[i].iov_len; + ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey; + *sgl_offset = *sgl_offset + vec[i].iov_len; } } @@ -282,34 +280,29 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, struct ib_send_wr read_wr; int err = 0; int ch_no; - struct ib_sge *sge; int ch_count; int byte_count; int sge_count; u64 sgl_offset; struct rpcrdma_read_chunk *ch; struct svc_rdma_op_ctxt *ctxt = NULL; - struct svc_rdma_op_ctxt *tmp_sge_ctxt; - struct svc_rdma_op_ctxt *tmp_ch_ctxt; - struct chunk_sge *ch_sge_ary; + struct svc_rdma_req_map *rpl_map; + struct svc_rdma_req_map *chl_map; /* If no read list is present, return 0 */ ch = svc_rdma_get_read_chunk(rmsgp); if (!ch) return 0; - /* Allocate temporary contexts to keep SGE */ - BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge)); - tmp_sge_ctxt = svc_rdma_get_context(xprt); - sge = tmp_sge_ctxt->sge; - tmp_ch_ctxt = svc_rdma_get_context(xprt); - ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge; + /* Allocate temporary reply and chunk maps */ + rpl_map = svc_rdma_get_req_map(); + chl_map = svc_rdma_get_req_map(); svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); if (ch_count > RPCSVC_MAXPAGES) return -EINVAL; sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, - sge, ch_sge_ary, + rpl_map, chl_map, ch_count, byte_count); sgl_offset = 0; ch_no = 0; @@ -331,14 +324,15 @@ next_sge: read_wr.wr.rdma.remote_addr = get_unaligned(&(ch->rc_target.rs_offset)) + sgl_offset; - read_wr.sg_list = &sge[ch_sge_ary[ch_no].start]; + read_wr.sg_list = ctxt->sge; read_wr.num_sge = - rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count); - rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start], + rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); + rdma_set_ctxt_sge(xprt, ctxt, + &rpl_map->sge[chl_map->ch[ch_no].start], &sgl_offset, read_wr.num_sge); if (((ch+1)->rc_discrim == 0) && - (read_wr.num_sge == ch_sge_ary[ch_no].count)) { + (read_wr.num_sge == chl_map->ch[ch_no].count)) { /* * Mark the last RDMA_READ with a bit to * indicate all RPC data has been fetched from @@ -358,9 +352,9 @@ next_sge: } atomic_inc(&rdma_stat_read); - if (read_wr.num_sge < ch_sge_ary[ch_no].count) { - ch_sge_ary[ch_no].count -= read_wr.num_sge; - ch_sge_ary[ch_no].start += read_wr.num_sge; + if (read_wr.num_sge < chl_map->ch[ch_no].count) { + chl_map->ch[ch_no].count -= read_wr.num_sge; + chl_map->ch[ch_no].start += read_wr.num_sge; goto next_sge; } sgl_offset = 0; @@ -368,8 +362,8 @@ next_sge: } out: - svc_rdma_put_context(tmp_sge_ctxt, 0); - svc_rdma_put_context(tmp_ch_ctxt, 0); + svc_rdma_put_req_map(rpl_map); + svc_rdma_put_req_map(chl_map); /* Detach arg pages. svc_recv will replenish them */ for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) @@ -399,7 +393,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_pages[page_no] = head->pages[page_no]; } /* Point rq_arg.pages past header */ - rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length]; + rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; rqstp->rq_arg.page_len = head->arg.page_len; rqstp->rq_arg.page_base = head->arg.page_base; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index fb82b1b683f..a19b22b452a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -63,52 +63,44 @@ * SGE[2..sge_count-2] data from xdr->pages[] * SGE[sge_count-1] data from xdr->tail. * + * The max SGE we need is the length of the XDR / pagesize + one for + * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES + * reserves a page for both the request and the reply header, and this + * array is only concerned with the reply we are assured that we have + * on extra page for the RPCRMDA header. */ -static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct ib_sge *sge, - int *sge_count) +static void xdr_to_sge(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { - /* Max we need is the length of the XDR / pagesize + one for - * head + one for tail + one for RPCRDMA header - */ int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; int sge_no; - u32 byte_count = xdr->len; u32 sge_bytes; u32 page_bytes; - int page_off; + u32 page_off; int page_no; + BUG_ON(xdr->len != + (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); + /* Skip the first sge, this is for the RPCRDMA header */ sge_no = 1; /* Head SGE */ - sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device, - xdr->head[0].iov_base, - xdr->head[0].iov_len, - DMA_TO_DEVICE); - sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len); - byte_count -= sge_bytes; - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + vec->sge[sge_no].iov_base = xdr->head[0].iov_base; + vec->sge[sge_no].iov_len = xdr->head[0].iov_len; sge_no++; /* pages SGE */ page_no = 0; page_bytes = xdr->page_len; page_off = xdr->page_base; - while (byte_count && page_bytes) { - sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off)); - sge[sge_no].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - xdr->pages[page_no], page_off, - sge_bytes, DMA_TO_DEVICE); - sge_bytes = min(sge_bytes, page_bytes); - byte_count -= sge_bytes; + while (page_bytes) { + vec->sge[sge_no].iov_base = + page_address(xdr->pages[page_no]) + page_off; + sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); page_bytes -= sge_bytes; - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + vec->sge[sge_no].iov_len = sge_bytes; sge_no++; page_no++; @@ -116,36 +108,24 @@ static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt, } /* Tail SGE */ - if (byte_count && xdr->tail[0].iov_len) { - sge[sge_no].addr = - ib_dma_map_single(xprt->sc_cm_id->device, - xdr->tail[0].iov_base, - xdr->tail[0].iov_len, - DMA_TO_DEVICE); - sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len); - byte_count -= sge_bytes; - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + if (xdr->tail[0].iov_len) { + vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; + vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; sge_no++; } BUG_ON(sge_no > sge_max); - BUG_ON(byte_count != 0); - - *sge_count = sge_no; - return sge; + vec->count = sge_no; } - /* Assumptions: * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, u32 rmr, u64 to, u32 xdr_off, int write_len, - struct ib_sge *xdr_sge, int sge_count) + struct svc_rdma_req_map *vec) { - struct svc_rdma_op_ctxt *tmp_sge_ctxt; struct ib_send_wr write_wr; struct ib_sge *sge; int xdr_sge_no; @@ -154,25 +134,23 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, int sge_off; int bc; struct svc_rdma_op_ctxt *ctxt; - int ret = 0; - BUG_ON(sge_count > RPCSVC_MAXPAGES); + BUG_ON(vec->count > RPCSVC_MAXPAGES); dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " - "write_len=%d, xdr_sge=%p, sge_count=%d\n", + "write_len=%d, vec->sge=%p, vec->count=%lu\n", rmr, (unsigned long long)to, xdr_off, - write_len, xdr_sge, sge_count); + write_len, vec->sge, vec->count); ctxt = svc_rdma_get_context(xprt); - ctxt->count = 0; - tmp_sge_ctxt = svc_rdma_get_context(xprt); - sge = tmp_sge_ctxt->sge; + ctxt->direction = DMA_TO_DEVICE; + sge = ctxt->sge; /* Find the SGE associated with xdr_off */ - for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count; + for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count; xdr_sge_no++) { - if (xdr_sge[xdr_sge_no].length > bc) + if (vec->sge[xdr_sge_no].iov_len > bc) break; - bc -= xdr_sge[xdr_sge_no].length; + bc -= vec->sge[xdr_sge_no].iov_len; } sge_off = bc; @@ -180,21 +158,28 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge_no = 0; /* Copy the remaining SGE */ - while (bc != 0 && xdr_sge_no < sge_count) { - sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off; - sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey; + while (bc != 0 && xdr_sge_no < vec->count) { + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; sge_bytes = min((size_t)bc, - (size_t)(xdr_sge[xdr_sge_no].length-sge_off)); + (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off)); sge[sge_no].length = sge_bytes; - + atomic_inc(&xprt->sc_dma_used); + sge[sge_no].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + (void *) + vec->sge[xdr_sge_no].iov_base + sge_off, + sge_bytes, DMA_TO_DEVICE); + if (dma_mapping_error(sge[sge_no].addr)) + goto err; sge_off = 0; sge_no++; + ctxt->count++; xdr_sge_no++; bc -= sge_bytes; } BUG_ON(bc != 0); - BUG_ON(xdr_sge_no > sge_count); + BUG_ON(xdr_sge_no > vec->count); /* Prepare WRITE WR */ memset(&write_wr, 0, sizeof write_wr); @@ -209,21 +194,20 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, /* Post It */ atomic_inc(&rdma_stat_write); - if (svc_rdma_send(xprt, &write_wr)) { - svc_rdma_put_context(ctxt, 1); - /* Fatal error, close transport */ - ret = -EIO; - } - svc_rdma_put_context(tmp_sge_ctxt, 0); - return ret; + if (svc_rdma_send(xprt, &write_wr)) + goto err; + return 0; + err: + svc_rdma_put_context(ctxt, 0); + /* Fatal error, close transport */ + return -EIO; } static int send_write_chunks(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rdma_argp, struct rpcrdma_msg *rdma_resp, struct svc_rqst *rqstp, - struct ib_sge *sge, - int sge_count) + struct svc_rdma_req_map *vec) { u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; int write_len; @@ -269,8 +253,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, rs_offset + chunk_off, xdr_off, this_write, - sge, - sge_count); + vec); if (ret) { dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", ret); @@ -292,8 +275,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rdma_argp, struct rpcrdma_msg *rdma_resp, struct svc_rqst *rqstp, - struct ib_sge *sge, - int sge_count) + struct svc_rdma_req_map *vec) { u32 xfer_len = rqstp->rq_res.len; int write_len; @@ -341,8 +323,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, rs_offset + chunk_off, xdr_off, this_write, - sge, - sge_count); + vec); if (ret) { dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", ret); @@ -380,7 +361,7 @@ static int send_reply(struct svcxprt_rdma *rdma, struct page *page, struct rpcrdma_msg *rdma_resp, struct svc_rdma_op_ctxt *ctxt, - int sge_count, + struct svc_rdma_req_map *vec, int byte_count) { struct ib_send_wr send_wr; @@ -405,6 +386,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->count = 1; /* Prepare the SGE for the RPCRDMA Header */ + atomic_inc(&rdma->sc_dma_used); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, PAGE_SIZE, DMA_TO_DEVICE); @@ -413,10 +395,16 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; /* Determine how many of our SGE are to be transmitted */ - for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) { - sge_bytes = min((size_t)ctxt->sge[sge_no].length, - (size_t)byte_count); + for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { + sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; + atomic_inc(&rdma->sc_dma_used); + ctxt->sge[sge_no].addr = + ib_dma_map_single(rdma->sc_cm_id->device, + vec->sge[sge_no].iov_base, + sge_bytes, DMA_TO_DEVICE); + ctxt->sge[sge_no].length = sge_bytes; + ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey; } BUG_ON(byte_count != 0); @@ -428,8 +416,10 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; ctxt->count++; rqstp->rq_respages[page_no] = NULL; + /* If there are more pages than SGE, terminate SGE list */ + if (page_no+1 >= sge_no) + ctxt->sge[page_no+1].length = 0; } - BUG_ON(sge_no > rdma->sc_max_sge); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; @@ -473,20 +463,20 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) enum rpcrdma_proc reply_type; int ret; int inline_bytes; - struct ib_sge *sge; - int sge_count = 0; struct page *res_page; struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_req_map *vec; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); /* Get the RDMA request header. */ rdma_argp = xdr_start(&rqstp->rq_arg); - /* Build an SGE for the XDR */ + /* Build an req vec for the XDR */ ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; - sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count); + vec = svc_rdma_get_req_map(); + xdr_to_sge(rdma, &rqstp->rq_res, vec); inline_bytes = rqstp->rq_res.len; @@ -503,7 +493,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /* Send any write-chunk data and build resp write-list */ ret = send_write_chunks(rdma, rdma_argp, rdma_resp, - rqstp, sge, sge_count); + rqstp, vec); if (ret < 0) { printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", ret); @@ -513,7 +503,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /* Send any reply-list data and update resp reply-list */ ret = send_reply_chunks(rdma, rdma_argp, rdma_resp, - rqstp, sge, sge_count); + rqstp, vec); if (ret < 0) { printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", ret); @@ -521,11 +511,13 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) } inline_bytes -= ret; - ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count, + ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, inline_bytes); + svc_rdma_put_req_map(vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; error: + svc_rdma_put_req_map(vec); svc_rdma_put_context(ctxt, 0); put_page(res_page); return ret; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e132509d1db..19ddc382b77 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -84,70 +84,37 @@ struct svc_xprt_class svc_rdma_class = { .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, }; -static int rdma_bump_context_cache(struct svcxprt_rdma *xprt) +/* WR context cache. Created in svc_rdma.c */ +extern struct kmem_cache *svc_rdma_ctxt_cachep; + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { - int target; - int at_least_one = 0; struct svc_rdma_op_ctxt *ctxt; - target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump, - xprt->sc_ctxt_max); - - spin_lock_bh(&xprt->sc_ctxt_lock); - while (xprt->sc_ctxt_cnt < target) { - xprt->sc_ctxt_cnt++; - spin_unlock_bh(&xprt->sc_ctxt_lock); - - ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); - - spin_lock_bh(&xprt->sc_ctxt_lock); - if (ctxt) { - at_least_one = 1; - INIT_LIST_HEAD(&ctxt->free_list); - list_add(&ctxt->free_list, &xprt->sc_ctxt_free); - } else { - /* kmalloc failed...give up for now */ - xprt->sc_ctxt_cnt--; + while (1) { + ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL); + if (ctxt) break; - } + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); } - spin_unlock_bh(&xprt->sc_ctxt_lock); - dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n", - xprt->sc_ctxt_max, xprt->sc_ctxt_cnt); - return at_least_one; + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->dto_q); + ctxt->count = 0; + atomic_inc(&xprt->sc_ctxt_used); + return ctxt; } -struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) { - struct svc_rdma_op_ctxt *ctxt; - - while (1) { - spin_lock_bh(&xprt->sc_ctxt_lock); - if (unlikely(list_empty(&xprt->sc_ctxt_free))) { - /* Try to bump my cache. */ - spin_unlock_bh(&xprt->sc_ctxt_lock); - - if (rdma_bump_context_cache(xprt)) - continue; - - printk(KERN_INFO "svcrdma: sleeping waiting for " - "context memory on xprt=%p\n", - xprt); - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - continue; - } - ctxt = list_entry(xprt->sc_ctxt_free.next, - struct svc_rdma_op_ctxt, - free_list); - list_del_init(&ctxt->free_list); - spin_unlock_bh(&xprt->sc_ctxt_lock); - ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->dto_q); - ctxt->count = 0; - atomic_inc(&xprt->sc_ctxt_used); - break; + struct svcxprt_rdma *xprt = ctxt->xprt; + int i; + for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_single(xprt->sc_cm_id->device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); } - return ctxt; } void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) @@ -161,18 +128,36 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - for (i = 0; i < ctxt->count; i++) - ib_dma_unmap_single(xprt->sc_cm_id->device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); - - spin_lock_bh(&xprt->sc_ctxt_lock); - list_add(&ctxt->free_list, &xprt->sc_ctxt_free); - spin_unlock_bh(&xprt->sc_ctxt_lock); + kmem_cache_free(svc_rdma_ctxt_cachep, ctxt); atomic_dec(&xprt->sc_ctxt_used); } +/* Temporary NFS request map cache. Created in svc_rdma.c */ +extern struct kmem_cache *svc_rdma_map_cachep; + +/* + * Temporary NFS req mappings are shared across all transport + * instances. These are short lived and should be bounded by the number + * of concurrent server threads * depth of the SQ. + */ +struct svc_rdma_req_map *svc_rdma_get_req_map(void) +{ + struct svc_rdma_req_map *map; + while (1) { + map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL); + if (map) + break; + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + } + map->count = 0; + return map; +} + +void svc_rdma_put_req_map(struct svc_rdma_req_map *map) +{ + kmem_cache_free(svc_rdma_map_cachep, map); +} + /* ib_cq event handler */ static void cq_event_handler(struct ib_event *event, void *context) { @@ -302,6 +287,7 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; ctxt->wc_status = wc.status; ctxt->byte_len = wc.byte_len; + svc_rdma_unmap_dma(ctxt); if (wc.status != IB_WC_SUCCESS) { /* Close the transport */ dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt); @@ -351,6 +337,7 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; xprt = ctxt->xprt; + svc_rdma_unmap_dma(ctxt); if (wc.status != IB_WC_SUCCESS) /* Close the transport */ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); @@ -361,10 +348,13 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) switch (ctxt->wr_op) { case IB_WR_SEND: - case IB_WR_RDMA_WRITE: svc_rdma_put_context(ctxt, 1); break; + case IB_WR_RDMA_WRITE: + svc_rdma_put_context(ctxt, 0); + break; + case IB_WR_RDMA_READ: if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; @@ -423,40 +413,6 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context) tasklet_schedule(&dto_tasklet); } -static void create_context_cache(struct svcxprt_rdma *xprt, - int ctxt_count, int ctxt_bump, int ctxt_max) -{ - struct svc_rdma_op_ctxt *ctxt; - int i; - - xprt->sc_ctxt_max = ctxt_max; - xprt->sc_ctxt_bump = ctxt_bump; - xprt->sc_ctxt_cnt = 0; - atomic_set(&xprt->sc_ctxt_used, 0); - - INIT_LIST_HEAD(&xprt->sc_ctxt_free); - for (i = 0; i < ctxt_count; i++) { - ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); - if (ctxt) { - INIT_LIST_HEAD(&ctxt->free_list); - list_add(&ctxt->free_list, &xprt->sc_ctxt_free); - xprt->sc_ctxt_cnt++; - } - } -} - -static void destroy_context_cache(struct svcxprt_rdma *xprt) -{ - while (!list_empty(&xprt->sc_ctxt_free)) { - struct svc_rdma_op_ctxt *ctxt; - ctxt = list_entry(xprt->sc_ctxt_free.next, - struct svc_rdma_op_ctxt, - free_list); - list_del_init(&ctxt->free_list); - kfree(ctxt); - } -} - static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, int listener) { @@ -473,7 +429,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_read_complete_lock); - spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); cma_xprt->sc_ord = svcrdma_ord; @@ -482,21 +437,9 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, cma_xprt->sc_max_requests = svcrdma_max_requests; cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; atomic_set(&cma_xprt->sc_sq_count, 0); + atomic_set(&cma_xprt->sc_ctxt_used, 0); - if (!listener) { - int reqs = cma_xprt->sc_max_requests; - create_context_cache(cma_xprt, - reqs << 1, /* starting size */ - reqs, /* bump amount */ - reqs + - cma_xprt->sc_sq_depth + - RPCRDMA_MAX_THREADS + 1); /* max */ - if (list_empty(&cma_xprt->sc_ctxt_free)) { - kfree(cma_xprt); - return NULL; - } - clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); - } else + if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); return cma_xprt; @@ -532,6 +475,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) BUG_ON(sge_no >= xprt->sc_max_sge); page = svc_rdma_get_page(); ctxt->pages[sge_no] = page; + atomic_inc(&xprt->sc_dma_used); pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); @@ -566,7 +510,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) * will call the recvfrom method on the listen xprt which will accept the new * connection. */ -static void handle_connect_req(struct rdma_cm_id *new_cma_id) +static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) { struct svcxprt_rdma *listen_xprt = new_cma_id->context; struct svcxprt_rdma *newxprt; @@ -583,6 +527,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id) dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", newxprt, newxprt->sc_cm_id, listen_xprt); + /* Save client advertised inbound read limit for use later in accept. */ + newxprt->sc_ord = client_ird; + /* Set the local and remote addresses in the transport */ sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); @@ -619,7 +566,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, case RDMA_CM_EVENT_CONNECT_REQUEST: dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " "event=%d\n", cma_id, cma_id->context, event->event); - handle_connect_req(cma_id); + handle_connect_req(cma_id, + event->param.conn.responder_resources); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -793,8 +741,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) (size_t)svcrdma_max_requests); newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; - newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom, - (size_t)svcrdma_ord); + /* + * Limit ORD based on client limit, local device limit, and + * configured svcrdma limit. + */ + newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord); + newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); if (IS_ERR(newxprt->sc_pd)) { @@ -987,7 +939,6 @@ static void __svc_rdma_free(struct work_struct *work) * cm_id because the device ptr is needed to unmap the dma in * svc_rdma_put_context. */ - spin_lock_bh(&rdma->sc_read_complete_lock); while (!list_empty(&rdma->sc_read_complete_q)) { struct svc_rdma_op_ctxt *ctxt; ctxt = list_entry(rdma->sc_read_complete_q.next, @@ -996,10 +947,8 @@ static void __svc_rdma_free(struct work_struct *work) list_del_init(&ctxt->dto_q); svc_rdma_put_context(ctxt, 1); } - spin_unlock_bh(&rdma->sc_read_complete_lock); /* Destroy queued, but not processed recv completions */ - spin_lock_bh(&rdma->sc_rq_dto_lock); while (!list_empty(&rdma->sc_rq_dto_q)) { struct svc_rdma_op_ctxt *ctxt; ctxt = list_entry(rdma->sc_rq_dto_q.next, @@ -1008,10 +957,10 @@ static void __svc_rdma_free(struct work_struct *work) list_del_init(&ctxt->dto_q); svc_rdma_put_context(ctxt, 1); } - spin_unlock_bh(&rdma->sc_rq_dto_lock); /* Warn if we leaked a resource or under-referenced */ WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); + WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -1032,7 +981,6 @@ static void __svc_rdma_free(struct work_struct *work) /* Destroy the CM ID */ rdma_destroy_id(rdma->sc_cm_id); - destroy_context_cache(rdma); kfree(rdma); } @@ -1132,6 +1080,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); /* Prepare SGE for local address */ + atomic_inc(&xprt->sc_dma_used); sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, p, 0, PAGE_SIZE, DMA_FROM_DEVICE); sge.lkey = xprt->sc_phys_mr->lkey; |