diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-10-28 16:26:12 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-28 16:26:12 +0100 |
commit | 7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch) | |
tree | e730a4565e0318140d2fbd2f0415d18a339d7336 /net/sunrpc | |
parent | 41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff) | |
parent | 0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff) |
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'net/sunrpc')
-rw-r--r-- | net/sunrpc/auth.c | 2 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/Makefile | 4 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/auth_gss.c | 2 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/gss_krb5_crypto.c | 10 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/gss_krb5_seal.c | 26 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/gss_krb5_unseal.c | 16 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/gss_krb5_wrap.c | 72 | ||||
-rw-r--r-- | net/sunrpc/clnt.c | 6 | ||||
-rw-r--r-- | net/sunrpc/rpc_pipe.c | 2 | ||||
-rw-r--r-- | net/sunrpc/rpcb_clnt.c | 121 | ||||
-rw-r--r-- | net/sunrpc/svc.c | 361 | ||||
-rw-r--r-- | net/sunrpc/svc_xprt.c | 39 | ||||
-rw-r--r-- | net/sunrpc/svcsock.c | 17 | ||||
-rw-r--r-- | net/sunrpc/sysctl.c | 18 | ||||
-rw-r--r-- | net/sunrpc/xprt.c | 12 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 33 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma.c | 35 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 255 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_sendto.c | 370 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_transport.c | 530 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c | 41 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 741 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h | 17 | ||||
-rw-r--r-- | net/sunrpc/xprtsock.c | 4 |
24 files changed, 1897 insertions, 837 deletions
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 6bfea9ed686..436bf1b4b76 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -83,10 +83,8 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt) if (flavor >= RPC_AUTH_MAXFLAVOR) goto out; -#ifdef CONFIG_KMOD if ((ops = auth_flavors[flavor]) == NULL) request_module("rpc-auth-%u", flavor); -#endif spin_lock(&rpc_authflavor_lock); ops = auth_flavors[flavor]; if (ops == NULL || !try_module_get(ops->owner)) { diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index f3431a7e33d..4de8bcf26fa 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -5,12 +5,12 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o auth_rpcgss-objs := auth_gss.o gss_generic_token.o \ - gss_mech_switch.o svcauth_gss.o gss_krb5_crypto.o + gss_mech_switch.o svcauth_gss.o obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ - gss_krb5_seqnum.o gss_krb5_wrap.o + gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 834a83199bd..853a4142cea 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -33,8 +33,6 @@ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $Id$ */ diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 1d52308ca32..c93fca20455 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -83,8 +83,6 @@ out: return ret; } -EXPORT_SYMBOL(krb5_encrypt); - u32 krb5_decrypt( struct crypto_blkcipher *tfm, @@ -118,8 +116,6 @@ out: return ret; } -EXPORT_SYMBOL(krb5_decrypt); - static int checksummer(struct scatterlist *sg, void *data) { @@ -161,8 +157,6 @@ out: return err ? GSS_S_FAILURE : 0; } -EXPORT_SYMBOL(make_checksum); - struct encryptor_desc { u8 iv[8]; /* XXX hard-coded blocksize */ struct blkcipher_desc desc; @@ -262,8 +256,6 @@ gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf, return ret; } -EXPORT_SYMBOL(gss_encrypt_xdr_buf); - struct decryptor_desc { u8 iv[8]; /* XXX hard-coded blocksize */ struct blkcipher_desc desc; @@ -334,5 +326,3 @@ gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf, return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); } - -EXPORT_SYMBOL(gss_decrypt_xdr_buf); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 5f1d36dfbcf..b8f42ef7178 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -78,7 +78,7 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; char cksumdata[16]; struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; - unsigned char *ptr, *krb5_hdr, *msg_start; + unsigned char *ptr, *msg_start; s32 now; u32 seq_send; @@ -87,36 +87,36 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, now = get_seconds(); - token->len = g_token_size(&ctx->mech_used, 24); + token->len = g_token_size(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8); ptr = token->data; - g_make_token_header(&ctx->mech_used, 24, &ptr); + g_make_token_header(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8, &ptr); - *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff); - *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff); + /* ptr now at header described in rfc 1964, section 1.2.1: */ + ptr[0] = (unsigned char) ((KG_TOK_MIC_MSG >> 8) & 0xff); + ptr[1] = (unsigned char) (KG_TOK_MIC_MSG & 0xff); - /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ - krb5_hdr = ptr - 2; - msg_start = krb5_hdr + 24; + msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8; - *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5); - memset(krb5_hdr + 4, 0xff, 4); + *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); + memset(ptr + 4, 0xff, 4); - if (make_checksum("md5", krb5_hdr, 8, text, 0, &md5cksum)) + if (make_checksum("md5", ptr, 8, text, 0, &md5cksum)) return GSS_S_FAILURE; if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, md5cksum.len)) return GSS_S_FAILURE; - memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8); + memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); spin_lock(&krb5_seq_lock); seq_send = ctx->seq_send++; spin_unlock(&krb5_seq_lock); if (krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, - seq_send, krb5_hdr + 16, krb5_hdr + 8)) + seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, + ptr + 8)) return GSS_S_FAILURE; return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index d91a5d00480..066ec73c84d 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -92,30 +92,30 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, read_token->len)) return GSS_S_DEFECTIVE_TOKEN; - if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) || - (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) ) + if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) || + (ptr[1] != (KG_TOK_MIC_MSG & 0xff))) return GSS_S_DEFECTIVE_TOKEN; /* XXX sanity-check bodysize?? */ - signalg = ptr[0] + (ptr[1] << 8); + signalg = ptr[2] + (ptr[3] << 8); if (signalg != SGN_ALG_DES_MAC_MD5) return GSS_S_DEFECTIVE_TOKEN; - sealalg = ptr[2] + (ptr[3] << 8); + sealalg = ptr[4] + (ptr[5] << 8); if (sealalg != SEAL_ALG_NONE) return GSS_S_DEFECTIVE_TOKEN; - if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) return GSS_S_DEFECTIVE_TOKEN; - if (make_checksum("md5", ptr - 2, 8, message_buffer, 0, &md5cksum)) + if (make_checksum("md5", ptr, 8, message_buffer, 0, &md5cksum)) return GSS_S_FAILURE; if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, 16)) return GSS_S_FAILURE; - if (memcmp(md5cksum.data + 8, ptr + 14, 8)) + if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) return GSS_S_BAD_SIG; /* it got through unscathed. Make sure the context is unexpired */ @@ -127,7 +127,7 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, /* do sequencing checks */ - if (krb5_get_seq_num(ctx->seq, ptr + 14, ptr + 6, &direction, &seqnum)) + if (krb5_get_seq_num(ctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, &direction, &seqnum)) return GSS_S_FAILURE; if ((ctx->initiate && direction != 0xff) || diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index b00b1b42630..ae8e69b59c4 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -87,8 +87,8 @@ out: return 0; } -static inline void -make_confounder(char *p, int blocksize) +static void +make_confounder(char *p, u32 conflen) { static u64 i = 0; u64 *q = (u64 *)p; @@ -102,8 +102,22 @@ make_confounder(char *p, int blocksize) * uniqueness would mean worrying about atomicity and rollover, and I * don't care enough. */ - BUG_ON(blocksize != 8); - *q = i++; + /* initialize to random value */ + if (i == 0) { + i = random32(); + i = (i << 32) | random32(); + } + + switch (conflen) { + case 16: + *q++ = i++; + /* fall through */ + case 8: + *q++ = i++; + break; + default: + BUG(); + } } /* Assumptions: the head and tail of inbuf are ours to play with. @@ -122,7 +136,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, char cksumdata[16]; struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; int blocksize = 0, plainlen; - unsigned char *ptr, *krb5_hdr, *msg_start; + unsigned char *ptr, *msg_start; s32 now; int headlen; struct page **tmp_pages; @@ -149,26 +163,26 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, buf->len += headlen; BUG_ON((buf->len - offset - headlen) % blocksize); - g_make_token_header(&kctx->mech_used, 24 + plainlen, &ptr); + g_make_token_header(&kctx->mech_used, + GSS_KRB5_TOK_HDR_LEN + 8 + plainlen, &ptr); - *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff); - *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff); + /* ptr now at header described in rfc 1964, section 1.2.1: */ + ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff); + ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff); - /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ - krb5_hdr = ptr - 2; - msg_start = krb5_hdr + 24; + msg_start = ptr + 24; - *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5); - memset(krb5_hdr + 4, 0xff, 4); - *(__be16 *)(krb5_hdr + 4) = htons(SEAL_ALG_DES); + *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); + memset(ptr + 4, 0xff, 4); + *(__be16 *)(ptr + 4) = htons(SEAL_ALG_DES); make_confounder(msg_start, blocksize); /* XXXJBF: UGH!: */ tmp_pages = buf->pages; buf->pages = pages; - if (make_checksum("md5", krb5_hdr, 8, buf, + if (make_checksum("md5", ptr, 8, buf, offset + headlen - blocksize, &md5cksum)) return GSS_S_FAILURE; buf->pages = tmp_pages; @@ -176,7 +190,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, md5cksum.data, md5cksum.len)) return GSS_S_FAILURE; - memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8); + memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); spin_lock(&krb5_seq_lock); seq_send = kctx->seq_send++; @@ -185,7 +199,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, /* XXX would probably be more efficient to compute checksum * and encrypt at the same time: */ if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, - seq_send, krb5_hdr + 16, krb5_hdr + 8))) + seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))) return GSS_S_FAILURE; if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, @@ -219,38 +233,38 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) buf->len - offset)) return GSS_S_DEFECTIVE_TOKEN; - if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) || - (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) ) + if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) || + (ptr[1] != (KG_TOK_WRAP_MSG & 0xff))) return GSS_S_DEFECTIVE_TOKEN; /* XXX sanity-check bodysize?? */ /* get the sign and seal algorithms */ - signalg = ptr[0] + (ptr[1] << 8); + signalg = ptr[2] + (ptr[3] << 8); if (signalg != SGN_ALG_DES_MAC_MD5) return GSS_S_DEFECTIVE_TOKEN; - sealalg = ptr[2] + (ptr[3] << 8); + sealalg = ptr[4] + (ptr[5] << 8); if (sealalg != SEAL_ALG_DES) return GSS_S_DEFECTIVE_TOKEN; - if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) return GSS_S_DEFECTIVE_TOKEN; if (gss_decrypt_xdr_buf(kctx->enc, buf, - ptr + 22 - (unsigned char *)buf->head[0].iov_base)) + ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base)) return GSS_S_DEFECTIVE_TOKEN; - if (make_checksum("md5", ptr - 2, 8, buf, - ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum)) + if (make_checksum("md5", ptr, 8, buf, + ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base, &md5cksum)) return GSS_S_FAILURE; if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, md5cksum.data, md5cksum.len)) return GSS_S_FAILURE; - if (memcmp(md5cksum.data + 8, ptr + 14, 8)) + if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) return GSS_S_BAD_SIG; /* it got through unscathed. Make sure the context is unexpired */ @@ -262,8 +276,8 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) /* do sequencing checks */ - if (krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction, - &seqnum)) + if (krb5_get_seq_num(kctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, + &direction, &seqnum)) return GSS_S_BAD_SIG; if ((kctx->initiate && direction != 0xff) || @@ -274,7 +288,7 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) * better to copy and encrypt at the same time. */ blocksize = crypto_blkcipher_blocksize(kctx->enc); - data_start = ptr + 22 + blocksize; + data_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8 + blocksize; orig_start = buf->head[0].iov_base + offset; data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; memmove(orig_start, data_start, data_len); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 76739e928d0..4895c341e46 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -174,7 +174,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru clnt->cl_procinfo = version->procs; clnt->cl_maxproc = version->nrprocs; clnt->cl_protname = program->name; - clnt->cl_prog = program->number; + clnt->cl_prog = args->prognumber ? : program->number; clnt->cl_vers = version->number; clnt->cl_stats = program->stats; clnt->cl_metrics = rpc_alloc_iostats(clnt); @@ -213,10 +213,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru } /* save the nodename */ - clnt->cl_nodelen = strlen(utsname()->nodename); + clnt->cl_nodelen = strlen(init_utsname()->nodename); if (clnt->cl_nodelen > UNX_MAXNODENAME) clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen); + memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen); rpc_register_client(clnt); return clnt; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 5a9b0e7828c..23a2b8f6dc4 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -897,7 +897,7 @@ static struct file_system_type rpc_pipe_fs_type = { }; static void -init_once(struct kmem_cache * cachep, void *foo) +init_once(void *foo) { struct rpc_inode *rpci = (struct rpc_inode *) foo; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 24db2b4d12d..41013dd66ac 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -20,6 +20,7 @@ #include <linux/in6.h> #include <linux/kernel.h> #include <linux/errno.h> +#include <net/ipv6.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> @@ -176,13 +177,12 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, } static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, - u32 version, struct rpc_message *msg, - int *result) + u32 version, struct rpc_message *msg) { struct rpc_clnt *rpcb_clnt; - int error = 0; + int result, error = 0; - *result = 0; + msg->rpc_resp = &result; rpcb_clnt = rpcb_create_local(addr, addrlen, version); if (!IS_ERR(rpcb_clnt)) { @@ -191,12 +191,15 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, } else error = PTR_ERR(rpcb_clnt); - if (error < 0) + if (error < 0) { printk(KERN_WARNING "RPC: failed to contact local rpcbind " "server (errno %d).\n", -error); - dprintk("RPC: registration status %d/%d\n", error, *result); + return error; + } - return error; + if (!result) + return -EACCES; + return 0; } /** @@ -205,7 +208,11 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, * @vers: RPC version number to bind * @prot: transport protocol to register * @port: port value to register - * @okay: OUT: result code + * + * Returns zero if the registration request was dispatched successfully + * and the rpcbind daemon returned success. Otherwise, returns an errno + * value that reflects the nature of the error (request could not be + * dispatched, timed out, or rpcbind returned an error). * * RPC services invoke this function to advertise their contact * information via the system's rpcbind daemon. RPC services @@ -217,15 +224,6 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, * all registered transports for [program, version] from the local * rpcbind database. * - * Returns zero if the registration request was dispatched - * successfully and a reply was received. The rpcbind daemon's - * boolean result code is stored in *okay. - * - * Returns an errno value and sets *result to zero if there was - * some problem that prevented the rpcbind request from being - * dispatched, or if the rpcbind daemon did not respond within - * the timeout. - * * This function uses rpcbind protocol version 2 to contact the * local rpcbind daemon. * @@ -236,7 +234,7 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, * IN6ADDR_ANY (ie available for all AF_INET and AF_INET6 * addresses). */ -int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) +int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) { struct rpcbind_args map = { .r_prog = prog, @@ -246,7 +244,6 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) }; struct rpc_message msg = { .rpc_argp = &map, - .rpc_resp = okay, }; dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " @@ -259,7 +256,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, sizeof(rpcb_inaddr_loopback), - RPCBVERS_2, &msg, okay); + RPCBVERS_2, &msg); } /* @@ -290,7 +287,7 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register, return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, sizeof(rpcb_inaddr_loopback), - RPCBVERS_4, msg, msg->rpc_resp); + RPCBVERS_4, msg); } /* @@ -304,10 +301,13 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, char buf[64]; /* Construct AF_INET6 universal address */ - snprintf(buf, sizeof(buf), - NIP6_FMT".%u.%u", - NIP6(address_to_register->sin6_addr), - port >> 8, port & 0xff); + if (ipv6_addr_any(&address_to_register->sin6_addr)) + snprintf(buf, sizeof(buf), "::.%u.%u", + port >> 8, port & 0xff); + else + snprintf(buf, sizeof(buf), NIP6_FMT".%u.%u", + NIP6(address_to_register->sin6_addr), + port >> 8, port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -321,7 +321,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback, sizeof(rpcb_in6addr_loopback), - RPCBVERS_4, msg, msg->rpc_resp); + RPCBVERS_4, msg); } /** @@ -330,7 +330,11 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, * @version: RPC version number of service to (un)register * @address: address family, IP address, and port to (un)register * @netid: netid of transport protocol to (un)register - * @result: result code from rpcbind RPC call + * + * Returns zero if the registration request was dispatched successfully + * and the rpcbind daemon returned success. Otherwise, returns an errno + * value that reflects the nature of the error (request could not be + * dispatched, timed out, or rpcbind returned an error). * * RPC services invoke this function to advertise their contact * information via the system's rpcbind daemon. RPC services @@ -342,15 +346,6 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, * to zero. Callers pass a netid of "" to unregister all * transport netids associated with [program, version, address]. * - * Returns zero if the registration request was dispatched - * successfully and a reply was received. The rpcbind daemon's - * result code is stored in *result. - * - * Returns an errno value and sets *result to zero if there was - * some problem that prevented the rpcbind request from being - * dispatched, or if the rpcbind daemon did not respond within - * the timeout. - * * This function uses rpcbind protocol version 4 to contact the * local rpcbind daemon. The local rpcbind daemon must support * version 4 of the rpcbind protocol in order for these functions @@ -372,8 +367,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, * advertises the service on all IPv4 and IPv6 addresses. */ int rpcb_v4_register(const u32 program, const u32 version, - const struct sockaddr *address, const char *netid, - int *result) + const struct sockaddr *address, const char *netid) { struct rpcbind_args map = { .r_prog = program, @@ -383,11 +377,8 @@ int rpcb_v4_register(const u32 program, const u32 version, }; struct rpc_message msg = { .rpc_argp = &map, - .rpc_resp = result, }; - *result = 0; - switch (address->sa_family) { case AF_INET: return rpcb_register_netid4((struct sockaddr_in *)address, @@ -469,6 +460,28 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi return rpc_run_task(&task_setup_data); } +/* + * In the case where rpc clients have been cloned, we want to make + * sure that we use the program number/version etc of the actual + * owner of the xprt. To do so, we walk back up the tree of parents + * to find whoever created the transport and/or whoever has the + * autobind flag set. + */ +static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt) +{ + struct rpc_clnt *parent = clnt->cl_parent; + + while (parent != clnt) { + if (parent->cl_xprt != clnt->cl_xprt) + break; + if (clnt->cl_autobind) + break; + clnt = parent; + parent = parent->cl_parent; + } + return clnt; +} + /** * rpcb_getport_async - obtain the port for a given RPC service on a given host * @task: task that is waiting for portmapper request @@ -478,10 +491,10 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi */ void rpcb_getport_async(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; + struct rpc_clnt *clnt; struct rpc_procinfo *proc; u32 bind_version; - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt; struct rpc_clnt *rpcb_clnt; static struct rpcbind_args *map; struct rpc_task *child; @@ -490,13 +503,13 @@ void rpcb_getport_async(struct rpc_task *task) size_t salen; int status; + clnt = rpcb_find_transport_owner(task->tk_client); + xprt = clnt->cl_xprt; + dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", task->tk_pid, __func__, clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot); - /* Autobind on cloned rpc clients is discouraged */ - BUG_ON(clnt->cl_parent != clnt); - /* Put self on the wait queue to ensure we get notified if * some other task is already attempting to bind the port */ rpc_sleep_on(&xprt->binding, task, NULL); @@ -558,7 +571,7 @@ void rpcb_getport_async(struct rpc_task *task) status = -ENOMEM; dprintk("RPC: %5u %s: no memory available\n", task->tk_pid, __func__); - goto bailout_nofree; + goto bailout_release_client; } map->r_prog = clnt->cl_prog; map->r_vers = clnt->cl_vers; @@ -578,11 +591,13 @@ void rpcb_getport_async(struct rpc_task *task) task->tk_pid, __func__); return; } - rpc_put_task(child); - task->tk_xprt->stat.bind_count++; + xprt->stat.bind_count++; + rpc_put_task(child); return; +bailout_release_client: + rpc_release_client(rpcb_clnt); bailout_nofree: rpcb_wake_rpcbind_waiters(xprt, status); task->tk_status = status; @@ -633,7 +648,7 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p, struct rpcbind_args *rpcb) { - dprintk("RPC: rpcb_encode_mapping(%u, %u, %d, %u)\n", + dprintk("RPC: encoding rpcb request (%u, %u, %d, %u)\n", rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port); *p++ = htonl(rpcb->r_prog); *p++ = htonl(rpcb->r_vers); @@ -648,7 +663,7 @@ static int rpcb_decode_getport(struct rpc_rqst *req, __be32 *p, unsigned short *portp) { *portp = (unsigned short) ntohl(*p++); - dprintk("RPC: rpcb_decode_getport result %u\n", + dprintk("RPC: rpcb getport result: %u\n", *portp); return 0; } @@ -657,7 +672,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p, unsigned int *boolp) { *boolp = (unsigned int) ntohl(*p++); - dprintk("RPC: rpcb_decode_set: call %s\n", + dprintk("RPC: rpcb set/unset call %s\n", (*boolp ? "succeeded" : "failed")); return 0; } @@ -665,7 +680,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p, static int rpcb_encode_getaddr(struct rpc_rqst *req, __be32 *p, struct rpcbind_args *rpcb) { - dprintk("RPC: rpcb_encode_getaddr(%u, %u, %s)\n", + dprintk("RPC: encoding rpcb request (%u, %u, %s)\n", rpcb->r_prog, rpcb->r_vers, rpcb->r_addr); *p++ = htonl(rpcb->r_prog); *p++ = htonl(rpcb->r_vers); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 01c7e311b90..54c98d87684 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -18,6 +18,7 @@ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/module.h> +#include <linux/kthread.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/xdr.h> @@ -27,6 +28,8 @@ #define RPCDBG_FACILITY RPCDBG_SVCDSP +static void svc_unregister(const struct svc_serv *serv); + #define svc_serv_is_pooled(serv) ((serv)->sv_function) /* @@ -291,15 +294,14 @@ svc_pool_map_put(void) /* - * Set the current thread's cpus_allowed mask so that it + * Set the given thread's cpus_allowed mask so that it * will only run on cpus in the given pool. - * - * Returns 1 and fills in oldmask iff a cpumask was applied. */ -static inline int -svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) +static inline void +svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) { struct svc_pool_map *m = &svc_pool_map; + unsigned int node = m->pool_to[pidx]; /* * The caller checks for sv_nrpools > 1, which @@ -307,26 +309,17 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) */ BUG_ON(m->count == 0); - switch (m->mode) - { - default: - return 0; + switch (m->mode) { case SVC_POOL_PERCPU: { - unsigned int cpu = m->pool_to[pidx]; - - *oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - return 1; + set_cpus_allowed_ptr(task, &cpumask_of_cpu(node)); + break; } case SVC_POOL_PERNODE: { - unsigned int node = m->pool_to[pidx]; node_to_cpumask_ptr(nodecpumask, node); - - *oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, nodecpumask); - return 1; + set_cpus_allowed_ptr(task, nodecpumask); + break; } } } @@ -366,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - void (*shutdown)(struct svc_serv *serv)) + sa_family_t family, void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; unsigned int vers; @@ -375,6 +368,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; + serv->sv_family = family; serv->sv_name = prog->pg_name; serv->sv_program = prog; serv->sv_nrthreads = 1; @@ -425,34 +419,32 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, spin_lock_init(&pool->sp_lock); } - /* Remove any stale portmap registrations */ - svc_register(serv, 0, 0); + svc_unregister(serv); return serv; } struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv)) + sa_family_t family, void (*shutdown)(struct svc_serv *serv)) { - return __svc_create(prog, bufsize, /*npools*/1, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, family, shutdown); } EXPORT_SYMBOL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv), - svc_thread_fn func, int sig, struct module *mod) + sa_family_t family, void (*shutdown)(struct svc_serv *serv), + svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, shutdown); + serv = __svc_create(prog, bufsize, npools, family, shutdown); if (serv != NULL) { serv->sv_function = func; - serv->sv_kill_signal = sig; serv->sv_module = mod; } @@ -461,7 +453,8 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, EXPORT_SYMBOL(svc_create_pooled); /* - * Destroy an RPC service. Should be called with the BKL held + * Destroy an RPC service. Should be called with appropriate locking to + * protect the sv_nrthreads, sv_permsocks and sv_tempsocks. */ void svc_destroy(struct svc_serv *serv) @@ -495,8 +488,7 @@ svc_destroy(struct svc_serv *serv) if (svc_serv_is_pooled(serv)) svc_pool_map_put(); - /* Unregister service with the portmapper */ - svc_register(serv, 0, 0); + svc_unregister(serv); kfree(serv->sv_pools); kfree(serv); } @@ -578,46 +570,6 @@ out_enomem: EXPORT_SYMBOL(svc_prepare_thread); /* - * Create a thread in the given pool. Caller must hold BKL. - * On a NUMA or SMP machine, with a multi-pool serv, the thread - * will be restricted to run on the cpus belonging to the pool. - */ -static int -__svc_create_thread(svc_thread_fn func, struct svc_serv *serv, - struct svc_pool *pool) -{ - struct svc_rqst *rqstp; - int error = -ENOMEM; - int have_oldmask = 0; - cpumask_t uninitialized_var(oldmask); - - rqstp = svc_prepare_thread(serv, pool); - if (IS_ERR(rqstp)) { - error = PTR_ERR(rqstp); - goto out; - } - - if (serv->sv_nrpools > 1) - have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); - - error = kernel_thread((int (*)(void *)) func, rqstp, 0); - - if (have_oldmask) - set_cpus_allowed(current, oldmask); - - if (error < 0) - goto out_thread; - svc_sock_update_bufs(serv); - error = 0; -out: - return error; - -out_thread: - svc_exit_thread(rqstp); - goto out; -} - -/* * Choose a pool in which to create a new thread, for svc_set_num_threads */ static inline struct svc_pool * @@ -674,7 +626,7 @@ found_pool: * of threads the given number. If `pool' is non-NULL, applies * only to threads in that pool, otherwise round-robins between * all pools. Must be called with a svc_get() reference and - * the BKL held. + * the BKL or another lock to protect access to svc_serv fields. * * Destroying threads relies on the service threads filling in * rqstp->rq_task, which only the nfs ones do. Assumes the serv @@ -686,7 +638,9 @@ found_pool: int svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { - struct task_struct *victim; + struct svc_rqst *rqstp; + struct task_struct *task; + struct svc_pool *chosen_pool; int error = 0; unsigned int state = serv->sv_nrthreads-1; @@ -702,18 +656,34 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) /* create new threads */ while (nrservs > 0) { nrservs--; + chosen_pool = choose_pool(serv, pool, &state); + + rqstp = svc_prepare_thread(serv, chosen_pool); + if (IS_ERR(rqstp)) { + error = PTR_ERR(rqstp); + break; + } + __module_get(serv->sv_module); - error = __svc_create_thread(serv->sv_function, serv, - choose_pool(serv, pool, &state)); - if (error < 0) { + task = kthread_create(serv->sv_function, rqstp, serv->sv_name); + if (IS_ERR(task)) { + error = PTR_ERR(task); module_put(serv->sv_module); + svc_exit_thread(rqstp); break; } + + rqstp->rq_task = task; + if (serv->sv_nrpools > 1) + svc_pool_map_set_cpumask(task, chosen_pool->sp_id); + + svc_sock_update_bufs(serv); + wake_up_process(task); } /* destroy old threads */ while (nrservs < 0 && - (victim = choose_victim(serv, pool, &state)) != NULL) { - send_sig(serv->sv_kill_signal, victim, 1); + (task = choose_victim(serv, pool, &state)) != NULL) { + send_sig(SIGINT, task, 1); nrservs++; } @@ -722,7 +692,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) EXPORT_SYMBOL(svc_set_num_threads); /* - * Called from a server thread as it's exiting. Caller must hold BKL. + * Called from a server thread as it's exiting. Caller must hold the BKL or + * the "service mutex", whichever is appropriate for the service. */ void svc_exit_thread(struct svc_rqst *rqstp) @@ -748,55 +719,245 @@ svc_exit_thread(struct svc_rqst *rqstp) } EXPORT_SYMBOL(svc_exit_thread); +#ifdef CONFIG_SUNRPC_REGISTER_V4 + /* - * Register an RPC service with the local portmapper. - * To unregister a service, call this routine with - * proto and port == 0. + * Register an "inet" protocol family netid with the local + * rpcbind daemon via an rpcbind v4 SET request. + * + * No netconfig infrastructure is available in the kernel, so + * we map IP_ protocol numbers to netids by hand. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. */ -int -svc_register(struct svc_serv *serv, int proto, unsigned short port) +static int __svc_rpcb_register4(const u32 program, const u32 version, + const unsigned short protocol, + const unsigned short port) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + .sin_port = htons(port), + }; + char *netid; + + switch (protocol) { + case IPPROTO_UDP: + netid = RPCBIND_NETID_UDP; + break; + case IPPROTO_TCP: + netid = RPCBIND_NETID_TCP; + break; + default: + return -EPROTONOSUPPORT; + } + + return rpcb_v4_register(program, version, + (struct sockaddr *)&sin, netid); +} + +/* + * Register an "inet6" protocol family netid with the local + * rpcbind daemon via an rpcbind v4 SET request. + * + * No netconfig infrastructure is available in the kernel, so + * we map IP_ protocol numbers to netids by hand. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. + */ +static int __svc_rpcb_register6(const u32 program, const u32 version, + const unsigned short protocol, + const unsigned short port) +{ + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; + char *netid; + + switch (protocol) { + case IPPROTO_UDP: + netid = RPCBIND_NETID_UDP6; + break; + case IPPROTO_TCP: + netid = RPCBIND_NETID_TCP6; + break; + default: + return -EPROTONOSUPPORT; + } + + return rpcb_v4_register(program, version, + (struct sockaddr *)&sin6, netid); +} + +/* + * Register a kernel RPC service via rpcbind version 4. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. + */ +static int __svc_register(const u32 program, const u32 version, + const sa_family_t family, + const unsigned short protocol, + const unsigned short port) +{ + int error; + + switch (family) { + case AF_INET: + return __svc_rpcb_register4(program, version, + protocol, port); + case AF_INET6: + error = __svc_rpcb_register6(program, version, + protocol, port); + if (error < 0) + return error; + + /* + * Work around bug in some versions of Linux rpcbind + * which don't allow registration of both inet and + * inet6 netids. + * + * Error return ignored for now. + */ + __svc_rpcb_register4(program, version, + protocol, port); + return 0; + } + + return -EAFNOSUPPORT; +} + +#else /* CONFIG_SUNRPC_REGISTER_V4 */ + +/* + * Register a kernel RPC service via rpcbind version 2. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. + */ +static int __svc_register(const u32 program, const u32 version, + sa_family_t family, + const unsigned short protocol, + const unsigned short port) +{ + if (family != AF_INET) + return -EAFNOSUPPORT; + + return rpcb_register(program, version, protocol, port); +} + +#endif /* CONFIG_SUNRPC_REGISTER_V4 */ + +/** + * svc_register - register an RPC service with the local portmapper + * @serv: svc_serv struct for the service to register + * @proto: transport protocol number to advertise + * @port: port to advertise + * + * Service is registered for any address in serv's address family + */ +int svc_register(const struct svc_serv *serv, const unsigned short proto, + const unsigned short port) { struct svc_program *progp; - unsigned long flags; unsigned int i; - int error = 0, dummy; + int error = 0; - if (!port) - clear_thread_flag(TIF_SIGPENDING); + BUG_ON(proto == 0 && port == 0); for (progp = serv->sv_program; progp; progp = progp->pg_next) { for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; - dprintk("svc: svc_register(%s, %s, %d, %d)%s\n", + dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n", progp->pg_name, + i, proto == IPPROTO_UDP? "udp" : "tcp", port, - i, + serv->sv_family, progp->pg_vers[i]->vs_hidden? " (but not telling portmap)" : ""); if (progp->pg_vers[i]->vs_hidden) continue; - error = rpcb_register(progp->pg_prog, i, proto, port, &dummy); + error = __svc_register(progp->pg_prog, i, + serv->sv_family, proto, port); if (error < 0) break; - if (port && !dummy) { - error = -EACCES; - break; - } } } - if (!port) { - spin_lock_irqsave(¤t->sighand->siglock, flags); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return error; +} + +#ifdef CONFIG_SUNRPC_REGISTER_V4 + +static void __svc_unregister(const u32 program, const u32 version, + const char *progname) +{ + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = 0, + }; + int error; + + error = rpcb_v4_register(program, version, + (struct sockaddr *)&sin6, ""); + dprintk("svc: %s(%sv%u), error %d\n", + __func__, progname, version, error); +} + +#else /* CONFIG_SUNRPC_REGISTER_V4 */ + +static void __svc_unregister(const u32 program, const u32 version, + const char *progname) +{ + int error; + + error = rpcb_register(program, version, 0, 0); + dprintk("svc: %s(%sv%u), error %d\n", + __func__, progname, version, error); +} + +#endif /* CONFIG_SUNRPC_REGISTER_V4 */ + +/* + * All netids, bind addresses and ports registered for [program, version] + * are removed from the local rpcbind database (if the service is not + * hidden) to make way for a new instance of the service. + * + * The result of unregistration is reported via dprintk for those who want + * verification of the result, but is otherwise not important. + */ +static void svc_unregister(const struct svc_serv *serv) +{ + struct svc_program *progp; + unsigned long flags; + unsigned int i; + + clear_thread_flag(TIF_SIGPENDING); + + for (progp = serv->sv_program; progp; progp = progp->pg_next) { + for (i = 0; i < progp->pg_nvers; i++) { + if (progp->pg_vers[i] == NULL) + continue; + if (progp->pg_vers[i]->vs_hidden) + continue; + + __svc_unregister(progp->pg_prog, i, progp->pg_name); + } } - return error; + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); } /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e46c825f495..bf5b5cdafeb 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -159,15 +159,44 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, } EXPORT_SYMBOL_GPL(svc_xprt_init); -int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, - int flags) +static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, + struct svc_serv *serv, + unsigned short port, int flags) { - struct svc_xprt_class *xcl; struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; + struct sockaddr *sap; + size_t len; + + switch (serv->sv_family) { + case AF_INET: + sap = (struct sockaddr *)&sin; + len = sizeof(sin); + break; + case AF_INET6: + sap = (struct sockaddr *)&sin6; + len = sizeof(sin6); + break; + default: + return ERR_PTR(-EAFNOSUPPORT); + } + + return xcl->xcl_ops->xpo_create(serv, sap, len, flags); +} + +int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, + int flags) +{ + struct svc_xprt_class *xcl; + dprintk("svc: creating transport %s[%d]\n", xprt_name, port); spin_lock(&svc_xprt_class_lock); list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { @@ -180,9 +209,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = xcl->xcl_ops-> - xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin), - flags); + newxprt = __svc_xpo_create(xcl, serv, port, flags); if (IS_ERR(newxprt)) { module_put(xcl->xcl_owner); return PTR_ERR(newxprt); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 3e65719f1ef..95293f549e9 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1114,6 +1114,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int val; dprintk("svc: svc_setup_socket %p\n", sock); if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { @@ -1146,6 +1147,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); + /* + * We start one listener per sv_serv. We want AF_INET + * requests to be automatically shunted to our AF_INET6 + * listener using a mapped IPv4 address. Make sure + * no-one starts an equivalent IPv4 listener, which + * would steal our incoming connections. + */ + val = 0; + if (serv->sv_family == AF_INET6) + kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, + (char *)&val, sizeof(val)); + dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); @@ -1154,8 +1167,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, int svc_addsock(struct svc_serv *serv, int fd, - char *name_return, - int *proto) + char *name_return) { int err = 0; struct socket *so = sockfd_lookup(fd, &err); @@ -1190,7 +1202,6 @@ int svc_addsock(struct svc_serv *serv, sockfd_put(so); return err; } - if (proto) *proto = so->sk->sk_protocol; return one_sock_name(name_return, svsk); } EXPORT_SYMBOL_GPL(svc_addsock); diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 0f8c439b848..5231f7aaac0 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -60,24 +60,14 @@ static int proc_do_xprt(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; - int len; + size_t len; + if ((*ppos && !write) || !*lenp) { *lenp = 0; return 0; } - if (write) - return -EINVAL; - else { - len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - if (!access_ok(VERIFY_WRITE, buffer, len)) - return -EFAULT; - - if (__copy_to_user(buffer, tmpbuf, len)) - return -EFAULT; - } - *lenp -= len; - *ppos += len; - return 0; + len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); + return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); } static int diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 99a52aabe33..29e401bb612 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -108,13 +108,10 @@ int xprt_register_transport(struct xprt_class *transport) goto out; } - result = -EINVAL; - if (try_module_get(THIS_MODULE)) { - list_add_tail(&transport->list, &xprt_list); - printk(KERN_INFO "RPC: Registered %s transport module.\n", - transport->name); - result = 0; - } + list_add_tail(&transport->list, &xprt_list); + printk(KERN_INFO "RPC: Registered %s transport module.\n", + transport->name); + result = 0; out: spin_unlock(&xprt_list_lock); @@ -143,7 +140,6 @@ int xprt_unregister_transport(struct xprt_class *transport) "RPC: Unregistered %s transport module.\n", transport->name); list_del_init(&transport->list); - module_put(THIS_MODULE); goto out; } } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index e55427f73df..14106d26bb9 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -118,6 +118,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, } if (xdrbuf->tail[0].iov_len) { + /* the rpcrdma protocol allows us to omit any trailing + * xdr pad bytes, saving the server an RDMA operation. */ + if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) + return n; if (n == nsegs) return 0; seg[n].mr_page = NULL; @@ -508,8 +512,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (hdrlen == 0) return -1; - dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd\n" - " headerp 0x%p base 0x%p lkey 0x%x\n", + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + " headerp 0x%p base 0x%p lkey 0x%x\n", __func__, transfertypes[wtype], hdrlen, rpclen, padlen, headerp, base, req->rl_iov.lkey); @@ -594,7 +598,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b * Scatter inline received data back into provided iov's. */ static void -rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) +rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) { int i, npages, curlen, olen; char *destp; @@ -660,6 +664,13 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) } else rqst->rq_rcv_buf.tail[0].iov_len = 0; + if (pad) { + /* implicit padding on terminal chunk */ + unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; + while (pad--) + p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; + } + if (copy_len) dprintk("RPC: %s: %d bytes in" " %d extra segments (%d lost)\n", @@ -681,12 +692,14 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) struct rpc_xprt *xprt = ep->rep_xprt; spin_lock_bh(&xprt->transport_lock); + if (++xprt->connect_cookie == 0) /* maintain a reserved value */ + ++xprt->connect_cookie; if (ep->rep_connected > 0) { if (!xprt_test_and_set_connected(xprt)) xprt_wake_pending_tasks(xprt, 0); } else { if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, ep->rep_connected); + xprt_wake_pending_tasks(xprt, -ENOTCONN); } spin_unlock_bh(&xprt->transport_lock); } @@ -769,7 +782,7 @@ repost: /* check for expected message types */ /* The order of some of these tests is important. */ switch (headerp->rm_type) { - case __constant_htonl(RDMA_MSG): + case htonl(RDMA_MSG): /* never expect read chunks */ /* never expect reply chunks (two ways to check) */ /* never expect write chunks without having offered RDMA */ @@ -792,17 +805,23 @@ repost: ((unsigned char *)iptr - (unsigned char *)headerp); status = rep->rr_len + rdmalen; r_xprt->rx_stats.total_rdma_reply += rdmalen; + /* special case - last chunk may omit padding */ + if (rdmalen &= 3) { + rdmalen = 4 - rdmalen; + status += rdmalen; + } } else { /* else ordinary inline */ + rdmalen = 0; iptr = (__be32 *)((unsigned char *)headerp + 28); rep->rr_len -= 28; /*sizeof *headerp;*/ status = rep->rr_len; } /* Fix up the rpc results for upper layer */ - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len); + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); break; - case __constant_htonl(RDMA_NOMSG): + case htonl(RDMA_NOMSG): /* never expect read or write chunks, always reply chunks */ if (headerp->rm_body.rm_chunks[0] != xdr_zero || headerp->rm_body.rm_chunks[1] != xdr_zero || diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 88c0ca20bb1..87101177825 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -69,6 +69,10 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; +/* Temporary NFS request map and context caches */ +struct kmem_cache *svc_rdma_map_cachep; +struct kmem_cache *svc_rdma_ctxt_cachep; + /* * This function implements reading and resetting an atomic_t stat * variable through read/write to a proc file. Any write to the file @@ -236,11 +240,14 @@ static ctl_table svcrdma_root_table[] = { void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); + flush_scheduled_work(); if (svcrdma_table_header) { unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; } svc_unreg_xprt_class(&svc_rdma_class); + kmem_cache_destroy(svc_rdma_map_cachep); + kmem_cache_destroy(svc_rdma_ctxt_cachep); } int svc_rdma_init(void) @@ -255,9 +262,37 @@ int svc_rdma_init(void) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); + /* Create the temporary map cache */ + svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache", + sizeof(struct svc_rdma_req_map), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!svc_rdma_map_cachep) { + printk(KERN_INFO "Could not allocate map cache.\n"); + goto err0; + } + + /* Create the temporary context cache */ + svc_rdma_ctxt_cachep = + kmem_cache_create("svc_rdma_ctxt_cache", + sizeof(struct svc_rdma_op_ctxt), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!svc_rdma_ctxt_cachep) { + printk(KERN_INFO "Could not allocate WR ctxt cache.\n"); + goto err1; + } + /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); return 0; + err1: + kmem_cache_destroy(svc_rdma_map_cachep); + err0: + unregister_sysctl_table(svcrdma_table_header); + return -ENOMEM; } MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); MODULE_DESCRIPTION("SVC RDMA Transport"); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 06ab4841537..a4756576d68 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -112,16 +112,11 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -struct chunk_sge { - int start; /* sge no for this chunk */ - int count; /* sge count for this chunk */ -}; - /* Encode a read-chunk-list as an array of IB SGE * * Assumptions: * - chunk[0]->position points to pages[0] at an offset of 0 - * - pages[] is not physically or virtually contigous and consists of + * - pages[] is not physically or virtually contiguous and consists of * PAGE_SIZE elements. * * Output: @@ -130,12 +125,12 @@ struct chunk_sge { * chunk in the read list * */ -static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, +static int map_read_chunks(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, struct rpcrdma_msg *rmsgp, - struct ib_sge *sge, - struct chunk_sge *ch_sge_ary, + struct svc_rdma_req_map *rpl_map, + struct svc_rdma_req_map *chl_map, int ch_count, int byte_count) { @@ -156,22 +151,18 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, head->arg.head[0] = rqstp->rq_arg.head[0]; head->arg.tail[0] = rqstp->rq_arg.tail[0]; head->arg.pages = &head->pages[head->count]; - head->sge[0].length = head->count; /* save count of hdr pages */ + head->hdr_count = head->count; /* save count of hdr pages */ head->arg.page_base = 0; head->arg.page_len = ch_bytes; head->arg.len = rqstp->rq_arg.len + ch_bytes; head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; head->count++; - ch_sge_ary[0].start = 0; + chl_map->ch[0].start = 0; while (byte_count) { + rpl_map->sge[sge_no].iov_base = + page_address(rqstp->rq_arg.pages[page_no]) + page_off; sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); - sge[sge_no].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - rqstp->rq_arg.pages[page_no], - page_off, sge_bytes, - DMA_FROM_DEVICE); - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + rpl_map->sge[sge_no].iov_len = sge_bytes; /* * Don't bump head->count here because the same page * may be used by multiple SGE. @@ -187,11 +178,11 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, * SGE, move to the next SGE */ if (ch_bytes == 0) { - ch_sge_ary[ch_no].count = - sge_no - ch_sge_ary[ch_no].start; + chl_map->ch[ch_no].count = + sge_no - chl_map->ch[ch_no].start; ch_no++; ch++; - ch_sge_ary[ch_no].start = sge_no; + chl_map->ch[ch_no].start = sge_no; ch_bytes = ch->rc_target.rs_length; /* If bytes remaining account for next chunk */ if (byte_count) { @@ -220,19 +211,128 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, return sge_no; } -static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt, - struct ib_sge *sge, - u64 *sgl_offset, - int count) +/* Map a read-chunk-list to an XDR and fast register the page-list. + * + * Assumptions: + * - chunk[0] position points to pages[0] at an offset of 0 + * - pages[] will be made physically contiguous by creating a one-off memory + * region using the fastreg verb. + * - byte_count is # of bytes in read-chunk-list + * - ch_count is # of chunks in read-chunk-list + * + * Output: + * - sge array pointing into pages[] array. + * - chunk_sge array specifying sge index and count for each + * chunk in the read list + */ +static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + struct rpcrdma_msg *rmsgp, + struct svc_rdma_req_map *rpl_map, + struct svc_rdma_req_map *chl_map, + int ch_count, + int byte_count) +{ + int page_no; + int ch_no; + u32 offset; + struct rpcrdma_read_chunk *ch; + struct svc_rdma_fastreg_mr *frmr; + int ret = 0; + + frmr = svc_rdma_get_frmr(xprt); + if (IS_ERR(frmr)) + return -ENOMEM; + + head->frmr = frmr; + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->hdr_count = head->count; /* save count of hdr pages */ + head->arg.page_base = 0; + head->arg.page_len = byte_count; + head->arg.len = rqstp->rq_arg.len + byte_count; + head->arg.buflen = rqstp->rq_arg.buflen + byte_count; + + /* Fast register the page list */ + frmr->kva = page_address(rqstp->rq_arg.pages[0]); + frmr->direction = DMA_FROM_DEVICE; + frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); + frmr->map_len = byte_count; + frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { + frmr->page_list->page_list[page_no] = + ib_dma_map_single(xprt->sc_cm_id->device, + page_address(rqstp->rq_arg.pages[page_no]), + PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; + } + head->count += page_no; + + /* rq_respages points one past arg pages */ + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; + + /* Create the reply and chunk maps */ + offset = 0; + ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + for (ch_no = 0; ch_no < ch_count; ch_no++) { + rpl_map->sge[ch_no].iov_base = frmr->kva + offset; + rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length; + chl_map->ch[ch_no].count = 1; + chl_map->ch[ch_no].start = ch_no; + offset += ch->rc_target.rs_length; + ch++; + } + + ret = svc_rdma_fastreg(xprt, frmr); + if (ret) + goto fatal_err; + + return ch_no; + + fatal_err: + printk("svcrdma: error fast registering xdr for xprt %p", xprt); + svc_rdma_put_frmr(xprt, frmr); + return -EIO; +} + +static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_fastreg_mr *frmr, + struct kvec *vec, + u64 *sgl_offset, + int count) { int i; ctxt->count = count; + ctxt->direction = DMA_FROM_DEVICE; for (i = 0; i < count; i++) { - ctxt->sge[i].addr = sge[i].addr; - ctxt->sge[i].length = sge[i].length; - *sgl_offset = *sgl_offset + sge[i].length; + ctxt->sge[i].length = 0; /* in case map fails */ + if (!frmr) { + ctxt->sge[i].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + vec[i].iov_base, + vec[i].iov_len, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + ctxt->sge[i].addr)) + return -EINVAL; + ctxt->sge[i].lkey = xprt->sc_dma_lkey; + atomic_inc(&xprt->sc_dma_used); + } else { + ctxt->sge[i].addr = (unsigned long)vec[i].iov_base; + ctxt->sge[i].lkey = frmr->mr->lkey; + } + ctxt->sge[i].length = vec[i].iov_len; + *sgl_offset = *sgl_offset + vec[i].iov_len; } + return 0; } static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) @@ -280,37 +380,44 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, struct svc_rdma_op_ctxt *hdr_ctxt) { struct ib_send_wr read_wr; + struct ib_send_wr inv_wr; int err = 0; int ch_no; - struct ib_sge *sge; int ch_count; int byte_count; int sge_count; u64 sgl_offset; struct rpcrdma_read_chunk *ch; struct svc_rdma_op_ctxt *ctxt = NULL; - struct svc_rdma_op_ctxt *tmp_sge_ctxt; - struct svc_rdma_op_ctxt *tmp_ch_ctxt; - struct chunk_sge *ch_sge_ary; + struct svc_rdma_req_map *rpl_map; + struct svc_rdma_req_map *chl_map; /* If no read list is present, return 0 */ ch = svc_rdma_get_read_chunk(rmsgp); if (!ch) return 0; - /* Allocate temporary contexts to keep SGE */ - BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge)); - tmp_sge_ctxt = svc_rdma_get_context(xprt); - sge = tmp_sge_ctxt->sge; - tmp_ch_ctxt = svc_rdma_get_context(xprt); - ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge; + /* Allocate temporary reply and chunk maps */ + rpl_map = svc_rdma_get_req_map(); + chl_map = svc_rdma_get_req_map(); svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); if (ch_count > RPCSVC_MAXPAGES) return -EINVAL; - sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, - sge, ch_sge_ary, - ch_count, byte_count); + + if (!xprt->sc_frmr_pg_list_len) + sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, + rpl_map, chl_map, ch_count, + byte_count); + else + sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, + rpl_map, chl_map, ch_count, + byte_count); + if (sge_count < 0) { + err = -EIO; + goto out; + } + sgl_offset = 0; ch_no = 0; @@ -319,32 +426,64 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, next_sge: ctxt = svc_rdma_get_context(xprt); ctxt->direction = DMA_FROM_DEVICE; + ctxt->frmr = hdr_ctxt->frmr; + ctxt->read_hdr = NULL; clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); /* Prepare READ WR */ memset(&read_wr, 0, sizeof read_wr); - ctxt->wr_op = IB_WR_RDMA_READ; read_wr.wr_id = (unsigned long)ctxt; read_wr.opcode = IB_WR_RDMA_READ; + ctxt->wr_op = read_wr.opcode; read_wr.send_flags = IB_SEND_SIGNALED; read_wr.wr.rdma.rkey = ch->rc_target.rs_handle; read_wr.wr.rdma.remote_addr = get_unaligned(&(ch->rc_target.rs_offset)) + sgl_offset; - read_wr.sg_list = &sge[ch_sge_ary[ch_no].start]; + read_wr.sg_list = ctxt->sge; read_wr.num_sge = - rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count); - rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start], - &sgl_offset, - read_wr.num_sge); + rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); + err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr, + &rpl_map->sge[chl_map->ch[ch_no].start], + &sgl_offset, + read_wr.num_sge); + if (err) { + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + goto out; + } if (((ch+1)->rc_discrim == 0) && - (read_wr.num_sge == ch_sge_ary[ch_no].count)) { + (read_wr.num_sge == chl_map->ch[ch_no].count)) { /* * Mark the last RDMA_READ with a bit to * indicate all RPC data has been fetched from * the client and the RPC needs to be enqueued. */ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + if (hdr_ctxt->frmr) { + set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + /* + * Invalidate the local MR used to map the data + * sink. + */ + if (xprt->sc_dev_caps & + SVCRDMA_DEVCAP_READ_W_INV) { + read_wr.opcode = + IB_WR_RDMA_READ_WITH_INV; + ctxt->wr_op = read_wr.opcode; + read_wr.ex.invalidate_rkey = + ctxt->frmr->mr->lkey; + } else { + /* Prepare INVALIDATE WR */ + memset(&inv_wr, 0, sizeof inv_wr); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED; + inv_wr.ex.invalidate_rkey = + hdr_ctxt->frmr->mr->lkey; + read_wr.next = &inv_wr; + } + } ctxt->read_hdr = hdr_ctxt; } /* Post the read */ @@ -358,9 +497,9 @@ next_sge: } atomic_inc(&rdma_stat_read); - if (read_wr.num_sge < ch_sge_ary[ch_no].count) { - ch_sge_ary[ch_no].count -= read_wr.num_sge; - ch_sge_ary[ch_no].start += read_wr.num_sge; + if (read_wr.num_sge < chl_map->ch[ch_no].count) { + chl_map->ch[ch_no].count -= read_wr.num_sge; + chl_map->ch[ch_no].start += read_wr.num_sge; goto next_sge; } sgl_offset = 0; @@ -368,8 +507,8 @@ next_sge: } out: - svc_rdma_put_context(tmp_sge_ctxt, 0); - svc_rdma_put_context(tmp_ch_ctxt, 0); + svc_rdma_put_req_map(rpl_map); + svc_rdma_put_req_map(chl_map); /* Detach arg pages. svc_recv will replenish them */ for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) @@ -399,7 +538,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_pages[page_no] = head->pages[page_no]; } /* Point rq_arg.pages past header */ - rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length]; + rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; rqstp->rq_arg.page_len = head->arg.page_len; rqstp->rq_arg.page_base = head->arg.page_base; @@ -449,18 +588,18 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) dprintk("svcrdma: rqstp=%p\n", rqstp); - spin_lock_bh(&rdma_xprt->sc_read_complete_lock); + spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); if (!list_empty(&rdma_xprt->sc_read_complete_q)) { ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, struct svc_rdma_op_ctxt, dto_q); list_del_init(&ctxt->dto_q); } - spin_unlock_bh(&rdma_xprt->sc_read_complete_lock); - if (ctxt) + if (ctxt) { + spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); return rdma_read_complete(rqstp, ctxt); + } - spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, struct svc_rdma_op_ctxt, diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index fb82b1b683f..9a7a8e7ae03 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -63,52 +63,165 @@ * SGE[2..sge_count-2] data from xdr->pages[] * SGE[sge_count-1] data from xdr->tail. * + * The max SGE we need is the length of the XDR / pagesize + one for + * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES + * reserves a page for both the request and the reply header, and this + * array is only concerned with the reply we are assured that we have + * on extra page for the RPCRMDA header. */ -static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct ib_sge *sge, - int *sge_count) +int fast_reg_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) +{ + int sge_no; + u32 sge_bytes; + u32 page_bytes; + u32 page_off; + int page_no = 0; + u8 *frva; + struct svc_rdma_fastreg_mr *frmr; + + frmr = svc_rdma_get_frmr(xprt); + if (IS_ERR(frmr)) + return -ENOMEM; + vec->frmr = frmr; + + /* Skip the RPCRDMA header */ + sge_no = 1; + + /* Map the head. */ + frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK); + vec->sge[sge_no].iov_base = xdr->head[0].iov_base; + vec->sge[sge_no].iov_len = xdr->head[0].iov_len; + vec->count = 2; + sge_no++; + + /* Build the FRMR */ + frmr->kva = frva; + frmr->direction = DMA_TO_DEVICE; + frmr->access_flags = 0; + frmr->map_len = PAGE_SIZE; + frmr->page_list_len = 1; + frmr->page_list->page_list[page_no] = + ib_dma_map_single(xprt->sc_cm_id->device, + (void *)xdr->head[0].iov_base, + PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + + page_off = xdr->page_base; + page_bytes = xdr->page_len + page_off; + if (!page_bytes) + goto encode_tail; + + /* Map the pages */ + vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; + vec->sge[sge_no].iov_len = page_bytes; + sge_no++; + while (page_bytes) { + struct page *page; + + page = xdr->pages[page_no++]; + sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); + page_bytes -= sge_bytes; + + frmr->page_list->page_list[page_no] = + ib_dma_map_page(xprt->sc_cm_id->device, page, 0, + PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + + atomic_inc(&xprt->sc_dma_used); + page_off = 0; /* reset for next time through loop */ + frmr->map_len += PAGE_SIZE; + frmr->page_list_len++; + } + vec->count++; + + encode_tail: + /* Map tail */ + if (0 == xdr->tail[0].iov_len) + goto done; + + vec->count++; + vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; + + if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) == + ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) { + /* + * If head and tail use the same page, we don't need + * to map it again. + */ + vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; + } else { + void *va; + + /* Map another page for the tail */ + page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; + va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK); + vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; + + frmr->page_list->page_list[page_no] = + ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + frmr->map_len += PAGE_SIZE; + frmr->page_list_len++; + } + + done: + if (svc_rdma_fastreg(xprt, frmr)) + goto fatal_err; + + return 0; + + fatal_err: + printk("svcrdma: Error fast registering memory for xprt %p\n", xprt); + svc_rdma_put_frmr(xprt, frmr); + return -EIO; +} + +static int map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { - /* Max we need is the length of the XDR / pagesize + one for - * head + one for tail + one for RPCRDMA header - */ int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; int sge_no; - u32 byte_count = xdr->len; u32 sge_bytes; u32 page_bytes; - int page_off; + u32 page_off; int page_no; + BUG_ON(xdr->len != + (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); + + if (xprt->sc_frmr_pg_list_len) + return fast_reg_xdr(xprt, xdr, vec); + /* Skip the first sge, this is for the RPCRDMA header */ sge_no = 1; /* Head SGE */ - sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device, - xdr->head[0].iov_base, - xdr->head[0].iov_len, - DMA_TO_DEVICE); - sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len); - byte_count -= sge_bytes; - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + vec->sge[sge_no].iov_base = xdr->head[0].iov_base; + vec->sge[sge_no].iov_len = xdr->head[0].iov_len; sge_no++; /* pages SGE */ page_no = 0; page_bytes = xdr->page_len; page_off = xdr->page_base; - while (byte_count && page_bytes) { - sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off)); - sge[sge_no].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - xdr->pages[page_no], page_off, - sge_bytes, DMA_TO_DEVICE); - sge_bytes = min(sge_bytes, page_bytes); - byte_count -= sge_bytes; + while (page_bytes) { + vec->sge[sge_no].iov_base = + page_address(xdr->pages[page_no]) + page_off; + sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); page_bytes -= sge_bytes; - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + vec->sge[sge_no].iov_len = sge_bytes; sge_no++; page_no++; @@ -116,36 +229,27 @@ static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt, } /* Tail SGE */ - if (byte_count && xdr->tail[0].iov_len) { - sge[sge_no].addr = - ib_dma_map_single(xprt->sc_cm_id->device, - xdr->tail[0].iov_base, - xdr->tail[0].iov_len, - DMA_TO_DEVICE); - sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len); - byte_count -= sge_bytes; - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + if (xdr->tail[0].iov_len) { + vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; + vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; sge_no++; } BUG_ON(sge_no > sge_max); - BUG_ON(byte_count != 0); - - *sge_count = sge_no; - return sge; + vec->count = sge_no; + return 0; } - /* Assumptions: + * - We are using FRMR + * - or - * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, u32 rmr, u64 to, u32 xdr_off, int write_len, - struct ib_sge *xdr_sge, int sge_count) + struct svc_rdma_req_map *vec) { - struct svc_rdma_op_ctxt *tmp_sge_ctxt; struct ib_send_wr write_wr; struct ib_sge *sge; int xdr_sge_no; @@ -154,25 +258,23 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, int sge_off; int bc; struct svc_rdma_op_ctxt *ctxt; - int ret = 0; - BUG_ON(sge_count > RPCSVC_MAXPAGES); + BUG_ON(vec->count > RPCSVC_MAXPAGES); dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " - "write_len=%d, xdr_sge=%p, sge_count=%d\n", + "write_len=%d, vec->sge=%p, vec->count=%lu\n", rmr, (unsigned long long)to, xdr_off, - write_len, xdr_sge, sge_count); + write_len, vec->sge, vec->count); ctxt = svc_rdma_get_context(xprt); - ctxt->count = 0; - tmp_sge_ctxt = svc_rdma_get_context(xprt); - sge = tmp_sge_ctxt->sge; + ctxt->direction = DMA_TO_DEVICE; + sge = ctxt->sge; /* Find the SGE associated with xdr_off */ - for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count; + for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count; xdr_sge_no++) { - if (xdr_sge[xdr_sge_no].length > bc) + if (vec->sge[xdr_sge_no].iov_len > bc) break; - bc -= xdr_sge[xdr_sge_no].length; + bc -= vec->sge[xdr_sge_no].iov_len; } sge_off = bc; @@ -180,22 +282,35 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge_no = 0; /* Copy the remaining SGE */ - while (bc != 0 && xdr_sge_no < sge_count) { - sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off; - sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey; - sge_bytes = min((size_t)bc, - (size_t)(xdr_sge[xdr_sge_no].length-sge_off)); + while (bc != 0) { + sge_bytes = min_t(size_t, + bc, vec->sge[xdr_sge_no].iov_len-sge_off); sge[sge_no].length = sge_bytes; - + if (!vec->frmr) { + sge[sge_no].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + (void *) + vec->sge[xdr_sge_no].iov_base + sge_off, + sge_bytes, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + sge[sge_no].addr)) + goto err; + atomic_inc(&xprt->sc_dma_used); + sge[sge_no].lkey = xprt->sc_dma_lkey; + } else { + sge[sge_no].addr = (unsigned long) + vec->sge[xdr_sge_no].iov_base + sge_off; + sge[sge_no].lkey = vec->frmr->mr->lkey; + } + ctxt->count++; + ctxt->frmr = vec->frmr; sge_off = 0; sge_no++; xdr_sge_no++; + BUG_ON(xdr_sge_no > vec->count); bc -= sge_bytes; } - BUG_ON(bc != 0); - BUG_ON(xdr_sge_no > sge_count); - /* Prepare WRITE WR */ memset(&write_wr, 0, sizeof write_wr); ctxt->wr_op = IB_WR_RDMA_WRITE; @@ -209,21 +324,20 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, /* Post It */ atomic_inc(&rdma_stat_write); - if (svc_rdma_send(xprt, &write_wr)) { - svc_rdma_put_context(ctxt, 1); - /* Fatal error, close transport */ - ret = -EIO; - } - svc_rdma_put_context(tmp_sge_ctxt, 0); - return ret; + if (svc_rdma_send(xprt, &write_wr)) + goto err; + return 0; + err: + svc_rdma_put_context(ctxt, 0); + /* Fatal error, close transport */ + return -EIO; } static int send_write_chunks(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rdma_argp, struct rpcrdma_msg *rdma_resp, struct svc_rqst *rqstp, - struct ib_sge *sge, - int sge_count) + struct svc_rdma_req_map *vec) { u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; int write_len; @@ -241,7 +355,10 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[1]; - max_write = xprt->sc_max_sge * PAGE_SIZE; + if (vec->frmr) + max_write = vec->frmr->map_len; + else + max_write = xprt->sc_max_sge * PAGE_SIZE; /* Write chunks start at the pagelist */ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; @@ -269,8 +386,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, rs_offset + chunk_off, xdr_off, this_write, - sge, - sge_count); + vec); if (ret) { dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", ret); @@ -292,8 +408,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rdma_argp, struct rpcrdma_msg *rdma_resp, struct svc_rqst *rqstp, - struct ib_sge *sge, - int sge_count) + struct svc_rdma_req_map *vec) { u32 xfer_len = rqstp->rq_res.len; int write_len; @@ -314,7 +429,10 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[2]; - max_write = xprt->sc_max_sge * PAGE_SIZE; + if (vec->frmr) + max_write = vec->frmr->map_len; + else + max_write = xprt->sc_max_sge * PAGE_SIZE; /* xdr offset starts at RPC message */ for (xdr_off = 0, chunk_no = 0; @@ -324,7 +442,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, ch = &arg_ary->wc_array[chunk_no].wc_target; write_len = min(xfer_len, ch->rs_length); - /* Prepare the reply chunk given the length actually * written */ rs_offset = get_unaligned(&(ch->rs_offset)); @@ -341,8 +458,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, rs_offset + chunk_off, xdr_off, this_write, - sge, - sge_count); + vec); if (ret) { dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", ret); @@ -380,10 +496,11 @@ static int send_reply(struct svcxprt_rdma *rdma, struct page *page, struct rpcrdma_msg *rdma_resp, struct svc_rdma_op_ctxt *ctxt, - int sge_count, + struct svc_rdma_req_map *vec, int byte_count) { struct ib_send_wr send_wr; + struct ib_send_wr inv_wr; int sge_no; int sge_bytes; int page_no; @@ -403,20 +520,45 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the context */ ctxt->pages[0] = page; ctxt->count = 1; + ctxt->frmr = vec->frmr; + if (vec->frmr) + set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + else + clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); /* Prepare the SGE for the RPCRDMA Header */ ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->direction = DMA_TO_DEVICE; + ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); - ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; + ctxt->sge[0].lkey = rdma->sc_dma_lkey; /* Determine how many of our SGE are to be transmitted */ - for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) { - sge_bytes = min((size_t)ctxt->sge[sge_no].length, - (size_t)byte_count); + for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { + sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; + if (!vec->frmr) { + ctxt->sge[sge_no].addr = + ib_dma_map_single(rdma->sc_cm_id->device, + vec->sge[sge_no].iov_base, + sge_bytes, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, + ctxt->sge[sge_no].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + } else { + ctxt->sge[sge_no].addr = (unsigned long) + vec->sge[sge_no].iov_base; + ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey; + } + ctxt->sge[sge_no].length = sge_bytes; } BUG_ON(byte_count != 0); @@ -428,9 +570,16 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; ctxt->count++; rqstp->rq_respages[page_no] = NULL; + /* + * If there are more pages than SGE, terminate SGE + * list so that svc_rdma_unmap_dma doesn't attempt to + * unmap garbage. + */ + if (page_no+1 >= sge_no) + ctxt->sge[page_no+1].length = 0; } - BUG_ON(sge_no > rdma->sc_max_sge); + BUG_ON(sge_no > ctxt->count); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; send_wr.wr_id = (unsigned long)ctxt; @@ -438,12 +587,26 @@ static int send_reply(struct svcxprt_rdma *rdma, send_wr.num_sge = sge_no; send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; + if (vec->frmr) { + /* Prepare INVALIDATE WR */ + memset(&inv_wr, 0, sizeof inv_wr); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED; + inv_wr.ex.invalidate_rkey = + vec->frmr->mr->lkey; + send_wr.next = &inv_wr; + } ret = svc_rdma_send(rdma, &send_wr); if (ret) - svc_rdma_put_context(ctxt, 1); + goto err; - return ret; + return 0; + + err: + svc_rdma_put_frmr(rdma, vec->frmr); + svc_rdma_put_context(ctxt, 1); + return -EIO; } void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) @@ -473,21 +636,22 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) enum rpcrdma_proc reply_type; int ret; int inline_bytes; - struct ib_sge *sge; - int sge_count = 0; struct page *res_page; struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_req_map *vec; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); /* Get the RDMA request header. */ rdma_argp = xdr_start(&rqstp->rq_arg); - /* Build an SGE for the XDR */ + /* Build an req vec for the XDR */ ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; - sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count); - + vec = svc_rdma_get_req_map(); + ret = map_xdr(rdma, &rqstp->rq_res, vec); + if (ret) + goto err0; inline_bytes = rqstp->rq_res.len; /* Create the RDMA response header */ @@ -503,30 +667,34 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /* Send any write-chunk data and build resp write-list */ ret = send_write_chunks(rdma, rdma_argp, rdma_resp, - rqstp, sge, sge_count); + rqstp, vec); if (ret < 0) { printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", ret); - goto error; + goto err1; } inline_bytes -= ret; /* Send any reply-list data and update resp reply-list */ ret = send_reply_chunks(rdma, rdma_argp, rdma_resp, - rqstp, sge, sge_count); + rqstp, vec); if (ret < 0) { printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", ret); - goto error; + goto err1; } inline_bytes -= ret; - ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count, + ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, inline_bytes); + svc_rdma_put_req_map(vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; - error: - svc_rdma_put_context(ctxt, 0); + + err1: put_page(res_page); + err0: + svc_rdma_put_req_map(vec); + svc_rdma_put_context(ctxt, 0); return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e132509d1db..6fb493cbd29 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -84,70 +84,46 @@ struct svc_xprt_class svc_rdma_class = { .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, }; -static int rdma_bump_context_cache(struct svcxprt_rdma *xprt) +/* WR context cache. Created in svc_rdma.c */ +extern struct kmem_cache *svc_rdma_ctxt_cachep; + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { - int target; - int at_least_one = 0; struct svc_rdma_op_ctxt *ctxt; - target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump, - xprt->sc_ctxt_max); - - spin_lock_bh(&xprt->sc_ctxt_lock); - while (xprt->sc_ctxt_cnt < target) { - xprt->sc_ctxt_cnt++; - spin_unlock_bh(&xprt->sc_ctxt_lock); - - ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); - - spin_lock_bh(&xprt->sc_ctxt_lock); - if (ctxt) { - at_least_one = 1; - INIT_LIST_HEAD(&ctxt->free_list); - list_add(&ctxt->free_list, &xprt->sc_ctxt_free); - } else { - /* kmalloc failed...give up for now */ - xprt->sc_ctxt_cnt--; + while (1) { + ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL); + if (ctxt) break; - } + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); } - spin_unlock_bh(&xprt->sc_ctxt_lock); - dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n", - xprt->sc_ctxt_max, xprt->sc_ctxt_cnt); - return at_least_one; + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->dto_q); + ctxt->count = 0; + ctxt->frmr = NULL; + atomic_inc(&xprt->sc_ctxt_used); + return ctxt; } -struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) { - struct svc_rdma_op_ctxt *ctxt; - - while (1) { - spin_lock_bh(&xprt->sc_ctxt_lock); - if (unlikely(list_empty(&xprt->sc_ctxt_free))) { - /* Try to bump my cache. */ - spin_unlock_bh(&xprt->sc_ctxt_lock); - - if (rdma_bump_context_cache(xprt)) - continue; - - printk(KERN_INFO "svcrdma: sleeping waiting for " - "context memory on xprt=%p\n", - xprt); - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - continue; + struct svcxprt_rdma *xprt = ctxt->xprt; + int i; + for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { + /* + * Unmap the DMA addr in the SGE if the lkey matches + * the sc_dma_lkey, otherwise, ignore it since it is + * an FRMR lkey and will be unmapped later when the + * last WR that uses it completes. + */ + if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_single(xprt->sc_cm_id->device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); } - ctxt = list_entry(xprt->sc_ctxt_free.next, - struct svc_rdma_op_ctxt, - free_list); - list_del_init(&ctxt->free_list); - spin_unlock_bh(&xprt->sc_ctxt_lock); - ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->dto_q); - ctxt->count = 0; - atomic_inc(&xprt->sc_ctxt_used); - break; } - return ctxt; } void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) @@ -161,18 +137,37 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - for (i = 0; i < ctxt->count; i++) - ib_dma_unmap_single(xprt->sc_cm_id->device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); - - spin_lock_bh(&xprt->sc_ctxt_lock); - list_add(&ctxt->free_list, &xprt->sc_ctxt_free); - spin_unlock_bh(&xprt->sc_ctxt_lock); + kmem_cache_free(svc_rdma_ctxt_cachep, ctxt); atomic_dec(&xprt->sc_ctxt_used); } +/* Temporary NFS request map cache. Created in svc_rdma.c */ +extern struct kmem_cache *svc_rdma_map_cachep; + +/* + * Temporary NFS req mappings are shared across all transport + * instances. These are short lived and should be bounded by the number + * of concurrent server threads * depth of the SQ. + */ +struct svc_rdma_req_map *svc_rdma_get_req_map(void) +{ + struct svc_rdma_req_map *map; + while (1) { + map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL); + if (map) + break; + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + } + map->count = 0; + map->frmr = NULL; + return map; +} + +void svc_rdma_put_req_map(struct svc_rdma_req_map *map) +{ + kmem_cache_free(svc_rdma_map_cachep, map); +} + /* ib_cq event handler */ static void cq_event_handler(struct ib_event *event, void *context) { @@ -302,6 +297,7 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; ctxt->wc_status = wc.status; ctxt->byte_len = wc.byte_len; + svc_rdma_unmap_dma(ctxt); if (wc.status != IB_WC_SUCCESS) { /* Close the transport */ dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt); @@ -330,6 +326,50 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) } /* + * Processs a completion context + */ +static void process_context(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt) +{ + svc_rdma_unmap_dma(ctxt); + + switch (ctxt->wr_op) { + case IB_WR_SEND: + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) + svc_rdma_put_frmr(xprt, ctxt->frmr); + svc_rdma_put_context(ctxt, 1); + break; + + case IB_WR_RDMA_WRITE: + svc_rdma_put_context(ctxt, 0); + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; + BUG_ON(!read_hdr); + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) + svc_rdma_put_frmr(xprt, ctxt->frmr); + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + } + svc_rdma_put_context(ctxt, 0); + break; + + default: + printk(KERN_ERR "svcrdma: unexpected completion type, " + "opcode=%d\n", + ctxt->wr_op); + break; + } +} + +/* * Send Queue Completion Handler - potentially called on interrupt context. * * Note that caller must hold a transport reference. @@ -341,16 +381,12 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) struct ib_cq *cq = xprt->sc_sq_cq; int ret; - if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) return; ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_sq_poll); while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; - xprt = ctxt->xprt; - if (wc.status != IB_WC_SUCCESS) /* Close the transport */ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); @@ -359,32 +395,10 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) atomic_dec(&xprt->sc_sq_count); wake_up(&xprt->sc_send_wait); - switch (ctxt->wr_op) { - case IB_WR_SEND: - case IB_WR_RDMA_WRITE: - svc_rdma_put_context(ctxt, 1); - break; - - case IB_WR_RDMA_READ: - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { - struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; - BUG_ON(!read_hdr); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - spin_lock_bh(&xprt->sc_read_complete_lock); - list_add_tail(&read_hdr->dto_q, - &xprt->sc_read_complete_q); - spin_unlock_bh(&xprt->sc_read_complete_lock); - svc_xprt_enqueue(&xprt->sc_xprt); - } - svc_rdma_put_context(ctxt, 0); - break; + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; + if (ctxt) + process_context(xprt, ctxt); - default: - printk(KERN_ERR "svcrdma: unexpected completion type, " - "opcode=%d, status=%d\n", - wc.opcode, wc.status); - break; - } svc_xprt_put(&xprt->sc_xprt); } @@ -423,40 +437,6 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context) tasklet_schedule(&dto_tasklet); } -static void create_context_cache(struct svcxprt_rdma *xprt, - int ctxt_count, int ctxt_bump, int ctxt_max) -{ - struct svc_rdma_op_ctxt *ctxt; - int i; - - xprt->sc_ctxt_max = ctxt_max; - xprt->sc_ctxt_bump = ctxt_bump; - xprt->sc_ctxt_cnt = 0; - atomic_set(&xprt->sc_ctxt_used, 0); - - INIT_LIST_HEAD(&xprt->sc_ctxt_free); - for (i = 0; i < ctxt_count; i++) { - ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); - if (ctxt) { - INIT_LIST_HEAD(&ctxt->free_list); - list_add(&ctxt->free_list, &xprt->sc_ctxt_free); - xprt->sc_ctxt_cnt++; - } - } -} - -static void destroy_context_cache(struct svcxprt_rdma *xprt) -{ - while (!list_empty(&xprt->sc_ctxt_free)) { - struct svc_rdma_op_ctxt *ctxt; - ctxt = list_entry(xprt->sc_ctxt_free.next, - struct svc_rdma_op_ctxt, - free_list); - list_del_init(&ctxt->free_list); - kfree(ctxt); - } -} - static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, int listener) { @@ -469,12 +449,12 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); + INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); - spin_lock_init(&cma_xprt->sc_read_complete_lock); - spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); + spin_lock_init(&cma_xprt->sc_frmr_q_lock); cma_xprt->sc_ord = svcrdma_ord; @@ -482,21 +462,9 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, cma_xprt->sc_max_requests = svcrdma_max_requests; cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; atomic_set(&cma_xprt->sc_sq_count, 0); + atomic_set(&cma_xprt->sc_ctxt_used, 0); - if (!listener) { - int reqs = cma_xprt->sc_max_requests; - create_context_cache(cma_xprt, - reqs << 1, /* starting size */ - reqs, /* bump amount */ - reqs + - cma_xprt->sc_sq_depth + - RPCRDMA_MAX_THREADS + 1); /* max */ - if (list_empty(&cma_xprt->sc_ctxt_free)) { - kfree(cma_xprt); - return NULL; - } - clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); - } else + if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); return cma_xprt; @@ -520,7 +488,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) struct ib_recv_wr recv_wr, *bad_recv_wr; struct svc_rdma_op_ctxt *ctxt; struct page *page; - unsigned long pa; + dma_addr_t pa; int sge_no; int buflen; int ret; @@ -535,9 +503,12 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) + goto err_put_ctxt; + atomic_inc(&xprt->sc_dma_used); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; buflen += PAGE_SIZE; } ctxt->count = sge_no; @@ -553,6 +524,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) svc_rdma_put_context(ctxt, 1); } return ret; + + err_put_ctxt: + svc_rdma_put_context(ctxt, 1); + return -ENOMEM; } /* @@ -566,7 +541,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) * will call the recvfrom method on the listen xprt which will accept the new * connection. */ -static void handle_connect_req(struct rdma_cm_id *new_cma_id) +static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) { struct svcxprt_rdma *listen_xprt = new_cma_id->context; struct svcxprt_rdma *newxprt; @@ -583,6 +558,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id) dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", newxprt, newxprt->sc_cm_id, listen_xprt); + /* Save client advertised inbound read limit for use later in accept. */ + newxprt->sc_ord = client_ird; + /* Set the local and remote addresses in the transport */ sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); @@ -619,7 +597,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, case RDMA_CM_EVENT_CONNECT_REQUEST: dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " "event=%d\n", cma_id, cma_id->context, event->event); - handle_connect_req(cma_id); + handle_connect_req(cma_id, + event->param.conn.initiator_depth); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -739,6 +718,97 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, return ERR_PTR(ret); } +static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) +{ + struct ib_mr *mr; + struct ib_fast_reg_page_list *pl; + struct svc_rdma_fastreg_mr *frmr; + + frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); + if (!frmr) + goto err; + + mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES); + if (!mr) + goto err_free_frmr; + + pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device, + RPCSVC_MAXPAGES); + if (!pl) + goto err_free_mr; + + frmr->mr = mr; + frmr->page_list = pl; + INIT_LIST_HEAD(&frmr->frmr_list); + return frmr; + + err_free_mr: + ib_dereg_mr(mr); + err_free_frmr: + kfree(frmr); + err: + return ERR_PTR(-ENOMEM); +} + +static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_fastreg_mr *frmr; + + while (!list_empty(&xprt->sc_frmr_q)) { + frmr = list_entry(xprt->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + ib_dereg_mr(frmr->mr); + ib_free_fast_reg_page_list(frmr->page_list); + kfree(frmr); + } +} + +struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_fastreg_mr *frmr = NULL; + + spin_lock_bh(&rdma->sc_frmr_q_lock); + if (!list_empty(&rdma->sc_frmr_q)) { + frmr = list_entry(rdma->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + frmr->map_len = 0; + frmr->page_list_len = 0; + } + spin_unlock_bh(&rdma->sc_frmr_q_lock); + if (frmr) + return frmr; + + return rdma_alloc_frmr(rdma); +} + +static void frmr_unmap_dma(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + int page_no; + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { + dma_addr_t addr = frmr->page_list->page_list[page_no]; + if (ib_dma_mapping_error(frmr->mr->device, addr)) + continue; + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE, + frmr->direction); + } +} + +void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, + struct svc_rdma_fastreg_mr *frmr) +{ + if (frmr) { + frmr_unmap_dma(rdma, frmr); + spin_lock_bh(&rdma->sc_frmr_q_lock); + BUG_ON(!list_empty(&frmr->frmr_list)); + list_add(&frmr->frmr_list, &rdma->sc_frmr_q); + spin_unlock_bh(&rdma->sc_frmr_q_lock); + } +} + /* * This is the xpo_recvfrom function for listening endpoints. Its * purpose is to accept incoming connections. The CMA callback handler @@ -757,6 +827,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; struct ib_device_attr devattr; + int dma_mr_acc; + int need_dma_mr; int ret; int i; @@ -793,8 +865,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) (size_t)svcrdma_max_requests); newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; - newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom, - (size_t)svcrdma_ord); + /* + * Limit ORD based on client limit, local device limit, and + * configured svcrdma limit. + */ + newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord); + newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); if (IS_ERR(newxprt->sc_pd)) { @@ -868,15 +944,77 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) } newxprt->sc_qp = newxprt->sc_cm_id->qp; - /* Register all of physical memory */ - newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd, - IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE); - if (IS_ERR(newxprt->sc_phys_mr)) { - dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret); + /* + * Use the most secure set of MR resources based on the + * transport type and available memory management features in + * the device. Here's the table implemented below: + * + * Fast Global DMA Remote WR + * Reg LKEY MR Access + * Sup'd Sup'd Needed Needed + * + * IWARP N N Y Y + * N Y Y Y + * Y N Y N + * Y Y N - + * + * IB N N Y N + * N Y N - + * Y N Y N + * Y Y N - + * + * NB: iWARP requires remote write access for the data sink + * of an RDMA_READ. IB does not. + */ + if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + newxprt->sc_frmr_pg_list_len = + devattr.max_fast_reg_page_list_len; + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; + } + + /* + * Determine if a DMA MR is required and if so, what privs are required + */ + switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) { + case RDMA_TRANSPORT_IWARP: + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { + need_dma_mr = 1; + dma_mr_acc = + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); + } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + case RDMA_TRANSPORT_IB: + if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + default: goto errout; } + /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ + if (need_dma_mr) { + /* Register all of physical memory */ + newxprt->sc_phys_mr = + ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); + if (IS_ERR(newxprt->sc_phys_mr)) { + dprintk("svcrdma: Failed to create DMA MR ret=%d\n", + ret); + goto errout; + } + newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; + } else + newxprt->sc_dma_lkey = + newxprt->sc_cm_id->device->local_dma_lkey; + /* Post receive buffers */ for (i = 0; i < newxprt->sc_max_requests; i++) { ret = svc_rdma_post_recv(newxprt); @@ -987,7 +1125,6 @@ static void __svc_rdma_free(struct work_struct *work) * cm_id because the device ptr is needed to unmap the dma in * svc_rdma_put_context. */ - spin_lock_bh(&rdma->sc_read_complete_lock); while (!list_empty(&rdma->sc_read_complete_q)) { struct svc_rdma_op_ctxt *ctxt; ctxt = list_entry(rdma->sc_read_complete_q.next, @@ -996,10 +1133,8 @@ static void __svc_rdma_free(struct work_struct *work) list_del_init(&ctxt->dto_q); svc_rdma_put_context(ctxt, 1); } - spin_unlock_bh(&rdma->sc_read_complete_lock); /* Destroy queued, but not processed recv completions */ - spin_lock_bh(&rdma->sc_rq_dto_lock); while (!list_empty(&rdma->sc_rq_dto_q)) { struct svc_rdma_op_ctxt *ctxt; ctxt = list_entry(rdma->sc_rq_dto_q.next, @@ -1008,10 +1143,13 @@ static void __svc_rdma_free(struct work_struct *work) list_del_init(&ctxt->dto_q); svc_rdma_put_context(ctxt, 1); } - spin_unlock_bh(&rdma->sc_rq_dto_lock); /* Warn if we leaked a resource or under-referenced */ WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); + WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); + + /* De-allocate fastreg mr */ + rdma_dealloc_frmr_q(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -1032,7 +1170,6 @@ static void __svc_rdma_free(struct work_struct *work) /* Destroy the CM ID */ rdma_destroy_id(rdma->sc_cm_id); - destroy_context_cache(rdma); kfree(rdma); } @@ -1067,21 +1204,59 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) return 1; } +/* + * Attempt to register the kvec representing the RPC memory with the + * device. + * + * Returns: + * NULL : The device does not support fastreg or there were no more + * fastreg mr. + * frmr : The kvec register request was successfully posted. + * <0 : An error was encountered attempting to register the kvec. + */ +int svc_rdma_fastreg(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + struct ib_send_wr fastreg_wr; + u8 key; + + /* Bump the key */ + key = (u8)(frmr->mr->lkey & 0x000000FF); + ib_update_fast_reg_key(frmr->mr, ++key); + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof fastreg_wr); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.send_flags = IB_SEND_SIGNALED; + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; + fastreg_wr.wr.fast_reg.page_list = frmr->page_list; + fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = frmr->map_len; + fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; + fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; + return svc_rdma_send(xprt, &fastreg_wr); +} + int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) { - struct ib_send_wr *bad_wr; + struct ib_send_wr *bad_wr, *n_wr; + int wr_count; + int i; int ret; if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) return -ENOTCONN; BUG_ON(wr->send_flags != IB_SEND_SIGNALED); - BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != - wr->opcode); + wr_count = 1; + for (n_wr = wr->next; n_wr; n_wr = n_wr->next) + wr_count++; + /* If the SQ is full, wait until an SQ entry is available */ while (1) { spin_lock_bh(&xprt->sc_lock); - if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { + if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) { spin_unlock_bh(&xprt->sc_lock); atomic_inc(&rdma_stat_sq_starve); @@ -1096,19 +1271,26 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) return 0; continue; } - /* Bumped used SQ WR count and post */ - svc_xprt_get(&xprt->sc_xprt); + /* Take a transport ref for each WR posted */ + for (i = 0; i < wr_count; i++) + svc_xprt_get(&xprt->sc_xprt); + + /* Bump used SQ WR count and post */ + atomic_add(wr_count, &xprt->sc_sq_count); ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); - if (!ret) - atomic_inc(&xprt->sc_sq_count); - else { - svc_xprt_put(&xprt->sc_xprt); + if (ret) { + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + atomic_sub(wr_count, &xprt->sc_sq_count); + for (i = 0; i < wr_count; i ++) + svc_xprt_put(&xprt->sc_xprt); dprintk("svcrdma: failed to post SQ WR rc=%d, " "sc_sq_count=%d, sc_sq_depth=%d\n", ret, atomic_read(&xprt->sc_sq_count), xprt->sc_sq_depth); } spin_unlock_bh(&xprt->sc_lock); + if (ret) + wake_up(&xprt->sc_send_wait); break; } return ret; @@ -1134,7 +1316,12 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, /* Prepare SGE for local address */ sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, p, 0, PAGE_SIZE, DMA_FROM_DEVICE); - sge.lkey = xprt->sc_phys_mr->lkey; + if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) { + put_page(p); + return; + } + atomic_inc(&xprt->sc_dma_used); + sge.lkey = xprt->sc_dma_lkey; sge.length = length; ctxt = svc_rdma_get_context(xprt); @@ -1155,6 +1342,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, if (ret) { dprintk("svcrdma: Error %d posting send for protocol error\n", ret); + ib_dma_unmap_page(xprt->sc_cm_id->device, + sge.addr, PAGE_SIZE, + DMA_FROM_DEVICE); svc_rdma_put_context(ctxt, 1); } } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index a564c1a39ec..9839c3d9414 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -70,11 +70,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; -#if !RPCRDMA_PERSISTENT_REGISTRATION -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */ -#else -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL; -#endif +static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; + int xprt_rdma_pad_optimize = 0; #ifdef RPC_DEBUG @@ -140,6 +137,14 @@ static ctl_table xr_tunables_table[] = { .extra2 = &max_memreg, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "rdma_pad_optimize", + .data = &xprt_rdma_pad_optimize, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0, }, }; @@ -458,6 +463,8 @@ xprt_rdma_close(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); dprintk("RPC: %s: closing\n", __func__); + if (r_xprt->rx_ep.rep_connected > 0) + xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); } @@ -485,6 +492,11 @@ xprt_rdma_connect(struct rpc_task *task) /* Reconnect */ schedule_delayed_work(&r_xprt->rdma_connect, xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > (30 * HZ)) + xprt->reestablish_timeout = (30 * HZ); + else if (xprt->reestablish_timeout < (5 * HZ)) + xprt->reestablish_timeout = (5 * HZ); } else { schedule_delayed_work(&r_xprt->rdma_connect, 0); if (!RPC_IS_ASYNC(task)) @@ -591,6 +603,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) } dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); out: + req->rl_connect_cookie = 0; /* our reserved value */ return req->rl_xdr_buf; outfail: @@ -694,13 +707,21 @@ xprt_rdma_send_request(struct rpc_task *task) req->rl_reply->rr_xprt = xprt; } - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) { - xprt_disconnect_done(xprt); - return -ENOTCONN; /* implies disconnect */ - } + /* Must suppress retransmit to maintain credits */ + if (req->rl_connect_cookie == xprt->connect_cookie) + goto drop_connection; + req->rl_connect_cookie = xprt->connect_cookie; + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + task->tk_bytes_sent += rqst->rq_snd_buf.len; rqst->rq_bytes_sent = 0; return 0; + +drop_connection: + xprt_disconnect_done(xprt); + return -ENOTCONN; /* implies disconnect */ } static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) @@ -770,7 +791,7 @@ static void __exit xprt_rdma_cleanup(void) { int rc; - dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); + dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n"); #ifdef RPC_DEBUG if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 8ea283ecc52..a5fef5e6c32 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -284,6 +284,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: + ia->ri_async_rc = 0; complete(&ia->ri_done); break; case RDMA_CM_EVENT_ADDR_ERROR: @@ -338,13 +339,32 @@ connected: wake_up_all(&ep->rep_connect_wait); break; default: - ia->ri_async_rc = -EINVAL; - dprintk("RPC: %s: unexpected CM event %X\n", + dprintk("RPC: %s: unexpected CM event %d\n", __func__, event->event); - complete(&ia->ri_done); break; } +#ifdef RPC_DEBUG + if (connstate == 1) { + int ird = attr.max_dest_rd_atomic; + int tird = ep->rep_remote_cma.responder_resources; + printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u " + "on %s, memreg %d slots %d ird %d%s\n", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port), + ia->ri_id->device->name, + ia->ri_memreg_strategy, + xprt->rx_buf.rb_max_requests, + ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); + } else if (connstate < 0) { + printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u " + "closed (%d)\n", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port), + connstate); + } +#endif + return 0; } @@ -355,6 +375,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rdma_cm_id *id; int rc; + init_completion(&ia->ri_done); + id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP); if (IS_ERR(id)) { rc = PTR_ERR(id); @@ -363,26 +385,28 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, return id; } - ia->ri_async_rc = 0; + ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", __func__, rc); goto out; } - wait_for_completion(&ia->ri_done); + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) goto out; - ia->ri_async_rc = 0; + ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); goto out; } - wait_for_completion(&ia->ri_done); + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) goto out; @@ -423,11 +447,10 @@ rpcrdma_clean_cq(struct ib_cq *cq) int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { - int rc; + int rc, mem_priv; + struct ib_device_attr devattr; struct rpcrdma_ia *ia = &xprt->rx_ia; - init_completion(&ia->ri_done); - ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); @@ -443,6 +466,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) } /* + * Query the device to determine if the requested memory + * registration strategy is supported. If it isn't, set the + * strategy to a globally supported model. + */ + rc = ib_query_device(ia->ri_id->device, &devattr); + if (rc) { + dprintk("RPC: %s: ib_query_device failed %d\n", + __func__, rc); + goto out2; + } + + if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { + ia->ri_have_dma_lkey = 1; + ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; + } + + switch (memreg) { + case RPCRDMA_MEMWINDOWS: + case RPCRDMA_MEMWINDOWS_ASYNC: + if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { + dprintk("RPC: %s: MEMWINDOWS registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; + } + break; + case RPCRDMA_MTHCAFMR: + if (!ia->ri_id->device->alloc_fmr) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; +#endif + } + break; + case RPCRDMA_FRMR: + /* Requires both frmr reg and local dma lkey */ + if ((devattr.device_cap_flags & + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; +#endif + } + break; + } + + /* * Optionally obtain an underlying physical identity mapping in * order to do a memory window-based bind. This base registration * is protected from remote access - that is enabled only by binding @@ -450,22 +540,28 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) * revoked after the corresponding completion similar to a storage * adapter. */ - if (memreg > RPCRDMA_REGISTER) { - int mem_priv = IB_ACCESS_LOCAL_WRITE; - switch (memreg) { + switch (memreg) { + case RPCRDMA_BOUNCEBUFFERS: + case RPCRDMA_REGISTER: + case RPCRDMA_FRMR: + break; #if RPCRDMA_PERSISTENT_REGISTRATION - case RPCRDMA_ALLPHYSICAL: - mem_priv |= IB_ACCESS_REMOTE_WRITE; - mem_priv |= IB_ACCESS_REMOTE_READ; - break; + case RPCRDMA_ALLPHYSICAL: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + goto register_setup; #endif - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - mem_priv |= IB_ACCESS_MW_BIND; - break; - default: + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_MW_BIND; + goto register_setup; + case RPCRDMA_MTHCAFMR: + if (ia->ri_have_dma_lkey) break; - } + mem_priv = IB_ACCESS_LOCAL_WRITE; + register_setup: ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " @@ -475,7 +571,15 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) memreg = RPCRDMA_REGISTER; ia->ri_bind_mem = NULL; } + break; + default: + printk(KERN_ERR "%s: invalid memory registration mode %d\n", + __func__, memreg); + rc = -EINVAL; + goto out2; } + dprintk("RPC: %s: memory registration strategy is %d\n", + __func__, memreg); /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; @@ -483,6 +587,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) return 0; out2: rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; out1: return rc; } @@ -503,15 +608,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) dprintk("RPC: %s: ib_dereg_mr returned %i\n", __func__, rc); } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp) - rdma_destroy_qp(ia->ri_id); + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { + if (ia->ri_id->qp) + rdma_destroy_qp(ia->ri_id); + rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; + } if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { rc = ib_dealloc_pd(ia->ri_pd); dprintk("RPC: %s: ib_dealloc_pd returned %i\n", __func__, rc); } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) - rdma_destroy_id(ia->ri_id); } /* @@ -541,6 +648,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + /* Add room for frmr register and invalidate WRs */ + ep->rep_attr.cap.max_send_wr *= 3; + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) + return -EINVAL; + break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: /* Add room for mw_binds+unbinds - overkill! */ @@ -617,29 +730,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_remote_cma.private_data_len = 0; /* Client offers RDMA Read but does not initiate */ - switch (ia->ri_memreg_strategy) { - case RPCRDMA_BOUNCEBUFFERS: + ep->rep_remote_cma.initiator_depth = 0; + if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) ep->rep_remote_cma.responder_resources = 0; - break; - case RPCRDMA_MTHCAFMR: - case RPCRDMA_REGISTER: - ep->rep_remote_cma.responder_resources = cdata->max_requests * - (RPCRDMA_MAX_DATA_SEGS / 8); - break; - case RPCRDMA_MEMWINDOWS: - case RPCRDMA_MEMWINDOWS_ASYNC: -#if RPCRDMA_PERSISTENT_REGISTRATION - case RPCRDMA_ALLPHYSICAL: -#endif - ep->rep_remote_cma.responder_resources = cdata->max_requests * - (RPCRDMA_MAX_DATA_SEGS / 2); - break; - default: - break; - } - if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom) + else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + ep->rep_remote_cma.responder_resources = 32; + else ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; - ep->rep_remote_cma.initiator_depth = 0; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; @@ -679,21 +776,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (rc) dprintk("RPC: %s: rpcrdma_ep_disconnect" " returned %i\n", __func__, rc); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; } - ep->rep_func = NULL; - /* padding - could be done in rpcrdma_buffer_destroy... */ if (ep->rep_pad_mr) { rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); ep->rep_pad_mr = NULL; } - if (ia->ri_id->qp) { - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - rpcrdma_clean_cq(ep->rep_cq); rc = ib_destroy_cq(ep->rep_cq); if (rc) @@ -712,9 +804,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rdma_cm_id *id; int rc = 0; int retry_count = 0; - int reconnect = (ep->rep_connected != 0); - if (reconnect) { + if (ep->rep_connected != 0) { struct rpcrdma_xprt *xprt; retry: rc = rpcrdma_ep_disconnect(ep, ia); @@ -745,6 +836,7 @@ retry: goto out; } /* END TEMP */ + rdma_destroy_qp(ia->ri_id); rdma_destroy_id(ia->ri_id); ia->ri_id = id; } @@ -769,14 +861,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { } } - /* Theoretically a client initiator_depth > 0 is not needed, - * but many peers fail to complete the connection unless they - * == responder_resources! */ - if (ep->rep_remote_cma.initiator_depth != - ep->rep_remote_cma.responder_resources) - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; - ep->rep_connected = 0; rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -786,9 +870,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { goto out; } - if (reconnect) - return 0; - wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); /* @@ -805,14 +886,16 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { if (ep->rep_connected <= 0) { /* Sometimes, the only way to reliably connect to remote * CMs is to use same nonzero values for ORD and IRD. */ - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; - if (ep->rep_remote_cma.initiator_depth == 0) - ++ep->rep_remote_cma.initiator_depth; - if (ep->rep_remote_cma.responder_resources == 0) - ++ep->rep_remote_cma.responder_resources; - if (retry_count++ == 0) + if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 && + (ep->rep_remote_cma.responder_resources == 0 || + ep->rep_remote_cma.initiator_depth != + ep->rep_remote_cma.responder_resources)) { + if (ep->rep_remote_cma.responder_resources == 0) + ep->rep_remote_cma.responder_resources = 1; + ep->rep_remote_cma.initiator_depth = + ep->rep_remote_cma.responder_resources; goto retry; + } rc = ep->rep_connected; } else { dprintk("RPC: %s: connected\n", __func__); @@ -863,6 +946,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, char *p; size_t len; int i, rc; + struct rpcrdma_mw *r; buf->rb_max_requests = cdata->max_requests; spin_lock_init(&buf->rb_lock); @@ -873,7 +957,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies * 4. padding, if any - * 5. mw's, if any + * 5. mw's, fmr's or frmr's, if any * Send/recv buffers in req/rep need to be registered */ @@ -881,6 +965,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); len += cdata->padding; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + len += buf->rb_max_requests * RPCRDMA_MAX_SEGS * + sizeof(struct rpcrdma_mw); + break; case RPCRDMA_MTHCAFMR: /* TBD we are perhaps overallocating here */ len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * @@ -927,15 +1015,37 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * and also reduce unbind-to-bind collision. */ INIT_LIST_HEAD(&buf->rb_mws); + r = (struct rpcrdma_mw *)p; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + goto out; + } + r->r.frmr.fr_pgl = + ib_alloc_fast_reg_page_list(ia->ri_id->device, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + goto out; + } + list_add(&r->mw_list, &buf->rb_mws); + ++r; + } + break; case RPCRDMA_MTHCAFMR: - { - struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; - struct ib_fmr_attr fa = { - RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT - }; /* TBD we are perhaps overallocating here */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { + static struct ib_fmr_attr fa = + { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; r->r.fmr = ib_alloc_fmr(ia->ri_pd, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, &fa); @@ -948,12 +1058,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, list_add(&r->mw_list, &buf->rb_mws); ++r; } - } break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; /* Allocate one extra request's worth, for full cycling */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { r->r.mw = ib_alloc_mw(ia->ri_pd); @@ -966,7 +1073,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, list_add(&r->mw_list, &buf->rb_mws); ++r; } - } break; default: break; @@ -1046,6 +1152,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { int rc, i; struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct rpcrdma_mw *r; /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) @@ -1065,11 +1172,19 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { while (!list_empty(&buf->rb_mws)) { - struct rpcrdma_mw *r; r = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s:" + " ib_dereg_mr" + " failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + break; case RPCRDMA_MTHCAFMR: rc = ib_dealloc_fmr(r->r.fmr); if (rc) @@ -1115,6 +1230,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { struct rpcrdma_req *req; unsigned long flags; + int i; + struct rpcrdma_mw *r; spin_lock_irqsave(&buffers->rb_lock, flags); if (buffers->rb_send_index == buffers->rb_max_requests) { @@ -1135,9 +1252,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) } buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; if (!list_empty(&buffers->rb_mws)) { - int i = RPCRDMA_MAX_SEGS - 1; + i = RPCRDMA_MAX_SEGS - 1; do { - struct rpcrdma_mw *r; r = list_entry(buffers->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); @@ -1171,6 +1287,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) req->rl_reply = NULL; } switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: @@ -1252,7 +1369,11 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, va, len, DMA_BIDIRECTIONAL); iov->length = len; - if (ia->ri_bind_mem != NULL) { + if (ia->ri_have_dma_lkey) { + *mrp = NULL; + iov->lkey = ia->ri_dma_lkey; + return 0; + } else if (ia->ri_bind_mem != NULL) { *mrp = NULL; iov->lkey = ia->ri_bind_mem->lkey; return 0; @@ -1329,15 +1450,292 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) seg->mr_dma, seg->mr_dmalen, seg->mr_dir); } +static int +rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr frmr_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments\n", + __func__, seg1->mr_chunk.rl_mw, i); + + /* Bump the key */ + key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + + /* Prepare FRMR WR */ + memset(&frmr_wr, 0, sizeof frmr_wr); + frmr_wr.opcode = IB_WR_FAST_REG_MR; + frmr_wr.send_flags = 0; /* unsignaled */ + frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma; + frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; + frmr_wr.wr.fast_reg.page_list_len = i; + frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; + frmr_wr.wr.fast_reg.access_flags = (writing ? + IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ); + frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); + + if (rc) { + dprintk("RPC: %s: failed ib_post_send for register," + " status %i\n", __func__, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + + memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.send_flags = 0; /* unsignaled */ + invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + if (rc) + dprintk("RPC: %s: failed ib_post_send for invalidate," + " status %i\n", __func__, rc); + return rc; +} + +static int +rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, + physaddrs, i, seg1->mr_dma); + if (rc) { + dprintk("RPC: %s: failed ib_map_phys_fmr " + "%u@0x%llx+%i (%d)... status %i\n", __func__, + len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + LIST_HEAD(l); + int rc; + + list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_unmap_fmr," + " status %i\n", __func__, rc); + return rc; +} + +static int +rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct ib_mw_bind param; + int rc; + + *nsegs = 1; + rpcrdma_map_one(ia, seg, writing); + param.mr = ia->ri_bind_mem; + param.wr_id = 0ULL; /* no send cookie */ + param.addr = seg->mr_dma; + param.length = seg->mr_len; + param.send_flags = 0; + param.mw_access_flags = mem_priv; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + if (rc) { + dprintk("RPC: %s: failed ib_bind_mw " + "%u@0x%llx status %i\n", + __func__, seg->mr_len, + (unsigned long long)seg->mr_dma, rc); + rpcrdma_unmap_one(ia, seg); + } else { + seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; + seg->mr_base = param.addr; + seg->mr_nsegs = 1; + } + return rc; +} + +static int +rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt, void **r) +{ + struct ib_mw_bind param; + LIST_HEAD(l); + int rc; + + BUG_ON(seg->mr_nsegs != 1); + param.mr = ia->ri_bind_mem; + param.addr = 0ULL; /* unbind */ + param.length = 0; + param.mw_access_flags = 0; + if (*r) { + param.wr_id = (u64) (unsigned long) *r; + param.send_flags = IB_SEND_SIGNALED; + INIT_CQCOUNT(&r_xprt->rx_ep); + } else { + param.wr_id = 0ULL; + param.send_flags = 0; + DECR_CQCOUNT(&r_xprt->rx_ep); + } + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + rpcrdma_unmap_one(ia, seg); + if (rc) + dprintk("RPC: %s: failed ib_(un)bind_mw," + " status %i\n", __func__, rc); + else + *r = NULL; /* will upcall on completion */ + return rc; +} + +static int +rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; + int len, i, rc = 0; + + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (len = 0, i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + ipb[i].addr = seg->mr_dma; + ipb[i].size = seg->mr_len; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) + break; + } + seg1->mr_base = seg1->mr_dma; + seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, + ipb, i, mem_priv, &seg1->mr_base); + if (IS_ERR(seg1->mr_chunk.rl_mr)) { + rc = PTR_ERR(seg1->mr_chunk.rl_mr); + dprintk("RPC: %s: failed ib_reg_phys_mr " + "%u@0x%llx (%d)... status %i\n", + __func__, len, + (unsigned long long)seg1->mr_dma, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + int rc; + + rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); + seg1->mr_chunk.rl_mr = NULL; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_dereg_mr," + " status %i\n", __func__, rc); + return rc; +} + int rpcrdma_register_external(struct rpcrdma_mr_seg *seg, int nsegs, int writing, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : - IB_ACCESS_REMOTE_READ); - struct rpcrdma_mr_seg *seg1 = seg; - int i; int rc = 0; switch (ia->ri_memreg_strategy) { @@ -1352,114 +1750,25 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, break; #endif - /* Registration using fast memory registration */ + /* Registration using frmr registration */ + case RPCRDMA_FRMR: + rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); + break; + + /* Registration using fmr memory registration */ case RPCRDMA_MTHCAFMR: - { - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; - int len, pageoff = offset_in_page(seg->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (nsegs > RPCRDMA_MAX_DATA_SEGS) - nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); - physaddrs[i] = seg->mr_dma; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - nsegs = i; - rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, - physaddrs, nsegs, seg1->mr_dma); - if (rc) { - dprintk("RPC: %s: failed ib_map_phys_fmr " - "%u@0x%llx+%i (%d)... status %i\n", __func__, - len, (unsigned long long)seg1->mr_dma, - pageoff, nsegs, rc); - while (nsegs--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = nsegs; - seg1->mr_len = len; - } - } + rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); break; /* Registration using memory windows */ case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct ib_mw_bind param; - rpcrdma_map_one(ia, seg, writing); - param.mr = ia->ri_bind_mem; - param.wr_id = 0ULL; /* no send cookie */ - param.addr = seg->mr_dma; - param.length = seg->mr_len; - param.send_flags = 0; - param.mw_access_flags = mem_priv; - - DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_bind_mw(ia->ri_id->qp, - seg->mr_chunk.rl_mw->r.mw, ¶m); - if (rc) { - dprintk("RPC: %s: failed ib_bind_mw " - "%u@0x%llx status %i\n", - __func__, seg->mr_len, - (unsigned long long)seg->mr_dma, rc); - rpcrdma_unmap_one(ia, seg); - } else { - seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; - seg->mr_base = param.addr; - seg->mr_nsegs = 1; - nsegs = 1; - } - } + rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); break; /* Default registration each time */ default: - { - struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; - int len = 0; - if (nsegs > RPCRDMA_MAX_DATA_SEGS) - nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); - ipb[i].addr = seg->mr_dma; - ipb[i].size = seg->mr_len; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - nsegs = i; - seg1->mr_base = seg1->mr_dma; - seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, - ipb, nsegs, mem_priv, &seg1->mr_base); - if (IS_ERR(seg1->mr_chunk.rl_mr)) { - rc = PTR_ERR(seg1->mr_chunk.rl_mr); - dprintk("RPC: %s: failed ib_reg_phys_mr " - "%u@0x%llx (%d)... status %i\n", - __func__, len, - (unsigned long long)seg1->mr_dma, nsegs, rc); - while (nsegs--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; - seg1->mr_nsegs = nsegs; - seg1->mr_len = len; - } - } + rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); break; } if (rc) @@ -1473,7 +1782,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt, void *r) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mr_seg *seg1 = seg; int nsegs = seg->mr_nsegs, rc; switch (ia->ri_memreg_strategy) { @@ -1486,56 +1794,21 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, break; #endif + case RPCRDMA_FRMR: + rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); + break; + case RPCRDMA_MTHCAFMR: - { - LIST_HEAD(l); - list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - } - if (rc) - dprintk("RPC: %s: failed ib_unmap_fmr," - " status %i\n", __func__, rc); + rc = rpcrdma_deregister_fmr_external(seg, ia); break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct ib_mw_bind param; - BUG_ON(nsegs != 1); - param.mr = ia->ri_bind_mem; - param.addr = 0ULL; /* unbind */ - param.length = 0; - param.mw_access_flags = 0; - if (r) { - param.wr_id = (u64) (unsigned long) r; - param.send_flags = IB_SEND_SIGNALED; - INIT_CQCOUNT(&r_xprt->rx_ep); - } else { - param.wr_id = 0ULL; - param.send_flags = 0; - DECR_CQCOUNT(&r_xprt->rx_ep); - } - rc = ib_bind_mw(ia->ri_id->qp, - seg->mr_chunk.rl_mw->r.mw, ¶m); - rpcrdma_unmap_one(ia, seg); - } - if (rc) - dprintk("RPC: %s: failed ib_(un)bind_mw," - " status %i\n", __func__, rc); - else - r = NULL; /* will upcall on completion */ + rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); break; default: - rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); - seg1->mr_chunk.rl_mr = NULL; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - if (rc) - dprintk("RPC: %s: failed ib_dereg_mr," - " status %i\n", __func__, rc); + rc = rpcrdma_deregister_default_external(seg, ia); break; } if (r) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2427822f8bd..c7a7eba991b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -51,6 +51,9 @@ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ +#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ +#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ + /* * Interface Adapter -- one per transport instance */ @@ -58,6 +61,8 @@ struct rpcrdma_ia { struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct ib_mr *ri_bind_mem; + u32 ri_dma_lkey; + int ri_have_dma_lkey; struct completion ri_done; int ri_async_rc; enum rpcrdma_memreg ri_memreg_strategy; @@ -156,6 +161,10 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ union { struct ib_mw *mw; struct ib_fmr *fmr; + struct { + struct ib_fast_reg_page_list *fr_pgl; + struct ib_mr *fr_mr; + } frmr; } r; struct list_head mw_list; } *rl_mw; @@ -175,6 +184,7 @@ struct rpcrdma_req { size_t rl_size; /* actual length of buffer */ unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ + unsigned int rl_connect_cookie; /* retry detection */ struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ @@ -198,7 +208,7 @@ struct rpcrdma_buffer { atomic_t rb_credits; /* most recent server credits */ unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ int rb_max_requests;/* client max requests */ - struct list_head rb_mws; /* optional memory windows/fmrs */ + struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ int rb_send_index; struct rpcrdma_req **rb_send_bufs; int rb_recv_index; @@ -273,6 +283,11 @@ struct rpcrdma_xprt { #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) +/* Setting this to 0 ensures interoperability with early servers. + * Setting this to 1 enhances certain unaligned read/write performance. + * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ +extern int xprt_rdma_pad_optimize; + /* * Interface Adapter calls - xprtrdma/verbs.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4486c59c3ac..9a288d5eea6 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -3,8 +3,8 @@ * * Client-side transport implementation for sockets. * - * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com> - * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com> + * TCP callback races fixes (C) 1998 Red Hat + * TCP send fixes (C) 1998 Red Hat * TCP NFS related read + write fixes * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie> * |