aboutsummaryrefslogtreecommitdiff
path: root/net/sunrpc
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-10-28 16:26:12 +0100
committerIngo Molnar <mingo@elte.hu>2008-10-28 16:26:12 +0100
commit7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
treee730a4565e0318140d2fbd2f0415d18a339d7336 /net/sunrpc
parent41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff)
parent0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'net/sunrpc')
-rw-r--r--net/sunrpc/auth.c2
-rw-r--r--net/sunrpc/auth_gss/Makefile4
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c2
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c10
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c26
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c16
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c72
-rw-r--r--net/sunrpc/clnt.c6
-rw-r--r--net/sunrpc/rpc_pipe.c2
-rw-r--r--net/sunrpc/rpcb_clnt.c121
-rw-r--r--net/sunrpc/svc.c361
-rw-r--r--net/sunrpc/svc_xprt.c39
-rw-r--r--net/sunrpc/svcsock.c17
-rw-r--r--net/sunrpc/sysctl.c18
-rw-r--r--net/sunrpc/xprt.c12
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c33
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c35
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c255
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c370
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c530
-rw-r--r--net/sunrpc/xprtrdma/transport.c41
-rw-r--r--net/sunrpc/xprtrdma/verbs.c741
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h17
-rw-r--r--net/sunrpc/xprtsock.c4
24 files changed, 1897 insertions, 837 deletions
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 6bfea9ed686..436bf1b4b76 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -83,10 +83,8 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
if (flavor >= RPC_AUTH_MAXFLAVOR)
goto out;
-#ifdef CONFIG_KMOD
if ((ops = auth_flavors[flavor]) == NULL)
request_module("rpc-auth-%u", flavor);
-#endif
spin_lock(&rpc_authflavor_lock);
ops = auth_flavors[flavor];
if (ops == NULL || !try_module_get(ops->owner)) {
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
index f3431a7e33d..4de8bcf26fa 100644
--- a/net/sunrpc/auth_gss/Makefile
+++ b/net/sunrpc/auth_gss/Makefile
@@ -5,12 +5,12 @@
obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o
auth_rpcgss-objs := auth_gss.o gss_generic_token.o \
- gss_mech_switch.o svcauth_gss.o gss_krb5_crypto.o
+ gss_mech_switch.o svcauth_gss.o
obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \
- gss_krb5_seqnum.o gss_krb5_wrap.o
+ gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o
obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 834a83199bd..853a4142cea 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -33,8 +33,6 @@
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * $Id$
*/
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 1d52308ca32..c93fca20455 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -83,8 +83,6 @@ out:
return ret;
}
-EXPORT_SYMBOL(krb5_encrypt);
-
u32
krb5_decrypt(
struct crypto_blkcipher *tfm,
@@ -118,8 +116,6 @@ out:
return ret;
}
-EXPORT_SYMBOL(krb5_decrypt);
-
static int
checksummer(struct scatterlist *sg, void *data)
{
@@ -161,8 +157,6 @@ out:
return err ? GSS_S_FAILURE : 0;
}
-EXPORT_SYMBOL(make_checksum);
-
struct encryptor_desc {
u8 iv[8]; /* XXX hard-coded blocksize */
struct blkcipher_desc desc;
@@ -262,8 +256,6 @@ gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
return ret;
}
-EXPORT_SYMBOL(gss_encrypt_xdr_buf);
-
struct decryptor_desc {
u8 iv[8]; /* XXX hard-coded blocksize */
struct blkcipher_desc desc;
@@ -334,5 +326,3 @@ gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc);
}
-
-EXPORT_SYMBOL(gss_decrypt_xdr_buf);
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 5f1d36dfbcf..b8f42ef7178 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -78,7 +78,7 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
struct krb5_ctx *ctx = gss_ctx->internal_ctx_id;
char cksumdata[16];
struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata};
- unsigned char *ptr, *krb5_hdr, *msg_start;
+ unsigned char *ptr, *msg_start;
s32 now;
u32 seq_send;
@@ -87,36 +87,36 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
now = get_seconds();
- token->len = g_token_size(&ctx->mech_used, 24);
+ token->len = g_token_size(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8);
ptr = token->data;
- g_make_token_header(&ctx->mech_used, 24, &ptr);
+ g_make_token_header(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8, &ptr);
- *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff);
- *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff);
+ /* ptr now at header described in rfc 1964, section 1.2.1: */
+ ptr[0] = (unsigned char) ((KG_TOK_MIC_MSG >> 8) & 0xff);
+ ptr[1] = (unsigned char) (KG_TOK_MIC_MSG & 0xff);
- /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
- krb5_hdr = ptr - 2;
- msg_start = krb5_hdr + 24;
+ msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8;
- *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5);
- memset(krb5_hdr + 4, 0xff, 4);
+ *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5);
+ memset(ptr + 4, 0xff, 4);
- if (make_checksum("md5", krb5_hdr, 8, text, 0, &md5cksum))
+ if (make_checksum("md5", ptr, 8, text, 0, &md5cksum))
return GSS_S_FAILURE;
if (krb5_encrypt(ctx->seq, NULL, md5cksum.data,
md5cksum.data, md5cksum.len))
return GSS_S_FAILURE;
- memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8);
+ memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8);
spin_lock(&krb5_seq_lock);
seq_send = ctx->seq_send++;
spin_unlock(&krb5_seq_lock);
if (krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff,
- seq_send, krb5_hdr + 16, krb5_hdr + 8))
+ seq_send, ptr + GSS_KRB5_TOK_HDR_LEN,
+ ptr + 8))
return GSS_S_FAILURE;
return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index d91a5d00480..066ec73c84d 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -92,30 +92,30 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
read_token->len))
return GSS_S_DEFECTIVE_TOKEN;
- if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) ||
- (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) )
+ if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) ||
+ (ptr[1] != (KG_TOK_MIC_MSG & 0xff)))
return GSS_S_DEFECTIVE_TOKEN;
/* XXX sanity-check bodysize?? */
- signalg = ptr[0] + (ptr[1] << 8);
+ signalg = ptr[2] + (ptr[3] << 8);
if (signalg != SGN_ALG_DES_MAC_MD5)
return GSS_S_DEFECTIVE_TOKEN;
- sealalg = ptr[2] + (ptr[3] << 8);
+ sealalg = ptr[4] + (ptr[5] << 8);
if (sealalg != SEAL_ALG_NONE)
return GSS_S_DEFECTIVE_TOKEN;
- if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
+ if ((ptr[6] != 0xff) || (ptr[7] != 0xff))
return GSS_S_DEFECTIVE_TOKEN;
- if (make_checksum("md5", ptr - 2, 8, message_buffer, 0, &md5cksum))
+ if (make_checksum("md5", ptr, 8, message_buffer, 0, &md5cksum))
return GSS_S_FAILURE;
if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, 16))
return GSS_S_FAILURE;
- if (memcmp(md5cksum.data + 8, ptr + 14, 8))
+ if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8))
return GSS_S_BAD_SIG;
/* it got through unscathed. Make sure the context is unexpired */
@@ -127,7 +127,7 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
/* do sequencing checks */
- if (krb5_get_seq_num(ctx->seq, ptr + 14, ptr + 6, &direction, &seqnum))
+ if (krb5_get_seq_num(ctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, &direction, &seqnum))
return GSS_S_FAILURE;
if ((ctx->initiate && direction != 0xff) ||
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index b00b1b42630..ae8e69b59c4 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -87,8 +87,8 @@ out:
return 0;
}
-static inline void
-make_confounder(char *p, int blocksize)
+static void
+make_confounder(char *p, u32 conflen)
{
static u64 i = 0;
u64 *q = (u64 *)p;
@@ -102,8 +102,22 @@ make_confounder(char *p, int blocksize)
* uniqueness would mean worrying about atomicity and rollover, and I
* don't care enough. */
- BUG_ON(blocksize != 8);
- *q = i++;
+ /* initialize to random value */
+ if (i == 0) {
+ i = random32();
+ i = (i << 32) | random32();
+ }
+
+ switch (conflen) {
+ case 16:
+ *q++ = i++;
+ /* fall through */
+ case 8:
+ *q++ = i++;
+ break;
+ default:
+ BUG();
+ }
}
/* Assumptions: the head and tail of inbuf are ours to play with.
@@ -122,7 +136,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
char cksumdata[16];
struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata};
int blocksize = 0, plainlen;
- unsigned char *ptr, *krb5_hdr, *msg_start;
+ unsigned char *ptr, *msg_start;
s32 now;
int headlen;
struct page **tmp_pages;
@@ -149,26 +163,26 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
buf->len += headlen;
BUG_ON((buf->len - offset - headlen) % blocksize);
- g_make_token_header(&kctx->mech_used, 24 + plainlen, &ptr);
+ g_make_token_header(&kctx->mech_used,
+ GSS_KRB5_TOK_HDR_LEN + 8 + plainlen, &ptr);
- *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff);
- *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff);
+ /* ptr now at header described in rfc 1964, section 1.2.1: */
+ ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff);
+ ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff);
- /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
- krb5_hdr = ptr - 2;
- msg_start = krb5_hdr + 24;
+ msg_start = ptr + 24;
- *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5);
- memset(krb5_hdr + 4, 0xff, 4);
- *(__be16 *)(krb5_hdr + 4) = htons(SEAL_ALG_DES);
+ *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5);
+ memset(ptr + 4, 0xff, 4);
+ *(__be16 *)(ptr + 4) = htons(SEAL_ALG_DES);
make_confounder(msg_start, blocksize);
/* XXXJBF: UGH!: */
tmp_pages = buf->pages;
buf->pages = pages;
- if (make_checksum("md5", krb5_hdr, 8, buf,
+ if (make_checksum("md5", ptr, 8, buf,
offset + headlen - blocksize, &md5cksum))
return GSS_S_FAILURE;
buf->pages = tmp_pages;
@@ -176,7 +190,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
md5cksum.data, md5cksum.len))
return GSS_S_FAILURE;
- memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8);
+ memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8);
spin_lock(&krb5_seq_lock);
seq_send = kctx->seq_send++;
@@ -185,7 +199,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
/* XXX would probably be more efficient to compute checksum
* and encrypt at the same time: */
if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff,
- seq_send, krb5_hdr + 16, krb5_hdr + 8)))
+ seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)))
return GSS_S_FAILURE;
if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize,
@@ -219,38 +233,38 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf)
buf->len - offset))
return GSS_S_DEFECTIVE_TOKEN;
- if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) ||
- (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) )
+ if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) ||
+ (ptr[1] != (KG_TOK_WRAP_MSG & 0xff)))
return GSS_S_DEFECTIVE_TOKEN;
/* XXX sanity-check bodysize?? */
/* get the sign and seal algorithms */
- signalg = ptr[0] + (ptr[1] << 8);
+ signalg = ptr[2] + (ptr[3] << 8);
if (signalg != SGN_ALG_DES_MAC_MD5)
return GSS_S_DEFECTIVE_TOKEN;
- sealalg = ptr[2] + (ptr[3] << 8);
+ sealalg = ptr[4] + (ptr[5] << 8);
if (sealalg != SEAL_ALG_DES)
return GSS_S_DEFECTIVE_TOKEN;
- if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
+ if ((ptr[6] != 0xff) || (ptr[7] != 0xff))
return GSS_S_DEFECTIVE_TOKEN;
if (gss_decrypt_xdr_buf(kctx->enc, buf,
- ptr + 22 - (unsigned char *)buf->head[0].iov_base))
+ ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base))
return GSS_S_DEFECTIVE_TOKEN;
- if (make_checksum("md5", ptr - 2, 8, buf,
- ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum))
+ if (make_checksum("md5", ptr, 8, buf,
+ ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base, &md5cksum))
return GSS_S_FAILURE;
if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
md5cksum.data, md5cksum.len))
return GSS_S_FAILURE;
- if (memcmp(md5cksum.data + 8, ptr + 14, 8))
+ if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8))
return GSS_S_BAD_SIG;
/* it got through unscathed. Make sure the context is unexpired */
@@ -262,8 +276,8 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf)
/* do sequencing checks */
- if (krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction,
- &seqnum))
+ if (krb5_get_seq_num(kctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8,
+ &direction, &seqnum))
return GSS_S_BAD_SIG;
if ((kctx->initiate && direction != 0xff) ||
@@ -274,7 +288,7 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf)
* better to copy and encrypt at the same time. */
blocksize = crypto_blkcipher_blocksize(kctx->enc);
- data_start = ptr + 22 + blocksize;
+ data_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8 + blocksize;
orig_start = buf->head[0].iov_base + offset;
data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start;
memmove(orig_start, data_start, data_len);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 76739e928d0..4895c341e46 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -174,7 +174,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
clnt->cl_procinfo = version->procs;
clnt->cl_maxproc = version->nrprocs;
clnt->cl_protname = program->name;
- clnt->cl_prog = program->number;
+ clnt->cl_prog = args->prognumber ? : program->number;
clnt->cl_vers = version->number;
clnt->cl_stats = program->stats;
clnt->cl_metrics = rpc_alloc_iostats(clnt);
@@ -213,10 +213,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
}
/* save the nodename */
- clnt->cl_nodelen = strlen(utsname()->nodename);
+ clnt->cl_nodelen = strlen(init_utsname()->nodename);
if (clnt->cl_nodelen > UNX_MAXNODENAME)
clnt->cl_nodelen = UNX_MAXNODENAME;
- memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen);
+ memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen);
rpc_register_client(clnt);
return clnt;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 5a9b0e7828c..23a2b8f6dc4 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -897,7 +897,7 @@ static struct file_system_type rpc_pipe_fs_type = {
};
static void
-init_once(struct kmem_cache * cachep, void *foo)
+init_once(void *foo)
{
struct rpc_inode *rpci = (struct rpc_inode *) foo;
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 24db2b4d12d..41013dd66ac 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -20,6 +20,7 @@
#include <linux/in6.h>
#include <linux/kernel.h>
#include <linux/errno.h>
+#include <net/ipv6.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/sched.h>
@@ -176,13 +177,12 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
}
static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
- u32 version, struct rpc_message *msg,
- int *result)
+ u32 version, struct rpc_message *msg)
{
struct rpc_clnt *rpcb_clnt;
- int error = 0;
+ int result, error = 0;
- *result = 0;
+ msg->rpc_resp = &result;
rpcb_clnt = rpcb_create_local(addr, addrlen, version);
if (!IS_ERR(rpcb_clnt)) {
@@ -191,12 +191,15 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
} else
error = PTR_ERR(rpcb_clnt);
- if (error < 0)
+ if (error < 0) {
printk(KERN_WARNING "RPC: failed to contact local rpcbind "
"server (errno %d).\n", -error);
- dprintk("RPC: registration status %d/%d\n", error, *result);
+ return error;
+ }
- return error;
+ if (!result)
+ return -EACCES;
+ return 0;
}
/**
@@ -205,7 +208,11 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
* @vers: RPC version number to bind
* @prot: transport protocol to register
* @port: port value to register
- * @okay: OUT: result code
+ *
+ * Returns zero if the registration request was dispatched successfully
+ * and the rpcbind daemon returned success. Otherwise, returns an errno
+ * value that reflects the nature of the error (request could not be
+ * dispatched, timed out, or rpcbind returned an error).
*
* RPC services invoke this function to advertise their contact
* information via the system's rpcbind daemon. RPC services
@@ -217,15 +224,6 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
* all registered transports for [program, version] from the local
* rpcbind database.
*
- * Returns zero if the registration request was dispatched
- * successfully and a reply was received. The rpcbind daemon's
- * boolean result code is stored in *okay.
- *
- * Returns an errno value and sets *result to zero if there was
- * some problem that prevented the rpcbind request from being
- * dispatched, or if the rpcbind daemon did not respond within
- * the timeout.
- *
* This function uses rpcbind protocol version 2 to contact the
* local rpcbind daemon.
*
@@ -236,7 +234,7 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
* IN6ADDR_ANY (ie available for all AF_INET and AF_INET6
* addresses).
*/
-int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
+int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
{
struct rpcbind_args map = {
.r_prog = prog,
@@ -246,7 +244,6 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
};
struct rpc_message msg = {
.rpc_argp = &map,
- .rpc_resp = okay,
};
dprintk("RPC: %sregistering (%u, %u, %d, %u) with local "
@@ -259,7 +256,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
sizeof(rpcb_inaddr_loopback),
- RPCBVERS_2, &msg, okay);
+ RPCBVERS_2, &msg);
}
/*
@@ -290,7 +287,7 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register,
return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
sizeof(rpcb_inaddr_loopback),
- RPCBVERS_4, msg, msg->rpc_resp);
+ RPCBVERS_4, msg);
}
/*
@@ -304,10 +301,13 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
char buf[64];
/* Construct AF_INET6 universal address */
- snprintf(buf, sizeof(buf),
- NIP6_FMT".%u.%u",
- NIP6(address_to_register->sin6_addr),
- port >> 8, port & 0xff);
+ if (ipv6_addr_any(&address_to_register->sin6_addr))
+ snprintf(buf, sizeof(buf), "::.%u.%u",
+ port >> 8, port & 0xff);
+ else
+ snprintf(buf, sizeof(buf), NIP6_FMT".%u.%u",
+ NIP6(address_to_register->sin6_addr),
+ port >> 8, port & 0xff);
map->r_addr = buf;
dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
@@ -321,7 +321,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback,
sizeof(rpcb_in6addr_loopback),
- RPCBVERS_4, msg, msg->rpc_resp);
+ RPCBVERS_4, msg);
}
/**
@@ -330,7 +330,11 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
* @version: RPC version number of service to (un)register
* @address: address family, IP address, and port to (un)register
* @netid: netid of transport protocol to (un)register
- * @result: result code from rpcbind RPC call
+ *
+ * Returns zero if the registration request was dispatched successfully
+ * and the rpcbind daemon returned success. Otherwise, returns an errno
+ * value that reflects the nature of the error (request could not be
+ * dispatched, timed out, or rpcbind returned an error).
*
* RPC services invoke this function to advertise their contact
* information via the system's rpcbind daemon. RPC services
@@ -342,15 +346,6 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
* to zero. Callers pass a netid of "" to unregister all
* transport netids associated with [program, version, address].
*
- * Returns zero if the registration request was dispatched
- * successfully and a reply was received. The rpcbind daemon's
- * result code is stored in *result.
- *
- * Returns an errno value and sets *result to zero if there was
- * some problem that prevented the rpcbind request from being
- * dispatched, or if the rpcbind daemon did not respond within
- * the timeout.
- *
* This function uses rpcbind protocol version 4 to contact the
* local rpcbind daemon. The local rpcbind daemon must support
* version 4 of the rpcbind protocol in order for these functions
@@ -372,8 +367,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
* advertises the service on all IPv4 and IPv6 addresses.
*/
int rpcb_v4_register(const u32 program, const u32 version,
- const struct sockaddr *address, const char *netid,
- int *result)
+ const struct sockaddr *address, const char *netid)
{
struct rpcbind_args map = {
.r_prog = program,
@@ -383,11 +377,8 @@ int rpcb_v4_register(const u32 program, const u32 version,
};
struct rpc_message msg = {
.rpc_argp = &map,
- .rpc_resp = result,
};
- *result = 0;
-
switch (address->sa_family) {
case AF_INET:
return rpcb_register_netid4((struct sockaddr_in *)address,
@@ -469,6 +460,28 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi
return rpc_run_task(&task_setup_data);
}
+/*
+ * In the case where rpc clients have been cloned, we want to make
+ * sure that we use the program number/version etc of the actual
+ * owner of the xprt. To do so, we walk back up the tree of parents
+ * to find whoever created the transport and/or whoever has the
+ * autobind flag set.
+ */
+static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt)
+{
+ struct rpc_clnt *parent = clnt->cl_parent;
+
+ while (parent != clnt) {
+ if (parent->cl_xprt != clnt->cl_xprt)
+ break;
+ if (clnt->cl_autobind)
+ break;
+ clnt = parent;
+ parent = parent->cl_parent;
+ }
+ return clnt;
+}
+
/**
* rpcb_getport_async - obtain the port for a given RPC service on a given host
* @task: task that is waiting for portmapper request
@@ -478,10 +491,10 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi
*/
void rpcb_getport_async(struct rpc_task *task)
{
- struct rpc_clnt *clnt = task->tk_client;
+ struct rpc_clnt *clnt;
struct rpc_procinfo *proc;
u32 bind_version;
- struct rpc_xprt *xprt = task->tk_xprt;
+ struct rpc_xprt *xprt;
struct rpc_clnt *rpcb_clnt;
static struct rpcbind_args *map;
struct rpc_task *child;
@@ -490,13 +503,13 @@ void rpcb_getport_async(struct rpc_task *task)
size_t salen;
int status;
+ clnt = rpcb_find_transport_owner(task->tk_client);
+ xprt = clnt->cl_xprt;
+
dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
task->tk_pid, __func__,
clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot);
- /* Autobind on cloned rpc clients is discouraged */
- BUG_ON(clnt->cl_parent != clnt);
-
/* Put self on the wait queue to ensure we get notified if
* some other task is already attempting to bind the port */
rpc_sleep_on(&xprt->binding, task, NULL);
@@ -558,7 +571,7 @@ void rpcb_getport_async(struct rpc_task *task)
status = -ENOMEM;
dprintk("RPC: %5u %s: no memory available\n",
task->tk_pid, __func__);
- goto bailout_nofree;
+ goto bailout_release_client;
}
map->r_prog = clnt->cl_prog;
map->r_vers = clnt->cl_vers;
@@ -578,11 +591,13 @@ void rpcb_getport_async(struct rpc_task *task)
task->tk_pid, __func__);
return;
}
- rpc_put_task(child);
- task->tk_xprt->stat.bind_count++;
+ xprt->stat.bind_count++;
+ rpc_put_task(child);
return;
+bailout_release_client:
+ rpc_release_client(rpcb_clnt);
bailout_nofree:
rpcb_wake_rpcbind_waiters(xprt, status);
task->tk_status = status;
@@ -633,7 +648,7 @@ static void rpcb_getport_done(struct rpc_task *child, void *data)
static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p,
struct rpcbind_args *rpcb)
{
- dprintk("RPC: rpcb_encode_mapping(%u, %u, %d, %u)\n",
+ dprintk("RPC: encoding rpcb request (%u, %u, %d, %u)\n",
rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port);
*p++ = htonl(rpcb->r_prog);
*p++ = htonl(rpcb->r_vers);
@@ -648,7 +663,7 @@ static int rpcb_decode_getport(struct rpc_rqst *req, __be32 *p,
unsigned short *portp)
{
*portp = (unsigned short) ntohl(*p++);
- dprintk("RPC: rpcb_decode_getport result %u\n",
+ dprintk("RPC: rpcb getport result: %u\n",
*portp);
return 0;
}
@@ -657,7 +672,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p,
unsigned int *boolp)
{
*boolp = (unsigned int) ntohl(*p++);
- dprintk("RPC: rpcb_decode_set: call %s\n",
+ dprintk("RPC: rpcb set/unset call %s\n",
(*boolp ? "succeeded" : "failed"));
return 0;
}
@@ -665,7 +680,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p,
static int rpcb_encode_getaddr(struct rpc_rqst *req, __be32 *p,
struct rpcbind_args *rpcb)
{
- dprintk("RPC: rpcb_encode_getaddr(%u, %u, %s)\n",
+ dprintk("RPC: encoding rpcb request (%u, %u, %s)\n",
rpcb->r_prog, rpcb->r_vers, rpcb->r_addr);
*p++ = htonl(rpcb->r_prog);
*p++ = htonl(rpcb->r_vers);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 01c7e311b90..54c98d87684 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/module.h>
+#include <linux/kthread.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/xdr.h>
@@ -27,6 +28,8 @@
#define RPCDBG_FACILITY RPCDBG_SVCDSP
+static void svc_unregister(const struct svc_serv *serv);
+
#define svc_serv_is_pooled(serv) ((serv)->sv_function)
/*
@@ -291,15 +294,14 @@ svc_pool_map_put(void)
/*
- * Set the current thread's cpus_allowed mask so that it
+ * Set the given thread's cpus_allowed mask so that it
* will only run on cpus in the given pool.
- *
- * Returns 1 and fills in oldmask iff a cpumask was applied.
*/
-static inline int
-svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
+static inline void
+svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
{
struct svc_pool_map *m = &svc_pool_map;
+ unsigned int node = m->pool_to[pidx];
/*
* The caller checks for sv_nrpools > 1, which
@@ -307,26 +309,17 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
*/
BUG_ON(m->count == 0);
- switch (m->mode)
- {
- default:
- return 0;
+ switch (m->mode) {
case SVC_POOL_PERCPU:
{
- unsigned int cpu = m->pool_to[pidx];
-
- *oldmask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- return 1;
+ set_cpus_allowed_ptr(task, &cpumask_of_cpu(node));
+ break;
}
case SVC_POOL_PERNODE:
{
- unsigned int node = m->pool_to[pidx];
node_to_cpumask_ptr(nodecpumask, node);
-
- *oldmask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, nodecpumask);
- return 1;
+ set_cpus_allowed_ptr(task, nodecpumask);
+ break;
}
}
}
@@ -366,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu)
*/
static struct svc_serv *
__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
- void (*shutdown)(struct svc_serv *serv))
+ sa_family_t family, void (*shutdown)(struct svc_serv *serv))
{
struct svc_serv *serv;
unsigned int vers;
@@ -375,6 +368,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
return NULL;
+ serv->sv_family = family;
serv->sv_name = prog->pg_name;
serv->sv_program = prog;
serv->sv_nrthreads = 1;
@@ -425,34 +419,32 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
spin_lock_init(&pool->sp_lock);
}
-
/* Remove any stale portmap registrations */
- svc_register(serv, 0, 0);
+ svc_unregister(serv);
return serv;
}
struct svc_serv *
svc_create(struct svc_program *prog, unsigned int bufsize,
- void (*shutdown)(struct svc_serv *serv))
+ sa_family_t family, void (*shutdown)(struct svc_serv *serv))
{
- return __svc_create(prog, bufsize, /*npools*/1, shutdown);
+ return __svc_create(prog, bufsize, /*npools*/1, family, shutdown);
}
EXPORT_SYMBOL(svc_create);
struct svc_serv *
svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
- void (*shutdown)(struct svc_serv *serv),
- svc_thread_fn func, int sig, struct module *mod)
+ sa_family_t family, void (*shutdown)(struct svc_serv *serv),
+ svc_thread_fn func, struct module *mod)
{
struct svc_serv *serv;
unsigned int npools = svc_pool_map_get();
- serv = __svc_create(prog, bufsize, npools, shutdown);
+ serv = __svc_create(prog, bufsize, npools, family, shutdown);
if (serv != NULL) {
serv->sv_function = func;
- serv->sv_kill_signal = sig;
serv->sv_module = mod;
}
@@ -461,7 +453,8 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
EXPORT_SYMBOL(svc_create_pooled);
/*
- * Destroy an RPC service. Should be called with the BKL held
+ * Destroy an RPC service. Should be called with appropriate locking to
+ * protect the sv_nrthreads, sv_permsocks and sv_tempsocks.
*/
void
svc_destroy(struct svc_serv *serv)
@@ -495,8 +488,7 @@ svc_destroy(struct svc_serv *serv)
if (svc_serv_is_pooled(serv))
svc_pool_map_put();
- /* Unregister service with the portmapper */
- svc_register(serv, 0, 0);
+ svc_unregister(serv);
kfree(serv->sv_pools);
kfree(serv);
}
@@ -578,46 +570,6 @@ out_enomem:
EXPORT_SYMBOL(svc_prepare_thread);
/*
- * Create a thread in the given pool. Caller must hold BKL.
- * On a NUMA or SMP machine, with a multi-pool serv, the thread
- * will be restricted to run on the cpus belonging to the pool.
- */
-static int
-__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
- struct svc_pool *pool)
-{
- struct svc_rqst *rqstp;
- int error = -ENOMEM;
- int have_oldmask = 0;
- cpumask_t uninitialized_var(oldmask);
-
- rqstp = svc_prepare_thread(serv, pool);
- if (IS_ERR(rqstp)) {
- error = PTR_ERR(rqstp);
- goto out;
- }
-
- if (serv->sv_nrpools > 1)
- have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
-
- error = kernel_thread((int (*)(void *)) func, rqstp, 0);
-
- if (have_oldmask)
- set_cpus_allowed(current, oldmask);
-
- if (error < 0)
- goto out_thread;
- svc_sock_update_bufs(serv);
- error = 0;
-out:
- return error;
-
-out_thread:
- svc_exit_thread(rqstp);
- goto out;
-}
-
-/*
* Choose a pool in which to create a new thread, for svc_set_num_threads
*/
static inline struct svc_pool *
@@ -674,7 +626,7 @@ found_pool:
* of threads the given number. If `pool' is non-NULL, applies
* only to threads in that pool, otherwise round-robins between
* all pools. Must be called with a svc_get() reference and
- * the BKL held.
+ * the BKL or another lock to protect access to svc_serv fields.
*
* Destroying threads relies on the service threads filling in
* rqstp->rq_task, which only the nfs ones do. Assumes the serv
@@ -686,7 +638,9 @@ found_pool:
int
svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
- struct task_struct *victim;
+ struct svc_rqst *rqstp;
+ struct task_struct *task;
+ struct svc_pool *chosen_pool;
int error = 0;
unsigned int state = serv->sv_nrthreads-1;
@@ -702,18 +656,34 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
/* create new threads */
while (nrservs > 0) {
nrservs--;
+ chosen_pool = choose_pool(serv, pool, &state);
+
+ rqstp = svc_prepare_thread(serv, chosen_pool);
+ if (IS_ERR(rqstp)) {
+ error = PTR_ERR(rqstp);
+ break;
+ }
+
__module_get(serv->sv_module);
- error = __svc_create_thread(serv->sv_function, serv,
- choose_pool(serv, pool, &state));
- if (error < 0) {
+ task = kthread_create(serv->sv_function, rqstp, serv->sv_name);
+ if (IS_ERR(task)) {
+ error = PTR_ERR(task);
module_put(serv->sv_module);
+ svc_exit_thread(rqstp);
break;
}
+
+ rqstp->rq_task = task;
+ if (serv->sv_nrpools > 1)
+ svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
+
+ svc_sock_update_bufs(serv);
+ wake_up_process(task);
}
/* destroy old threads */
while (nrservs < 0 &&
- (victim = choose_victim(serv, pool, &state)) != NULL) {
- send_sig(serv->sv_kill_signal, victim, 1);
+ (task = choose_victim(serv, pool, &state)) != NULL) {
+ send_sig(SIGINT, task, 1);
nrservs++;
}
@@ -722,7 +692,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
EXPORT_SYMBOL(svc_set_num_threads);
/*
- * Called from a server thread as it's exiting. Caller must hold BKL.
+ * Called from a server thread as it's exiting. Caller must hold the BKL or
+ * the "service mutex", whichever is appropriate for the service.
*/
void
svc_exit_thread(struct svc_rqst *rqstp)
@@ -748,55 +719,245 @@ svc_exit_thread(struct svc_rqst *rqstp)
}
EXPORT_SYMBOL(svc_exit_thread);
+#ifdef CONFIG_SUNRPC_REGISTER_V4
+
/*
- * Register an RPC service with the local portmapper.
- * To unregister a service, call this routine with
- * proto and port == 0.
+ * Register an "inet" protocol family netid with the local
+ * rpcbind daemon via an rpcbind v4 SET request.
+ *
+ * No netconfig infrastructure is available in the kernel, so
+ * we map IP_ protocol numbers to netids by hand.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
*/
-int
-svc_register(struct svc_serv *serv, int proto, unsigned short port)
+static int __svc_rpcb_register4(const u32 program, const u32 version,
+ const unsigned short protocol,
+ const unsigned short port)
+{
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+ .sin_port = htons(port),
+ };
+ char *netid;
+
+ switch (protocol) {
+ case IPPROTO_UDP:
+ netid = RPCBIND_NETID_UDP;
+ break;
+ case IPPROTO_TCP:
+ netid = RPCBIND_NETID_TCP;
+ break;
+ default:
+ return -EPROTONOSUPPORT;
+ }
+
+ return rpcb_v4_register(program, version,
+ (struct sockaddr *)&sin, netid);
+}
+
+/*
+ * Register an "inet6" protocol family netid with the local
+ * rpcbind daemon via an rpcbind v4 SET request.
+ *
+ * No netconfig infrastructure is available in the kernel, so
+ * we map IP_ protocol numbers to netids by hand.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_rpcb_register6(const u32 program, const u32 version,
+ const unsigned short protocol,
+ const unsigned short port)
+{
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+ .sin6_port = htons(port),
+ };
+ char *netid;
+
+ switch (protocol) {
+ case IPPROTO_UDP:
+ netid = RPCBIND_NETID_UDP6;
+ break;
+ case IPPROTO_TCP:
+ netid = RPCBIND_NETID_TCP6;
+ break;
+ default:
+ return -EPROTONOSUPPORT;
+ }
+
+ return rpcb_v4_register(program, version,
+ (struct sockaddr *)&sin6, netid);
+}
+
+/*
+ * Register a kernel RPC service via rpcbind version 4.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_register(const u32 program, const u32 version,
+ const sa_family_t family,
+ const unsigned short protocol,
+ const unsigned short port)
+{
+ int error;
+
+ switch (family) {
+ case AF_INET:
+ return __svc_rpcb_register4(program, version,
+ protocol, port);
+ case AF_INET6:
+ error = __svc_rpcb_register6(program, version,
+ protocol, port);
+ if (error < 0)
+ return error;
+
+ /*
+ * Work around bug in some versions of Linux rpcbind
+ * which don't allow registration of both inet and
+ * inet6 netids.
+ *
+ * Error return ignored for now.
+ */
+ __svc_rpcb_register4(program, version,
+ protocol, port);
+ return 0;
+ }
+
+ return -EAFNOSUPPORT;
+}
+
+#else /* CONFIG_SUNRPC_REGISTER_V4 */
+
+/*
+ * Register a kernel RPC service via rpcbind version 2.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_register(const u32 program, const u32 version,
+ sa_family_t family,
+ const unsigned short protocol,
+ const unsigned short port)
+{
+ if (family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ return rpcb_register(program, version, protocol, port);
+}
+
+#endif /* CONFIG_SUNRPC_REGISTER_V4 */
+
+/**
+ * svc_register - register an RPC service with the local portmapper
+ * @serv: svc_serv struct for the service to register
+ * @proto: transport protocol number to advertise
+ * @port: port to advertise
+ *
+ * Service is registered for any address in serv's address family
+ */
+int svc_register(const struct svc_serv *serv, const unsigned short proto,
+ const unsigned short port)
{
struct svc_program *progp;
- unsigned long flags;
unsigned int i;
- int error = 0, dummy;
+ int error = 0;
- if (!port)
- clear_thread_flag(TIF_SIGPENDING);
+ BUG_ON(proto == 0 && port == 0);
for (progp = serv->sv_program; progp; progp = progp->pg_next) {
for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)
continue;
- dprintk("svc: svc_register(%s, %s, %d, %d)%s\n",
+ dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n",
progp->pg_name,
+ i,
proto == IPPROTO_UDP? "udp" : "tcp",
port,
- i,
+ serv->sv_family,
progp->pg_vers[i]->vs_hidden?
" (but not telling portmap)" : "");
if (progp->pg_vers[i]->vs_hidden)
continue;
- error = rpcb_register(progp->pg_prog, i, proto, port, &dummy);
+ error = __svc_register(progp->pg_prog, i,
+ serv->sv_family, proto, port);
if (error < 0)
break;
- if (port && !dummy) {
- error = -EACCES;
- break;
- }
}
}
- if (!port) {
- spin_lock_irqsave(&current->sighand->siglock, flags);
- recalc_sigpending();
- spin_unlock_irqrestore(&current->sighand->siglock, flags);
+ return error;
+}
+
+#ifdef CONFIG_SUNRPC_REGISTER_V4
+
+static void __svc_unregister(const u32 program, const u32 version,
+ const char *progname)
+{
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+ .sin6_port = 0,
+ };
+ int error;
+
+ error = rpcb_v4_register(program, version,
+ (struct sockaddr *)&sin6, "");
+ dprintk("svc: %s(%sv%u), error %d\n",
+ __func__, progname, version, error);
+}
+
+#else /* CONFIG_SUNRPC_REGISTER_V4 */
+
+static void __svc_unregister(const u32 program, const u32 version,
+ const char *progname)
+{
+ int error;
+
+ error = rpcb_register(program, version, 0, 0);
+ dprintk("svc: %s(%sv%u), error %d\n",
+ __func__, progname, version, error);
+}
+
+#endif /* CONFIG_SUNRPC_REGISTER_V4 */
+
+/*
+ * All netids, bind addresses and ports registered for [program, version]
+ * are removed from the local rpcbind database (if the service is not
+ * hidden) to make way for a new instance of the service.
+ *
+ * The result of unregistration is reported via dprintk for those who want
+ * verification of the result, but is otherwise not important.
+ */
+static void svc_unregister(const struct svc_serv *serv)
+{
+ struct svc_program *progp;
+ unsigned long flags;
+ unsigned int i;
+
+ clear_thread_flag(TIF_SIGPENDING);
+
+ for (progp = serv->sv_program; progp; progp = progp->pg_next) {
+ for (i = 0; i < progp->pg_nvers; i++) {
+ if (progp->pg_vers[i] == NULL)
+ continue;
+ if (progp->pg_vers[i]->vs_hidden)
+ continue;
+
+ __svc_unregister(progp->pg_prog, i, progp->pg_name);
+ }
}
- return error;
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
}
/*
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e46c825f495..bf5b5cdafeb 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -159,15 +159,44 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
}
EXPORT_SYMBOL_GPL(svc_xprt_init);
-int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
- int flags)
+static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
+ struct svc_serv *serv,
+ unsigned short port, int flags)
{
- struct svc_xprt_class *xcl;
struct sockaddr_in sin = {
.sin_family = AF_INET,
.sin_addr.s_addr = htonl(INADDR_ANY),
.sin_port = htons(port),
};
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+ .sin6_port = htons(port),
+ };
+ struct sockaddr *sap;
+ size_t len;
+
+ switch (serv->sv_family) {
+ case AF_INET:
+ sap = (struct sockaddr *)&sin;
+ len = sizeof(sin);
+ break;
+ case AF_INET6:
+ sap = (struct sockaddr *)&sin6;
+ len = sizeof(sin6);
+ break;
+ default:
+ return ERR_PTR(-EAFNOSUPPORT);
+ }
+
+ return xcl->xcl_ops->xpo_create(serv, sap, len, flags);
+}
+
+int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
+ int flags)
+{
+ struct svc_xprt_class *xcl;
+
dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
spin_lock(&svc_xprt_class_lock);
list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
@@ -180,9 +209,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
goto err;
spin_unlock(&svc_xprt_class_lock);
- newxprt = xcl->xcl_ops->
- xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin),
- flags);
+ newxprt = __svc_xpo_create(xcl, serv, port, flags);
if (IS_ERR(newxprt)) {
module_put(xcl->xcl_owner);
return PTR_ERR(newxprt);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 3e65719f1ef..95293f549e9 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1114,6 +1114,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
struct svc_sock *svsk;
struct sock *inet;
int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
+ int val;
dprintk("svc: svc_setup_socket %p\n", sock);
if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1146,6 +1147,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
else
svc_tcp_init(svsk, serv);
+ /*
+ * We start one listener per sv_serv. We want AF_INET
+ * requests to be automatically shunted to our AF_INET6
+ * listener using a mapped IPv4 address. Make sure
+ * no-one starts an equivalent IPv4 listener, which
+ * would steal our incoming connections.
+ */
+ val = 0;
+ if (serv->sv_family == AF_INET6)
+ kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
+ (char *)&val, sizeof(val));
+
dprintk("svc: svc_setup_socket created %p (inet %p)\n",
svsk, svsk->sk_sk);
@@ -1154,8 +1167,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
int svc_addsock(struct svc_serv *serv,
int fd,
- char *name_return,
- int *proto)
+ char *name_return)
{
int err = 0;
struct socket *so = sockfd_lookup(fd, &err);
@@ -1190,7 +1202,6 @@ int svc_addsock(struct svc_serv *serv,
sockfd_put(so);
return err;
}
- if (proto) *proto = so->sk->sk_protocol;
return one_sock_name(name_return, svsk);
}
EXPORT_SYMBOL_GPL(svc_addsock);
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index 0f8c439b848..5231f7aaac0 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -60,24 +60,14 @@ static int proc_do_xprt(ctl_table *table, int write, struct file *file,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
char tmpbuf[256];
- int len;
+ size_t len;
+
if ((*ppos && !write) || !*lenp) {
*lenp = 0;
return 0;
}
- if (write)
- return -EINVAL;
- else {
- len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
- if (!access_ok(VERIFY_WRITE, buffer, len))
- return -EFAULT;
-
- if (__copy_to_user(buffer, tmpbuf, len))
- return -EFAULT;
- }
- *lenp -= len;
- *ppos += len;
- return 0;
+ len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
+ return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
}
static int
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 99a52aabe33..29e401bb612 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -108,13 +108,10 @@ int xprt_register_transport(struct xprt_class *transport)
goto out;
}
- result = -EINVAL;
- if (try_module_get(THIS_MODULE)) {
- list_add_tail(&transport->list, &xprt_list);
- printk(KERN_INFO "RPC: Registered %s transport module.\n",
- transport->name);
- result = 0;
- }
+ list_add_tail(&transport->list, &xprt_list);
+ printk(KERN_INFO "RPC: Registered %s transport module.\n",
+ transport->name);
+ result = 0;
out:
spin_unlock(&xprt_list_lock);
@@ -143,7 +140,6 @@ int xprt_unregister_transport(struct xprt_class *transport)
"RPC: Unregistered %s transport module.\n",
transport->name);
list_del_init(&transport->list);
- module_put(THIS_MODULE);
goto out;
}
}
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e55427f73df..14106d26bb9 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -118,6 +118,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
}
if (xdrbuf->tail[0].iov_len) {
+ /* the rpcrdma protocol allows us to omit any trailing
+ * xdr pad bytes, saving the server an RDMA operation. */
+ if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
+ return n;
if (n == nsegs)
return 0;
seg[n].mr_page = NULL;
@@ -508,8 +512,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
if (hdrlen == 0)
return -1;
- dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd\n"
- " headerp 0x%p base 0x%p lkey 0x%x\n",
+ dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+ " headerp 0x%p base 0x%p lkey 0x%x\n",
__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
headerp, base, req->rl_iov.lkey);
@@ -594,7 +598,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
* Scatter inline received data back into provided iov's.
*/
static void
-rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{
int i, npages, curlen, olen;
char *destp;
@@ -660,6 +664,13 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
} else
rqst->rq_rcv_buf.tail[0].iov_len = 0;
+ if (pad) {
+ /* implicit padding on terminal chunk */
+ unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
+ while (pad--)
+ p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+ }
+
if (copy_len)
dprintk("RPC: %s: %d bytes in"
" %d extra segments (%d lost)\n",
@@ -681,12 +692,14 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
struct rpc_xprt *xprt = ep->rep_xprt;
spin_lock_bh(&xprt->transport_lock);
+ if (++xprt->connect_cookie == 0) /* maintain a reserved value */
+ ++xprt->connect_cookie;
if (ep->rep_connected > 0) {
if (!xprt_test_and_set_connected(xprt))
xprt_wake_pending_tasks(xprt, 0);
} else {
if (xprt_test_and_clear_connected(xprt))
- xprt_wake_pending_tasks(xprt, ep->rep_connected);
+ xprt_wake_pending_tasks(xprt, -ENOTCONN);
}
spin_unlock_bh(&xprt->transport_lock);
}
@@ -769,7 +782,7 @@ repost:
/* check for expected message types */
/* The order of some of these tests is important. */
switch (headerp->rm_type) {
- case __constant_htonl(RDMA_MSG):
+ case htonl(RDMA_MSG):
/* never expect read chunks */
/* never expect reply chunks (two ways to check) */
/* never expect write chunks without having offered RDMA */
@@ -792,17 +805,23 @@ repost:
((unsigned char *)iptr - (unsigned char *)headerp);
status = rep->rr_len + rdmalen;
r_xprt->rx_stats.total_rdma_reply += rdmalen;
+ /* special case - last chunk may omit padding */
+ if (rdmalen &= 3) {
+ rdmalen = 4 - rdmalen;
+ status += rdmalen;
+ }
} else {
/* else ordinary inline */
+ rdmalen = 0;
iptr = (__be32 *)((unsigned char *)headerp + 28);
rep->rr_len -= 28; /*sizeof *headerp;*/
status = rep->rr_len;
}
/* Fix up the rpc results for upper layer */
- rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len);
+ rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
break;
- case __constant_htonl(RDMA_NOMSG):
+ case htonl(RDMA_NOMSG):
/* never expect read or write chunks, always reply chunks */
if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
headerp->rm_body.rm_chunks[1] != xdr_zero ||
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 88c0ca20bb1..87101177825 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -69,6 +69,10 @@ atomic_t rdma_stat_rq_prod;
atomic_t rdma_stat_sq_poll;
atomic_t rdma_stat_sq_prod;
+/* Temporary NFS request map and context caches */
+struct kmem_cache *svc_rdma_map_cachep;
+struct kmem_cache *svc_rdma_ctxt_cachep;
+
/*
* This function implements reading and resetting an atomic_t stat
* variable through read/write to a proc file. Any write to the file
@@ -236,11 +240,14 @@ static ctl_table svcrdma_root_table[] = {
void svc_rdma_cleanup(void)
{
dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
+ flush_scheduled_work();
if (svcrdma_table_header) {
unregister_sysctl_table(svcrdma_table_header);
svcrdma_table_header = NULL;
}
svc_unreg_xprt_class(&svc_rdma_class);
+ kmem_cache_destroy(svc_rdma_map_cachep);
+ kmem_cache_destroy(svc_rdma_ctxt_cachep);
}
int svc_rdma_init(void)
@@ -255,9 +262,37 @@ int svc_rdma_init(void)
svcrdma_table_header =
register_sysctl_table(svcrdma_root_table);
+ /* Create the temporary map cache */
+ svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
+ sizeof(struct svc_rdma_req_map),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!svc_rdma_map_cachep) {
+ printk(KERN_INFO "Could not allocate map cache.\n");
+ goto err0;
+ }
+
+ /* Create the temporary context cache */
+ svc_rdma_ctxt_cachep =
+ kmem_cache_create("svc_rdma_ctxt_cache",
+ sizeof(struct svc_rdma_op_ctxt),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!svc_rdma_ctxt_cachep) {
+ printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
+ goto err1;
+ }
+
/* Register RDMA with the SVC transport switch */
svc_reg_xprt_class(&svc_rdma_class);
return 0;
+ err1:
+ kmem_cache_destroy(svc_rdma_map_cachep);
+ err0:
+ unregister_sysctl_table(svcrdma_table_header);
+ return -ENOMEM;
}
MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
MODULE_DESCRIPTION("SVC RDMA Transport");
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 06ab4841537..a4756576d68 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -112,16 +112,11 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
rqstp->rq_arg.tail[0].iov_len = 0;
}
-struct chunk_sge {
- int start; /* sge no for this chunk */
- int count; /* sge count for this chunk */
-};
-
/* Encode a read-chunk-list as an array of IB SGE
*
* Assumptions:
* - chunk[0]->position points to pages[0] at an offset of 0
- * - pages[] is not physically or virtually contigous and consists of
+ * - pages[] is not physically or virtually contiguous and consists of
* PAGE_SIZE elements.
*
* Output:
@@ -130,12 +125,12 @@ struct chunk_sge {
* chunk in the read list
*
*/
-static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
+static int map_read_chunks(struct svcxprt_rdma *xprt,
struct svc_rqst *rqstp,
struct svc_rdma_op_ctxt *head,
struct rpcrdma_msg *rmsgp,
- struct ib_sge *sge,
- struct chunk_sge *ch_sge_ary,
+ struct svc_rdma_req_map *rpl_map,
+ struct svc_rdma_req_map *chl_map,
int ch_count,
int byte_count)
{
@@ -156,22 +151,18 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
head->arg.head[0] = rqstp->rq_arg.head[0];
head->arg.tail[0] = rqstp->rq_arg.tail[0];
head->arg.pages = &head->pages[head->count];
- head->sge[0].length = head->count; /* save count of hdr pages */
+ head->hdr_count = head->count; /* save count of hdr pages */
head->arg.page_base = 0;
head->arg.page_len = ch_bytes;
head->arg.len = rqstp->rq_arg.len + ch_bytes;
head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
head->count++;
- ch_sge_ary[0].start = 0;
+ chl_map->ch[0].start = 0;
while (byte_count) {
+ rpl_map->sge[sge_no].iov_base =
+ page_address(rqstp->rq_arg.pages[page_no]) + page_off;
sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
- sge[sge_no].addr =
- ib_dma_map_page(xprt->sc_cm_id->device,
- rqstp->rq_arg.pages[page_no],
- page_off, sge_bytes,
- DMA_FROM_DEVICE);
- sge[sge_no].length = sge_bytes;
- sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ rpl_map->sge[sge_no].iov_len = sge_bytes;
/*
* Don't bump head->count here because the same page
* may be used by multiple SGE.
@@ -187,11 +178,11 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
* SGE, move to the next SGE
*/
if (ch_bytes == 0) {
- ch_sge_ary[ch_no].count =
- sge_no - ch_sge_ary[ch_no].start;
+ chl_map->ch[ch_no].count =
+ sge_no - chl_map->ch[ch_no].start;
ch_no++;
ch++;
- ch_sge_ary[ch_no].start = sge_no;
+ chl_map->ch[ch_no].start = sge_no;
ch_bytes = ch->rc_target.rs_length;
/* If bytes remaining account for next chunk */
if (byte_count) {
@@ -220,19 +211,128 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
return sge_no;
}
-static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt,
- struct ib_sge *sge,
- u64 *sgl_offset,
- int count)
+/* Map a read-chunk-list to an XDR and fast register the page-list.
+ *
+ * Assumptions:
+ * - chunk[0] position points to pages[0] at an offset of 0
+ * - pages[] will be made physically contiguous by creating a one-off memory
+ * region using the fastreg verb.
+ * - byte_count is # of bytes in read-chunk-list
+ * - ch_count is # of chunks in read-chunk-list
+ *
+ * Output:
+ * - sge array pointing into pages[] array.
+ * - chunk_sge array specifying sge index and count for each
+ * chunk in the read list
+ */
+static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head,
+ struct rpcrdma_msg *rmsgp,
+ struct svc_rdma_req_map *rpl_map,
+ struct svc_rdma_req_map *chl_map,
+ int ch_count,
+ int byte_count)
+{
+ int page_no;
+ int ch_no;
+ u32 offset;
+ struct rpcrdma_read_chunk *ch;
+ struct svc_rdma_fastreg_mr *frmr;
+ int ret = 0;
+
+ frmr = svc_rdma_get_frmr(xprt);
+ if (IS_ERR(frmr))
+ return -ENOMEM;
+
+ head->frmr = frmr;
+ head->arg.head[0] = rqstp->rq_arg.head[0];
+ head->arg.tail[0] = rqstp->rq_arg.tail[0];
+ head->arg.pages = &head->pages[head->count];
+ head->hdr_count = head->count; /* save count of hdr pages */
+ head->arg.page_base = 0;
+ head->arg.page_len = byte_count;
+ head->arg.len = rqstp->rq_arg.len + byte_count;
+ head->arg.buflen = rqstp->rq_arg.buflen + byte_count;
+
+ /* Fast register the page list */
+ frmr->kva = page_address(rqstp->rq_arg.pages[0]);
+ frmr->direction = DMA_FROM_DEVICE;
+ frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
+ frmr->map_len = byte_count;
+ frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
+ for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
+ frmr->page_list->page_list[page_no] =
+ ib_dma_map_single(xprt->sc_cm_id->device,
+ page_address(rqstp->rq_arg.pages[page_no]),
+ PAGE_SIZE, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ frmr->page_list->page_list[page_no]))
+ goto fatal_err;
+ atomic_inc(&xprt->sc_dma_used);
+ head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+ }
+ head->count += page_no;
+
+ /* rq_respages points one past arg pages */
+ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+
+ /* Create the reply and chunk maps */
+ offset = 0;
+ ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+ for (ch_no = 0; ch_no < ch_count; ch_no++) {
+ rpl_map->sge[ch_no].iov_base = frmr->kva + offset;
+ rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length;
+ chl_map->ch[ch_no].count = 1;
+ chl_map->ch[ch_no].start = ch_no;
+ offset += ch->rc_target.rs_length;
+ ch++;
+ }
+
+ ret = svc_rdma_fastreg(xprt, frmr);
+ if (ret)
+ goto fatal_err;
+
+ return ch_no;
+
+ fatal_err:
+ printk("svcrdma: error fast registering xdr for xprt %p", xprt);
+ svc_rdma_put_frmr(xprt, frmr);
+ return -EIO;
+}
+
+static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
+ struct svc_rdma_op_ctxt *ctxt,
+ struct svc_rdma_fastreg_mr *frmr,
+ struct kvec *vec,
+ u64 *sgl_offset,
+ int count)
{
int i;
ctxt->count = count;
+ ctxt->direction = DMA_FROM_DEVICE;
for (i = 0; i < count; i++) {
- ctxt->sge[i].addr = sge[i].addr;
- ctxt->sge[i].length = sge[i].length;
- *sgl_offset = *sgl_offset + sge[i].length;
+ ctxt->sge[i].length = 0; /* in case map fails */
+ if (!frmr) {
+ ctxt->sge[i].addr =
+ ib_dma_map_single(xprt->sc_cm_id->device,
+ vec[i].iov_base,
+ vec[i].iov_len,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ ctxt->sge[i].addr))
+ return -EINVAL;
+ ctxt->sge[i].lkey = xprt->sc_dma_lkey;
+ atomic_inc(&xprt->sc_dma_used);
+ } else {
+ ctxt->sge[i].addr = (unsigned long)vec[i].iov_base;
+ ctxt->sge[i].lkey = frmr->mr->lkey;
+ }
+ ctxt->sge[i].length = vec[i].iov_len;
+ *sgl_offset = *sgl_offset + vec[i].iov_len;
}
+ return 0;
}
static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
@@ -280,37 +380,44 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
struct svc_rdma_op_ctxt *hdr_ctxt)
{
struct ib_send_wr read_wr;
+ struct ib_send_wr inv_wr;
int err = 0;
int ch_no;
- struct ib_sge *sge;
int ch_count;
int byte_count;
int sge_count;
u64 sgl_offset;
struct rpcrdma_read_chunk *ch;
struct svc_rdma_op_ctxt *ctxt = NULL;
- struct svc_rdma_op_ctxt *tmp_sge_ctxt;
- struct svc_rdma_op_ctxt *tmp_ch_ctxt;
- struct chunk_sge *ch_sge_ary;
+ struct svc_rdma_req_map *rpl_map;
+ struct svc_rdma_req_map *chl_map;
/* If no read list is present, return 0 */
ch = svc_rdma_get_read_chunk(rmsgp);
if (!ch)
return 0;
- /* Allocate temporary contexts to keep SGE */
- BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge));
- tmp_sge_ctxt = svc_rdma_get_context(xprt);
- sge = tmp_sge_ctxt->sge;
- tmp_ch_ctxt = svc_rdma_get_context(xprt);
- ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
+ /* Allocate temporary reply and chunk maps */
+ rpl_map = svc_rdma_get_req_map();
+ chl_map = svc_rdma_get_req_map();
svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
if (ch_count > RPCSVC_MAXPAGES)
return -EINVAL;
- sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
- sge, ch_sge_ary,
- ch_count, byte_count);
+
+ if (!xprt->sc_frmr_pg_list_len)
+ sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+ rpl_map, chl_map, ch_count,
+ byte_count);
+ else
+ sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+ rpl_map, chl_map, ch_count,
+ byte_count);
+ if (sge_count < 0) {
+ err = -EIO;
+ goto out;
+ }
+
sgl_offset = 0;
ch_no = 0;
@@ -319,32 +426,64 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
next_sge:
ctxt = svc_rdma_get_context(xprt);
ctxt->direction = DMA_FROM_DEVICE;
+ ctxt->frmr = hdr_ctxt->frmr;
+ ctxt->read_hdr = NULL;
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
/* Prepare READ WR */
memset(&read_wr, 0, sizeof read_wr);
- ctxt->wr_op = IB_WR_RDMA_READ;
read_wr.wr_id = (unsigned long)ctxt;
read_wr.opcode = IB_WR_RDMA_READ;
+ ctxt->wr_op = read_wr.opcode;
read_wr.send_flags = IB_SEND_SIGNALED;
read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
read_wr.wr.rdma.remote_addr =
get_unaligned(&(ch->rc_target.rs_offset)) +
sgl_offset;
- read_wr.sg_list = &sge[ch_sge_ary[ch_no].start];
+ read_wr.sg_list = ctxt->sge;
read_wr.num_sge =
- rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count);
- rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start],
- &sgl_offset,
- read_wr.num_sge);
+ rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
+ err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr,
+ &rpl_map->sge[chl_map->ch[ch_no].start],
+ &sgl_offset,
+ read_wr.num_sge);
+ if (err) {
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 0);
+ goto out;
+ }
if (((ch+1)->rc_discrim == 0) &&
- (read_wr.num_sge == ch_sge_ary[ch_no].count)) {
+ (read_wr.num_sge == chl_map->ch[ch_no].count)) {
/*
* Mark the last RDMA_READ with a bit to
* indicate all RPC data has been fetched from
* the client and the RPC needs to be enqueued.
*/
set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ if (hdr_ctxt->frmr) {
+ set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+ /*
+ * Invalidate the local MR used to map the data
+ * sink.
+ */
+ if (xprt->sc_dev_caps &
+ SVCRDMA_DEVCAP_READ_W_INV) {
+ read_wr.opcode =
+ IB_WR_RDMA_READ_WITH_INV;
+ ctxt->wr_op = read_wr.opcode;
+ read_wr.ex.invalidate_rkey =
+ ctxt->frmr->mr->lkey;
+ } else {
+ /* Prepare INVALIDATE WR */
+ memset(&inv_wr, 0, sizeof inv_wr);
+ inv_wr.opcode = IB_WR_LOCAL_INV;
+ inv_wr.send_flags = IB_SEND_SIGNALED;
+ inv_wr.ex.invalidate_rkey =
+ hdr_ctxt->frmr->mr->lkey;
+ read_wr.next = &inv_wr;
+ }
+ }
ctxt->read_hdr = hdr_ctxt;
}
/* Post the read */
@@ -358,9 +497,9 @@ next_sge:
}
atomic_inc(&rdma_stat_read);
- if (read_wr.num_sge < ch_sge_ary[ch_no].count) {
- ch_sge_ary[ch_no].count -= read_wr.num_sge;
- ch_sge_ary[ch_no].start += read_wr.num_sge;
+ if (read_wr.num_sge < chl_map->ch[ch_no].count) {
+ chl_map->ch[ch_no].count -= read_wr.num_sge;
+ chl_map->ch[ch_no].start += read_wr.num_sge;
goto next_sge;
}
sgl_offset = 0;
@@ -368,8 +507,8 @@ next_sge:
}
out:
- svc_rdma_put_context(tmp_sge_ctxt, 0);
- svc_rdma_put_context(tmp_ch_ctxt, 0);
+ svc_rdma_put_req_map(rpl_map);
+ svc_rdma_put_req_map(chl_map);
/* Detach arg pages. svc_recv will replenish them */
for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
@@ -399,7 +538,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_pages[page_no] = head->pages[page_no];
}
/* Point rq_arg.pages past header */
- rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length];
+ rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
rqstp->rq_arg.page_len = head->arg.page_len;
rqstp->rq_arg.page_base = head->arg.page_base;
@@ -449,18 +588,18 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
dprintk("svcrdma: rqstp=%p\n", rqstp);
- spin_lock_bh(&rdma_xprt->sc_read_complete_lock);
+ spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
struct svc_rdma_op_ctxt,
dto_q);
list_del_init(&ctxt->dto_q);
}
- spin_unlock_bh(&rdma_xprt->sc_read_complete_lock);
- if (ctxt)
+ if (ctxt) {
+ spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
return rdma_read_complete(rqstp, ctxt);
+ }
- spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
struct svc_rdma_op_ctxt,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index fb82b1b683f..9a7a8e7ae03 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -63,52 +63,165 @@
* SGE[2..sge_count-2] data from xdr->pages[]
* SGE[sge_count-1] data from xdr->tail.
*
+ * The max SGE we need is the length of the XDR / pagesize + one for
+ * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES
+ * reserves a page for both the request and the reply header, and this
+ * array is only concerned with the reply we are assured that we have
+ * on extra page for the RPCRMDA header.
*/
-static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- struct ib_sge *sge,
- int *sge_count)
+int fast_reg_xdr(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ struct svc_rdma_req_map *vec)
+{
+ int sge_no;
+ u32 sge_bytes;
+ u32 page_bytes;
+ u32 page_off;
+ int page_no = 0;
+ u8 *frva;
+ struct svc_rdma_fastreg_mr *frmr;
+
+ frmr = svc_rdma_get_frmr(xprt);
+ if (IS_ERR(frmr))
+ return -ENOMEM;
+ vec->frmr = frmr;
+
+ /* Skip the RPCRDMA header */
+ sge_no = 1;
+
+ /* Map the head. */
+ frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK);
+ vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+ vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
+ vec->count = 2;
+ sge_no++;
+
+ /* Build the FRMR */
+ frmr->kva = frva;
+ frmr->direction = DMA_TO_DEVICE;
+ frmr->access_flags = 0;
+ frmr->map_len = PAGE_SIZE;
+ frmr->page_list_len = 1;
+ frmr->page_list->page_list[page_no] =
+ ib_dma_map_single(xprt->sc_cm_id->device,
+ (void *)xdr->head[0].iov_base,
+ PAGE_SIZE, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ frmr->page_list->page_list[page_no]))
+ goto fatal_err;
+ atomic_inc(&xprt->sc_dma_used);
+
+ page_off = xdr->page_base;
+ page_bytes = xdr->page_len + page_off;
+ if (!page_bytes)
+ goto encode_tail;
+
+ /* Map the pages */
+ vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+ vec->sge[sge_no].iov_len = page_bytes;
+ sge_no++;
+ while (page_bytes) {
+ struct page *page;
+
+ page = xdr->pages[page_no++];
+ sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
+ page_bytes -= sge_bytes;
+
+ frmr->page_list->page_list[page_no] =
+ ib_dma_map_page(xprt->sc_cm_id->device, page, 0,
+ PAGE_SIZE, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ frmr->page_list->page_list[page_no]))
+ goto fatal_err;
+
+ atomic_inc(&xprt->sc_dma_used);
+ page_off = 0; /* reset for next time through loop */
+ frmr->map_len += PAGE_SIZE;
+ frmr->page_list_len++;
+ }
+ vec->count++;
+
+ encode_tail:
+ /* Map tail */
+ if (0 == xdr->tail[0].iov_len)
+ goto done;
+
+ vec->count++;
+ vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
+
+ if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) ==
+ ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) {
+ /*
+ * If head and tail use the same page, we don't need
+ * to map it again.
+ */
+ vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+ } else {
+ void *va;
+
+ /* Map another page for the tail */
+ page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+ va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK);
+ vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+
+ frmr->page_list->page_list[page_no] =
+ ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ frmr->page_list->page_list[page_no]))
+ goto fatal_err;
+ atomic_inc(&xprt->sc_dma_used);
+ frmr->map_len += PAGE_SIZE;
+ frmr->page_list_len++;
+ }
+
+ done:
+ if (svc_rdma_fastreg(xprt, frmr))
+ goto fatal_err;
+
+ return 0;
+
+ fatal_err:
+ printk("svcrdma: Error fast registering memory for xprt %p\n", xprt);
+ svc_rdma_put_frmr(xprt, frmr);
+ return -EIO;
+}
+
+static int map_xdr(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ struct svc_rdma_req_map *vec)
{
- /* Max we need is the length of the XDR / pagesize + one for
- * head + one for tail + one for RPCRDMA header
- */
int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
int sge_no;
- u32 byte_count = xdr->len;
u32 sge_bytes;
u32 page_bytes;
- int page_off;
+ u32 page_off;
int page_no;
+ BUG_ON(xdr->len !=
+ (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
+
+ if (xprt->sc_frmr_pg_list_len)
+ return fast_reg_xdr(xprt, xdr, vec);
+
/* Skip the first sge, this is for the RPCRDMA header */
sge_no = 1;
/* Head SGE */
- sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device,
- xdr->head[0].iov_base,
- xdr->head[0].iov_len,
- DMA_TO_DEVICE);
- sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len);
- byte_count -= sge_bytes;
- sge[sge_no].length = sge_bytes;
- sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+ vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
sge_no++;
/* pages SGE */
page_no = 0;
page_bytes = xdr->page_len;
page_off = xdr->page_base;
- while (byte_count && page_bytes) {
- sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off));
- sge[sge_no].addr =
- ib_dma_map_page(xprt->sc_cm_id->device,
- xdr->pages[page_no], page_off,
- sge_bytes, DMA_TO_DEVICE);
- sge_bytes = min(sge_bytes, page_bytes);
- byte_count -= sge_bytes;
+ while (page_bytes) {
+ vec->sge[sge_no].iov_base =
+ page_address(xdr->pages[page_no]) + page_off;
+ sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
page_bytes -= sge_bytes;
- sge[sge_no].length = sge_bytes;
- sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ vec->sge[sge_no].iov_len = sge_bytes;
sge_no++;
page_no++;
@@ -116,36 +229,27 @@ static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
}
/* Tail SGE */
- if (byte_count && xdr->tail[0].iov_len) {
- sge[sge_no].addr =
- ib_dma_map_single(xprt->sc_cm_id->device,
- xdr->tail[0].iov_base,
- xdr->tail[0].iov_len,
- DMA_TO_DEVICE);
- sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len);
- byte_count -= sge_bytes;
- sge[sge_no].length = sge_bytes;
- sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ if (xdr->tail[0].iov_len) {
+ vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+ vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
sge_no++;
}
BUG_ON(sge_no > sge_max);
- BUG_ON(byte_count != 0);
-
- *sge_count = sge_no;
- return sge;
+ vec->count = sge_no;
+ return 0;
}
-
/* Assumptions:
+ * - We are using FRMR
+ * - or -
* - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
*/
static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
u32 rmr, u64 to,
u32 xdr_off, int write_len,
- struct ib_sge *xdr_sge, int sge_count)
+ struct svc_rdma_req_map *vec)
{
- struct svc_rdma_op_ctxt *tmp_sge_ctxt;
struct ib_send_wr write_wr;
struct ib_sge *sge;
int xdr_sge_no;
@@ -154,25 +258,23 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
int sge_off;
int bc;
struct svc_rdma_op_ctxt *ctxt;
- int ret = 0;
- BUG_ON(sge_count > RPCSVC_MAXPAGES);
+ BUG_ON(vec->count > RPCSVC_MAXPAGES);
dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
- "write_len=%d, xdr_sge=%p, sge_count=%d\n",
+ "write_len=%d, vec->sge=%p, vec->count=%lu\n",
rmr, (unsigned long long)to, xdr_off,
- write_len, xdr_sge, sge_count);
+ write_len, vec->sge, vec->count);
ctxt = svc_rdma_get_context(xprt);
- ctxt->count = 0;
- tmp_sge_ctxt = svc_rdma_get_context(xprt);
- sge = tmp_sge_ctxt->sge;
+ ctxt->direction = DMA_TO_DEVICE;
+ sge = ctxt->sge;
/* Find the SGE associated with xdr_off */
- for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count;
+ for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
xdr_sge_no++) {
- if (xdr_sge[xdr_sge_no].length > bc)
+ if (vec->sge[xdr_sge_no].iov_len > bc)
break;
- bc -= xdr_sge[xdr_sge_no].length;
+ bc -= vec->sge[xdr_sge_no].iov_len;
}
sge_off = bc;
@@ -180,22 +282,35 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
sge_no = 0;
/* Copy the remaining SGE */
- while (bc != 0 && xdr_sge_no < sge_count) {
- sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off;
- sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey;
- sge_bytes = min((size_t)bc,
- (size_t)(xdr_sge[xdr_sge_no].length-sge_off));
+ while (bc != 0) {
+ sge_bytes = min_t(size_t,
+ bc, vec->sge[xdr_sge_no].iov_len-sge_off);
sge[sge_no].length = sge_bytes;
-
+ if (!vec->frmr) {
+ sge[sge_no].addr =
+ ib_dma_map_single(xprt->sc_cm_id->device,
+ (void *)
+ vec->sge[xdr_sge_no].iov_base + sge_off,
+ sge_bytes, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ sge[sge_no].addr))
+ goto err;
+ atomic_inc(&xprt->sc_dma_used);
+ sge[sge_no].lkey = xprt->sc_dma_lkey;
+ } else {
+ sge[sge_no].addr = (unsigned long)
+ vec->sge[xdr_sge_no].iov_base + sge_off;
+ sge[sge_no].lkey = vec->frmr->mr->lkey;
+ }
+ ctxt->count++;
+ ctxt->frmr = vec->frmr;
sge_off = 0;
sge_no++;
xdr_sge_no++;
+ BUG_ON(xdr_sge_no > vec->count);
bc -= sge_bytes;
}
- BUG_ON(bc != 0);
- BUG_ON(xdr_sge_no > sge_count);
-
/* Prepare WRITE WR */
memset(&write_wr, 0, sizeof write_wr);
ctxt->wr_op = IB_WR_RDMA_WRITE;
@@ -209,21 +324,20 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
/* Post It */
atomic_inc(&rdma_stat_write);
- if (svc_rdma_send(xprt, &write_wr)) {
- svc_rdma_put_context(ctxt, 1);
- /* Fatal error, close transport */
- ret = -EIO;
- }
- svc_rdma_put_context(tmp_sge_ctxt, 0);
- return ret;
+ if (svc_rdma_send(xprt, &write_wr))
+ goto err;
+ return 0;
+ err:
+ svc_rdma_put_context(ctxt, 0);
+ /* Fatal error, close transport */
+ return -EIO;
}
static int send_write_chunks(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rdma_argp,
struct rpcrdma_msg *rdma_resp,
struct svc_rqst *rqstp,
- struct ib_sge *sge,
- int sge_count)
+ struct svc_rdma_req_map *vec)
{
u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
int write_len;
@@ -241,7 +355,10 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[1];
- max_write = xprt->sc_max_sge * PAGE_SIZE;
+ if (vec->frmr)
+ max_write = vec->frmr->map_len;
+ else
+ max_write = xprt->sc_max_sge * PAGE_SIZE;
/* Write chunks start at the pagelist */
for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
@@ -269,8 +386,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
rs_offset + chunk_off,
xdr_off,
this_write,
- sge,
- sge_count);
+ vec);
if (ret) {
dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
ret);
@@ -292,8 +408,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rdma_argp,
struct rpcrdma_msg *rdma_resp,
struct svc_rqst *rqstp,
- struct ib_sge *sge,
- int sge_count)
+ struct svc_rdma_req_map *vec)
{
u32 xfer_len = rqstp->rq_res.len;
int write_len;
@@ -314,7 +429,10 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[2];
- max_write = xprt->sc_max_sge * PAGE_SIZE;
+ if (vec->frmr)
+ max_write = vec->frmr->map_len;
+ else
+ max_write = xprt->sc_max_sge * PAGE_SIZE;
/* xdr offset starts at RPC message */
for (xdr_off = 0, chunk_no = 0;
@@ -324,7 +442,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
ch = &arg_ary->wc_array[chunk_no].wc_target;
write_len = min(xfer_len, ch->rs_length);
-
/* Prepare the reply chunk given the length actually
* written */
rs_offset = get_unaligned(&(ch->rs_offset));
@@ -341,8 +458,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
rs_offset + chunk_off,
xdr_off,
this_write,
- sge,
- sge_count);
+ vec);
if (ret) {
dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
ret);
@@ -380,10 +496,11 @@ static int send_reply(struct svcxprt_rdma *rdma,
struct page *page,
struct rpcrdma_msg *rdma_resp,
struct svc_rdma_op_ctxt *ctxt,
- int sge_count,
+ struct svc_rdma_req_map *vec,
int byte_count)
{
struct ib_send_wr send_wr;
+ struct ib_send_wr inv_wr;
int sge_no;
int sge_bytes;
int page_no;
@@ -403,20 +520,45 @@ static int send_reply(struct svcxprt_rdma *rdma,
/* Prepare the context */
ctxt->pages[0] = page;
ctxt->count = 1;
+ ctxt->frmr = vec->frmr;
+ if (vec->frmr)
+ set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+ else
+ clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
/* Prepare the SGE for the RPCRDMA Header */
ctxt->sge[0].addr =
ib_dma_map_page(rdma->sc_cm_id->device,
page, 0, PAGE_SIZE, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
+ goto err;
+ atomic_inc(&rdma->sc_dma_used);
+
ctxt->direction = DMA_TO_DEVICE;
+
ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
- ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
+ ctxt->sge[0].lkey = rdma->sc_dma_lkey;
/* Determine how many of our SGE are to be transmitted */
- for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) {
- sge_bytes = min((size_t)ctxt->sge[sge_no].length,
- (size_t)byte_count);
+ for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
+ sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
byte_count -= sge_bytes;
+ if (!vec->frmr) {
+ ctxt->sge[sge_no].addr =
+ ib_dma_map_single(rdma->sc_cm_id->device,
+ vec->sge[sge_no].iov_base,
+ sge_bytes, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdma->sc_cm_id->device,
+ ctxt->sge[sge_no].addr))
+ goto err;
+ atomic_inc(&rdma->sc_dma_used);
+ ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+ } else {
+ ctxt->sge[sge_no].addr = (unsigned long)
+ vec->sge[sge_no].iov_base;
+ ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey;
+ }
+ ctxt->sge[sge_no].length = sge_bytes;
}
BUG_ON(byte_count != 0);
@@ -428,9 +570,16 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
ctxt->count++;
rqstp->rq_respages[page_no] = NULL;
+ /*
+ * If there are more pages than SGE, terminate SGE
+ * list so that svc_rdma_unmap_dma doesn't attempt to
+ * unmap garbage.
+ */
+ if (page_no+1 >= sge_no)
+ ctxt->sge[page_no+1].length = 0;
}
-
BUG_ON(sge_no > rdma->sc_max_sge);
+ BUG_ON(sge_no > ctxt->count);
memset(&send_wr, 0, sizeof send_wr);
ctxt->wr_op = IB_WR_SEND;
send_wr.wr_id = (unsigned long)ctxt;
@@ -438,12 +587,26 @@ static int send_reply(struct svcxprt_rdma *rdma,
send_wr.num_sge = sge_no;
send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
+ if (vec->frmr) {
+ /* Prepare INVALIDATE WR */
+ memset(&inv_wr, 0, sizeof inv_wr);
+ inv_wr.opcode = IB_WR_LOCAL_INV;
+ inv_wr.send_flags = IB_SEND_SIGNALED;
+ inv_wr.ex.invalidate_rkey =
+ vec->frmr->mr->lkey;
+ send_wr.next = &inv_wr;
+ }
ret = svc_rdma_send(rdma, &send_wr);
if (ret)
- svc_rdma_put_context(ctxt, 1);
+ goto err;
- return ret;
+ return 0;
+
+ err:
+ svc_rdma_put_frmr(rdma, vec->frmr);
+ svc_rdma_put_context(ctxt, 1);
+ return -EIO;
}
void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
@@ -473,21 +636,22 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
enum rpcrdma_proc reply_type;
int ret;
int inline_bytes;
- struct ib_sge *sge;
- int sge_count = 0;
struct page *res_page;
struct svc_rdma_op_ctxt *ctxt;
+ struct svc_rdma_req_map *vec;
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
/* Get the RDMA request header. */
rdma_argp = xdr_start(&rqstp->rq_arg);
- /* Build an SGE for the XDR */
+ /* Build an req vec for the XDR */
ctxt = svc_rdma_get_context(rdma);
ctxt->direction = DMA_TO_DEVICE;
- sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count);
-
+ vec = svc_rdma_get_req_map();
+ ret = map_xdr(rdma, &rqstp->rq_res, vec);
+ if (ret)
+ goto err0;
inline_bytes = rqstp->rq_res.len;
/* Create the RDMA response header */
@@ -503,30 +667,34 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
/* Send any write-chunk data and build resp write-list */
ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
- rqstp, sge, sge_count);
+ rqstp, vec);
if (ret < 0) {
printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
ret);
- goto error;
+ goto err1;
}
inline_bytes -= ret;
/* Send any reply-list data and update resp reply-list */
ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
- rqstp, sge, sge_count);
+ rqstp, vec);
if (ret < 0) {
printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
ret);
- goto error;
+ goto err1;
}
inline_bytes -= ret;
- ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count,
+ ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
inline_bytes);
+ svc_rdma_put_req_map(vec);
dprintk("svcrdma: send_reply returns %d\n", ret);
return ret;
- error:
- svc_rdma_put_context(ctxt, 0);
+
+ err1:
put_page(res_page);
+ err0:
+ svc_rdma_put_req_map(vec);
+ svc_rdma_put_context(ctxt, 0);
return ret;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index e132509d1db..6fb493cbd29 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -84,70 +84,46 @@ struct svc_xprt_class svc_rdma_class = {
.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
};
-static int rdma_bump_context_cache(struct svcxprt_rdma *xprt)
+/* WR context cache. Created in svc_rdma.c */
+extern struct kmem_cache *svc_rdma_ctxt_cachep;
+
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
{
- int target;
- int at_least_one = 0;
struct svc_rdma_op_ctxt *ctxt;
- target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump,
- xprt->sc_ctxt_max);
-
- spin_lock_bh(&xprt->sc_ctxt_lock);
- while (xprt->sc_ctxt_cnt < target) {
- xprt->sc_ctxt_cnt++;
- spin_unlock_bh(&xprt->sc_ctxt_lock);
-
- ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
-
- spin_lock_bh(&xprt->sc_ctxt_lock);
- if (ctxt) {
- at_least_one = 1;
- INIT_LIST_HEAD(&ctxt->free_list);
- list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
- } else {
- /* kmalloc failed...give up for now */
- xprt->sc_ctxt_cnt--;
+ while (1) {
+ ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL);
+ if (ctxt)
break;
- }
+ schedule_timeout_uninterruptible(msecs_to_jiffies(500));
}
- spin_unlock_bh(&xprt->sc_ctxt_lock);
- dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n",
- xprt->sc_ctxt_max, xprt->sc_ctxt_cnt);
- return at_least_one;
+ ctxt->xprt = xprt;
+ INIT_LIST_HEAD(&ctxt->dto_q);
+ ctxt->count = 0;
+ ctxt->frmr = NULL;
+ atomic_inc(&xprt->sc_ctxt_used);
+ return ctxt;
}
-struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
{
- struct svc_rdma_op_ctxt *ctxt;
-
- while (1) {
- spin_lock_bh(&xprt->sc_ctxt_lock);
- if (unlikely(list_empty(&xprt->sc_ctxt_free))) {
- /* Try to bump my cache. */
- spin_unlock_bh(&xprt->sc_ctxt_lock);
-
- if (rdma_bump_context_cache(xprt))
- continue;
-
- printk(KERN_INFO "svcrdma: sleeping waiting for "
- "context memory on xprt=%p\n",
- xprt);
- schedule_timeout_uninterruptible(msecs_to_jiffies(500));
- continue;
+ struct svcxprt_rdma *xprt = ctxt->xprt;
+ int i;
+ for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
+ /*
+ * Unmap the DMA addr in the SGE if the lkey matches
+ * the sc_dma_lkey, otherwise, ignore it since it is
+ * an FRMR lkey and will be unmapped later when the
+ * last WR that uses it completes.
+ */
+ if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
+ atomic_dec(&xprt->sc_dma_used);
+ ib_dma_unmap_single(xprt->sc_cm_id->device,
+ ctxt->sge[i].addr,
+ ctxt->sge[i].length,
+ ctxt->direction);
}
- ctxt = list_entry(xprt->sc_ctxt_free.next,
- struct svc_rdma_op_ctxt,
- free_list);
- list_del_init(&ctxt->free_list);
- spin_unlock_bh(&xprt->sc_ctxt_lock);
- ctxt->xprt = xprt;
- INIT_LIST_HEAD(&ctxt->dto_q);
- ctxt->count = 0;
- atomic_inc(&xprt->sc_ctxt_used);
- break;
}
- return ctxt;
}
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -161,18 +137,37 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
for (i = 0; i < ctxt->count; i++)
put_page(ctxt->pages[i]);
- for (i = 0; i < ctxt->count; i++)
- ib_dma_unmap_single(xprt->sc_cm_id->device,
- ctxt->sge[i].addr,
- ctxt->sge[i].length,
- ctxt->direction);
-
- spin_lock_bh(&xprt->sc_ctxt_lock);
- list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ kmem_cache_free(svc_rdma_ctxt_cachep, ctxt);
atomic_dec(&xprt->sc_ctxt_used);
}
+/* Temporary NFS request map cache. Created in svc_rdma.c */
+extern struct kmem_cache *svc_rdma_map_cachep;
+
+/*
+ * Temporary NFS req mappings are shared across all transport
+ * instances. These are short lived and should be bounded by the number
+ * of concurrent server threads * depth of the SQ.
+ */
+struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+{
+ struct svc_rdma_req_map *map;
+ while (1) {
+ map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL);
+ if (map)
+ break;
+ schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+ }
+ map->count = 0;
+ map->frmr = NULL;
+ return map;
+}
+
+void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
+{
+ kmem_cache_free(svc_rdma_map_cachep, map);
+}
+
/* ib_cq event handler */
static void cq_event_handler(struct ib_event *event, void *context)
{
@@ -302,6 +297,7 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
ctxt->wc_status = wc.status;
ctxt->byte_len = wc.byte_len;
+ svc_rdma_unmap_dma(ctxt);
if (wc.status != IB_WC_SUCCESS) {
/* Close the transport */
dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
@@ -330,6 +326,50 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
}
/*
+ * Processs a completion context
+ */
+static void process_context(struct svcxprt_rdma *xprt,
+ struct svc_rdma_op_ctxt *ctxt)
+{
+ svc_rdma_unmap_dma(ctxt);
+
+ switch (ctxt->wr_op) {
+ case IB_WR_SEND:
+ if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+ svc_rdma_put_frmr(xprt, ctxt->frmr);
+ svc_rdma_put_context(ctxt, 1);
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ svc_rdma_put_context(ctxt, 0);
+ break;
+
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
+ if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+ struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
+ BUG_ON(!read_hdr);
+ if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+ svc_rdma_put_frmr(xprt, ctxt->frmr);
+ spin_lock_bh(&xprt->sc_rq_dto_lock);
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ list_add_tail(&read_hdr->dto_q,
+ &xprt->sc_read_complete_q);
+ spin_unlock_bh(&xprt->sc_rq_dto_lock);
+ svc_xprt_enqueue(&xprt->sc_xprt);
+ }
+ svc_rdma_put_context(ctxt, 0);
+ break;
+
+ default:
+ printk(KERN_ERR "svcrdma: unexpected completion type, "
+ "opcode=%d\n",
+ ctxt->wr_op);
+ break;
+ }
+}
+
+/*
* Send Queue Completion Handler - potentially called on interrupt context.
*
* Note that caller must hold a transport reference.
@@ -341,16 +381,12 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
struct ib_cq *cq = xprt->sc_sq_cq;
int ret;
-
if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
return;
ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
atomic_inc(&rdma_stat_sq_poll);
while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
- ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
- xprt = ctxt->xprt;
-
if (wc.status != IB_WC_SUCCESS)
/* Close the transport */
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -359,32 +395,10 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
atomic_dec(&xprt->sc_sq_count);
wake_up(&xprt->sc_send_wait);
- switch (ctxt->wr_op) {
- case IB_WR_SEND:
- case IB_WR_RDMA_WRITE:
- svc_rdma_put_context(ctxt, 1);
- break;
-
- case IB_WR_RDMA_READ:
- if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
- struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
- BUG_ON(!read_hdr);
- set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
- spin_lock_bh(&xprt->sc_read_complete_lock);
- list_add_tail(&read_hdr->dto_q,
- &xprt->sc_read_complete_q);
- spin_unlock_bh(&xprt->sc_read_complete_lock);
- svc_xprt_enqueue(&xprt->sc_xprt);
- }
- svc_rdma_put_context(ctxt, 0);
- break;
+ ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
+ if (ctxt)
+ process_context(xprt, ctxt);
- default:
- printk(KERN_ERR "svcrdma: unexpected completion type, "
- "opcode=%d, status=%d\n",
- wc.opcode, wc.status);
- break;
- }
svc_xprt_put(&xprt->sc_xprt);
}
@@ -423,40 +437,6 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
tasklet_schedule(&dto_tasklet);
}
-static void create_context_cache(struct svcxprt_rdma *xprt,
- int ctxt_count, int ctxt_bump, int ctxt_max)
-{
- struct svc_rdma_op_ctxt *ctxt;
- int i;
-
- xprt->sc_ctxt_max = ctxt_max;
- xprt->sc_ctxt_bump = ctxt_bump;
- xprt->sc_ctxt_cnt = 0;
- atomic_set(&xprt->sc_ctxt_used, 0);
-
- INIT_LIST_HEAD(&xprt->sc_ctxt_free);
- for (i = 0; i < ctxt_count; i++) {
- ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
- if (ctxt) {
- INIT_LIST_HEAD(&ctxt->free_list);
- list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
- xprt->sc_ctxt_cnt++;
- }
- }
-}
-
-static void destroy_context_cache(struct svcxprt_rdma *xprt)
-{
- while (!list_empty(&xprt->sc_ctxt_free)) {
- struct svc_rdma_op_ctxt *ctxt;
- ctxt = list_entry(xprt->sc_ctxt_free.next,
- struct svc_rdma_op_ctxt,
- free_list);
- list_del_init(&ctxt->free_list);
- kfree(ctxt);
- }
-}
-
static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
int listener)
{
@@ -469,12 +449,12 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
- spin_lock_init(&cma_xprt->sc_read_complete_lock);
- spin_lock_init(&cma_xprt->sc_ctxt_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
+ spin_lock_init(&cma_xprt->sc_frmr_q_lock);
cma_xprt->sc_ord = svcrdma_ord;
@@ -482,21 +462,9 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
cma_xprt->sc_max_requests = svcrdma_max_requests;
cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
atomic_set(&cma_xprt->sc_sq_count, 0);
+ atomic_set(&cma_xprt->sc_ctxt_used, 0);
- if (!listener) {
- int reqs = cma_xprt->sc_max_requests;
- create_context_cache(cma_xprt,
- reqs << 1, /* starting size */
- reqs, /* bump amount */
- reqs +
- cma_xprt->sc_sq_depth +
- RPCRDMA_MAX_THREADS + 1); /* max */
- if (list_empty(&cma_xprt->sc_ctxt_free)) {
- kfree(cma_xprt);
- return NULL;
- }
- clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
- } else
+ if (listener)
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
return cma_xprt;
@@ -520,7 +488,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
struct ib_recv_wr recv_wr, *bad_recv_wr;
struct svc_rdma_op_ctxt *ctxt;
struct page *page;
- unsigned long pa;
+ dma_addr_t pa;
int sge_no;
int buflen;
int ret;
@@ -535,9 +503,12 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
pa = ib_dma_map_page(xprt->sc_cm_id->device,
page, 0, PAGE_SIZE,
DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
+ goto err_put_ctxt;
+ atomic_inc(&xprt->sc_dma_used);
ctxt->sge[sge_no].addr = pa;
ctxt->sge[sge_no].length = PAGE_SIZE;
- ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
buflen += PAGE_SIZE;
}
ctxt->count = sge_no;
@@ -553,6 +524,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
svc_rdma_put_context(ctxt, 1);
}
return ret;
+
+ err_put_ctxt:
+ svc_rdma_put_context(ctxt, 1);
+ return -ENOMEM;
}
/*
@@ -566,7 +541,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
* will call the recvfrom method on the listen xprt which will accept the new
* connection.
*/
-static void handle_connect_req(struct rdma_cm_id *new_cma_id)
+static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
{
struct svcxprt_rdma *listen_xprt = new_cma_id->context;
struct svcxprt_rdma *newxprt;
@@ -583,6 +558,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id)
dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
newxprt, newxprt->sc_cm_id, listen_xprt);
+ /* Save client advertised inbound read limit for use later in accept. */
+ newxprt->sc_ord = client_ird;
+
/* Set the local and remote addresses in the transport */
sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
@@ -619,7 +597,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
case RDMA_CM_EVENT_CONNECT_REQUEST:
dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
"event=%d\n", cma_id, cma_id->context, event->event);
- handle_connect_req(cma_id);
+ handle_connect_req(cma_id,
+ event->param.conn.initiator_depth);
break;
case RDMA_CM_EVENT_ESTABLISHED:
@@ -739,6 +718,97 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
return ERR_PTR(ret);
}
+static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
+{
+ struct ib_mr *mr;
+ struct ib_fast_reg_page_list *pl;
+ struct svc_rdma_fastreg_mr *frmr;
+
+ frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
+ if (!frmr)
+ goto err;
+
+ mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
+ if (!mr)
+ goto err_free_frmr;
+
+ pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
+ RPCSVC_MAXPAGES);
+ if (!pl)
+ goto err_free_mr;
+
+ frmr->mr = mr;
+ frmr->page_list = pl;
+ INIT_LIST_HEAD(&frmr->frmr_list);
+ return frmr;
+
+ err_free_mr:
+ ib_dereg_mr(mr);
+ err_free_frmr:
+ kfree(frmr);
+ err:
+ return ERR_PTR(-ENOMEM);
+}
+
+static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_fastreg_mr *frmr;
+
+ while (!list_empty(&xprt->sc_frmr_q)) {
+ frmr = list_entry(xprt->sc_frmr_q.next,
+ struct svc_rdma_fastreg_mr, frmr_list);
+ list_del_init(&frmr->frmr_list);
+ ib_dereg_mr(frmr->mr);
+ ib_free_fast_reg_page_list(frmr->page_list);
+ kfree(frmr);
+ }
+}
+
+struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
+{
+ struct svc_rdma_fastreg_mr *frmr = NULL;
+
+ spin_lock_bh(&rdma->sc_frmr_q_lock);
+ if (!list_empty(&rdma->sc_frmr_q)) {
+ frmr = list_entry(rdma->sc_frmr_q.next,
+ struct svc_rdma_fastreg_mr, frmr_list);
+ list_del_init(&frmr->frmr_list);
+ frmr->map_len = 0;
+ frmr->page_list_len = 0;
+ }
+ spin_unlock_bh(&rdma->sc_frmr_q_lock);
+ if (frmr)
+ return frmr;
+
+ return rdma_alloc_frmr(rdma);
+}
+
+static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
+ struct svc_rdma_fastreg_mr *frmr)
+{
+ int page_no;
+ for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
+ dma_addr_t addr = frmr->page_list->page_list[page_no];
+ if (ib_dma_mapping_error(frmr->mr->device, addr))
+ continue;
+ atomic_dec(&xprt->sc_dma_used);
+ ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE,
+ frmr->direction);
+ }
+}
+
+void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
+ struct svc_rdma_fastreg_mr *frmr)
+{
+ if (frmr) {
+ frmr_unmap_dma(rdma, frmr);
+ spin_lock_bh(&rdma->sc_frmr_q_lock);
+ BUG_ON(!list_empty(&frmr->frmr_list));
+ list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
+ spin_unlock_bh(&rdma->sc_frmr_q_lock);
+ }
+}
+
/*
* This is the xpo_recvfrom function for listening endpoints. Its
* purpose is to accept incoming connections. The CMA callback handler
@@ -757,6 +827,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
struct ib_device_attr devattr;
+ int dma_mr_acc;
+ int need_dma_mr;
int ret;
int i;
@@ -793,8 +865,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
(size_t)svcrdma_max_requests);
newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
- newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom,
- (size_t)svcrdma_ord);
+ /*
+ * Limit ORD based on client limit, local device limit, and
+ * configured svcrdma limit.
+ */
+ newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord);
+ newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
if (IS_ERR(newxprt->sc_pd)) {
@@ -868,15 +944,77 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
}
newxprt->sc_qp = newxprt->sc_cm_id->qp;
- /* Register all of physical memory */
- newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd,
- IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE);
- if (IS_ERR(newxprt->sc_phys_mr)) {
- dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret);
+ /*
+ * Use the most secure set of MR resources based on the
+ * transport type and available memory management features in
+ * the device. Here's the table implemented below:
+ *
+ * Fast Global DMA Remote WR
+ * Reg LKEY MR Access
+ * Sup'd Sup'd Needed Needed
+ *
+ * IWARP N N Y Y
+ * N Y Y Y
+ * Y N Y N
+ * Y Y N -
+ *
+ * IB N N Y N
+ * N Y N -
+ * Y N Y N
+ * Y Y N -
+ *
+ * NB: iWARP requires remote write access for the data sink
+ * of an RDMA_READ. IB does not.
+ */
+ if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+ newxprt->sc_frmr_pg_list_len =
+ devattr.max_fast_reg_page_list_len;
+ newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
+ }
+
+ /*
+ * Determine if a DMA MR is required and if so, what privs are required
+ */
+ switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
+ case RDMA_TRANSPORT_IWARP:
+ newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
+ if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
+ need_dma_mr = 1;
+ dma_mr_acc =
+ (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE);
+ } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+ need_dma_mr = 1;
+ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+ } else
+ need_dma_mr = 0;
+ break;
+ case RDMA_TRANSPORT_IB:
+ if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+ need_dma_mr = 1;
+ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+ } else
+ need_dma_mr = 0;
+ break;
+ default:
goto errout;
}
+ /* Create the DMA MR if needed, otherwise, use the DMA LKEY */
+ if (need_dma_mr) {
+ /* Register all of physical memory */
+ newxprt->sc_phys_mr =
+ ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
+ if (IS_ERR(newxprt->sc_phys_mr)) {
+ dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
+ ret);
+ goto errout;
+ }
+ newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
+ } else
+ newxprt->sc_dma_lkey =
+ newxprt->sc_cm_id->device->local_dma_lkey;
+
/* Post receive buffers */
for (i = 0; i < newxprt->sc_max_requests; i++) {
ret = svc_rdma_post_recv(newxprt);
@@ -987,7 +1125,6 @@ static void __svc_rdma_free(struct work_struct *work)
* cm_id because the device ptr is needed to unmap the dma in
* svc_rdma_put_context.
*/
- spin_lock_bh(&rdma->sc_read_complete_lock);
while (!list_empty(&rdma->sc_read_complete_q)) {
struct svc_rdma_op_ctxt *ctxt;
ctxt = list_entry(rdma->sc_read_complete_q.next,
@@ -996,10 +1133,8 @@ static void __svc_rdma_free(struct work_struct *work)
list_del_init(&ctxt->dto_q);
svc_rdma_put_context(ctxt, 1);
}
- spin_unlock_bh(&rdma->sc_read_complete_lock);
/* Destroy queued, but not processed recv completions */
- spin_lock_bh(&rdma->sc_rq_dto_lock);
while (!list_empty(&rdma->sc_rq_dto_q)) {
struct svc_rdma_op_ctxt *ctxt;
ctxt = list_entry(rdma->sc_rq_dto_q.next,
@@ -1008,10 +1143,13 @@ static void __svc_rdma_free(struct work_struct *work)
list_del_init(&ctxt->dto_q);
svc_rdma_put_context(ctxt, 1);
}
- spin_unlock_bh(&rdma->sc_rq_dto_lock);
/* Warn if we leaked a resource or under-referenced */
WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
+ WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
+
+ /* De-allocate fastreg mr */
+ rdma_dealloc_frmr_q(rdma);
/* Destroy the QP if present (not a listener) */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
@@ -1032,7 +1170,6 @@ static void __svc_rdma_free(struct work_struct *work)
/* Destroy the CM ID */
rdma_destroy_id(rdma->sc_cm_id);
- destroy_context_cache(rdma);
kfree(rdma);
}
@@ -1067,21 +1204,59 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
return 1;
}
+/*
+ * Attempt to register the kvec representing the RPC memory with the
+ * device.
+ *
+ * Returns:
+ * NULL : The device does not support fastreg or there were no more
+ * fastreg mr.
+ * frmr : The kvec register request was successfully posted.
+ * <0 : An error was encountered attempting to register the kvec.
+ */
+int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
+ struct svc_rdma_fastreg_mr *frmr)
+{
+ struct ib_send_wr fastreg_wr;
+ u8 key;
+
+ /* Bump the key */
+ key = (u8)(frmr->mr->lkey & 0x000000FF);
+ ib_update_fast_reg_key(frmr->mr, ++key);
+
+ /* Prepare FASTREG WR */
+ memset(&fastreg_wr, 0, sizeof fastreg_wr);
+ fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+ fastreg_wr.send_flags = IB_SEND_SIGNALED;
+ fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
+ fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
+ fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
+ fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ fastreg_wr.wr.fast_reg.length = frmr->map_len;
+ fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
+ fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
+ return svc_rdma_send(xprt, &fastreg_wr);
+}
+
int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
{
- struct ib_send_wr *bad_wr;
+ struct ib_send_wr *bad_wr, *n_wr;
+ int wr_count;
+ int i;
int ret;
if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
return -ENOTCONN;
BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
- BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
- wr->opcode);
+ wr_count = 1;
+ for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
+ wr_count++;
+
/* If the SQ is full, wait until an SQ entry is available */
while (1) {
spin_lock_bh(&xprt->sc_lock);
- if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
+ if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
spin_unlock_bh(&xprt->sc_lock);
atomic_inc(&rdma_stat_sq_starve);
@@ -1096,19 +1271,26 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
return 0;
continue;
}
- /* Bumped used SQ WR count and post */
- svc_xprt_get(&xprt->sc_xprt);
+ /* Take a transport ref for each WR posted */
+ for (i = 0; i < wr_count; i++)
+ svc_xprt_get(&xprt->sc_xprt);
+
+ /* Bump used SQ WR count and post */
+ atomic_add(wr_count, &xprt->sc_sq_count);
ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
- if (!ret)
- atomic_inc(&xprt->sc_sq_count);
- else {
- svc_xprt_put(&xprt->sc_xprt);
+ if (ret) {
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ atomic_sub(wr_count, &xprt->sc_sq_count);
+ for (i = 0; i < wr_count; i ++)
+ svc_xprt_put(&xprt->sc_xprt);
dprintk("svcrdma: failed to post SQ WR rc=%d, "
"sc_sq_count=%d, sc_sq_depth=%d\n",
ret, atomic_read(&xprt->sc_sq_count),
xprt->sc_sq_depth);
}
spin_unlock_bh(&xprt->sc_lock);
+ if (ret)
+ wake_up(&xprt->sc_send_wait);
break;
}
return ret;
@@ -1134,7 +1316,12 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
/* Prepare SGE for local address */
sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
- sge.lkey = xprt->sc_phys_mr->lkey;
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) {
+ put_page(p);
+ return;
+ }
+ atomic_inc(&xprt->sc_dma_used);
+ sge.lkey = xprt->sc_dma_lkey;
sge.length = length;
ctxt = svc_rdma_get_context(xprt);
@@ -1155,6 +1342,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
if (ret) {
dprintk("svcrdma: Error %d posting send for protocol error\n",
ret);
+ ib_dma_unmap_page(xprt->sc_cm_id->device,
+ sge.addr, PAGE_SIZE,
+ DMA_FROM_DEVICE);
svc_rdma_put_context(ctxt, 1);
}
}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index a564c1a39ec..9839c3d9414 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -70,11 +70,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_inline_write_padding;
-#if !RPCRDMA_PERSISTENT_REGISTRATION
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */
-#else
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL;
-#endif
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+ int xprt_rdma_pad_optimize = 0;
#ifdef RPC_DEBUG
@@ -140,6 +137,14 @@ static ctl_table xr_tunables_table[] = {
.extra2 = &max_memreg,
},
{
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "rdma_pad_optimize",
+ .data = &xprt_rdma_pad_optimize,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
.ctl_name = 0,
},
};
@@ -458,6 +463,8 @@ xprt_rdma_close(struct rpc_xprt *xprt)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
dprintk("RPC: %s: closing\n", __func__);
+ if (r_xprt->rx_ep.rep_connected > 0)
+ xprt->reestablish_timeout = 0;
xprt_disconnect_done(xprt);
(void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
}
@@ -485,6 +492,11 @@ xprt_rdma_connect(struct rpc_task *task)
/* Reconnect */
schedule_delayed_work(&r_xprt->rdma_connect,
xprt->reestablish_timeout);
+ xprt->reestablish_timeout <<= 1;
+ if (xprt->reestablish_timeout > (30 * HZ))
+ xprt->reestablish_timeout = (30 * HZ);
+ else if (xprt->reestablish_timeout < (5 * HZ))
+ xprt->reestablish_timeout = (5 * HZ);
} else {
schedule_delayed_work(&r_xprt->rdma_connect, 0);
if (!RPC_IS_ASYNC(task))
@@ -591,6 +603,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
}
dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
out:
+ req->rl_connect_cookie = 0; /* our reserved value */
return req->rl_xdr_buf;
outfail:
@@ -694,13 +707,21 @@ xprt_rdma_send_request(struct rpc_task *task)
req->rl_reply->rr_xprt = xprt;
}
- if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) {
- xprt_disconnect_done(xprt);
- return -ENOTCONN; /* implies disconnect */
- }
+ /* Must suppress retransmit to maintain credits */
+ if (req->rl_connect_cookie == xprt->connect_cookie)
+ goto drop_connection;
+ req->rl_connect_cookie = xprt->connect_cookie;
+
+ if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ goto drop_connection;
+ task->tk_bytes_sent += rqst->rq_snd_buf.len;
rqst->rq_bytes_sent = 0;
return 0;
+
+drop_connection:
+ xprt_disconnect_done(xprt);
+ return -ENOTCONN; /* implies disconnect */
}
static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
@@ -770,7 +791,7 @@ static void __exit xprt_rdma_cleanup(void)
{
int rc;
- dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+ dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n");
#ifdef RPC_DEBUG
if (sunrpc_table_header) {
unregister_sysctl_table(sunrpc_table_header);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8ea283ecc52..a5fef5e6c32 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -284,6 +284,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ ia->ri_async_rc = 0;
complete(&ia->ri_done);
break;
case RDMA_CM_EVENT_ADDR_ERROR:
@@ -338,13 +339,32 @@ connected:
wake_up_all(&ep->rep_connect_wait);
break;
default:
- ia->ri_async_rc = -EINVAL;
- dprintk("RPC: %s: unexpected CM event %X\n",
+ dprintk("RPC: %s: unexpected CM event %d\n",
__func__, event->event);
- complete(&ia->ri_done);
break;
}
+#ifdef RPC_DEBUG
+ if (connstate == 1) {
+ int ird = attr.max_dest_rd_atomic;
+ int tird = ep->rep_remote_cma.responder_resources;
+ printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u "
+ "on %s, memreg %d slots %d ird %d%s\n",
+ NIPQUAD(addr->sin_addr.s_addr),
+ ntohs(addr->sin_port),
+ ia->ri_id->device->name,
+ ia->ri_memreg_strategy,
+ xprt->rx_buf.rb_max_requests,
+ ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
+ } else if (connstate < 0) {
+ printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u "
+ "closed (%d)\n",
+ NIPQUAD(addr->sin_addr.s_addr),
+ ntohs(addr->sin_port),
+ connstate);
+ }
+#endif
+
return 0;
}
@@ -355,6 +375,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rdma_cm_id *id;
int rc;
+ init_completion(&ia->ri_done);
+
id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
if (IS_ERR(id)) {
rc = PTR_ERR(id);
@@ -363,26 +385,28 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
return id;
}
- ia->ri_async_rc = 0;
+ ia->ri_async_rc = -ETIMEDOUT;
rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
if (rc) {
dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
__func__, rc);
goto out;
}
- wait_for_completion(&ia->ri_done);
+ wait_for_completion_interruptible_timeout(&ia->ri_done,
+ msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
rc = ia->ri_async_rc;
if (rc)
goto out;
- ia->ri_async_rc = 0;
+ ia->ri_async_rc = -ETIMEDOUT;
rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
if (rc) {
dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
__func__, rc);
goto out;
}
- wait_for_completion(&ia->ri_done);
+ wait_for_completion_interruptible_timeout(&ia->ri_done,
+ msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
rc = ia->ri_async_rc;
if (rc)
goto out;
@@ -423,11 +447,10 @@ rpcrdma_clean_cq(struct ib_cq *cq)
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
- int rc;
+ int rc, mem_priv;
+ struct ib_device_attr devattr;
struct rpcrdma_ia *ia = &xprt->rx_ia;
- init_completion(&ia->ri_done);
-
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id);
@@ -443,6 +466,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}
/*
+ * Query the device to determine if the requested memory
+ * registration strategy is supported. If it isn't, set the
+ * strategy to a globally supported model.
+ */
+ rc = ib_query_device(ia->ri_id->device, &devattr);
+ if (rc) {
+ dprintk("RPC: %s: ib_query_device failed %d\n",
+ __func__, rc);
+ goto out2;
+ }
+
+ if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
+ ia->ri_have_dma_lkey = 1;
+ ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+ }
+
+ switch (memreg) {
+ case RPCRDMA_MEMWINDOWS:
+ case RPCRDMA_MEMWINDOWS_ASYNC:
+ if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
+ dprintk("RPC: %s: MEMWINDOWS registration "
+ "specified but not supported by adapter, "
+ "using slower RPCRDMA_REGISTER\n",
+ __func__);
+ memreg = RPCRDMA_REGISTER;
+ }
+ break;
+ case RPCRDMA_MTHCAFMR:
+ if (!ia->ri_id->device->alloc_fmr) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+ dprintk("RPC: %s: MTHCAFMR registration "
+ "specified but not supported by adapter, "
+ "using riskier RPCRDMA_ALLPHYSICAL\n",
+ __func__);
+ memreg = RPCRDMA_ALLPHYSICAL;
+#else
+ dprintk("RPC: %s: MTHCAFMR registration "
+ "specified but not supported by adapter, "
+ "using slower RPCRDMA_REGISTER\n",
+ __func__);
+ memreg = RPCRDMA_REGISTER;
+#endif
+ }
+ break;
+ case RPCRDMA_FRMR:
+ /* Requires both frmr reg and local dma lkey */
+ if ((devattr.device_cap_flags &
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+ dprintk("RPC: %s: FRMR registration "
+ "specified but not supported by adapter, "
+ "using riskier RPCRDMA_ALLPHYSICAL\n",
+ __func__);
+ memreg = RPCRDMA_ALLPHYSICAL;
+#else
+ dprintk("RPC: %s: FRMR registration "
+ "specified but not supported by adapter, "
+ "using slower RPCRDMA_REGISTER\n",
+ __func__);
+ memreg = RPCRDMA_REGISTER;
+#endif
+ }
+ break;
+ }
+
+ /*
* Optionally obtain an underlying physical identity mapping in
* order to do a memory window-based bind. This base registration
* is protected from remote access - that is enabled only by binding
@@ -450,22 +540,28 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
* revoked after the corresponding completion similar to a storage
* adapter.
*/
- if (memreg > RPCRDMA_REGISTER) {
- int mem_priv = IB_ACCESS_LOCAL_WRITE;
- switch (memreg) {
+ switch (memreg) {
+ case RPCRDMA_BOUNCEBUFFERS:
+ case RPCRDMA_REGISTER:
+ case RPCRDMA_FRMR:
+ break;
#if RPCRDMA_PERSISTENT_REGISTRATION
- case RPCRDMA_ALLPHYSICAL:
- mem_priv |= IB_ACCESS_REMOTE_WRITE;
- mem_priv |= IB_ACCESS_REMOTE_READ;
- break;
+ case RPCRDMA_ALLPHYSICAL:
+ mem_priv = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ;
+ goto register_setup;
#endif
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- mem_priv |= IB_ACCESS_MW_BIND;
- break;
- default:
+ case RPCRDMA_MEMWINDOWS_ASYNC:
+ case RPCRDMA_MEMWINDOWS:
+ mem_priv = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_MW_BIND;
+ goto register_setup;
+ case RPCRDMA_MTHCAFMR:
+ if (ia->ri_have_dma_lkey)
break;
- }
+ mem_priv = IB_ACCESS_LOCAL_WRITE;
+ register_setup:
ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
if (IS_ERR(ia->ri_bind_mem)) {
printk(KERN_ALERT "%s: ib_get_dma_mr for "
@@ -475,7 +571,15 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
memreg = RPCRDMA_REGISTER;
ia->ri_bind_mem = NULL;
}
+ break;
+ default:
+ printk(KERN_ERR "%s: invalid memory registration mode %d\n",
+ __func__, memreg);
+ rc = -EINVAL;
+ goto out2;
}
+ dprintk("RPC: %s: memory registration strategy is %d\n",
+ __func__, memreg);
/* Else will do memory reg/dereg for each chunk */
ia->ri_memreg_strategy = memreg;
@@ -483,6 +587,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
return 0;
out2:
rdma_destroy_id(ia->ri_id);
+ ia->ri_id = NULL;
out1:
return rc;
}
@@ -503,15 +608,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
dprintk("RPC: %s: ib_dereg_mr returned %i\n",
__func__, rc);
}
- if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
- rdma_destroy_qp(ia->ri_id);
+ if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
+ if (ia->ri_id->qp)
+ rdma_destroy_qp(ia->ri_id);
+ rdma_destroy_id(ia->ri_id);
+ ia->ri_id = NULL;
+ }
if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
rc = ib_dealloc_pd(ia->ri_pd);
dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
__func__, rc);
}
- if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
- rdma_destroy_id(ia->ri_id);
}
/*
@@ -541,6 +648,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ /* Add room for frmr register and invalidate WRs */
+ ep->rep_attr.cap.max_send_wr *= 3;
+ if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
+ return -EINVAL;
+ break;
case RPCRDMA_MEMWINDOWS_ASYNC:
case RPCRDMA_MEMWINDOWS:
/* Add room for mw_binds+unbinds - overkill! */
@@ -617,29 +730,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_remote_cma.private_data_len = 0;
/* Client offers RDMA Read but does not initiate */
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_BOUNCEBUFFERS:
+ ep->rep_remote_cma.initiator_depth = 0;
+ if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
ep->rep_remote_cma.responder_resources = 0;
- break;
- case RPCRDMA_MTHCAFMR:
- case RPCRDMA_REGISTER:
- ep->rep_remote_cma.responder_resources = cdata->max_requests *
- (RPCRDMA_MAX_DATA_SEGS / 8);
- break;
- case RPCRDMA_MEMWINDOWS:
- case RPCRDMA_MEMWINDOWS_ASYNC:
-#if RPCRDMA_PERSISTENT_REGISTRATION
- case RPCRDMA_ALLPHYSICAL:
-#endif
- ep->rep_remote_cma.responder_resources = cdata->max_requests *
- (RPCRDMA_MAX_DATA_SEGS / 2);
- break;
- default:
- break;
- }
- if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
+ else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
+ ep->rep_remote_cma.responder_resources = 32;
+ else
ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
- ep->rep_remote_cma.initiator_depth = 0;
ep->rep_remote_cma.retry_count = 7;
ep->rep_remote_cma.flow_control = 0;
@@ -679,21 +776,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
if (rc)
dprintk("RPC: %s: rpcrdma_ep_disconnect"
" returned %i\n", __func__, rc);
+ rdma_destroy_qp(ia->ri_id);
+ ia->ri_id->qp = NULL;
}
- ep->rep_func = NULL;
-
/* padding - could be done in rpcrdma_buffer_destroy... */
if (ep->rep_pad_mr) {
rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
ep->rep_pad_mr = NULL;
}
- if (ia->ri_id->qp) {
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
-
rpcrdma_clean_cq(ep->rep_cq);
rc = ib_destroy_cq(ep->rep_cq);
if (rc)
@@ -712,9 +804,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
struct rdma_cm_id *id;
int rc = 0;
int retry_count = 0;
- int reconnect = (ep->rep_connected != 0);
- if (reconnect) {
+ if (ep->rep_connected != 0) {
struct rpcrdma_xprt *xprt;
retry:
rc = rpcrdma_ep_disconnect(ep, ia);
@@ -745,6 +836,7 @@ retry:
goto out;
}
/* END TEMP */
+ rdma_destroy_qp(ia->ri_id);
rdma_destroy_id(ia->ri_id);
ia->ri_id = id;
}
@@ -769,14 +861,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
}
}
- /* Theoretically a client initiator_depth > 0 is not needed,
- * but many peers fail to complete the connection unless they
- * == responder_resources! */
- if (ep->rep_remote_cma.initiator_depth !=
- ep->rep_remote_cma.responder_resources)
- ep->rep_remote_cma.initiator_depth =
- ep->rep_remote_cma.responder_resources;
-
ep->rep_connected = 0;
rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
@@ -786,9 +870,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
goto out;
}
- if (reconnect)
- return 0;
-
wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
/*
@@ -805,14 +886,16 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
if (ep->rep_connected <= 0) {
/* Sometimes, the only way to reliably connect to remote
* CMs is to use same nonzero values for ORD and IRD. */
- ep->rep_remote_cma.initiator_depth =
- ep->rep_remote_cma.responder_resources;
- if (ep->rep_remote_cma.initiator_depth == 0)
- ++ep->rep_remote_cma.initiator_depth;
- if (ep->rep_remote_cma.responder_resources == 0)
- ++ep->rep_remote_cma.responder_resources;
- if (retry_count++ == 0)
+ if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
+ (ep->rep_remote_cma.responder_resources == 0 ||
+ ep->rep_remote_cma.initiator_depth !=
+ ep->rep_remote_cma.responder_resources)) {
+ if (ep->rep_remote_cma.responder_resources == 0)
+ ep->rep_remote_cma.responder_resources = 1;
+ ep->rep_remote_cma.initiator_depth =
+ ep->rep_remote_cma.responder_resources;
goto retry;
+ }
rc = ep->rep_connected;
} else {
dprintk("RPC: %s: connected\n", __func__);
@@ -863,6 +946,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
char *p;
size_t len;
int i, rc;
+ struct rpcrdma_mw *r;
buf->rb_max_requests = cdata->max_requests;
spin_lock_init(&buf->rb_lock);
@@ -873,7 +957,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
* 2. arrays of struct rpcrdma_req to fill in pointers
* 3. array of struct rpcrdma_rep for replies
* 4. padding, if any
- * 5. mw's, if any
+ * 5. mw's, fmr's or frmr's, if any
* Send/recv buffers in req/rep need to be registered
*/
@@ -881,6 +965,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
len += cdata->padding;
switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
+ sizeof(struct rpcrdma_mw);
+ break;
case RPCRDMA_MTHCAFMR:
/* TBD we are perhaps overallocating here */
len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
@@ -927,15 +1015,37 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
* and also reduce unbind-to-bind collision.
*/
INIT_LIST_HEAD(&buf->rb_mws);
+ r = (struct rpcrdma_mw *)p;
switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
+ r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+ RPCRDMA_MAX_SEGS);
+ if (IS_ERR(r->r.frmr.fr_mr)) {
+ rc = PTR_ERR(r->r.frmr.fr_mr);
+ dprintk("RPC: %s: ib_alloc_fast_reg_mr"
+ " failed %i\n", __func__, rc);
+ goto out;
+ }
+ r->r.frmr.fr_pgl =
+ ib_alloc_fast_reg_page_list(ia->ri_id->device,
+ RPCRDMA_MAX_SEGS);
+ if (IS_ERR(r->r.frmr.fr_pgl)) {
+ rc = PTR_ERR(r->r.frmr.fr_pgl);
+ dprintk("RPC: %s: "
+ "ib_alloc_fast_reg_page_list "
+ "failed %i\n", __func__, rc);
+ goto out;
+ }
+ list_add(&r->mw_list, &buf->rb_mws);
+ ++r;
+ }
+ break;
case RPCRDMA_MTHCAFMR:
- {
- struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
- struct ib_fmr_attr fa = {
- RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
- };
/* TBD we are perhaps overallocating here */
for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+ static struct ib_fmr_attr fa =
+ { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
r->r.fmr = ib_alloc_fmr(ia->ri_pd,
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
&fa);
@@ -948,12 +1058,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
list_add(&r->mw_list, &buf->rb_mws);
++r;
}
- }
break;
case RPCRDMA_MEMWINDOWS_ASYNC:
case RPCRDMA_MEMWINDOWS:
- {
- struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
/* Allocate one extra request's worth, for full cycling */
for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
r->r.mw = ib_alloc_mw(ia->ri_pd);
@@ -966,7 +1073,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
list_add(&r->mw_list, &buf->rb_mws);
++r;
}
- }
break;
default:
break;
@@ -1046,6 +1152,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
int rc, i;
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ struct rpcrdma_mw *r;
/* clean up in reverse order from create
* 1. recv mr memory (mr free, then kfree)
@@ -1065,11 +1172,19 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
}
if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
while (!list_empty(&buf->rb_mws)) {
- struct rpcrdma_mw *r;
r = list_entry(buf->rb_mws.next,
struct rpcrdma_mw, mw_list);
list_del(&r->mw_list);
switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ rc = ib_dereg_mr(r->r.frmr.fr_mr);
+ if (rc)
+ dprintk("RPC: %s:"
+ " ib_dereg_mr"
+ " failed %i\n",
+ __func__, rc);
+ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+ break;
case RPCRDMA_MTHCAFMR:
rc = ib_dealloc_fmr(r->r.fmr);
if (rc)
@@ -1115,6 +1230,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
struct rpcrdma_req *req;
unsigned long flags;
+ int i;
+ struct rpcrdma_mw *r;
spin_lock_irqsave(&buffers->rb_lock, flags);
if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1135,9 +1252,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
}
buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
if (!list_empty(&buffers->rb_mws)) {
- int i = RPCRDMA_MAX_SEGS - 1;
+ i = RPCRDMA_MAX_SEGS - 1;
do {
- struct rpcrdma_mw *r;
r = list_entry(buffers->rb_mws.next,
struct rpcrdma_mw, mw_list);
list_del(&r->mw_list);
@@ -1171,6 +1287,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
req->rl_reply = NULL;
}
switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
case RPCRDMA_MTHCAFMR:
case RPCRDMA_MEMWINDOWS_ASYNC:
case RPCRDMA_MEMWINDOWS:
@@ -1252,7 +1369,11 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
va, len, DMA_BIDIRECTIONAL);
iov->length = len;
- if (ia->ri_bind_mem != NULL) {
+ if (ia->ri_have_dma_lkey) {
+ *mrp = NULL;
+ iov->lkey = ia->ri_dma_lkey;
+ return 0;
+ } else if (ia->ri_bind_mem != NULL) {
*mrp = NULL;
iov->lkey = ia->ri_bind_mem->lkey;
return 0;
@@ -1329,15 +1450,292 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}
+static int
+rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+ int *nsegs, int writing, struct rpcrdma_ia *ia,
+ struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_mr_seg *seg1 = seg;
+ struct ib_send_wr frmr_wr, *bad_wr;
+ u8 key;
+ int len, pageoff;
+ int i, rc;
+
+ pageoff = offset_in_page(seg1->mr_offset);
+ seg1->mr_offset -= pageoff; /* start of page */
+ seg1->mr_len += pageoff;
+ len = -pageoff;
+ if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+ *nsegs = RPCRDMA_MAX_DATA_SEGS;
+ for (i = 0; i < *nsegs;) {
+ rpcrdma_map_one(ia, seg, writing);
+ seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
+ len += seg->mr_len;
+ ++seg;
+ ++i;
+ /* Check for holes */
+ if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+ offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+ break;
+ }
+ dprintk("RPC: %s: Using frmr %p to map %d segments\n",
+ __func__, seg1->mr_chunk.rl_mw, i);
+
+ /* Bump the key */
+ key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
+ ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+
+ /* Prepare FRMR WR */
+ memset(&frmr_wr, 0, sizeof frmr_wr);
+ frmr_wr.opcode = IB_WR_FAST_REG_MR;
+ frmr_wr.send_flags = 0; /* unsignaled */
+ frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
+ frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
+ frmr_wr.wr.fast_reg.page_list_len = i;
+ frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
+ frmr_wr.wr.fast_reg.access_flags = (writing ?
+ IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
+ frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+ DECR_CQCOUNT(&r_xprt->rx_ep);
+
+ rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
+
+ if (rc) {
+ dprintk("RPC: %s: failed ib_post_send for register,"
+ " status %i\n", __func__, rc);
+ while (i--)
+ rpcrdma_unmap_one(ia, --seg);
+ } else {
+ seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+ seg1->mr_base = seg1->mr_dma + pageoff;
+ seg1->mr_nsegs = i;
+ seg1->mr_len = len;
+ }
+ *nsegs = i;
+ return rc;
+}
+
+static int
+rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
+ struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_mr_seg *seg1 = seg;
+ struct ib_send_wr invalidate_wr, *bad_wr;
+ int rc;
+
+ while (seg1->mr_nsegs--)
+ rpcrdma_unmap_one(ia, seg++);
+
+ memset(&invalidate_wr, 0, sizeof invalidate_wr);
+ invalidate_wr.opcode = IB_WR_LOCAL_INV;
+ invalidate_wr.send_flags = 0; /* unsignaled */
+ invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+ DECR_CQCOUNT(&r_xprt->rx_ep);
+
+ rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+ if (rc)
+ dprintk("RPC: %s: failed ib_post_send for invalidate,"
+ " status %i\n", __func__, rc);
+ return rc;
+}
+
+static int
+rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
+ int *nsegs, int writing, struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_mr_seg *seg1 = seg;
+ u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
+ int len, pageoff, i, rc;
+
+ pageoff = offset_in_page(seg1->mr_offset);
+ seg1->mr_offset -= pageoff; /* start of page */
+ seg1->mr_len += pageoff;
+ len = -pageoff;
+ if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+ *nsegs = RPCRDMA_MAX_DATA_SEGS;
+ for (i = 0; i < *nsegs;) {
+ rpcrdma_map_one(ia, seg, writing);
+ physaddrs[i] = seg->mr_dma;
+ len += seg->mr_len;
+ ++seg;
+ ++i;
+ /* Check for holes */
+ if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+ offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+ break;
+ }
+ rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
+ physaddrs, i, seg1->mr_dma);
+ if (rc) {
+ dprintk("RPC: %s: failed ib_map_phys_fmr "
+ "%u@0x%llx+%i (%d)... status %i\n", __func__,
+ len, (unsigned long long)seg1->mr_dma,
+ pageoff, i, rc);
+ while (i--)
+ rpcrdma_unmap_one(ia, --seg);
+ } else {
+ seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
+ seg1->mr_base = seg1->mr_dma + pageoff;
+ seg1->mr_nsegs = i;
+ seg1->mr_len = len;
+ }
+ *nsegs = i;
+ return rc;
+}
+
+static int
+rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
+ struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_mr_seg *seg1 = seg;
+ LIST_HEAD(l);
+ int rc;
+
+ list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
+ rc = ib_unmap_fmr(&l);
+ while (seg1->mr_nsegs--)
+ rpcrdma_unmap_one(ia, seg++);
+ if (rc)
+ dprintk("RPC: %s: failed ib_unmap_fmr,"
+ " status %i\n", __func__, rc);
+ return rc;
+}
+
+static int
+rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
+ int *nsegs, int writing, struct rpcrdma_ia *ia,
+ struct rpcrdma_xprt *r_xprt)
+{
+ int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+ IB_ACCESS_REMOTE_READ);
+ struct ib_mw_bind param;
+ int rc;
+
+ *nsegs = 1;
+ rpcrdma_map_one(ia, seg, writing);
+ param.mr = ia->ri_bind_mem;
+ param.wr_id = 0ULL; /* no send cookie */
+ param.addr = seg->mr_dma;
+ param.length = seg->mr_len;
+ param.send_flags = 0;
+ param.mw_access_flags = mem_priv;
+
+ DECR_CQCOUNT(&r_xprt->rx_ep);
+ rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+ if (rc) {
+ dprintk("RPC: %s: failed ib_bind_mw "
+ "%u@0x%llx status %i\n",
+ __func__, seg->mr_len,
+ (unsigned long long)seg->mr_dma, rc);
+ rpcrdma_unmap_one(ia, seg);
+ } else {
+ seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
+ seg->mr_base = param.addr;
+ seg->mr_nsegs = 1;
+ }
+ return rc;
+}
+
+static int
+rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
+ struct rpcrdma_ia *ia,
+ struct rpcrdma_xprt *r_xprt, void **r)
+{
+ struct ib_mw_bind param;
+ LIST_HEAD(l);
+ int rc;
+
+ BUG_ON(seg->mr_nsegs != 1);
+ param.mr = ia->ri_bind_mem;
+ param.addr = 0ULL; /* unbind */
+ param.length = 0;
+ param.mw_access_flags = 0;
+ if (*r) {
+ param.wr_id = (u64) (unsigned long) *r;
+ param.send_flags = IB_SEND_SIGNALED;
+ INIT_CQCOUNT(&r_xprt->rx_ep);
+ } else {
+ param.wr_id = 0ULL;
+ param.send_flags = 0;
+ DECR_CQCOUNT(&r_xprt->rx_ep);
+ }
+ rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+ rpcrdma_unmap_one(ia, seg);
+ if (rc)
+ dprintk("RPC: %s: failed ib_(un)bind_mw,"
+ " status %i\n", __func__, rc);
+ else
+ *r = NULL; /* will upcall on completion */
+ return rc;
+}
+
+static int
+rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
+ int *nsegs, int writing, struct rpcrdma_ia *ia)
+{
+ int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+ IB_ACCESS_REMOTE_READ);
+ struct rpcrdma_mr_seg *seg1 = seg;
+ struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
+ int len, i, rc = 0;
+
+ if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+ *nsegs = RPCRDMA_MAX_DATA_SEGS;
+ for (len = 0, i = 0; i < *nsegs;) {
+ rpcrdma_map_one(ia, seg, writing);
+ ipb[i].addr = seg->mr_dma;
+ ipb[i].size = seg->mr_len;
+ len += seg->mr_len;
+ ++seg;
+ ++i;
+ /* Check for holes */
+ if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+ offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+ break;
+ }
+ seg1->mr_base = seg1->mr_dma;
+ seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
+ ipb, i, mem_priv, &seg1->mr_base);
+ if (IS_ERR(seg1->mr_chunk.rl_mr)) {
+ rc = PTR_ERR(seg1->mr_chunk.rl_mr);
+ dprintk("RPC: %s: failed ib_reg_phys_mr "
+ "%u@0x%llx (%d)... status %i\n",
+ __func__, len,
+ (unsigned long long)seg1->mr_dma, i, rc);
+ while (i--)
+ rpcrdma_unmap_one(ia, --seg);
+ } else {
+ seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
+ seg1->mr_nsegs = i;
+ seg1->mr_len = len;
+ }
+ *nsegs = i;
+ return rc;
+}
+
+static int
+rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
+ struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_mr_seg *seg1 = seg;
+ int rc;
+
+ rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
+ seg1->mr_chunk.rl_mr = NULL;
+ while (seg1->mr_nsegs--)
+ rpcrdma_unmap_one(ia, seg++);
+ if (rc)
+ dprintk("RPC: %s: failed ib_dereg_mr,"
+ " status %i\n", __func__, rc);
+ return rc;
+}
+
int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
- IB_ACCESS_REMOTE_READ);
- struct rpcrdma_mr_seg *seg1 = seg;
- int i;
int rc = 0;
switch (ia->ri_memreg_strategy) {
@@ -1352,114 +1750,25 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
break;
#endif
- /* Registration using fast memory registration */
+ /* Registration using frmr registration */
+ case RPCRDMA_FRMR:
+ rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
+ break;
+
+ /* Registration using fmr memory registration */
case RPCRDMA_MTHCAFMR:
- {
- u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
- int len, pageoff = offset_in_page(seg->mr_offset);
- seg1->mr_offset -= pageoff; /* start of page */
- seg1->mr_len += pageoff;
- len = -pageoff;
- if (nsegs > RPCRDMA_MAX_DATA_SEGS)
- nsegs = RPCRDMA_MAX_DATA_SEGS;
- for (i = 0; i < nsegs;) {
- rpcrdma_map_one(ia, seg, writing);
- physaddrs[i] = seg->mr_dma;
- len += seg->mr_len;
- ++seg;
- ++i;
- /* Check for holes */
- if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
- offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
- break;
- }
- nsegs = i;
- rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
- physaddrs, nsegs, seg1->mr_dma);
- if (rc) {
- dprintk("RPC: %s: failed ib_map_phys_fmr "
- "%u@0x%llx+%i (%d)... status %i\n", __func__,
- len, (unsigned long long)seg1->mr_dma,
- pageoff, nsegs, rc);
- while (nsegs--)
- rpcrdma_unmap_one(ia, --seg);
- } else {
- seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
- seg1->mr_base = seg1->mr_dma + pageoff;
- seg1->mr_nsegs = nsegs;
- seg1->mr_len = len;
- }
- }
+ rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
break;
/* Registration using memory windows */
case RPCRDMA_MEMWINDOWS_ASYNC:
case RPCRDMA_MEMWINDOWS:
- {
- struct ib_mw_bind param;
- rpcrdma_map_one(ia, seg, writing);
- param.mr = ia->ri_bind_mem;
- param.wr_id = 0ULL; /* no send cookie */
- param.addr = seg->mr_dma;
- param.length = seg->mr_len;
- param.send_flags = 0;
- param.mw_access_flags = mem_priv;
-
- DECR_CQCOUNT(&r_xprt->rx_ep);
- rc = ib_bind_mw(ia->ri_id->qp,
- seg->mr_chunk.rl_mw->r.mw, &param);
- if (rc) {
- dprintk("RPC: %s: failed ib_bind_mw "
- "%u@0x%llx status %i\n",
- __func__, seg->mr_len,
- (unsigned long long)seg->mr_dma, rc);
- rpcrdma_unmap_one(ia, seg);
- } else {
- seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
- seg->mr_base = param.addr;
- seg->mr_nsegs = 1;
- nsegs = 1;
- }
- }
+ rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
break;
/* Default registration each time */
default:
- {
- struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
- int len = 0;
- if (nsegs > RPCRDMA_MAX_DATA_SEGS)
- nsegs = RPCRDMA_MAX_DATA_SEGS;
- for (i = 0; i < nsegs;) {
- rpcrdma_map_one(ia, seg, writing);
- ipb[i].addr = seg->mr_dma;
- ipb[i].size = seg->mr_len;
- len += seg->mr_len;
- ++seg;
- ++i;
- /* Check for holes */
- if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
- offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
- break;
- }
- nsegs = i;
- seg1->mr_base = seg1->mr_dma;
- seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
- ipb, nsegs, mem_priv, &seg1->mr_base);
- if (IS_ERR(seg1->mr_chunk.rl_mr)) {
- rc = PTR_ERR(seg1->mr_chunk.rl_mr);
- dprintk("RPC: %s: failed ib_reg_phys_mr "
- "%u@0x%llx (%d)... status %i\n",
- __func__, len,
- (unsigned long long)seg1->mr_dma, nsegs, rc);
- while (nsegs--)
- rpcrdma_unmap_one(ia, --seg);
- } else {
- seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
- seg1->mr_nsegs = nsegs;
- seg1->mr_len = len;
- }
- }
+ rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
break;
}
if (rc)
@@ -1473,7 +1782,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
struct rpcrdma_xprt *r_xprt, void *r)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mr_seg *seg1 = seg;
int nsegs = seg->mr_nsegs, rc;
switch (ia->ri_memreg_strategy) {
@@ -1486,56 +1794,21 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
break;
#endif
+ case RPCRDMA_FRMR:
+ rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
+ break;
+
case RPCRDMA_MTHCAFMR:
- {
- LIST_HEAD(l);
- list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
- rc = ib_unmap_fmr(&l);
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
- }
- if (rc)
- dprintk("RPC: %s: failed ib_unmap_fmr,"
- " status %i\n", __func__, rc);
+ rc = rpcrdma_deregister_fmr_external(seg, ia);
break;
case RPCRDMA_MEMWINDOWS_ASYNC:
case RPCRDMA_MEMWINDOWS:
- {
- struct ib_mw_bind param;
- BUG_ON(nsegs != 1);
- param.mr = ia->ri_bind_mem;
- param.addr = 0ULL; /* unbind */
- param.length = 0;
- param.mw_access_flags = 0;
- if (r) {
- param.wr_id = (u64) (unsigned long) r;
- param.send_flags = IB_SEND_SIGNALED;
- INIT_CQCOUNT(&r_xprt->rx_ep);
- } else {
- param.wr_id = 0ULL;
- param.send_flags = 0;
- DECR_CQCOUNT(&r_xprt->rx_ep);
- }
- rc = ib_bind_mw(ia->ri_id->qp,
- seg->mr_chunk.rl_mw->r.mw, &param);
- rpcrdma_unmap_one(ia, seg);
- }
- if (rc)
- dprintk("RPC: %s: failed ib_(un)bind_mw,"
- " status %i\n", __func__, rc);
- else
- r = NULL; /* will upcall on completion */
+ rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
break;
default:
- rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
- seg1->mr_chunk.rl_mr = NULL;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
- if (rc)
- dprintk("RPC: %s: failed ib_dereg_mr,"
- " status %i\n", __func__, rc);
+ rc = rpcrdma_deregister_default_external(seg, ia);
break;
}
if (r) {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2427822f8bd..c7a7eba991b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -51,6 +51,9 @@
#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
+#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
+#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
+
/*
* Interface Adapter -- one per transport instance
*/
@@ -58,6 +61,8 @@ struct rpcrdma_ia {
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
struct ib_mr *ri_bind_mem;
+ u32 ri_dma_lkey;
+ int ri_have_dma_lkey;
struct completion ri_done;
int ri_async_rc;
enum rpcrdma_memreg ri_memreg_strategy;
@@ -156,6 +161,10 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
union {
struct ib_mw *mw;
struct ib_fmr *fmr;
+ struct {
+ struct ib_fast_reg_page_list *fr_pgl;
+ struct ib_mr *fr_mr;
+ } frmr;
} r;
struct list_head mw_list;
} *rl_mw;
@@ -175,6 +184,7 @@ struct rpcrdma_req {
size_t rl_size; /* actual length of buffer */
unsigned int rl_niovs; /* 0, 2 or 4 */
unsigned int rl_nchunks; /* non-zero if chunks */
+ unsigned int rl_connect_cookie; /* retry detection */
struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
@@ -198,7 +208,7 @@ struct rpcrdma_buffer {
atomic_t rb_credits; /* most recent server credits */
unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */
int rb_max_requests;/* client max requests */
- struct list_head rb_mws; /* optional memory windows/fmrs */
+ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
int rb_send_index;
struct rpcrdma_req **rb_send_bufs;
int rb_recv_index;
@@ -273,6 +283,11 @@ struct rpcrdma_xprt {
#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
+/* Setting this to 0 ensures interoperability with early servers.
+ * Setting this to 1 enhances certain unaligned read/write performance.
+ * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
+extern int xprt_rdma_pad_optimize;
+
/*
* Interface Adapter calls - xprtrdma/verbs.c
*/
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 4486c59c3ac..9a288d5eea6 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -3,8 +3,8 @@
*
* Client-side transport implementation for sockets.
*
- * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com>
- * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com>
+ * TCP callback races fixes (C) 1998 Red Hat
+ * TCP send fixes (C) 1998 Red Hat
* TCP NFS related read + write fixes
* (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
*