From f75e6745aa3084124ae1434fd7629853bdaf6798 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 21 Apr 2009 17:18:20 -0400 Subject: SUNRPC: Fix the problem of EADDRNOTAVAIL syslog floods on reconnect See http://bugzilla.kernel.org/show_bug.cgi?id=13034 If the port gets into a TIME_WAIT state, then we cannot reconnect without binding to a new port. Tested-by: Petr Vandrovec Tested-by: Jean Delvare Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- net/sunrpc/xprt.c | 6 ++---- net/sunrpc/xprtsock.c | 26 +++++++++++++++++++++----- 2 files changed, 23 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a0bfe53f162..06ca058572f 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -672,10 +672,8 @@ xprt_init_autodisconnect(unsigned long data) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); - if (xprt_connecting(xprt)) - xprt_release_write(xprt, NULL); - else - queue_work(rpciod_workqueue, &xprt->task_cleanup); + set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + queue_work(rpciod_workqueue, &xprt->task_cleanup); return; out_abort: spin_unlock(&xprt->transport_lock); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d40ff50887a..e1859614601 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -807,6 +807,9 @@ static void xs_reset_transport(struct sock_xprt *transport) * * This is used when all requests are complete; ie, no DRC state remains * on the server we want to save. + * + * The caller _must_ be holding XPRT_LOCKED in order to avoid issues with + * xs_reset_transport() zeroing the socket from underneath a writer. */ static void xs_close(struct rpc_xprt *xprt) { @@ -824,6 +827,14 @@ static void xs_close(struct rpc_xprt *xprt) xprt_disconnect_done(xprt); } +static void xs_tcp_close(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state)) + xs_close(xprt); + else + xs_tcp_shutdown(xprt); +} + /** * xs_destroy - prepare to shutdown a transport * @xprt: doomed transport @@ -1772,6 +1783,15 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt, xprt, -status, xprt_connected(xprt), sock->sk->sk_state); switch (status) { + default: + printk("%s: connect returned unhandled error %d\n", + __func__, status); + case -EADDRNOTAVAIL: + /* We're probably in TIME_WAIT. Get rid of existing socket, + * and retry + */ + set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + xprt_force_disconnect(xprt); case -ECONNREFUSED: case -ECONNRESET: case -ENETUNREACH: @@ -1782,10 +1802,6 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt, xprt_clear_connecting(xprt); return; } - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - printk("%s: connect returned unhandled error %d\n", - __func__, status); out_eagain: status = -EAGAIN; out: @@ -1994,7 +2010,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { .buf_free = rpc_free, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, - .close = xs_tcp_shutdown, + .close = xs_tcp_close, .destroy = xs_destroy, .print_stats = xs_tcp_print_stats, }; -- cgit v1.2.3 From 453ed90d1395a5281a8f1a0de5d8aabc66202e34 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sun, 5 Apr 2009 16:22:16 -0500 Subject: net/9p: set correct stat size when sending Twstat messages The 9P2000 Twstat message requires the size of the stat structure to be specified. Currently the 9p code writes zero instead of the actual size. This behavior confuses some of the file servers that check if the size is correct. This patch adds a new function that calculcates the stat size and puts the value in the appropriate place in the 9P message. Signed-off-by: Latchesar Ionkov Reviewed-by: Eric Van Hensbergen --- net/9p/client.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 1eb580c38fb..93f442aaa11 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1251,12 +1251,42 @@ error: } EXPORT_SYMBOL(p9_client_stat); +static int p9_client_statsize(struct p9_wstat *wst, int optional) +{ + int ret; + + /* size[2] type[2] dev[4] qid[13] */ + /* mode[4] atime[4] mtime[4] length[8]*/ + /* name[s] uid[s] gid[s] muid[s] */ + ret = 2+2+4+13+4+4+4+8+2+2+2+2; + + if (wst->name) + ret += strlen(wst->name); + if (wst->uid) + ret += strlen(wst->uid); + if (wst->gid) + ret += strlen(wst->gid); + if (wst->muid) + ret += strlen(wst->muid); + + if (optional) { + ret += 2+4+4+4; /* extension[s] n_uid[4] n_gid[4] n_muid[4] */ + if (wst->extension) + ret += strlen(wst->extension); + } + + return ret; +} + int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) { int err; struct p9_req_t *req; struct p9_client *clnt; + err = 0; + clnt = fid->clnt; + wst->size = p9_client_statsize(wst, clnt->dotu); P9_DPRINTK(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid); P9_DPRINTK(P9_DEBUG_9P, " sz=%x type=%x dev=%x qid=%x.%llx.%x\n" @@ -1268,10 +1298,8 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) wst->atime, wst->mtime, (unsigned long long)wst->length, wst->name, wst->uid, wst->gid, wst->muid, wst->extension, wst->n_uid, wst->n_gid, wst->n_muid); - err = 0; - clnt = fid->clnt; - req = p9_client_rpc(clnt, P9_TWSTAT, "dwS", fid->fid, 0, wst); + req = p9_client_rpc(clnt, P9_TWSTAT, "dwS", fid->fid, wst->size, wst); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; -- cgit v1.2.3 From 742b11a7ec60faa25d76c95c268041ab215c25ad Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sun, 5 Apr 2009 16:26:41 -0500 Subject: net/9p: return error when p9_client_stat fails p9_client_stat function doesn't return correct value if it fails. p9_client_stat should return ERR_PTR of the error value when it fails. Instead, it always returns a value to the allocated p9_wstat struct even when it is not populated correctly. This patch makes p9_client_stat to handle failure correctly. Signed-off-by: Latchesar Ionkov Reviewed-by: Eric Van Hensbergen --- net/9p/client.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 93f442aaa11..781d89a952e 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1244,10 +1244,14 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) ret->name, ret->uid, ret->gid, ret->muid, ret->extension, ret->n_uid, ret->n_gid, ret->n_muid); + p9_free_req(clnt, req); + return ret; + free_and_error: p9_free_req(clnt, req); error: - return ret; + kfree(ret); + return ERR_PTR(err); } EXPORT_SYMBOL(p9_client_stat); -- cgit v1.2.3 From 1bab88b2310998de18b32529a27ea835d164254a Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sun, 5 Apr 2009 16:28:59 -0500 Subject: net/9p: handle correctly interrupted 9P requests Currently the 9p code crashes when a operation is interrupted, i.e. for example when the user presses ^C while reading from a file. This patch fixes the code that is responsible for interruption and flushing of 9P operations. Signed-off-by: Latchesar Ionkov --- net/9p/client.c | 74 +++++++++++++-------------------------------------- net/9p/trans_fd.c | 14 ++++++---- net/9p/trans_rdma.c | 1 + net/9p/trans_virtio.c | 1 + 4 files changed, 30 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 781d89a952e..dd43a8289b0 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -203,7 +203,6 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) p9pdu_reset(req->tc); p9pdu_reset(req->rc); - req->flush_tag = 0; req->tc->tag = tag-1; req->status = REQ_STATUS_ALLOC; @@ -324,35 +323,9 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r) */ void p9_client_cb(struct p9_client *c, struct p9_req_t *req) { - struct p9_req_t *other_req; - unsigned long flags; - P9_DPRINTK(P9_DEBUG_MUX, " tag %d\n", req->tc->tag); - - if (req->status == REQ_STATUS_ERROR) - wake_up(req->wq); - - if (req->flush_tag) { /* flush receive path */ - P9_DPRINTK(P9_DEBUG_9P, "<<< RFLUSH %d\n", req->tc->tag); - spin_lock_irqsave(&c->lock, flags); - other_req = p9_tag_lookup(c, req->flush_tag); - if (other_req->status != REQ_STATUS_FLSH) /* stale flush */ - spin_unlock_irqrestore(&c->lock, flags); - else { - other_req->status = REQ_STATUS_FLSHD; - spin_unlock_irqrestore(&c->lock, flags); - wake_up(other_req->wq); - } - p9_free_req(c, req); - } else { /* normal receive path */ - P9_DPRINTK(P9_DEBUG_MUX, "normal: tag %d\n", req->tc->tag); - spin_lock_irqsave(&c->lock, flags); - if (req->status != REQ_STATUS_FLSHD) - req->status = REQ_STATUS_RCVD; - spin_unlock_irqrestore(&c->lock, flags); - wake_up(req->wq); - P9_DPRINTK(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); - } + wake_up(req->wq); + P9_DPRINTK(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); } EXPORT_SYMBOL(p9_client_cb); @@ -486,9 +459,15 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) if (IS_ERR(req)) return PTR_ERR(req); - req->flush_tag = oldtag; - /* we don't free anything here because RPC isn't complete */ + /* if we haven't received a response for oldreq, + remove it from the list. */ + spin_lock(&c->lock); + if (oldreq->status == REQ_STATUS_FLSH) + list_del(&oldreq->req_list); + spin_unlock(&c->lock); + + p9_free_req(c, req); return 0; } @@ -509,7 +488,6 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) struct p9_req_t *req; unsigned long flags; int sigpending; - int flushed = 0; P9_DPRINTK(P9_DEBUG_MUX, "client %p op %d\n", c, type); @@ -546,42 +524,28 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) goto reterr; } - /* if it was a flush we just transmitted, return our tag */ - if (type == P9_TFLUSH) - return req; -again: P9_DPRINTK(P9_DEBUG_MUX, "wait %p tag: %d\n", req->wq, tag); err = wait_event_interruptible(*req->wq, req->status >= REQ_STATUS_RCVD); - P9_DPRINTK(P9_DEBUG_MUX, "wait %p tag: %d returned %d (flushed=%d)\n", - req->wq, tag, err, flushed); + P9_DPRINTK(P9_DEBUG_MUX, "wait %p tag: %d returned %d\n", + req->wq, tag, err); if (req->status == REQ_STATUS_ERROR) { P9_DPRINTK(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); err = req->t_err; - } else if (err == -ERESTARTSYS && flushed) { - P9_DPRINTK(P9_DEBUG_MUX, "flushed - going again\n"); - goto again; - } else if (req->status == REQ_STATUS_FLSHD) { - P9_DPRINTK(P9_DEBUG_MUX, "flushed - erestartsys\n"); - err = -ERESTARTSYS; } - if ((err == -ERESTARTSYS) && (c->status == Connected) && (!flushed)) { + if ((err == -ERESTARTSYS) && (c->status == Connected)) { P9_DPRINTK(P9_DEBUG_MUX, "flushing\n"); - spin_lock_irqsave(&c->lock, flags); - if (req->status == REQ_STATUS_SENT) - req->status = REQ_STATUS_FLSH; - spin_unlock_irqrestore(&c->lock, flags); sigpending = 1; - flushed = 1; clear_thread_flag(TIF_SIGPENDING); - if (c->trans_mod->cancel(c, req)) { - err = p9_client_flush(c, req); - if (err == 0) - goto again; - } + if (c->trans_mod->cancel(c, req)) + p9_client_flush(c, req); + + /* if we received the response anyway, don't signal error */ + if (req->status == REQ_STATUS_RCVD) + err = 0; } if (sigpending) { diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index c613ed08a5e..a2a1814c7a8 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -213,8 +213,8 @@ static void p9_conn_cancel(struct p9_conn *m, int err) spin_unlock_irqrestore(&m->client->lock, flags); list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { - list_del(&req->req_list); P9_DPRINTK(P9_DEBUG_ERROR, "call back req %p\n", req); + list_del(&req->req_list); p9_client_cb(m->client, req); } } @@ -336,7 +336,8 @@ static void p9_read_work(struct work_struct *work) "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag); m->req = p9_tag_lookup(m->client, tag); - if (!m->req) { + if (!m->req || (m->req->status != REQ_STATUS_SENT && + m->req->status != REQ_STATUS_FLSH)) { P9_DPRINTK(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", tag); err = -EIO; @@ -361,10 +362,11 @@ static void p9_read_work(struct work_struct *work) if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */ P9_DPRINTK(P9_DEBUG_TRANS, "got new packet\n"); spin_lock(&m->client->lock); + if (m->req->status != REQ_STATUS_ERROR) + m->req->status = REQ_STATUS_RCVD; list_del(&m->req->req_list); spin_unlock(&m->client->lock); p9_client_cb(m->client, m->req); - m->rbuf = NULL; m->rpos = 0; m->rsize = 0; @@ -454,6 +456,7 @@ static void p9_write_work(struct work_struct *work) req = list_entry(m->unsent_req_list.next, struct p9_req_t, req_list); req->status = REQ_STATUS_SENT; + P9_DPRINTK(P9_DEBUG_TRANS, "move req %p\n", req); list_move_tail(&req->req_list, &m->req_list); m->wbuf = req->tc->sdata; @@ -683,12 +686,13 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) P9_DPRINTK(P9_DEBUG_TRANS, "client %p req %p\n", client, req); spin_lock(&client->lock); - list_del(&req->req_list); if (req->status == REQ_STATUS_UNSENT) { + list_del(&req->req_list); req->status = REQ_STATUS_FLSHD; ret = 0; - } + } else if (req->status == REQ_STATUS_SENT) + req->status = REQ_STATUS_FLSH; spin_unlock(&client->lock); diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 7fa0eb20b2f..ac4990041eb 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -295,6 +295,7 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, goto err_out; req->rc = c->rc; + req->status = REQ_STATUS_RCVD; p9_client_cb(client, req); return; diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 2d7781ec663..bb8579a141a 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -134,6 +134,7 @@ static void req_done(struct virtqueue *vq) P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); req = p9_tag_lookup(chan->client, rc->tag); + req->status = REQ_STATUS_RCVD; p9_client_cb(chan->client, req); } } -- cgit v1.2.3 From c197facc8ea08062f8f949aade6a33649ee06771 Mon Sep 17 00:00:00 2001 From: "hummerbliss@gmail.com" Date: Mon, 20 Apr 2009 17:12:35 +0200 Subject: netfilter: bridge: allow fragmentation of VLAN packets traversing a bridge br_nf_dev_queue_xmit only checks for ETH_P_IP packets for fragmenting but not VLAN packets. This results in dropping of large VLAN packets. This can be observed when connection tracking is enabled. Connection tracking re-assembles fragmented packets, and these have to re-fragmented when transmitting out. Also, make sure only refragmented packets are defragmented as per suggestion from Patrick McHardy. Signed-off-by: Saikiran Madugula Signed-off-by: Patrick McHardy --- net/bridge/br_netfilter.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3953ac4214c..e4a418fcb35 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -788,15 +788,23 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff *skb, return NF_STOLEN; } +#if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) static int br_nf_dev_queue_xmit(struct sk_buff *skb) { - if (skb->protocol == htons(ETH_P_IP) && + if (skb->nfct != NULL && + (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb)) && skb->len > skb->dev->mtu && !skb_is_gso(skb)) return ip_fragment(skb, br_dev_queue_push_xmit); else return br_dev_queue_push_xmit(skb); } +#else +static int br_nf_dev_queue_xmit(struct sk_buff *skb) +{ + return br_dev_queue_push_xmit(skb); +} +#endif /* PF_BRIDGE/POST_ROUTING ********************************************/ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb, -- cgit v1.2.3 From 5ff482940f5aa2cdc3424c4a8ea94b9833b2af5f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 24 Apr 2009 15:37:44 +0200 Subject: netfilter: nf_ct_dccp/udplite: fix protocol registration error Commit d0dba725 (netfilter: ctnetlink: add callbacks to the per-proto nlattrs) changed the protocol registration function to abort if the to-be registered protocol doesn't provide a new callback function. The DCCP and UDP-Lite IPv6 protocols were missed in this conversion, add the required callback pointer. Reported-and-tested-by: Steven Jan Springl Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_proto_dccp.c | 1 + net/netfilter/nf_conntrack_proto_udplite.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 50dac8dbe7d..5411d63f31a 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -777,6 +777,7 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { .print_conntrack = dccp_print_conntrack, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .to_nlattr = dccp_to_nlattr, + .nlattr_size = dccp_nlattr_size, .from_nlattr = nlattr_to_dccp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 4614696c1b8..0badedc542d 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -204,6 +204,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .error = udplite_error, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif -- cgit v1.2.3 From 4b0706624930dc75c3b0d0df463d89759ef7de29 Mon Sep 17 00:00:00 2001 From: Laszlo Attila Toth Date: Fri, 24 Apr 2009 16:55:25 +0200 Subject: netfilter: Kconfig: TProxy doesn't depend on NF_CONNTRACK Signed-off-by: Laszlo Attila Toth Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 2329c5f5055..881203c4a14 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -275,6 +275,8 @@ config NF_CT_NETLINK help This option enables support for a netlink-based userspace interface +endif # NF_CONNTRACK + # transparent proxy support config NETFILTER_TPROXY tristate "Transparent proxying support (EXPERIMENTAL)" @@ -290,8 +292,6 @@ config NETFILTER_TPROXY To compile it as a module, choose M here. If unsure, say N. -endif # NF_CONNTRACK - config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n -- cgit v1.2.3 From 71951b64a5a87c09eb6fde59ce51aaab2fdaeab2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2009 16:58:41 +0200 Subject: netfilter: nf_ct_dccp: add missing role attributes for DCCP This patch adds missing role attribute to the DCCP type, otherwise the creation of entries is not of any use. The attribute added is CTA_PROTOINFO_DCCP_ROLE which contains the role of the conntrack original tuple. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_proto_dccp.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 5411d63f31a..8e757dd5339 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -633,6 +633,8 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state); + NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE, + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]); nla_nest_end(skb, nest_parms); read_unlock_bh(&dccp_lock); return 0; @@ -644,6 +646,7 @@ nla_put_failure: static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, }; static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) @@ -661,11 +664,21 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) return err; if (!tb[CTA_PROTOINFO_DCCP_STATE] || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) + !tb[CTA_PROTOINFO_DCCP_ROLE] || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { return -EINVAL; + } write_lock_bh(&dccp_lock); ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); + if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; + } else { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; + } write_unlock_bh(&dccp_lock); return 0; } -- cgit v1.2.3 From 37e55cf0ceb8803256bf69a3e45bd668bf90b76f Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Fri, 24 Apr 2009 17:05:21 +0200 Subject: netfilter: xt_recent: fix stack overread in compat code Related-to: commit 325fb5b4d26038cba665dd0d8ee09555321061f0 The compat path suffers from a similar problem. It only uses a __be32 when all of the recent code uses, and expects, an nf_inet_addr everywhere. As a result, addresses stored by xt_recents were filled with whatever other stuff was on the stack following the be32. Signed-off-by: Jan Engelhardt With a minor compile fix from Roman. Reported-and-tested-by: Roman Hoog Antink Signed-off-by: Patrick McHardy --- net/netfilter/xt_recent.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 791e030ea90..eb0ceb84652 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -474,7 +474,7 @@ static ssize_t recent_old_proc_write(struct file *file, struct recent_table *t = pde->data; struct recent_entry *e; char buf[sizeof("+255.255.255.255")], *c = buf; - __be32 addr; + union nf_inet_addr addr = {}; int add; if (size > sizeof(buf)) @@ -506,14 +506,13 @@ static ssize_t recent_old_proc_write(struct file *file, add = 1; break; } - addr = in_aton(c); + addr.ip = in_aton(c); spin_lock_bh(&recent_lock); - e = recent_entry_lookup(t, (const void *)&addr, NFPROTO_IPV4, 0); + e = recent_entry_lookup(t, &addr, NFPROTO_IPV4, 0); if (e == NULL) { if (add) - recent_entry_init(t, (const void *)&addr, - NFPROTO_IPV4, 0); + recent_entry_init(t, &addr, NFPROTO_IPV4, 0); } else { if (add) recent_entry_update(t, e); -- cgit v1.2.3 From adc667e84f086aa110d810f3476c494e48eaabaa Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Sat, 25 Apr 2009 18:03:35 -0700 Subject: vlan: update vlan carrier state for admin up/down Currently, the VLAN event handler does not adjust the VLAN device's carrier state when the real device or the VLAN device is set administratively up or down. The following patch adds a transfer of operating state from the real device to the VLAN device when the real device is administratively set up or down, and sets the carrier state up or down during init, open and close of the VLAN device. This permits observers above the VLAN device that care about the carrier state (bonding's link monitor, for example) to receive updates for administrative changes by more closely mimicing the behavior of real devices. Signed-off-by: Jay Vosburgh Signed-off-by: Patrick McHardy --- net/8021q/vlan.c | 2 ++ net/8021q/vlan_dev.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 2b7390e377b..d1e10546eb8 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -492,6 +492,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, continue; dev_change_flags(vlandev, flgs & ~IFF_UP); + vlan_transfer_operstate(dev, vlandev); } break; @@ -507,6 +508,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, continue; dev_change_flags(vlandev, flgs | IFF_UP); + vlan_transfer_operstate(dev, vlandev); } break; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 6b092136401..b4b9068e55a 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -462,6 +462,7 @@ static int vlan_dev_open(struct net_device *dev) if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); + netif_carrier_on(dev); return 0; clear_allmulti: @@ -471,6 +472,7 @@ del_unicast: if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) dev_unicast_delete(real_dev, dev->dev_addr, ETH_ALEN); out: + netif_carrier_off(dev); return err; } @@ -492,6 +494,7 @@ static int vlan_dev_stop(struct net_device *dev) if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) dev_unicast_delete(real_dev, dev->dev_addr, dev->addr_len); + netif_carrier_off(dev); return 0; } @@ -612,6 +615,8 @@ static int vlan_dev_init(struct net_device *dev) struct net_device *real_dev = vlan_dev_info(dev)->real_dev; int subclass = 0; + netif_carrier_off(dev); + /* IFF_BROADCAST|IFF_MULTICAST; ??? */ dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI); dev->iflink = real_dev->ifindex; -- cgit v1.2.3 From 6a783c9067e3f71aac61a9262fe42c1f68efd4fc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 27 Apr 2009 02:58:59 -0700 Subject: xfrm: wrong hash value for temporary SA When kernel inserts a temporary SA for IKE, it uses the wrong hash value for dst list. Two hash values were calcultated before: one with source address and one with a wildcard source address. Bug hinted by Junwei Zhang Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/xfrm/xfrm_state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 82271720d97..5f1f86565f1 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -794,7 +794,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, { static xfrm_address_t saddr_wildcard = { }; struct net *net = xp_net(pol); - unsigned int h; + unsigned int h, h_wildcard; struct hlist_node *entry; struct xfrm_state *x, *x0, *to_put; int acquire_in_progress = 0; @@ -819,8 +819,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, if (best) goto found; - h = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family); - hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family); + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) { if (x->props.family == family && x->props.reqid == tmpl->reqid && !(x->props.flags & XFRM_STATE_WILDRECV) && -- cgit v1.2.3 From c9503e0fe052020e0294cd07d0ecd982eb7c9177 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 27 Apr 2009 05:42:24 -0700 Subject: ipv4: Limit size of route cache hash table Right now we have no upper limit on the size of the route cache hash table. On a 128GB POWER6 box it ends up as 32MB: IP route cache hash table entries: 4194304 (order: 9, 33554432 bytes) It would be nice to cap this for memory consumption reasons, but a massive hashtable also causes a significant spike when measuring OS jitter. With a 32MB hashtable and 4 million entries, rt_worker_func is taking 5 ms to complete. On another system with more memory it's taking 14 ms. Even though rt_worker_func does call cond_sched() to limit its impact, in an HPC environment we want to keep all sources of OS jitter to a minimum. With the patch applied we limit the number of entries to 512k which can still be overriden by using the rt_entries boot option: IP route cache hash table entries: 524288 (order: 6, 4194304 bytes) With this patch rt_worker_func now takes 0.460 ms on the same system. Signed-off-by: Anton Blanchard Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c40debe51b3..c4c60e9f068 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3397,7 +3397,7 @@ int __init ip_rt_init(void) 0, &rt_hash_log, &rt_hash_mask, - 0); + rhash_entries ? 0 : 512 * 1024); memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); rt_hash_lock_init(); -- cgit v1.2.3 From bf368e4e70cd4e0f880923c44e95a4273d725ab4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2009 02:24:21 -0700 Subject: net: Avoid extra wakeups of threads blocked in wait_for_packet() In 2.6.25 we added UDP mem accounting. This unfortunatly added a penalty when a frame is transmitted, since we have at TX completion time to call sock_wfree() to perform necessary memory accounting. This calls sock_def_write_space() and utimately scheduler if any thread is waiting on the socket. Thread(s) waiting for an incoming frame was scheduled, then had to sleep again as event was meaningless. (All threads waiting on a socket are using same sk_sleep anchor) This adds lot of extra wakeups and increases latencies, as noted by Christoph Lameter, and slows down softirq handler. Reference : http://marc.info/?l=linux-netdev&m=124060437012283&w=2 Fortunatly, Davide Libenzi recently added concept of keyed wakeups into kernel, and particularly for sockets (see commit 37e5540b3c9d838eb20f2ca8ea2eb8072271e403 epoll keyed wakeups: make sockets use keyed wakeups) Davide goal was to optimize epoll, but this new wakeup infrastructure can help non epoll users as well, if they care to setup an appropriate handler. This patch introduces new DEFINE_WAIT_FUNC() helper and uses it in wait_for_packet(), so that only relevant event can wakeup a thread blocked in this function. Trace of function calls from bnx2 TX completion bnx2_poll_work() is : __kfree_skb() skb_release_head_state() sock_wfree() sock_def_write_space() __wake_up_sync_key() __wake_up_common() receiver_wake_function() : Stops here since thread is waiting for an INPUT Reported-by: Christoph Lameter Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/datagram.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index d0de644b378..b01a76abe1d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -64,13 +64,25 @@ static inline int connection_based(struct sock *sk) return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } +static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + unsigned long bits = (unsigned long)key; + + /* + * Avoid a wakeup if event not interesting for us + */ + if (bits && !(bits & (POLLIN | POLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} /* * Wait for a packet.. */ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) { int error; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, receiver_wake_function); prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); -- cgit v1.2.3 From f3784d834c71689336fa272df420b45345cb6b84 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 23 Apr 2009 14:50:54 +0300 Subject: Bluetooth: Ensure that HCI sysfs add/del is preempt safe Use a different work_struct variables for add_conn() and del_conn() and use single work queue instead of two for adding and deleting connections. It eliminates the following error on a preemptible kernel: [ 204.358032] Unable to handle kernel NULL pointer dereference at virtual address 0000000c [ 204.370697] pgd = c0004000 [ 204.373443] [0000000c] *pgd=00000000 [ 204.378601] Internal error: Oops: 17 [#1] PREEMPT [ 204.383361] Modules linked in: vfat fat rfcomm sco l2cap sd_mod scsi_mod iphb pvr2d drm omaplfb ps [ 204.438537] CPU: 0 Not tainted (2.6.28-maemo2 #1) [ 204.443664] PC is at klist_put+0x2c/0xb4 [ 204.447601] LR is at klist_put+0x18/0xb4 [ 204.451568] pc : [] lr : [] psr: a0000113 [ 204.451568] sp : cf1b3f10 ip : cf1b3f10 fp : cf1b3f2c [ 204.463104] r10: 00000000 r9 : 00000000 r8 : bf08029c [ 204.468353] r7 : c7869200 r6 : cfbe2690 r5 : c78692c8 r4 : 00000001 [ 204.474945] r3 : 00000001 r2 : cf1b2000 r1 : 00000001 r0 : 00000000 [ 204.481506] Flags: NzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment kernel [ 204.488861] Control: 10c5387d Table: 887fc018 DAC: 00000017 [ 204.494628] Process btdelconn (pid: 515, stack limit = 0xcf1b22e0) Signed-off-by: Roger Quadros Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sysfs.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index ed82796d4a0..b7c51082dde 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -9,8 +9,7 @@ struct class *bt_class = NULL; EXPORT_SYMBOL_GPL(bt_class); -static struct workqueue_struct *btaddconn; -static struct workqueue_struct *btdelconn; +static struct workqueue_struct *bluetooth; static inline char *link_typetostr(int type) { @@ -88,9 +87,10 @@ static struct device_type bt_link = { static void add_conn(struct work_struct *work) { - struct hci_conn *conn = container_of(work, struct hci_conn, work); + struct hci_conn *conn = container_of(work, struct hci_conn, work_add); - flush_workqueue(btdelconn); + /* ensure previous add/del is complete */ + flush_workqueue(bluetooth); if (device_add(&conn->dev) < 0) { BT_ERR("Failed to register connection device"); @@ -114,9 +114,9 @@ void hci_conn_add_sysfs(struct hci_conn *conn) device_initialize(&conn->dev); - INIT_WORK(&conn->work, add_conn); + INIT_WORK(&conn->work_add, add_conn); - queue_work(btaddconn, &conn->work); + queue_work(bluetooth, &conn->work_add); } /* @@ -131,9 +131,12 @@ static int __match_tty(struct device *dev, void *data) static void del_conn(struct work_struct *work) { - struct hci_conn *conn = container_of(work, struct hci_conn, work); + struct hci_conn *conn = container_of(work, struct hci_conn, work_del); struct hci_dev *hdev = conn->hdev; + /* ensure previous add/del is complete */ + flush_workqueue(bluetooth); + while (1) { struct device *dev; @@ -156,9 +159,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn) if (!device_is_registered(&conn->dev)) return; - INIT_WORK(&conn->work, del_conn); + INIT_WORK(&conn->work_del, del_conn); - queue_work(btdelconn, &conn->work); + queue_work(bluetooth, &conn->work_del); } static inline char *host_typetostr(int type) @@ -435,20 +438,13 @@ void hci_unregister_sysfs(struct hci_dev *hdev) int __init bt_sysfs_init(void) { - btaddconn = create_singlethread_workqueue("btaddconn"); - if (!btaddconn) - return -ENOMEM; - - btdelconn = create_singlethread_workqueue("btdelconn"); - if (!btdelconn) { - destroy_workqueue(btaddconn); + bluetooth = create_singlethread_workqueue("bluetooth"); + if (!bluetooth) return -ENOMEM; - } bt_class = class_create(THIS_MODULE, "bluetooth"); if (IS_ERR(bt_class)) { - destroy_workqueue(btdelconn); - destroy_workqueue(btaddconn); + destroy_workqueue(bluetooth); return PTR_ERR(bt_class); } @@ -457,8 +453,7 @@ int __init bt_sysfs_init(void) void bt_sysfs_cleanup(void) { - destroy_workqueue(btaddconn); - destroy_workqueue(btdelconn); + destroy_workqueue(bluetooth); class_destroy(bt_class); } -- cgit v1.2.3 From 052b30b0a8eec8db5b18ad49effdf2a9ba4c1e1a Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 26 Apr 2009 20:01:22 +0200 Subject: Bluetooth: Add different pairing timeout for Legacy Pairing The Bluetooth stack uses a reference counting for all established ACL links and if no user (L2CAP connection) is present, the link will be terminated to save power. The problem part is the dedicated pairing when using Legacy Pairing (Bluetooth 2.0 and before). At that point no user is present and pairing attempts will be disconnected within 10 seconds or less. In previous kernel version this was not a problem since the disconnect timeout wasn't triggered on incoming connections for the first time. However this caused issues with broken host stacks that kept the connections around after dedicated pairing. When the support for Simple Pairing got added, the link establishment procedure needed to be changed and now causes issues when using Legacy Pairing When using Simple Pairing it is possible to do a proper reference counting of ACL link users. With Legacy Pairing this is not possible since the specification is unclear in some areas and too many broken Bluetooth devices have already been deployed. So instead of trying to deal with all the broken devices, a special pairing timeout will be introduced that increases the timeout to 60 seconds when pairing is triggered. If a broken devices now puts the stack into an unforeseen state, the worst that happens is the disconnect timeout triggers after 120 seconds instead of 4 seconds. This allows successful pairings with legacy and broken devices now. Based on a report by Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 1 + net/bluetooth/hci_event.c | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 1181db08d9d..75ebbe2221a 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -215,6 +215,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) conn->state = BT_OPEN; conn->power_save = 1; + conn->disc_timeout = HCI_DISCONN_TIMEOUT; switch (type) { case ACL_LINK: diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 15f40ea8d54..4e7cb88e5da 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -883,6 +883,7 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; } else conn->state = BT_CONNECTED; @@ -1063,9 +1064,14 @@ static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *s hci_proto_connect_cfm(conn, ev->status); hci_conn_put(conn); } - } else + } else { hci_auth_cfm(conn, ev->status); + hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; + hci_conn_put(conn); + } + if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) { if (!ev->status) { struct hci_cp_set_conn_encrypt cp; @@ -1479,7 +1485,21 @@ static inline void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb static inline void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb) { + struct hci_ev_pin_code_req *ev = (void *) skb->data; + struct hci_conn *conn; + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (conn) { + hci_conn_hold(conn); + conn->disc_timeout = HCI_PAIRING_TIMEOUT; + hci_conn_put(conn); + } + + hci_dev_unlock(hdev); } static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb) @@ -1489,7 +1509,21 @@ static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff static inline void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) { + struct hci_ev_link_key_notify *ev = (void *) skb->data; + struct hci_conn *conn; + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (conn) { + hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; + hci_conn_put(conn); + } + + hci_dev_unlock(hdev); } static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb) -- cgit v1.2.3 From 3fdca1e1370ffe89980927cdef0583bebcd8caaf Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 28 Apr 2009 09:04:55 -0700 Subject: Bluetooth: Fix connection establishment with low security requirement The Bluetooth 2.1 specification introduced four different security modes that can be mapped using Legacy Pairing and Simple Pairing. With the usage of Simple Pairing it is required that all connections (except the ones for SDP) are encrypted. So even the low security requirement mandates an encrypted connection when using Simple Pairing. When using Legacy Pairing (for Bluetooth 2.0 devices and older) this is not required since it causes interoperability issues. To support this properly the low security requirement translates into different host controller transactions depending if Simple Pairing is supported or not. However in case of Simple Pairing the command to switch on encryption after a successful authentication is not triggered for the low security mode. This patch fixes this and actually makes the logic to differentiate between Simple Pairing and Legacy Pairing a lot simpler. Based on a report by Ville Tervo Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 75ebbe2221a..375f4b4f7f7 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -425,12 +425,9 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) if (sec_level == BT_SECURITY_SDP) return 1; - if (sec_level == BT_SECURITY_LOW) { - if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) - return hci_conn_auth(conn, sec_level, auth_type); - else - return 1; - } + if (sec_level == BT_SECURITY_LOW && + (!conn->ssp_mode || !conn->hdev->ssp_mode)) + return 1; if (conn->link_mode & HCI_LM_ENCRYPT) return hci_conn_auth(conn, sec_level, auth_type); -- cgit v1.2.3 From 942e4a2bd680c606af0211e64eb216be2e19bf61 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 28 Apr 2009 22:36:33 -0700 Subject: netfilter: revised locking for x_tables The x_tables are organized with a table structure and a per-cpu copies of the counters and rules. On older kernels there was a reader/writer lock per table which was a performance bottleneck. In 2.6.30-rc, this was converted to use RCU and the counters/rules which solved the performance problems for do_table but made replacing rules much slower because of the necessary RCU grace period. This version uses a per-cpu set of spinlocks and counters to allow to table processing to proceed without the cache thrashing of a global reader lock and keeps the same performance for table updates. Signed-off-by: Stephen Hemminger Acked-by: Linus Torvalds Signed-off-by: David S. Miller --- net/ipv4/netfilter/arp_tables.c | 125 ++++++++++++--------------------------- net/ipv4/netfilter/ip_tables.c | 126 +++++++++++----------------------------- net/ipv6/netfilter/ip6_tables.c | 123 ++++++++++++--------------------------- net/netfilter/x_tables.c | 53 +++++++++-------- 4 files changed, 136 insertions(+), 291 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 5ba533d234d..831fe1879dc 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + (2 * skb->dev->addr_len); + ADD_COUNTER(e->counters, hdr_len, 1); t = arpt_get_target(e); @@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); if (hotdrop) return NF_DROP; @@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. + * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; ARPT_ENTRY_ITERATE(t->entries[curcpu], @@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); ARPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } -} - - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - ARPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } -static inline int -zero_entry_counter(struct arpt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; - - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ + return ERR_PTR(-ENOMEM); - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int copy_entries_to_user(unsigned int total_size, @@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct arpt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - mutex_lock(&t->lock); + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[smp_processor_id()]; + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + xt_info_wrlock(curcpu); ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); unlock_up_free: - mutex_unlock(&t->lock); - + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 810c0b62c7d..2ec8d7290c4 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb, tgpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters - * with data used by 'current' CPU - * We dont care about preemption here. + * with data used by 'current' CPU. + * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; IPT_ENTRY_ITERATE(t->entries[curcpu], @@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); IPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } - -} - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } - -static inline int -zero_entry_counter(struct ipt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; + return ERR_PTR(-ENOMEM); - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct ipt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat goto free; } - mutex_lock(&t->lock); + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + xt_info_wrlock(curcpu); IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); unlock_up_free: - mutex_unlock(&t->lock); + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 800ae854247..219e165aea1 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. + * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; IP6T_ENTRY_ITERATE(t->entries[curcpu], @@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } -} - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IP6T_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } -static inline int -zero_entry_counter(struct ip6t_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; + return ERR_PTR(-ENOMEM); - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct ip6t_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - mutex_lock(&t->lock); + + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + curcpu = smp_processor_id(); + xt_info_wrlock(curcpu); + loc_cpu_entry = private->entries[curcpu]; IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); + unlock_up_free: - mutex_unlock(&t->lock); + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 509a95621f9..150e5cf62f8 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info) } EXPORT_SYMBOL(xt_free_table_info); -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, - struct xt_table_info *newinfo) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) { - void *p = oldinfo->entries[cpu]; - rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); - newinfo->entries[cpu] = p; - } - -} -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); - /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) @@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af) EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif +DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks); +EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); + + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error) { - struct xt_table_info *oldinfo, *private; + struct xt_table_info *private; /* Do the substitution. */ - mutex_lock(&table->lock); + local_bh_disable(); private = table->private; + /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { duprintf("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - mutex_unlock(&table->lock); + local_bh_enable(); *error = -EAGAIN; return NULL; } - oldinfo = private; - rcu_assign_pointer(table->private, newinfo); - newinfo->initial_entries = oldinfo->initial_entries; - mutex_unlock(&table->lock); - synchronize_net(); - return oldinfo; + table->private = newinfo; + newinfo->initial_entries = private->initial_entries; + + /* + * Even though table entries have now been swapped, other CPU's + * may still be using the old entries. This is okay, because + * resynchronization happens because of the locking done + * during the get_counters() routine. + */ + local_bh_enable(); + + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table, /* Simplifies replace_table code. */ table->private = bootstrap; - mutex_init(&table->lock); if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; @@ -1147,7 +1143,14 @@ static struct pernet_operations xt_net_ops = { static int __init xt_init(void) { - int i, rv; + unsigned int i; + int rv; + + for_each_possible_cpu(i) { + struct xt_info_lock *lock = &per_cpu(xt_info_locks, i); + spin_lock_init(&lock->lock); + lock->readers = 0; + } xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); if (!xt) -- cgit v1.2.3 From d4c4a9a1bce1912ed5681251f0037fd4f2364a3e Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Wed, 29 Apr 2009 11:41:24 +0100 Subject: mac80211: fix modprobe deadlock by not calling wep_init under rtnl_lock - ieee80211_wep_init(), which is called with rtnl_lock held, blocks in request_module() [waiting for modprobe to load a crypto module]. - modprobe blocks in a call to flush_workqueue(), when it closes a TTY [presumably when it exits]. - The workqueue item linkwatch_event() blocks on rtnl_lock. There's no reason for wep_init() to be called with rtnl_lock held, so just move it outside the critical section. Signed-off-by: Alan Jenkins Signed-off-by: John W. Linville --- net/mac80211/main.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index fbcbed6cad0..00968c2d22b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -909,6 +909,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (result < 0) goto fail_sta_info; + result = ieee80211_wep_init(local); + if (result < 0) { + printk(KERN_DEBUG "%s: Failed to initialize wep: %d\n", + wiphy_name(local->hw.wiphy), result); + goto fail_wep; + } + rtnl_lock(); result = dev_alloc_name(local->mdev, local->mdev->name); if (result < 0) @@ -930,14 +937,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) goto fail_rate; } - result = ieee80211_wep_init(local); - - if (result < 0) { - printk(KERN_DEBUG "%s: Failed to initialize wep: %d\n", - wiphy_name(local->hw.wiphy), result); - goto fail_wep; - } - /* add one default STA interface if supported */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION)) { result = ieee80211_if_add(local, "wlan%d", NULL, @@ -967,13 +966,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) return 0; -fail_wep: - rate_control_deinitialize(local); fail_rate: unregister_netdevice(local->mdev); local->mdev = NULL; fail_dev: rtnl_unlock(); + ieee80211_wep_free(local); +fail_wep: sta_info_stop(local); fail_sta_info: debugfs_hw_del(local); -- cgit v1.2.3 From c428c89201a57a0ce24c37ed79e540d1f4101cf3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 29 Apr 2009 00:28:18 +0200 Subject: mac80211: default to automatic power control In "mac80211: correct wext transmit power handler" I fixed the wext handler, but forgot to make the default of the user_power_level -1 (aka "auto"), so that now the transmit power is always set to 0, causing associations to time out and similar problems since we're transmitting with very little power. Correct this by correcting the default user_power_level to -1. Signed-off-by: Johannes Berg Bisected-by: Niel Lambrechts Signed-off-by: John W. Linville --- net/mac80211/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 00968c2d22b..14134193cd1 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -757,6 +757,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, local->hw.conf.long_frame_max_tx_count = 4; local->hw.conf.short_frame_max_tx_count = 7; local->hw.conf.radio_enabled = true; + local->user_power_level = -1; INIT_LIST_HEAD(&local->interfaces); mutex_init(&local->iflist_mtx); -- cgit v1.2.3 From 7a67e56fd362d3edfde1f19170893508c3940d3a Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Thu, 30 Apr 2009 05:41:19 -0700 Subject: net: Fix oops when splicing skbs from a frag_list. Lennert Buytenhek wrote: > Since 4fb669948116d928ae44262ab7743732c574630d ("net: Optimize memory > usage when splicing from sockets.") I'm seeing this oops (e.g. in > 2.6.30-rc3) when splicing from a TCP socket to /dev/null on a driver > (mv643xx_eth) that uses LRO in the skb mode (lro_receive_skb) rather > than the frag mode: My patch incorrectly assumed skb->sk was always valid, but for "frag_listed" skbs we can only use skb->sk of their parent. Reported-by: Lennert Buytenhek Debugged-by: Lennert Buytenhek Tested-by: Lennert Buytenhek Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/skbuff.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ce6356cd9f7..f091a5a845c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1365,9 +1365,8 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) static inline struct page *linear_to_page(struct page *page, unsigned int *len, unsigned int *offset, - struct sk_buff *skb) + struct sk_buff *skb, struct sock *sk) { - struct sock *sk = skb->sk; struct page *p = sk->sk_sndmsg_page; unsigned int off; @@ -1405,13 +1404,14 @@ new_page: */ static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, unsigned int *len, unsigned int offset, - struct sk_buff *skb, int linear) + struct sk_buff *skb, int linear, + struct sock *sk) { if (unlikely(spd->nr_pages == PIPE_BUFFERS)) return 1; if (linear) { - page = linear_to_page(page, len, &offset, skb); + page = linear_to_page(page, len, &offset, skb, sk); if (!page) return 1; } else @@ -1442,7 +1442,8 @@ static inline void __segment_seek(struct page **page, unsigned int *poff, static inline int __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct sk_buff *skb, - struct splice_pipe_desc *spd, int linear) + struct splice_pipe_desc *spd, int linear, + struct sock *sk) { if (!*len) return 1; @@ -1465,7 +1466,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff, /* the linear region may spread across several pages */ flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - if (spd_fill_page(spd, page, &flen, poff, skb, linear)) + if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk)) return 1; __segment_seek(&page, &poff, &plen, flen); @@ -1481,8 +1482,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff, * pipe is full or if we already spliced the requested length. */ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, - unsigned int *len, - struct splice_pipe_desc *spd) + unsigned int *len, struct splice_pipe_desc *spd, + struct sock *sk) { int seg; @@ -1492,7 +1493,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, if (__splice_segment(virt_to_page(skb->data), (unsigned long) skb->data & (PAGE_SIZE - 1), skb_headlen(skb), - offset, len, skb, spd, 1)) + offset, len, skb, spd, 1, sk)) return 1; /* @@ -1502,7 +1503,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (__splice_segment(f->page, f->page_offset, f->size, - offset, len, skb, spd, 0)) + offset, len, skb, spd, 0, sk)) return 1; } @@ -1528,12 +1529,13 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .ops = &sock_pipe_buf_ops, .spd_release = sock_spd_release, }; + struct sock *sk = skb->sk; /* * __skb_splice_bits() only fails if the output has no room left, * so no point in going over the frag_list for the error case. */ - if (__skb_splice_bits(skb, &offset, &tlen, &spd)) + if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk)) goto done; else if (!tlen) goto done; @@ -1545,14 +1547,13 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, struct sk_buff *list = skb_shinfo(skb)->frag_list; for (; list && tlen; list = list->next) { - if (__skb_splice_bits(list, &offset, &tlen, &spd)) + if (__skb_splice_bits(list, &offset, &tlen, &spd, sk)) break; } } done: if (spd.nr_pages) { - struct sock *sk = skb->sk; int ret; /* -- cgit v1.2.3 From ec581f6a42bbbea5271c66da9769a41b46c74e10 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 May 2009 09:05:06 -0700 Subject: net: Fix skb_tx_hash() for forwarding workloads. When skb_rx_queue_recorded() is true, we dont want to use jash distribution as the device driver exactly told us which queue was selected at RX time. jhash makes a statistical shuffle, but this wont work with 8 static inputs. Later improvements would be to compute reciprocal value of real_num_tx_queues to avoid a divide here. But this computation should be done once, when real_num_tx_queues is set. This needs a separate patch, and a new field in struct net_device. Reported-by: Andrew Dickinson Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 308a7d0c277..e2e9e4af3ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1735,11 +1735,12 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) { u32 hash; - if (skb_rx_queue_recorded(skb)) { - hash = skb_get_rx_queue(skb); - } else if (skb->sk && skb->sk->sk_hash) { + if (skb_rx_queue_recorded(skb)) + return skb_get_rx_queue(skb) % dev->real_num_tx_queues; + + if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; - } else + else hash = skb->protocol; hash = jhash_1word(hash, skb_tx_hashrnd); -- cgit v1.2.3 From acda074390270ca9e28f2a9729f7b835e2ad6da4 Mon Sep 17 00:00:00 2001 From: Laszlo Attila Toth Date: Fri, 1 May 2009 15:23:10 -0700 Subject: xt_socket: checks for the state of nf_conntrack xt_socket can use connection tracking, and checks whether it is a module. Signed-off-by: Laszlo Attila Toth Signed-off-by: David S. Miller --- net/netfilter/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 881203c4a14..cb3ad741ebf 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -837,6 +837,7 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_TPROXY depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED + depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DEFRAG_IPV4 help This option adds a `socket' match, which can be used to match -- cgit v1.2.3 From d0ab8ff81bf1b01bae7d6b92ca067badbbb02cc9 Mon Sep 17 00:00:00 2001 From: Robert Love Date: Sat, 2 May 2009 13:48:32 -0700 Subject: net: Only store high 16 bits of kernel generated filter priorities The kernel should only be using the high 16 bits of a kernel generated priority. Filter priorities in all other cases only use the upper 16 bits of the u32 'prio' field of 'struct tcf_proto', but when the kernel generates the priority of a filter is saves all 32 bits which can result in incorrect lookup failures when a filter needs to be deleted or modified. Signed-off-by: Robert Love Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 173fcc4b050..0759f32e9dc 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -254,7 +254,7 @@ replay: } tp->ops = tp_ops; tp->protocol = protocol; - tp->prio = nprio ? : tcf_auto_prio(*back); + tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(*back)); tp->q = q; tp->classify = tp_ops->classify; tp->classid = parent; -- cgit v1.2.3 From 902e5ea15f8471a3213a37b11b98196f3406aeaf Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Sat, 2 May 2009 13:49:36 -0700 Subject: Subject: [PATCH] br2684: restore net_dev initialization Commit 0ba25ff4c669e5395110ba6ab4958a97a9f96922 ("br2684: convert to net_device_ops") inadvertently deleted the initialization of the net_dev pointer in the br2684_dev structure, leading to crashes. This patch adds it back. Reported-by: Mikko Vinni Tested-by: Mikko Vinni Signed-off-by: Rabin Vincent Signed-off-by: David S. Miller --- net/atm/br2684.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 334fcd4a4ea..3100a8940af 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -549,6 +549,7 @@ static void br2684_setup(struct net_device *netdev) struct br2684_dev *brdev = BRPRIV(netdev); ether_setup(netdev); + brdev->net_dev = netdev; netdev->netdev_ops = &br2684_netdev_ops; -- cgit v1.2.3 From 0c266898b42fe4e4e2f9edfc9d3474c10f93aa6a Mon Sep 17 00:00:00 2001 From: Satoru SATOH Date: Mon, 4 May 2009 11:11:01 -0700 Subject: tcp: Fix tcp_prequeue() to get correct rto_min value tcp_prequeue() refers to the constant value (TCP_RTO_MIN) regardless of the actual value might be tuned. The following patches fix this and make tcp_prequeue get the actual value returns from tcp_rto_min(). Signed-off-by: Satoru SATOH Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c96a6bb2543..eec3e6f9956 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -597,16 +597,6 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_grow_window(sk, skb); } -static u32 tcp_rto_min(struct sock *sk) -{ - struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = TCP_RTO_MIN; - - if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) - rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); - return rto_min; -} - /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge -- cgit v1.2.3 From a67e899cf38ae542d1a028ccd021f9189f76fb74 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 2 May 2009 18:24:06 -0700 Subject: Bluetooth: Fix issue with sysfs handling for connections Due to a semantic changes in flush_workqueue() the current approach of synchronizing the sysfs handling for connections doesn't work anymore. The whole approach is actually fully broken and based on assumptions that are no longer valid. With the introduction of Simple Pairing support, the creation of low-level ACL links got changed. This change invalidates the reason why in the past two independent work queues have been used for adding/removing sysfs devices. The adding of the actual sysfs device is now postponed until the host controller successfully assigns an unique handle to that link. So the real synchronization happens inside the controller and not the host. The only left-over problem is that some internals of the sysfs device handling are not initialized ahead of time. This leaves potential access to invalid data and can cause various NULL pointer dereferences. To fix this a new function makes sure that all sysfs details are initialized when an connection attempt is made. The actual sysfs device is only registered when the connection has been successfully established. To avoid a race condition with the registration, the check if a device is registered has been moved into the removal work. As an extra protection two flush_work() calls are left in place to make sure a previous add/del work has been completed first. Based on a report by Marc Pignat Signed-off-by: Marcel Holtmann Tested-by: Justin P. Mattock Tested-by: Roger Quadros Tested-by: Marc Pignat --- net/bluetooth/hci_conn.c | 2 ++ net/bluetooth/hci_sysfs.c | 74 +++++++++++++++++++++++++---------------------- 2 files changed, 42 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 375f4b4f7f7..61309b26f27 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -248,6 +248,8 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); + hci_conn_init_sysfs(conn); + tasklet_enable(&hdev->tx_task); return conn; diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index b7c51082dde..582d8877078 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -9,7 +9,7 @@ struct class *bt_class = NULL; EXPORT_SYMBOL_GPL(bt_class); -static struct workqueue_struct *bluetooth; +static struct workqueue_struct *bt_workq; static inline char *link_typetostr(int type) { @@ -89,8 +89,8 @@ static void add_conn(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, work_add); - /* ensure previous add/del is complete */ - flush_workqueue(bluetooth); + /* ensure previous del is complete */ + flush_work(&conn->work_del); if (device_add(&conn->dev) < 0) { BT_ERR("Failed to register connection device"); @@ -98,27 +98,6 @@ static void add_conn(struct work_struct *work) } } -void hci_conn_add_sysfs(struct hci_conn *conn) -{ - struct hci_dev *hdev = conn->hdev; - - BT_DBG("conn %p", conn); - - conn->dev.type = &bt_link; - conn->dev.class = bt_class; - conn->dev.parent = &hdev->dev; - - dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle); - - dev_set_drvdata(&conn->dev, conn); - - device_initialize(&conn->dev); - - INIT_WORK(&conn->work_add, add_conn); - - queue_work(bluetooth, &conn->work_add); -} - /* * The rfcomm tty device will possibly retain even when conn * is down, and sysfs doesn't support move zombie device, @@ -134,8 +113,11 @@ static void del_conn(struct work_struct *work) struct hci_conn *conn = container_of(work, struct hci_conn, work_del); struct hci_dev *hdev = conn->hdev; - /* ensure previous add/del is complete */ - flush_workqueue(bluetooth); + /* ensure previous add is complete */ + flush_work(&conn->work_add); + + if (!device_is_registered(&conn->dev)) + return; while (1) { struct device *dev; @@ -152,16 +134,40 @@ static void del_conn(struct work_struct *work) hci_dev_put(hdev); } -void hci_conn_del_sysfs(struct hci_conn *conn) +void hci_conn_init_sysfs(struct hci_conn *conn) { + struct hci_dev *hdev = conn->hdev; + BT_DBG("conn %p", conn); - if (!device_is_registered(&conn->dev)) - return; + conn->dev.type = &bt_link; + conn->dev.class = bt_class; + conn->dev.parent = &hdev->dev; + + dev_set_drvdata(&conn->dev, conn); + device_initialize(&conn->dev); + + INIT_WORK(&conn->work_add, add_conn); INIT_WORK(&conn->work_del, del_conn); +} + +void hci_conn_add_sysfs(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + + BT_DBG("conn %p", conn); + + dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle); + + queue_work(bt_workq, &conn->work_add); +} + +void hci_conn_del_sysfs(struct hci_conn *conn) +{ + BT_DBG("conn %p", conn); - queue_work(bluetooth, &conn->work_del); + queue_work(bt_workq, &conn->work_del); } static inline char *host_typetostr(int type) @@ -438,13 +444,13 @@ void hci_unregister_sysfs(struct hci_dev *hdev) int __init bt_sysfs_init(void) { - bluetooth = create_singlethread_workqueue("bluetooth"); - if (!bluetooth) + bt_workq = create_singlethread_workqueue("bluetooth"); + if (!bt_workq) return -ENOMEM; bt_class = class_create(THIS_MODULE, "bluetooth"); if (IS_ERR(bt_class)) { - destroy_workqueue(bluetooth); + destroy_workqueue(bt_workq); return PTR_ERR(bt_class); } @@ -453,7 +459,7 @@ int __init bt_sysfs_init(void) void bt_sysfs_cleanup(void) { - destroy_workqueue(bluetooth); + destroy_workqueue(bt_workq); class_destroy(bt_class); } -- cgit v1.2.3