From 32b21e034be9954eaae0278df20e0251eb958ded Mon Sep 17 00:00:00 2001 From: Denis Cheng Date: Tue, 28 Aug 2007 15:41:11 -0700 Subject: [NETLINK]: use container_of instead This could make future redesign of struct netlink_sock easier. Signed-off-by: Denis Cheng Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5681ce3aebc..a78d962e2c7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -88,7 +88,7 @@ struct netlink_sock { static inline struct netlink_sock *nlk_sk(struct sock *sk) { - return (struct netlink_sock *)sk; + return container_of(sk, struct netlink_sock, sk); } struct nl_pid_hash { -- cgit v1.2.3 From 457c4cbc5a3dde259d2a1f15d5f9785290397267 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Sep 2007 12:01:34 +0200 Subject: [NET]: Make /proc/net per network namespace This patch makes /proc/net per network namespace. It modifies the global variables proc_net and proc_net_stat to be per network namespace. The proc_net file helpers are modified to take a network namespace argument, and all of their callers are fixed to pass &init_net for that argument. This ensures that all of the /proc/net files are only visible and usable in the initial network namespace until the code behind them has been updated to be handle multiple network namespaces. Making /proc/net per namespace is necessary as at least some files in /proc/net depend upon the set of network devices which is per network namespace, and even more files in /proc/net have contents that are relevant to a single network namespace. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a78d962e2c7..3982f13dab1 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -57,6 +57,7 @@ #include #include +#include #include #include #include @@ -1927,7 +1928,7 @@ static int __init netlink_proto_init(void) sock_register(&netlink_family_ops); #ifdef CONFIG_PROC_FS - proc_net_fops_create("netlink", 0, &netlink_seq_fops); + proc_net_fops_create(&init_net, "netlink", 0, &netlink_seq_fops); #endif /* The netlink device handler may be needed early. */ rtnetlink_init(); -- cgit v1.2.3 From 1b8d7ae42d02e483ad94035cca851e4f7fbecb40 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 8 Oct 2007 23:24:22 -0700 Subject: [NET]: Make socket creation namespace safe. This patch passes in the namespace a new socket should be created in and has the socket code do the appropriate reference counting. By virtue of this all socket create methods are touched. In addition the socket create methods are modified so that they will fail if you attempt to create a socket in a non-default network namespace. Failing if we attempt to create a socket outside of the default network namespace ensures that as we incrementally make the network stack network namespace aware we will not export functionality that someone has not audited and made certain is network namespace safe. Allowing us to partially enable network namespaces before all of the exotic protocols are supported. Any protocol layers I have missed will fail to compile because I now pass an extra parameter into the socket creation code. [ Integrated AF_IUCV build fixes from Andrew Morton... -DaveM ] Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3982f13dab1..406a493300d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -384,15 +384,15 @@ static struct proto netlink_proto = { .obj_size = sizeof(struct netlink_sock), }; -static int __netlink_create(struct socket *sock, struct mutex *cb_mutex, - int protocol) +static int __netlink_create(struct net *net, struct socket *sock, + struct mutex *cb_mutex, int protocol) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; - sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); + sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); if (!sk) return -ENOMEM; @@ -412,13 +412,16 @@ static int __netlink_create(struct socket *sock, struct mutex *cb_mutex, return 0; } -static int netlink_create(struct socket *sock, int protocol) +static int netlink_create(struct net *net, struct socket *sock, int protocol) { struct module *module = NULL; struct mutex *cb_mutex; struct netlink_sock *nlk; int err = 0; + if (net != &init_net) + return -EAFNOSUPPORT; + sock->state = SS_UNCONNECTED; if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) @@ -441,7 +444,7 @@ static int netlink_create(struct socket *sock, int protocol) cb_mutex = nl_table[protocol].cb_mutex; netlink_unlock_table(); - if ((err = __netlink_create(sock, cb_mutex, protocol)) < 0) + if ((err = __netlink_create(net, sock, cb_mutex, protocol)) < 0) goto out_module; nlk = nlk_sk(sock->sk); @@ -1318,7 +1321,7 @@ netlink_kernel_create(int unit, unsigned int groups, if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - if (__netlink_create(sock, cb_mutex, unit) < 0) + if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) goto out_sock_release; if (groups < 32) -- cgit v1.2.3 From b4b510290b056b86611757ce1175a230f1080f53 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Sep 2007 13:05:38 +0200 Subject: [NET]: Support multiple network namespaces with netlink Each netlink socket will live in exactly one network namespace, this includes the controlling kernel sockets. This patch updates all of the existing netlink protocols to only support the initial network namespace. Request by clients in other namespaces will get -ECONREFUSED. As they would if the kernel did not have the support for that netlink protocol compiled in. As each netlink protocol is updated to be multiple network namespace safe it can register multiple kernel sockets to acquire a presence in the rest of the network namespaces. The implementation in af_netlink is a simple filter implementation at hash table insertion and hash table look up time. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 104 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 26 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 406a493300d..3029f865cd6 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -211,7 +211,7 @@ netlink_unlock_table(void) wake_up(&nl_table_wait); } -static __inline__ struct sock *netlink_lookup(int protocol, u32 pid) +static __inline__ struct sock *netlink_lookup(struct net *net, int protocol, u32 pid) { struct nl_pid_hash *hash = &nl_table[protocol].hash; struct hlist_head *head; @@ -221,7 +221,7 @@ static __inline__ struct sock *netlink_lookup(int protocol, u32 pid) read_lock(&nl_table_lock); head = nl_pid_hashfn(hash, pid); sk_for_each(sk, node, head) { - if (nlk_sk(sk)->pid == pid) { + if ((sk->sk_net == net) && (nlk_sk(sk)->pid == pid)) { sock_hold(sk); goto found; } @@ -328,7 +328,7 @@ netlink_update_listeners(struct sock *sk) * makes sure updates are visible before bind or setsockopt return. */ } -static int netlink_insert(struct sock *sk, u32 pid) +static int netlink_insert(struct sock *sk, struct net *net, u32 pid) { struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; struct hlist_head *head; @@ -341,7 +341,7 @@ static int netlink_insert(struct sock *sk, u32 pid) head = nl_pid_hashfn(hash, pid); len = 0; sk_for_each(osk, node, head) { - if (nlk_sk(osk)->pid == pid) + if ((osk->sk_net == net) && (nlk_sk(osk)->pid == pid)) break; len++; } @@ -419,9 +419,6 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol) struct netlink_sock *nlk; int err = 0; - if (net != &init_net) - return -EAFNOSUPPORT; - sock->state = SS_UNCONNECTED; if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) @@ -481,6 +478,7 @@ static int netlink_release(struct socket *sock) if (nlk->pid && !nlk->subscriptions) { struct netlink_notify n = { + .net = sk->sk_net, .protocol = sk->sk_protocol, .pid = nlk->pid, }; @@ -509,6 +507,7 @@ static int netlink_release(struct socket *sock) static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; struct hlist_head *head; struct sock *osk; @@ -522,6 +521,8 @@ retry: netlink_table_grab(); head = nl_pid_hashfn(hash, pid); sk_for_each(osk, node, head) { + if ((osk->sk_net != net)) + continue; if (nlk_sk(osk)->pid == pid) { /* Bind collision, search negative pid values. */ pid = rover--; @@ -533,7 +534,7 @@ retry: } netlink_table_ungrab(); - err = netlink_insert(sk, pid); + err = netlink_insert(sk, net, pid); if (err == -EADDRINUSE) goto retry; @@ -598,6 +599,7 @@ static int netlink_realloc_groups(struct sock *sk) static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; @@ -619,7 +621,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len return -EINVAL; } else { err = nladdr->nl_pid ? - netlink_insert(sk, nladdr->nl_pid) : + netlink_insert(sk, net, nladdr->nl_pid) : netlink_autobind(sock); if (err) return err; @@ -703,10 +705,12 @@ static void netlink_overrun(struct sock *sk) static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) { int protocol = ssk->sk_protocol; + struct net *net; struct sock *sock; struct netlink_sock *nlk; - sock = netlink_lookup(protocol, pid); + net = ssk->sk_net; + sock = netlink_lookup(net, protocol, pid); if (!sock) return ERR_PTR(-ECONNREFUSED); @@ -887,6 +891,7 @@ static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff struct netlink_broadcast_data { struct sock *exclude_sk; + struct net *net; u32 pid; u32 group; int failure; @@ -909,6 +914,9 @@ static inline int do_one_broadcast(struct sock *sk, !test_bit(p->group - 1, nlk->groups)) goto out; + if ((sk->sk_net != p->net)) + goto out; + if (p->failure) { netlink_overrun(sk); goto out; @@ -947,6 +955,7 @@ out: int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, u32 group, gfp_t allocation) { + struct net *net = ssk->sk_net; struct netlink_broadcast_data info; struct hlist_node *node; struct sock *sk; @@ -954,6 +963,7 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, skb = netlink_trim(skb, allocation); info.exclude_sk = ssk; + info.net = net; info.pid = pid; info.group = group; info.failure = 0; @@ -1002,6 +1012,9 @@ static inline int do_one_set_err(struct sock *sk, if (sk == p->exclude_sk) goto out; + if (sk->sk_net != p->exclude_sk->sk_net) + goto out; + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; @@ -1304,7 +1317,7 @@ static void netlink_data_ready(struct sock *sk, int len) */ struct sock * -netlink_kernel_create(int unit, unsigned int groups, +netlink_kernel_create(struct net *net, int unit, unsigned int groups, void (*input)(struct sock *sk, int len), struct mutex *cb_mutex, struct module *module) { @@ -1321,7 +1334,7 @@ netlink_kernel_create(int unit, unsigned int groups, if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) + if (__netlink_create(net, sock, cb_mutex, unit) < 0) goto out_sock_release; if (groups < 32) @@ -1336,18 +1349,20 @@ netlink_kernel_create(int unit, unsigned int groups, if (input) nlk_sk(sk)->data_ready = input; - if (netlink_insert(sk, 0)) + if (netlink_insert(sk, net, 0)) goto out_sock_release; nlk = nlk_sk(sk); nlk->flags |= NETLINK_KERNEL_SOCKET; netlink_table_grab(); - nl_table[unit].groups = groups; - nl_table[unit].listeners = listeners; - nl_table[unit].cb_mutex = cb_mutex; - nl_table[unit].module = module; - nl_table[unit].registered = 1; + if (!nl_table[unit].registered) { + nl_table[unit].groups = groups; + nl_table[unit].listeners = listeners; + nl_table[unit].cb_mutex = cb_mutex; + nl_table[unit].module = module; + nl_table[unit].registered = 1; + } netlink_table_ungrab(); return sk; @@ -1513,7 +1528,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, atomic_inc(&skb->users); cb->skb = skb; - sk = netlink_lookup(ssk->sk_protocol, NETLINK_CB(skb).pid); + sk = netlink_lookup(ssk->sk_net, ssk->sk_protocol, NETLINK_CB(skb).pid); if (sk == NULL) { netlink_destroy_callback(cb); return -ECONNREFUSED; @@ -1555,7 +1570,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) if (!skb) { struct sock *sk; - sk = netlink_lookup(in_skb->sk->sk_protocol, + sk = netlink_lookup(in_skb->sk->sk_net, + in_skb->sk->sk_protocol, NETLINK_CB(in_skb).pid); if (sk) { sk->sk_err = ENOBUFS; @@ -1706,6 +1722,7 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, #ifdef CONFIG_PROC_FS struct nl_seq_iter { + struct net *net; int link; int hash_idx; }; @@ -1723,6 +1740,8 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) for (j = 0; j <= hash->mask; j++) { sk_for_each(s, node, &hash->table[j]) { + if (iter->net != s->sk_net) + continue; if (off == pos) { iter->link = i; iter->hash_idx = j; @@ -1752,11 +1771,14 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (v == SEQ_START_TOKEN) return netlink_seq_socket_idx(seq, 0); - s = sk_next(v); + iter = seq->private; + s = v; + do { + s = sk_next(s); + } while (s && (iter->net != s->sk_net)); if (s) return s; - iter = seq->private; i = iter->link; j = iter->hash_idx + 1; @@ -1765,6 +1787,8 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) for (; j <= hash->mask; j++) { s = sk_head(&hash->table[j]); + while (s && (iter->net != s->sk_net)) + s = sk_next(s); if (s) { iter->link = i; iter->hash_idx = j; @@ -1835,15 +1859,24 @@ static int netlink_seq_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = iter; + iter->net = get_net(PROC_NET(inode)); return 0; } +static int netlink_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct nl_seq_iter *iter = seq->private; + put_net(iter->net); + return seq_release_private(inode, file); +} + static const struct file_operations netlink_seq_fops = { .owner = THIS_MODULE, .open = netlink_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = netlink_seq_release, }; #endif @@ -1885,6 +1918,27 @@ static struct net_proto_family netlink_family_ops = { .owner = THIS_MODULE, /* for consistency 8) */ }; +static int netlink_net_init(struct net *net) +{ +#ifdef CONFIG_PROC_FS + if (!proc_net_fops_create(net, "netlink", 0, &netlink_seq_fops)) + return -ENOMEM; +#endif + return 0; +} + +static void netlink_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove(net, "netlink"); +#endif +} + +static struct pernet_operations netlink_net_ops = { + .init = netlink_net_init, + .exit = netlink_net_exit, +}; + static int __init netlink_proto_init(void) { struct sk_buff *dummy_skb; @@ -1930,9 +1984,7 @@ static int __init netlink_proto_init(void) } sock_register(&netlink_family_ops); -#ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "netlink", 0, &netlink_seq_fops); -#endif + register_pernet_subsys(&netlink_net_ops); /* The netlink device handler may be needed early. */ rtnetlink_init(); out: -- cgit v1.2.3 From 077130c0cf7d5ba1992f5b51b96136d7b1c8aad5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 13 Sep 2007 09:18:57 +0200 Subject: [NET]: Fix race when opening a proc file while a network namespace is exiting. The problem: proc_net files remember which network namespace the are against but do not remember hold a reference count (as that would pin the network namespace). So we currently have a small window where the reference count on a network namespace may be incremented when opening a /proc file when it has already gone to zero. To fix this introduce maybe_get_net and get_proc_net. maybe_get_net increments the network namespace reference count only if it is greater then zero, ensuring we don't increment a reference count after it has gone to zero. get_proc_net handles all of the magic to go from a proc inode to the network namespace instance and call maybe_get_net on it. PROC_NET the old accessor is removed so that we don't get confused and use the wrong helper function. Then I fix up the callers to use get_proc_net and handle the case case where get_proc_net returns NULL. In that case I return -ENXIO because effectively the network namespace has already gone away so the files we are trying to access don't exist anymore. Signed-off-by: Eric W. Biederman Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3029f865cd6..dc9f8c2ab1d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1859,7 +1859,11 @@ static int netlink_seq_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = iter; - iter->net = get_net(PROC_NET(inode)); + iter->net = get_proc_net(inode); + if (!iter->net) { + seq_release_private(inode, file); + return -ENXIO; + } return 0; } -- cgit v1.2.3 From 0cfad07555312468296ea3bbbcdf99038f58678b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 16 Sep 2007 16:24:44 -0700 Subject: [NETLINK]: Avoid pointer in netlink_run_queue I was looking at Patrick's fix to inet_diag and it occured to me that we're using a pointer argument to return values unnecessarily in netlink_run_queue. Changing it to return the value will allow the compiler to generate better code since the value won't have to be memory-backed. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index dc9f8c2ab1d..c68888b2575 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1629,7 +1629,7 @@ skip: /** * nelink_run_queue - Process netlink receive queue. * @sk: Netlink socket containing the queue - * @qlen: Place to store queue length upon entry + * @qlen: Initial queue length * @cb: Callback function invoked for each netlink message found * * Processes as much as there was in the queue upon entry and invokes @@ -1639,35 +1639,37 @@ skip: * returns with a qlen != 0. * * qlen must be initialized to 0 before the initial entry, afterwards - * the function may be called repeatedly until qlen reaches 0. + * the function may be called repeatedly until the returned qlen is 0. * * The callback function may return -EINTR to signal that processing * of netlink messages shall be interrupted. In this case the message * currently being processed will NOT be requeued onto the receive * queue. */ -void netlink_run_queue(struct sock *sk, unsigned int *qlen, - int (*cb)(struct sk_buff *, struct nlmsghdr *)) +unsigned int netlink_run_queue(struct sock *sk, unsigned int qlen, + int (*cb)(struct sk_buff *, struct nlmsghdr *)) { struct sk_buff *skb; - if (!*qlen || *qlen > skb_queue_len(&sk->sk_receive_queue)) - *qlen = skb_queue_len(&sk->sk_receive_queue); + if (!qlen || qlen > skb_queue_len(&sk->sk_receive_queue)) + qlen = skb_queue_len(&sk->sk_receive_queue); - for (; *qlen; (*qlen)--) { + for (; qlen; qlen--) { skb = skb_dequeue(&sk->sk_receive_queue); if (netlink_rcv_skb(skb, cb)) { if (skb->len) skb_queue_head(&sk->sk_receive_queue, skb); else { kfree_skb(skb); - (*qlen)--; + qlen--; } break; } kfree_skb(skb); } + + return qlen; } /** -- cgit v1.2.3 From 99406c885ab27c369fa4a1b15c4a5a5ad0d61fcd Mon Sep 17 00:00:00 2001 From: Denis Cheng Date: Sun, 16 Sep 2007 16:34:39 -0700 Subject: [NETLINK]: use the macro min(x,y) provided by instead Signed-off-by: Denis Cheng Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c68888b2575..bca93e19bf4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1969,7 +1969,7 @@ static int __init netlink_proto_init(void) order = get_bitmask_order(max) - 1 + PAGE_SHIFT; max = (1UL << order) / sizeof(struct hlist_head); - order = get_bitmask_order(max > UINT_MAX ? UINT_MAX : max) - 1; + order = get_bitmask_order(min(max, (unsigned long)UINT_MAX)) - 1; for (i = 0; i < MAX_LINKS; i++) { struct nl_pid_hash *hash = &nl_table[i].hash; -- cgit v1.2.3 From 26ff5ddc5ab11e37ab3db469f24324e0ef1d6f63 Mon Sep 17 00:00:00 2001 From: Denis Cheng Date: Sun, 16 Sep 2007 16:36:02 -0700 Subject: [NETLINK]: the temp variable name max is ambiguous with the macro max provided by , so changed its name to a more proper one: limit Signed-off-by: Denis Cheng Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index bca93e19bf4..46eb5ea1fbd 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1949,7 +1949,7 @@ static int __init netlink_proto_init(void) { struct sk_buff *dummy_skb; int i; - unsigned long max; + unsigned long limit; unsigned int order; int err = proto_register(&netlink_proto, 0); @@ -1963,13 +1963,13 @@ static int __init netlink_proto_init(void) goto panic; if (num_physpages >= (128 * 1024)) - max = num_physpages >> (21 - PAGE_SHIFT); + limit = num_physpages >> (21 - PAGE_SHIFT); else - max = num_physpages >> (23 - PAGE_SHIFT); + limit = num_physpages >> (23 - PAGE_SHIFT); - order = get_bitmask_order(max) - 1 + PAGE_SHIFT; - max = (1UL << order) / sizeof(struct hlist_head); - order = get_bitmask_order(min(max, (unsigned long)UINT_MAX)) - 1; + order = get_bitmask_order(limit) - 1 + PAGE_SHIFT; + limit = (1UL << order) / sizeof(struct hlist_head); + order = get_bitmask_order(min(limit, (unsigned long)UINT_MAX)) - 1; for (i = 0; i < MAX_LINKS; i++) { struct nl_pid_hash *hash = &nl_table[i].hash; -- cgit v1.2.3 From 4665079cbb2a3e17de82f2ab2940b9f97f37d65e Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 8 Oct 2007 20:38:39 -0700 Subject: [NETNS]: Move some code into __init section when CONFIG_NET_NS=n With the net namespaces many code leaved the __init section, thus making the kernel occupy more memory than it did before. Since we have a config option that prohibits the namespace creation, the functions that initialize/finalize some netns stuff are simply not needed and can be freed after the boot. Currently, this is almost not noticeable, since few calls are no longer in __init, but when the namespaces will be merged it will be possible to free more code. I propose to use the __net_init, __net_exit and __net_initdata "attributes" for functions/variables that are not used if the CONFIG_NET_NS is not set to save more space in memory. The exiting functions cannot just reside in the __exit section, as noticed by David, since the init section will have references on it and the compilation will fail due to modpost checks. These references can exist, since the init namespace never dies and the exit callbacks are never called. So I introduce the __exit_refok attribute just like it is already done with the __init_refok. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 46eb5ea1fbd..3ef32825da7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1924,7 +1924,7 @@ static struct net_proto_family netlink_family_ops = { .owner = THIS_MODULE, /* for consistency 8) */ }; -static int netlink_net_init(struct net *net) +static int __net_init netlink_net_init(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_net_fops_create(net, "netlink", 0, &netlink_seq_fops)) @@ -1933,14 +1933,14 @@ static int netlink_net_init(struct net *net) return 0; } -static void netlink_net_exit(struct net *net) +static void __net_exit netlink_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS proc_net_remove(net, "netlink"); #endif } -static struct pernet_operations netlink_net_ops = { +static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, }; -- cgit v1.2.3 From cf7732e4cc14b56d593ff53352673e1fd5e3ba52 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 10 Oct 2007 02:29:29 -0700 Subject: [NET]: Make core networking code use seq_open_private This concerns the ipv4 and ipv6 code mostly, but also the netlink and unix sockets. The netlink code is an example of how to use the __seq_open_private() call - it saves the net namespace on this private. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3ef32825da7..f934f54fbfd 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1845,27 +1845,18 @@ static const struct seq_operations netlink_seq_ops = { static int netlink_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; struct nl_seq_iter *iter; - int err; - iter = kzalloc(sizeof(*iter), GFP_KERNEL); + iter = __seq_open_private(file, &netlink_seq_ops, sizeof(*iter)); if (!iter) return -ENOMEM; - err = seq_open(file, &netlink_seq_ops); - if (err) { - kfree(iter); - return err; - } - - seq = file->private_data; - seq->private = iter; iter->net = get_proc_net(inode); if (!iter->net) { seq_release_private(inode, file); return -ENXIO; } + return 0; } -- cgit v1.2.3 From 7ee015e0fa3c856416e9477aac4b850ec6f09017 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 10 Oct 2007 21:14:03 -0700 Subject: [NET]: cleanup 3rd argument in netlink_sendskb netlink_sendskb does not use third argument. Clean it and save a couple of bytes. Signed-off-by: Denis V. Lunev Acked-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f934f54fbfd..a5bd63ca86b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -791,7 +791,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, return 0; } -int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol) +int netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; @@ -853,7 +853,7 @@ retry: if (err) return err; - return netlink_sendskb(sk, skb, ssk->sk_protocol); + return netlink_sendskb(sk, skb); } int netlink_has_listeners(struct sock *sk, unsigned int group) -- cgit v1.2.3 From aed815601f3f95281ab3a01f7e2cbe1bd54285a0 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 10 Oct 2007 21:14:32 -0700 Subject: [NET]: unify netlink kernel socket recognition There are currently two ways to determine whether the netlink socket is a kernel one or a user one. This patch creates a single inline call for this purpose and unifies all the calls in the af_netlink.c No similar calls are found outside af_netlink.c. Signed-off-by: Denis V. Lunev Acked-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a5bd63ca86b..4ce7dcbcb6e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -92,6 +92,11 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } +static inline int netlink_is_kernel(struct sock *sk) +{ + return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; +} + struct nl_pid_hash { struct hlist_head *table; unsigned long rehash_time; @@ -489,7 +494,7 @@ static int netlink_release(struct socket *sock) module_put(nlk->module); netlink_table_grab(); - if (nlk->flags & NETLINK_KERNEL_SOCKET) { + if (netlink_is_kernel(sk)) { kfree(nl_table[sk->sk_protocol].listeners); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].registered = 0; @@ -716,7 +721,7 @@ static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); - if ((nlk->pid == 0 && !nlk->data_ready) || + if ((netlink_is_kernel(sock) && !nlk->data_ready) || (sock->sk_state == NETLINK_CONNECTED && nlk->dst_pid != nlk_sk(ssk)->pid)) { sock_put(sock); @@ -762,7 +767,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, test_bit(0, &nlk->state)) { DECLARE_WAITQUEUE(wait, current); if (!timeo) { - if (!ssk || nlk_sk(ssk)->pid == 0) + if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); @@ -861,7 +866,7 @@ int netlink_has_listeners(struct sock *sk, unsigned int group) int res = 0; unsigned long *listeners; - BUG_ON(!(nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET)); + BUG_ON(!netlink_is_kernel(sk)); rcu_read_lock(); listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); -- cgit v1.2.3 From cd40b7d3983c708aabe3d3008ec64ffce56d33b0 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 10 Oct 2007 21:15:29 -0700 Subject: [NET]: make netlink user -> kernel interface synchronious This patch make processing netlink user -> kernel messages synchronious. This change was inspired by the talk with Alexey Kuznetsov about current netlink messages processing. He says that he was badly wrong when introduced asynchronious user -> kernel communication. The call netlink_unicast is the only path to send message to the kernel netlink socket. But, unfortunately, it is also used to send data to the user. Before this change the user message has been attached to the socket queue and sk->sk_data_ready was called. The process has been blocked until all pending messages were processed. The bad thing is that this processing may occur in the arbitrary process context. This patch changes nlk->data_ready callback to get 1 skb and force packet processing right in the netlink_unicast. Kernel -> user path in netlink_unicast remains untouched. EINTR processing for in netlink_run_queue was changed. It forces rtnl_lock drop, but the process remains in the cycle until the message will be fully processed. So, there is no need to use this kludges now. Signed-off-by: Denis V. Lunev Acked-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 152 +++++++++++++++-------------------------------- 1 file changed, 47 insertions(+), 105 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4ce7dcbcb6e..c776bcd9f82 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -80,7 +80,7 @@ struct netlink_sock { struct netlink_callback *cb; struct mutex *cb_mutex; struct mutex cb_def_mutex; - void (*data_ready)(struct sock *sk, int bytes); + void (*netlink_rcv)(struct sk_buff *skb); struct module *module; }; @@ -127,7 +127,6 @@ static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static int netlink_dump(struct sock *sk); static void netlink_destroy_callback(struct netlink_callback *cb); -static void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb); static DEFINE_RWLOCK(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); @@ -709,21 +708,17 @@ static void netlink_overrun(struct sock *sk) static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) { - int protocol = ssk->sk_protocol; - struct net *net; struct sock *sock; struct netlink_sock *nlk; - net = ssk->sk_net; - sock = netlink_lookup(net, protocol, pid); + sock = netlink_lookup(ssk->sk_net, ssk->sk_protocol, pid); if (!sock) return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); - if ((netlink_is_kernel(sock) && !nlk->data_ready) || - (sock->sk_state == NETLINK_CONNECTED && - nlk->dst_pid != nlk_sk(ssk)->pid)) { + if (sock->sk_state == NETLINK_CONNECTED && + nlk->dst_pid != nlk_sk(ssk)->pid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } @@ -837,7 +832,34 @@ static inline struct sk_buff *netlink_trim(struct sk_buff *skb, return skb; } -int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock) +static inline void netlink_rcv_wake(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (skb_queue_empty(&sk->sk_receive_queue)) + clear_bit(0, &nlk->state); + if (!test_bit(0, &nlk->state)) + wake_up_interruptible(&nlk->wait); +} + +static inline int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb) +{ + int ret; + struct netlink_sock *nlk = nlk_sk(sk); + + ret = -ECONNREFUSED; + if (nlk->netlink_rcv != NULL) { + ret = skb->len; + skb_set_owner_r(skb, sk); + nlk->netlink_rcv(skb); + } + kfree_skb(skb); + sock_put(sk); + return ret; +} + +int netlink_unicast(struct sock *ssk, struct sk_buff *skb, + u32 pid, int nonblock) { struct sock *sk; int err; @@ -852,6 +874,9 @@ retry: kfree_skb(skb); return PTR_ERR(sk); } + if (netlink_is_kernel(sk)) + return netlink_unicast_kernel(sk, skb); + err = netlink_attachskb(sk, skb, nonblock, timeo, ssk); if (err == 1) goto retry; @@ -1151,16 +1176,6 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } -static inline void netlink_rcv_wake(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(0, &nlk->state); - if (!test_bit(0, &nlk->state)) - wake_up_interruptible(&nlk->wait); -} - static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, struct msghdr *msg, size_t len) { @@ -1308,11 +1323,7 @@ out: static void netlink_data_ready(struct sock *sk, int len) { - struct netlink_sock *nlk = nlk_sk(sk); - - if (nlk->data_ready) - nlk->data_ready(sk, len); - netlink_rcv_wake(sk); + BUG(); } /* @@ -1323,7 +1334,7 @@ static void netlink_data_ready(struct sock *sk, int len) struct sock * netlink_kernel_create(struct net *net, int unit, unsigned int groups, - void (*input)(struct sock *sk, int len), + void (*input)(struct sk_buff *skb), struct mutex *cb_mutex, struct module *module) { struct socket *sock; @@ -1352,7 +1363,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, sk = sock->sk; sk->sk_data_ready = netlink_data_ready; if (input) - nlk_sk(sk)->data_ready = input; + nlk_sk(sk)->netlink_rcv = input; if (netlink_insert(sk, net, 0)) goto out_sock_release; @@ -1552,12 +1563,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, netlink_dump(sk); sock_put(sk); - - /* We successfully started a dump, by returning -EINTR we - * signal the queue mangement to interrupt processing of - * any netlink messages so userspace gets a chance to read - * the results. */ - return -EINTR; + return 0; } void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) @@ -1594,13 +1600,15 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); } -static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, +int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *)) { struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { + int msglen; + nlh = nlmsg_hdr(skb); err = 0; @@ -1616,85 +1624,19 @@ static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, goto skip; err = cb(skb, nlh); - if (err == -EINTR) { - /* Not an error, but we interrupt processing */ - netlink_queue_skip(nlh, skb); - return err; - } skip: if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err); - netlink_queue_skip(nlh, skb); + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + skb_pull(skb, msglen); } return 0; } -/** - * nelink_run_queue - Process netlink receive queue. - * @sk: Netlink socket containing the queue - * @qlen: Initial queue length - * @cb: Callback function invoked for each netlink message found - * - * Processes as much as there was in the queue upon entry and invokes - * a callback function for each netlink message found. The callback - * function may refuse a message by returning a negative error code - * but setting the error pointer to 0 in which case this function - * returns with a qlen != 0. - * - * qlen must be initialized to 0 before the initial entry, afterwards - * the function may be called repeatedly until the returned qlen is 0. - * - * The callback function may return -EINTR to signal that processing - * of netlink messages shall be interrupted. In this case the message - * currently being processed will NOT be requeued onto the receive - * queue. - */ -unsigned int netlink_run_queue(struct sock *sk, unsigned int qlen, - int (*cb)(struct sk_buff *, struct nlmsghdr *)) -{ - struct sk_buff *skb; - - if (!qlen || qlen > skb_queue_len(&sk->sk_receive_queue)) - qlen = skb_queue_len(&sk->sk_receive_queue); - - for (; qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - if (netlink_rcv_skb(skb, cb)) { - if (skb->len) - skb_queue_head(&sk->sk_receive_queue, skb); - else { - kfree_skb(skb); - qlen--; - } - break; - } - - kfree_skb(skb); - } - - return qlen; -} - -/** - * netlink_queue_skip - Skip netlink message while processing queue. - * @nlh: Netlink message to be skipped - * @skb: Socket buffer containing the netlink messages. - * - * Pulls the given netlink message off the socket buffer so the next - * call to netlink_queue_run() will not reconsider the message. - */ -static void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb) -{ - int msglen = NLMSG_ALIGN(nlh->nlmsg_len); - - if (msglen > skb->len) - msglen = skb->len; - - skb_pull(skb, msglen); -} - /** * nlmsg_notify - send a notification netlink message * @sk: netlink socket to use @@ -1998,7 +1940,7 @@ panic: core_initcall(netlink_proto_init); EXPORT_SYMBOL(netlink_ack); -EXPORT_SYMBOL(netlink_run_queue); +EXPORT_SYMBOL(netlink_rcv_skb); EXPORT_SYMBOL(netlink_broadcast); EXPORT_SYMBOL(netlink_dump_start); EXPORT_SYMBOL(netlink_kernel_create); -- cgit v1.2.3