diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-10-19 19:04:47 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-19 19:04:47 +0200 |
commit | 3e10e879a8c334a5927d800a3663a24d562cfa31 (patch) | |
tree | 5d18bc7e38c986a044e99aa0d0a4aff4931ec7d0 /net | |
parent | 98d9c66ab07471006fd7910cb16453581c41a3e7 (diff) | |
parent | 0cfd81031a26717fe14380d18275f8e217571615 (diff) |
Merge branch 'linus' into tracing-v28-for-linus-v3
Conflicts:
init/main.c
kernel/module.c
scripts/bootgraph.pl
Diffstat (limited to 'net')
96 files changed, 4225 insertions, 1611 deletions
diff --git a/net/802/fc.c b/net/802/fc.c index cb3475ea6fd..34cf1ee014b 100644 --- a/net/802/fc.c +++ b/net/802/fc.c @@ -82,13 +82,13 @@ static int fc_header(struct sk_buff *skb, struct net_device *dev, static int fc_rebuild_header(struct sk_buff *skb) { +#ifdef CONFIG_INET struct fch_hdr *fch=(struct fch_hdr *)skb->data; struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr)); if(fcllc->ethertype != htons(ETH_P_IP)) { printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype)); return 0; } -#ifdef CONFIG_INET return arp_find(fch->daddr, skb); #else return 0; diff --git a/net/802/psnap.c b/net/802/psnap.c index b3cfe5a14fc..70980baeb68 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -1,7 +1,7 @@ /* * SNAP data link layer. Derived from 802.2 * - * Alan Cox <Alan.Cox@linux.org>, + * Alan Cox <alan@lxorguk.ukuu.org.uk>, * from the 802.2 layer by Greg Page. * Merged in additions from Greg Page's psnap.c. * diff --git a/net/9p/client.c b/net/9p/client.c index 10e320307ec..e053e06028a 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -52,7 +52,7 @@ enum { Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_msize, "msize=%u"}, {Opt_legacy, "noextend"}, {Opt_trans, "trans=%s"}, diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index d652baf5ff9..6dabbdb6665 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -86,7 +86,7 @@ enum { Opt_port, Opt_rfdno, Opt_wfdno, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_port, "port=%u"}, {Opt_rfdno, "rfdno=%u"}, {Opt_wfdno, "wfdno=%u"}, diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 0c850427a85..d3134e7e6ee 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -2,7 +2,7 @@ * DDP: An implementation of the AppleTalk DDP protocol for * Ethernet 'ELAP'. * - * Alan Cox <Alan.Cox@linux.org> + * Alan Cox <alan@lxorguk.ukuu.org.uk> * * With more than a little assistance from * @@ -1934,6 +1934,6 @@ static void __exit atalk_exit(void) module_exit(atalk_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Alan Cox <Alan.Cox@linux.org>"); +MODULE_AUTHOR("Alan Cox <alan@lxorguk.ukuu.org.uk>"); MODULE_DESCRIPTION("AppleTalk 0.20\n"); MODULE_ALIAS_NETPROTO(PF_APPLETALK); diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index f6348e078aa..8f9431a12c6 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -37,10 +37,7 @@ #include <linux/poll.h> #include <net/sock.h> #include <asm/ioctls.h> - -#if defined(CONFIG_KMOD) #include <linux/kmod.h> -#endif #include <net/bluetooth/bluetooth.h> @@ -145,11 +142,8 @@ static int bt_sock_create(struct net *net, struct socket *sock, int proto) if (proto < 0 || proto >= BT_MAX_PROTO) return -EINVAL; -#if defined(CONFIG_KMOD) - if (!bt_proto[proto]) { + if (!bt_proto[proto]) request_module("bt-proto-%d", proto); - } -#endif err = -EPROTONOSUPPORT; diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 96434d774c8..acdeab3d980 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -578,7 +578,7 @@ static int hidp_session(void *arg) if (session->hid) { if (session->hid->claimed & HID_CLAIMED_INPUT) hidinput_disconnect(session->hid); - hid_free_device(session->hid); + hid_destroy_device(session->hid); } /* Wakeup user-space polling for socket errors */ @@ -623,9 +623,15 @@ static struct device *hidp_get_device(struct hidp_session *session) static int hidp_setup_input(struct hidp_session *session, struct hidp_connadd_req *req) { - struct input_dev *input = session->input; + struct input_dev *input; int i; + input = input_allocate_device(); + if (!input) + return -ENOMEM; + + session->input = input; + input_set_drvdata(input, session); input->name = "Bluetooth HID Boot Protocol Device"; @@ -677,67 +683,114 @@ static void hidp_close(struct hid_device *hid) { } -static const struct { - __u16 idVendor; - __u16 idProduct; - unsigned quirks; -} hidp_blacklist[] = { - /* Apple wireless Mighty Mouse */ - { 0x05ac, 0x030c, HID_QUIRK_MIGHTYMOUSE | HID_QUIRK_INVERT_HWHEEL }, +static int hidp_parse(struct hid_device *hid) +{ + struct hidp_session *session = hid->driver_data; + struct hidp_connadd_req *req = session->req; + unsigned char *buf; + int ret; - { } /* Terminating entry */ -}; + buf = kmalloc(req->rd_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, req->rd_data, req->rd_size)) { + kfree(buf); + return -EFAULT; + } + + ret = hid_parse_report(session->hid, buf, req->rd_size); + + kfree(buf); + + if (ret) + return ret; + + session->req = NULL; + + return 0; +} + +static int hidp_start(struct hid_device *hid) +{ + struct hidp_session *session = hid->driver_data; + struct hid_report *report; -static void hidp_setup_quirks(struct hid_device *hid) + list_for_each_entry(report, &hid->report_enum[HID_INPUT_REPORT]. + report_list, list) + hidp_send_report(session, report); + + list_for_each_entry(report, &hid->report_enum[HID_FEATURE_REPORT]. + report_list, list) + hidp_send_report(session, report); + + return 0; +} + +static void hidp_stop(struct hid_device *hid) { - unsigned int n; + struct hidp_session *session = hid->driver_data; + + skb_queue_purge(&session->ctrl_transmit); + skb_queue_purge(&session->intr_transmit); - for (n = 0; hidp_blacklist[n].idVendor; n++) - if (hidp_blacklist[n].idVendor == le16_to_cpu(hid->vendor) && - hidp_blacklist[n].idProduct == le16_to_cpu(hid->product)) - hid->quirks = hidp_blacklist[n].quirks; + if (hid->claimed & HID_CLAIMED_INPUT) + hidinput_disconnect(hid); + hid->claimed = 0; } -static void hidp_setup_hid(struct hidp_session *session, +static struct hid_ll_driver hidp_hid_driver = { + .parse = hidp_parse, + .start = hidp_start, + .stop = hidp_stop, + .open = hidp_open, + .close = hidp_close, + .hidinput_input_event = hidp_hidinput_event, +}; + +static int hidp_setup_hid(struct hidp_session *session, struct hidp_connadd_req *req) { - struct hid_device *hid = session->hid; - struct hid_report *report; + struct hid_device *hid; bdaddr_t src, dst; + int ret; - baswap(&src, &bt_sk(session->ctrl_sock->sk)->src); - baswap(&dst, &bt_sk(session->ctrl_sock->sk)->dst); + hid = hid_allocate_device(); + if (IS_ERR(hid)) { + ret = PTR_ERR(session->hid); + goto err; + } + session->hid = hid; + session->req = req; hid->driver_data = session; - hid->country = req->country; + baswap(&src, &bt_sk(session->ctrl_sock->sk)->src); + baswap(&dst, &bt_sk(session->ctrl_sock->sk)->dst); hid->bus = BUS_BLUETOOTH; hid->vendor = req->vendor; hid->product = req->product; hid->version = req->version; + hid->country = req->country; strncpy(hid->name, req->name, 128); strncpy(hid->phys, batostr(&src), 64); strncpy(hid->uniq, batostr(&dst), 64); - hid->dev = hidp_get_device(session); - - hid->hid_open = hidp_open; - hid->hid_close = hidp_close; - - hid->hidinput_input_event = hidp_hidinput_event; + hid->dev.parent = hidp_get_device(session); + hid->ll_driver = &hidp_hid_driver; - hidp_setup_quirks(hid); + ret = hid_add_device(hid); + if (ret) + goto err_hid; - list_for_each_entry(report, &hid->report_enum[HID_INPUT_REPORT].report_list, list) - hidp_send_report(session, report); - - list_for_each_entry(report, &hid->report_enum[HID_FEATURE_REPORT].report_list, list) - hidp_send_report(session, report); - - if (hidinput_connect(hid) == 0) - hid->claimed |= HID_CLAIMED_INPUT; + return 0; +err_hid: + hid_destroy_device(hid); + session->hid = NULL; +err: + return ret; } int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock) @@ -757,38 +810,6 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, BT_DBG("rd_data %p rd_size %d", req->rd_data, req->rd_size); - if (req->rd_size > 0) { - unsigned char *buf = kmalloc(req->rd_size, GFP_KERNEL); - - if (!buf) { - kfree(session); - return -ENOMEM; - } - - if (copy_from_user(buf, req->rd_data, req->rd_size)) { - kfree(buf); - kfree(session); - return -EFAULT; - } - - session->hid = hid_parse_report(buf, req->rd_size); - - kfree(buf); - - if (!session->hid) { - kfree(session); - return -EINVAL; - } - } - - if (!session->hid) { - session->input = input_allocate_device(); - if (!session->input) { - kfree(session); - return -ENOMEM; - } - } - down_write(&hidp_session_sem); s = __hidp_get_session(&bt_sk(ctrl_sock->sk)->dst); @@ -816,15 +837,18 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, session->flags = req->flags & (1 << HIDP_BLUETOOTH_VENDOR_ID); session->idle_to = req->idle_to; - if (session->input) { + if (req->rd_size > 0) { + err = hidp_setup_hid(session, req); + if (err && err != -ENODEV) + goto err_skb; + } + + if (!session->hid) { err = hidp_setup_input(session, req); if (err < 0) - goto failed; + goto err_skb; } - if (session->hid) - hidp_setup_hid(session, req); - __hidp_link_session(session); hidp_set_timer(session); @@ -850,17 +874,16 @@ unlink: __hidp_unlink_session(session); - if (session->input) { + if (session->input) input_unregister_device(session->input); - session->input = NULL; /* don't try to free it here */ - } - + if (session->hid) + hid_destroy_device(session->hid); +err_skb: + skb_queue_purge(&session->ctrl_transmit); + skb_queue_purge(&session->intr_transmit); failed: up_write(&hidp_session_sem); - if (session->hid) - hid_free_device(session->hid); - input_free_device(session->input); kfree(session); return err; @@ -950,18 +973,43 @@ int hidp_get_conninfo(struct hidp_conninfo *ci) return err; } +static const struct hid_device_id hidp_table[] = { + { HID_BLUETOOTH_DEVICE(HID_ANY_ID, HID_ANY_ID) }, + { } +}; + +static struct hid_driver hidp_driver = { + .name = "generic-bluetooth", + .id_table = hidp_table, +}; + static int __init hidp_init(void) { + int ret; + l2cap_load(); BT_INFO("HIDP (Human Interface Emulation) ver %s", VERSION); - return hidp_init_sockets(); + ret = hid_register_driver(&hidp_driver); + if (ret) + goto err; + + ret = hidp_init_sockets(); + if (ret) + goto err_drv; + + return 0; +err_drv: + hid_unregister_driver(&hidp_driver); +err: + return ret; } static void __exit hidp_exit(void) { hidp_cleanup_sockets(); + hid_unregister_driver(&hidp_driver); } module_init(hidp_init); diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h index 343fb0566b3..e503c89057a 100644 --- a/net/bluetooth/hidp/hidp.h +++ b/net/bluetooth/hidp/hidp.h @@ -151,6 +151,8 @@ struct hidp_session { struct sk_buff_head ctrl_transmit; struct sk_buff_head intr_transmit; + + struct hidp_connadd_req *req; }; static inline void hidp_schedule(struct hidp_session *session) diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 366d3e9d51f..ba6f73eb06c 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -4,6 +4,7 @@ menuconfig BRIDGE_NF_EBTABLES tristate "Ethernet Bridge tables (ebtables) support" + depends on BRIDGE && BRIDGE_NETFILTER select NETFILTER_XTABLES help ebtables is a general, extensible frame/packet identification diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5bb88eb0aad..0fa208e8640 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -305,23 +305,14 @@ find_inlist_lock_noload(struct list_head *head, const char *name, int *error, return NULL; } -#ifndef CONFIG_KMOD -#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m)) -#else static void * find_inlist_lock(struct list_head *head, const char *name, const char *prefix, int *error, struct mutex *mutex) { - void *ret; - - ret = find_inlist_lock_noload(head, name, error, mutex); - if (!ret) { - request_module("%s%s", prefix, name); - ret = find_inlist_lock_noload(head, name, error, mutex); - } - return ret; + return try_then_request_module( + find_inlist_lock_noload(head, name, error, mutex), + "%s%s", prefix, name); } -#endif static inline struct ebt_table * find_table_lock(const char *name, int *error, struct mutex *mutex) diff --git a/net/can/af_can.c b/net/can/af_can.c index 8035fbf526a..7d4d2b3c137 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -128,8 +128,8 @@ static int can_create(struct net *net, struct socket *sock, int protocol) if (net != &init_net) return -EAFNOSUPPORT; -#ifdef CONFIG_KMOD - /* try to load protocol module, when CONFIG_KMOD is defined */ +#ifdef CONFIG_MODULES + /* try to load protocol module kernel is modular */ if (!proto_tab[protocol]) { err = request_module("can-proto-%d", protocol); diff --git a/net/core/datagram.c b/net/core/datagram.c index 52f577a0f54..ee631843c2f 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -9,7 +9,7 @@ * identical recvmsg() code. So we share it here. The poll was * shared before but buried in udp.c so I moved it. * - * Authors: Alan Cox <alan@redhat.com>. (datagram_poll() from old + * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old * udp.c code) * * Fixes: diff --git a/net/core/dev.c b/net/core/dev.c index 1408a083fe4..868ec0ba8b7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4956,8 +4956,6 @@ EXPORT_SYMBOL(br_fdb_get_hook); EXPORT_SYMBOL(br_fdb_put_hook); #endif -#ifdef CONFIG_KMOD EXPORT_SYMBOL(dev_load); -#endif EXPORT_PER_CPU_SYMBOL(softnet_data); diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index 5402b3b38e0..9e2fa39f22a 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -6,7 +6,7 @@ * Richard Underwood <richard@wuzz.demon.co.uk> * * Stir fried together from the IP multicast and CAP patches above - * Alan Cox <Alan.Cox@linux.org> + * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * Alan Cox : Update the device on a real delete diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b0dc818a91d..f1d07b5c1e1 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -96,7 +96,7 @@ static void net_free(struct net *net) return; } #endif - + kfree(net->gen); kmem_cache_free(net_cachep, net); } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index a756847e381..99f656d35b4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2474,7 +2474,7 @@ static inline int process_ipsec(struct pktgen_dev *pkt_dev, if (ret < 0) { printk(KERN_ERR "Error expanding " "ipsec packet %d\n",ret); - return 0; + goto err; } } @@ -2484,8 +2484,7 @@ static inline int process_ipsec(struct pktgen_dev *pkt_dev, if (ret) { printk(KERN_ERR "Error creating ipsec " "packet %d\n",ret); - kfree_skb(skb); - return 0; + goto err; } /* restore ll */ eth = (__u8 *) skb_push(skb, ETH_HLEN); @@ -2494,6 +2493,9 @@ static inline int process_ipsec(struct pktgen_dev *pkt_dev, } } return 1; +err: + kfree_skb(skb); + return 0; } #endif diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3630131fa1f..31f29d2989f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1040,7 +1040,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct nlattr *linkinfo[IFLA_INFO_MAX+1]; int err; -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES replay: #endif err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); @@ -1129,7 +1129,7 @@ replay: return -EOPNOTSUPP; if (!ops) { -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES if (kind[0]) { __rtnl_unlock(); request_module("rtnl-link-%s", kind); diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c index 1f49afcd8e8..86234923a3b 100644 --- a/net/core/skb_dma_map.c +++ b/net/core/skb_dma_map.c @@ -35,7 +35,7 @@ int skb_dma_map(struct device *dev, struct sk_buff *skb, return 0; unwind: - while (i-- >= 0) { + while (--i >= 0) { skb_frag_t *fp = &sp->frags[i]; dma_unmap_page(dev, sp->dma_maps[i + 1], diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7f7bb1a636d..4e22e3a3535 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1,7 +1,7 @@ /* * Routines having to do with the 'struct sk_buff' memory handlers. * - * Authors: Alan Cox <iiitac@pyr.swan.ac.uk> + * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Florian La Roche <rzsfl@rz.uni-sb.de> * * Fixes: diff --git a/net/core/stream.c b/net/core/stream.c index a6b3437ff08..8727cead64a 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -9,7 +9,7 @@ * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * (from old tcp.c code) - * Alan Cox <alan@redhat.com> (Borrowed comments 8-)) + * Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-)) */ #include <linux/module.h> diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 4809753d12a..8fe931a3d7a 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -154,7 +154,7 @@ struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) struct ccid *ccid = NULL; ccids_read_lock(); -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES if (ccids[id] == NULL) { /* We only try to load if in process context */ ccids_read_unlock(); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 2f0ac3c3eb7..28e26bd08e2 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -152,7 +152,7 @@ static struct dn_dev_parms dn_dev_list[] = { #define DN_DEV_LIST_SIZE ARRAY_SIZE(dn_dev_list) -#define DN_DEV_PARMS_OFFSET(x) ((int) ((char *) &((struct dn_dev_parms *)0)->x)) +#define DN_DEV_PARMS_OFFSET(x) offsetof(struct dn_dev_parms, x) #ifdef CONFIG_SYSCTL @@ -166,7 +166,7 @@ static int max_priority[] = { 127 }; /* From DECnet spec */ static int dn_forwarding_proc(ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen, +static int dn_forwarding_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen); @@ -318,7 +318,7 @@ static int dn_forwarding_proc(ctl_table *table, int write, #endif } -static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen, +static int dn_forwarding_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -490,9 +490,7 @@ int dn_dev_ioctl(unsigned int cmd, void __user *arg) return -EFAULT; ifr->ifr_name[IFNAMSIZ-1] = 0; -#ifdef CONFIG_KMOD dev_load(&init_net, ifr->ifr_name); -#endif switch(cmd) { case SIOCGIFADDR: diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 228067c571b..36400b26689 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -132,7 +132,7 @@ static int parse_addr(__le16 *addr, char *str) } -static int dn_node_address_strategy(ctl_table *table, int __user *name, int nlen, +static int dn_node_address_strategy(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -217,7 +217,7 @@ static int dn_node_address_handler(ctl_table *table, int write, } -static int dn_def_dev_strategy(ctl_table *table, int __user *name, int nlen, +static int dn_def_dev_strategy(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index cdce4c6c672..49211b35725 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -1,7 +1,7 @@ menuconfig NET_DSA bool "Distributed Switch Architecture support" default n - depends on EXPERIMENTAL + depends on EXPERIMENTAL && !S390 select PHYLIB ---help--- This allows you to use hardware switch chips that use diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2c0e4572cc9..490e035c6d9 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -13,7 +13,7 @@ */ /* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -47,17 +47,7 @@ #include <asm/bug.h> #include <asm/unaligned.h> -struct cipso_v4_domhsh_entry { - char *domain; - u32 valid; - struct list_head list; - struct rcu_head rcu; -}; - /* List of available DOI definitions */ -/* XXX - Updates should be minimal so having a single lock for the - * cipso_v4_doi_list and the cipso_v4_doi_list->dom_list should be - * okay. */ /* XXX - This currently assumes a minimal number of different DOIs in use, * if in practice there are a lot of different DOIs this list should * probably be turned into a hash table or something similar so we @@ -119,6 +109,19 @@ int cipso_v4_rbm_strictvalid = 1; * be omitted. */ #define CIPSO_V4_TAG_RNG_CAT_MAX 8 +/* Base length of the local tag (non-standard tag). + * Tag definition (may change between kernel versions) + * + * 0 8 16 24 32 + * +----------+----------+----------+----------+ + * | 10000000 | 00000110 | 32-bit secid value | + * +----------+----------+----------+----------+ + * | in (host byte order)| + * +----------+----------+ + * + */ +#define CIPSO_V4_TAG_LOC_BLEN 6 + /* * Helper Functions */ @@ -194,25 +197,6 @@ static void cipso_v4_bitmap_setbit(unsigned char *bitmap, } /** - * cipso_v4_doi_domhsh_free - Frees a domain list entry - * @entry: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that the memory allocated to a domain list entry can be released - * safely. - * - */ -static void cipso_v4_doi_domhsh_free(struct rcu_head *entry) -{ - struct cipso_v4_domhsh_entry *ptr; - - ptr = container_of(entry, struct cipso_v4_domhsh_entry, rcu); - kfree(ptr->domain); - kfree(ptr); -} - -/** * cipso_v4_cache_entry_free - Frees a cache entry * @entry: the entry to free * @@ -457,7 +441,7 @@ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) struct cipso_v4_doi *iter; list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list) - if (iter->doi == doi && iter->valid) + if (iter->doi == doi && atomic_read(&iter->refcount)) return iter; return NULL; } @@ -496,14 +480,17 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def) if (doi_def->type != CIPSO_V4_MAP_PASS) return -EINVAL; break; + case CIPSO_V4_TAG_LOCAL: + if (doi_def->type != CIPSO_V4_MAP_LOCAL) + return -EINVAL; + break; default: return -EINVAL; } } - doi_def->valid = 1; + atomic_set(&doi_def->refcount, 1); INIT_RCU_HEAD(&doi_def->rcu); - INIT_LIST_HEAD(&doi_def->dom_list); spin_lock(&cipso_v4_doi_list_lock); if (cipso_v4_doi_search(doi_def->doi) != NULL) @@ -519,59 +506,129 @@ doi_add_failure: } /** + * cipso_v4_doi_free - Frees a DOI definition + * @entry: the entry's RCU field + * + * Description: + * This function frees all of the memory associated with a DOI definition. + * + */ +void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) +{ + if (doi_def == NULL) + return; + + switch (doi_def->type) { + case CIPSO_V4_MAP_TRANS: + kfree(doi_def->map.std->lvl.cipso); + kfree(doi_def->map.std->lvl.local); + kfree(doi_def->map.std->cat.cipso); + kfree(doi_def->map.std->cat.local); + break; + } + kfree(doi_def); +} + +/** + * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer + * @entry: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that the memory allocated to the DOI definition can be released + * safely. + * + */ +static void cipso_v4_doi_free_rcu(struct rcu_head *entry) +{ + struct cipso_v4_doi *doi_def; + + doi_def = container_of(entry, struct cipso_v4_doi, rcu); + cipso_v4_doi_free(doi_def); +} + +/** * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine * @doi: the DOI value * @audit_secid: the LSM secid to use in the audit message - * @callback: the DOI cleanup/free callback * * Description: - * Removes a DOI definition from the CIPSO engine, @callback is called to - * free any memory. The NetLabel routines will be called to release their own - * LSM domain mappings as well as our own domain list. Returns zero on - * success and negative values on failure. + * Removes a DOI definition from the CIPSO engine. The NetLabel routines will + * be called to release their own LSM domain mappings as well as our own + * domain list. Returns zero on success and negative values on failure. * */ -int cipso_v4_doi_remove(u32 doi, - struct netlbl_audit *audit_info, - void (*callback) (struct rcu_head * head)) +int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) { struct cipso_v4_doi *doi_def; - struct cipso_v4_domhsh_entry *dom_iter; spin_lock(&cipso_v4_doi_list_lock); doi_def = cipso_v4_doi_search(doi); - if (doi_def != NULL) { - doi_def->valid = 0; - list_del_rcu(&doi_def->list); + if (doi_def == NULL) { spin_unlock(&cipso_v4_doi_list_lock); - rcu_read_lock(); - list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list) - if (dom_iter->valid) - netlbl_cfg_map_del(dom_iter->domain, - audit_info); - rcu_read_unlock(); - cipso_v4_cache_invalidate(); - call_rcu(&doi_def->rcu, callback); - return 0; + return -ENOENT; + } + if (!atomic_dec_and_test(&doi_def->refcount)) { + spin_unlock(&cipso_v4_doi_list_lock); + return -EBUSY; } + list_del_rcu(&doi_def->list); spin_unlock(&cipso_v4_doi_list_lock); - return -ENOENT; + cipso_v4_cache_invalidate(); + call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); + + return 0; } /** - * cipso_v4_doi_getdef - Returns a pointer to a valid DOI definition + * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that - * rcu_read_lock() is held while accessing the returned definition. + * rcu_read_lock() is held while accessing the returned definition and the DOI + * definition reference count is decremented when the caller is done. * */ struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi) { - return cipso_v4_doi_search(doi); + struct cipso_v4_doi *doi_def; + + rcu_read_lock(); + doi_def = cipso_v4_doi_search(doi); + if (doi_def == NULL) + goto doi_getdef_return; + if (!atomic_inc_not_zero(&doi_def->refcount)) + doi_def = NULL; + +doi_getdef_return: + rcu_read_unlock(); + return doi_def; +} + +/** + * cipso_v4_doi_putdef - Releases a reference for the given DOI definition + * @doi_def: the DOI definition + * + * Description: + * Releases a DOI definition reference obtained from cipso_v4_doi_getdef(). + * + */ +void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def) +{ + if (doi_def == NULL) + return; + + if (!atomic_dec_and_test(&doi_def->refcount)) + return; + spin_lock(&cipso_v4_doi_list_lock); + list_del_rcu(&doi_def->list); + spin_unlock(&cipso_v4_doi_list_lock); + + cipso_v4_cache_invalidate(); + call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); } /** @@ -597,7 +654,7 @@ int cipso_v4_doi_walk(u32 *skip_cnt, rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list) - if (iter_doi->valid) { + if (atomic_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); @@ -613,85 +670,6 @@ doi_walk_return: return ret_val; } -/** - * cipso_v4_doi_domhsh_add - Adds a domain entry to a DOI definition - * @doi_def: the DOI definition - * @domain: the domain to add - * - * Description: - * Adds the @domain to the DOI specified by @doi_def, this function - * should only be called by external functions (i.e. NetLabel). This function - * does allocate memory. Returns zero on success, negative values on failure. - * - */ -int cipso_v4_doi_domhsh_add(struct cipso_v4_doi *doi_def, const char *domain) -{ - struct cipso_v4_domhsh_entry *iter; - struct cipso_v4_domhsh_entry *new_dom; - - new_dom = kzalloc(sizeof(*new_dom), GFP_KERNEL); - if (new_dom == NULL) - return -ENOMEM; - if (domain) { - new_dom->domain = kstrdup(domain, GFP_KERNEL); - if (new_dom->domain == NULL) { - kfree(new_dom); - return -ENOMEM; - } - } - new_dom->valid = 1; - INIT_RCU_HEAD(&new_dom->rcu); - - spin_lock(&cipso_v4_doi_list_lock); - list_for_each_entry(iter, &doi_def->dom_list, list) - if (iter->valid && - ((domain != NULL && iter->domain != NULL && - strcmp(iter->domain, domain) == 0) || - (domain == NULL && iter->domain == NULL))) { - spin_unlock(&cipso_v4_doi_list_lock); - kfree(new_dom->domain); - kfree(new_dom); - return -EEXIST; - } - list_add_tail_rcu(&new_dom->list, &doi_def->dom_list); - spin_unlock(&cipso_v4_doi_list_lock); - - return 0; -} - -/** - * cipso_v4_doi_domhsh_remove - Removes a domain entry from a DOI definition - * @doi_def: the DOI definition - * @domain: the domain to remove - * - * Description: - * Removes the @domain from the DOI specified by @doi_def, this function - * should only be called by external functions (i.e. NetLabel). Returns zero - * on success and negative values on error. - * - */ -int cipso_v4_doi_domhsh_remove(struct cipso_v4_doi *doi_def, - const char *domain) -{ - struct cipso_v4_domhsh_entry *iter; - - spin_lock(&cipso_v4_doi_list_lock); - list_for_each_entry(iter, &doi_def->dom_list, list) - if (iter->valid && - ((domain != NULL && iter->domain != NULL && - strcmp(iter->domain, domain) == 0) || - (domain == NULL && iter->domain == NULL))) { - iter->valid = 0; - list_del_rcu(&iter->list); - spin_unlock(&cipso_v4_doi_list_lock); - call_rcu(&iter->rcu, cipso_v4_doi_domhsh_free); - return 0; - } - spin_unlock(&cipso_v4_doi_list_lock); - - return -ENOENT; -} - /* * Label Mapping Functions */ @@ -712,7 +690,7 @@ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level) switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL) return 0; break; @@ -741,7 +719,7 @@ static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def, case CIPSO_V4_MAP_PASS: *net_lvl = host_lvl; return 0; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: if (host_lvl < doi_def->map.std->lvl.local_size && doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) { *net_lvl = doi_def->map.std->lvl.local[host_lvl]; @@ -775,7 +753,7 @@ static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def, case CIPSO_V4_MAP_PASS: *host_lvl = net_lvl; return 0; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: map_tbl = doi_def->map.std; if (net_lvl < map_tbl->lvl.cipso_size && map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) { @@ -812,7 +790,7 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def, switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: cipso_cat_size = doi_def->map.std->cat.cipso_size; cipso_array = doi_def->map.std->cat.cipso; for (;;) { @@ -860,7 +838,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, u32 host_cat_size = 0; u32 *host_cat_array = NULL; - if (doi_def->type == CIPSO_V4_MAP_STD) { + if (doi_def->type == CIPSO_V4_MAP_TRANS) { host_cat_size = doi_def->map.std->cat.local_size; host_cat_array = doi_def->map.std->cat.local; } @@ -875,7 +853,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, case CIPSO_V4_MAP_PASS: net_spot = host_spot; break; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: if (host_spot >= host_cat_size) return -EPERM; net_spot = host_cat_array[host_spot]; @@ -921,7 +899,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, u32 net_cat_size = 0; u32 *net_cat_array = NULL; - if (doi_def->type == CIPSO_V4_MAP_STD) { + if (doi_def->type == CIPSO_V4_MAP_TRANS) { net_cat_size = doi_def->map.std->cat.cipso_size; net_cat_array = doi_def->map.std->cat.cipso; } @@ -941,7 +919,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, case CIPSO_V4_MAP_PASS: host_spot = net_spot; break; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: if (net_spot >= net_cat_size) return -EPERM; host_spot = net_cat_array[net_spot]; @@ -1277,7 +1255,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, } else tag_len = 4; - buffer[0] = 0x01; + buffer[0] = CIPSO_V4_TAG_RBITMAP; buffer[1] = tag_len; buffer[3] = level; @@ -1373,7 +1351,7 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, } else tag_len = 4; - buffer[0] = 0x02; + buffer[0] = CIPSO_V4_TAG_ENUM; buffer[1] = tag_len; buffer[3] = level; @@ -1469,7 +1447,7 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, } else tag_len = 4; - buffer[0] = 0x05; + buffer[0] = CIPSO_V4_TAG_RANGE; buffer[1] = tag_len; buffer[3] = level; @@ -1523,6 +1501,54 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, } /** + * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard) + * @doi_def: the DOI definition + * @secattr: the security attributes + * @buffer: the option buffer + * @buffer_len: length of buffer in bytes + * + * Description: + * Generate a CIPSO option using the local tag. Returns the size of the tag + * on success, negative values on failure. + * + */ +static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *buffer, + u32 buffer_len) +{ + if (!(secattr->flags & NETLBL_SECATTR_SECID)) + return -EPERM; + + buffer[0] = CIPSO_V4_TAG_LOCAL; + buffer[1] = CIPSO_V4_TAG_LOC_BLEN; + *(u32 *)&buffer[2] = secattr->attr.secid; + + return CIPSO_V4_TAG_LOC_BLEN; +} + +/** + * cipso_v4_parsetag_loc - Parse a CIPSO local tag + * @doi_def: the DOI definition + * @tag: the CIPSO tag + * @secattr: the security attributes + * + * Description: + * Parse a CIPSO local tag and return the security attributes in @secattr. + * Return zero on success, negatives values on failure. + * + */ +static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, + const unsigned char *tag, + struct netlbl_lsm_secattr *secattr) +{ + secattr->attr.secid = *(u32 *)&tag[2]; + secattr->flags |= NETLBL_SECATTR_SECID; + + return 0; +} + +/** * cipso_v4_validate - Validate a CIPSO option * @option: the start of the option, on error it is set to point to the error * @@ -1541,7 +1567,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, * that is unrecognized." * */ -int cipso_v4_validate(unsigned char **option) +int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) { unsigned char *opt = *option; unsigned char *tag; @@ -1566,7 +1592,7 @@ int cipso_v4_validate(unsigned char **option) goto validate_return_locked; } - opt_iter = 6; + opt_iter = CIPSO_V4_HDR_LEN; tag = opt + opt_iter; while (opt_iter < opt_len) { for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];) @@ -1584,7 +1610,7 @@ int cipso_v4_validate(unsigned char **option) switch (tag[0]) { case CIPSO_V4_TAG_RBITMAP: - if (tag_len < 4) { + if (tag_len < CIPSO_V4_TAG_RBM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } @@ -1602,7 +1628,7 @@ int cipso_v4_validate(unsigned char **option) err_offset = opt_iter + 3; goto validate_return_locked; } - if (tag_len > 4 && + if (tag_len > CIPSO_V4_TAG_RBM_BLEN && cipso_v4_map_cat_rbm_valid(doi_def, &tag[4], tag_len - 4) < 0) { @@ -1612,7 +1638,7 @@ int cipso_v4_validate(unsigned char **option) } break; case CIPSO_V4_TAG_ENUM: - if (tag_len < 4) { + if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } @@ -1622,7 +1648,7 @@ int cipso_v4_validate(unsigned char **option) err_offset = opt_iter + 3; goto validate_return_locked; } - if (tag_len > 4 && + if (tag_len > CIPSO_V4_TAG_ENUM_BLEN && cipso_v4_map_cat_enum_valid(doi_def, &tag[4], tag_len - 4) < 0) { @@ -1631,7 +1657,7 @@ int cipso_v4_validate(unsigned char **option) } break; case CIPSO_V4_TAG_RANGE: - if (tag_len < 4) { + if (tag_len < CIPSO_V4_TAG_RNG_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } @@ -1641,7 +1667,7 @@ int cipso_v4_validate(unsigned char **option) err_offset = opt_iter + 3; goto validate_return_locked; } - if (tag_len > 4 && + if (tag_len > CIPSO_V4_TAG_RNG_BLEN && cipso_v4_map_cat_rng_valid(doi_def, &tag[4], tag_len - 4) < 0) { @@ -1649,6 +1675,19 @@ int cipso_v4_validate(unsigned char **option) goto validate_return_locked; } break; + case CIPSO_V4_TAG_LOCAL: + /* This is a non-standard tag that we only allow for + * local connections, so if the incoming interface is + * not the loopback device drop the packet. */ + if (!(skb->dev->flags & IFF_LOOPBACK)) { + err_offset = opt_iter; + goto validate_return_locked; + } + if (tag_len != CIPSO_V4_TAG_LOC_BLEN) { + err_offset = opt_iter + 1; + goto validate_return_locked; + } + break; default: err_offset = opt_iter; goto validate_return_locked; @@ -1704,48 +1743,27 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) } /** - * cipso_v4_sock_setattr - Add a CIPSO option to a socket - * @sk: the socket + * cipso_v4_genopt - Generate a CIPSO option + * @buf: the option buffer + * @buf_len: the size of opt_buf * @doi_def: the CIPSO DOI to use - * @secattr: the specific security attributes of the socket + * @secattr: the security attributes * * Description: - * Set the CIPSO option on the given socket using the DOI definition and - * security attributes passed to the function. This function requires - * exclusive access to @sk, which means it either needs to be in the - * process of being created or locked. Returns zero on success and negative - * values on failure. + * Generate a CIPSO option using the DOI definition and security attributes + * passed to the function. Returns the length of the option on success and + * negative values on failure. * */ -int cipso_v4_sock_setattr(struct sock *sk, - const struct cipso_v4_doi *doi_def, - const struct netlbl_lsm_secattr *secattr) +static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, + const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) { - int ret_val = -EPERM; + int ret_val; u32 iter; - unsigned char *buf; - u32 buf_len = 0; - u32 opt_len; - struct ip_options *opt = NULL; - struct inet_sock *sk_inet; - struct inet_connection_sock *sk_conn; - /* In the case of sock_create_lite(), the sock->sk field is not - * defined yet but it is not a problem as the only users of these - * "lite" PF_INET sockets are functions which do an accept() call - * afterwards so we will label the socket as part of the accept(). */ - if (sk == NULL) - return 0; - - /* We allocate the maximum CIPSO option size here so we are probably - * being a little wasteful, but it makes our life _much_ easier later - * on and after all we are only talking about 40 bytes. */ - buf_len = CIPSO_V4_OPT_LEN_MAX; - buf = kmalloc(buf_len, GFP_ATOMIC); - if (buf == NULL) { - ret_val = -ENOMEM; - goto socket_setattr_failure; - } + if (buf_len <= CIPSO_V4_HDR_LEN) + return -ENOSPC; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC @@ -1772,9 +1790,14 @@ int cipso_v4_sock_setattr(struct sock *sk, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; + case CIPSO_V4_TAG_LOCAL: + ret_val = cipso_v4_gentag_loc(doi_def, + secattr, + &buf[CIPSO_V4_HDR_LEN], + buf_len - CIPSO_V4_HDR_LEN); + break; default: - ret_val = -EPERM; - goto socket_setattr_failure; + return -EPERM; } iter++; @@ -1782,9 +1805,58 @@ int cipso_v4_sock_setattr(struct sock *sk, iter < CIPSO_V4_TAG_MAXCNT && doi_def->tags[iter] != CIPSO_V4_TAG_INVALID); if (ret_val < 0) - goto socket_setattr_failure; + return ret_val; cipso_v4_gentag_hdr(doi_def, buf, ret_val); - buf_len = CIPSO_V4_HDR_LEN + ret_val; + return CIPSO_V4_HDR_LEN + ret_val; +} + +/** + * cipso_v4_sock_setattr - Add a CIPSO option to a socket + * @sk: the socket + * @doi_def: the CIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. This function requires + * exclusive access to @sk, which means it either needs to be in the + * process of being created or locked. Returns zero on success and negative + * values on failure. + * + */ +int cipso_v4_sock_setattr(struct sock *sk, + const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -EPERM; + unsigned char *buf = NULL; + u32 buf_len; + u32 opt_len; + struct ip_options *opt = NULL; + struct inet_sock *sk_inet; + struct inet_connection_sock *sk_conn; + + /* In the case of sock_create_lite(), the sock->sk field is not + * defined yet but it is not a problem as the only users of these + * "lite" PF_INET sockets are functions which do an accept() call + * afterwards so we will label the socket as part of the accept(). */ + if (sk == NULL) + return 0; + + /* We allocate the maximum CIPSO option size here so we are probably + * being a little wasteful, but it makes our life _much_ easier later + * on and after all we are only talking about 40 bytes. */ + buf_len = CIPSO_V4_OPT_LEN_MAX; + buf = kmalloc(buf_len, GFP_ATOMIC); + if (buf == NULL) { + ret_val = -ENOMEM; + goto socket_setattr_failure; + } + + ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); + if (ret_val < 0) + goto socket_setattr_failure; + buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and @@ -1822,6 +1894,80 @@ socket_setattr_failure: } /** + * cipso_v4_sock_delattr - Delete the CIPSO option from a socket + * @sk: the socket + * + * Description: + * Removes the CIPSO option from a socket, if present. + * + */ +void cipso_v4_sock_delattr(struct sock *sk) +{ + u8 hdr_delta; + struct ip_options *opt; + struct inet_sock *sk_inet; + + sk_inet = inet_sk(sk); + opt = sk_inet->opt; + if (opt == NULL || opt->cipso == 0) + return; + + if (opt->srr || opt->rr || opt->ts || opt->router_alert) { + u8 cipso_len; + u8 cipso_off; + unsigned char *cipso_ptr; + int iter; + int optlen_new; + + cipso_off = opt->cipso - sizeof(struct iphdr); + cipso_ptr = &opt->__data[cipso_off]; + cipso_len = cipso_ptr[1]; + + if (opt->srr > opt->cipso) + opt->srr -= cipso_len; + if (opt->rr > opt->cipso) + opt->rr -= cipso_len; + if (opt->ts > opt->cipso) + opt->ts -= cipso_len; + if (opt->router_alert > opt->cipso) + opt->router_alert -= cipso_len; + opt->cipso = 0; + + memmove(cipso_ptr, cipso_ptr + cipso_len, + opt->optlen - cipso_off - cipso_len); + + /* determining the new total option length is tricky because of + * the padding necessary, the only thing i can think to do at + * this point is walk the options one-by-one, skipping the + * padding at the end to determine the actual option size and + * from there we can determine the new total option length */ + iter = 0; + optlen_new = 0; + while (iter < opt->optlen) + if (opt->__data[iter] != IPOPT_NOP) { + iter += opt->__data[iter + 1]; + optlen_new = iter; + } else + iter++; + hdr_delta = opt->optlen; + opt->optlen = (optlen_new + 3) & ~3; + hdr_delta -= opt->optlen; + } else { + /* only the cipso option was present on the socket so we can + * remove the entire option struct */ + sk_inet->opt = NULL; + hdr_delta = opt->optlen; + kfree(opt); + } + + if (sk_inet->is_icsk && hdr_delta > 0) { + struct inet_connection_sock *sk_conn = inet_csk(sk); + sk_conn->icsk_ext_hdr_len -= hdr_delta; + sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); + } +} + +/** * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions * @cipso: the CIPSO v4 option * @secattr: the security attributes @@ -1859,6 +2005,9 @@ static int cipso_v4_getattr(const unsigned char *cipso, case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; + case CIPSO_V4_TAG_LOCAL: + ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr); + break; } if (ret_val == 0) secattr->type = NETLBL_NLTYPE_CIPSOV4; @@ -1893,6 +2042,123 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) } /** + * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet + * @skb: the packet + * @secattr: the security attributes + * + * Description: + * Set the CIPSO option on the given packet based on the security attributes. + * Returns a pointer to the IP header on success and NULL on failure. + * + */ +int cipso_v4_skbuff_setattr(struct sk_buff *skb, + const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct iphdr *iph; + struct ip_options *opt = &IPCB(skb)->opt; + unsigned char buf[CIPSO_V4_OPT_LEN_MAX]; + u32 buf_len = CIPSO_V4_OPT_LEN_MAX; + u32 opt_len; + int len_delta; + + buf_len = cipso_v4_genopt(buf, buf_len, doi_def, secattr); + if (buf_len < 0) + return buf_len; + opt_len = (buf_len + 3) & ~3; + + /* we overwrite any existing options to ensure that we have enough + * room for the CIPSO option, the reason is that we _need_ to guarantee + * that the security label is applied to the packet - we do the same + * thing when using the socket options and it hasn't caused a problem, + * if we need to we can always revisit this choice later */ + + len_delta = opt_len - opt->optlen; + /* if we don't ensure enough headroom we could panic on the skb_push() + * call below so make sure we have enough, we are also "mangling" the + * packet so we should probably do a copy-on-write call anyway */ + ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); + if (ret_val < 0) + return ret_val; + + if (len_delta > 0) { + /* we assume that the header + opt->optlen have already been + * "pushed" in ip_options_build() or similar */ + iph = ip_hdr(skb); + skb_push(skb, len_delta); + memmove((char *)iph - len_delta, iph, iph->ihl << 2); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + } else if (len_delta < 0) { + iph = ip_hdr(skb); + memset(iph + 1, IPOPT_NOP, opt->optlen); + } else + iph = ip_hdr(skb); + + if (opt->optlen > 0) + memset(opt, 0, sizeof(*opt)); + opt->optlen = opt_len; + opt->cipso = sizeof(struct iphdr); + opt->is_changed = 1; + + /* we have to do the following because we are being called from a + * netfilter hook which means the packet already has had the header + * fields populated and the checksum calculated - yes this means we + * are doing more work than needed but we do it to keep the core + * stack clean and tidy */ + memcpy(iph + 1, buf, buf_len); + if (opt_len > buf_len) + memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len); + if (len_delta != 0) { + iph->ihl = 5 + (opt_len >> 2); + iph->tot_len = htons(skb->len); + } + ip_send_check(iph); + + return 0; +} + +/** + * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet + * @skb: the packet + * + * Description: + * Removes any and all CIPSO options from the given packet. Returns zero on + * success, negative values on failure. + * + */ +int cipso_v4_skbuff_delattr(struct sk_buff *skb) +{ + int ret_val; + struct iphdr *iph; + struct ip_options *opt = &IPCB(skb)->opt; + unsigned char *cipso_ptr; + + if (opt->cipso == 0) + return 0; + + /* since we are changing the packet we should make a copy */ + ret_val = skb_cow(skb, skb_headroom(skb)); + if (ret_val < 0) + return ret_val; + + /* the easiest thing to do is just replace the cipso option with noop + * options since we don't change the size of the packet, although we + * still need to recalculate the checksum */ + + iph = ip_hdr(skb); + cipso_ptr = (unsigned char *)iph + opt->cipso; + memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]); + opt->cipso = 0; + opt->is_changed = 1; + + ip_send_check(iph); + + return 0; +} + +/** * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option * @skb: the packet * @secattr: the security attributes diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index b12dae2b0b2..56fce3ab6c5 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -613,9 +613,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (colon) *colon = 0; -#ifdef CONFIG_KMOD dev_load(net, ifr.ifr_name); -#endif switch (cmd) { case SIOCGIFADDR: /* Get interface address */ @@ -1283,7 +1281,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write, return ret; } -static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen, +static int devinet_conf_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1379,12 +1377,11 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write, return ret; } -int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, +int ipv4_doint_and_flush_strategy(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { - int ret = devinet_conf_sysctl(table, name, nlen, oldval, oldlenp, - newval, newlen); + int ret = devinet_conf_sysctl(table, oldval, oldlenp, newval, newlen); struct net *net = table->extra2; if (ret == 1) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 55c355e6323..72b2de76f1c 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1,7 +1,7 @@ /* * NET3: Implementation of the ICMP protocol layer. * - * Alan Cox, <alan@redhat.com> + * Alan Cox, <alan@lxorguk.ukuu.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7f9e337e390..a0d86455c53 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -9,7 +9,7 @@ * seems to fall out with gcc 2.6.2. * * Authors: - * Alan Cox <Alan.Cox@linux.org> + * Alan Cox <alan@lxorguk.ukuu.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 89cb047ab31..564230dabcb 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -53,11 +53,9 @@ static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int type) { -#ifdef CONFIG_KMOD if (!inet_diag_table[type]) request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, NETLINK_INET_DIAG, type); -#endif mutex_lock(&inet_diag_table_mutex); if (!inet_diag_table[type]) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2152d222b95..e4f81f54bef 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -6,7 +6,7 @@ * The IP fragmentation functionality. * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> - * Alan Cox <Alan.Cox@linux.org> + * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * Alan Cox : Split from ip.c , see ip_input.c for history. diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e0bed56c51f..861978a4f1a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -8,7 +8,7 @@ * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> - * Alan Cox, <Alan.Cox@linux.org> + * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index be3f18a7a40..2c88da6e786 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -438,7 +438,7 @@ int ip_options_compile(struct net *net, goto error; } opt->cipso = optptr - iph; - if (cipso_v4_validate(&optptr)) { + if (cipso_v4_validate(skb, &optptr)) { pp_ptr = optptr; goto error; } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 4c6d2caf920..29609d29df7 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -41,7 +41,7 @@ Made the tunnels use dev->name not tunnel: when error reporting. Added tx_dropped stat - -Alan Cox (Alan.Cox@linux.org) 21 March 95 + -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 Reworked: Changed to tunnel to destination gateway in addition to the diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c519b8d30ee..b42e082cc17 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1,7 +1,7 @@ /* * IP multicast routing support for mrouted 3.6/3.8 * - * (c) 1995 Alan Cox, <alan@redhat.com> + * (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk> * Linux Consultancy and Custom Driver Development * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index aa2c50a180f..fa2d6b6fc3e 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -41,12 +41,13 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, int (*okfn)(struct sk_buff *)) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ if (skb->nfct) return NF_ACCEPT; #endif - +#endif /* Gather fragments. */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { if (nf_ct_ipv4_gather_frags(skb, diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 2ac9eaf1a8c..a65cf692359 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -584,6 +584,98 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { .flags = NF_CT_EXT_F_PREALLOC, }; +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { + [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, + [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, +}; + +static int nfnetlink_parse_nat_proto(struct nlattr *attr, + const struct nf_conn *ct, + struct nf_nat_range *range) +{ + struct nlattr *tb[CTA_PROTONAT_MAX+1]; + const struct nf_nat_protocol *npt; + int err; + + err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy); + if (err < 0) + return err; + + npt = nf_nat_proto_find_get(nf_ct_protonum(ct)); + if (npt->nlattr_to_range) + err = npt->nlattr_to_range(tb, range); + nf_nat_proto_put(npt); + return err; +} + +static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { + [CTA_NAT_MINIP] = { .type = NLA_U32 }, + [CTA_NAT_MAXIP] = { .type = NLA_U32 }, +}; + +static int +nfnetlink_parse_nat(struct nlattr *nat, + const struct nf_conn *ct, struct nf_nat_range *range) +{ + struct nlattr *tb[CTA_NAT_MAX+1]; + int err; + + memset(range, 0, sizeof(*range)); + + err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy); + if (err < 0) + return err; + + if (tb[CTA_NAT_MINIP]) + range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]); + + if (!tb[CTA_NAT_MAXIP]) + range->max_ip = range->min_ip; + else + range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]); + + if (range->min_ip) + range->flags |= IP_NAT_RANGE_MAP_IPS; + + if (!tb[CTA_NAT_PROTO]) + return 0; + + err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); + if (err < 0) + return err; + + return 0; +} + +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + struct nlattr *attr) +{ + struct nf_nat_range range; + + if (nfnetlink_parse_nat(attr, ct, &range) < 0) + return -EINVAL; + if (nf_nat_initialized(ct, manip)) + return -EEXIST; + + return nf_nat_setup_info(ct, &range, manip); +} +#else +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + struct nlattr *attr) +{ + return -EOPNOTSUPP; +} +#endif + static int __net_init nf_nat_net_init(struct net *net) { net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, @@ -654,6 +746,9 @@ static int __init nf_nat_init(void) BUG_ON(nf_nat_seq_adjust_hook != NULL); rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); + BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); + rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, + nfnetlink_parse_nat_setup); return 0; cleanup_extend: @@ -667,10 +762,12 @@ static void __exit nf_nat_cleanup(void) nf_ct_l3proto_put(l3proto); nf_ct_extend_unregister(&nat_extend); rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); + rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL); synchronize_net(); } MODULE_LICENSE("GPL"); +MODULE_ALIAS("nf-nat-ipv4"); module_init(nf_nat_init); module_exit(nf_nat_cleanup); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a6d7c584f53..2ea6dcc3e2c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1109,7 +1109,12 @@ restart: printk("\n"); } #endif - rt_hash_table[hash].chain = rt; + /* + * Since lookup is lockfree, we must make sure + * previous writes to rt are comitted to memory + * before making rt visible to other CPUS. + */ + rcu_assign_pointer(rt_hash_table[hash].chain, rt); spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; return 0; @@ -2908,8 +2913,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, } static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, - int __user *name, - int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, @@ -2972,16 +2975,13 @@ static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, } static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table, - int __user *name, - int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { int old = ip_rt_secret_interval; - int ret = sysctl_jiffies(table, name, nlen, oldval, oldlenp, newval, - newlen); + int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen); rt_secret_reschedule(old); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 276d047fb85..1bb10df8ce7 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -64,8 +64,8 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, } /* Validate changes from sysctl interface. */ -static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, - int nlen, void __user *oldval, +static int ipv4_sysctl_local_port_range(ctl_table *table, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -80,7 +80,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, }; inet_get_local_port_range(range, range + 1); - ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen); + ret = sysctl_intvec(&tmp, oldval, oldlenp, newval, newlen); if (ret == 0 && newval && newlen) { if (range[1] < range[0]) ret = -EINVAL; @@ -109,8 +109,8 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * return ret; } -static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, - int nlen, void __user *oldval, +static int sysctl_tcp_congestion_control(ctl_table *table, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -122,7 +122,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int ret; tcp_get_default_congestion_control(val); - ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen); + ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); if (ret == 1 && newval && newlen) ret = tcp_set_default_congestion_control(val); return ret; @@ -165,8 +165,8 @@ static int proc_allowed_congestion_control(ctl_table *ctl, return ret; } -static int strategy_allowed_congestion_control(ctl_table *table, int __user *name, - int nlen, void __user *oldval, +static int strategy_allowed_congestion_control(ctl_table *table, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) @@ -179,7 +179,7 @@ static int strategy_allowed_congestion_control(ctl_table *table, int __user *nam return -ENOMEM; tcp_get_available_congestion_control(tbl.data, tbl.maxlen); - ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen); + ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); if (ret == 1 && newval && newlen) ret = tcp_set_allowed_congestion_control(tbl.data); kfree(tbl.data); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 6a250828b76..4ec5b4e97c4 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -115,7 +115,7 @@ int tcp_set_default_congestion_control(const char *name) spin_lock(&tcp_cong_list_lock); ca = tcp_ca_find(name); -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES if (!ca && capable(CAP_SYS_MODULE)) { spin_unlock(&tcp_cong_list_lock); @@ -244,7 +244,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) if (ca == icsk->icsk_ca_ops) goto out; -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES /* not found attempt to autoload module */ if (!ca && capable(CAP_SYS_MODULE)) { rcu_read_unlock(); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eacf4cfef14..2095abc3cab 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -8,7 +8,7 @@ * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> - * Alan Cox, <Alan.Cox@linux.org> + * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Hirokazu Takahashi, <taka@valinux.co.jp> * * Fixes: diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7b6a584b62d..eea9542728c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3982,7 +3982,6 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, } static int addrconf_sysctl_forward_strategy(ctl_table *table, - int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 050e14b7f70..01edac88851 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -834,7 +834,7 @@ static void __net_exit ipv6_cleanup_mibs(struct net *net) snmp_mib_free((void **)net->mib.icmpv6msg_statistics); } -static int inet6_net_init(struct net *net) +static int __net_init inet6_net_init(struct net *net) { int err = 0; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 840b15780a3..172438320ee 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1199,7 +1199,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } neigh->flags |= NTF_ROUTER; } else if (rt) { - rt->rt6i_flags |= (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); } if (rt) @@ -1730,9 +1730,8 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f return ret; } -int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name, - int nlen, void __user *oldval, - size_t __user *oldlenp, +int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { struct net_device *dev = ctl->extra1; @@ -1745,13 +1744,11 @@ int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name, switch (ctl->ctl_name) { case NET_NEIGH_REACHABLE_TIME: - ret = sysctl_jiffies(ctl, name, nlen, - oldval, oldlenp, newval, newlen); + ret = sysctl_jiffies(ctl, oldval, oldlenp, newval, newlen); break; case NET_NEIGH_RETRANS_TIME_MS: case NET_NEIGH_REACHABLE_TIME_MS: - ret = sysctl_ms_jiffies(ctl, name, nlen, - oldval, oldlenp, newval, newlen); + ret = sysctl_ms_jiffies(ctl, oldval, oldlenp, newval, newlen); break; default: ret = 0; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 6b29b03925f..fd5b3a4e332 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -12,6 +12,7 @@ int ip6_route_me_harder(struct sk_buff *skb) { + struct net *net = dev_net(skb->dst->dev); struct ipv6hdr *iph = ipv6_hdr(skb); struct dst_entry *dst; struct flowi fl = { @@ -23,7 +24,7 @@ int ip6_route_me_harder(struct sk_buff *skb) .saddr = iph->saddr, } }, }; - dst = ip6_route_output(dev_net(skb->dst->dev), skb->sk, &fl); + dst = ip6_route_output(net, skb->sk, &fl); #ifdef CONFIG_XFRM if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && @@ -33,8 +34,7 @@ int ip6_route_me_harder(struct sk_buff *skb) #endif if (dst->error) { - IP6_INC_STATS(&init_net, ip6_dst_idev(dst), - IPSTATS_MIB_OUTNOROUTES); + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); dst_release(dst); return -EINVAL; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 2a451562377..2ad504fc341 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -545,8 +545,12 @@ static int netdev_notify(struct notifier_block *nb, sdata = IEEE80211_DEV_TO_SUB_IF(dev); - sprintf(buf, "netdev:%s", dev->name); dir = sdata->debugfsdir; + + if (!dir) + return 0; + + sprintf(buf, "netdev:%s", dev->name); if (!debugfs_rename(dir->d_parent, dir, dir->d_parent, buf)) printk(KERN_ERR "mac80211: debugfs: failed to rename debugfs " "dir to %s\n", buf); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index b9902e425f0..189d0bafa91 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -249,11 +249,22 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DECLARE_MAC_BUF(mbuf); u8 *mac; + sta->debugfs.add_has_run = true; + if (!stations_dir) return; mac = print_mac(mbuf, sta->sta.addr); + /* + * This might fail due to a race condition: + * When mac80211 unlinks a station, the debugfs entries + * remain, but it is already possible to link a new + * station with the same address which triggers adding + * it to debugfs; therefore, if the old station isn't + * destroyed quickly enough the old station's debugfs + * dir might still be around. + */ sta->debugfs.dir = debugfs_create_dir(mac, stations_dir); if (!sta->debugfs.dir) return; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8025b294588..156e42a003a 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -816,8 +816,8 @@ struct ieee802_11_elems { u8 *ext_supp_rates; u8 *wmm_info; u8 *wmm_param; - u8 *ht_cap_elem; - u8 *ht_info_elem; + struct ieee80211_ht_cap *ht_cap_elem; + struct ieee80211_ht_addt_info *ht_info_elem; u8 *mesh_config; u8 *mesh_id; u8 *peer_link; @@ -844,8 +844,6 @@ struct ieee802_11_elems { u8 ext_supp_rates_len; u8 wmm_info_len; u8 wmm_param_len; - u8 ht_cap_elem_len; - u8 ht_info_elem_len; u8 mesh_config_len; u8 mesh_id_len; u8 peer_link_len; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 49f86fa56bf..87665d7bb4f 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1348,10 +1348,8 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) { struct ieee80211_ht_bss_info bss_info; ieee80211_ht_cap_ie_to_ht_info( - (struct ieee80211_ht_cap *) elems.ht_cap_elem, &sta->sta.ht_info); ieee80211_ht_addt_info_ie_to_ht_bss_info( - (struct ieee80211_ht_addt_info *) elems.ht_info_elem, &bss_info); ieee80211_handle_ht(local, 1, &sta->sta.ht_info, &bss_info); } @@ -1709,7 +1707,6 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_ht_bss_info bss_info; ieee80211_ht_addt_info_ie_to_ht_bss_info( - (struct ieee80211_ht_addt_info *) elems.ht_info_elem, &bss_info); changed |= ieee80211_handle_ht(local, 1, &conf->ht_conf, &bss_info); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 77e7b014872..cf6b121e1bb 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1379,6 +1379,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) return RX_QUEUED; } +#ifdef CONFIG_MAC80211_MESH static ieee80211_rx_result ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) { @@ -1453,7 +1454,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) else return RX_DROP_MONITOR; } - +#endif static ieee80211_rx_result debug_noinline ieee80211_rx_h_data(struct ieee80211_rx_data *rx) @@ -1780,8 +1781,10 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, /* must be after MMIC verify so header is counted in MPDU mic */ CALL_RXH(ieee80211_rx_h_remove_qos_control) CALL_RXH(ieee80211_rx_h_amsdu) +#ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&sdata->vif)) CALL_RXH(ieee80211_rx_h_mesh_fwding); +#endif CALL_RXH(ieee80211_rx_h_data) CALL_RXH(ieee80211_rx_h_ctrl) CALL_RXH(ieee80211_rx_h_action) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 8e6685e7ae8..416bb41099f 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -388,7 +388,8 @@ ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bss = ieee80211_bss_info_update(sdata->local, rx_status, mgmt, skb->len, &elems, freq, beacon); - ieee80211_rx_bss_put(sdata->local, bss); + if (bss) + ieee80211_rx_bss_put(sdata->local, bss); dev_kfree_skb(skb); return RX_QUEUED; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 9b72d15bc8d..7fef8ea1f5e 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -635,7 +635,12 @@ static void sta_info_debugfs_add_work(struct work_struct *work) spin_lock_irqsave(&local->sta_lock, flags); list_for_each_entry(tmp, &local->sta_list, list) { - if (!tmp->debugfs.dir) { + /* + * debugfs.add_has_run will be set by + * ieee80211_sta_debugfs_add regardless + * of what else it does. + */ + if (!tmp->debugfs.add_has_run) { sta = tmp; __sta_info_pin(sta); break; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index a6b51862a89..168a39a298b 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -300,6 +300,7 @@ struct sta_info { struct dentry *inactive_ms; struct dentry *last_seq_ctrl; struct dentry *agg_status; + bool add_has_run; } debugfs; #endif diff --git a/net/mac80211/util.c b/net/mac80211/util.c index f32561ec224..cee4884b9d0 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -529,12 +529,12 @@ void ieee802_11_parse_elems(u8 *start, size_t len, elems->ext_supp_rates_len = elen; break; case WLAN_EID_HT_CAPABILITY: - elems->ht_cap_elem = pos; - elems->ht_cap_elem_len = elen; + if (elen >= sizeof(struct ieee80211_ht_cap)) + elems->ht_cap_elem = (void *)pos; break; case WLAN_EID_HT_EXTRA_INFO: - elems->ht_info_elem = pos; - elems->ht_info_elem_len = elen; + if (elen >= sizeof(struct ieee80211_ht_addt_info)) + elems->ht_info_elem = (void *)pos; break; case WLAN_EID_MESH_ID: elems->mesh_id = pos; diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index 7e0d53abde2..742f811ca41 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -775,7 +775,7 @@ static int ieee80211_ioctl_siwfrag(struct net_device *dev, * configure it here */ if (local->ops->set_frag_threshold) - local->ops->set_frag_threshold( + return local->ops->set_frag_threshold( local_to_hw(local), local->fragmentation_threshold); diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index 03591d37b9c..b92df5c1dfc 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -115,7 +115,7 @@ int nf_conntrack_acct_init(struct net *net) if (net_eq(net, &init_net)) { #ifdef CONFIG_NF_CT_ACCT - printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Plase use\n"); + printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Please use\n"); printk(KERN_WARNING "nf_conntrack.acct=1 kernel paramater, acct=1 nf_conntrack module option or\n"); printk(KERN_WARNING "sysctl net.netfilter.nf_conntrack_acct=1 to enable it.\n"); #endif diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 27de3c7b006..622d7c671cb 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -38,9 +38,16 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_acct.h> +#include <net/netfilter/nf_nat.h> #define NF_CONNTRACK_VERSION "0.5.0" +unsigned int +(*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, + enum nf_nat_manip_type manip, + struct nlattr *attr) __read_mostly; +EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); + DEFINE_SPINLOCK(nf_conntrack_lock); EXPORT_SYMBOL_GPL(nf_conntrack_lock); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index cadfd15b44f..2e4ad9671e1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -689,71 +689,6 @@ ctnetlink_parse_tuple(struct nlattr *cda[], struct nf_conntrack_tuple *tuple, return 0; } -#ifdef CONFIG_NF_NAT_NEEDED -static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { - [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, - [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, -}; - -static int nfnetlink_parse_nat_proto(struct nlattr *attr, - const struct nf_conn *ct, - struct nf_nat_range *range) -{ - struct nlattr *tb[CTA_PROTONAT_MAX+1]; - const struct nf_nat_protocol *npt; - int err; - - err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy); - if (err < 0) - return err; - - npt = nf_nat_proto_find_get(nf_ct_protonum(ct)); - if (npt->nlattr_to_range) - err = npt->nlattr_to_range(tb, range); - nf_nat_proto_put(npt); - return err; -} - -static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { - [CTA_NAT_MINIP] = { .type = NLA_U32 }, - [CTA_NAT_MAXIP] = { .type = NLA_U32 }, -}; - -static inline int -nfnetlink_parse_nat(struct nlattr *nat, - const struct nf_conn *ct, struct nf_nat_range *range) -{ - struct nlattr *tb[CTA_NAT_MAX+1]; - int err; - - memset(range, 0, sizeof(*range)); - - err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy); - if (err < 0) - return err; - - if (tb[CTA_NAT_MINIP]) - range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]); - - if (!tb[CTA_NAT_MAXIP]) - range->max_ip = range->min_ip; - else - range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]); - - if (range->min_ip) - range->flags |= IP_NAT_RANGE_MAP_IPS; - - if (!tb[CTA_NAT_PROTO]) - return 0; - - err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); - if (err < 0) - return err; - - return 0; -} -#endif - static inline int ctnetlink_parse_help(struct nlattr *attr, char **helper_name) { @@ -879,6 +814,34 @@ out: } static int +ctnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + struct nlattr *attr) +{ + typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup; + + parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook); + if (!parse_nat_setup) { +#ifdef CONFIG_MODULES + rcu_read_unlock(); + nfnl_unlock(); + if (request_module("nf-nat-ipv4") < 0) { + nfnl_lock(); + rcu_read_lock(); + return -EOPNOTSUPP; + } + nfnl_lock(); + rcu_read_lock(); + if (nfnetlink_parse_nat_setup_hook) + return -EAGAIN; +#endif + return -EOPNOTSUPP; + } + + return parse_nat_setup(ct, manip, attr); +} + +static int ctnetlink_change_status(struct nf_conn *ct, struct nlattr *cda[]) { unsigned long d; @@ -897,31 +860,6 @@ ctnetlink_change_status(struct nf_conn *ct, struct nlattr *cda[]) /* ASSURED bit can only be set */ return -EBUSY; - if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) { -#ifndef CONFIG_NF_NAT_NEEDED - return -EOPNOTSUPP; -#else - struct nf_nat_range range; - - if (cda[CTA_NAT_DST]) { - if (nfnetlink_parse_nat(cda[CTA_NAT_DST], ct, - &range) < 0) - return -EINVAL; - if (nf_nat_initialized(ct, IP_NAT_MANIP_DST)) - return -EEXIST; - nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); - } - if (cda[CTA_NAT_SRC]) { - if (nfnetlink_parse_nat(cda[CTA_NAT_SRC], ct, - &range) < 0) - return -EINVAL; - if (nf_nat_initialized(ct, IP_NAT_MANIP_SRC)) - return -EEXIST; - nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); - } -#endif - } - /* Be careful here, modifying NAT bits can screw up things, * so don't let users modify them directly if they don't pass * nf_nat_range. */ @@ -929,6 +867,31 @@ ctnetlink_change_status(struct nf_conn *ct, struct nlattr *cda[]) return 0; } +static int +ctnetlink_change_nat(struct nf_conn *ct, struct nlattr *cda[]) +{ +#ifdef CONFIG_NF_NAT_NEEDED + int ret; + + if (cda[CTA_NAT_DST]) { + ret = ctnetlink_parse_nat_setup(ct, + IP_NAT_MANIP_DST, + cda[CTA_NAT_DST]); + if (ret < 0) + return ret; + } + if (cda[CTA_NAT_SRC]) { + ret = ctnetlink_parse_nat_setup(ct, + IP_NAT_MANIP_SRC, + cda[CTA_NAT_SRC]); + if (ret < 0) + return ret; + } + return 0; +#else + return -EOPNOTSUPP; +#endif +} static inline int ctnetlink_change_helper(struct nf_conn *ct, struct nlattr *cda[]) @@ -1157,6 +1120,14 @@ ctnetlink_create_conntrack(struct nlattr *cda[], } } + if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) { + err = ctnetlink_change_nat(ct, cda); + if (err < 0) { + rcu_read_unlock(); + goto err; + } + } + if (cda[CTA_PROTOINFO]) { err = ctnetlink_change_protoinfo(ct, cda); if (err < 0) { diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 373e51e91ce..1bc3001d182 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -65,7 +65,7 @@ void struct nf_conntrack_expect *exp) __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn); -#ifdef DEBUG +#if defined(DEBUG) || defined(CONFIG_DYNAMIC_PRINTK_DEBUG) /* PptpControlMessageType names */ const char *const pptp_msg_name[] = { "UNKNOWN_MESSAGE", diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c index fe34f4bf74c..cdc97f3105a 100644 --- a/net/netfilter/nf_tproxy_core.c +++ b/net/netfilter/nf_tproxy_core.c @@ -10,7 +10,6 @@ * */ -#include <linux/version.h> #include <linux/module.h> #include <linux/net.h> diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index b75c9c4a995..9c0ba17a1dd 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -44,15 +44,17 @@ static struct sock *nfnl = NULL; static const struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT]; static DEFINE_MUTEX(nfnl_mutex); -static inline void nfnl_lock(void) +void nfnl_lock(void) { mutex_lock(&nfnl_mutex); } +EXPORT_SYMBOL_GPL(nfnl_lock); -static inline void nfnl_unlock(void) +void nfnl_unlock(void) { mutex_unlock(&nfnl_mutex); } +EXPORT_SYMBOL_GPL(nfnl_unlock); int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) { @@ -132,9 +134,10 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return 0; type = nlh->nlmsg_type; +replay: ss = nfnetlink_get_subsys(type); if (!ss) { -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES nfnl_unlock(); request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); nfnl_lock(); @@ -165,7 +168,10 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } else return -EINVAL; - return nc->call(nfnl, skb, nlh, cda); + err = nc->call(nfnl, skb, nlh, cda); + if (err == -EAGAIN) + goto replay; + return err; } } diff --git a/net/netlabel/Makefile b/net/netlabel/Makefile index 8af18c0a47d..ea750e9df65 100644 --- a/net/netlabel/Makefile +++ b/net/netlabel/Makefile @@ -5,7 +5,8 @@ # # base objects -obj-y := netlabel_user.o netlabel_kapi.o netlabel_domainhash.o +obj-y := netlabel_user.o netlabel_kapi.o +obj-y += netlabel_domainhash.o netlabel_addrlist.o # management objects obj-y += netlabel_mgmt.o diff --git a/net/netlabel/netlabel_addrlist.c b/net/netlabel/netlabel_addrlist.c new file mode 100644 index 00000000000..b0925a30335 --- /dev/null +++ b/net/netlabel/netlabel_addrlist.c @@ -0,0 +1,388 @@ +/* + * NetLabel Network Address Lists + * + * This file contains network address list functions used to manage ordered + * lists of network addresses for use by the NetLabel subsystem. The NetLabel + * system manages static and dynamic label mappings for network protocols such + * as CIPSO and RIPSO. + * + * Author: Paul Moore <paul.moore@hp.com> + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <linux/audit.h> + +#include "netlabel_addrlist.h" + +/* + * Address List Functions + */ + +/** + * netlbl_af4list_search - Search for a matching IPv4 address entry + * @addr: IPv4 address + * @head: the list head + * + * Description: + * Searches the IPv4 address list given by @head. If a matching address entry + * is found it is returned, otherwise NULL is returned. The caller is + * responsible for calling the rcu_read_[un]lock() functions. + * + */ +struct netlbl_af4list *netlbl_af4list_search(__be32 addr, + struct list_head *head) +{ + struct netlbl_af4list *iter; + + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && (addr & iter->mask) == iter->addr) + return iter; + + return NULL; +} + +/** + * netlbl_af4list_search_exact - Search for an exact IPv4 address entry + * @addr: IPv4 address + * @mask: IPv4 address mask + * @head: the list head + * + * Description: + * Searches the IPv4 address list given by @head. If an exact match if found + * it is returned, otherwise NULL is returned. The caller is responsible for + * calling the rcu_read_[un]lock() functions. + * + */ +struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr, + __be32 mask, + struct list_head *head) +{ + struct netlbl_af4list *iter; + + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && iter->addr == addr && iter->mask == mask) + return iter; + + return NULL; +} + + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_af6list_search - Search for a matching IPv6 address entry + * @addr: IPv6 address + * @head: the list head + * + * Description: + * Searches the IPv6 address list given by @head. If a matching address entry + * is found it is returned, otherwise NULL is returned. The caller is + * responsible for calling the rcu_read_[un]lock() functions. + * + */ +struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr, + struct list_head *head) +{ + struct netlbl_af6list *iter; + + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && + ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0) + return iter; + + return NULL; +} + +/** + * netlbl_af6list_search_exact - Search for an exact IPv6 address entry + * @addr: IPv6 address + * @mask: IPv6 address mask + * @head: the list head + * + * Description: + * Searches the IPv6 address list given by @head. If an exact match if found + * it is returned, otherwise NULL is returned. The caller is responsible for + * calling the rcu_read_[un]lock() functions. + * + */ +struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr, + const struct in6_addr *mask, + struct list_head *head) +{ + struct netlbl_af6list *iter; + + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && + ipv6_addr_equal(&iter->addr, addr) && + ipv6_addr_equal(&iter->mask, mask)) + return iter; + + return NULL; +} +#endif /* IPv6 */ + +/** + * netlbl_af4list_add - Add a new IPv4 address entry to a list + * @entry: address entry + * @head: the list head + * + * Description: + * Add a new address entry to the list pointed to by @head. On success zero is + * returned, otherwise a negative value is returned. The caller is responsible + * for calling the necessary locking functions. + * + */ +int netlbl_af4list_add(struct netlbl_af4list *entry, struct list_head *head) +{ + struct netlbl_af4list *iter; + + iter = netlbl_af4list_search(entry->addr, head); + if (iter != NULL && + iter->addr == entry->addr && iter->mask == entry->mask) + return -EEXIST; + + /* in order to speed up address searches through the list (the common + * case) we need to keep the list in order based on the size of the + * address mask such that the entry with the widest mask (smallest + * numerical value) appears first in the list */ + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && + ntohl(entry->mask) > ntohl(iter->mask)) { + __list_add_rcu(&entry->list, + iter->list.prev, + &iter->list); + return 0; + } + list_add_tail_rcu(&entry->list, head); + return 0; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_af6list_add - Add a new IPv6 address entry to a list + * @entry: address entry + * @head: the list head + * + * Description: + * Add a new address entry to the list pointed to by @head. On success zero is + * returned, otherwise a negative value is returned. The caller is responsible + * for calling the necessary locking functions. + * + */ +int netlbl_af6list_add(struct netlbl_af6list *entry, struct list_head *head) +{ + struct netlbl_af6list *iter; + + iter = netlbl_af6list_search(&entry->addr, head); + if (iter != NULL && + ipv6_addr_equal(&iter->addr, &entry->addr) && + ipv6_addr_equal(&iter->mask, &entry->mask)) + return -EEXIST; + + /* in order to speed up address searches through the list (the common + * case) we need to keep the list in order based on the size of the + * address mask such that the entry with the widest mask (smallest + * numerical value) appears first in the list */ + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && + ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) { + __list_add_rcu(&entry->list, + iter->list.prev, + &iter->list); + return 0; + } + list_add_tail_rcu(&entry->list, head); + return 0; +} +#endif /* IPv6 */ + +/** + * netlbl_af4list_remove_entry - Remove an IPv4 address entry + * @entry: address entry + * + * Description: + * Remove the specified IP address entry. The caller is responsible for + * calling the necessary locking functions. + * + */ +void netlbl_af4list_remove_entry(struct netlbl_af4list *entry) +{ + entry->valid = 0; + list_del_rcu(&entry->list); +} + +/** + * netlbl_af4list_remove - Remove an IPv4 address entry + * @addr: IP address + * @mask: IP address mask + * @head: the list head + * + * Description: + * Remove an IP address entry from the list pointed to by @head. Returns the + * entry on success, NULL on failure. The caller is responsible for calling + * the necessary locking functions. + * + */ +struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask, + struct list_head *head) +{ + struct netlbl_af4list *entry; + + entry = netlbl_af4list_search(addr, head); + if (entry != NULL && entry->addr == addr && entry->mask == mask) { + netlbl_af4list_remove_entry(entry); + return entry; + } + + return NULL; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_af6list_remove_entry - Remove an IPv6 address entry + * @entry: address entry + * + * Description: + * Remove the specified IP address entry. The caller is responsible for + * calling the necessary locking functions. + * + */ +void netlbl_af6list_remove_entry(struct netlbl_af6list *entry) +{ + entry->valid = 0; + list_del_rcu(&entry->list); +} + +/** + * netlbl_af6list_remove - Remove an IPv6 address entry + * @addr: IP address + * @mask: IP address mask + * @head: the list head + * + * Description: + * Remove an IP address entry from the list pointed to by @head. Returns the + * entry on success, NULL on failure. The caller is responsible for calling + * the necessary locking functions. + * + */ +struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr, + const struct in6_addr *mask, + struct list_head *head) +{ + struct netlbl_af6list *entry; + + entry = netlbl_af6list_search(addr, head); + if (entry != NULL && + ipv6_addr_equal(&entry->addr, addr) && + ipv6_addr_equal(&entry->mask, mask)) { + netlbl_af6list_remove_entry(entry); + return entry; + } + + return NULL; +} +#endif /* IPv6 */ + +/* + * Audit Helper Functions + */ + +/** + * netlbl_af4list_audit_addr - Audit an IPv4 address + * @audit_buf: audit buffer + * @src: true if source address, false if destination + * @dev: network interface + * @addr: IP address + * @mask: IP address mask + * + * Description: + * Write the IPv4 address and address mask, if necessary, to @audit_buf. + * + */ +void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, + int src, const char *dev, + __be32 addr, __be32 mask) +{ + u32 mask_val = ntohl(mask); + char *dir = (src ? "src" : "dst"); + + if (dev != NULL) + audit_log_format(audit_buf, " netif=%s", dev); + audit_log_format(audit_buf, " %s=" NIPQUAD_FMT, dir, NIPQUAD(addr)); + if (mask_val != 0xffffffff) { + u32 mask_len = 0; + while (mask_val > 0) { + mask_val <<= 1; + mask_len++; + } + audit_log_format(audit_buf, " %s_prefixlen=%d", dir, mask_len); + } +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_af6list_audit_addr - Audit an IPv6 address + * @audit_buf: audit buffer + * @src: true if source address, false if destination + * @dev: network interface + * @addr: IP address + * @mask: IP address mask + * + * Description: + * Write the IPv6 address and address mask, if necessary, to @audit_buf. + * + */ +void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, + int src, + const char *dev, + const struct in6_addr *addr, + const struct in6_addr *mask) +{ + char *dir = (src ? "src" : "dst"); + + if (dev != NULL) + audit_log_format(audit_buf, " netif=%s", dev); + audit_log_format(audit_buf, " %s=" NIP6_FMT, dir, NIP6(*addr)); + if (ntohl(mask->s6_addr32[3]) != 0xffffffff) { + u32 mask_len = 0; + u32 mask_val; + int iter = -1; + while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff) + mask_len += 32; + mask_val = ntohl(mask->s6_addr32[iter]); + while (mask_val > 0) { + mask_val <<= 1; + mask_len++; + } + audit_log_format(audit_buf, " %s_prefixlen=%d", dir, mask_len); + } +} +#endif /* IPv6 */ diff --git a/net/netlabel/netlabel_addrlist.h b/net/netlabel/netlabel_addrlist.h new file mode 100644 index 00000000000..0242bead405 --- /dev/null +++ b/net/netlabel/netlabel_addrlist.h @@ -0,0 +1,189 @@ +/* + * NetLabel Network Address Lists + * + * This file contains network address list functions used to manage ordered + * lists of network addresses for use by the NetLabel subsystem. The NetLabel + * system manages static and dynamic label mappings for network protocols such + * as CIPSO and RIPSO. + * + * Author: Paul Moore <paul.moore@hp.com> + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef _NETLABEL_ADDRLIST_H +#define _NETLABEL_ADDRLIST_H + +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/list.h> +#include <linux/in6.h> +#include <linux/audit.h> + +/** + * struct netlbl_af4list - NetLabel IPv4 address list + * @addr: IPv4 address + * @mask: IPv4 address mask + * @valid: valid flag + * @list: list structure, used internally + */ +struct netlbl_af4list { + __be32 addr; + __be32 mask; + + u32 valid; + struct list_head list; +}; + +/** + * struct netlbl_af6list - NetLabel IPv6 address list + * @addr: IPv6 address + * @mask: IPv6 address mask + * @valid: valid flag + * @list: list structure, used internally + */ +struct netlbl_af6list { + struct in6_addr addr; + struct in6_addr mask; + + u32 valid; + struct list_head list; +}; + +#define __af4list_entry(ptr) container_of(ptr, struct netlbl_af4list, list) + +static inline struct netlbl_af4list *__af4list_valid(struct list_head *s, + struct list_head *h) +{ + struct list_head *i = s; + struct netlbl_af4list *n = __af4list_entry(s); + while (i != h && !n->valid) { + i = i->next; + n = __af4list_entry(i); + } + return n; +} + +static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s, + struct list_head *h) +{ + struct list_head *i = s; + struct netlbl_af4list *n = __af4list_entry(s); + while (i != h && !n->valid) { + i = rcu_dereference(i->next); + n = __af4list_entry(i); + } + return n; +} + +#define netlbl_af4list_foreach(iter, head) \ + for (iter = __af4list_valid((head)->next, head); \ + prefetch(iter->list.next), &iter->list != (head); \ + iter = __af4list_valid(iter->list.next, head)) + +#define netlbl_af4list_foreach_rcu(iter, head) \ + for (iter = __af4list_valid_rcu((head)->next, head); \ + prefetch(iter->list.next), &iter->list != (head); \ + iter = __af4list_valid_rcu(iter->list.next, head)) + +#define netlbl_af4list_foreach_safe(iter, tmp, head) \ + for (iter = __af4list_valid((head)->next, head), \ + tmp = __af4list_valid(iter->list.next, head); \ + &iter->list != (head); \ + iter = tmp, tmp = __af4list_valid(iter->list.next, head)) + +int netlbl_af4list_add(struct netlbl_af4list *entry, + struct list_head *head); +struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask, + struct list_head *head); +void netlbl_af4list_remove_entry(struct netlbl_af4list *entry); +struct netlbl_af4list *netlbl_af4list_search(__be32 addr, + struct list_head *head); +struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr, + __be32 mask, + struct list_head *head); +void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, + int src, const char *dev, + __be32 addr, __be32 mask); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + +#define __af6list_entry(ptr) container_of(ptr, struct netlbl_af6list, list) + +static inline struct netlbl_af6list *__af6list_valid(struct list_head *s, + struct list_head *h) +{ + struct list_head *i = s; + struct netlbl_af6list *n = __af6list_entry(s); + while (i != h && !n->valid) { + i = i->next; + n = __af6list_entry(i); + } + return n; +} + +static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s, + struct list_head *h) +{ + struct list_head *i = s; + struct netlbl_af6list *n = __af6list_entry(s); + while (i != h && !n->valid) { + i = rcu_dereference(i->next); + n = __af6list_entry(i); + } + return n; +} + +#define netlbl_af6list_foreach(iter, head) \ + for (iter = __af6list_valid((head)->next, head); \ + prefetch(iter->list.next), &iter->list != (head); \ + iter = __af6list_valid(iter->list.next, head)) + +#define netlbl_af6list_foreach_rcu(iter, head) \ + for (iter = __af6list_valid_rcu((head)->next, head); \ + prefetch(iter->list.next), &iter->list != (head); \ + iter = __af6list_valid_rcu(iter->list.next, head)) + +#define netlbl_af6list_foreach_safe(iter, tmp, head) \ + for (iter = __af6list_valid((head)->next, head), \ + tmp = __af6list_valid(iter->list.next, head); \ + &iter->list != (head); \ + iter = tmp, tmp = __af6list_valid(iter->list.next, head)) + +int netlbl_af6list_add(struct netlbl_af6list *entry, + struct list_head *head); +struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr, + const struct in6_addr *mask, + struct list_head *head); +void netlbl_af6list_remove_entry(struct netlbl_af6list *entry); +struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr, + struct list_head *head); +struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr, + const struct in6_addr *mask, + struct list_head *head); +void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, + int src, + const char *dev, + const struct in6_addr *addr, + const struct in6_addr *mask); +#endif /* IPV6 */ + +#endif diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 0aec318bf0e..fff32b70efa 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -43,6 +43,7 @@ #include "netlabel_user.h" #include "netlabel_cipso_v4.h" #include "netlabel_mgmt.h" +#include "netlabel_domainhash.h" /* Argument struct for cipso_v4_doi_walk() */ struct netlbl_cipsov4_doiwalk_arg { @@ -51,6 +52,12 @@ struct netlbl_cipsov4_doiwalk_arg { u32 seq; }; +/* Argument struct for netlbl_domhsh_walk() */ +struct netlbl_domhsh_walk_arg { + struct netlbl_audit *audit_info; + u32 doi; +}; + /* NetLabel Generic NETLINK CIPSOv4 family */ static struct genl_family netlbl_cipsov4_gnl_family = { .id = GENL_ID_GENERATE, @@ -81,32 +88,6 @@ static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1 */ /** - * netlbl_cipsov4_doi_free - Frees a CIPSO V4 DOI definition - * @entry: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that the memory allocated to the DOI definition can be released - * safely. - * - */ -void netlbl_cipsov4_doi_free(struct rcu_head *entry) -{ - struct cipso_v4_doi *ptr; - - ptr = container_of(entry, struct cipso_v4_doi, rcu); - switch (ptr->type) { - case CIPSO_V4_MAP_STD: - kfree(ptr->map.std->lvl.cipso); - kfree(ptr->map.std->lvl.local); - kfree(ptr->map.std->cat.cipso); - kfree(ptr->map.std->cat.local); - break; - } - kfree(ptr); -} - -/** * netlbl_cipsov4_add_common - Parse the common sections of a ADD message * @info: the Generic NETLINK info block * @doi_def: the CIPSO V4 DOI definition @@ -151,9 +132,9 @@ static int netlbl_cipsov4_add_common(struct genl_info *info, * @info: the Generic NETLINK info block * * Description: - * Create a new CIPSO_V4_MAP_STD DOI definition based on the given ADD message - * and add it to the CIPSO V4 engine. Return zero on success and non-zero on - * error. + * Create a new CIPSO_V4_MAP_TRANS DOI definition based on the given ADD + * message and add it to the CIPSO V4 engine. Return zero on success and + * non-zero on error. * */ static int netlbl_cipsov4_add_std(struct genl_info *info) @@ -183,7 +164,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info) ret_val = -ENOMEM; goto add_std_failure; } - doi_def->type = CIPSO_V4_MAP_STD; + doi_def->type = CIPSO_V4_MAP_TRANS; ret_val = netlbl_cipsov4_add_common(info, doi_def); if (ret_val != 0) @@ -342,7 +323,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info) add_std_failure: if (doi_def) - netlbl_cipsov4_doi_free(&doi_def->rcu); + cipso_v4_doi_free(doi_def); return ret_val; } @@ -379,7 +360,44 @@ static int netlbl_cipsov4_add_pass(struct genl_info *info) return 0; add_pass_failure: - netlbl_cipsov4_doi_free(&doi_def->rcu); + cipso_v4_doi_free(doi_def); + return ret_val; +} + +/** + * netlbl_cipsov4_add_local - Adds a CIPSO V4 DOI definition + * @info: the Generic NETLINK info block + * + * Description: + * Create a new CIPSO_V4_MAP_LOCAL DOI definition based on the given ADD + * message and add it to the CIPSO V4 engine. Return zero on success and + * non-zero on error. + * + */ +static int netlbl_cipsov4_add_local(struct genl_info *info) +{ + int ret_val; + struct cipso_v4_doi *doi_def = NULL; + + if (!info->attrs[NLBL_CIPSOV4_A_TAGLST]) + return -EINVAL; + + doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + if (doi_def == NULL) + return -ENOMEM; + doi_def->type = CIPSO_V4_MAP_LOCAL; + + ret_val = netlbl_cipsov4_add_common(info, doi_def); + if (ret_val != 0) + goto add_local_failure; + + ret_val = cipso_v4_doi_add(doi_def); + if (ret_val != 0) + goto add_local_failure; + return 0; + +add_local_failure: + cipso_v4_doi_free(doi_def); return ret_val; } @@ -412,14 +430,18 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) type = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE]); switch (type) { - case CIPSO_V4_MAP_STD: - type_str = "std"; + case CIPSO_V4_MAP_TRANS: + type_str = "trans"; ret_val = netlbl_cipsov4_add_std(info); break; case CIPSO_V4_MAP_PASS: type_str = "pass"; ret_val = netlbl_cipsov4_add_pass(info); break; + case CIPSO_V4_MAP_LOCAL: + type_str = "local"; + ret_val = netlbl_cipsov4_add_local(info); + break; } if (ret_val == 0) atomic_inc(&netlabel_mgmt_protocount); @@ -491,7 +513,7 @@ list_start: doi_def = cipso_v4_doi_getdef(doi); if (doi_def == NULL) { ret_val = -EINVAL; - goto list_failure; + goto list_failure_lock; } ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MTYPE, doi_def->type); @@ -516,7 +538,7 @@ list_start: nla_nest_end(ans_skb, nla_a); switch (doi_def->type) { - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVLLST); if (nla_a == NULL) { ret_val = -ENOMEM; @@ -655,7 +677,7 @@ static int netlbl_cipsov4_listall(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_cipsov4_doiwalk_arg cb_arg; - int doi_skip = cb->args[0]; + u32 doi_skip = cb->args[0]; cb_arg.nl_cb = cb; cb_arg.skb = skb; @@ -668,6 +690,29 @@ static int netlbl_cipsov4_listall(struct sk_buff *skb, } /** + * netlbl_cipsov4_remove_cb - netlbl_cipsov4_remove() callback for REMOVE + * @entry: LSM domain mapping entry + * @arg: the netlbl_domhsh_walk_arg structure + * + * Description: + * This function is intended for use by netlbl_cipsov4_remove() as the callback + * for the netlbl_domhsh_walk() function; it removes LSM domain map entries + * which are associated with the CIPSO DOI specified in @arg. Returns zero on + * success, negative values on failure. + * + */ +static int netlbl_cipsov4_remove_cb(struct netlbl_dom_map *entry, void *arg) +{ + struct netlbl_domhsh_walk_arg *cb_arg = arg; + + if (entry->type == NETLBL_NLTYPE_CIPSOV4 && + entry->type_def.cipsov4->doi == cb_arg->doi) + return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info); + + return 0; +} + +/** * netlbl_cipsov4_remove - Handle a REMOVE message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block @@ -681,8 +726,11 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; u32 doi = 0; + struct netlbl_domhsh_walk_arg cb_arg; struct audit_buffer *audit_buf; struct netlbl_audit audit_info; + u32 skip_bkt = 0; + u32 skip_chain = 0; if (!info->attrs[NLBL_CIPSOV4_A_DOI]) return -EINVAL; @@ -690,11 +738,15 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); netlbl_netlink_auditinfo(skb, &audit_info); - ret_val = cipso_v4_doi_remove(doi, - &audit_info, - netlbl_cipsov4_doi_free); - if (ret_val == 0) - atomic_dec(&netlabel_mgmt_protocount); + cb_arg.doi = doi; + cb_arg.audit_info = &audit_info; + ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, + netlbl_cipsov4_remove_cb, &cb_arg); + if (ret_val == 0 || ret_val == -ENOENT) { + ret_val = cipso_v4_doi_remove(doi, &audit_info); + if (ret_val == 0) + atomic_dec(&netlabel_mgmt_protocount); + } audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL, &audit_info); diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h index 220cb9d06b4..c8a4079261f 100644 --- a/net/netlabel/netlabel_cipso_v4.h +++ b/net/netlabel/netlabel_cipso_v4.h @@ -45,12 +45,13 @@ * NLBL_CIPSOV4_A_MTYPE * NLBL_CIPSOV4_A_TAGLST * - * If using CIPSO_V4_MAP_STD the following attributes are required: + * If using CIPSO_V4_MAP_TRANS the following attributes are required: * * NLBL_CIPSOV4_A_MLSLVLLST * NLBL_CIPSOV4_A_MLSCATLST * - * If using CIPSO_V4_MAP_PASS no additional attributes are required. + * If using CIPSO_V4_MAP_PASS or CIPSO_V4_MAP_LOCAL no additional attributes + * are required. * * o REMOVE: * Sent by an application to remove a specific DOI mapping table from the @@ -76,12 +77,13 @@ * NLBL_CIPSOV4_A_MTYPE * NLBL_CIPSOV4_A_TAGLST * - * If using CIPSO_V4_MAP_STD the following attributes are required: + * If using CIPSO_V4_MAP_TRANS the following attributes are required: * * NLBL_CIPSOV4_A_MLSLVLLST * NLBL_CIPSOV4_A_MLSCATLST * - * If using CIPSO_V4_MAP_PASS no additional attributes are required. + * If using CIPSO_V4_MAP_PASS or CIPSO_V4_MAP_LOCAL no additional attributes + * are required. * * o LISTALL: * This message is sent by an application to list the valid DOIs on the diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index 643c032a3a5..5fadf10e5dd 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -11,7 +11,7 @@ */ /* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,6 +40,7 @@ #include <asm/bug.h> #include "netlabel_mgmt.h" +#include "netlabel_addrlist.h" #include "netlabel_domainhash.h" #include "netlabel_user.h" @@ -72,8 +73,28 @@ static struct netlbl_dom_map *netlbl_domhsh_def = NULL; static void netlbl_domhsh_free_entry(struct rcu_head *entry) { struct netlbl_dom_map *ptr; + struct netlbl_af4list *iter4; + struct netlbl_af4list *tmp4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *iter6; + struct netlbl_af6list *tmp6; +#endif /* IPv6 */ ptr = container_of(entry, struct netlbl_dom_map, rcu); + if (ptr->type == NETLBL_NLTYPE_ADDRSELECT) { + netlbl_af4list_foreach_safe(iter4, tmp4, + &ptr->type_def.addrsel->list4) { + netlbl_af4list_remove_entry(iter4); + kfree(netlbl_domhsh_addr4_entry(iter4)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_safe(iter6, tmp6, + &ptr->type_def.addrsel->list6) { + netlbl_af6list_remove_entry(iter6); + kfree(netlbl_domhsh_addr6_entry(iter6)); + } +#endif /* IPv6 */ + } kfree(ptr->domain); kfree(ptr); } @@ -115,13 +136,13 @@ static u32 netlbl_domhsh_hash(const char *key) static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) { u32 bkt; + struct list_head *bkt_list; struct netlbl_dom_map *iter; if (domain != NULL) { bkt = netlbl_domhsh_hash(domain); - list_for_each_entry_rcu(iter, - &rcu_dereference(netlbl_domhsh)->tbl[bkt], - list) + bkt_list = &rcu_dereference(netlbl_domhsh)->tbl[bkt]; + list_for_each_entry_rcu(iter, bkt_list, list) if (iter->valid && strcmp(iter->domain, domain) == 0) return iter; } @@ -156,6 +177,69 @@ static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) return entry; } +/** + * netlbl_domhsh_audit_add - Generate an audit entry for an add event + * @entry: the entry being added + * @addr4: the IPv4 address information + * @addr6: the IPv6 address information + * @result: the result code + * @audit_info: NetLabel audit information + * + * Description: + * Generate an audit record for adding a new NetLabel/LSM mapping entry with + * the given information. Caller is responsibile for holding the necessary + * locks. + * + */ +static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, + struct netlbl_af4list *addr4, + struct netlbl_af6list *addr6, + int result, + struct netlbl_audit *audit_info) +{ + struct audit_buffer *audit_buf; + struct cipso_v4_doi *cipsov4 = NULL; + u32 type; + + audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); + if (audit_buf != NULL) { + audit_log_format(audit_buf, " nlbl_domain=%s", + entry->domain ? entry->domain : "(default)"); + if (addr4 != NULL) { + struct netlbl_domaddr4_map *map4; + map4 = netlbl_domhsh_addr4_entry(addr4); + type = map4->type; + cipsov4 = map4->type_def.cipsov4; + netlbl_af4list_audit_addr(audit_buf, 0, NULL, + addr4->addr, addr4->mask); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + } else if (addr6 != NULL) { + struct netlbl_domaddr6_map *map6; + map6 = netlbl_domhsh_addr6_entry(addr6); + type = map6->type; + netlbl_af6list_audit_addr(audit_buf, 0, NULL, + &addr6->addr, &addr6->mask); +#endif /* IPv6 */ + } else { + type = entry->type; + cipsov4 = entry->type_def.cipsov4; + } + switch (type) { + case NETLBL_NLTYPE_UNLABELED: + audit_log_format(audit_buf, " nlbl_protocol=unlbl"); + break; + case NETLBL_NLTYPE_CIPSOV4: + BUG_ON(cipsov4 == NULL); + audit_log_format(audit_buf, + " nlbl_protocol=cipsov4 cipso_doi=%u", + cipsov4->doi); + break; + } + audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0); + audit_log_end(audit_buf); + } +} + /* * Domain Hash Table Functions */ @@ -213,74 +297,106 @@ int __init netlbl_domhsh_init(u32 size) int netlbl_domhsh_add(struct netlbl_dom_map *entry, struct netlbl_audit *audit_info) { - int ret_val; - u32 bkt; - struct audit_buffer *audit_buf; - - switch (entry->type) { - case NETLBL_NLTYPE_UNLABELED: - ret_val = 0; - break; - case NETLBL_NLTYPE_CIPSOV4: - ret_val = cipso_v4_doi_domhsh_add(entry->type_def.cipsov4, - entry->domain); - break; - default: - return -EINVAL; - } - if (ret_val != 0) - return ret_val; - - entry->valid = 1; - INIT_RCU_HEAD(&entry->rcu); + int ret_val = 0; + struct netlbl_dom_map *entry_old; + struct netlbl_af4list *iter4; + struct netlbl_af4list *tmp4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *iter6; + struct netlbl_af6list *tmp6; +#endif /* IPv6 */ rcu_read_lock(); + spin_lock(&netlbl_domhsh_lock); - if (entry->domain != NULL) { - bkt = netlbl_domhsh_hash(entry->domain); - if (netlbl_domhsh_search(entry->domain) == NULL) + if (entry->domain != NULL) + entry_old = netlbl_domhsh_search(entry->domain); + else + entry_old = netlbl_domhsh_search_def(entry->domain); + if (entry_old == NULL) { + entry->valid = 1; + INIT_RCU_HEAD(&entry->rcu); + + if (entry->domain != NULL) { + u32 bkt = netlbl_domhsh_hash(entry->domain); list_add_tail_rcu(&entry->list, &rcu_dereference(netlbl_domhsh)->tbl[bkt]); - else - ret_val = -EEXIST; - } else { - INIT_LIST_HEAD(&entry->list); - if (rcu_dereference(netlbl_domhsh_def) == NULL) + } else { + INIT_LIST_HEAD(&entry->list); rcu_assign_pointer(netlbl_domhsh_def, entry); - else - ret_val = -EEXIST; - } - spin_unlock(&netlbl_domhsh_lock); - audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); - if (audit_buf != NULL) { - audit_log_format(audit_buf, - " nlbl_domain=%s", - entry->domain ? entry->domain : "(default)"); - switch (entry->type) { - case NETLBL_NLTYPE_UNLABELED: - audit_log_format(audit_buf, " nlbl_protocol=unlbl"); - break; - case NETLBL_NLTYPE_CIPSOV4: - audit_log_format(audit_buf, - " nlbl_protocol=cipsov4 cipso_doi=%u", - entry->type_def.cipsov4->doi); - break; } - audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); - audit_log_end(audit_buf); - } - rcu_read_unlock(); - if (ret_val != 0) { - switch (entry->type) { - case NETLBL_NLTYPE_CIPSOV4: - if (cipso_v4_doi_domhsh_remove(entry->type_def.cipsov4, - entry->domain) != 0) - BUG(); - break; + if (entry->type == NETLBL_NLTYPE_ADDRSELECT) { + netlbl_af4list_foreach_rcu(iter4, + &entry->type_def.addrsel->list4) + netlbl_domhsh_audit_add(entry, iter4, NULL, + ret_val, audit_info); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(iter6, + &entry->type_def.addrsel->list6) + netlbl_domhsh_audit_add(entry, NULL, iter6, + ret_val, audit_info); +#endif /* IPv6 */ + } else + netlbl_domhsh_audit_add(entry, NULL, NULL, + ret_val, audit_info); + } else if (entry_old->type == NETLBL_NLTYPE_ADDRSELECT && + entry->type == NETLBL_NLTYPE_ADDRSELECT) { + struct list_head *old_list4; + struct list_head *old_list6; + + old_list4 = &entry_old->type_def.addrsel->list4; + old_list6 = &entry_old->type_def.addrsel->list6; + + /* we only allow the addition of address selectors if all of + * the selectors do not exist in the existing domain map */ + netlbl_af4list_foreach_rcu(iter4, + &entry->type_def.addrsel->list4) + if (netlbl_af4list_search_exact(iter4->addr, + iter4->mask, + old_list4)) { + ret_val = -EEXIST; + goto add_return; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(iter6, + &entry->type_def.addrsel->list6) + if (netlbl_af6list_search_exact(&iter6->addr, + &iter6->mask, + old_list6)) { + ret_val = -EEXIST; + goto add_return; + } +#endif /* IPv6 */ + + netlbl_af4list_foreach_safe(iter4, tmp4, + &entry->type_def.addrsel->list4) { + netlbl_af4list_remove_entry(iter4); + iter4->valid = 1; + ret_val = netlbl_af4list_add(iter4, old_list4); + netlbl_domhsh_audit_add(entry_old, iter4, NULL, + ret_val, audit_info); + if (ret_val != 0) + goto add_return; } - } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_safe(iter6, tmp6, + &entry->type_def.addrsel->list6) { + netlbl_af6list_remove_entry(iter6); + iter6->valid = 1; + ret_val = netlbl_af6list_add(iter6, old_list6); + netlbl_domhsh_audit_add(entry_old, NULL, iter6, + ret_val, audit_info); + if (ret_val != 0) + goto add_return; + } +#endif /* IPv6 */ + } else + ret_val = -EINVAL; +add_return: + spin_unlock(&netlbl_domhsh_lock); + rcu_read_unlock(); return ret_val; } @@ -302,35 +418,26 @@ int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, } /** - * netlbl_domhsh_remove - Removes an entry from the domain hash table - * @domain: the domain to remove + * netlbl_domhsh_remove_entry - Removes a given entry from the domain table + * @entry: the entry to remove * @audit_info: NetLabel audit information * * Description: * Removes an entry from the domain hash table and handles any updates to the - * lower level protocol handler (i.e. CIPSO). Returns zero on success, - * negative on failure. + * lower level protocol handler (i.e. CIPSO). Caller is responsible for + * ensuring that the RCU read lock is held. Returns zero on success, negative + * on failure. * */ -int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) +int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, + struct netlbl_audit *audit_info) { - int ret_val = -ENOENT; - struct netlbl_dom_map *entry; + int ret_val = 0; struct audit_buffer *audit_buf; - rcu_read_lock(); - if (domain) - entry = netlbl_domhsh_search(domain); - else - entry = netlbl_domhsh_search_def(domain); if (entry == NULL) - goto remove_return; - switch (entry->type) { - case NETLBL_NLTYPE_CIPSOV4: - cipso_v4_doi_domhsh_remove(entry->type_def.cipsov4, - entry->domain); - break; - } + return -ENOENT; + spin_lock(&netlbl_domhsh_lock); if (entry->valid) { entry->valid = 0; @@ -338,8 +445,8 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) list_del_rcu(&entry->list); else rcu_assign_pointer(netlbl_domhsh_def, NULL); - ret_val = 0; - } + } else + ret_val = -ENOENT; spin_unlock(&netlbl_domhsh_lock); audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); @@ -351,10 +458,54 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) audit_log_end(audit_buf); } -remove_return: - rcu_read_unlock(); - if (ret_val == 0) + if (ret_val == 0) { + struct netlbl_af4list *iter4; + struct netlbl_domaddr4_map *map4; + + switch (entry->type) { + case NETLBL_NLTYPE_ADDRSELECT: + netlbl_af4list_foreach_rcu(iter4, + &entry->type_def.addrsel->list4) { + map4 = netlbl_domhsh_addr4_entry(iter4); + cipso_v4_doi_putdef(map4->type_def.cipsov4); + } + /* no need to check the IPv6 list since we currently + * support only unlabeled protocols for IPv6 */ + break; + case NETLBL_NLTYPE_CIPSOV4: + cipso_v4_doi_putdef(entry->type_def.cipsov4); + break; + } call_rcu(&entry->rcu, netlbl_domhsh_free_entry); + } + + return ret_val; +} + +/** + * netlbl_domhsh_remove - Removes an entry from the domain hash table + * @domain: the domain to remove + * @audit_info: NetLabel audit information + * + * Description: + * Removes an entry from the domain hash table and handles any updates to the + * lower level protocol handler (i.e. CIPSO). Returns zero on success, + * negative on failure. + * + */ +int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) +{ + int ret_val; + struct netlbl_dom_map *entry; + + rcu_read_lock(); + if (domain) + entry = netlbl_domhsh_search(domain); + else + entry = netlbl_domhsh_search_def(domain); + ret_val = netlbl_domhsh_remove_entry(entry, audit_info); + rcu_read_unlock(); + return ret_val; } @@ -389,6 +540,70 @@ struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) } /** + * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table + * @domain: the domain name to search for + * @addr: the IP address to search for + * + * Description: + * Look through the domain hash table searching for an entry to match @domain + * and @addr, return a pointer to a copy of the entry or NULL. The caller is + * responsible for ensuring that rcu_read_[un]lock() is called. + * + */ +struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain, + __be32 addr) +{ + struct netlbl_dom_map *dom_iter; + struct netlbl_af4list *addr_iter; + + dom_iter = netlbl_domhsh_search_def(domain); + if (dom_iter == NULL) + return NULL; + if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) + return NULL; + + addr_iter = netlbl_af4list_search(addr, + &dom_iter->type_def.addrsel->list4); + if (addr_iter == NULL) + return NULL; + + return netlbl_domhsh_addr4_entry(addr_iter); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table + * @domain: the domain name to search for + * @addr: the IP address to search for + * + * Description: + * Look through the domain hash table searching for an entry to match @domain + * and @addr, return a pointer to a copy of the entry or NULL. The caller is + * responsible for ensuring that rcu_read_[un]lock() is called. + * + */ +struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain, + const struct in6_addr *addr) +{ + struct netlbl_dom_map *dom_iter; + struct netlbl_af6list *addr_iter; + + dom_iter = netlbl_domhsh_search_def(domain); + if (dom_iter == NULL) + return NULL; + if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) + return NULL; + + addr_iter = netlbl_af6list_search(addr, + &dom_iter->type_def.addrsel->list6); + if (addr_iter == NULL) + return NULL; + + return netlbl_domhsh_addr6_entry(addr_iter); +} +#endif /* IPv6 */ + +/** * netlbl_domhsh_walk - Iterate through the domain mapping hash table * @skip_bkt: the number of buckets to skip at the start * @skip_chain: the number of entries to skip in the first iterated bucket @@ -410,6 +625,7 @@ int netlbl_domhsh_walk(u32 *skip_bkt, { int ret_val = -ENOENT; u32 iter_bkt; + struct list_head *iter_list; struct netlbl_dom_map *iter_entry; u32 chain_cnt = 0; @@ -417,9 +633,8 @@ int netlbl_domhsh_walk(u32 *skip_bkt, for (iter_bkt = *skip_bkt; iter_bkt < rcu_dereference(netlbl_domhsh)->size; iter_bkt++, chain_cnt = 0) { - list_for_each_entry_rcu(iter_entry, - &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt], - list) + iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt]; + list_for_each_entry_rcu(iter_entry, iter_list, list) if (iter_entry->valid) { if (chain_cnt++ < *skip_chain) continue; diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h index 8220990ceb9..bfcb6763a1a 100644 --- a/net/netlabel/netlabel_domainhash.h +++ b/net/netlabel/netlabel_domainhash.h @@ -11,7 +11,7 @@ */ /* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,16 +36,43 @@ #include <linux/rcupdate.h> #include <linux/list.h> +#include "netlabel_addrlist.h" + /* Domain hash table size */ /* XXX - currently this number is an uneducated guess */ #define NETLBL_DOMHSH_BITSIZE 7 -/* Domain mapping definition struct */ +/* Domain mapping definition structures */ +#define netlbl_domhsh_addr4_entry(iter) \ + container_of(iter, struct netlbl_domaddr4_map, list) +struct netlbl_domaddr4_map { + u32 type; + union { + struct cipso_v4_doi *cipsov4; + } type_def; + + struct netlbl_af4list list; +}; +#define netlbl_domhsh_addr6_entry(iter) \ + container_of(iter, struct netlbl_domaddr6_map, list) +struct netlbl_domaddr6_map { + u32 type; + + /* NOTE: no 'type_def' union needed at present since we don't currently + * support any IPv6 labeling protocols */ + + struct netlbl_af6list list; +}; +struct netlbl_domaddr_map { + struct list_head list4; + struct list_head list6; +}; struct netlbl_dom_map { char *domain; u32 type; union { struct cipso_v4_doi *cipsov4; + struct netlbl_domaddr_map *addrsel; } type_def; u32 valid; @@ -61,12 +88,21 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, struct netlbl_audit *audit_info); int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, struct netlbl_audit *audit_info); +int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, + struct netlbl_audit *audit_info); int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info); struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain); +struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain, + __be32 addr); int netlbl_domhsh_walk(u32 *skip_bkt, u32 *skip_chain, int (*callback) (struct netlbl_dom_map *entry, void *arg), void *cb_arg); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain, + const struct in6_addr *addr); +#endif /* IPv6 */ + #endif diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 39793a1a93a..b32eceb3ab0 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -10,7 +10,7 @@ */ /* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -82,7 +82,7 @@ int netlbl_cfg_unlbl_add_map(const char *domain, entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) - goto cfg_unlbl_add_map_failure; + return -ENOMEM; if (domain != NULL) { entry->domain = kstrdup(domain, GFP_ATOMIC); if (entry->domain == NULL) @@ -104,49 +104,6 @@ cfg_unlbl_add_map_failure: } /** - * netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition - * @doi_def: the DOI definition - * @audit_info: NetLabel audit information - * - * Description: - * Add a new CIPSOv4 DOI definition to the NetLabel subsystem. Returns zero on - * success, negative values on failure. - * - */ -int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def, - struct netlbl_audit *audit_info) -{ - int ret_val; - const char *type_str; - struct audit_buffer *audit_buf; - - ret_val = cipso_v4_doi_add(doi_def); - - audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD, - audit_info); - if (audit_buf != NULL) { - switch (doi_def->type) { - case CIPSO_V4_MAP_STD: - type_str = "std"; - break; - case CIPSO_V4_MAP_PASS: - type_str = "pass"; - break; - default: - type_str = "(unknown)"; - } - audit_log_format(audit_buf, - " cipso_doi=%u cipso_type=%s res=%u", - doi_def->doi, - type_str, - ret_val == 0 ? 1 : 0); - audit_log_end(audit_buf); - } - - return ret_val; -} - -/** * netlbl_cfg_cipsov4_add_map - Add a new CIPSOv4 DOI definition and mapping * @doi_def: the DOI definition * @domain: the domain mapping to add @@ -164,58 +121,71 @@ int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -ENOMEM; + u32 doi; + u32 doi_type; struct netlbl_dom_map *entry; + const char *type_str; + struct audit_buffer *audit_buf; + + doi = doi_def->doi; + doi_type = doi_def->type; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) - goto cfg_cipsov4_add_map_failure; + return -ENOMEM; if (domain != NULL) { entry->domain = kstrdup(domain, GFP_ATOMIC); if (entry->domain == NULL) goto cfg_cipsov4_add_map_failure; } - entry->type = NETLBL_NLTYPE_CIPSOV4; - entry->type_def.cipsov4 = doi_def; - - /* Grab a RCU read lock here so nothing happens to the doi_def variable - * between adding it to the CIPSOv4 protocol engine and adding a - * domain mapping for it. */ - rcu_read_lock(); - ret_val = netlbl_cfg_cipsov4_add(doi_def, audit_info); + ret_val = cipso_v4_doi_add(doi_def); if (ret_val != 0) - goto cfg_cipsov4_add_map_failure_unlock; + goto cfg_cipsov4_add_map_failure_remove_doi; + entry->type = NETLBL_NLTYPE_CIPSOV4; + entry->type_def.cipsov4 = cipso_v4_doi_getdef(doi); + if (entry->type_def.cipsov4 == NULL) { + ret_val = -ENOENT; + goto cfg_cipsov4_add_map_failure_remove_doi; + } ret_val = netlbl_domhsh_add(entry, audit_info); if (ret_val != 0) - goto cfg_cipsov4_add_map_failure_remove_doi; - rcu_read_unlock(); + goto cfg_cipsov4_add_map_failure_release_doi; - return 0; +cfg_cipsov4_add_map_return: + audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD, + audit_info); + if (audit_buf != NULL) { + switch (doi_type) { + case CIPSO_V4_MAP_TRANS: + type_str = "trans"; + break; + case CIPSO_V4_MAP_PASS: + type_str = "pass"; + break; + case CIPSO_V4_MAP_LOCAL: + type_str = "local"; + break; + default: + type_str = "(unknown)"; + } + audit_log_format(audit_buf, + " cipso_doi=%u cipso_type=%s res=%u", + doi, type_str, ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + return ret_val; + +cfg_cipsov4_add_map_failure_release_doi: + cipso_v4_doi_putdef(doi_def); cfg_cipsov4_add_map_failure_remove_doi: - cipso_v4_doi_remove(doi_def->doi, audit_info, netlbl_cipsov4_doi_free); -cfg_cipsov4_add_map_failure_unlock: - rcu_read_unlock(); + cipso_v4_doi_remove(doi, audit_info); cfg_cipsov4_add_map_failure: if (entry != NULL) kfree(entry->domain); kfree(entry); - return ret_val; -} - -/** - * netlbl_cfg_cipsov4_del - Removean existing CIPSOv4 DOI definition - * @doi: the CIPSO DOI value - * @audit_info: NetLabel audit information - * - * Description: - * Removes an existing CIPSOv4 DOI definition from the NetLabel subsystem. - * Returns zero on success, negative values on failure. - * - */ -int netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info) -{ - return cipso_v4_doi_remove(doi, audit_info, netlbl_cipsov4_doi_free); + goto cfg_cipsov4_add_map_return; } /* @@ -452,7 +422,9 @@ int netlbl_enabled(void) * Attach the correct label to the given socket using the security attributes * specified in @secattr. This function requires exclusive access to @sk, * which means it either needs to be in the process of being created or locked. - * Returns zero on success, negative values on failure. + * Returns zero on success, -EDESTADDRREQ if the domain is configured to use + * network address selectors (can't blindly label the socket), and negative + * values on all other failures. * */ int netlbl_sock_setattr(struct sock *sk, @@ -466,6 +438,9 @@ int netlbl_sock_setattr(struct sock *sk, if (dom_entry == NULL) goto socket_setattr_return; switch (dom_entry->type) { + case NETLBL_NLTYPE_ADDRSELECT: + ret_val = -EDESTADDRREQ; + break; case NETLBL_NLTYPE_CIPSOV4: ret_val = cipso_v4_sock_setattr(sk, dom_entry->type_def.cipsov4, @@ -484,6 +459,20 @@ socket_setattr_return: } /** + * netlbl_sock_delattr - Delete all the NetLabel labels on a socket + * @sk: the socket + * + * Description: + * Remove all the NetLabel labeling from @sk. The caller is responsible for + * ensuring that @sk is locked. + * + */ +void netlbl_sock_delattr(struct sock *sk) +{ + cipso_v4_sock_delattr(sk); +} + +/** * netlbl_sock_getattr - Determine the security attributes of a sock * @sk: the sock * @secattr: the security attributes @@ -501,6 +490,128 @@ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) } /** + * netlbl_conn_setattr - Label a connected socket using the correct protocol + * @sk: the socket to label + * @addr: the destination address + * @secattr: the security attributes + * + * Description: + * Attach the correct label to the given connected socket using the security + * attributes specified in @secattr. The caller is responsible for ensuring + * that @sk is locked. Returns zero on success, negative values on failure. + * + */ +int netlbl_conn_setattr(struct sock *sk, + struct sockaddr *addr, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct sockaddr_in *addr4; + struct netlbl_domaddr4_map *af4_entry; + + rcu_read_lock(); + switch (addr->sa_family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + af4_entry = netlbl_domhsh_getentry_af4(secattr->domain, + addr4->sin_addr.s_addr); + if (af4_entry == NULL) { + ret_val = -ENOENT; + goto conn_setattr_return; + } + switch (af4_entry->type) { + case NETLBL_NLTYPE_CIPSOV4: + ret_val = cipso_v4_sock_setattr(sk, + af4_entry->type_def.cipsov4, + secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + /* just delete the protocols we support for right now + * but we could remove other protocols if needed */ + cipso_v4_sock_delattr(sk); + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + /* since we don't support any IPv6 labeling protocols right + * now we can optimize everything away until we do */ + ret_val = 0; + break; +#endif /* IPv6 */ + default: + ret_val = 0; + } + +conn_setattr_return: + rcu_read_unlock(); + return ret_val; +} + +/** + * netlbl_skbuff_setattr - Label a packet using the correct protocol + * @skb: the packet + * @family: protocol family + * @secattr: the security attributes + * + * Description: + * Attach the correct label to the given packet using the security attributes + * specified in @secattr. Returns zero on success, negative values on failure. + * + */ +int netlbl_skbuff_setattr(struct sk_buff *skb, + u16 family, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct iphdr *hdr4; + struct netlbl_domaddr4_map *af4_entry; + + rcu_read_lock(); + switch (family) { + case AF_INET: + hdr4 = ip_hdr(skb); + af4_entry = netlbl_domhsh_getentry_af4(secattr->domain, + hdr4->daddr); + if (af4_entry == NULL) { + ret_val = -ENOENT; + goto skbuff_setattr_return; + } + switch (af4_entry->type) { + case NETLBL_NLTYPE_CIPSOV4: + ret_val = cipso_v4_skbuff_setattr(skb, + af4_entry->type_def.cipsov4, + secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + /* just delete the protocols we support for right now + * but we could remove other protocols if needed */ + ret_val = cipso_v4_skbuff_delattr(skb); + break; + default: + ret_val = -ENOENT; + } + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + /* since we don't support any IPv6 labeling protocols right + * now we can optimize everything away until we do */ + ret_val = 0; + break; +#endif /* IPv6 */ + default: + ret_val = 0; + } + +skbuff_setattr_return: + rcu_read_unlock(); + return ret_val; +} + +/** * netlbl_skbuff_getattr - Determine the security attributes of a packet * @skb: the packet * @family: protocol family @@ -528,6 +639,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, * netlbl_skbuff_err - Handle a LSM error on a sk_buff * @skb: the packet * @error: the error code + * @gateway: true if host is acting as a gateway, false otherwise * * Description: * Deal with a LSM problem when handling the packet in @skb, typically this is @@ -535,10 +647,10 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, * according to the packet's labeling protocol. * */ -void netlbl_skbuff_err(struct sk_buff *skb, int error) +void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway) { if (CIPSO_V4_OPTEXIST(skb)) - cipso_v4_error(skb, error, 0); + cipso_v4_error(skb, error, gateway); } /** diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 44be5d5261f..ee769ecaa13 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -10,7 +10,7 @@ */ /* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -32,9 +32,13 @@ #include <linux/socket.h> #include <linux/string.h> #include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/in6.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> +#include <net/ip.h> +#include <net/ipv6.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <asm/atomic.h> @@ -71,86 +75,337 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { }; /* - * NetLabel Command Handlers + * Helper Functions */ /** * netlbl_mgmt_add - Handle an ADD message - * @skb: the NETLINK buffer * @info: the Generic NETLINK info block + * @audit_info: NetLabel audit information * * Description: - * Process a user generated ADD message and add the domains from the message - * to the hash table. See netlabel.h for a description of the message format. - * Returns zero on success, negative values on failure. + * Helper function for the ADD and ADDDEF messages to add the domain mappings + * from the message to the hash table. See netlabel.h for a description of the + * message format. Returns zero on success, negative values on failure. * */ -static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info) +static int netlbl_mgmt_add_common(struct genl_info *info, + struct netlbl_audit *audit_info) { int ret_val = -EINVAL; struct netlbl_dom_map *entry = NULL; - size_t tmp_size; + struct netlbl_domaddr_map *addrmap = NULL; + struct cipso_v4_doi *cipsov4 = NULL; u32 tmp_val; - struct netlbl_audit audit_info; - - if (!info->attrs[NLBL_MGMT_A_DOMAIN] || - !info->attrs[NLBL_MGMT_A_PROTOCOL]) - goto add_failure; - - netlbl_netlink_auditinfo(skb, &audit_info); entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry == NULL) { ret_val = -ENOMEM; goto add_failure; } - tmp_size = nla_len(info->attrs[NLBL_MGMT_A_DOMAIN]); - entry->domain = kmalloc(tmp_size, GFP_KERNEL); - if (entry->domain == NULL) { - ret_val = -ENOMEM; - goto add_failure; - } entry->type = nla_get_u32(info->attrs[NLBL_MGMT_A_PROTOCOL]); - nla_strlcpy(entry->domain, info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size); + if (info->attrs[NLBL_MGMT_A_DOMAIN]) { + size_t tmp_size = nla_len(info->attrs[NLBL_MGMT_A_DOMAIN]); + entry->domain = kmalloc(tmp_size, GFP_KERNEL); + if (entry->domain == NULL) { + ret_val = -ENOMEM; + goto add_failure; + } + nla_strlcpy(entry->domain, + info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size); + } + + /* NOTE: internally we allow/use a entry->type value of + * NETLBL_NLTYPE_ADDRSELECT but we don't currently allow users + * to pass that as a protocol value because we need to know the + * "real" protocol */ switch (entry->type) { case NETLBL_NLTYPE_UNLABELED: - ret_val = netlbl_domhsh_add(entry, &audit_info); break; case NETLBL_NLTYPE_CIPSOV4: if (!info->attrs[NLBL_MGMT_A_CV4DOI]) goto add_failure; tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]); - /* We should be holding a rcu_read_lock() here while we hold - * the result but since the entry will always be deleted when - * the CIPSO DOI is deleted we aren't going to keep the - * lock. */ - rcu_read_lock(); - entry->type_def.cipsov4 = cipso_v4_doi_getdef(tmp_val); - if (entry->type_def.cipsov4 == NULL) { - rcu_read_unlock(); + cipsov4 = cipso_v4_doi_getdef(tmp_val); + if (cipsov4 == NULL) goto add_failure; - } - ret_val = netlbl_domhsh_add(entry, &audit_info); - rcu_read_unlock(); + entry->type_def.cipsov4 = cipsov4; break; default: goto add_failure; } + + if (info->attrs[NLBL_MGMT_A_IPV4ADDR]) { + struct in_addr *addr; + struct in_addr *mask; + struct netlbl_domaddr4_map *map; + + addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL); + if (addrmap == NULL) { + ret_val = -ENOMEM; + goto add_failure; + } + INIT_LIST_HEAD(&addrmap->list4); + INIT_LIST_HEAD(&addrmap->list6); + + if (nla_len(info->attrs[NLBL_MGMT_A_IPV4ADDR]) != + sizeof(struct in_addr)) { + ret_val = -EINVAL; + goto add_failure; + } + if (nla_len(info->attrs[NLBL_MGMT_A_IPV4MASK]) != + sizeof(struct in_addr)) { + ret_val = -EINVAL; + goto add_failure; + } + addr = nla_data(info->attrs[NLBL_MGMT_A_IPV4ADDR]); + mask = nla_data(info->attrs[NLBL_MGMT_A_IPV4MASK]); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (map == NULL) { + ret_val = -ENOMEM; + goto add_failure; + } + map->list.addr = addr->s_addr & mask->s_addr; + map->list.mask = mask->s_addr; + map->list.valid = 1; + map->type = entry->type; + if (cipsov4) + map->type_def.cipsov4 = cipsov4; + + ret_val = netlbl_af4list_add(&map->list, &addrmap->list4); + if (ret_val != 0) { + kfree(map); + goto add_failure; + } + + entry->type = NETLBL_NLTYPE_ADDRSELECT; + entry->type_def.addrsel = addrmap; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + } else if (info->attrs[NLBL_MGMT_A_IPV6ADDR]) { + struct in6_addr *addr; + struct in6_addr *mask; + struct netlbl_domaddr6_map *map; + + addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL); + if (addrmap == NULL) { + ret_val = -ENOMEM; + goto add_failure; + } + INIT_LIST_HEAD(&addrmap->list4); + INIT_LIST_HEAD(&addrmap->list6); + + if (nla_len(info->attrs[NLBL_MGMT_A_IPV6ADDR]) != + sizeof(struct in6_addr)) { + ret_val = -EINVAL; + goto add_failure; + } + if (nla_len(info->attrs[NLBL_MGMT_A_IPV6MASK]) != + sizeof(struct in6_addr)) { + ret_val = -EINVAL; + goto add_failure; + } + addr = nla_data(info->attrs[NLBL_MGMT_A_IPV6ADDR]); + mask = nla_data(info->attrs[NLBL_MGMT_A_IPV6MASK]); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (map == NULL) { + ret_val = -ENOMEM; + goto add_failure; + } + ipv6_addr_copy(&map->list.addr, addr); + map->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; + map->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; + map->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; + map->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; + ipv6_addr_copy(&map->list.mask, mask); + map->list.valid = 1; + map->type = entry->type; + + ret_val = netlbl_af6list_add(&map->list, &addrmap->list6); + if (ret_val != 0) { + kfree(map); + goto add_failure; + } + + entry->type = NETLBL_NLTYPE_ADDRSELECT; + entry->type_def.addrsel = addrmap; +#endif /* IPv6 */ + } + + ret_val = netlbl_domhsh_add(entry, audit_info); if (ret_val != 0) goto add_failure; return 0; add_failure: + if (cipsov4) + cipso_v4_doi_putdef(cipsov4); if (entry) kfree(entry->domain); + kfree(addrmap); kfree(entry); return ret_val; } /** + * netlbl_mgmt_listentry - List a NetLabel/LSM domain map entry + * @skb: the NETLINK buffer + * @entry: the map entry + * + * Description: + * This function is a helper function used by the LISTALL and LISTDEF command + * handlers. The caller is responsibile for ensuring that the RCU read lock + * is held. Returns zero on success, negative values on failure. + * + */ +static int netlbl_mgmt_listentry(struct sk_buff *skb, + struct netlbl_dom_map *entry) +{ + int ret_val; + struct nlattr *nla_a; + struct nlattr *nla_b; + struct netlbl_af4list *iter4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *iter6; +#endif + + if (entry->domain != NULL) { + ret_val = nla_put_string(skb, + NLBL_MGMT_A_DOMAIN, entry->domain); + if (ret_val != 0) + return ret_val; + } + + switch (entry->type) { + case NETLBL_NLTYPE_ADDRSELECT: + nla_a = nla_nest_start(skb, NLBL_MGMT_A_SELECTORLIST); + if (nla_a == NULL) + return -ENOMEM; + + netlbl_af4list_foreach_rcu(iter4, + &entry->type_def.addrsel->list4) { + struct netlbl_domaddr4_map *map4; + struct in_addr addr_struct; + + nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR); + if (nla_b == NULL) + return -ENOMEM; + + addr_struct.s_addr = iter4->addr; + ret_val = nla_put(skb, NLBL_MGMT_A_IPV4ADDR, + sizeof(struct in_addr), + &addr_struct); + if (ret_val != 0) + return ret_val; + addr_struct.s_addr = iter4->mask; + ret_val = nla_put(skb, NLBL_MGMT_A_IPV4MASK, + sizeof(struct in_addr), + &addr_struct); + if (ret_val != 0) + return ret_val; + map4 = netlbl_domhsh_addr4_entry(iter4); + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, + map4->type); + if (ret_val != 0) + return ret_val; + switch (map4->type) { + case NETLBL_NLTYPE_CIPSOV4: + ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI, + map4->type_def.cipsov4->doi); + if (ret_val != 0) + return ret_val; + break; + } + + nla_nest_end(skb, nla_b); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(iter6, + &entry->type_def.addrsel->list6) { + struct netlbl_domaddr6_map *map6; + + nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR); + if (nla_b == NULL) + return -ENOMEM; + + ret_val = nla_put(skb, NLBL_MGMT_A_IPV6ADDR, + sizeof(struct in6_addr), + &iter6->addr); + if (ret_val != 0) + return ret_val; + ret_val = nla_put(skb, NLBL_MGMT_A_IPV6MASK, + sizeof(struct in6_addr), + &iter6->mask); + if (ret_val != 0) + return ret_val; + map6 = netlbl_domhsh_addr6_entry(iter6); + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, + map6->type); + if (ret_val != 0) + return ret_val; + + nla_nest_end(skb, nla_b); + } +#endif /* IPv6 */ + + nla_nest_end(skb, nla_a); + break; + case NETLBL_NLTYPE_UNLABELED: + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, entry->type); + break; + case NETLBL_NLTYPE_CIPSOV4: + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, entry->type); + if (ret_val != 0) + return ret_val; + ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI, + entry->type_def.cipsov4->doi); + break; + } + + return ret_val; +} + +/* + * NetLabel Command Handlers + */ + +/** + * netlbl_mgmt_add - Handle an ADD message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated ADD message and add the domains from the message + * to the hash table. See netlabel.h for a description of the message format. + * Returns zero on success, negative values on failure. + * + */ +static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info) +{ + struct netlbl_audit audit_info; + + if ((!info->attrs[NLBL_MGMT_A_DOMAIN]) || + (!info->attrs[NLBL_MGMT_A_PROTOCOL]) || + (info->attrs[NLBL_MGMT_A_IPV4ADDR] && + info->attrs[NLBL_MGMT_A_IPV6ADDR]) || + (info->attrs[NLBL_MGMT_A_IPV4MASK] && + info->attrs[NLBL_MGMT_A_IPV6MASK]) || + ((info->attrs[NLBL_MGMT_A_IPV4ADDR] != NULL) ^ + (info->attrs[NLBL_MGMT_A_IPV4MASK] != NULL)) || + ((info->attrs[NLBL_MGMT_A_IPV6ADDR] != NULL) ^ + (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + + return netlbl_mgmt_add_common(info, &audit_info); +} + +/** * netlbl_mgmt_remove - Handle a REMOVE message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block @@ -198,23 +453,9 @@ static int netlbl_mgmt_listall_cb(struct netlbl_dom_map *entry, void *arg) if (data == NULL) goto listall_cb_failure; - ret_val = nla_put_string(cb_arg->skb, - NLBL_MGMT_A_DOMAIN, - entry->domain); + ret_val = netlbl_mgmt_listentry(cb_arg->skb, entry); if (ret_val != 0) goto listall_cb_failure; - ret_val = nla_put_u32(cb_arg->skb, NLBL_MGMT_A_PROTOCOL, entry->type); - if (ret_val != 0) - goto listall_cb_failure; - switch (entry->type) { - case NETLBL_NLTYPE_CIPSOV4: - ret_val = nla_put_u32(cb_arg->skb, - NLBL_MGMT_A_CV4DOI, - entry->type_def.cipsov4->doi); - if (ret_val != 0) - goto listall_cb_failure; - break; - } cb_arg->seq++; return genlmsg_end(cb_arg->skb, data); @@ -268,56 +509,22 @@ static int netlbl_mgmt_listall(struct sk_buff *skb, */ static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info) { - int ret_val = -EINVAL; - struct netlbl_dom_map *entry = NULL; - u32 tmp_val; struct netlbl_audit audit_info; - if (!info->attrs[NLBL_MGMT_A_PROTOCOL]) - goto adddef_failure; + if ((!info->attrs[NLBL_MGMT_A_PROTOCOL]) || + (info->attrs[NLBL_MGMT_A_IPV4ADDR] && + info->attrs[NLBL_MGMT_A_IPV6ADDR]) || + (info->attrs[NLBL_MGMT_A_IPV4MASK] && + info->attrs[NLBL_MGMT_A_IPV6MASK]) || + ((info->attrs[NLBL_MGMT_A_IPV4ADDR] != NULL) ^ + (info->attrs[NLBL_MGMT_A_IPV4MASK] != NULL)) || + ((info->attrs[NLBL_MGMT_A_IPV6ADDR] != NULL) ^ + (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) + return -EINVAL; netlbl_netlink_auditinfo(skb, &audit_info); - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (entry == NULL) { - ret_val = -ENOMEM; - goto adddef_failure; - } - entry->type = nla_get_u32(info->attrs[NLBL_MGMT_A_PROTOCOL]); - - switch (entry->type) { - case NETLBL_NLTYPE_UNLABELED: - ret_val = netlbl_domhsh_add_default(entry, &audit_info); - break; - case NETLBL_NLTYPE_CIPSOV4: - if (!info->attrs[NLBL_MGMT_A_CV4DOI]) - goto adddef_failure; - - tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]); - /* We should be holding a rcu_read_lock() here while we hold - * the result but since the entry will always be deleted when - * the CIPSO DOI is deleted we aren't going to keep the - * lock. */ - rcu_read_lock(); - entry->type_def.cipsov4 = cipso_v4_doi_getdef(tmp_val); - if (entry->type_def.cipsov4 == NULL) { - rcu_read_unlock(); - goto adddef_failure; - } - ret_val = netlbl_domhsh_add_default(entry, &audit_info); - rcu_read_unlock(); - break; - default: - goto adddef_failure; - } - if (ret_val != 0) - goto adddef_failure; - - return 0; - -adddef_failure: - kfree(entry); - return ret_val; + return netlbl_mgmt_add_common(info, &audit_info); } /** @@ -371,19 +578,10 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info) ret_val = -ENOENT; goto listdef_failure_lock; } - ret_val = nla_put_u32(ans_skb, NLBL_MGMT_A_PROTOCOL, entry->type); - if (ret_val != 0) - goto listdef_failure_lock; - switch (entry->type) { - case NETLBL_NLTYPE_CIPSOV4: - ret_val = nla_put_u32(ans_skb, - NLBL_MGMT_A_CV4DOI, - entry->type_def.cipsov4->doi); - if (ret_val != 0) - goto listdef_failure_lock; - break; - } + ret_val = netlbl_mgmt_listentry(ans_skb, entry); rcu_read_unlock(); + if (ret_val != 0) + goto listdef_failure; genlmsg_end(ans_skb, data); return genlmsg_reply(ans_skb, info); diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h index a43bff169d6..05d96431f81 100644 --- a/net/netlabel/netlabel_mgmt.h +++ b/net/netlabel/netlabel_mgmt.h @@ -45,6 +45,16 @@ * NLBL_MGMT_A_DOMAIN * NLBL_MGMT_A_PROTOCOL * + * If IPv4 is specified the following attributes are required: + * + * NLBL_MGMT_A_IPV4ADDR + * NLBL_MGMT_A_IPV4MASK + * + * If IPv6 is specified the following attributes are required: + * + * NLBL_MGMT_A_IPV6ADDR + * NLBL_MGMT_A_IPV6MASK + * * If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required: * * NLBL_MGMT_A_CV4DOI @@ -68,13 +78,24 @@ * Required attributes: * * NLBL_MGMT_A_DOMAIN + * + * If the IP address selectors are not used the following attribute is + * required: + * * NLBL_MGMT_A_PROTOCOL * - * If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required: + * If the IP address selectors are used then the following attritbute is + * required: + * + * NLBL_MGMT_A_SELECTORLIST + * + * If the mapping is using the NETLBL_NLTYPE_CIPSOV4 type then the following + * attributes are required: * * NLBL_MGMT_A_CV4DOI * - * If using NETLBL_NLTYPE_UNLABELED no other attributes are required. + * If the mapping is using the NETLBL_NLTYPE_UNLABELED type no other + * attributes are required. * * o ADDDEF: * Sent by an application to set the default domain mapping for the NetLabel @@ -100,15 +121,23 @@ * application there is no payload. On success the kernel should send a * response using the following format. * - * Required attributes: + * If the IP address selectors are not used the following attribute is + * required: * * NLBL_MGMT_A_PROTOCOL * - * If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required: + * If the IP address selectors are used then the following attritbute is + * required: + * + * NLBL_MGMT_A_SELECTORLIST + * + * If the mapping is using the NETLBL_NLTYPE_CIPSOV4 type then the following + * attributes are required: * * NLBL_MGMT_A_CV4DOI * - * If using NETLBL_NLTYPE_UNLABELED no other attributes are required. + * If the mapping is using the NETLBL_NLTYPE_UNLABELED type no other + * attributes are required. * * o PROTOCOLS: * Sent by an application to request a list of configured NetLabel protocols @@ -162,6 +191,26 @@ enum { NLBL_MGMT_A_CV4DOI, /* (NLA_U32) * the CIPSOv4 DOI value */ + NLBL_MGMT_A_IPV6ADDR, + /* (NLA_BINARY, struct in6_addr) + * an IPv6 address */ + NLBL_MGMT_A_IPV6MASK, + /* (NLA_BINARY, struct in6_addr) + * an IPv6 address mask */ + NLBL_MGMT_A_IPV4ADDR, + /* (NLA_BINARY, struct in_addr) + * an IPv4 address */ + NLBL_MGMT_A_IPV4MASK, + /* (NLA_BINARY, struct in_addr) + * and IPv4 address mask */ + NLBL_MGMT_A_ADDRSELECTOR, + /* (NLA_NESTED) + * an IP address selector, must contain an address, mask, and protocol + * attribute plus any protocol specific attributes */ + NLBL_MGMT_A_SELECTORLIST, + /* (NLA_NESTED) + * the selector list, there must be at least one + * NLBL_MGMT_A_ADDRSELECTOR attribute */ __NLBL_MGMT_A_MAX, }; #define NLBL_MGMT_A_MAX (__NLBL_MGMT_A_MAX - 1) diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 921c118ead8..e8a5c32b0f1 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -10,7 +10,7 @@ */ /* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2007 + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -54,6 +54,7 @@ #include <asm/atomic.h> #include "netlabel_user.h" +#include "netlabel_addrlist.h" #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" #include "netlabel_mgmt.h" @@ -76,22 +77,20 @@ struct netlbl_unlhsh_tbl { struct list_head *tbl; u32 size; }; +#define netlbl_unlhsh_addr4_entry(iter) \ + container_of(iter, struct netlbl_unlhsh_addr4, list) struct netlbl_unlhsh_addr4 { - __be32 addr; - __be32 mask; u32 secid; - u32 valid; - struct list_head list; + struct netlbl_af4list list; struct rcu_head rcu; }; +#define netlbl_unlhsh_addr6_entry(iter) \ + container_of(iter, struct netlbl_unlhsh_addr6, list) struct netlbl_unlhsh_addr6 { - struct in6_addr addr; - struct in6_addr mask; u32 secid; - u32 valid; - struct list_head list; + struct netlbl_af6list list; struct rcu_head rcu; }; struct netlbl_unlhsh_iface { @@ -147,76 +146,6 @@ static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1 }; /* - * Audit Helper Functions - */ - -/** - * netlbl_unlabel_audit_addr4 - Audit an IPv4 address - * @audit_buf: audit buffer - * @dev: network interface - * @addr: IP address - * @mask: IP address mask - * - * Description: - * Write the IPv4 address and address mask, if necessary, to @audit_buf. - * - */ -static void netlbl_unlabel_audit_addr4(struct audit_buffer *audit_buf, - const char *dev, - __be32 addr, __be32 mask) -{ - u32 mask_val = ntohl(mask); - - if (dev != NULL) - audit_log_format(audit_buf, " netif=%s", dev); - audit_log_format(audit_buf, " src=" NIPQUAD_FMT, NIPQUAD(addr)); - if (mask_val != 0xffffffff) { - u32 mask_len = 0; - while (mask_val > 0) { - mask_val <<= 1; - mask_len++; - } - audit_log_format(audit_buf, " src_prefixlen=%d", mask_len); - } -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/** - * netlbl_unlabel_audit_addr6 - Audit an IPv6 address - * @audit_buf: audit buffer - * @dev: network interface - * @addr: IP address - * @mask: IP address mask - * - * Description: - * Write the IPv6 address and address mask, if necessary, to @audit_buf. - * - */ -static void netlbl_unlabel_audit_addr6(struct audit_buffer *audit_buf, - const char *dev, - const struct in6_addr *addr, - const struct in6_addr *mask) -{ - if (dev != NULL) - audit_log_format(audit_buf, " netif=%s", dev); - audit_log_format(audit_buf, " src=" NIP6_FMT, NIP6(*addr)); - if (ntohl(mask->s6_addr32[3]) != 0xffffffff) { - u32 mask_len = 0; - u32 mask_val; - int iter = -1; - while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff) - mask_len += 32; - mask_val = ntohl(mask->s6_addr32[iter]); - while (mask_val > 0) { - mask_val <<= 1; - mask_len++; - } - audit_log_format(audit_buf, " src_prefixlen=%d", mask_len); - } -} -#endif /* IPv6 */ - -/* * Unlabeled Connection Hash Table Functions */ @@ -274,26 +203,28 @@ static void netlbl_unlhsh_free_addr6(struct rcu_head *entry) static void netlbl_unlhsh_free_iface(struct rcu_head *entry) { struct netlbl_unlhsh_iface *iface; - struct netlbl_unlhsh_addr4 *iter4; - struct netlbl_unlhsh_addr4 *tmp4; - struct netlbl_unlhsh_addr6 *iter6; - struct netlbl_unlhsh_addr6 *tmp6; + struct netlbl_af4list *iter4; + struct netlbl_af4list *tmp4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *iter6; + struct netlbl_af6list *tmp6; +#endif /* IPv6 */ iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); /* no need for locks here since we are the only one with access to this * structure */ - list_for_each_entry_safe(iter4, tmp4, &iface->addr4_list, list) - if (iter4->valid) { - list_del_rcu(&iter4->list); - kfree(iter4); - } - list_for_each_entry_safe(iter6, tmp6, &iface->addr6_list, list) - if (iter6->valid) { - list_del_rcu(&iter6->list); - kfree(iter6); - } + netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) { + netlbl_af4list_remove_entry(iter4); + kfree(netlbl_unlhsh_addr4_entry(iter4)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) { + netlbl_af6list_remove_entry(iter6); + kfree(netlbl_unlhsh_addr6_entry(iter6)); + } +#endif /* IPv6 */ kfree(iface); } @@ -316,59 +247,6 @@ static u32 netlbl_unlhsh_hash(int ifindex) } /** - * netlbl_unlhsh_search_addr4 - Search for a matching IPv4 address entry - * @addr: IPv4 address - * @iface: the network interface entry - * - * Description: - * Searches the IPv4 address list of the network interface specified by @iface. - * If a matching address entry is found it is returned, otherwise NULL is - * returned. The caller is responsible for calling the rcu_read_[un]lock() - * functions. - * - */ -static struct netlbl_unlhsh_addr4 *netlbl_unlhsh_search_addr4( - __be32 addr, - const struct netlbl_unlhsh_iface *iface) -{ - struct netlbl_unlhsh_addr4 *iter; - - list_for_each_entry_rcu(iter, &iface->addr4_list, list) - if (iter->valid && (addr & iter->mask) == iter->addr) - return iter; - - return NULL; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/** - * netlbl_unlhsh_search_addr6 - Search for a matching IPv6 address entry - * @addr: IPv6 address - * @iface: the network interface entry - * - * Description: - * Searches the IPv6 address list of the network interface specified by @iface. - * If a matching address entry is found it is returned, otherwise NULL is - * returned. The caller is responsible for calling the rcu_read_[un]lock() - * functions. - * - */ -static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6( - const struct in6_addr *addr, - const struct netlbl_unlhsh_iface *iface) -{ - struct netlbl_unlhsh_addr6 *iter; - - list_for_each_entry_rcu(iter, &iface->addr6_list, list) - if (iter->valid && - ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0) - return iter; - - return NULL; -} -#endif /* IPv6 */ - -/** * netlbl_unlhsh_search_iface - Search for a matching interface entry * @ifindex: the network interface * @@ -381,12 +259,12 @@ static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6( static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) { u32 bkt; + struct list_head *bkt_list; struct netlbl_unlhsh_iface *iter; bkt = netlbl_unlhsh_hash(ifindex); - list_for_each_entry_rcu(iter, - &rcu_dereference(netlbl_unlhsh)->tbl[bkt], - list) + bkt_list = &rcu_dereference(netlbl_unlhsh)->tbl[bkt]; + list_for_each_entry_rcu(iter, bkt_list, list) if (iter->valid && iter->ifindex == ifindex) return iter; @@ -439,43 +317,26 @@ static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, const struct in_addr *mask, u32 secid) { + int ret_val; struct netlbl_unlhsh_addr4 *entry; - struct netlbl_unlhsh_addr4 *iter; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) return -ENOMEM; - entry->addr = addr->s_addr & mask->s_addr; - entry->mask = mask->s_addr; - entry->secid = secid; - entry->valid = 1; + entry->list.addr = addr->s_addr & mask->s_addr; + entry->list.mask = mask->s_addr; + entry->list.valid = 1; INIT_RCU_HEAD(&entry->rcu); + entry->secid = secid; spin_lock(&netlbl_unlhsh_lock); - iter = netlbl_unlhsh_search_addr4(entry->addr, iface); - if (iter != NULL && - iter->addr == addr->s_addr && iter->mask == mask->s_addr) { - spin_unlock(&netlbl_unlhsh_lock); - kfree(entry); - return -EEXIST; - } - /* in order to speed up address searches through the list (the common - * case) we need to keep the list in order based on the size of the - * address mask such that the entry with the widest mask (smallest - * numerical value) appears first in the list */ - list_for_each_entry_rcu(iter, &iface->addr4_list, list) - if (iter->valid && - ntohl(entry->mask) > ntohl(iter->mask)) { - __list_add_rcu(&entry->list, - iter->list.prev, - &iter->list); - spin_unlock(&netlbl_unlhsh_lock); - return 0; - } - list_add_tail_rcu(&entry->list, &iface->addr4_list); + ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list); spin_unlock(&netlbl_unlhsh_lock); - return 0; + + if (ret_val != 0) + kfree(entry); + return ret_val; } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) @@ -498,47 +359,29 @@ static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, const struct in6_addr *mask, u32 secid) { + int ret_val; struct netlbl_unlhsh_addr6 *entry; - struct netlbl_unlhsh_addr6 *iter; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) return -ENOMEM; - ipv6_addr_copy(&entry->addr, addr); - entry->addr.s6_addr32[0] &= mask->s6_addr32[0]; - entry->addr.s6_addr32[1] &= mask->s6_addr32[1]; - entry->addr.s6_addr32[2] &= mask->s6_addr32[2]; - entry->addr.s6_addr32[3] &= mask->s6_addr32[3]; - ipv6_addr_copy(&entry->mask, mask); - entry->secid = secid; - entry->valid = 1; + ipv6_addr_copy(&entry->list.addr, addr); + entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; + entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; + entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; + entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; + ipv6_addr_copy(&entry->list.mask, mask); + entry->list.valid = 1; INIT_RCU_HEAD(&entry->rcu); + entry->secid = secid; spin_lock(&netlbl_unlhsh_lock); - iter = netlbl_unlhsh_search_addr6(&entry->addr, iface); - if (iter != NULL && - (ipv6_addr_equal(&iter->addr, addr) && - ipv6_addr_equal(&iter->mask, mask))) { - spin_unlock(&netlbl_unlhsh_lock); - kfree(entry); - return -EEXIST; - } - /* in order to speed up address searches through the list (the common - * case) we need to keep the list in order based on the size of the - * address mask such that the entry with the widest mask (smallest - * numerical value) appears first in the list */ - list_for_each_entry_rcu(iter, &iface->addr6_list, list) - if (iter->valid && - ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) { - __list_add_rcu(&entry->list, - iter->list.prev, - &iter->list); - spin_unlock(&netlbl_unlhsh_lock); - return 0; - } - list_add_tail_rcu(&entry->list, &iface->addr6_list); + ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list); spin_unlock(&netlbl_unlhsh_lock); + + if (ret_val != 0) + kfree(entry); return 0; } #endif /* IPv6 */ @@ -658,10 +501,10 @@ static int netlbl_unlhsh_add(struct net *net, mask4 = (struct in_addr *)mask; ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); if (audit_buf != NULL) - netlbl_unlabel_audit_addr4(audit_buf, - dev_name, - addr4->s_addr, - mask4->s_addr); + netlbl_af4list_audit_addr(audit_buf, 1, + dev_name, + addr4->s_addr, + mask4->s_addr); break; } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) @@ -672,9 +515,9 @@ static int netlbl_unlhsh_add(struct net *net, mask6 = (struct in6_addr *)mask; ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); if (audit_buf != NULL) - netlbl_unlabel_audit_addr6(audit_buf, - dev_name, - addr6, mask6); + netlbl_af6list_audit_addr(audit_buf, 1, + dev_name, + addr6, mask6); break; } #endif /* IPv6 */ @@ -719,35 +562,34 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, const struct in_addr *mask, struct netlbl_audit *audit_info) { - int ret_val = -ENOENT; + int ret_val = 0; + struct netlbl_af4list *list_entry; struct netlbl_unlhsh_addr4 *entry; - struct audit_buffer *audit_buf = NULL; + struct audit_buffer *audit_buf; struct net_device *dev; - char *secctx = NULL; + char *secctx; u32 secctx_len; spin_lock(&netlbl_unlhsh_lock); - entry = netlbl_unlhsh_search_addr4(addr->s_addr, iface); - if (entry != NULL && - entry->addr == addr->s_addr && entry->mask == mask->s_addr) { - entry->valid = 0; - list_del_rcu(&entry->list); - ret_val = 0; - } + list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, + &iface->addr4_list); spin_unlock(&netlbl_unlhsh_lock); + if (list_entry == NULL) + ret_val = -ENOENT; + entry = netlbl_unlhsh_addr4_entry(list_entry); audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); if (audit_buf != NULL) { dev = dev_get_by_index(net, iface->ifindex); - netlbl_unlabel_audit_addr4(audit_buf, - (dev != NULL ? dev->name : NULL), - entry->addr, entry->mask); + netlbl_af4list_audit_addr(audit_buf, 1, + (dev != NULL ? dev->name : NULL), + addr->s_addr, mask->s_addr); if (dev != NULL) dev_put(dev); - if (security_secid_to_secctx(entry->secid, - &secctx, - &secctx_len) == 0) { + if (entry && security_secid_to_secctx(entry->secid, + &secctx, + &secctx_len) == 0) { audit_log_format(audit_buf, " sec_obj=%s", secctx); security_release_secctx(secctx, secctx_len); } @@ -781,36 +623,33 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, const struct in6_addr *mask, struct netlbl_audit *audit_info) { - int ret_val = -ENOENT; + int ret_val = 0; + struct netlbl_af6list *list_entry; struct netlbl_unlhsh_addr6 *entry; - struct audit_buffer *audit_buf = NULL; + struct audit_buffer *audit_buf; struct net_device *dev; - char *secctx = NULL; + char *secctx; u32 secctx_len; spin_lock(&netlbl_unlhsh_lock); - entry = netlbl_unlhsh_search_addr6(addr, iface); - if (entry != NULL && - (ipv6_addr_equal(&entry->addr, addr) && - ipv6_addr_equal(&entry->mask, mask))) { - entry->valid = 0; - list_del_rcu(&entry->list); - ret_val = 0; - } + list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); spin_unlock(&netlbl_unlhsh_lock); + if (list_entry == NULL) + ret_val = -ENOENT; + entry = netlbl_unlhsh_addr6_entry(list_entry); audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); if (audit_buf != NULL) { dev = dev_get_by_index(net, iface->ifindex); - netlbl_unlabel_audit_addr6(audit_buf, - (dev != NULL ? dev->name : NULL), - addr, mask); + netlbl_af6list_audit_addr(audit_buf, 1, + (dev != NULL ? dev->name : NULL), + addr, mask); if (dev != NULL) dev_put(dev); - if (security_secid_to_secctx(entry->secid, - &secctx, - &secctx_len) == 0) { + if (entry && security_secid_to_secctx(entry->secid, + &secctx, + &secctx_len) == 0) { audit_log_format(audit_buf, " sec_obj=%s", secctx); security_release_secctx(secctx, secctx_len); } @@ -836,16 +675,18 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, */ static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) { - struct netlbl_unlhsh_addr4 *iter4; - struct netlbl_unlhsh_addr6 *iter6; + struct netlbl_af4list *iter4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *iter6; +#endif /* IPv6 */ spin_lock(&netlbl_unlhsh_lock); - list_for_each_entry_rcu(iter4, &iface->addr4_list, list) - if (iter4->valid) - goto unlhsh_condremove_failure; - list_for_each_entry_rcu(iter6, &iface->addr6_list, list) - if (iter6->valid) - goto unlhsh_condremove_failure; + netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list) + goto unlhsh_condremove_failure; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list) + goto unlhsh_condremove_failure; +#endif /* IPv6 */ iface->valid = 0; if (iface->ifindex > 0) list_del_rcu(&iface->list); @@ -1349,7 +1190,7 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd, if (addr4) { struct in_addr addr_struct; - addr_struct.s_addr = addr4->addr; + addr_struct.s_addr = addr4->list.addr; ret_val = nla_put(cb_arg->skb, NLBL_UNLABEL_A_IPV4ADDR, sizeof(struct in_addr), @@ -1357,7 +1198,7 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd, if (ret_val != 0) goto list_cb_failure; - addr_struct.s_addr = addr4->mask; + addr_struct.s_addr = addr4->list.mask; ret_val = nla_put(cb_arg->skb, NLBL_UNLABEL_A_IPV4MASK, sizeof(struct in_addr), @@ -1370,14 +1211,14 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd, ret_val = nla_put(cb_arg->skb, NLBL_UNLABEL_A_IPV6ADDR, sizeof(struct in6_addr), - &addr6->addr); + &addr6->list.addr); if (ret_val != 0) goto list_cb_failure; ret_val = nla_put(cb_arg->skb, NLBL_UNLABEL_A_IPV6MASK, sizeof(struct in6_addr), - &addr6->mask); + &addr6->list.mask); if (ret_val != 0) goto list_cb_failure; @@ -1425,8 +1266,11 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, u32 iter_bkt; u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; struct netlbl_unlhsh_iface *iface; - struct netlbl_unlhsh_addr4 *addr4; - struct netlbl_unlhsh_addr6 *addr6; + struct list_head *iter_list; + struct netlbl_af4list *addr4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *addr6; +#endif cb_arg.nl_cb = cb; cb_arg.skb = skb; @@ -1436,44 +1280,43 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, for (iter_bkt = skip_bkt; iter_bkt < rcu_dereference(netlbl_unlhsh)->size; iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { - list_for_each_entry_rcu(iface, - &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt], - list) { + iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; + list_for_each_entry_rcu(iface, iter_list, list) { if (!iface->valid || iter_chain++ < skip_chain) continue; - list_for_each_entry_rcu(addr4, - &iface->addr4_list, - list) { - if (!addr4->valid || iter_addr4++ < skip_addr4) + netlbl_af4list_foreach_rcu(addr4, + &iface->addr4_list) { + if (iter_addr4++ < skip_addr4) continue; if (netlbl_unlabel_staticlist_gen( - NLBL_UNLABEL_C_STATICLIST, - iface, - addr4, - NULL, - &cb_arg) < 0) { + NLBL_UNLABEL_C_STATICLIST, + iface, + netlbl_unlhsh_addr4_entry(addr4), + NULL, + &cb_arg) < 0) { iter_addr4--; iter_chain--; goto unlabel_staticlist_return; } } - list_for_each_entry_rcu(addr6, - &iface->addr6_list, - list) { - if (!addr6->valid || iter_addr6++ < skip_addr6) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(addr6, + &iface->addr6_list) { + if (iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen( - NLBL_UNLABEL_C_STATICLIST, - iface, - NULL, - addr6, - &cb_arg) < 0) { + NLBL_UNLABEL_C_STATICLIST, + iface, + NULL, + netlbl_unlhsh_addr6_entry(addr6), + &cb_arg) < 0) { iter_addr6--; iter_chain--; goto unlabel_staticlist_return; } } +#endif /* IPv6 */ } } @@ -1504,9 +1347,12 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, struct netlbl_unlhsh_iface *iface; u32 skip_addr4 = cb->args[0]; u32 skip_addr6 = cb->args[1]; - u32 iter_addr4 = 0, iter_addr6 = 0; - struct netlbl_unlhsh_addr4 *addr4; - struct netlbl_unlhsh_addr6 *addr6; + u32 iter_addr4 = 0; + struct netlbl_af4list *addr4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + u32 iter_addr6 = 0; + struct netlbl_af6list *addr6; +#endif cb_arg.nl_cb = cb; cb_arg.skb = skb; @@ -1517,30 +1363,32 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, if (iface == NULL || !iface->valid) goto unlabel_staticlistdef_return; - list_for_each_entry_rcu(addr4, &iface->addr4_list, list) { - if (!addr4->valid || iter_addr4++ < skip_addr4) + netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { + if (iter_addr4++ < skip_addr4) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, - iface, - addr4, - NULL, - &cb_arg) < 0) { + iface, + netlbl_unlhsh_addr4_entry(addr4), + NULL, + &cb_arg) < 0) { iter_addr4--; goto unlabel_staticlistdef_return; } } - list_for_each_entry_rcu(addr6, &iface->addr6_list, list) { - if (!addr6->valid || iter_addr6++ < skip_addr6) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { + if (iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, - iface, - NULL, - addr6, - &cb_arg) < 0) { + iface, + NULL, + netlbl_unlhsh_addr6_entry(addr6), + &cb_arg) < 0) { iter_addr6--; goto unlabel_staticlistdef_return; } } +#endif /* IPv6 */ unlabel_staticlistdef_return: rcu_read_unlock(); @@ -1718,25 +1566,27 @@ int netlbl_unlabel_getattr(const struct sk_buff *skb, switch (family) { case PF_INET: { struct iphdr *hdr4; - struct netlbl_unlhsh_addr4 *addr4; + struct netlbl_af4list *addr4; hdr4 = ip_hdr(skb); - addr4 = netlbl_unlhsh_search_addr4(hdr4->saddr, iface); + addr4 = netlbl_af4list_search(hdr4->saddr, + &iface->addr4_list); if (addr4 == NULL) goto unlabel_getattr_nolabel; - secattr->attr.secid = addr4->secid; + secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid; break; } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case PF_INET6: { struct ipv6hdr *hdr6; - struct netlbl_unlhsh_addr6 *addr6; + struct netlbl_af6list *addr6; hdr6 = ipv6_hdr(skb); - addr6 = netlbl_unlhsh_search_addr6(&hdr6->saddr, iface); + addr6 = netlbl_af6list_search(&hdr6->saddr, + &iface->addr6_list); if (addr6 == NULL) goto unlabel_getattr_nolabel; - secattr->attr.secid = addr6->secid; + secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid; break; } #endif /* IPv6 */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b0eacc0007c..480184a857d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1,7 +1,7 @@ /* * NETLINK Kernel-user communication protocol. * - * Authors: Alan Cox <alan@redhat.com> + * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or @@ -435,7 +435,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol) return -EPROTONOSUPPORT; netlink_lock_table(); -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES if (!nl_table[protocol].registered) { netlink_unlock_table(); request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 9e9c6fce11a..b9d97effebe 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -67,11 +67,10 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol) } pnp = phonet_proto_get(protocol); -#ifdef CONFIG_KMOD if (pnp == NULL && request_module("net-pf-%d-proto-%d", PF_PHONET, protocol) == 0) pnp = phonet_proto_get(protocol); -#endif + if (pnp == NULL) return -EPROTONOSUPPORT; if (sock->type != pnp->sock_type) { diff --git a/net/rfkill/rfkill-input.c b/net/rfkill/rfkill-input.c index e5b69556bb5..21124ec0a73 100644 --- a/net/rfkill/rfkill-input.c +++ b/net/rfkill/rfkill-input.c @@ -16,6 +16,7 @@ #include <linux/workqueue.h> #include <linux/init.h> #include <linux/rfkill.h> +#include <linux/sched.h> #include "rfkill-input.h" diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9974b3f04f0..8f457f1e0ac 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -494,7 +494,7 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES rtnl_unlock(); request_module("act_%s", act_name); rtnl_lock(); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8eb79e92e94..16e7ac9774e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -227,7 +227,7 @@ replay: err = -ENOENT; tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]); if (tp_ops == NULL) { -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES struct nlattr *kind = tca[TCA_KIND]; char name[IFNAMSIZ]; diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 5e6f82e0e6f..e82519e548d 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -224,7 +224,7 @@ static int tcf_em_validate(struct tcf_proto *tp, if (em->ops == NULL) { err = -ENOENT; -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES __rtnl_unlock(); request_module("ematch-kind-%u", em_hdr->kind); rtnl_lock(); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1122c952aa9..b16ad2972c6 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -764,7 +764,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, struct qdisc_size_table *stab; ops = qdisc_lookup_ops(kind); -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES if (ops == NULL && kind != NULL) { char name[IFNAMSIZ]; if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { diff --git a/net/socket.c b/net/socket.c index 3e8d4e35c08..2b7a4b5c9b7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1142,7 +1142,7 @@ static int __sock_create(struct net *net, int family, int type, int protocol, sock->type = type; -#if defined(CONFIG_KMOD) +#ifdef CONFIG_MODULES /* Attempt to load a protocol module if the find failed. * * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 6bfea9ed686..436bf1b4b76 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -83,10 +83,8 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt) if (flavor >= RPC_AUTH_MAXFLAVOR) goto out; -#ifdef CONFIG_KMOD if ((ops = auth_flavors[flavor]) == NULL) request_module("rpc-auth-%u", flavor); -#endif spin_lock(&rpc_authflavor_lock); ops = auth_flavors[flavor]; if (ops == NULL || !try_module_get(ops->owner)) { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 76739e928d0..4895c341e46 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -174,7 +174,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru clnt->cl_procinfo = version->procs; clnt->cl_maxproc = version->nrprocs; clnt->cl_protname = program->name; - clnt->cl_prog = program->number; + clnt->cl_prog = args->prognumber ? : program->number; clnt->cl_vers = version->number; clnt->cl_stats = program->stats; clnt->cl_metrics = rpc_alloc_iostats(clnt); @@ -213,10 +213,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru } /* save the nodename */ - clnt->cl_nodelen = strlen(utsname()->nodename); + clnt->cl_nodelen = strlen(init_utsname()->nodename); if (clnt->cl_nodelen > UNX_MAXNODENAME) clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen); + memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen); rpc_register_client(clnt); return clnt; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 24db2b4d12d..41013dd66ac 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -20,6 +20,7 @@ #include <linux/in6.h> #include <linux/kernel.h> #include <linux/errno.h> +#include <net/ipv6.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> @@ -176,13 +177,12 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, } static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, - u32 version, struct rpc_message *msg, - int *result) + u32 version, struct rpc_message *msg) { struct rpc_clnt *rpcb_clnt; - int error = 0; + int result, error = 0; - *result = 0; + msg->rpc_resp = &result; rpcb_clnt = rpcb_create_local(addr, addrlen, version); if (!IS_ERR(rpcb_clnt)) { @@ -191,12 +191,15 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, } else error = PTR_ERR(rpcb_clnt); - if (error < 0) + if (error < 0) { printk(KERN_WARNING "RPC: failed to contact local rpcbind " "server (errno %d).\n", -error); - dprintk("RPC: registration status %d/%d\n", error, *result); + return error; + } - return error; + if (!result) + return -EACCES; + return 0; } /** @@ -205,7 +208,11 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, * @vers: RPC version number to bind * @prot: transport protocol to register * @port: port value to register - * @okay: OUT: result code + * + * Returns zero if the registration request was dispatched successfully + * and the rpcbind daemon returned success. Otherwise, returns an errno + * value that reflects the nature of the error (request could not be + * dispatched, timed out, or rpcbind returned an error). * * RPC services invoke this function to advertise their contact * information via the system's rpcbind daemon. RPC services @@ -217,15 +224,6 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, * all registered transports for [program, version] from the local * rpcbind database. * - * Returns zero if the registration request was dispatched - * successfully and a reply was received. The rpcbind daemon's - * boolean result code is stored in *okay. - * - * Returns an errno value and sets *result to zero if there was - * some problem that prevented the rpcbind request from being - * dispatched, or if the rpcbind daemon did not respond within - * the timeout. - * * This function uses rpcbind protocol version 2 to contact the * local rpcbind daemon. * @@ -236,7 +234,7 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, * IN6ADDR_ANY (ie available for all AF_INET and AF_INET6 * addresses). */ -int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) +int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) { struct rpcbind_args map = { .r_prog = prog, @@ -246,7 +244,6 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) }; struct rpc_message msg = { .rpc_argp = &map, - .rpc_resp = okay, }; dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " @@ -259,7 +256,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, sizeof(rpcb_inaddr_loopback), - RPCBVERS_2, &msg, okay); + RPCBVERS_2, &msg); } /* @@ -290,7 +287,7 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register, return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, sizeof(rpcb_inaddr_loopback), - RPCBVERS_4, msg, msg->rpc_resp); + RPCBVERS_4, msg); } /* @@ -304,10 +301,13 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, char buf[64]; /* Construct AF_INET6 universal address */ - snprintf(buf, sizeof(buf), - NIP6_FMT".%u.%u", - NIP6(address_to_register->sin6_addr), - port >> 8, port & 0xff); + if (ipv6_addr_any(&address_to_register->sin6_addr)) + snprintf(buf, sizeof(buf), "::.%u.%u", + port >> 8, port & 0xff); + else + snprintf(buf, sizeof(buf), NIP6_FMT".%u.%u", + NIP6(address_to_register->sin6_addr), + port >> 8, port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -321,7 +321,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback, sizeof(rpcb_in6addr_loopback), - RPCBVERS_4, msg, msg->rpc_resp); + RPCBVERS_4, msg); } /** @@ -330,7 +330,11 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, * @version: RPC version number of service to (un)register * @address: address family, IP address, and port to (un)register * @netid: netid of transport protocol to (un)register - * @result: result code from rpcbind RPC call + * + * Returns zero if the registration request was dispatched successfully + * and the rpcbind daemon returned success. Otherwise, returns an errno + * value that reflects the nature of the error (request could not be + * dispatched, timed out, or rpcbind returned an error). * * RPC services invoke this function to advertise their contact * information via the system's rpcbind daemon. RPC services @@ -342,15 +346,6 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, * to zero. Callers pass a netid of "" to unregister all * transport netids associated with [program, version, address]. * - * Returns zero if the registration request was dispatched - * successfully and a reply was received. The rpcbind daemon's - * result code is stored in *result. - * - * Returns an errno value and sets *result to zero if there was - * some problem that prevented the rpcbind request from being - * dispatched, or if the rpcbind daemon did not respond within - * the timeout. - * * This function uses rpcbind protocol version 4 to contact the * local rpcbind daemon. The local rpcbind daemon must support * version 4 of the rpcbind protocol in order for these functions @@ -372,8 +367,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, * advertises the service on all IPv4 and IPv6 addresses. */ int rpcb_v4_register(const u32 program, const u32 version, - const struct sockaddr *address, const char *netid, - int *result) + const struct sockaddr *address, const char *netid) { struct rpcbind_args map = { .r_prog = program, @@ -383,11 +377,8 @@ int rpcb_v4_register(const u32 program, const u32 version, }; struct rpc_message msg = { .rpc_argp = &map, - .rpc_resp = result, }; - *result = 0; - switch (address->sa_family) { case AF_INET: return rpcb_register_netid4((struct sockaddr_in *)address, @@ -469,6 +460,28 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi return rpc_run_task(&task_setup_data); } +/* + * In the case where rpc clients have been cloned, we want to make + * sure that we use the program number/version etc of the actual + * owner of the xprt. To do so, we walk back up the tree of parents + * to find whoever created the transport and/or whoever has the + * autobind flag set. + */ +static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt) +{ + struct rpc_clnt *parent = clnt->cl_parent; + + while (parent != clnt) { + if (parent->cl_xprt != clnt->cl_xprt) + break; + if (clnt->cl_autobind) + break; + clnt = parent; + parent = parent->cl_parent; + } + return clnt; +} + /** * rpcb_getport_async - obtain the port for a given RPC service on a given host * @task: task that is waiting for portmapper request @@ -478,10 +491,10 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi */ void rpcb_getport_async(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; + struct rpc_clnt *clnt; struct rpc_procinfo *proc; u32 bind_version; - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt; struct rpc_clnt *rpcb_clnt; static struct rpcbind_args *map; struct rpc_task *child; @@ -490,13 +503,13 @@ void rpcb_getport_async(struct rpc_task *task) size_t salen; int status; + clnt = rpcb_find_transport_owner(task->tk_client); + xprt = clnt->cl_xprt; + dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", task->tk_pid, __func__, clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot); - /* Autobind on cloned rpc clients is discouraged */ - BUG_ON(clnt->cl_parent != clnt); - /* Put self on the wait queue to ensure we get notified if * some other task is already attempting to bind the port */ rpc_sleep_on(&xprt->binding, task, NULL); @@ -558,7 +571,7 @@ void rpcb_getport_async(struct rpc_task *task) status = -ENOMEM; dprintk("RPC: %5u %s: no memory available\n", task->tk_pid, __func__); - goto bailout_nofree; + goto bailout_release_client; } map->r_prog = clnt->cl_prog; map->r_vers = clnt->cl_vers; @@ -578,11 +591,13 @@ void rpcb_getport_async(struct rpc_task *task) task->tk_pid, __func__); return; } - rpc_put_task(child); - task->tk_xprt->stat.bind_count++; + xprt->stat.bind_count++; + rpc_put_task(child); return; +bailout_release_client: + rpc_release_client(rpcb_clnt); bailout_nofree: rpcb_wake_rpcbind_waiters(xprt, status); task->tk_status = status; @@ -633,7 +648,7 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p, struct rpcbind_args *rpcb) { - dprintk("RPC: rpcb_encode_mapping(%u, %u, %d, %u)\n", + dprintk("RPC: encoding rpcb request (%u, %u, %d, %u)\n", rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port); *p++ = htonl(rpcb->r_prog); *p++ = htonl(rpcb->r_vers); @@ -648,7 +663,7 @@ static int rpcb_decode_getport(struct rpc_rqst *req, __be32 *p, unsigned short *portp) { *portp = (unsigned short) ntohl(*p++); - dprintk("RPC: rpcb_decode_getport result %u\n", + dprintk("RPC: rpcb getport result: %u\n", *portp); return 0; } @@ -657,7 +672,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p, unsigned int *boolp) { *boolp = (unsigned int) ntohl(*p++); - dprintk("RPC: rpcb_decode_set: call %s\n", + dprintk("RPC: rpcb set/unset call %s\n", (*boolp ? "succeeded" : "failed")); return 0; } @@ -665,7 +680,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p, static int rpcb_encode_getaddr(struct rpc_rqst *req, __be32 *p, struct rpcbind_args *rpcb) { - dprintk("RPC: rpcb_encode_getaddr(%u, %u, %s)\n", + dprintk("RPC: encoding rpcb request (%u, %u, %s)\n", rpcb->r_prog, rpcb->r_vers, rpcb->r_addr); *p++ = htonl(rpcb->r_prog); *p++ = htonl(rpcb->r_vers); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5a32cb7c4bb..54c98d87684 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -28,6 +28,8 @@ #define RPCDBG_FACILITY RPCDBG_SVCDSP +static void svc_unregister(const struct svc_serv *serv); + #define svc_serv_is_pooled(serv) ((serv)->sv_function) /* @@ -357,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - void (*shutdown)(struct svc_serv *serv)) + sa_family_t family, void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; unsigned int vers; @@ -366,6 +368,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; + serv->sv_family = family; serv->sv_name = prog->pg_name; serv->sv_program = prog; serv->sv_nrthreads = 1; @@ -416,30 +419,29 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, spin_lock_init(&pool->sp_lock); } - /* Remove any stale portmap registrations */ - svc_register(serv, 0, 0); + svc_unregister(serv); return serv; } struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv)) + sa_family_t family, void (*shutdown)(struct svc_serv *serv)) { - return __svc_create(prog, bufsize, /*npools*/1, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, family, shutdown); } EXPORT_SYMBOL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv), + sa_family_t family, void (*shutdown)(struct svc_serv *serv), svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, shutdown); + serv = __svc_create(prog, bufsize, npools, family, shutdown); if (serv != NULL) { serv->sv_function = func; @@ -486,8 +488,7 @@ svc_destroy(struct svc_serv *serv) if (svc_serv_is_pooled(serv)) svc_pool_map_put(); - /* Unregister service with the portmapper */ - svc_register(serv, 0, 0); + svc_unregister(serv); kfree(serv->sv_pools); kfree(serv); } @@ -718,55 +719,245 @@ svc_exit_thread(struct svc_rqst *rqstp) } EXPORT_SYMBOL(svc_exit_thread); +#ifdef CONFIG_SUNRPC_REGISTER_V4 + /* - * Register an RPC service with the local portmapper. - * To unregister a service, call this routine with - * proto and port == 0. + * Register an "inet" protocol family netid with the local + * rpcbind daemon via an rpcbind v4 SET request. + * + * No netconfig infrastructure is available in the kernel, so + * we map IP_ protocol numbers to netids by hand. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. */ -int -svc_register(struct svc_serv *serv, int proto, unsigned short port) +static int __svc_rpcb_register4(const u32 program, const u32 version, + const unsigned short protocol, + const unsigned short port) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + .sin_port = htons(port), + }; + char *netid; + + switch (protocol) { + case IPPROTO_UDP: + netid = RPCBIND_NETID_UDP; + break; + case IPPROTO_TCP: + netid = RPCBIND_NETID_TCP; + break; + default: + return -EPROTONOSUPPORT; + } + + return rpcb_v4_register(program, version, + (struct sockaddr *)&sin, netid); +} + +/* + * Register an "inet6" protocol family netid with the local + * rpcbind daemon via an rpcbind v4 SET request. + * + * No netconfig infrastructure is available in the kernel, so + * we map IP_ protocol numbers to netids by hand. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. + */ +static int __svc_rpcb_register6(const u32 program, const u32 version, + const unsigned short protocol, + const unsigned short port) +{ + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; + char *netid; + + switch (protocol) { + case IPPROTO_UDP: + netid = RPCBIND_NETID_UDP6; + break; + case IPPROTO_TCP: + netid = RPCBIND_NETID_TCP6; + break; + default: + return -EPROTONOSUPPORT; + } + + return rpcb_v4_register(program, version, + (struct sockaddr *)&sin6, netid); +} + +/* + * Register a kernel RPC service via rpcbind version 4. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. + */ +static int __svc_register(const u32 program, const u32 version, + const sa_family_t family, + const unsigned short protocol, + const unsigned short port) +{ + int error; + + switch (family) { + case AF_INET: + return __svc_rpcb_register4(program, version, + protocol, port); + case AF_INET6: + error = __svc_rpcb_register6(program, version, + protocol, port); + if (error < 0) + return error; + + /* + * Work around bug in some versions of Linux rpcbind + * which don't allow registration of both inet and + * inet6 netids. + * + * Error return ignored for now. + */ + __svc_rpcb_register4(program, version, + protocol, port); + return 0; + } + + return -EAFNOSUPPORT; +} + +#else /* CONFIG_SUNRPC_REGISTER_V4 */ + +/* + * Register a kernel RPC service via rpcbind version 2. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. + */ +static int __svc_register(const u32 program, const u32 version, + sa_family_t family, + const unsigned short protocol, + const unsigned short port) +{ + if (family != AF_INET) + return -EAFNOSUPPORT; + + return rpcb_register(program, version, protocol, port); +} + +#endif /* CONFIG_SUNRPC_REGISTER_V4 */ + +/** + * svc_register - register an RPC service with the local portmapper + * @serv: svc_serv struct for the service to register + * @proto: transport protocol number to advertise + * @port: port to advertise + * + * Service is registered for any address in serv's address family + */ +int svc_register(const struct svc_serv *serv, const unsigned short proto, + const unsigned short port) { struct svc_program *progp; - unsigned long flags; unsigned int i; - int error = 0, dummy; + int error = 0; - if (!port) - clear_thread_flag(TIF_SIGPENDING); + BUG_ON(proto == 0 && port == 0); for (progp = serv->sv_program; progp; progp = progp->pg_next) { for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; - dprintk("svc: svc_register(%s, %s, %d, %d)%s\n", + dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n", progp->pg_name, + i, proto == IPPROTO_UDP? "udp" : "tcp", port, - i, + serv->sv_family, progp->pg_vers[i]->vs_hidden? " (but not telling portmap)" : ""); if (progp->pg_vers[i]->vs_hidden) continue; - error = rpcb_register(progp->pg_prog, i, proto, port, &dummy); + error = __svc_register(progp->pg_prog, i, + serv->sv_family, proto, port); if (error < 0) break; - if (port && !dummy) { - error = -EACCES; - break; - } } } - if (!port) { - spin_lock_irqsave(¤t->sighand->siglock, flags); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return error; +} + +#ifdef CONFIG_SUNRPC_REGISTER_V4 + +static void __svc_unregister(const u32 program, const u32 version, + const char *progname) +{ + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = 0, + }; + int error; + + error = rpcb_v4_register(program, version, + (struct sockaddr *)&sin6, ""); + dprintk("svc: %s(%sv%u), error %d\n", + __func__, progname, version, error); +} + +#else /* CONFIG_SUNRPC_REGISTER_V4 */ + +static void __svc_unregister(const u32 program, const u32 version, + const char *progname) +{ + int error; + + error = rpcb_register(program, version, 0, 0); + dprintk("svc: %s(%sv%u), error %d\n", + __func__, progname, version, error); +} + +#endif /* CONFIG_SUNRPC_REGISTER_V4 */ + +/* + * All netids, bind addresses and ports registered for [program, version] + * are removed from the local rpcbind database (if the service is not + * hidden) to make way for a new instance of the service. + * + * The result of unregistration is reported via dprintk for those who want + * verification of the result, but is otherwise not important. + */ +static void svc_unregister(const struct svc_serv *serv) +{ + struct svc_program *progp; + unsigned long flags; + unsigned int i; + + clear_thread_flag(TIF_SIGPENDING); + + for (progp = serv->sv_program; progp; progp = progp->pg_next) { + for (i = 0; i < progp->pg_nvers; i++) { + if (progp->pg_vers[i] == NULL) + continue; + if (progp->pg_vers[i]->vs_hidden) + continue; + + __svc_unregister(progp->pg_prog, i, progp->pg_name); + } } - return error; + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); } /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e46c825f495..bf5b5cdafeb 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -159,15 +159,44 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, } EXPORT_SYMBOL_GPL(svc_xprt_init); -int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, - int flags) +static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, + struct svc_serv *serv, + unsigned short port, int flags) { - struct svc_xprt_class *xcl; struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; + struct sockaddr *sap; + size_t len; + + switch (serv->sv_family) { + case AF_INET: + sap = (struct sockaddr *)&sin; + len = sizeof(sin); + break; + case AF_INET6: + sap = (struct sockaddr *)&sin6; + len = sizeof(sin6); + break; + default: + return ERR_PTR(-EAFNOSUPPORT); + } + + return xcl->xcl_ops->xpo_create(serv, sap, len, flags); +} + +int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, + int flags) +{ + struct svc_xprt_class *xcl; + dprintk("svc: creating transport %s[%d]\n", xprt_name, port); spin_lock(&svc_xprt_class_lock); list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { @@ -180,9 +209,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = xcl->xcl_ops-> - xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin), - flags); + newxprt = __svc_xpo_create(xcl, serv, port, flags); if (IS_ERR(newxprt)) { module_put(xcl->xcl_owner); return PTR_ERR(newxprt); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 3e65719f1ef..95293f549e9 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1114,6 +1114,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int val; dprintk("svc: svc_setup_socket %p\n", sock); if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { @@ -1146,6 +1147,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); + /* + * We start one listener per sv_serv. We want AF_INET + * requests to be automatically shunted to our AF_INET6 + * listener using a mapped IPv4 address. Make sure + * no-one starts an equivalent IPv4 listener, which + * would steal our incoming connections. + */ + val = 0; + if (serv->sv_family == AF_INET6) + kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, + (char *)&val, sizeof(val)); + dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); @@ -1154,8 +1167,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, int svc_addsock(struct svc_serv *serv, int fd, - char *name_return, - int *proto) + char *name_return) { int err = 0; struct socket *so = sockfd_lookup(fd, &err); @@ -1190,7 +1202,6 @@ int svc_addsock(struct svc_serv *serv, sockfd_put(so); return err; } - if (proto) *proto = so->sk->sk_protocol; return one_sock_name(name_return, svsk); } EXPORT_SYMBOL_GPL(svc_addsock); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 99a52aabe33..29e401bb612 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -108,13 +108,10 @@ int xprt_register_transport(struct xprt_class *transport) goto out; } - result = -EINVAL; - if (try_module_get(THIS_MODULE)) { - list_add_tail(&transport->list, &xprt_list); - printk(KERN_INFO "RPC: Registered %s transport module.\n", - transport->name); - result = 0; - } + list_add_tail(&transport->list, &xprt_list); + printk(KERN_INFO "RPC: Registered %s transport module.\n", + transport->name); + result = 0; out: spin_unlock(&xprt_list_lock); @@ -143,7 +140,6 @@ int xprt_unregister_transport(struct xprt_class *transport) "RPC: Unregistered %s transport module.\n", transport->name); list_del_init(&transport->list); - module_put(THIS_MODULE); goto out; } } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 5c1954d28d0..14106d26bb9 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -118,6 +118,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, } if (xdrbuf->tail[0].iov_len) { + /* the rpcrdma protocol allows us to omit any trailing + * xdr pad bytes, saving the server an RDMA operation. */ + if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) + return n; if (n == nsegs) return 0; seg[n].mr_page = NULL; @@ -508,8 +512,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (hdrlen == 0) return -1; - dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd\n" - " headerp 0x%p base 0x%p lkey 0x%x\n", + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + " headerp 0x%p base 0x%p lkey 0x%x\n", __func__, transfertypes[wtype], hdrlen, rpclen, padlen, headerp, base, req->rl_iov.lkey); @@ -594,7 +598,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b * Scatter inline received data back into provided iov's. */ static void -rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) +rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) { int i, npages, curlen, olen; char *destp; @@ -660,6 +664,13 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) } else rqst->rq_rcv_buf.tail[0].iov_len = 0; + if (pad) { + /* implicit padding on terminal chunk */ + unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; + while (pad--) + p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; + } + if (copy_len) dprintk("RPC: %s: %d bytes in" " %d extra segments (%d lost)\n", @@ -681,12 +692,14 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) struct rpc_xprt *xprt = ep->rep_xprt; spin_lock_bh(&xprt->transport_lock); + if (++xprt->connect_cookie == 0) /* maintain a reserved value */ + ++xprt->connect_cookie; if (ep->rep_connected > 0) { if (!xprt_test_and_set_connected(xprt)) xprt_wake_pending_tasks(xprt, 0); } else { if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, ep->rep_connected); + xprt_wake_pending_tasks(xprt, -ENOTCONN); } spin_unlock_bh(&xprt->transport_lock); } @@ -792,14 +805,20 @@ repost: ((unsigned char *)iptr - (unsigned char *)headerp); status = rep->rr_len + rdmalen; r_xprt->rx_stats.total_rdma_reply += rdmalen; + /* special case - last chunk may omit padding */ + if (rdmalen &= 3) { + rdmalen = 4 - rdmalen; + status += rdmalen; + } } else { /* else ordinary inline */ + rdmalen = 0; iptr = (__be32 *)((unsigned char *)headerp + 28); rep->rr_len -= 28; /*sizeof *headerp;*/ status = rep->rr_len; } /* Fix up the rpc results for upper layer */ - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len); + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); break; case htonl(RDMA_NOMSG): diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 74de31a0661..a4756576d68 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -116,7 +116,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, * * Assumptions: * - chunk[0]->position points to pages[0] at an offset of 0 - * - pages[] is not physically or virtually contigous and consists of + * - pages[] is not physically or virtually contiguous and consists of * PAGE_SIZE elements. * * Output: @@ -125,7 +125,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, * chunk in the read list * */ -static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, +static int map_read_chunks(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, struct rpcrdma_msg *rmsgp, @@ -211,26 +211,128 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, return sge_no; } -static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, - struct svc_rdma_op_ctxt *ctxt, - struct kvec *vec, - u64 *sgl_offset, - int count) +/* Map a read-chunk-list to an XDR and fast register the page-list. + * + * Assumptions: + * - chunk[0] position points to pages[0] at an offset of 0 + * - pages[] will be made physically contiguous by creating a one-off memory + * region using the fastreg verb. + * - byte_count is # of bytes in read-chunk-list + * - ch_count is # of chunks in read-chunk-list + * + * Output: + * - sge array pointing into pages[] array. + * - chunk_sge array specifying sge index and count for each + * chunk in the read list + */ +static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + struct rpcrdma_msg *rmsgp, + struct svc_rdma_req_map *rpl_map, + struct svc_rdma_req_map *chl_map, + int ch_count, + int byte_count) +{ + int page_no; + int ch_no; + u32 offset; + struct rpcrdma_read_chunk *ch; + struct svc_rdma_fastreg_mr *frmr; + int ret = 0; + + frmr = svc_rdma_get_frmr(xprt); + if (IS_ERR(frmr)) + return -ENOMEM; + + head->frmr = frmr; + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->hdr_count = head->count; /* save count of hdr pages */ + head->arg.page_base = 0; + head->arg.page_len = byte_count; + head->arg.len = rqstp->rq_arg.len + byte_count; + head->arg.buflen = rqstp->rq_arg.buflen + byte_count; + + /* Fast register the page list */ + frmr->kva = page_address(rqstp->rq_arg.pages[0]); + frmr->direction = DMA_FROM_DEVICE; + frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); + frmr->map_len = byte_count; + frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { + frmr->page_list->page_list[page_no] = + ib_dma_map_single(xprt->sc_cm_id->device, + page_address(rqstp->rq_arg.pages[page_no]), + PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; + } + head->count += page_no; + + /* rq_respages points one past arg pages */ + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; + + /* Create the reply and chunk maps */ + offset = 0; + ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + for (ch_no = 0; ch_no < ch_count; ch_no++) { + rpl_map->sge[ch_no].iov_base = frmr->kva + offset; + rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length; + chl_map->ch[ch_no].count = 1; + chl_map->ch[ch_no].start = ch_no; + offset += ch->rc_target.rs_length; + ch++; + } + + ret = svc_rdma_fastreg(xprt, frmr); + if (ret) + goto fatal_err; + + return ch_no; + + fatal_err: + printk("svcrdma: error fast registering xdr for xprt %p", xprt); + svc_rdma_put_frmr(xprt, frmr); + return -EIO; +} + +static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_fastreg_mr *frmr, + struct kvec *vec, + u64 *sgl_offset, + int count) { int i; ctxt->count = count; ctxt->direction = DMA_FROM_DEVICE; for (i = 0; i < count; i++) { - atomic_inc(&xprt->sc_dma_used); - ctxt->sge[i].addr = - ib_dma_map_single(xprt->sc_cm_id->device, - vec[i].iov_base, vec[i].iov_len, - DMA_FROM_DEVICE); + ctxt->sge[i].length = 0; /* in case map fails */ + if (!frmr) { + ctxt->sge[i].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + vec[i].iov_base, + vec[i].iov_len, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + ctxt->sge[i].addr)) + return -EINVAL; + ctxt->sge[i].lkey = xprt->sc_dma_lkey; + atomic_inc(&xprt->sc_dma_used); + } else { + ctxt->sge[i].addr = (unsigned long)vec[i].iov_base; + ctxt->sge[i].lkey = frmr->mr->lkey; + } ctxt->sge[i].length = vec[i].iov_len; - ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey; *sgl_offset = *sgl_offset + vec[i].iov_len; } + return 0; } static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) @@ -278,6 +380,7 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, struct svc_rdma_op_ctxt *hdr_ctxt) { struct ib_send_wr read_wr; + struct ib_send_wr inv_wr; int err = 0; int ch_no; int ch_count; @@ -301,9 +404,20 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); if (ch_count > RPCSVC_MAXPAGES) return -EINVAL; - sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, - rpl_map, chl_map, - ch_count, byte_count); + + if (!xprt->sc_frmr_pg_list_len) + sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, + rpl_map, chl_map, ch_count, + byte_count); + else + sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, + rpl_map, chl_map, ch_count, + byte_count); + if (sge_count < 0) { + err = -EIO; + goto out; + } + sgl_offset = 0; ch_no = 0; @@ -312,13 +426,16 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, next_sge: ctxt = svc_rdma_get_context(xprt); ctxt->direction = DMA_FROM_DEVICE; + ctxt->frmr = hdr_ctxt->frmr; + ctxt->read_hdr = NULL; clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); /* Prepare READ WR */ memset(&read_wr, 0, sizeof read_wr); - ctxt->wr_op = IB_WR_RDMA_READ; read_wr.wr_id = (unsigned long)ctxt; read_wr.opcode = IB_WR_RDMA_READ; + ctxt->wr_op = read_wr.opcode; read_wr.send_flags = IB_SEND_SIGNALED; read_wr.wr.rdma.rkey = ch->rc_target.rs_handle; read_wr.wr.rdma.remote_addr = @@ -327,10 +444,15 @@ next_sge: read_wr.sg_list = ctxt->sge; read_wr.num_sge = rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); - rdma_set_ctxt_sge(xprt, ctxt, - &rpl_map->sge[chl_map->ch[ch_no].start], - &sgl_offset, - read_wr.num_sge); + err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr, + &rpl_map->sge[chl_map->ch[ch_no].start], + &sgl_offset, + read_wr.num_sge); + if (err) { + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + goto out; + } if (((ch+1)->rc_discrim == 0) && (read_wr.num_sge == chl_map->ch[ch_no].count)) { /* @@ -339,6 +461,29 @@ next_sge: * the client and the RPC needs to be enqueued. */ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + if (hdr_ctxt->frmr) { + set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + /* + * Invalidate the local MR used to map the data + * sink. + */ + if (xprt->sc_dev_caps & + SVCRDMA_DEVCAP_READ_W_INV) { + read_wr.opcode = + IB_WR_RDMA_READ_WITH_INV; + ctxt->wr_op = read_wr.opcode; + read_wr.ex.invalidate_rkey = + ctxt->frmr->mr->lkey; + } else { + /* Prepare INVALIDATE WR */ + memset(&inv_wr, 0, sizeof inv_wr); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED; + inv_wr.ex.invalidate_rkey = + hdr_ctxt->frmr->mr->lkey; + read_wr.next = &inv_wr; + } + } ctxt->read_hdr = hdr_ctxt; } /* Post the read */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 84d328329d9..9a7a8e7ae03 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -69,9 +69,127 @@ * array is only concerned with the reply we are assured that we have * on extra page for the RPCRMDA header. */ -static void xdr_to_sge(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) +int fast_reg_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) +{ + int sge_no; + u32 sge_bytes; + u32 page_bytes; + u32 page_off; + int page_no = 0; + u8 *frva; + struct svc_rdma_fastreg_mr *frmr; + + frmr = svc_rdma_get_frmr(xprt); + if (IS_ERR(frmr)) + return -ENOMEM; + vec->frmr = frmr; + + /* Skip the RPCRDMA header */ + sge_no = 1; + + /* Map the head. */ + frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK); + vec->sge[sge_no].iov_base = xdr->head[0].iov_base; + vec->sge[sge_no].iov_len = xdr->head[0].iov_len; + vec->count = 2; + sge_no++; + + /* Build the FRMR */ + frmr->kva = frva; + frmr->direction = DMA_TO_DEVICE; + frmr->access_flags = 0; + frmr->map_len = PAGE_SIZE; + frmr->page_list_len = 1; + frmr->page_list->page_list[page_no] = + ib_dma_map_single(xprt->sc_cm_id->device, + (void *)xdr->head[0].iov_base, + PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + + page_off = xdr->page_base; + page_bytes = xdr->page_len + page_off; + if (!page_bytes) + goto encode_tail; + + /* Map the pages */ + vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; + vec->sge[sge_no].iov_len = page_bytes; + sge_no++; + while (page_bytes) { + struct page *page; + + page = xdr->pages[page_no++]; + sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); + page_bytes -= sge_bytes; + + frmr->page_list->page_list[page_no] = + ib_dma_map_page(xprt->sc_cm_id->device, page, 0, + PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + + atomic_inc(&xprt->sc_dma_used); + page_off = 0; /* reset for next time through loop */ + frmr->map_len += PAGE_SIZE; + frmr->page_list_len++; + } + vec->count++; + + encode_tail: + /* Map tail */ + if (0 == xdr->tail[0].iov_len) + goto done; + + vec->count++; + vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; + + if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) == + ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) { + /* + * If head and tail use the same page, we don't need + * to map it again. + */ + vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; + } else { + void *va; + + /* Map another page for the tail */ + page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; + va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK); + vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; + + frmr->page_list->page_list[page_no] = + ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + frmr->map_len += PAGE_SIZE; + frmr->page_list_len++; + } + + done: + if (svc_rdma_fastreg(xprt, frmr)) + goto fatal_err; + + return 0; + + fatal_err: + printk("svcrdma: Error fast registering memory for xprt %p\n", xprt); + svc_rdma_put_frmr(xprt, frmr); + return -EIO; +} + +static int map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; int sge_no; @@ -83,6 +201,9 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt, BUG_ON(xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); + if (xprt->sc_frmr_pg_list_len) + return fast_reg_xdr(xprt, xdr, vec); + /* Skip the first sge, this is for the RPCRDMA header */ sge_no = 1; @@ -116,9 +237,12 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt, BUG_ON(sge_no > sge_max); vec->count = sge_no; + return 0; } /* Assumptions: + * - We are using FRMR + * - or - * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, @@ -158,30 +282,35 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge_no = 0; /* Copy the remaining SGE */ - while (bc != 0 && xdr_sge_no < vec->count) { - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; - sge_bytes = min((size_t)bc, - (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off)); + while (bc != 0) { + sge_bytes = min_t(size_t, + bc, vec->sge[xdr_sge_no].iov_len-sge_off); sge[sge_no].length = sge_bytes; - atomic_inc(&xprt->sc_dma_used); - sge[sge_no].addr = - ib_dma_map_single(xprt->sc_cm_id->device, - (void *) - vec->sge[xdr_sge_no].iov_base + sge_off, - sge_bytes, DMA_TO_DEVICE); - if (dma_mapping_error(xprt->sc_cm_id->device->dma_device, - sge[sge_no].addr)) - goto err; + if (!vec->frmr) { + sge[sge_no].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + (void *) + vec->sge[xdr_sge_no].iov_base + sge_off, + sge_bytes, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + sge[sge_no].addr)) + goto err; + atomic_inc(&xprt->sc_dma_used); + sge[sge_no].lkey = xprt->sc_dma_lkey; + } else { + sge[sge_no].addr = (unsigned long) + vec->sge[xdr_sge_no].iov_base + sge_off; + sge[sge_no].lkey = vec->frmr->mr->lkey; + } + ctxt->count++; + ctxt->frmr = vec->frmr; sge_off = 0; sge_no++; - ctxt->count++; xdr_sge_no++; + BUG_ON(xdr_sge_no > vec->count); bc -= sge_bytes; } - BUG_ON(bc != 0); - BUG_ON(xdr_sge_no > vec->count); - /* Prepare WRITE WR */ memset(&write_wr, 0, sizeof write_wr); ctxt->wr_op = IB_WR_RDMA_WRITE; @@ -226,7 +355,10 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[1]; - max_write = xprt->sc_max_sge * PAGE_SIZE; + if (vec->frmr) + max_write = vec->frmr->map_len; + else + max_write = xprt->sc_max_sge * PAGE_SIZE; /* Write chunks start at the pagelist */ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; @@ -297,7 +429,10 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[2]; - max_write = xprt->sc_max_sge * PAGE_SIZE; + if (vec->frmr) + max_write = vec->frmr->map_len; + else + max_write = xprt->sc_max_sge * PAGE_SIZE; /* xdr offset starts at RPC message */ for (xdr_off = 0, chunk_no = 0; @@ -307,7 +442,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, ch = &arg_ary->wc_array[chunk_no].wc_target; write_len = min(xfer_len, ch->rs_length); - /* Prepare the reply chunk given the length actually * written */ rs_offset = get_unaligned(&(ch->rs_offset)); @@ -366,6 +500,7 @@ static int send_reply(struct svcxprt_rdma *rdma, int byte_count) { struct ib_send_wr send_wr; + struct ib_send_wr inv_wr; int sge_no; int sge_bytes; int page_no; @@ -385,27 +520,45 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the context */ ctxt->pages[0] = page; ctxt->count = 1; + ctxt->frmr = vec->frmr; + if (vec->frmr) + set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + else + clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); /* Prepare the SGE for the RPCRDMA Header */ - atomic_inc(&rdma->sc_dma_used); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->direction = DMA_TO_DEVICE; + ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); - ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; + ctxt->sge[0].lkey = rdma->sc_dma_lkey; /* Determine how many of our SGE are to be transmitted */ for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; - atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].addr = - ib_dma_map_single(rdma->sc_cm_id->device, - vec->sge[sge_no].iov_base, - sge_bytes, DMA_TO_DEVICE); + if (!vec->frmr) { + ctxt->sge[sge_no].addr = + ib_dma_map_single(rdma->sc_cm_id->device, + vec->sge[sge_no].iov_base, + sge_bytes, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, + ctxt->sge[sge_no].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + } else { + ctxt->sge[sge_no].addr = (unsigned long) + vec->sge[sge_no].iov_base; + ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey; + } ctxt->sge[sge_no].length = sge_bytes; - ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey; } BUG_ON(byte_count != 0); @@ -417,11 +570,16 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; ctxt->count++; rqstp->rq_respages[page_no] = NULL; - /* If there are more pages than SGE, terminate SGE list */ + /* + * If there are more pages than SGE, terminate SGE + * list so that svc_rdma_unmap_dma doesn't attempt to + * unmap garbage. + */ if (page_no+1 >= sge_no) ctxt->sge[page_no+1].length = 0; } BUG_ON(sge_no > rdma->sc_max_sge); + BUG_ON(sge_no > ctxt->count); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; send_wr.wr_id = (unsigned long)ctxt; @@ -429,12 +587,26 @@ static int send_reply(struct svcxprt_rdma *rdma, send_wr.num_sge = sge_no; send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; + if (vec->frmr) { + /* Prepare INVALIDATE WR */ + memset(&inv_wr, 0, sizeof inv_wr); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED; + inv_wr.ex.invalidate_rkey = + vec->frmr->mr->lkey; + send_wr.next = &inv_wr; + } ret = svc_rdma_send(rdma, &send_wr); if (ret) - svc_rdma_put_context(ctxt, 1); + goto err; - return ret; + return 0; + + err: + svc_rdma_put_frmr(rdma, vec->frmr); + svc_rdma_put_context(ctxt, 1); + return -EIO; } void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) @@ -477,8 +649,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; vec = svc_rdma_get_req_map(); - xdr_to_sge(rdma, &rqstp->rq_res, vec); - + ret = map_xdr(rdma, &rqstp->rq_res, vec); + if (ret) + goto err0; inline_bytes = rqstp->rq_res.len; /* Create the RDMA response header */ @@ -498,7 +671,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (ret < 0) { printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", ret); - goto error; + goto err1; } inline_bytes -= ret; @@ -508,7 +681,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (ret < 0) { printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", ret); - goto error; + goto err1; } inline_bytes -= ret; @@ -517,9 +690,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) svc_rdma_put_req_map(vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; - error: + + err1: + put_page(res_page); + err0: svc_rdma_put_req_map(vec); svc_rdma_put_context(ctxt, 0); - put_page(res_page); return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 900cb69728c..6fb493cbd29 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -100,20 +100,29 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) ctxt->xprt = xprt; INIT_LIST_HEAD(&ctxt->dto_q); ctxt->count = 0; + ctxt->frmr = NULL; atomic_inc(&xprt->sc_ctxt_used); return ctxt; } -static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) +void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) { struct svcxprt_rdma *xprt = ctxt->xprt; int i; for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { - atomic_dec(&xprt->sc_dma_used); - ib_dma_unmap_single(xprt->sc_cm_id->device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); + /* + * Unmap the DMA addr in the SGE if the lkey matches + * the sc_dma_lkey, otherwise, ignore it since it is + * an FRMR lkey and will be unmapped later when the + * last WR that uses it completes. + */ + if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_single(xprt->sc_cm_id->device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); + } } } @@ -150,6 +159,7 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void) schedule_timeout_uninterruptible(msecs_to_jiffies(500)); } map->count = 0; + map->frmr = NULL; return map; } @@ -316,6 +326,50 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) } /* + * Processs a completion context + */ +static void process_context(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt) +{ + svc_rdma_unmap_dma(ctxt); + + switch (ctxt->wr_op) { + case IB_WR_SEND: + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) + svc_rdma_put_frmr(xprt, ctxt->frmr); + svc_rdma_put_context(ctxt, 1); + break; + + case IB_WR_RDMA_WRITE: + svc_rdma_put_context(ctxt, 0); + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; + BUG_ON(!read_hdr); + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) + svc_rdma_put_frmr(xprt, ctxt->frmr); + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + } + svc_rdma_put_context(ctxt, 0); + break; + + default: + printk(KERN_ERR "svcrdma: unexpected completion type, " + "opcode=%d\n", + ctxt->wr_op); + break; + } +} + +/* * Send Queue Completion Handler - potentially called on interrupt context. * * Note that caller must hold a transport reference. @@ -327,17 +381,12 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) struct ib_cq *cq = xprt->sc_sq_cq; int ret; - if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) return; ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_sq_poll); while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; - xprt = ctxt->xprt; - - svc_rdma_unmap_dma(ctxt); if (wc.status != IB_WC_SUCCESS) /* Close the transport */ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); @@ -346,35 +395,10 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) atomic_dec(&xprt->sc_sq_count); wake_up(&xprt->sc_send_wait); - switch (ctxt->wr_op) { - case IB_WR_SEND: - svc_rdma_put_context(ctxt, 1); - break; - - case IB_WR_RDMA_WRITE: - svc_rdma_put_context(ctxt, 0); - break; - - case IB_WR_RDMA_READ: - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { - struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; - BUG_ON(!read_hdr); - spin_lock_bh(&xprt->sc_rq_dto_lock); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - list_add_tail(&read_hdr->dto_q, - &xprt->sc_read_complete_q); - spin_unlock_bh(&xprt->sc_rq_dto_lock); - svc_xprt_enqueue(&xprt->sc_xprt); - } - svc_rdma_put_context(ctxt, 0); - break; + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; + if (ctxt) + process_context(xprt, ctxt); - default: - printk(KERN_ERR "svcrdma: unexpected completion type, " - "opcode=%d, status=%d\n", - wc.opcode, wc.status); - break; - } svc_xprt_put(&xprt->sc_xprt); } @@ -425,10 +449,12 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); + INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); + spin_lock_init(&cma_xprt->sc_frmr_q_lock); cma_xprt->sc_ord = svcrdma_ord; @@ -462,7 +488,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) struct ib_recv_wr recv_wr, *bad_recv_wr; struct svc_rdma_op_ctxt *ctxt; struct page *page; - unsigned long pa; + dma_addr_t pa; int sge_no; int buflen; int ret; @@ -474,13 +500,15 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) BUG_ON(sge_no >= xprt->sc_max_sge); page = svc_rdma_get_page(); ctxt->pages[sge_no] = page; - atomic_inc(&xprt->sc_dma_used); pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) + goto err_put_ctxt; + atomic_inc(&xprt->sc_dma_used); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; buflen += PAGE_SIZE; } ctxt->count = sge_no; @@ -496,6 +524,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) svc_rdma_put_context(ctxt, 1); } return ret; + + err_put_ctxt: + svc_rdma_put_context(ctxt, 1); + return -ENOMEM; } /* @@ -566,7 +598,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " "event=%d\n", cma_id, cma_id->context, event->event); handle_connect_req(cma_id, - event->param.conn.responder_resources); + event->param.conn.initiator_depth); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -686,6 +718,97 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, return ERR_PTR(ret); } +static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) +{ + struct ib_mr *mr; + struct ib_fast_reg_page_list *pl; + struct svc_rdma_fastreg_mr *frmr; + + frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); + if (!frmr) + goto err; + + mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES); + if (!mr) + goto err_free_frmr; + + pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device, + RPCSVC_MAXPAGES); + if (!pl) + goto err_free_mr; + + frmr->mr = mr; + frmr->page_list = pl; + INIT_LIST_HEAD(&frmr->frmr_list); + return frmr; + + err_free_mr: + ib_dereg_mr(mr); + err_free_frmr: + kfree(frmr); + err: + return ERR_PTR(-ENOMEM); +} + +static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_fastreg_mr *frmr; + + while (!list_empty(&xprt->sc_frmr_q)) { + frmr = list_entry(xprt->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + ib_dereg_mr(frmr->mr); + ib_free_fast_reg_page_list(frmr->page_list); + kfree(frmr); + } +} + +struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_fastreg_mr *frmr = NULL; + + spin_lock_bh(&rdma->sc_frmr_q_lock); + if (!list_empty(&rdma->sc_frmr_q)) { + frmr = list_entry(rdma->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + frmr->map_len = 0; + frmr->page_list_len = 0; + } + spin_unlock_bh(&rdma->sc_frmr_q_lock); + if (frmr) + return frmr; + + return rdma_alloc_frmr(rdma); +} + +static void frmr_unmap_dma(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + int page_no; + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { + dma_addr_t addr = frmr->page_list->page_list[page_no]; + if (ib_dma_mapping_error(frmr->mr->device, addr)) + continue; + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE, + frmr->direction); + } +} + +void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, + struct svc_rdma_fastreg_mr *frmr) +{ + if (frmr) { + frmr_unmap_dma(rdma, frmr); + spin_lock_bh(&rdma->sc_frmr_q_lock); + BUG_ON(!list_empty(&frmr->frmr_list)); + list_add(&frmr->frmr_list, &rdma->sc_frmr_q); + spin_unlock_bh(&rdma->sc_frmr_q_lock); + } +} + /* * This is the xpo_recvfrom function for listening endpoints. Its * purpose is to accept incoming connections. The CMA callback handler @@ -704,6 +827,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; struct ib_device_attr devattr; + int dma_mr_acc; + int need_dma_mr; int ret; int i; @@ -819,15 +944,77 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) } newxprt->sc_qp = newxprt->sc_cm_id->qp; - /* Register all of physical memory */ - newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd, - IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE); - if (IS_ERR(newxprt->sc_phys_mr)) { - dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret); + /* + * Use the most secure set of MR resources based on the + * transport type and available memory management features in + * the device. Here's the table implemented below: + * + * Fast Global DMA Remote WR + * Reg LKEY MR Access + * Sup'd Sup'd Needed Needed + * + * IWARP N N Y Y + * N Y Y Y + * Y N Y N + * Y Y N - + * + * IB N N Y N + * N Y N - + * Y N Y N + * Y Y N - + * + * NB: iWARP requires remote write access for the data sink + * of an RDMA_READ. IB does not. + */ + if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + newxprt->sc_frmr_pg_list_len = + devattr.max_fast_reg_page_list_len; + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; + } + + /* + * Determine if a DMA MR is required and if so, what privs are required + */ + switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) { + case RDMA_TRANSPORT_IWARP: + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { + need_dma_mr = 1; + dma_mr_acc = + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); + } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + case RDMA_TRANSPORT_IB: + if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + default: goto errout; } + /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ + if (need_dma_mr) { + /* Register all of physical memory */ + newxprt->sc_phys_mr = + ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); + if (IS_ERR(newxprt->sc_phys_mr)) { + dprintk("svcrdma: Failed to create DMA MR ret=%d\n", + ret); + goto errout; + } + newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; + } else + newxprt->sc_dma_lkey = + newxprt->sc_cm_id->device->local_dma_lkey; + /* Post receive buffers */ for (i = 0; i < newxprt->sc_max_requests; i++) { ret = svc_rdma_post_recv(newxprt); @@ -961,6 +1148,9 @@ static void __svc_rdma_free(struct work_struct *work) WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); + /* De-allocate fastreg mr */ + rdma_dealloc_frmr_q(rdma); + /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_destroy_qp(rdma->sc_qp); @@ -1014,21 +1204,59 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) return 1; } +/* + * Attempt to register the kvec representing the RPC memory with the + * device. + * + * Returns: + * NULL : The device does not support fastreg or there were no more + * fastreg mr. + * frmr : The kvec register request was successfully posted. + * <0 : An error was encountered attempting to register the kvec. + */ +int svc_rdma_fastreg(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + struct ib_send_wr fastreg_wr; + u8 key; + + /* Bump the key */ + key = (u8)(frmr->mr->lkey & 0x000000FF); + ib_update_fast_reg_key(frmr->mr, ++key); + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof fastreg_wr); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.send_flags = IB_SEND_SIGNALED; + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; + fastreg_wr.wr.fast_reg.page_list = frmr->page_list; + fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = frmr->map_len; + fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; + fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; + return svc_rdma_send(xprt, &fastreg_wr); +} + int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) { - struct ib_send_wr *bad_wr; + struct ib_send_wr *bad_wr, *n_wr; + int wr_count; + int i; int ret; if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) return -ENOTCONN; BUG_ON(wr->send_flags != IB_SEND_SIGNALED); - BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != - wr->opcode); + wr_count = 1; + for (n_wr = wr->next; n_wr; n_wr = n_wr->next) + wr_count++; + /* If the SQ is full, wait until an SQ entry is available */ while (1) { spin_lock_bh(&xprt->sc_lock); - if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { + if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) { spin_unlock_bh(&xprt->sc_lock); atomic_inc(&rdma_stat_sq_starve); @@ -1043,19 +1271,26 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) return 0; continue; } - /* Bumped used SQ WR count and post */ - svc_xprt_get(&xprt->sc_xprt); + /* Take a transport ref for each WR posted */ + for (i = 0; i < wr_count; i++) + svc_xprt_get(&xprt->sc_xprt); + + /* Bump used SQ WR count and post */ + atomic_add(wr_count, &xprt->sc_sq_count); ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); - if (!ret) - atomic_inc(&xprt->sc_sq_count); - else { - svc_xprt_put(&xprt->sc_xprt); + if (ret) { + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + atomic_sub(wr_count, &xprt->sc_sq_count); + for (i = 0; i < wr_count; i ++) + svc_xprt_put(&xprt->sc_xprt); dprintk("svcrdma: failed to post SQ WR rc=%d, " "sc_sq_count=%d, sc_sq_depth=%d\n", ret, atomic_read(&xprt->sc_sq_count), xprt->sc_sq_depth); } spin_unlock_bh(&xprt->sc_lock); + if (ret) + wake_up(&xprt->sc_send_wait); break; } return ret; @@ -1079,10 +1314,14 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); /* Prepare SGE for local address */ - atomic_inc(&xprt->sc_dma_used); sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, p, 0, PAGE_SIZE, DMA_FROM_DEVICE); - sge.lkey = xprt->sc_phys_mr->lkey; + if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) { + put_page(p); + return; + } + atomic_inc(&xprt->sc_dma_used); + sge.lkey = xprt->sc_dma_lkey; sge.length = length; ctxt = svc_rdma_get_context(xprt); @@ -1103,6 +1342,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, if (ret) { dprintk("svcrdma: Error %d posting send for protocol error\n", ret); + ib_dma_unmap_page(xprt->sc_cm_id->device, + sge.addr, PAGE_SIZE, + DMA_FROM_DEVICE); svc_rdma_put_context(ctxt, 1); } } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index a564c1a39ec..9839c3d9414 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -70,11 +70,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; -#if !RPCRDMA_PERSISTENT_REGISTRATION -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */ -#else -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL; -#endif +static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; + int xprt_rdma_pad_optimize = 0; #ifdef RPC_DEBUG @@ -140,6 +137,14 @@ static ctl_table xr_tunables_table[] = { .extra2 = &max_memreg, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "rdma_pad_optimize", + .data = &xprt_rdma_pad_optimize, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0, }, }; @@ -458,6 +463,8 @@ xprt_rdma_close(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); dprintk("RPC: %s: closing\n", __func__); + if (r_xprt->rx_ep.rep_connected > 0) + xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); } @@ -485,6 +492,11 @@ xprt_rdma_connect(struct rpc_task *task) /* Reconnect */ schedule_delayed_work(&r_xprt->rdma_connect, xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > (30 * HZ)) + xprt->reestablish_timeout = (30 * HZ); + else if (xprt->reestablish_timeout < (5 * HZ)) + xprt->reestablish_timeout = (5 * HZ); } else { schedule_delayed_work(&r_xprt->rdma_connect, 0); if (!RPC_IS_ASYNC(task)) @@ -591,6 +603,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) } dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); out: + req->rl_connect_cookie = 0; /* our reserved value */ return req->rl_xdr_buf; outfail: @@ -694,13 +707,21 @@ xprt_rdma_send_request(struct rpc_task *task) req->rl_reply->rr_xprt = xprt; } - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) { - xprt_disconnect_done(xprt); - return -ENOTCONN; /* implies disconnect */ - } + /* Must suppress retransmit to maintain credits */ + if (req->rl_connect_cookie == xprt->connect_cookie) + goto drop_connection; + req->rl_connect_cookie = xprt->connect_cookie; + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + task->tk_bytes_sent += rqst->rq_snd_buf.len; rqst->rq_bytes_sent = 0; return 0; + +drop_connection: + xprt_disconnect_done(xprt); + return -ENOTCONN; /* implies disconnect */ } static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) @@ -770,7 +791,7 @@ static void __exit xprt_rdma_cleanup(void) { int rc; - dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); + dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n"); #ifdef RPC_DEBUG if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 8ea283ecc52..a5fef5e6c32 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -284,6 +284,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: + ia->ri_async_rc = 0; complete(&ia->ri_done); break; case RDMA_CM_EVENT_ADDR_ERROR: @@ -338,13 +339,32 @@ connected: wake_up_all(&ep->rep_connect_wait); break; default: - ia->ri_async_rc = -EINVAL; - dprintk("RPC: %s: unexpected CM event %X\n", + dprintk("RPC: %s: unexpected CM event %d\n", __func__, event->event); - complete(&ia->ri_done); break; } +#ifdef RPC_DEBUG + if (connstate == 1) { + int ird = attr.max_dest_rd_atomic; + int tird = ep->rep_remote_cma.responder_resources; + printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u " + "on %s, memreg %d slots %d ird %d%s\n", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port), + ia->ri_id->device->name, + ia->ri_memreg_strategy, + xprt->rx_buf.rb_max_requests, + ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); + } else if (connstate < 0) { + printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u " + "closed (%d)\n", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port), + connstate); + } +#endif + return 0; } @@ -355,6 +375,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rdma_cm_id *id; int rc; + init_completion(&ia->ri_done); + id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP); if (IS_ERR(id)) { rc = PTR_ERR(id); @@ -363,26 +385,28 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, return id; } - ia->ri_async_rc = 0; + ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", __func__, rc); goto out; } - wait_for_completion(&ia->ri_done); + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) goto out; - ia->ri_async_rc = 0; + ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); goto out; } - wait_for_completion(&ia->ri_done); + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) goto out; @@ -423,11 +447,10 @@ rpcrdma_clean_cq(struct ib_cq *cq) int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { - int rc; + int rc, mem_priv; + struct ib_device_attr devattr; struct rpcrdma_ia *ia = &xprt->rx_ia; - init_completion(&ia->ri_done); - ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); @@ -443,6 +466,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) } /* + * Query the device to determine if the requested memory + * registration strategy is supported. If it isn't, set the + * strategy to a globally supported model. + */ + rc = ib_query_device(ia->ri_id->device, &devattr); + if (rc) { + dprintk("RPC: %s: ib_query_device failed %d\n", + __func__, rc); + goto out2; + } + + if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { + ia->ri_have_dma_lkey = 1; + ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; + } + + switch (memreg) { + case RPCRDMA_MEMWINDOWS: + case RPCRDMA_MEMWINDOWS_ASYNC: + if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { + dprintk("RPC: %s: MEMWINDOWS registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; + } + break; + case RPCRDMA_MTHCAFMR: + if (!ia->ri_id->device->alloc_fmr) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; +#endif + } + break; + case RPCRDMA_FRMR: + /* Requires both frmr reg and local dma lkey */ + if ((devattr.device_cap_flags & + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; +#endif + } + break; + } + + /* * Optionally obtain an underlying physical identity mapping in * order to do a memory window-based bind. This base registration * is protected from remote access - that is enabled only by binding @@ -450,22 +540,28 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) * revoked after the corresponding completion similar to a storage * adapter. */ - if (memreg > RPCRDMA_REGISTER) { - int mem_priv = IB_ACCESS_LOCAL_WRITE; - switch (memreg) { + switch (memreg) { + case RPCRDMA_BOUNCEBUFFERS: + case RPCRDMA_REGISTER: + case RPCRDMA_FRMR: + break; #if RPCRDMA_PERSISTENT_REGISTRATION - case RPCRDMA_ALLPHYSICAL: - mem_priv |= IB_ACCESS_REMOTE_WRITE; - mem_priv |= IB_ACCESS_REMOTE_READ; - break; + case RPCRDMA_ALLPHYSICAL: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + goto register_setup; #endif - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - mem_priv |= IB_ACCESS_MW_BIND; - break; - default: + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_MW_BIND; + goto register_setup; + case RPCRDMA_MTHCAFMR: + if (ia->ri_have_dma_lkey) break; - } + mem_priv = IB_ACCESS_LOCAL_WRITE; + register_setup: ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " @@ -475,7 +571,15 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) memreg = RPCRDMA_REGISTER; ia->ri_bind_mem = NULL; } + break; + default: + printk(KERN_ERR "%s: invalid memory registration mode %d\n", + __func__, memreg); + rc = -EINVAL; + goto out2; } + dprintk("RPC: %s: memory registration strategy is %d\n", + __func__, memreg); /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; @@ -483,6 +587,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) return 0; out2: rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; out1: return rc; } @@ -503,15 +608,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) dprintk("RPC: %s: ib_dereg_mr returned %i\n", __func__, rc); } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp) - rdma_destroy_qp(ia->ri_id); + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { + if (ia->ri_id->qp) + rdma_destroy_qp(ia->ri_id); + rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; + } if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { rc = ib_dealloc_pd(ia->ri_pd); dprintk("RPC: %s: ib_dealloc_pd returned %i\n", __func__, rc); } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) - rdma_destroy_id(ia->ri_id); } /* @@ -541,6 +648,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + /* Add room for frmr register and invalidate WRs */ + ep->rep_attr.cap.max_send_wr *= 3; + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) + return -EINVAL; + break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: /* Add room for mw_binds+unbinds - overkill! */ @@ -617,29 +730,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_remote_cma.private_data_len = 0; /* Client offers RDMA Read but does not initiate */ - switch (ia->ri_memreg_strategy) { - case RPCRDMA_BOUNCEBUFFERS: + ep->rep_remote_cma.initiator_depth = 0; + if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) ep->rep_remote_cma.responder_resources = 0; - break; - case RPCRDMA_MTHCAFMR: - case RPCRDMA_REGISTER: - ep->rep_remote_cma.responder_resources = cdata->max_requests * - (RPCRDMA_MAX_DATA_SEGS / 8); - break; - case RPCRDMA_MEMWINDOWS: - case RPCRDMA_MEMWINDOWS_ASYNC: -#if RPCRDMA_PERSISTENT_REGISTRATION - case RPCRDMA_ALLPHYSICAL: -#endif - ep->rep_remote_cma.responder_resources = cdata->max_requests * - (RPCRDMA_MAX_DATA_SEGS / 2); - break; - default: - break; - } - if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom) + else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + ep->rep_remote_cma.responder_resources = 32; + else ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; - ep->rep_remote_cma.initiator_depth = 0; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; @@ -679,21 +776,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (rc) dprintk("RPC: %s: rpcrdma_ep_disconnect" " returned %i\n", __func__, rc); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; } - ep->rep_func = NULL; - /* padding - could be done in rpcrdma_buffer_destroy... */ if (ep->rep_pad_mr) { rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); ep->rep_pad_mr = NULL; } - if (ia->ri_id->qp) { - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - rpcrdma_clean_cq(ep->rep_cq); rc = ib_destroy_cq(ep->rep_cq); if (rc) @@ -712,9 +804,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rdma_cm_id *id; int rc = 0; int retry_count = 0; - int reconnect = (ep->rep_connected != 0); - if (reconnect) { + if (ep->rep_connected != 0) { struct rpcrdma_xprt *xprt; retry: rc = rpcrdma_ep_disconnect(ep, ia); @@ -745,6 +836,7 @@ retry: goto out; } /* END TEMP */ + rdma_destroy_qp(ia->ri_id); rdma_destroy_id(ia->ri_id); ia->ri_id = id; } @@ -769,14 +861,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { } } - /* Theoretically a client initiator_depth > 0 is not needed, - * but many peers fail to complete the connection unless they - * == responder_resources! */ - if (ep->rep_remote_cma.initiator_depth != - ep->rep_remote_cma.responder_resources) - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; - ep->rep_connected = 0; rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -786,9 +870,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { goto out; } - if (reconnect) - return 0; - wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); /* @@ -805,14 +886,16 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { if (ep->rep_connected <= 0) { /* Sometimes, the only way to reliably connect to remote * CMs is to use same nonzero values for ORD and IRD. */ - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; - if (ep->rep_remote_cma.initiator_depth == 0) - ++ep->rep_remote_cma.initiator_depth; - if (ep->rep_remote_cma.responder_resources == 0) - ++ep->rep_remote_cma.responder_resources; - if (retry_count++ == 0) + if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 && + (ep->rep_remote_cma.responder_resources == 0 || + ep->rep_remote_cma.initiator_depth != + ep->rep_remote_cma.responder_resources)) { + if (ep->rep_remote_cma.responder_resources == 0) + ep->rep_remote_cma.responder_resources = 1; + ep->rep_remote_cma.initiator_depth = + ep->rep_remote_cma.responder_resources; goto retry; + } rc = ep->rep_connected; } else { dprintk("RPC: %s: connected\n", __func__); @@ -863,6 +946,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, char *p; size_t len; int i, rc; + struct rpcrdma_mw *r; buf->rb_max_requests = cdata->max_requests; spin_lock_init(&buf->rb_lock); @@ -873,7 +957,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies * 4. padding, if any - * 5. mw's, if any + * 5. mw's, fmr's or frmr's, if any * Send/recv buffers in req/rep need to be registered */ @@ -881,6 +965,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); len += cdata->padding; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + len += buf->rb_max_requests * RPCRDMA_MAX_SEGS * + sizeof(struct rpcrdma_mw); + break; case RPCRDMA_MTHCAFMR: /* TBD we are perhaps overallocating here */ len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * @@ -927,15 +1015,37 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * and also reduce unbind-to-bind collision. */ INIT_LIST_HEAD(&buf->rb_mws); + r = (struct rpcrdma_mw *)p; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + goto out; + } + r->r.frmr.fr_pgl = + ib_alloc_fast_reg_page_list(ia->ri_id->device, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + goto out; + } + list_add(&r->mw_list, &buf->rb_mws); + ++r; + } + break; case RPCRDMA_MTHCAFMR: - { - struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; - struct ib_fmr_attr fa = { - RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT - }; /* TBD we are perhaps overallocating here */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { + static struct ib_fmr_attr fa = + { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; r->r.fmr = ib_alloc_fmr(ia->ri_pd, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, &fa); @@ -948,12 +1058,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, list_add(&r->mw_list, &buf->rb_mws); ++r; } - } break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; /* Allocate one extra request's worth, for full cycling */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { r->r.mw = ib_alloc_mw(ia->ri_pd); @@ -966,7 +1073,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, list_add(&r->mw_list, &buf->rb_mws); ++r; } - } break; default: break; @@ -1046,6 +1152,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { int rc, i; struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct rpcrdma_mw *r; /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) @@ -1065,11 +1172,19 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { while (!list_empty(&buf->rb_mws)) { - struct rpcrdma_mw *r; r = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s:" + " ib_dereg_mr" + " failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + break; case RPCRDMA_MTHCAFMR: rc = ib_dealloc_fmr(r->r.fmr); if (rc) @@ -1115,6 +1230,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { struct rpcrdma_req *req; unsigned long flags; + int i; + struct rpcrdma_mw *r; spin_lock_irqsave(&buffers->rb_lock, flags); if (buffers->rb_send_index == buffers->rb_max_requests) { @@ -1135,9 +1252,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) } buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; if (!list_empty(&buffers->rb_mws)) { - int i = RPCRDMA_MAX_SEGS - 1; + i = RPCRDMA_MAX_SEGS - 1; do { - struct rpcrdma_mw *r; r = list_entry(buffers->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); @@ -1171,6 +1287,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) req->rl_reply = NULL; } switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: @@ -1252,7 +1369,11 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, va, len, DMA_BIDIRECTIONAL); iov->length = len; - if (ia->ri_bind_mem != NULL) { + if (ia->ri_have_dma_lkey) { + *mrp = NULL; + iov->lkey = ia->ri_dma_lkey; + return 0; + } else if (ia->ri_bind_mem != NULL) { *mrp = NULL; iov->lkey = ia->ri_bind_mem->lkey; return 0; @@ -1329,15 +1450,292 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) seg->mr_dma, seg->mr_dmalen, seg->mr_dir); } +static int +rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr frmr_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments\n", + __func__, seg1->mr_chunk.rl_mw, i); + + /* Bump the key */ + key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + + /* Prepare FRMR WR */ + memset(&frmr_wr, 0, sizeof frmr_wr); + frmr_wr.opcode = IB_WR_FAST_REG_MR; + frmr_wr.send_flags = 0; /* unsignaled */ + frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma; + frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; + frmr_wr.wr.fast_reg.page_list_len = i; + frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; + frmr_wr.wr.fast_reg.access_flags = (writing ? + IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ); + frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); + + if (rc) { + dprintk("RPC: %s: failed ib_post_send for register," + " status %i\n", __func__, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + + memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.send_flags = 0; /* unsignaled */ + invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + if (rc) + dprintk("RPC: %s: failed ib_post_send for invalidate," + " status %i\n", __func__, rc); + return rc; +} + +static int +rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, + physaddrs, i, seg1->mr_dma); + if (rc) { + dprintk("RPC: %s: failed ib_map_phys_fmr " + "%u@0x%llx+%i (%d)... status %i\n", __func__, + len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + LIST_HEAD(l); + int rc; + + list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_unmap_fmr," + " status %i\n", __func__, rc); + return rc; +} + +static int +rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct ib_mw_bind param; + int rc; + + *nsegs = 1; + rpcrdma_map_one(ia, seg, writing); + param.mr = ia->ri_bind_mem; + param.wr_id = 0ULL; /* no send cookie */ + param.addr = seg->mr_dma; + param.length = seg->mr_len; + param.send_flags = 0; + param.mw_access_flags = mem_priv; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + if (rc) { + dprintk("RPC: %s: failed ib_bind_mw " + "%u@0x%llx status %i\n", + __func__, seg->mr_len, + (unsigned long long)seg->mr_dma, rc); + rpcrdma_unmap_one(ia, seg); + } else { + seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; + seg->mr_base = param.addr; + seg->mr_nsegs = 1; + } + return rc; +} + +static int +rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt, void **r) +{ + struct ib_mw_bind param; + LIST_HEAD(l); + int rc; + + BUG_ON(seg->mr_nsegs != 1); + param.mr = ia->ri_bind_mem; + param.addr = 0ULL; /* unbind */ + param.length = 0; + param.mw_access_flags = 0; + if (*r) { + param.wr_id = (u64) (unsigned long) *r; + param.send_flags = IB_SEND_SIGNALED; + INIT_CQCOUNT(&r_xprt->rx_ep); + } else { + param.wr_id = 0ULL; + param.send_flags = 0; + DECR_CQCOUNT(&r_xprt->rx_ep); + } + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + rpcrdma_unmap_one(ia, seg); + if (rc) + dprintk("RPC: %s: failed ib_(un)bind_mw," + " status %i\n", __func__, rc); + else + *r = NULL; /* will upcall on completion */ + return rc; +} + +static int +rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; + int len, i, rc = 0; + + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (len = 0, i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + ipb[i].addr = seg->mr_dma; + ipb[i].size = seg->mr_len; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) + break; + } + seg1->mr_base = seg1->mr_dma; + seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, + ipb, i, mem_priv, &seg1->mr_base); + if (IS_ERR(seg1->mr_chunk.rl_mr)) { + rc = PTR_ERR(seg1->mr_chunk.rl_mr); + dprintk("RPC: %s: failed ib_reg_phys_mr " + "%u@0x%llx (%d)... status %i\n", + __func__, len, + (unsigned long long)seg1->mr_dma, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + int rc; + + rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); + seg1->mr_chunk.rl_mr = NULL; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_dereg_mr," + " status %i\n", __func__, rc); + return rc; +} + int rpcrdma_register_external(struct rpcrdma_mr_seg *seg, int nsegs, int writing, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : - IB_ACCESS_REMOTE_READ); - struct rpcrdma_mr_seg *seg1 = seg; - int i; int rc = 0; switch (ia->ri_memreg_strategy) { @@ -1352,114 +1750,25 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, break; #endif - /* Registration using fast memory registration */ + /* Registration using frmr registration */ + case RPCRDMA_FRMR: + rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); + break; + + /* Registration using fmr memory registration */ case RPCRDMA_MTHCAFMR: - { - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; - int len, pageoff = offset_in_page(seg->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (nsegs > RPCRDMA_MAX_DATA_SEGS) - nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); - physaddrs[i] = seg->mr_dma; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - nsegs = i; - rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, - physaddrs, nsegs, seg1->mr_dma); - if (rc) { - dprintk("RPC: %s: failed ib_map_phys_fmr " - "%u@0x%llx+%i (%d)... status %i\n", __func__, - len, (unsigned long long)seg1->mr_dma, - pageoff, nsegs, rc); - while (nsegs--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = nsegs; - seg1->mr_len = len; - } - } + rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); break; /* Registration using memory windows */ case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct ib_mw_bind param; - rpcrdma_map_one(ia, seg, writing); - param.mr = ia->ri_bind_mem; - param.wr_id = 0ULL; /* no send cookie */ - param.addr = seg->mr_dma; - param.length = seg->mr_len; - param.send_flags = 0; - param.mw_access_flags = mem_priv; - - DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_bind_mw(ia->ri_id->qp, - seg->mr_chunk.rl_mw->r.mw, ¶m); - if (rc) { - dprintk("RPC: %s: failed ib_bind_mw " - "%u@0x%llx status %i\n", - __func__, seg->mr_len, - (unsigned long long)seg->mr_dma, rc); - rpcrdma_unmap_one(ia, seg); - } else { - seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; - seg->mr_base = param.addr; - seg->mr_nsegs = 1; - nsegs = 1; - } - } + rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); break; /* Default registration each time */ default: - { - struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; - int len = 0; - if (nsegs > RPCRDMA_MAX_DATA_SEGS) - nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); - ipb[i].addr = seg->mr_dma; - ipb[i].size = seg->mr_len; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - nsegs = i; - seg1->mr_base = seg1->mr_dma; - seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, - ipb, nsegs, mem_priv, &seg1->mr_base); - if (IS_ERR(seg1->mr_chunk.rl_mr)) { - rc = PTR_ERR(seg1->mr_chunk.rl_mr); - dprintk("RPC: %s: failed ib_reg_phys_mr " - "%u@0x%llx (%d)... status %i\n", - __func__, len, - (unsigned long long)seg1->mr_dma, nsegs, rc); - while (nsegs--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; - seg1->mr_nsegs = nsegs; - seg1->mr_len = len; - } - } + rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); break; } if (rc) @@ -1473,7 +1782,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt, void *r) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mr_seg *seg1 = seg; int nsegs = seg->mr_nsegs, rc; switch (ia->ri_memreg_strategy) { @@ -1486,56 +1794,21 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, break; #endif + case RPCRDMA_FRMR: + rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); + break; + case RPCRDMA_MTHCAFMR: - { - LIST_HEAD(l); - list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - } - if (rc) - dprintk("RPC: %s: failed ib_unmap_fmr," - " status %i\n", __func__, rc); + rc = rpcrdma_deregister_fmr_external(seg, ia); break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct ib_mw_bind param; - BUG_ON(nsegs != 1); - param.mr = ia->ri_bind_mem; - param.addr = 0ULL; /* unbind */ - param.length = 0; - param.mw_access_flags = 0; - if (r) { - param.wr_id = (u64) (unsigned long) r; - param.send_flags = IB_SEND_SIGNALED; - INIT_CQCOUNT(&r_xprt->rx_ep); - } else { - param.wr_id = 0ULL; - param.send_flags = 0; - DECR_CQCOUNT(&r_xprt->rx_ep); - } - rc = ib_bind_mw(ia->ri_id->qp, - seg->mr_chunk.rl_mw->r.mw, ¶m); - rpcrdma_unmap_one(ia, seg); - } - if (rc) - dprintk("RPC: %s: failed ib_(un)bind_mw," - " status %i\n", __func__, rc); - else - r = NULL; /* will upcall on completion */ + rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); break; default: - rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); - seg1->mr_chunk.rl_mr = NULL; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - if (rc) - dprintk("RPC: %s: failed ib_dereg_mr," - " status %i\n", __func__, rc); + rc = rpcrdma_deregister_default_external(seg, ia); break; } if (r) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2427822f8bd..c7a7eba991b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -51,6 +51,9 @@ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ +#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ +#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ + /* * Interface Adapter -- one per transport instance */ @@ -58,6 +61,8 @@ struct rpcrdma_ia { struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct ib_mr *ri_bind_mem; + u32 ri_dma_lkey; + int ri_have_dma_lkey; struct completion ri_done; int ri_async_rc; enum rpcrdma_memreg ri_memreg_strategy; @@ -156,6 +161,10 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ union { struct ib_mw *mw; struct ib_fmr *fmr; + struct { + struct ib_fast_reg_page_list *fr_pgl; + struct ib_mr *fr_mr; + } frmr; } r; struct list_head mw_list; } *rl_mw; @@ -175,6 +184,7 @@ struct rpcrdma_req { size_t rl_size; /* actual length of buffer */ unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ + unsigned int rl_connect_cookie; /* retry detection */ struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ @@ -198,7 +208,7 @@ struct rpcrdma_buffer { atomic_t rb_credits; /* most recent server credits */ unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ int rb_max_requests;/* client max requests */ - struct list_head rb_mws; /* optional memory windows/fmrs */ + struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ int rb_send_index; struct rpcrdma_req **rb_send_bufs; int rb_recv_index; @@ -273,6 +283,11 @@ struct rpcrdma_xprt { #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) +/* Setting this to 0 ensures interoperability with early servers. + * Setting this to 1 enhances certain unaligned read/write performance. + * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ +extern int xprt_rdma_pad_optimize; + /* * Interface Adapter calls - xprtrdma/verbs.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4486c59c3ac..9a288d5eea6 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -3,8 +3,8 @@ * * Client-side transport implementation for sockets. * - * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com> - * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com> + * TCP callback races fixes (C) 1998 Red Hat + * TCP send fixes (C) 1998 Red Hat * TCP NFS related read + write fixes * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie> * diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 015606b54d9..c647aab8d41 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1,7 +1,7 @@ /* * NET4: Implementation of BSD Unix domain sockets. * - * Authors: Alan Cox, <alan.cox@linux.org> + * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/wireless/core.c b/net/wireless/core.c index 5cadbeb76a1..5031db7b275 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -13,7 +13,6 @@ #include <linux/debugfs.h> #include <linux/notifier.h> #include <linux/device.h> -#include <linux/list.h> #include <net/genetlink.h> #include <net/cfg80211.h> #include <net/wireless.h> @@ -185,7 +184,8 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, if (result) goto out_unlock; - if (!debugfs_rename(rdev->wiphy.debugfsdir->d_parent, + if (rdev->wiphy.debugfsdir && + !debugfs_rename(rdev->wiphy.debugfsdir->d_parent, rdev->wiphy.debugfsdir, rdev->wiphy.debugfsdir->d_parent, newname)) @@ -318,6 +318,8 @@ int wiphy_register(struct wiphy *wiphy) drv->wiphy.debugfsdir = debugfs_create_dir(wiphy_name(&drv->wiphy), ieee80211_debugfs_dir); + if (IS_ERR(drv->wiphy.debugfsdir)) + drv->wiphy.debugfsdir = NULL; res = 0; out_unlock: |