From 9a812198ae49967f239789164c55ec3e72b7e0dd Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Thu, 14 Aug 2008 14:08:44 +0200 Subject: IPVS: Add genetlink interface implementation Add the implementation of the new Generic Netlink interface to IPVS and keep the old set/getsockopt interface for userspace backwards compatibility. Signed-off-by: Julius Volz Acked-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 875 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 875 insertions(+) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 6379705a8dc..d1dbd8b311b 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -2320,6 +2321,872 @@ static struct nf_sockopt_ops ip_vs_sockopts = { .owner = THIS_MODULE, }; +/* + * Generic Netlink interface + */ + +/* IPVS genetlink family */ +static struct genl_family ip_vs_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = IPVS_GENL_NAME, + .version = IPVS_GENL_VERSION, + .maxattr = IPVS_CMD_MAX, +}; + +/* Policy used for first-level command attributes */ +static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { + [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ +static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { + [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, + .len = IP_VS_IFNAME_MAXLEN }, + [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ +static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { + [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_SCHEDNAME_MAXLEN }, + [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, + .len = sizeof(struct ip_vs_flags) }, + [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ +static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { + [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, + struct ip_vs_stats *stats) +{ + struct nlattr *nl_stats = nla_nest_start(skb, container_type); + if (!nl_stats) + return -EMSGSIZE; + + spin_lock_bh(&stats->lock); + + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->conns); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->inpkts); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->outpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->inbytes); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->outbytes); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->cps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->inpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->outpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->inbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->outbps); + + spin_unlock_bh(&stats->lock); + + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + spin_unlock_bh(&stats->lock); + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_service(struct sk_buff *skb, + struct ip_vs_service *svc) +{ + struct nlattr *nl_service; + struct ip_vs_flags flags = { .flags = svc->flags, + .mask = ~0 }; + + nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); + if (!nl_service) + return -EMSGSIZE; + + NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, AF_INET); + + if (svc->fwmark) { + NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); + } else { + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); + NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); + } + + NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); + NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); + + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_service); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_service); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_service(struct sk_buff *skb, + struct ip_vs_service *svc, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_SERVICE); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_service(skb, svc) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_services(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0, i; + int start = cb->args[0]; + struct ip_vs_service *svc; + + mutex_lock(&__ip_vs_mutex); + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + cb->args[0] = idx; + + return skb->len; +} + +static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; + struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; + + /* Parse mandatory identifying service fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) + return -EINVAL; + + nla_af = attrs[IPVS_SVC_ATTR_AF]; + nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; + nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; + nla_port = attrs[IPVS_SVC_ATTR_PORT]; + nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; + + if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) + return -EINVAL; + + /* For now, only support IPv4 */ + if (nla_get_u16(nla_af) != AF_INET) + return -EAFNOSUPPORT; + + if (nla_fwmark) { + usvc->protocol = IPPROTO_TCP; + usvc->fwmark = nla_get_u32(nla_fwmark); + } else { + usvc->protocol = nla_get_u16(nla_protocol); + nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); + usvc->port = nla_get_u16(nla_port); + usvc->fwmark = 0; + } + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_sched, *nla_flags, *nla_timeout, + *nla_netmask; + struct ip_vs_flags flags; + struct ip_vs_service *svc; + + nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; + nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; + nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; + + if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) + return -EINVAL; + + nla_memcpy(&flags, nla_flags, sizeof(flags)); + + /* prefill flags from service if it already exists */ + if (usvc->fwmark) + svc = __ip_vs_svc_fwm_get(usvc->fwmark); + else + svc = __ip_vs_service_get(usvc->protocol, usvc->addr, + usvc->port); + if (svc) { + usvc->flags = svc->flags; + ip_vs_service_put(svc); + } else + usvc->flags = 0; + + /* set new flags from userland */ + usvc->flags = (usvc->flags & ~flags.mask) | + (flags.flags & flags.mask); + + strlcpy(usvc->sched_name, nla_data(nla_sched), + sizeof(usvc->sched_name)); + usvc->timeout = nla_get_u32(nla_timeout); + usvc->netmask = nla_get_u32(nla_netmask); + } + + return 0; +} + +static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) +{ + struct ip_vs_service_user usvc; + int ret; + + ret = ip_vs_genl_parse_service(&usvc, nla, 0); + if (ret) + return ERR_PTR(ret); + + if (usvc.fwmark) + return __ip_vs_svc_fwm_get(usvc.fwmark); + else + return __ip_vs_service_get(usvc.protocol, usvc.addr, + usvc.port); +} + +static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) +{ + struct nlattr *nl_dest; + + nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); + if (!nl_dest) + return -EMSGSIZE; + + NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); + NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); + + NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, + atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns)); + + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_dest); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_dest); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DEST); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_dest(skb, dest) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dests(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + + mutex_lock(&__ip_vs_mutex); + + /* Try to find the service for which to dump destinations */ + if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, + IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) + goto out_err; + + svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc) || svc == NULL) + goto out_err; + + /* Dump the destinations */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + +nla_put_failure: + cb->args[0] = idx; + ip_vs_service_put(svc); + +out_err: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_parse_dest(struct ip_vs_dest_user *udest, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; + struct nlattr *nla_addr, *nla_port; + + /* Parse mandatory identifying destination fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) + return -EINVAL; + + nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; + nla_port = attrs[IPVS_DEST_ATTR_PORT]; + + if (!(nla_addr && nla_port)) + return -EINVAL; + + nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); + udest->port = nla_get_u16(nla_port); + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, + *nla_l_thresh; + + nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; + nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; + nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; + nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; + + if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) + return -EINVAL; + + udest->conn_flags = nla_get_u32(nla_fwd) + & IP_VS_CONN_F_FWD_MASK; + udest->weight = nla_get_u32(nla_weight); + udest->u_threshold = nla_get_u32(nla_u_thresh); + udest->l_threshold = nla_get_u32(nla_l_thresh); + } + + return 0; +} + +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid) +{ + struct nlattr *nl_daemon; + + nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); + if (!nl_daemon) + return -EMSGSIZE; + + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); + NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); + + nla_nest_end(skb, nl_daemon); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_daemon); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid, + struct netlink_callback *cb) +{ + void *hdr; + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DAEMON); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemons(struct sk_buff *skb, + struct netlink_callback *cb) +{ + mutex_lock(&__ip_vs_mutex); + if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, + ip_vs_master_mcast_ifn, + ip_vs_master_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[0] = 1; + } + + if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, + ip_vs_backup_mcast_ifn, + ip_vs_backup_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[1] = 1; + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_new_daemon(struct nlattr **attrs) +{ + if (!(attrs[IPVS_DAEMON_ATTR_STATE] && + attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && + attrs[IPVS_DAEMON_ATTR_SYNC_ID])) + return -EINVAL; + + return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); +} + +static int ip_vs_genl_del_daemon(struct nlattr **attrs) +{ + if (!attrs[IPVS_DAEMON_ATTR_STATE]) + return -EINVAL; + + return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); +} + +static int ip_vs_genl_set_config(struct nlattr **attrs) +{ + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) + t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) + t.tcp_fin_timeout = + nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) + t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); + + return ip_vs_set_timeout(&t); +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct ip_vs_service *svc = NULL; + struct ip_vs_service_user usvc; + struct ip_vs_dest_user udest; + int ret = 0, cmd; + int need_full_svc = 0, need_full_dest = 0; + + cmd = info->genlhdr->cmd; + + mutex_lock(&__ip_vs_mutex); + + if (cmd == IPVS_CMD_FLUSH) { + ret = ip_vs_flush(); + goto out; + } else if (cmd == IPVS_CMD_SET_CONFIG) { + ret = ip_vs_genl_set_config(info->attrs); + goto out; + } else if (cmd == IPVS_CMD_NEW_DAEMON || + cmd == IPVS_CMD_DEL_DAEMON) { + + struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; + + if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || + nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, + info->attrs[IPVS_CMD_ATTR_DAEMON], + ip_vs_daemon_policy)) { + ret = -EINVAL; + goto out; + } + + if (cmd == IPVS_CMD_NEW_DAEMON) + ret = ip_vs_genl_new_daemon(daemon_attrs); + else + ret = ip_vs_genl_del_daemon(daemon_attrs); + goto out; + } else if (cmd == IPVS_CMD_ZERO && + !info->attrs[IPVS_CMD_ATTR_SERVICE]) { + ret = ip_vs_zero_all(); + goto out; + } + + /* All following commands require a service argument, so check if we + * received a valid one. We need a full service specification when + * adding / editing a service. Only identifying members otherwise. */ + if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) + need_full_svc = 1; + + ret = ip_vs_genl_parse_service(&usvc, + info->attrs[IPVS_CMD_ATTR_SERVICE], + need_full_svc); + if (ret) + goto out; + + /* Lookup the exact service by or fwmark */ + if (usvc.fwmark == 0) + svc = __ip_vs_service_get(usvc.protocol, usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_get(usvc.fwmark); + + /* Unless we're adding a new service, the service must already exist */ + if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { + ret = -ESRCH; + goto out; + } + + /* Destination commands require a valid destination argument. For + * adding / editing a destination, we need a full destination + * specification. */ + if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || + cmd == IPVS_CMD_DEL_DEST) { + if (cmd != IPVS_CMD_DEL_DEST) + need_full_dest = 1; + + ret = ip_vs_genl_parse_dest(&udest, + info->attrs[IPVS_CMD_ATTR_DEST], + need_full_dest); + if (ret) + goto out; + } + + switch (cmd) { + case IPVS_CMD_NEW_SERVICE: + if (svc == NULL) + ret = ip_vs_add_service(&usvc, &svc); + else + ret = -EEXIST; + break; + case IPVS_CMD_SET_SERVICE: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IPVS_CMD_DEL_SERVICE: + ret = ip_vs_del_service(svc); + break; + case IPVS_CMD_NEW_DEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IPVS_CMD_SET_DEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IPVS_CMD_DEL_DEST: + ret = ip_vs_del_dest(svc, &udest); + break; + case IPVS_CMD_ZERO: + ret = ip_vs_zero_service(svc); + break; + default: + ret = -EINVAL; + } + +out: + if (svc) + ip_vs_service_put(svc); + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + +static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *reply; + int ret, cmd, reply_cmd; + + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_GET_SERVICE) + reply_cmd = IPVS_CMD_NEW_SERVICE; + else if (cmd == IPVS_CMD_GET_INFO) + reply_cmd = IPVS_CMD_SET_INFO; + else if (cmd == IPVS_CMD_GET_CONFIG) + reply_cmd = IPVS_CMD_SET_CONFIG; + else { + IP_VS_ERR("unknown Generic Netlink command\n"); + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&__ip_vs_mutex); + + reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); + if (reply == NULL) + goto nla_put_failure; + + switch (cmd) { + case IPVS_CMD_GET_SERVICE: + { + struct ip_vs_service *svc; + + svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc)) { + ret = PTR_ERR(svc); + goto out_err; + } else if (svc) { + ret = ip_vs_genl_fill_service(msg, svc); + ip_vs_service_put(svc); + if (ret) + goto nla_put_failure; + } else { + ret = -ESRCH; + goto out_err; + } + + break; + } + + case IPVS_CMD_GET_CONFIG: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); +#ifdef CONFIG_IP_VS_PROTO_TCP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); +#endif + + break; + } + + case IPVS_CMD_GET_INFO: + NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); + NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + IP_VS_CONN_TAB_SIZE); + break; + } + + genlmsg_end(msg, reply); + ret = genlmsg_unicast(msg, info->snd_pid); + goto out; + +nla_put_failure: + IP_VS_ERR("not enough space in Netlink message\n"); + ret = -EMSGSIZE; + +out_err: + nlmsg_free(msg); +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + + +static struct genl_ops ip_vs_genl_ops[] __read_mostly = { + { + .cmd = IPVS_CMD_NEW_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_SERVICE, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + .dumpit = ip_vs_genl_dump_services, + .policy = ip_vs_cmd_policy, + }, + { + .cmd = IPVS_CMD_NEW_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .dumpit = ip_vs_genl_dump_dests, + }, + { + .cmd = IPVS_CMD_NEW_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DAEMON, + .flags = GENL_ADMIN_PERM, + .dumpit = ip_vs_genl_dump_daemons, + }, + { + .cmd = IPVS_CMD_SET_CONFIG, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_CONFIG, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_GET_INFO, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_ZERO, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_FLUSH, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, +}; + +static int __init ip_vs_genl_register(void) +{ + int ret, i; + + ret = genl_register_family(&ip_vs_genl_family); + if (ret) + return ret; + + for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) { + ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]); + if (ret) + goto err_out; + } + return 0; + +err_out: + genl_unregister_family(&ip_vs_genl_family); + return ret; +} + +static void ip_vs_genl_unregister(void) +{ + genl_unregister_family(&ip_vs_genl_family); +} + +/* End of Generic Netlink interface definitions */ + int __init ip_vs_control_init(void) { @@ -2334,6 +3201,13 @@ int __init ip_vs_control_init(void) return ret; } + ret = ip_vs_genl_register(); + if (ret) { + IP_VS_ERR("cannot register Generic Netlink interface.\n"); + nf_unregister_sockopt(&ip_vs_sockopts); + return ret; + } + proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); @@ -2368,6 +3242,7 @@ void ip_vs_control_cleanup(void) unregister_sysctl_table(sysctl_header); proc_net_remove(&init_net, "ip_vs_stats"); proc_net_remove(&init_net, "ip_vs"); + ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); } -- cgit v1.2.3 From 82dfb6f32219d8e6cf6b979a520cb2b11d977d4e Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Mon, 11 Aug 2008 19:36:06 +0000 Subject: ipvs: Only call init_service, update_service and done_service for schedulers if defined There are schedulers that only schedule based on data available in the service or destination structures and they don't need any persistent storage or initialization routine. These schedulers currently provide dummy functions for the init_service, update_service and/or done_service functions. For the init_service and done_service cases we already have code that only calls these functions, if the scheduler provides them. Do the same for the update_service case and remove the dummy functions from all schedulers. Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 21 ++++++++++++--------- net/ipv4/ipvs/ip_vs_lblc.c | 7 ------- net/ipv4/ipvs/ip_vs_lblcr.c | 7 ------- net/ipv4/ipvs/ip_vs_lc.c | 21 --------------------- net/ipv4/ipvs/ip_vs_nq.c | 24 ------------------------ net/ipv4/ipvs/ip_vs_rr.c | 7 ------- net/ipv4/ipvs/ip_vs_sed.c | 24 ------------------------ net/ipv4/ipvs/ip_vs_wlc.c | 24 ------------------------ 8 files changed, 12 insertions(+), 123 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index d1dbd8b311b..ede101eeec1 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -869,7 +869,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) svc->num_dests++; /* call the update_service function of its scheduler */ - svc->scheduler->update_service(svc); + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); write_unlock_bh(&__ip_vs_svc_lock); return 0; @@ -899,7 +900,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) svc->num_dests++; /* call the update_service function of its scheduler */ - svc->scheduler->update_service(svc); + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); write_unlock_bh(&__ip_vs_svc_lock); @@ -949,7 +951,8 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); /* call the update_service, because server weight may be changed */ - svc->scheduler->update_service(svc); + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); write_unlock_bh(&__ip_vs_svc_lock); @@ -1012,12 +1015,12 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, */ list_del(&dest->n_list); svc->num_dests--; - if (svcupd) { - /* - * Call the update_service function of its scheduler - */ - svc->scheduler->update_service(svc); - } + + /* + * Call the update_service function of its scheduler + */ + if (svcupd && svc->scheduler->update_service) + svc->scheduler->update_service(svc); } diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 7a6a319f544..4a14d069f8a 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -388,12 +388,6 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) } -static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline struct ip_vs_dest * __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) { @@ -542,7 +536,6 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), .init_service = ip_vs_lblc_init_svc, .done_service = ip_vs_lblc_done_svc, - .update_service = ip_vs_lblc_update_svc, .schedule = ip_vs_lblc_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index c234e73968a..46b870385b8 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -572,12 +572,6 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) } -static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline struct ip_vs_dest * __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) { @@ -731,7 +725,6 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), .init_service = ip_vs_lblcr_init_svc, .done_service = ip_vs_lblcr_done_svc, - .update_service = ip_vs_lblcr_update_svc, .schedule = ip_vs_lblcr_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c index ebcdbf75ac6..2c3de1b6351 100644 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ b/net/ipv4/ipvs/ip_vs_lc.c @@ -20,24 +20,6 @@ #include -static int ip_vs_lc_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int ip_vs_lc_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int ip_vs_lc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline unsigned int ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) { @@ -99,9 +81,6 @@ static struct ip_vs_scheduler ip_vs_lc_scheduler = { .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), - .init_service = ip_vs_lc_init_svc, - .done_service = ip_vs_lc_done_svc, - .update_service = ip_vs_lc_update_svc, .schedule = ip_vs_lc_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c index 92f3a677003..5330d5a2de1 100644 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ b/net/ipv4/ipvs/ip_vs_nq.c @@ -37,27 +37,6 @@ #include -static int -ip_vs_nq_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_nq_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_nq_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline unsigned int ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) { @@ -137,9 +116,6 @@ static struct ip_vs_scheduler ip_vs_nq_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), - .init_service = ip_vs_nq_init_svc, - .done_service = ip_vs_nq_done_svc, - .update_service = ip_vs_nq_update_svc, .schedule = ip_vs_nq_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c index 358110d17e5..f7492911753 100644 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ b/net/ipv4/ipvs/ip_vs_rr.c @@ -32,12 +32,6 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_rr_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static int ip_vs_rr_update_svc(struct ip_vs_service *svc) { svc->sched_data = &svc->destinations; @@ -96,7 +90,6 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = { .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), .init_service = ip_vs_rr_init_svc, - .done_service = ip_vs_rr_done_svc, .update_service = ip_vs_rr_update_svc, .schedule = ip_vs_rr_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c index 77663d84cbd..53f73bea66c 100644 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ b/net/ipv4/ipvs/ip_vs_sed.c @@ -41,27 +41,6 @@ #include -static int -ip_vs_sed_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_sed_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_sed_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline unsigned int ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) { @@ -139,9 +118,6 @@ static struct ip_vs_scheduler ip_vs_sed_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), - .init_service = ip_vs_sed_init_svc, - .done_service = ip_vs_sed_done_svc, - .update_service = ip_vs_sed_update_svc, .schedule = ip_vs_sed_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c index 9b0ef86bb1f..df7ad8d7476 100644 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ b/net/ipv4/ipvs/ip_vs_wlc.c @@ -25,27 +25,6 @@ #include -static int -ip_vs_wlc_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_wlc_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_wlc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline unsigned int ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) { @@ -127,9 +106,6 @@ static struct ip_vs_scheduler ip_vs_wlc_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), - .init_service = ip_vs_wlc_init_svc, - .done_service = ip_vs_wlc_done_svc, - .update_service = ip_vs_wlc_update_svc, .schedule = ip_vs_wlc_schedule, }; -- cgit v1.2.3 From a919cf4b6b499416b6e2247dbc79196c4325f2e6 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Thu, 14 Aug 2008 00:47:16 +0200 Subject: ipvs: Create init functions for estimator code Commit 8ab19ea36c5c5340ff598e4d15fc084eb65671dc ("ipvs: Fix possible deadlock in estimator code") fixed a deadlock condition, but that condition can only happen during unload of IPVS, because during normal operation there is at least our global stats structure in the estimator list. The mod_timer() and del_timer_sync() calls are actually initialization and cleanup code in disguise. Let's make it explicit and move them to their own init and cleanup function. Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 8 ++++++-- net/ipv4/ipvs/ip_vs_est.c | 18 +++++++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index a7879eafc3b..9fbf0a6d739 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1070,10 +1070,12 @@ static int __init ip_vs_init(void) { int ret; + ip_vs_estimator_init(); + ret = ip_vs_control_init(); if (ret < 0) { IP_VS_ERR("can't setup control.\n"); - goto cleanup_nothing; + goto cleanup_estimator; } ip_vs_protocol_init(); @@ -1106,7 +1108,8 @@ static int __init ip_vs_init(void) cleanup_protocol: ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); - cleanup_nothing: + cleanup_estimator: + ip_vs_estimator_cleanup(); return ret; } @@ -1117,6 +1120,7 @@ static void __exit ip_vs_cleanup(void) ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); + ip_vs_estimator_cleanup(); IP_VS_INFO("ipvs unloaded.\n"); } diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 5a20f93bd7f..4fb620ec208 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -124,8 +124,6 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats) est->outbps = stats->outbps<<5; spin_lock_bh(&est_lock); - if (list_empty(&est_list)) - mod_timer(&est_timer, jiffies + 2 * HZ); list_add(&est->list, &est_list); spin_unlock_bh(&est_lock); } @@ -136,11 +134,6 @@ void ip_vs_kill_estimator(struct ip_vs_stats *stats) spin_lock_bh(&est_lock); list_del(&est->list); - while (list_empty(&est_list) && try_to_del_timer_sync(&est_timer) < 0) { - spin_unlock_bh(&est_lock); - cpu_relax(); - spin_lock_bh(&est_lock); - } spin_unlock_bh(&est_lock); } @@ -160,3 +153,14 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats) est->inbps = 0; est->outbps = 0; } + +int __init ip_vs_estimator_init(void) +{ + mod_timer(&est_timer, jiffies + 2 * HZ); + return 0; +} + +void ip_vs_estimator_cleanup(void) +{ + del_timer_sync(&est_timer); +} -- cgit v1.2.3 From 4a031b0e6acd8a8c23725ceb5db6a0aa5c4e231f Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 15 Aug 2008 09:26:15 +1000 Subject: ipvs: rename __ip_vs_wlc_schedule in lblc and lblcr schedulers For the sake of clarity, rename __ip_vs_wlc_schedule() in lblc.c to __ip_vs_lblc_schedule() and the version in lblcr.c to __ip_vs_lblc_schedule(). I guess the original name stuck from a copy and paste. Cc: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_lblc.c | 6 +++--- net/ipv4/ipvs/ip_vs_lblcr.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 4a14d069f8a..b9b334cccf3 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -389,7 +389,7 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) static inline struct ip_vs_dest * -__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) { struct ip_vs_dest *dest, *least; int loh, doh; @@ -488,7 +488,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) tbl = (struct ip_vs_lblc_table *)svc->sched_data; en = ip_vs_lblc_get(tbl, iph->daddr); if (en == NULL) { - dest = __ip_vs_wlc_schedule(svc, iph); + dest = __ip_vs_lblc_schedule(svc, iph); if (dest == NULL) { IP_VS_DBG(1, "no destination available\n"); return NULL; @@ -503,7 +503,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 || is_overloaded(dest, svc)) { - dest = __ip_vs_wlc_schedule(svc, iph); + dest = __ip_vs_lblc_schedule(svc, iph); if (dest == NULL) { IP_VS_DBG(1, "no destination available\n"); return NULL; diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 46b870385b8..f1c84503689 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -573,7 +573,7 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) static inline struct ip_vs_dest * -__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) { struct ip_vs_dest *dest, *least; int loh, doh; @@ -673,7 +673,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) tbl = (struct ip_vs_lblcr_table *)svc->sched_data; en = ip_vs_lblcr_get(tbl, iph->daddr); if (en == NULL) { - dest = __ip_vs_wlc_schedule(svc, iph); + dest = __ip_vs_lblcr_schedule(svc, iph); if (dest == NULL) { IP_VS_DBG(1, "no destination available\n"); return NULL; @@ -687,7 +687,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } else { dest = ip_vs_dest_set_min(&en->set); if (!dest || is_overloaded(dest, svc)) { - dest = __ip_vs_wlc_schedule(svc, iph); + dest = __ip_vs_lblcr_schedule(svc, iph); if (dest == NULL) { IP_VS_DBG(1, "no destination available\n"); return NULL; -- cgit v1.2.3 From 39ac50d0c79747b186c1268d9a488f8c1d256be7 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Mon, 18 Aug 2008 00:52:08 +0200 Subject: ipvs: Fix race conditions in lblc scheduler We can't access the cache entry outside of our critical read-locked region, because someone may free that entry. And we also need to check in the critical region wether the destination is still available, i.e. it's not in the trash. If we drop our reference counter, the destination can be purged from the trash at any time. Our caller only guarantees that no destination is moved to the trash, while we are scheduling. Also there is no need for our own rwlock, there is already one in the service structure for use in the schedulers. Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_lblc.c | 204 +++++++++++++++++++++------------------------ 1 file changed, 96 insertions(+), 108 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index b9b334cccf3..d2a43aa3fe4 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -96,7 +96,6 @@ struct ip_vs_lblc_entry { * IPVS lblc hash table */ struct ip_vs_lblc_table { - rwlock_t lock; /* lock for this table */ struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ @@ -123,31 +122,6 @@ static ctl_table vs_vars_table[] = { static struct ctl_table_header * sysctl_header; -/* - * new/free a ip_vs_lblc_entry, which is a mapping of a destionation - * IP address to a server. - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest) -{ - struct ip_vs_lblc_entry *en; - - en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); - if (en == NULL) { - IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); - return NULL; - } - - INIT_LIST_HEAD(&en->list); - en->addr = daddr; - - atomic_inc(&dest->refcnt); - en->dest = dest; - - return en; -} - - static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) { list_del(&en->list); @@ -173,55 +147,66 @@ static inline unsigned ip_vs_lblc_hashkey(__be32 addr) * Hash an entry in the ip_vs_lblc_table. * returns bool success. */ -static int +static void ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) { - unsigned hash; - - if (!list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } + unsigned hash = ip_vs_lblc_hashkey(en->addr); - /* - * Hash by destination IP address - */ - hash = ip_vs_lblc_hashkey(en->addr); - - write_lock(&tbl->lock); list_add(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); - write_unlock(&tbl->lock); - - return 1; } /* - * Get ip_vs_lblc_entry associated with supplied parameters. + * Get ip_vs_lblc_entry associated with supplied parameters. Called under read + * lock */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) { - unsigned hash; + unsigned hash = ip_vs_lblc_hashkey(addr); struct ip_vs_lblc_entry *en; - hash = ip_vs_lblc_hashkey(addr); + list_for_each_entry(en, &tbl->bucket[hash], list) + if (en->addr == addr) + return en; - read_lock(&tbl->lock); + return NULL; +} - list_for_each_entry(en, &tbl->bucket[hash], list) { - if (en->addr == addr) { - /* HIT */ - read_unlock(&tbl->lock); - return en; + +/* + * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP + * address to a server. Called under write lock. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = ip_vs_lblc_get(tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); + return NULL; } - } - read_unlock(&tbl->lock); + en->addr = daddr; + en->lastuse = jiffies; - return NULL; + atomic_inc(&dest->refcnt); + en->dest = dest; + + ip_vs_lblc_hash(tbl, en); + } else if (en->dest != dest) { + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + + return en; } @@ -230,30 +215,29 @@ ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) */ static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) { - int i; struct ip_vs_lblc_entry *en, *nxt; + int i; for (i=0; ilock); list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { ip_vs_lblc_free(en); atomic_dec(&tbl->entries); } - write_unlock(&tbl->lock); } } -static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) +static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) { + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en, *nxt; unsigned long now = jiffies; int i, j; - struct ip_vs_lblc_entry *en, *nxt; for (i=0, j=tbl->rover; ilock); + write_lock(&svc->sched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + sysctl_ip_vs_lblc_expiration)) @@ -262,7 +246,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) ip_vs_lblc_free(en); atomic_dec(&tbl->entries); } - write_unlock(&tbl->lock); + write_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -281,17 +265,16 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) */ static void ip_vs_lblc_check_expire(unsigned long data) { - struct ip_vs_lblc_table *tbl; + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblc_table *tbl = svc->sched_data; unsigned long now = jiffies; int goal; int i, j; struct ip_vs_lblc_entry *en, *nxt; - tbl = (struct ip_vs_lblc_table *)data; - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ - ip_vs_lblc_full_check(tbl); + ip_vs_lblc_full_check(svc); tbl->counter = 1; goto out; } @@ -308,7 +291,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) for (i=0, j=tbl->rover; ilock); + write_lock(&svc->sched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) continue; @@ -317,7 +300,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) atomic_dec(&tbl->entries); goal--; } - write_unlock(&tbl->lock); + write_unlock(&svc->sched_lock); if (goal <= 0) break; } @@ -336,15 +319,14 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblc_table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); if (tbl == NULL) { IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); return -ENOMEM; } svc->sched_data = tbl; IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_lblc_table)); + "current service\n", sizeof(*tbl)); /* * Initialize the hash buckets @@ -352,7 +334,6 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) for (i=0; ibucket[i]); } - rwlock_init(&tbl->lock); tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; @@ -361,9 +342,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) * Hook periodic timer for garbage collection */ setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, - (unsigned long)tbl); - tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; - add_timer(&tbl->periodic_timer); + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); return 0; } @@ -380,9 +360,9 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) ip_vs_lblc_flush(tbl); /* release the table itself */ - kfree(svc->sched_data); + kfree(tbl); IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_lblc_table)); + sizeof(*tbl)); return 0; } @@ -478,46 +458,54 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) static struct ip_vs_dest * ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { - struct ip_vs_dest *dest; - struct ip_vs_lblc_table *tbl; - struct ip_vs_lblc_entry *en; + struct ip_vs_lblc_table *tbl = svc->sched_data; struct iphdr *iph = ip_hdr(skb); + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblc_entry *en; IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); - tbl = (struct ip_vs_lblc_table *)svc->sched_data; + /* First look in our cache */ + read_lock(&svc->sched_lock); en = ip_vs_lblc_get(tbl, iph->daddr); - if (en == NULL) { - dest = __ip_vs_lblc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - en = ip_vs_lblc_new(iph->daddr, dest); - if (en == NULL) { - return NULL; - } - ip_vs_lblc_hash(tbl, en); - } else { - dest = en->dest; - if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest, svc)) { - dest = __ip_vs_lblc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; - } + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* + * If the destination is not available, i.e. it's in the trash, + * we must ignore it, as it may be removed from under our feet, + * if someone drops our reference count. Our caller only makes + * sure that destinations, that are not in the trash, are not + * moved to the trash, while we are scheduling. But anyone can + * free up entries from the trash at any time. + */ + + if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) + dest = en->dest; + } + read_unlock(&svc->sched_lock); + + /* If the destination has a weight and is not overloaded, use it */ + if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; + + /* No cache entry or it is invalid, time to schedule */ + dest = __ip_vs_lblc_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; } - en->lastuse = jiffies; + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblc_new(tbl, iph->daddr, dest); + write_unlock(&svc->sched_lock); + +out: IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(en->addr), + NIPQUAD(iph->daddr), NIPQUAD(dest->addr), ntohs(dest->port)); -- cgit v1.2.3 From f728bafb5698076dd35bca35ee6cfe52ea1b8ab2 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Tue, 19 Aug 2008 08:16:19 +0200 Subject: ipvs: Fix race conditions in lblcr scheduler We can't access the cache entry outside of our critical read-locked region, because someone may free that entry. Also getting an entry under read lock, then locking for write and trying to delete that entry looks fishy, but should be no problem here, because we're only comparing a pointer. Also there is no need for our own rwlock, there is already one in the service structure for use in the schedulers. Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_lblcr.c | 229 ++++++++++++++++++++++---------------------- 1 file changed, 114 insertions(+), 115 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index f1c84503689..375a1ffb6b6 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) return NULL; } - e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); + e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e == NULL) { IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); return NULL; @@ -116,11 +116,9 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) e->dest = dest; /* link it to the list */ - write_lock(&set->lock); e->next = set->list; set->list = e; atomic_inc(&set->size); - write_unlock(&set->lock); set->lastmod = jiffies; return e; @@ -131,7 +129,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) { struct ip_vs_dest_list *e, **ep; - write_lock(&set->lock); for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { if (e->dest == dest) { /* HIT */ @@ -144,7 +141,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) } ep = &e->next; } - write_unlock(&set->lock); } static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) @@ -174,7 +170,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) if (set == NULL) return NULL; - read_lock(&set->lock); /* select the first destination server, whose weight > 0 */ for (e=set->list; e!=NULL; e=e->next) { least = e->dest; @@ -188,7 +183,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) goto nextstage; } } - read_unlock(&set->lock); return NULL; /* find the destination with the weighted least load */ @@ -207,7 +201,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) loh = doh; } } - read_unlock(&set->lock); IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", @@ -229,7 +222,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) if (set == NULL) return NULL; - read_lock(&set->lock); /* select the first destination server, whose weight > 0 */ for (e=set->list; e!=NULL; e=e->next) { most = e->dest; @@ -239,7 +231,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) goto nextstage; } } - read_unlock(&set->lock); return NULL; /* find the destination with the weighted most load */ @@ -256,7 +247,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) moh = doh; } } - read_unlock(&set->lock); IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", @@ -284,7 +274,6 @@ struct ip_vs_lblcr_entry { * IPVS lblcr hash table */ struct ip_vs_lblcr_table { - rwlock_t lock; /* lock for this table */ struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ @@ -311,32 +300,6 @@ static ctl_table vs_vars_table[] = { static struct ctl_table_header * sysctl_header; -/* - * new/free a ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. - */ -static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr) -{ - struct ip_vs_lblcr_entry *en; - - en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); - if (en == NULL) { - IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); - return NULL; - } - - INIT_LIST_HEAD(&en->list); - en->addr = daddr; - - /* initilize its dest set */ - atomic_set(&(en->set.size), 0); - en->set.list = NULL; - rwlock_init(&en->set.lock); - - return en; -} - - static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { list_del(&en->list); @@ -358,55 +321,68 @@ static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) * Hash an entry in the ip_vs_lblcr_table. * returns bool success. */ -static int +static void ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) { - unsigned hash; - - if (!list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } + unsigned hash = ip_vs_lblcr_hashkey(en->addr); - /* - * Hash by destination IP address - */ - hash = ip_vs_lblcr_hashkey(en->addr); - - write_lock(&tbl->lock); list_add(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); - write_unlock(&tbl->lock); - - return 1; } /* - * Get ip_vs_lblcr_entry associated with supplied parameters. + * Get ip_vs_lblcr_entry associated with supplied parameters. Called under + * read lock. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) { - unsigned hash; + unsigned hash = ip_vs_lblcr_hashkey(addr); struct ip_vs_lblcr_entry *en; - hash = ip_vs_lblcr_hashkey(addr); + list_for_each_entry(en, &tbl->bucket[hash], list) + if (en->addr == addr) + return en; + + return NULL; +} - read_lock(&tbl->lock); - list_for_each_entry(en, &tbl->bucket[hash], list) { - if (en->addr == addr) { - /* HIT */ - read_unlock(&tbl->lock); - return en; +/* + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. Called under write lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblcr_entry *en; + + en = ip_vs_lblcr_get(tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); + return NULL; } + + en->addr = daddr; + en->lastuse = jiffies; + + /* initilize its dest set */ + atomic_set(&(en->set.size), 0); + en->set.list = NULL; + rwlock_init(&en->set.lock); + + ip_vs_lblcr_hash(tbl, en); } - read_unlock(&tbl->lock); + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); - return NULL; + return en; } @@ -418,19 +394,18 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) int i; struct ip_vs_lblcr_entry *en, *nxt; + /* No locking required, only called during cleanup. */ for (i=0; ilock); list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); } - write_unlock(&tbl->lock); } } -static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) { + struct ip_vs_lblcr_table *tbl = svc->sched_data; unsigned long now = jiffies; int i, j; struct ip_vs_lblcr_entry *en, *nxt; @@ -438,7 +413,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) for (i=0, j=tbl->rover; ilock); + write_lock(&svc->sched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, now)) @@ -447,7 +422,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) ip_vs_lblcr_free(en); atomic_dec(&tbl->entries); } - write_unlock(&tbl->lock); + write_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -466,17 +441,16 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) */ static void ip_vs_lblcr_check_expire(unsigned long data) { - struct ip_vs_lblcr_table *tbl; + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblcr_table *tbl = svc->sched_data; unsigned long now = jiffies; int goal; int i, j; struct ip_vs_lblcr_entry *en, *nxt; - tbl = (struct ip_vs_lblcr_table *)data; - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ - ip_vs_lblcr_full_check(tbl); + ip_vs_lblcr_full_check(svc); tbl->counter = 1; goto out; } @@ -493,7 +467,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data) for (i=0, j=tbl->rover; ilock); + write_lock(&svc->sched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) continue; @@ -502,7 +476,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data) atomic_dec(&tbl->entries); goal--; } - write_unlock(&tbl->lock); + write_unlock(&svc->sched_lock); if (goal <= 0) break; } @@ -520,15 +494,14 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblcr_table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); if (tbl == NULL) { IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); return -ENOMEM; } svc->sched_data = tbl; IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_lblcr_table)); + "current service\n", sizeof(*tbl)); /* * Initialize the hash buckets @@ -536,7 +509,6 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) for (i=0; ibucket[i]); } - rwlock_init(&tbl->lock); tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; @@ -545,9 +517,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) * Hook periodic timer for garbage collection */ setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, - (unsigned long)tbl); - tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; - add_timer(&tbl->periodic_timer); + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); return 0; } @@ -564,9 +535,9 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) ip_vs_lblcr_flush(tbl); /* release the table itself */ - kfree(svc->sched_data); + kfree(tbl); IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_lblcr_table)); + sizeof(*tbl)); return 0; } @@ -663,50 +634,78 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) static struct ip_vs_dest * ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { - struct ip_vs_dest *dest; - struct ip_vs_lblcr_table *tbl; - struct ip_vs_lblcr_entry *en; + struct ip_vs_lblcr_table *tbl = svc->sched_data; struct iphdr *iph = ip_hdr(skb); + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblcr_entry *en; IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); - tbl = (struct ip_vs_lblcr_table *)svc->sched_data; + /* First look in our cache */ + read_lock(&svc->sched_lock); en = ip_vs_lblcr_get(tbl, iph->daddr); - if (en == NULL) { - dest = __ip_vs_lblcr_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - en = ip_vs_lblcr_new(iph->daddr); - if (en == NULL) { - return NULL; - } - ip_vs_dest_set_insert(&en->set, dest); - ip_vs_lblcr_hash(tbl, en); - } else { + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* Get the least loaded destination */ + read_lock(&en->set.lock); dest = ip_vs_dest_set_min(&en->set); - if (!dest || is_overloaded(dest, svc)) { - dest = __ip_vs_lblcr_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - ip_vs_dest_set_insert(&en->set, dest); - } + read_unlock(&en->set.lock); + + /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && - jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { + time_after(jiffies, en->set.lastmod + + sysctl_ip_vs_lblcr_expiration)) { struct ip_vs_dest *m; + + write_lock(&en->set.lock); m = ip_vs_dest_set_max(&en->set); if (m) ip_vs_dest_set_erase(&en->set, m); + write_unlock(&en->set.lock); + } + + /* If the destination is not overloaded, use it */ + if (dest && !is_overloaded(dest, svc)) { + read_unlock(&svc->sched_lock); + goto out; + } + + /* The cache entry is invalid, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + read_unlock(&svc->sched_lock); + return NULL; } + + /* Update our cache entry */ + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + } + read_unlock(&svc->sched_lock); + + if (dest) + goto out; + + /* No cache entry, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; } - en->lastuse = jiffies; + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblcr_new(tbl, iph->daddr, dest); + write_unlock(&svc->sched_lock); + +out: IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(en->addr), + NIPQUAD(iph->daddr), NIPQUAD(dest->addr), ntohs(dest->port)); -- cgit v1.2.3 From 409a19669e4cd8d1bab7dff31d3b6aa493ff60f0 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Fri, 22 Aug 2008 14:06:12 +0200 Subject: IPVS: Integrate ESP protocol into ip_vs_proto_ah.c Rename all ah_* functions to ah_esp_* (and adjust comments). Move ESP protocol definition into ip_vs_proto_ah.c and remove all usage of ip_vs_proto_esp.c. Make the compilation of ip_vs_proto_ah.c dependent on a new config variable, IP_VS_PROTO_AH_ESP, which is selected either by IP_VS_PROTO_ESP or IP_VS_PROTO_AH. Only compile the selected protocols' structures within this file. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/Kconfig | 6 ++++ net/ipv4/ipvs/Makefile | 3 +- net/ipv4/ipvs/ip_vs_proto_ah.c | 69 ++++++++++++++++++++++++++++-------------- 3 files changed, 54 insertions(+), 24 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig index 09d0c3f3566..2e48a7e2722 100644 --- a/net/ipv4/ipvs/Kconfig +++ b/net/ipv4/ipvs/Kconfig @@ -71,14 +71,20 @@ config IP_VS_PROTO_UDP This option enables support for load balancing UDP transport protocol. Say Y if unsure. +config IP_VS_PROTO_AH_ESP + bool + depends on UNDEFINED + config IP_VS_PROTO_ESP bool "ESP load balancing support" + select IP_VS_PROTO_AH_ESP ---help--- This option enables support for load balancing ESP (Encapsulation Security Payload) transport protocol. Say Y if unsure. config IP_VS_PROTO_AH bool "AH load balancing support" + select IP_VS_PROTO_AH_ESP ---help--- This option enables support for load balancing AH (Authentication Header) transport protocol. Say Y if unsure. diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile index 30e85de9fff..cda3e0860d6 100644 --- a/net/ipv4/ipvs/Makefile +++ b/net/ipv4/ipvs/Makefile @@ -6,8 +6,7 @@ ip_vs_proto-objs-y := ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah.o ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 73e0ea87c1f..3f9ebd7639a 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -1,5 +1,5 @@ /* - * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS + * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS * * Authors: Julian Anastasov , February 2002 * Wensong Zhang @@ -39,11 +39,11 @@ struct isakmp_hdr { static struct ip_vs_conn * -ah_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, - int inverse) +ah_esp_conn_in_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) { struct ip_vs_conn *cp; @@ -79,8 +79,8 @@ ah_conn_in_get(const struct sk_buff *skb, static struct ip_vs_conn * -ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) { struct ip_vs_conn *cp; @@ -112,12 +112,12 @@ ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, static int -ah_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) +ah_esp_conn_schedule(struct sk_buff *skb, + struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) { /* - * AH is only related traffic. Pass the packet to IP stack. + * AH/ESP is only related traffic. Pass the packet to IP stack. */ *verdict = NF_ACCEPT; return 0; @@ -125,8 +125,8 @@ ah_conn_schedule(struct sk_buff *skb, static void -ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) +ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) { char buf[256]; struct iphdr _iph, *ih; @@ -143,28 +143,29 @@ ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, } -static void ah_init(struct ip_vs_protocol *pp) +static void ah_esp_init(struct ip_vs_protocol *pp) { /* nothing to do now */ } -static void ah_exit(struct ip_vs_protocol *pp) +static void ah_esp_exit(struct ip_vs_protocol *pp) { /* nothing to do now */ } +#ifdef CONFIG_IP_VS_PROTO_AH struct ip_vs_protocol ip_vs_protocol_ah = { .name = "AH", .protocol = IPPROTO_AH, .num_states = 1, .dont_defrag = 1, - .init = ah_init, - .exit = ah_exit, - .conn_schedule = ah_conn_schedule, - .conn_in_get = ah_conn_in_get, - .conn_out_get = ah_conn_out_get, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, .snat_handler = NULL, .dnat_handler = NULL, .csum_check = NULL, @@ -172,7 +173,31 @@ struct ip_vs_protocol ip_vs_protocol_ah = { .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, - .debug_packet = ah_debug_packet, + .debug_packet = ah_esp_debug_packet, .timeout_change = NULL, /* ISAKMP */ .set_state_timeout = NULL, }; +#endif + +#ifdef CONFIG_IP_VS_PROTO_ESP +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif -- cgit v1.2.3 From e3c2ced8d21410e8bc897480081e2ffc516c0f70 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Fri, 22 Aug 2008 14:06:13 +0200 Subject: IPVS: Rename ip_vs_proto_ah.c to ip_vs_proto_ah_esp.c After integrating ESP into ip_vs_proto_ah, rename it (and the references to it) to ip_vs_proto_ah_esp.c and delete the old ip_vs_proto_esp.c. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/Makefile | 2 +- net/ipv4/ipvs/ip_vs_proto_ah.c | 203 ------------------------------------- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 203 +++++++++++++++++++++++++++++++++++++ net/ipv4/ipvs/ip_vs_proto_esp.c | 176 -------------------------------- 4 files changed, 204 insertions(+), 380 deletions(-) delete mode 100644 net/ipv4/ipvs/ip_vs_proto_ah.c create mode 100644 net/ipv4/ipvs/ip_vs_proto_ah_esp.c delete mode 100644 net/ipv4/ipvs/ip_vs_proto_esp.c (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile index cda3e0860d6..73a46fe1fe4 100644 --- a/net/ipv4/ipvs/Makefile +++ b/net/ipv4/ipvs/Makefile @@ -6,7 +6,7 @@ ip_vs_proto-objs-y := ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c deleted file mode 100644 index 3f9ebd7639a..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS - * - * Authors: Julian Anastasov , February 2002 - * Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation; - * - */ - -#include -#include -#include -#include -#include -#include - -#include - - -/* TODO: - -struct isakmp_hdr { - __u8 icookie[8]; - __u8 rcookie[8]; - __u8 np; - __u8 version; - __u8 xchgtype; - __u8 flags; - __u32 msgid; - __u32 length; -}; - -*/ - -#define PORT_ISAKMP 500 - - -static struct ip_vs_conn * -ah_esp_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - /* - * We are not sure if the packet is from our - * service, so our conn_schedule hook should return NF_ACCEPT - */ - IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static struct ip_vs_conn * -ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static int -ah_esp_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - /* - * AH/ESP is only related traffic. Pass the packet to IP stack. - */ - *verdict = NF_ACCEPT; - return 0; -} - - -static void -ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - - -static void ah_esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void ah_esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -#ifdef CONFIG_IP_VS_PROTO_AH -struct ip_vs_protocol ip_vs_protocol_ah = { - .name = "AH", - .protocol = IPPROTO_AH, - .num_states = 1, - .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, - .conn_schedule = ah_esp_conn_schedule, - .conn_in_get = ah_esp_conn_in_get, - .conn_out_get = ah_esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ - .set_state_timeout = NULL, -}; -#endif - -#ifdef CONFIG_IP_VS_PROTO_ESP -struct ip_vs_protocol ip_vs_protocol_esp = { - .name = "ESP", - .protocol = IPPROTO_ESP, - .num_states = 1, - .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, - .conn_schedule = ah_esp_conn_schedule, - .conn_in_get = ah_esp_conn_in_get, - .conn_out_get = ah_esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ -}; -#endif diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c new file mode 100644 index 00000000000..3f9ebd7639a --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c @@ -0,0 +1,203 @@ +/* + * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS + * + * Authors: Julian Anastasov , February 2002 + * Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include +#include +#include +#include +#include +#include + +#include + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + + +static struct ip_vs_conn * +ah_esp_conn_in_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->saddr, + htons(PORT_ISAKMP), + iph->daddr, + htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->daddr, + htons(PORT_ISAKMP), + iph->saddr, + htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? "ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->saddr, + htons(PORT_ISAKMP), + iph->daddr, + htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->daddr, + htons(PORT_ISAKMP), + iph->saddr, + htons(PORT_ISAKMP)); + } + + if (!cp) { + IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? "ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static int +ah_esp_conn_schedule(struct sk_buff *skb, + struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * AH/ESP is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + + +static void +ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + + +static void ah_esp_init(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +static void ah_esp_exit(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +#ifdef CONFIG_IP_VS_PROTO_AH +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ + .set_state_timeout = NULL, +}; +#endif + +#ifdef CONFIG_IP_VS_PROTO_ESP +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c deleted file mode 100644 index 21d70c8ffa5..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS - * - * Authors: Julian Anastasov , February 2002 - * Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation; - * - */ - -#include -#include -#include -#include -#include -#include - -#include - - -/* TODO: - -struct isakmp_hdr { - __u8 icookie[8]; - __u8 rcookie[8]; - __u8 np; - __u8 version; - __u8 xchgtype; - __u8 flags; - __u32 msgid; - __u32 length; -}; - -*/ - -#define PORT_ISAKMP 500 - - -static struct ip_vs_conn * -esp_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - /* - * We are not sure if the packet is from our - * service, so our conn_schedule hook should return NF_ACCEPT - */ - IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static struct ip_vs_conn * -esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static int -esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - /* - * ESP is only related traffic. Pass the packet to IP stack. - */ - *verdict = NF_ACCEPT; - return 0; -} - - -static void -esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - - -static void esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -struct ip_vs_protocol ip_vs_protocol_esp = { - .name = "ESP", - .protocol = IPPROTO_ESP, - .num_states = 1, - .dont_defrag = 1, - .init = esp_init, - .exit = esp_exit, - .conn_schedule = esp_conn_schedule, - .conn_in_get = esp_conn_in_get, - .conn_out_get = esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ -}; -- cgit v1.2.3 From fab0de02fb0da83b90cec7fce4294747d86d5c6f Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:32 +0200 Subject: IPVS: Add CONFIG_IP_VS_IPV6 option for IPv6 support Add boolean config option CONFIG_IP_VS_IPV6 for enabling experimental IPv6 support in IPVS. Only visible if IPv6 support is set to 'y' or both IPv6 and IPVS are modules. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig index 2e48a7e2722..794cecb249a 100644 --- a/net/ipv4/ipvs/Kconfig +++ b/net/ipv4/ipvs/Kconfig @@ -24,6 +24,14 @@ menuconfig IP_VS if IP_VS +config IP_VS_IPV6 + bool "IPv6 support for IPVS (DANGEROUS)" + depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) + ---help--- + Add IPv6 support to IPVS. This is incomplete and might be dangerous. + + Say N if unsure. + config IP_VS_DEBUG bool "IP virtual server debugging" ---help--- -- cgit v1.2.3 From e7ade46a53055c19a01c8becbe7807f9075d6fee Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:33 +0200 Subject: IPVS: Change IPVS data structures to support IPv6 addresses Introduce new 'af' fields into IPVS data structures for specifying an entry's address family. Convert IP addresses to be of type union nf_inet_addr. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_conn.c | 60 ++++++++++++++++++++--------------------- net/ipv4/ipvs/ip_vs_core.c | 28 +++++++++---------- net/ipv4/ipvs/ip_vs_ctl.c | 37 ++++++++++++------------- net/ipv4/ipvs/ip_vs_dh.c | 2 +- net/ipv4/ipvs/ip_vs_ftp.c | 18 ++++++------- net/ipv4/ipvs/ip_vs_lblc.c | 4 +-- net/ipv4/ipvs/ip_vs_lblcr.c | 8 +++--- net/ipv4/ipvs/ip_vs_lc.c | 2 +- net/ipv4/ipvs/ip_vs_nq.c | 2 +- net/ipv4/ipvs/ip_vs_proto_tcp.c | 16 +++++------ net/ipv4/ipvs/ip_vs_proto_udp.c | 12 ++++----- net/ipv4/ipvs/ip_vs_rr.c | 2 +- net/ipv4/ipvs/ip_vs_sed.c | 2 +- net/ipv4/ipvs/ip_vs_sh.c | 2 +- net/ipv4/ipvs/ip_vs_sync.c | 6 ++--- net/ipv4/ipvs/ip_vs_wlc.c | 2 +- net/ipv4/ipvs/ip_vs_wrr.c | 2 +- net/ipv4/ipvs/ip_vs_xmit.c | 12 ++++----- 18 files changed, 109 insertions(+), 108 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 44a6872dc24..639d4bc7fc1 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -131,7 +131,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) int ret; /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); + hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport); ct_write_lock(hash); @@ -162,7 +162,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) int ret; /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); + hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport); ct_write_lock(hash); @@ -197,10 +197,10 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr==cp->caddr && s_port==cp->cport && - d_port==cp->vport && d_addr==cp->vaddr && + if (s_addr == cp->caddr.ip && s_port == cp->cport && + d_port == cp->vport && d_addr == cp->vaddr.ip && ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - protocol==cp->protocol) { + protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); ct_read_unlock(hash); @@ -243,10 +243,10 @@ struct ip_vs_conn *ip_vs_ct_in_get ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr==cp->caddr && s_port==cp->cport && - d_port==cp->vport && d_addr==cp->vaddr && + if (s_addr == cp->caddr.ip && s_port == cp->cport && + d_port == cp->vport && d_addr == cp->vaddr.ip && cp->flags & IP_VS_CONN_F_TEMPLATE && - protocol==cp->protocol) { + protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); goto out; @@ -286,8 +286,8 @@ struct ip_vs_conn *ip_vs_conn_out_get ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (d_addr == cp->caddr && d_port == cp->cport && - s_port == cp->dport && s_addr == cp->daddr && + if (d_addr == cp->caddr.ip && d_port == cp->cport && + s_port == cp->dport && s_addr == cp->daddr.ip && protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); @@ -406,9 +406,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), + NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), + NIPQUAD(cp->daddr.ip), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, cp->flags, atomic_read(&cp->refcnt), atomic_read(&dest->refcnt)); @@ -444,8 +444,8 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) struct ip_vs_dest *dest; if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->daddr, cp->dport, - cp->vaddr, cp->vport, cp->protocol); + dest = ip_vs_find_dest(cp->daddr.ip, cp->dport, + cp->vaddr.ip, cp->vport, cp->protocol); ip_vs_bind_dest(cp, dest); return dest; } else @@ -468,9 +468,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), + NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), + NIPQUAD(cp->daddr.ip), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, cp->flags, atomic_read(&cp->refcnt), atomic_read(&dest->refcnt)); @@ -530,9 +530,9 @@ int ip_vs_check_template(struct ip_vs_conn *ct) "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " "-> d:%u.%u.%u.%u:%d\n", ip_vs_proto_name(ct->protocol), - NIPQUAD(ct->caddr), ntohs(ct->cport), - NIPQUAD(ct->vaddr), ntohs(ct->vport), - NIPQUAD(ct->daddr), ntohs(ct->dport)); + NIPQUAD(ct->caddr.ip), ntohs(ct->cport), + NIPQUAD(ct->vaddr.ip), ntohs(ct->vport), + NIPQUAD(ct->daddr.ip), ntohs(ct->dport)); /* * Invalidate the connection template @@ -641,11 +641,11 @@ ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport INIT_LIST_HEAD(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); cp->protocol = proto; - cp->caddr = caddr; + cp->caddr.ip = caddr; cp->cport = cport; - cp->vaddr = vaddr; + cp->vaddr.ip = vaddr; cp->vport = vport; - cp->daddr = daddr; + cp->daddr.ip = daddr; cp->dport = dport; cp->flags = flags; spin_lock_init(&cp->lock); @@ -763,9 +763,9 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr), ntohs(cp->cport), - ntohl(cp->vaddr), ntohs(cp->vport), - ntohl(cp->daddr), ntohs(cp->dport), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), (cp->timer.expires-jiffies)/HZ); } @@ -812,9 +812,9 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n", ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr), ntohs(cp->cport), - ntohl(cp->vaddr), ntohs(cp->vport), - ntohl(cp->daddr), ntohs(cp->dport), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 9fbf0a6d739..4a54f33b60d 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -232,14 +232,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, snet, 0, iph->daddr, ports[1], - dest->addr, dest->port, + dest->addr.ip, dest->port, IP_VS_CONN_F_TEMPLATE, dest); else ct = ip_vs_conn_new(iph->protocol, snet, 0, iph->daddr, 0, - dest->addr, 0, + dest->addr.ip, 0, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) @@ -286,14 +286,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, ct = ip_vs_conn_new(IPPROTO_IP, snet, 0, htonl(svc->fwmark), 0, - dest->addr, 0, + dest->addr.ip, 0, IP_VS_CONN_F_TEMPLATE, dest); else ct = ip_vs_conn_new(iph->protocol, snet, 0, iph->daddr, 0, - dest->addr, 0, + dest->addr.ip, 0, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) @@ -313,7 +313,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, cp = ip_vs_conn_new(iph->protocol, iph->saddr, ports[0], iph->daddr, ports[1], - dest->addr, dport, + dest->addr.ip, dport, 0, dest); if (cp == NULL) { @@ -380,7 +380,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) cp = ip_vs_conn_new(iph->protocol, iph->saddr, pptr[0], iph->daddr, pptr[1], - dest->addr, dest->port?dest->port:pptr[1], + dest->addr.ip, dest->port ? dest->port : pptr[1], 0, dest); if (cp == NULL) @@ -389,9 +389,9 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), + NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), + NIPQUAD(cp->daddr.ip), ntohs(cp->dport), cp->flags, atomic_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); @@ -526,14 +526,14 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct iphdr *ciph = (struct iphdr *)(icmph + 1); if (inout) { - iph->saddr = cp->vaddr; + iph->saddr = cp->vaddr.ip; ip_send_check(iph); - ciph->daddr = cp->vaddr; + ciph->daddr = cp->vaddr.ip; ip_send_check(ciph); } else { - iph->daddr = cp->daddr; + iph->daddr = cp->daddr.ip; ip_send_check(iph); - ciph->saddr = cp->daddr; + ciph->saddr = cp->daddr.ip; ip_send_check(ciph); } @@ -762,7 +762,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, /* mangle the packet */ if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) goto drop; - ip_hdr(skb)->saddr = cp->vaddr; + ip_hdr(skb)->saddr = cp->vaddr.ip; ip_send_check(ip_hdr(skb)); /* For policy routing, packets originating from this diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index ede101eeec1..3f2277b847d 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -317,7 +317,8 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* * Hash it by in ip_vs_svc_table */ - hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); + hash = ip_vs_svc_hashkey(svc->protocol, svc->addr.ip, + svc->port); list_add(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* @@ -373,7 +374,7 @@ __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport) hash = ip_vs_svc_hashkey(protocol, vaddr, vport); list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ - if ((svc->addr == vaddr) + if ((svc->addr.ip == vaddr) && (svc->port == vport) && (svc->protocol == protocol)) { /* HIT */ @@ -503,7 +504,7 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest) * Hash by proto,addr,port, * which are the parameters of the real service. */ - hash = ip_vs_rs_hashkey(dest->addr, dest->port); + hash = ip_vs_rs_hashkey(dest->addr.ip, dest->port); list_add(&dest->d_list, &ip_vs_rtable[hash]); return 1; @@ -543,7 +544,7 @@ ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) read_lock(&__ip_vs_rs_lock); list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { - if ((dest->addr == daddr) + if ((dest->addr.ip == daddr) && (dest->port == dport) && ((dest->protocol == protocol) || dest->vfwmark)) { @@ -569,7 +570,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) * Find the destination for the given service */ list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->addr == daddr) && (dest->port == dport)) { + if ((dest->addr.ip == daddr) && (dest->port == dport)) { /* HIT */ return dest; } @@ -626,14 +627,14 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, - NIPQUAD(dest->addr), ntohs(dest->port), + NIPQUAD(dest->addr.ip), ntohs(dest->port), atomic_read(&dest->refcnt)); - if (dest->addr == daddr && + if (dest->addr.ip == daddr && dest->port == dport && dest->vfwmark == svc->fwmark && dest->protocol == svc->protocol && (svc->fwmark || - (dest->vaddr == svc->addr && + (dest->vaddr.ip == svc->addr.ip && dest->vport == svc->port))) { /* HIT */ return dest; @@ -646,7 +647,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " "from trash\n", dest->vfwmark, - NIPQUAD(dest->addr), ntohs(dest->port)); + NIPQUAD(dest->addr.ip), ntohs(dest->port)); list_del(&dest->n_list); ip_vs_dst_reset(dest); __ip_vs_unbind_svc(dest); @@ -779,10 +780,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, } dest->protocol = svc->protocol; - dest->vaddr = svc->addr; + dest->vaddr.ip = svc->addr.ip; dest->vport = svc->port; dest->vfwmark = svc->fwmark; - dest->addr = udest->addr; + dest->addr.ip = udest->addr; dest->port = udest->port; atomic_set(&dest->activeconns, 0); @@ -847,7 +848,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) NIPQUAD(daddr), ntohs(dport), atomic_read(&dest->refcnt), dest->vfwmark, - NIPQUAD(dest->vaddr), + NIPQUAD(dest->vaddr.ip), ntohs(dest->vport)); __ip_vs_update_dest(svc, dest, udest); @@ -993,7 +994,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) } else { IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " "dest->refcnt=%d\n", - NIPQUAD(dest->addr), ntohs(dest->port), + NIPQUAD(dest->addr.ip), ntohs(dest->port), atomic_read(&dest->refcnt)); list_add(&dest->n_list, &ip_vs_dest_trash); atomic_inc(&dest->refcnt); @@ -1101,7 +1102,7 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) atomic_set(&svc->refcnt, 0); svc->protocol = u->protocol; - svc->addr = u->addr; + svc->addr.ip = u->addr; svc->port = u->port; svc->fwmark = u->fwmark; svc->flags = u->flags; @@ -1751,7 +1752,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) if (iter->table == ip_vs_svc_table) seq_printf(seq, "%s %08X:%04X %s ", ip_vs_proto_name(svc->protocol), - ntohl(svc->addr), + ntohl(svc->addr.ip), ntohs(svc->port), svc->scheduler->name); else @@ -1768,7 +1769,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) list_for_each_entry(dest, &svc->destinations, n_list) { seq_printf(seq, " -> %08X:%04X %-7s %-6d %-10d %-10d\n", - ntohl(dest->addr), ntohs(dest->port), + ntohl(dest->addr.ip), ntohs(dest->port), ip_vs_fwd_name(atomic_read(&dest->conn_flags)), atomic_read(&dest->weight), atomic_read(&dest->activeconns), @@ -2040,7 +2041,7 @@ static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { dst->protocol = src->protocol; - dst->addr = src->addr; + dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); @@ -2114,7 +2115,7 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, if (count >= get->num_dests) break; - entry.addr = dest->addr; + entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); entry.weight = atomic_read(&dest->weight); diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index fa66824d264..9f9d795dbd7 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -218,7 +218,7 @@ ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", NIPQUAD(iph->daddr), - NIPQUAD(dest->addr), + NIPQUAD(dest->addr.ip), ntohs(dest->port)); return dest; diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c index c1c758e4f73..bfe5d7050a5 100644 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -172,17 +172,17 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " "%u.%u.%u.%u:%d detected\n", - NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0); + NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr.ip), 0); /* * Now update or create an connection entry for it */ n_cp = ip_vs_conn_out_get(iph->protocol, from, port, - cp->caddr, 0); + cp->caddr.ip, 0); if (!n_cp) { n_cp = ip_vs_conn_new(IPPROTO_TCP, - cp->caddr, 0, - cp->vaddr, port, + cp->caddr.ip, 0, + cp->vaddr.ip, port, from, port, IP_VS_CONN_F_NO_CPORT, cp->dest); @@ -196,7 +196,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Replace the old passive address with the new one */ - from = n_cp->vaddr; + from = n_cp->vaddr.ip; port = n_cp->vport; sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), (ntohs(port)>>8)&255, ntohs(port)&255); @@ -306,16 +306,16 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, */ IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", ip_vs_proto_name(iph->protocol), - NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0); + NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); n_cp = ip_vs_conn_in_get(iph->protocol, to, port, - cp->vaddr, htons(ntohs(cp->vport)-1)); + cp->vaddr.ip, htons(ntohs(cp->vport)-1)); if (!n_cp) { n_cp = ip_vs_conn_new(IPPROTO_TCP, to, port, - cp->vaddr, htons(ntohs(cp->vport)-1), - cp->daddr, htons(ntohs(cp->dport)-1), + cp->vaddr.ip, htons(ntohs(cp->vport)-1), + cp->daddr.ip, htons(ntohs(cp->dport)-1), 0, cp->dest); if (!n_cp) diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index d2a43aa3fe4..69309edc0c4 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -422,7 +422,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); @@ -506,7 +506,7 @@ out: IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", NIPQUAD(iph->daddr), - NIPQUAD(dest->addr), + NIPQUAD(dest->addr.ip), ntohs(dest->port)); return dest; diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 375a1ffb6b6..51c746e2083 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -204,7 +204,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); @@ -250,7 +250,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(most->addr), ntohs(most->port), + NIPQUAD(most->addr.ip), ntohs(most->port), atomic_read(&most->activeconns), atomic_read(&most->refcnt), atomic_read(&most->weight), moh); @@ -598,7 +598,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); @@ -706,7 +706,7 @@ out: IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", NIPQUAD(iph->daddr), - NIPQUAD(dest->addr), + NIPQUAD(dest->addr.ip), ntohs(dest->port)); return dest; diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c index 2c3de1b6351..551d293347f 100644 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ b/net/ipv4/ipvs/ip_vs_lc.c @@ -68,7 +68,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) if (least) IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->inactconns)); diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c index 5330d5a2de1..aa0e32ad348 100644 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ b/net/ipv4/ipvs/ip_vs_nq.c @@ -101,7 +101,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) out: IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index d0ea467986a..15860e1441b 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -147,7 +147,7 @@ tcp_snat_handler(struct sk_buff *skb, /* Adjust TCP checksums */ if (!cp->app) { /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr, + tcp_fast_csum_update(tcph, cp->daddr.ip, cp->vaddr.ip, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -155,7 +155,7 @@ tcp_snat_handler(struct sk_buff *skb, /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, + tcph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", @@ -198,7 +198,7 @@ tcp_dnat_handler(struct sk_buff *skb, */ if (!cp->app) { /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr, + tcp_fast_csum_update(tcph, cp->vaddr.ip, cp->daddr.ip, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -206,7 +206,7 @@ tcp_dnat_handler(struct sk_buff *skb, /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, + tcph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -427,8 +427,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, th->fin? 'F' : '.', th->ack? 'A' : '.', th->rst? 'R' : '.', - NIPQUAD(cp->daddr), ntohs(cp->dport), - NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->daddr.ip), ntohs(cp->dport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), tcp_state_name(cp->state), tcp_state_name(new_state), atomic_read(&cp->refcnt)); @@ -549,8 +549,8 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" "%u.%u.%u.%u:%u to app %s on port %u\n", __func__, - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), + NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index c6be5d56823..8dfad5db829 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -158,7 +158,7 @@ udp_snat_handler(struct sk_buff *skb, */ if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->daddr, cp->vaddr, + udp_fast_csum_update(udph, cp->daddr.ip, cp->vaddr.ip, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -166,7 +166,7 @@ udp_snat_handler(struct sk_buff *skb, /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, + udph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, skb->len - udphoff, cp->protocol, skb->csum); if (udph->check == 0) @@ -211,7 +211,7 @@ udp_dnat_handler(struct sk_buff *skb, */ if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->vaddr, cp->daddr, + udp_fast_csum_update(udph, cp->vaddr.ip, cp->daddr.ip, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -219,7 +219,7 @@ udp_dnat_handler(struct sk_buff *skb, /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, + udph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, skb->len - udphoff, cp->protocol, skb->csum); if (udph->check == 0) @@ -343,8 +343,8 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" "%u.%u.%u.%u:%u to app %s on port %u\n", __func__, - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), + NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c index f7492911753..27f0b624283 100644 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ b/net/ipv4/ipvs/ip_vs_rr.c @@ -76,7 +76,7 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) write_unlock(&svc->sched_lock); IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr), ntohs(dest->port), + NIPQUAD(dest->addr.ip), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c index 53f73bea66c..38b574b2fdf 100644 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ b/net/ipv4/ipvs/ip_vs_sed.c @@ -103,7 +103,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 7b979e22805..c9e54e2ec29 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -215,7 +215,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", NIPQUAD(iph->saddr), - NIPQUAD(dest->addr), + NIPQUAD(dest->addr.ip), ntohs(dest->port)); return dest; diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index a652da2c320..2cf47b2e166 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -256,9 +256,9 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) s->cport = cp->cport; s->vport = cp->vport; s->dport = cp->dport; - s->caddr = cp->caddr; - s->vaddr = cp->vaddr; - s->daddr = cp->daddr; + s->caddr = cp->caddr.ip; + s->vaddr = cp->vaddr.ip; + s->daddr = cp->daddr.ip; s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); s->state = htons(cp->state); if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c index df7ad8d7476..09fd993040f 100644 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ b/net/ipv4/ipvs/ip_vs_wlc.c @@ -91,7 +91,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c index 0d86a79b87b..19c49b234f3 100644 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ b/net/ipv4/ipvs/ip_vs_wrr.c @@ -197,7 +197,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr), ntohs(dest->port), + NIPQUAD(dest->addr.ip), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index 9892d4aca42..88199c9f2d3 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -71,7 +71,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .oif = 0, .nl_u = { .ip4_u = { - .daddr = dest->addr, + .daddr = dest->addr.ip, .saddr = 0, .tos = rtos, } }, }; @@ -80,12 +80,12 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) spin_unlock(&dest->dst_lock); IP_VS_DBG_RL("ip_route_output error, " "dest: %u.%u.%u.%u\n", - NIPQUAD(dest->addr)); + NIPQUAD(dest->addr.ip)); return NULL; } __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", - NIPQUAD(dest->addr), + NIPQUAD(dest->addr.ip), atomic_read(&rt->u.dst.__refcnt), rtos); } spin_unlock(&dest->dst_lock); @@ -94,14 +94,14 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .oif = 0, .nl_u = { .ip4_u = { - .daddr = cp->daddr, + .daddr = cp->daddr.ip, .saddr = 0, .tos = rtos, } }, }; if (ip_route_output_key(&init_net, &rt, &fl)) { IP_VS_DBG_RL("ip_route_output error, dest: " - "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); + "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip)); return NULL; } } @@ -264,7 +264,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) goto tx_error; - ip_hdr(skb)->daddr = cp->daddr; + ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); -- cgit v1.2.3 From c860c6b1479992440e4962e9c95d258bfdce4fca Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:36 +0200 Subject: IPVS: Add internal versions of sockopt interface structs Add extended internal versions of struct ip_vs_service_user and struct ip_vs_dest_user (the originals can't be modified as they are part of the old sockopt interface). Adjust ip_vs_ctl.c to work with the new data structures and add some minor AF-awareness. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 138 ++++++++++++++++++++++++++++++---------------- 1 file changed, 90 insertions(+), 48 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 3f2277b847d..a0c8b7bb553 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -708,7 +708,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) */ static void __ip_vs_update_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, struct ip_vs_dest_user *udest) + struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) { int conn_flags; @@ -717,7 +717,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; /* check if local node and update the flags */ - if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) { + if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) | IP_VS_CONN_F_LOCALNODE; } @@ -761,7 +761,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, * Create a destination for the given service */ static int -ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, struct ip_vs_dest **dest_p) { struct ip_vs_dest *dest; @@ -769,7 +769,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, EnterFunction(2); - atype = inet_addr_type(&init_net, udest->addr); + atype = inet_addr_type(&init_net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; @@ -779,11 +779,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, return -ENOMEM; } + dest->af = svc->af; dest->protocol = svc->protocol; - dest->vaddr.ip = svc->addr.ip; + dest->vaddr = svc->addr; dest->vport = svc->port; dest->vfwmark = svc->fwmark; - dest->addr.ip = udest->addr; + ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); dest->port = udest->port; atomic_set(&dest->activeconns, 0); @@ -808,10 +809,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, * Add a destination into an existing service */ static int -ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - __be32 daddr = udest->addr; + union nf_inet_addr daddr; __be16 dport = udest->port; int ret; @@ -828,10 +829,12 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) return -ERANGE; } + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + /* * Check if the dest already exists in the list */ - dest = ip_vs_lookup_dest(svc, daddr, dport); + dest = ip_vs_lookup_dest(svc, daddr.ip, dport); if (dest != NULL) { IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); return -EEXIST; @@ -841,7 +844,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) * Check if the dest already exists in the trash and * is from the same service */ - dest = ip_vs_trash_get_dest(svc, daddr, dport); + dest = ip_vs_trash_get_dest(svc, daddr.ip, dport); if (dest != NULL) { IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", @@ -916,10 +919,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) * Edit a destination in the given service */ static int -ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - __be32 daddr = udest->addr; + union nf_inet_addr daddr; __be16 dport = udest->port; EnterFunction(2); @@ -935,10 +938,12 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) return -ERANGE; } + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + /* * Lookup the destination list */ - dest = ip_vs_lookup_dest(svc, daddr, dport); + dest = ip_vs_lookup_dest(svc, daddr.ip, dport); if (dest == NULL) { IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); return -ENOENT; @@ -1029,15 +1034,15 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, * Delete a destination server in the given service */ static int -ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) +ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - __be32 daddr = udest->addr; __be16 dport = udest->port; EnterFunction(2); - dest = ip_vs_lookup_dest(svc, daddr, dport); + dest = ip_vs_lookup_dest(svc, udest->addr.ip, dport); + if (dest == NULL) { IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); return -ENOENT; @@ -1072,7 +1077,8 @@ ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) * Add a service into the service hash table */ static int -ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) +ip_vs_add_service(struct ip_vs_service_user_kern *u, + struct ip_vs_service **svc_p) { int ret = 0; struct ip_vs_scheduler *sched = NULL; @@ -1101,8 +1107,9 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) atomic_set(&svc->usecnt, 1); atomic_set(&svc->refcnt, 0); + svc->af = u->af; svc->protocol = u->protocol; - svc->addr.ip = u->addr; + ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); svc->port = u->port; svc->fwmark = u->fwmark; svc->flags = u->flags; @@ -1161,7 +1168,7 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) * Edit a service and bind it with a new scheduler */ static int -ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u) +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { struct ip_vs_scheduler *sched, *old_sched; int ret = 0; @@ -1905,14 +1912,44 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, }; +static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, + struct ip_vs_service_user *usvc_compat) +{ + usvc->af = AF_INET; + usvc->protocol = usvc_compat->protocol; + usvc->addr.ip = usvc_compat->addr; + usvc->port = usvc_compat->port; + usvc->fwmark = usvc_compat->fwmark; + + /* Deep copy of sched_name is not needed here */ + usvc->sched_name = usvc_compat->sched_name; + + usvc->flags = usvc_compat->flags; + usvc->timeout = usvc_compat->timeout; + usvc->netmask = usvc_compat->netmask; +} + +static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest_user *udest_compat) +{ + udest->addr.ip = udest_compat->addr; + udest->port = udest_compat->port; + udest->conn_flags = udest_compat->conn_flags; + udest->weight = udest_compat->weight; + udest->u_threshold = udest_compat->u_threshold; + udest->l_threshold = udest_compat->l_threshold; +} + static int do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; unsigned char arg[MAX_ARG_LEN]; - struct ip_vs_service_user *usvc; + struct ip_vs_service_user *usvc_compat; + struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; - struct ip_vs_dest_user *udest; + struct ip_vs_dest_user *udest_compat; + struct ip_vs_dest_user_kern udest; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1952,35 +1989,40 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) goto out_unlock; } - usvc = (struct ip_vs_service_user *)arg; - udest = (struct ip_vs_dest_user *)(usvc + 1); + usvc_compat = (struct ip_vs_service_user *)arg; + udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); + + /* We only use the new structs internally, so copy userspace compat + * structs to extended internal versions */ + ip_vs_copy_usvc_compat(&usvc, usvc_compat); + ip_vs_copy_udest_compat(&udest, udest_compat); if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ - if (!usvc->fwmark && !usvc->addr && !usvc->port) { + if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { ret = ip_vs_zero_all(); goto out_unlock; } } /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ - if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) { + if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) { IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", - usvc->protocol, NIPQUAD(usvc->addr), - ntohs(usvc->port), usvc->sched_name); + usvc.protocol, NIPQUAD(usvc.addr.ip), + ntohs(usvc.port), usvc.sched_name); ret = -EFAULT; goto out_unlock; } /* Lookup the exact service by or fwmark */ - if (usvc->fwmark == 0) - svc = __ip_vs_service_get(usvc->protocol, - usvc->addr, usvc->port); + if (usvc.fwmark == 0) + svc = __ip_vs_service_get(usvc.protocol, + usvc.addr.ip, usvc.port); else - svc = __ip_vs_svc_fwm_get(usvc->fwmark); + svc = __ip_vs_svc_fwm_get(usvc.fwmark); if (cmd != IP_VS_SO_SET_ADD - && (svc == NULL || svc->protocol != usvc->protocol)) { + && (svc == NULL || svc->protocol != usvc.protocol)) { ret = -ESRCH; goto out_unlock; } @@ -1990,10 +2032,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (svc != NULL) ret = -EEXIST; else - ret = ip_vs_add_service(usvc, &svc); + ret = ip_vs_add_service(&usvc, &svc); break; case IP_VS_SO_SET_EDIT: - ret = ip_vs_edit_service(svc, usvc); + ret = ip_vs_edit_service(svc, &usvc); break; case IP_VS_SO_SET_DEL: ret = ip_vs_del_service(svc); @@ -2004,13 +2046,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = ip_vs_zero_service(svc); break; case IP_VS_SO_SET_ADDDEST: - ret = ip_vs_add_dest(svc, udest); + ret = ip_vs_add_dest(svc, &udest); break; case IP_VS_SO_SET_EDITDEST: - ret = ip_vs_edit_dest(svc, udest); + ret = ip_vs_edit_dest(svc, &udest); break; case IP_VS_SO_SET_DELDEST: - ret = ip_vs_del_dest(svc, udest); + ret = ip_vs_del_dest(svc, &udest); break; default: ret = -EINVAL; @@ -2517,7 +2559,7 @@ nla_put_failure: return skb->len; } -static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, +static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry) { struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; @@ -2537,6 +2579,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) return -EINVAL; + usvc->af = nla_get_u16(nla_af); /* For now, only support IPv4 */ if (nla_get_u16(nla_af) != AF_INET) return -EAFNOSUPPORT; @@ -2572,7 +2615,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, if (usvc->fwmark) svc = __ip_vs_svc_fwm_get(usvc->fwmark); else - svc = __ip_vs_service_get(usvc->protocol, usvc->addr, + svc = __ip_vs_service_get(usvc->protocol, usvc->addr.ip, usvc->port); if (svc) { usvc->flags = svc->flags; @@ -2583,9 +2626,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, /* set new flags from userland */ usvc->flags = (usvc->flags & ~flags.mask) | (flags.flags & flags.mask); - - strlcpy(usvc->sched_name, nla_data(nla_sched), - sizeof(usvc->sched_name)); + usvc->sched_name = nla_data(nla_sched); usvc->timeout = nla_get_u32(nla_timeout); usvc->netmask = nla_get_u32(nla_netmask); } @@ -2595,7 +2636,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) { - struct ip_vs_service_user usvc; + struct ip_vs_service_user_kern usvc; int ret; ret = ip_vs_genl_parse_service(&usvc, nla, 0); @@ -2605,7 +2646,7 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) if (usvc.fwmark) return __ip_vs_svc_fwm_get(usvc.fwmark); else - return __ip_vs_service_get(usvc.protocol, usvc.addr, + return __ip_vs_service_get(usvc.protocol, usvc.addr.ip, usvc.port); } @@ -2705,7 +2746,7 @@ out_err: return skb->len; } -static int ip_vs_genl_parse_dest(struct ip_vs_dest_user *udest, +static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, struct nlattr *nla, int full_entry) { struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; @@ -2861,8 +2902,8 @@ static int ip_vs_genl_set_config(struct nlattr **attrs) static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_service *svc = NULL; - struct ip_vs_service_user usvc; - struct ip_vs_dest_user udest; + struct ip_vs_service_user_kern usvc; + struct ip_vs_dest_user_kern udest; int ret = 0, cmd; int need_full_svc = 0, need_full_dest = 0; @@ -2914,7 +2955,8 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) /* Lookup the exact service by or fwmark */ if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.protocol, usvc.addr, usvc.port); + svc = __ip_vs_service_get(usvc.protocol, usvc.addr.ip, + usvc.port); else svc = __ip_vs_svc_fwm_get(usvc.fwmark); -- cgit v1.2.3 From b18610de9ec2728159f723a9b864ca78a5774193 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:37 +0200 Subject: IPVS: Convert __ip_vs_svc_get() and __ip_vs_fwm_get() Add support for getting services based on their address family to __ip_vs_service_get(), __ip_vs_fwm_get() and the helper hash function ip_vs_svc_hashkey(). Adjust the callers. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 79 ++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 32 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index a0c8b7bb553..a2d69b2ce6a 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -282,11 +282,19 @@ static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); * Returns hash value for virtual service */ static __inline__ unsigned -ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port) +ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, + __be16 port) { register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; - return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) & IP_VS_SVC_TAB_MASK; } @@ -317,7 +325,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* * Hash it by in ip_vs_svc_table */ - hash = ip_vs_svc_hashkey(svc->protocol, svc->addr.ip, + hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, svc->port); list_add(&svc->s_list, &ip_vs_svc_table[hash]); } else { @@ -364,17 +372,19 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) /* * Get service by {proto,addr,port} in the service table. */ -static __inline__ struct ip_vs_service * -__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport) +static inline struct ip_vs_service * +__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, + __be16 vport) { unsigned hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(protocol, vaddr, vport); + hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ - if ((svc->addr.ip == vaddr) + if ((svc->af == af) + && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) && (svc->protocol == protocol)) { /* HIT */ @@ -390,7 +400,8 @@ __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport) /* * Get service by {fwmark} in the service table. */ -static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) +static inline struct ip_vs_service * +__ip_vs_svc_fwm_get(int af, __u32 fwmark) { unsigned hash; struct ip_vs_service *svc; @@ -399,7 +410,7 @@ static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) hash = ip_vs_svc_fwm_hashkey(fwmark); list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark) { + if (svc->fwmark == fwmark && svc->af == af) { /* HIT */ atomic_inc(&svc->usecnt); return svc; @@ -413,20 +424,20 @@ struct ip_vs_service * ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) { struct ip_vs_service *svc; - + union nf_inet_addr _vaddr = { .ip = vaddr }; read_lock(&__ip_vs_svc_lock); /* * Check the table hashed by fwmark first */ - if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark))) + if (fwmark && (svc = __ip_vs_svc_fwm_get(AF_INET, fwmark))) goto out; /* * Check the table hashed by * for "full" addressed entries */ - svc = __ip_vs_service_get(protocol, vaddr, vport); + svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, vport); if (svc == NULL && protocol == IPPROTO_TCP @@ -436,7 +447,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ - svc = __ip_vs_service_get(protocol, vaddr, FTPPORT); + svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, FTPPORT); } if (svc == NULL @@ -444,7 +455,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) /* * Check if the catch-all port (port zero) exists */ - svc = __ip_vs_service_get(protocol, vaddr, 0); + svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, 0); } out: @@ -2016,10 +2027,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* Lookup the exact service by or fwmark */ if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.protocol, - usvc.addr.ip, usvc.port); + svc = __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); else - svc = __ip_vs_svc_fwm_get(usvc.fwmark); + svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { @@ -2141,13 +2152,15 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; + union nf_inet_addr addr = { .ip = get->addr }; int ret = 0; if (get->fwmark) - svc = __ip_vs_svc_fwm_get(get->fwmark); + svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); else - svc = __ip_vs_service_get(get->protocol, - get->addr, get->port); + svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, + get->port); + if (svc) { int count = 0; struct ip_vs_dest *dest; @@ -2282,13 +2295,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_service_entry *entry; struct ip_vs_service *svc; + union nf_inet_addr addr; entry = (struct ip_vs_service_entry *)arg; + addr.ip = entry->addr; if (entry->fwmark) - svc = __ip_vs_svc_fwm_get(entry->fwmark); + svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); else - svc = __ip_vs_service_get(entry->protocol, - entry->addr, entry->port); + svc = __ip_vs_service_get(AF_INET, entry->protocol, + &addr, entry->port); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) @@ -2613,10 +2628,10 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, /* prefill flags from service if it already exists */ if (usvc->fwmark) - svc = __ip_vs_svc_fwm_get(usvc->fwmark); + svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark); else - svc = __ip_vs_service_get(usvc->protocol, usvc->addr.ip, - usvc->port); + svc = __ip_vs_service_get(usvc->af, usvc->protocol, + &usvc->addr, usvc->port); if (svc) { usvc->flags = svc->flags; ip_vs_service_put(svc); @@ -2644,10 +2659,10 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) return ERR_PTR(ret); if (usvc.fwmark) - return __ip_vs_svc_fwm_get(usvc.fwmark); + return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); else - return __ip_vs_service_get(usvc.protocol, usvc.addr.ip, - usvc.port); + return __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); } static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) @@ -2955,10 +2970,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) /* Lookup the exact service by or fwmark */ if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.protocol, usvc.addr.ip, - usvc.port); + svc = __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); else - svc = __ip_vs_svc_fwm_get(usvc.fwmark); + svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); /* Unless we're adding a new service, the service must already exist */ if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { -- cgit v1.2.3 From 3c2e0505d25cdc9425336f167fd4ff5f505aecff Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:38 +0200 Subject: IPVS: Add v6 support to ip_vs_service_get() Add support for selecting services based on their address family to ip_vs_service_get() and adjust the callers. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 24 +++++++++++++----------- net/ipv4/ipvs/ip_vs_proto_tcp.c | 9 ++++++--- net/ipv4/ipvs/ip_vs_proto_udp.c | 11 +++++++---- 3 files changed, 26 insertions(+), 18 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index a2d69b2ce6a..1f3fc66e694 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -421,23 +421,24 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark) } struct ip_vs_service * -ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) +ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; - union nf_inet_addr _vaddr = { .ip = vaddr }; + read_lock(&__ip_vs_svc_lock); /* * Check the table hashed by fwmark first */ - if (fwmark && (svc = __ip_vs_svc_fwm_get(AF_INET, fwmark))) + if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) goto out; /* * Check the table hashed by * for "full" addressed entries */ - svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, vport); + svc = __ip_vs_service_get(af, protocol, vaddr, vport); if (svc == NULL && protocol == IPPROTO_TCP @@ -447,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ - svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, FTPPORT); + svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); } if (svc == NULL @@ -455,16 +456,16 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) /* * Check if the catch-all port (port zero) exists */ - svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, 0); + svc = __ip_vs_service_get(af, protocol, vaddr, 0); } out: read_unlock(&__ip_vs_svc_lock); - IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", - fwmark, ip_vs_proto_name(protocol), - NIPQUAD(vaddr), ntohs(vport), - svc?"hit":"not hit"); + IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), + svc ? "hit" : "not hit"); return svc; } @@ -605,8 +606,9 @@ struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, { struct ip_vs_dest *dest; struct ip_vs_service *svc; + union nf_inet_addr _vaddr = { .ip = vaddr }; - svc = ip_vs_service_get(0, protocol, vaddr, vport); + svc = ip_vs_service_get(AF_INET, 0, protocol, &_vaddr, vport); if (!svc) return NULL; dest = ip_vs_lookup_dest(svc, daddr, dport); diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 15860e1441b..fe93c9e6ff6 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -74,16 +74,19 @@ tcp_conn_schedule(struct sk_buff *skb, { struct ip_vs_service *svc; struct tcphdr _tcph, *th; + struct ip_vs_iphdr iph; - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + + th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); if (th == NULL) { *verdict = NF_DROP; return 0; } if (th->syn && - (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, - ip_hdr(skb)->daddr, th->dest))) { + (svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol, + &iph.daddr, th->dest))) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 8dfad5db829..d208ed6eb9f 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -80,16 +80,19 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, { struct ip_vs_service *svc; struct udphdr _udph, *uh; + struct ip_vs_iphdr iph; - uh = skb_header_pointer(skb, ip_hdrlen(skb), - sizeof(_udph), &_udph); + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + + uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); if (uh == NULL) { *verdict = NF_DROP; return 0; } - if ((svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, - ip_hdr(skb)->daddr, uh->dest))) { + svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol, + &iph.daddr, uh->dest); + if (svc) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. -- cgit v1.2.3 From b14198f6c1bea1687d20723db35d8effecd9d899 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:39 +0200 Subject: IPVS: Add IPv6 support flag to schedulers Add 'supports_ipv6' flag to struct ip_vs_scheduler to indicate whether a scheduler supports IPv6. Set the flag to 1 in schedulers that work with IPv6, 0 otherwise. This flag is checked in a later patch while trying to add a service with a specific scheduler. Adjust debug in v6-supporting schedulers to work with both address families. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_dh.c | 3 +++ net/ipv4/ipvs/ip_vs_lblc.c | 3 +++ net/ipv4/ipvs/ip_vs_lblcr.c | 3 +++ net/ipv4/ipvs/ip_vs_lc.c | 11 +++++++---- net/ipv4/ipvs/ip_vs_nq.c | 15 +++++++++------ net/ipv4/ipvs/ip_vs_rr.c | 13 ++++++++----- net/ipv4/ipvs/ip_vs_sed.c | 15 +++++++++------ net/ipv4/ipvs/ip_vs_sh.c | 3 +++ net/ipv4/ipvs/ip_vs_wlc.c | 15 +++++++++------ net/ipv4/ipvs/ip_vs_wrr.c | 15 +++++++++------ 10 files changed, 63 insertions(+), 33 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index 9f9d795dbd7..a16943fd72f 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -234,6 +234,9 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif .init_service = ip_vs_dh_init_svc, .done_service = ip_vs_dh_done_svc, .update_service = ip_vs_dh_update_svc, diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 69309edc0c4..6ecef3518ca 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -522,6 +522,9 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif .init_service = ip_vs_lblc_init_svc, .done_service = ip_vs_lblc_done_svc, .schedule = ip_vs_lblc_schedule, diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 51c746e2083..1f75ea83bcf 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -722,6 +722,9 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif .init_service = ip_vs_lblcr_init_svc, .done_service = ip_vs_lblcr_done_svc, .schedule = ip_vs_lblcr_schedule, diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c index 551d293347f..b69f808ac46 100644 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ b/net/ipv4/ipvs/ip_vs_lc.c @@ -67,10 +67,10 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } if (least) - IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->inactconns)); + IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); return least; } @@ -81,6 +81,9 @@ static struct ip_vs_scheduler ip_vs_lc_scheduler = { .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .schedule = ip_vs_lc_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c index aa0e32ad348..9a2d8033f08 100644 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ b/net/ipv4/ipvs/ip_vs_nq.c @@ -99,12 +99,12 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) return NULL; out: - IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); + IP_VS_DBG_BUF(6, "NQ: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); return least; } @@ -116,6 +116,9 @@ static struct ip_vs_scheduler ip_vs_nq_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .schedule = ip_vs_nq_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c index 27f0b624283..a22195f68ac 100644 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ b/net/ipv4/ipvs/ip_vs_rr.c @@ -74,11 +74,11 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) out: svc->sched_data = q; write_unlock(&svc->sched_lock); - IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr.ip), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + IP_VS_DBG_BUF(6, "RR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); return dest; } @@ -89,6 +89,9 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = { .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .init_service = ip_vs_rr_init_svc, .update_service = ip_vs_rr_update_svc, .schedule = ip_vs_rr_schedule, diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c index 38b574b2fdf..7d2f22f04b8 100644 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ b/net/ipv4/ipvs/ip_vs_sed.c @@ -101,12 +101,12 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } } - IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); + IP_VS_DBG_BUF(6, "SED: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); return least; } @@ -118,6 +118,9 @@ static struct ip_vs_scheduler ip_vs_sed_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .schedule = ip_vs_sed_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index c9e54e2ec29..1d96de27fef 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -231,6 +231,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif .init_service = ip_vs_sh_init_svc, .done_service = ip_vs_sh_done_svc, .update_service = ip_vs_sh_update_svc, diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c index 09fd993040f..8c596e71259 100644 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ b/net/ipv4/ipvs/ip_vs_wlc.c @@ -89,12 +89,12 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } } - IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); + IP_VS_DBG_BUF(6, "WLC: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); return least; } @@ -106,6 +106,9 @@ static struct ip_vs_scheduler ip_vs_wlc_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .schedule = ip_vs_wlc_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c index 19c49b234f3..7ea92fed50b 100644 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ b/net/ipv4/ipvs/ip_vs_wrr.c @@ -195,12 +195,12 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } } - IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr.ip), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), - atomic_read(&dest->weight)); + IP_VS_DBG_BUF(6, "WRR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); out: write_unlock(&svc->sched_lock); @@ -213,6 +213,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = { .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .init_service = ip_vs_wrr_init_svc, .done_service = ip_vs_wrr_done_svc, .update_service = ip_vs_wrr_update_svc, -- cgit v1.2.3 From 51ef348b14183789e4cb3444d05ce83b1b69d8fb Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:40 +0200 Subject: IPVS: Add 'af' args to protocol handler functions Add 'af' arguments to conn_schedule(), conn_in_get(), conn_out_get() and csum_check() function pointers in struct ip_vs_protocol. Extend the respective functions for TCP, UDP, AH and ESP and adjust the callers. The changes in the callers need to be somewhat extensive, since they now need to pass a filled out struct ip_vs_iphdr * to the modified functions instead of a struct iphdr *. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 64 +++++++++++++++--------------- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 56 +++++++++++++------------- net/ipv4/ipvs/ip_vs_proto_tcp.c | 79 ++++++++++++++++++++++++------------- net/ipv4/ipvs/ip_vs_proto_udp.c | 81 ++++++++++++++++++++++++-------------- 4 files changed, 162 insertions(+), 118 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 4a54f33b60d..34aaa1480d9 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -572,6 +572,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl, verdict; @@ -627,8 +628,9 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) offset += cih->ihl * 4; + ip_vs_fill_iphdr(AF_INET, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(skb, pp, cih, offset, 1); + cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); if (!cp) return NF_ACCEPT; @@ -686,43 +688,41 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct iphdr *iph; + struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; - int ihl; EnterFunction(11); if (skb->ipvs_property) return NF_ACCEPT; - iph = ip_hdr(skb); - if (unlikely(iph->protocol == IPPROTO_ICMP)) { + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related, verdict = ip_vs_out_icmp(skb, &related); if (related) return verdict; - iph = ip_hdr(skb); + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); } - pp = ip_vs_proto_get(iph->protocol); + pp = ip_vs_proto_get(iph.protocol); if (unlikely(!pp)) return NF_ACCEPT; /* reassemble IP fragments */ - if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) && + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && !pp->dont_defrag)) { if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) return NF_STOLEN; - iph = ip_hdr(skb); - } - ihl = iph->ihl << 2; + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + } /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(skb, pp, iph, ihl, 0); + cp = pp->conn_out_get(AF_INET, skb, pp, &iph, iph.len, 0); if (unlikely(!cp)) { if (sysctl_ip_vs_nat_icmp_send && @@ -730,18 +730,18 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, pp->protocol == IPPROTO_UDP)) { __be16 _ports[2], *pptr; - pptr = skb_header_pointer(skb, ihl, + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(iph->protocol, - iph->saddr, pptr[0])) { + if (ip_vs_lookup_real_service(iph.protocol, + iph.saddr.ip, pptr[0])) { /* * Notify the real server: there is no * existing entry if it is not RST * packet or not TCP packet. */ - if (iph->protocol != IPPROTO_TCP + if (iph.protocol != IPPROTO_TCP || !is_tcp_reset(skb)) { icmp_send(skb,ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); @@ -756,7 +756,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); - if (!skb_make_writable(skb, ihl)) + if (!skb_make_writable(skb, iph.len)) goto drop; /* mangle the packet */ @@ -804,6 +804,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl, verdict; @@ -860,8 +861,9 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) offset += cih->ihl * 4; + ip_vs_fill_iphdr(AF_INET, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(skb, pp, cih, offset, 1); + cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); if (!cp) return NF_ACCEPT; @@ -897,11 +899,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct iphdr *iph; + struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; int ret, restart; - int ihl; + + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); /* * Big tappo: only PACKET_HOST (neither loopback nor mcasts) @@ -909,38 +912,35 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, */ if (unlikely(skb->pkt_type != PACKET_HOST || skb->dev->flags & IFF_LOOPBACK || skb->sk)) { - IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", - skb->pkt_type, - ip_hdr(skb)->protocol, - NIPQUAD(ip_hdr(skb)->daddr)); + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", + skb->pkt_type, + iph.protocol, + IP_VS_DBG_ADDR(AF_INET, &iph.daddr)); return NF_ACCEPT; } - iph = ip_hdr(skb); - if (unlikely(iph->protocol == IPPROTO_ICMP)) { + if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); if (related) return verdict; - iph = ip_hdr(skb); + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); } /* Protocol supported? */ - pp = ip_vs_proto_get(iph->protocol); + pp = ip_vs_proto_get(iph.protocol); if (unlikely(!pp)) return NF_ACCEPT; - ihl = iph->ihl << 2; - /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(skb, pp, iph, ihl, 0); + cp = pp->conn_in_get(AF_INET, skb, pp, &iph, iph.len, 0); if (unlikely(!cp)) { int v; - if (!pp->conn_schedule(skb, pp, &v, &cp)) + if (!pp->conn_schedule(AF_INET, skb, pp, &v, &cp)) return v; } diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c index 3f9ebd7639a..2a361a99174 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c @@ -39,25 +39,23 @@ struct isakmp_hdr { static struct ip_vs_conn * -ah_esp_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, +ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { struct ip_vs_conn *cp; if (likely(!inverse)) { cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr, + iph->saddr.ip, htons(PORT_ISAKMP), - iph->daddr, + iph->daddr.ip, htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr, + iph->daddr.ip, htons(PORT_ISAKMP), - iph->saddr, + iph->saddr.ip, htons(PORT_ISAKMP)); } @@ -66,12 +64,12 @@ ah_esp_conn_in_get(const struct sk_buff *skb, * We are not sure if the packet is from our * service, so our conn_schedule hook should return NF_ACCEPT */ - IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + pp->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); } return cp; @@ -79,32 +77,35 @@ ah_esp_conn_in_get(const struct sk_buff *skb, static struct ip_vs_conn * -ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +ah_esp_conn_out_get(int af, const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, + int inverse) { struct ip_vs_conn *cp; if (likely(!inverse)) { cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr, + iph->saddr.ip, htons(PORT_ISAKMP), - iph->daddr, + iph->daddr.ip, htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr, + iph->daddr.ip, htons(PORT_ISAKMP), - iph->saddr, + iph->saddr.ip, htons(PORT_ISAKMP)); } if (!cp) { - IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + pp->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); } return cp; @@ -112,8 +113,7 @@ ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, static int -ah_esp_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) { /* diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index fe93c9e6ff6..9211afa8f30 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -25,8 +25,9 @@ static struct ip_vs_conn * -tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) { __be16 _ports[2], *pptr; @@ -36,18 +37,19 @@ tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, if (likely(!inverse)) { return ip_vs_conn_in_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); + iph->saddr.ip, pptr[0], + iph->daddr.ip, pptr[1]); } else { return ip_vs_conn_in_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); + iph->daddr.ip, pptr[1], + iph->saddr.ip, pptr[0]); } } static struct ip_vs_conn * -tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) { __be16 _ports[2], *pptr; @@ -57,26 +59,25 @@ tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, if (likely(!inverse)) { return ip_vs_conn_out_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); + iph->saddr.ip, pptr[0], + iph->daddr.ip, pptr[1]); } else { return ip_vs_conn_out_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); + iph->daddr.ip, pptr[1], + iph->saddr.ip, pptr[0]); } } static int -tcp_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) { struct ip_vs_service *svc; struct tcphdr _tcph, *th; struct ip_vs_iphdr iph; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); if (th == NULL) { @@ -85,8 +86,8 @@ tcp_conn_schedule(struct sk_buff *skb, } if (th->syn && - (svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol, - &iph.daddr, th->dest))) { + (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, + th->dest))) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -136,7 +137,7 @@ tcp_snat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) + if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) return 0; /* Call application helper if needed */ @@ -182,7 +183,7 @@ tcp_dnat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) + if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) return 0; /* @@ -219,21 +220,43 @@ tcp_dnat_handler(struct sk_buff *skb, static int -tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { - const unsigned int tcphoff = ip_hdrlen(skb); + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - skb->len - tcphoff, - ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - tcphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - tcphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } break; default: /* No need to checksum. */ diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index d208ed6eb9f..d3a1b1f2d10 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -24,8 +24,9 @@ #include static struct ip_vs_conn * -udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) { struct ip_vs_conn *cp; __be16 _ports[2], *pptr; @@ -36,12 +37,12 @@ udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, if (likely(!inverse)) { cp = ip_vs_conn_in_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); + iph->saddr.ip, pptr[0], + iph->daddr.ip, pptr[1]); } else { cp = ip_vs_conn_in_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); + iph->daddr.ip, pptr[1], + iph->saddr.ip, pptr[0]); } return cp; @@ -49,25 +50,25 @@ udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, static struct ip_vs_conn * -udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) { struct ip_vs_conn *cp; __be16 _ports[2], *pptr; - pptr = skb_header_pointer(skb, ip_hdrlen(skb), - sizeof(_ports), _ports); + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); if (pptr == NULL) return NULL; if (likely(!inverse)) { cp = ip_vs_conn_out_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); + iph->saddr.ip, pptr[0], + iph->daddr.ip, pptr[1]); } else { cp = ip_vs_conn_out_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); + iph->daddr.ip, pptr[1], + iph->saddr.ip, pptr[0]); } return cp; @@ -75,14 +76,14 @@ udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, static int -udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) { struct ip_vs_service *svc; struct udphdr _udph, *uh; struct ip_vs_iphdr iph; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); if (uh == NULL) { @@ -90,7 +91,7 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; } - svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol, + svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, uh->dest); if (svc) { if (ip_vs_todrop()) { @@ -143,7 +144,7 @@ udp_snat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) + if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) return 0; /* @@ -195,7 +196,7 @@ udp_dnat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) + if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) return 0; /* @@ -234,10 +235,17 @@ udp_dnat_handler(struct sk_buff *skb, static int -udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { struct udphdr _udph, *uh; - const unsigned int udphoff = ip_hdrlen(skb); + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); if (uh == NULL) @@ -249,15 +257,28 @@ udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, - skb->len - udphoff, - ip_hdr(skb)->protocol, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - udphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - udphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } break; default: /* No need to checksum. */ -- cgit v1.2.3 From 3b047d9d0407e78a52f009835a0e26cb62edb8c7 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:41 +0200 Subject: IPVS: Add protocol debug functions for IPv6 Add protocol (TCP, UDP, AH, ESP) debug functions for IPv6 packet debug output. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_proto.c | 63 +++++++++++++++++++++++++++++++++++--- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 36 ++++++++++++++++++++-- 2 files changed, 93 insertions(+), 6 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index 6099a88fc20..50f6215beda 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -152,10 +152,10 @@ const char * ip_vs_state_name(__u16 proto, int state) void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) +ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) { char buf[128]; struct iphdr _iph, *ih; @@ -189,6 +189,61 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); } +#ifdef CONFIG_IP_VS_IPV6 +void +ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[192]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->nexthdr == IPPROTO_FRAGMENT) + sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag", + pp->name, NIP6(ih->saddr), + NIP6(ih->daddr)); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT, + pp->name, + NIP6(ih->saddr), + NIP6(ih->daddr)); + else + sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u", + pp->name, + NIP6(ih->saddr), + ntohs(pptr[0]), + NIP6(ih->daddr), + ntohs(pptr[1])); + } + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} +#endif + + +void +ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb->protocol == __constant_htons(ETH_P_IPV6)) + ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); +} + int __init ip_vs_protocol_init(void) { diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c index 2a361a99174..4b0b8f268d1 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c @@ -125,8 +125,8 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, static void -ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) +ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) { char buf[256]; struct iphdr _iph, *ih; @@ -142,6 +142,38 @@ ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); } +#ifdef CONFIG_IP_VS_IPV6 +static void +ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT, + pp->name, NIP6(ih->saddr), + NIP6(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} +#endif + +static void +ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb->protocol == __constant_htons(ETH_P_IPV6)) + ah_esp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ah_esp_debug_packet_v4(pp, skb, offset, msg); +} + static void ah_esp_init(struct ip_vs_protocol *pp) { -- cgit v1.2.3 From 0bbdd42b7efa66685b6d74701bcde3a596a3a59d Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:42 +0200 Subject: IPVS: Extend protocol DNAT/SNAT and state handlers Extend protocol DNAT/SNAT and state handlers to work with IPv6. Also change/introduce new checksumming helper functions for this. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_proto_tcp.c | 85 ++++++++++++++++++++++++++++++++--------- net/ipv4/ipvs/ip_vs_proto_udp.c | 82 ++++++++++++++++++++++++++++++--------- 2 files changed, 131 insertions(+), 36 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 9211afa8f30..3daae43ae44 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -114,11 +114,21 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, static inline void -tcp_fast_csum_update(struct tcphdr *tcph, __be32 oldip, __be32 newip, +tcp_fast_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, __be16 oldport, __be16 newport) { +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); + else +#endif tcph->check = - csum_fold(ip_vs_check_diff4(oldip, newip, + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); } @@ -129,7 +139,14 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct tcphdr *tcph; - const unsigned int tcphoff = ip_hdrlen(skb); + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) @@ -137,7 +154,7 @@ tcp_snat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ @@ -145,13 +162,13 @@ tcp_snat_handler(struct sk_buff *skb, return 0; } - tcph = (void *)ip_hdr(skb) + tcphoff; + tcph = (void *)skb_network_header(skb) + tcphoff; tcph->source = cp->vport; /* Adjust TCP checksums */ if (!cp->app) { /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->daddr.ip, cp->vaddr.ip, + tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -159,9 +176,20 @@ tcp_snat_handler(struct sk_buff *skb, /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, - skb->len - tcphoff, - cp->protocol, skb->csum); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, tcph->check, (char*)&(tcph->check) - (char*)tcph); @@ -175,7 +203,14 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct tcphdr *tcph; - const unsigned int tcphoff = ip_hdrlen(skb); + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) @@ -183,7 +218,7 @@ tcp_dnat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* @@ -194,7 +229,7 @@ tcp_dnat_handler(struct sk_buff *skb, return 0; } - tcph = (void *)ip_hdr(skb) + tcphoff; + tcph = (void *)skb_network_header(skb) + tcphoff; tcph->dest = cp->dport; /* @@ -202,7 +237,7 @@ tcp_dnat_handler(struct sk_buff *skb, */ if (!cp->app) { /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->vaddr.ip, cp->daddr.ip, + tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -210,9 +245,19 @@ tcp_dnat_handler(struct sk_buff *skb, /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, - skb->len - tcphoff, - cp->protocol, skb->csum); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; @@ -487,7 +532,13 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, { struct tcphdr _tcph, *th; - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); +#ifdef CONFIG_IP_VS_IPV6 + int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + int ihl = ip_hdrlen(skb); +#endif + + th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); if (th == NULL) return 0; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index d3a1b1f2d10..6cca0ad8e32 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -120,13 +120,23 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, static inline void -udp_fast_csum_update(struct udphdr *uhdr, __be32 oldip, __be32 newip, +udp_fast_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, __be16 oldport, __be16 newport) { - uhdr->check = - csum_fold(ip_vs_check_diff4(oldip, newip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(uhdr->check)))); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); if (!uhdr->check) uhdr->check = CSUM_MANGLED_0; } @@ -136,7 +146,14 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct udphdr *udph; - const unsigned int udphoff = ip_hdrlen(skb); + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, udphoff+sizeof(*udph))) @@ -144,7 +161,7 @@ udp_snat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* @@ -154,7 +171,7 @@ udp_snat_handler(struct sk_buff *skb, return 0; } - udph = (void *)ip_hdr(skb) + udphoff; + udph = (void *)skb_network_header(skb) + udphoff; udph->source = cp->vport; /* @@ -162,7 +179,7 @@ udp_snat_handler(struct sk_buff *skb, */ if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->daddr.ip, cp->vaddr.ip, + udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -170,9 +187,19 @@ udp_snat_handler(struct sk_buff *skb, /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, - skb->len - udphoff, - cp->protocol, skb->csum); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", @@ -188,7 +215,14 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct udphdr *udph; - unsigned int udphoff = ip_hdrlen(skb); + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, udphoff+sizeof(*udph))) @@ -196,7 +230,7 @@ udp_dnat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* @@ -207,7 +241,7 @@ udp_dnat_handler(struct sk_buff *skb, return 0; } - udph = (void *)ip_hdr(skb) + udphoff; + udph = (void *)skb_network_header(skb) + udphoff; udph->dest = cp->dport; /* @@ -215,7 +249,7 @@ udp_dnat_handler(struct sk_buff *skb, */ if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->vaddr.ip, cp->daddr.ip, + udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -223,9 +257,19 @@ udp_dnat_handler(struct sk_buff *skb, /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, - skb->len - udphoff, - cp->protocol, skb->csum); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb->ip_summed = CHECKSUM_UNNECESSARY; -- cgit v1.2.3 From 28364a59f3dfe7fed3560ec7aff9b7aeb02824fb Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:43 +0200 Subject: IPVS: Extend functions for getting/creating connections Extend functions for getting/creating connections and connection templates for IPv6 support and fix the callers. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_conn.c | 100 ++++++++++++++++++++------------- net/ipv4/ipvs/ip_vs_core.c | 112 ++++++++++++++++++++----------------- net/ipv4/ipvs/ip_vs_ftp.c | 45 +++++++-------- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 24 ++++---- net/ipv4/ipvs/ip_vs_proto_tcp.c | 24 ++++---- net/ipv4/ipvs/ip_vs_proto_udp.c | 24 ++++---- net/ipv4/ipvs/ip_vs_sync.c | 27 +++++---- 7 files changed, 198 insertions(+), 158 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 639d4bc7fc1..15eec282b17 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -114,9 +114,18 @@ static inline void ct_write_unlock_bh(unsigned key) /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port) +static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, + const union nf_inet_addr *addr, + __be16 port) { - return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd) +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), + (__force u32)port, proto, ip_vs_conn_rnd) + & IP_VS_CONN_TAB_MASK; +#endif + return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, + ip_vs_conn_rnd) & IP_VS_CONN_TAB_MASK; } @@ -131,7 +140,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) int ret; /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport); + hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); ct_write_lock(hash); @@ -162,7 +171,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) int ret; /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport); + hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); ct_write_lock(hash); @@ -187,18 +196,21 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) * d_addr, d_port: pkt dest address (load balancer) */ static inline struct ip_vs_conn *__ip_vs_conn_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) { unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); + hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr == cp->caddr.ip && s_port == cp->cport && - d_port == cp->vport && d_addr == cp->vaddr.ip && + if (cp->af == af && + ip_vs_addr_equal(af, s_addr, &cp->caddr) && + ip_vs_addr_equal(af, d_addr, &cp->vaddr) && + s_port == cp->cport && d_port == cp->vport && ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && protocol == cp->protocol) { /* HIT */ @@ -214,37 +226,42 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get } struct ip_vs_conn *ip_vs_conn_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) { struct ip_vs_conn *cp; - cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); + cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) - cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); + cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, + d_port); - IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - cp?"hit":"not hit"); + IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + cp ? "hit" : "not hit"); return cp; } /* Get reference to connection template */ struct ip_vs_conn *ip_vs_ct_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) { unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); + hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr == cp->caddr.ip && s_port == cp->cport && - d_port == cp->vport && d_addr == cp->vaddr.ip && + if (cp->af == af && + ip_vs_addr_equal(af, s_addr, &cp->caddr) && + ip_vs_addr_equal(af, d_addr, &cp->vaddr) && + s_port == cp->cport && d_port == cp->vport && cp->flags & IP_VS_CONN_F_TEMPLATE && protocol == cp->protocol) { /* HIT */ @@ -257,11 +274,11 @@ struct ip_vs_conn *ip_vs_ct_in_get out: ct_read_unlock(hash); - IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - cp?"hit":"not hit"); + IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + cp ? "hit" : "not hit"); return cp; } @@ -273,7 +290,8 @@ struct ip_vs_conn *ip_vs_ct_in_get * d_addr, d_port: pkt dest address (foreign host) */ struct ip_vs_conn *ip_vs_conn_out_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) { unsigned hash; struct ip_vs_conn *cp, *ret=NULL; @@ -281,13 +299,15 @@ struct ip_vs_conn *ip_vs_conn_out_get /* * Check for "full" addressed entries */ - hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); + hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (d_addr == cp->caddr.ip && d_port == cp->cport && - s_port == cp->dport && s_addr == cp->daddr.ip && + if (cp->af == af && + ip_vs_addr_equal(af, d_addr, &cp->caddr) && + ip_vs_addr_equal(af, s_addr, &cp->daddr) && + d_port == cp->cport && s_port == cp->dport && protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); @@ -298,11 +318,11 @@ struct ip_vs_conn *ip_vs_conn_out_get ct_read_unlock(hash); - IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - ret?"hit":"not hit"); + IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ret ? "hit" : "not hit"); return ret; } @@ -625,8 +645,9 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) * Create a new connection entry and hash it into the ip_vs_conn_tab */ struct ip_vs_conn * -ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport, - __be32 daddr, __be16 dport, unsigned flags, +ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + const union nf_inet_addr *daddr, __be16 dport, unsigned flags, struct ip_vs_dest *dest) { struct ip_vs_conn *cp; @@ -640,12 +661,13 @@ ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport INIT_LIST_HEAD(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + cp->af = af; cp->protocol = proto; - cp->caddr.ip = caddr; + ip_vs_addr_copy(af, &cp->caddr, caddr); cp->cport = cport; - cp->vaddr.ip = vaddr; + ip_vs_addr_copy(af, &cp->vaddr, vaddr); cp->vport = vport; - cp->daddr.ip = daddr; + ip_vs_addr_copy(af, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; spin_lock_init(&cp->lock); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 34aaa1480d9..2d5a4331709 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -173,19 +173,21 @@ ip_vs_sched_persist(struct ip_vs_service *svc, __be16 ports[2]) { struct ip_vs_conn *cp = NULL; - struct iphdr *iph = ip_hdr(skb); + struct ip_vs_iphdr iph; struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport; /* destination port to forward */ - __be32 snet; /* source network of the client, after masking */ + union nf_inet_addr snet; /* source network of the client, + after masking */ + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); /* Mask saddr with the netmask to adjust template granularity */ - snet = iph->saddr & svc->netmask; + snet.ip = iph.saddr.ip & svc->netmask; IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " "mnet %u.%u.%u.%u\n", - NIPQUAD(iph->saddr), ntohs(ports[0]), - NIPQUAD(iph->daddr), ntohs(ports[1]), + NIPQUAD(iph.saddr.ip), ntohs(ports[0]), + NIPQUAD(iph.daddr.ip), ntohs(ports[1]), NIPQUAD(snet)); /* @@ -204,11 +206,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc, if (ports[1] == svc->port) { /* Check if a template already exists */ if (svc->port != FTPPORT) - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, ports[1]); + ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + &iph.daddr, ports[1]); else - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, 0); + ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + &iph.daddr, 0); if (!ct || !ip_vs_check_template(ct)) { /* @@ -228,18 +230,18 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * for ftp service. */ if (svc->port != FTPPORT) - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, + ct = ip_vs_conn_new(AF_INET, iph.protocol, + &snet, 0, + &iph.daddr, ports[1], - dest->addr.ip, dest->port, + &dest->addr, dest->port, IP_VS_CONN_F_TEMPLATE, dest); else - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, 0, - dest->addr.ip, 0, + ct = ip_vs_conn_new(AF_INET, iph.protocol, + &snet, 0, + &iph.daddr, 0, + &dest->addr, 0, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) @@ -258,12 +260,16 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * fwmark template: * port zero template: */ - if (svc->fwmark) - ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0, - htonl(svc->fwmark), 0); - else - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, 0); + if (svc->fwmark) { + union nf_inet_addr fwmark = { + .all = { 0, 0, 0, htonl(svc->fwmark) } + }; + + ct = ip_vs_ct_in_get(AF_INET, IPPROTO_IP, &snet, 0, + &fwmark, 0); + } else + ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + &iph.daddr, 0); if (!ct || !ip_vs_check_template(ct)) { /* @@ -282,18 +288,22 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a template according to the service */ - if (svc->fwmark) - ct = ip_vs_conn_new(IPPROTO_IP, - snet, 0, - htonl(svc->fwmark), 0, - dest->addr.ip, 0, + if (svc->fwmark) { + union nf_inet_addr fwmark = { + .all = { 0, 0, 0, htonl(svc->fwmark) } + }; + + ct = ip_vs_conn_new(AF_INET, IPPROTO_IP, + &snet, 0, + &fwmark, 0, + &dest->addr, 0, IP_VS_CONN_F_TEMPLATE, dest); - else - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, 0, - dest->addr.ip, 0, + } else + ct = ip_vs_conn_new(AF_INET, iph.protocol, + &snet, 0, + &iph.daddr, 0, + &dest->addr, 0, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) @@ -310,10 +320,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, ports[0], - iph->daddr, ports[1], - dest->addr.ip, dport, + cp = ip_vs_conn_new(AF_INET, iph.protocol, + &iph.saddr, ports[0], + &iph.daddr, ports[1], + &dest->addr, dport, 0, dest); if (cp == NULL) { @@ -342,12 +352,12 @@ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_conn *cp = NULL; - struct iphdr *iph = ip_hdr(skb); + struct ip_vs_iphdr iph; struct ip_vs_dest *dest; __be16 _ports[2], *pptr; - pptr = skb_header_pointer(skb, iph->ihl*4, - sizeof(_ports), _ports); + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) return NULL; @@ -377,10 +387,10 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* * Create a connection entry. */ - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1], - dest->addr.ip, dest->port ? dest->port : pptr[1], + cp = ip_vs_conn_new(AF_INET, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], + &dest->addr, dest->port ? dest->port : pptr[1], 0, dest); if (cp == NULL) @@ -408,10 +418,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_protocol *pp) { __be16 _ports[2], *pptr; - struct iphdr *iph = ip_hdr(skb); + struct ip_vs_iphdr iph; + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); - pptr = skb_header_pointer(skb, iph->ihl*4, - sizeof(_ports), _ports); + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) { ip_vs_service_put(svc); return NF_DROP; @@ -421,7 +431,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, and the destination is RTN_UNICAST (and not local), then create a cache_bypass connection entry */ if (sysctl_ip_vs_cache_bypass && svc->fwmark - && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) { + && (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST)) { int ret, cs; struct ip_vs_conn *cp; @@ -429,9 +439,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, /* create a new connection entry */ IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1], + cp = ip_vs_conn_new(AF_INET, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], 0, 0, IP_VS_CONN_F_BYPASS, NULL); diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c index bfe5d7050a5..0c3fbe0de5f 100644 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -140,7 +140,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, struct tcphdr *th; char *data, *data_limit; char *start, *end; - __be32 from; + union nf_inet_addr from; __be16 port; struct ip_vs_conn *n_cp; char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ @@ -166,24 +166,25 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING, sizeof(SERVER_STRING)-1, ')', - &from, &port, + &from.ip, &port, &start, &end) != 1) return 1; IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " "%u.%u.%u.%u:%d detected\n", - NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr.ip), 0); + NIPQUAD(from.ip), ntohs(port), + NIPQUAD(cp->caddr.ip), 0); /* * Now update or create an connection entry for it */ - n_cp = ip_vs_conn_out_get(iph->protocol, from, port, - cp->caddr.ip, 0); + n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, + &cp->caddr, 0); if (!n_cp) { - n_cp = ip_vs_conn_new(IPPROTO_TCP, - cp->caddr.ip, 0, - cp->vaddr.ip, port, - from, port, + n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, + &cp->caddr, 0, + &cp->vaddr, port, + &from, port, IP_VS_CONN_F_NO_CPORT, cp->dest); if (!n_cp) @@ -196,9 +197,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Replace the old passive address with the new one */ - from = n_cp->vaddr.ip; + from.ip = n_cp->vaddr.ip; port = n_cp->vport; - sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), + sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip), (ntohs(port)>>8)&255, ntohs(port)&255); buf_len = strlen(buf); @@ -243,7 +244,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, struct tcphdr *th; char *data, *data_start, *data_limit; char *start, *end; - __be32 to; + union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; @@ -291,12 +292,12 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, */ if (ip_vs_ftp_get_addrport(data_start, data_limit, CLIENT_STRING, sizeof(CLIENT_STRING)-1, - '\r', &to, &port, + '\r', &to.ip, &port, &start, &end) != 1) return 1; IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", - NIPQUAD(to), ntohs(port)); + NIPQUAD(to.ip), ntohs(port)); /* Passive mode off */ cp->app_data = NULL; @@ -306,16 +307,16 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, */ IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", ip_vs_proto_name(iph->protocol), - NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); + NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); - n_cp = ip_vs_conn_in_get(iph->protocol, - to, port, - cp->vaddr.ip, htons(ntohs(cp->vport)-1)); + n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, + &to, port, + &cp->vaddr, htons(ntohs(cp->vport)-1)); if (!n_cp) { - n_cp = ip_vs_conn_new(IPPROTO_TCP, - to, port, - cp->vaddr.ip, htons(ntohs(cp->vport)-1), - cp->daddr.ip, htons(ntohs(cp->dport)-1), + n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, + &to, port, + &cp->vaddr, htons(ntohs(cp->vport)-1), + &cp->daddr, htons(ntohs(cp->dport)-1), 0, cp->dest); if (!n_cp) diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c index 4b0b8f268d1..2b18a78d039 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c @@ -46,16 +46,16 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp; if (likely(!inverse)) { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr.ip, + cp = ip_vs_conn_in_get(af, IPPROTO_UDP, + &iph->saddr, htons(PORT_ISAKMP), - iph->daddr.ip, + &iph->daddr, htons(PORT_ISAKMP)); } else { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr.ip, + cp = ip_vs_conn_in_get(af, IPPROTO_UDP, + &iph->daddr, htons(PORT_ISAKMP), - iph->saddr.ip, + &iph->saddr, htons(PORT_ISAKMP)); } @@ -86,16 +86,16 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_conn *cp; if (likely(!inverse)) { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr.ip, + cp = ip_vs_conn_out_get(af, IPPROTO_UDP, + &iph->saddr, htons(PORT_ISAKMP), - iph->daddr.ip, + &iph->daddr, htons(PORT_ISAKMP)); } else { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr.ip, + cp = ip_vs_conn_out_get(af, IPPROTO_UDP, + &iph->daddr, htons(PORT_ISAKMP), - iph->saddr.ip, + &iph->saddr, htons(PORT_ISAKMP)); } diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 3daae43ae44..3da2bb05ee7 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -36,13 +36,13 @@ tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, return NULL; if (likely(!inverse)) { - return ip_vs_conn_in_get(iph->protocol, - iph->saddr.ip, pptr[0], - iph->daddr.ip, pptr[1]); + return ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); } else { - return ip_vs_conn_in_get(iph->protocol, - iph->daddr.ip, pptr[1], - iph->saddr.ip, pptr[0]); + return ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); } } @@ -58,13 +58,13 @@ tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, return NULL; if (likely(!inverse)) { - return ip_vs_conn_out_get(iph->protocol, - iph->saddr.ip, pptr[0], - iph->daddr.ip, pptr[1]); + return ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); } else { - return ip_vs_conn_out_get(iph->protocol, - iph->daddr.ip, pptr[1], - iph->saddr.ip, pptr[0]); + return ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); } } diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 6cca0ad8e32..fd8bd934cc0 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -36,13 +36,13 @@ udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, return NULL; if (likely(!inverse)) { - cp = ip_vs_conn_in_get(iph->protocol, - iph->saddr.ip, pptr[0], - iph->daddr.ip, pptr[1]); + cp = ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); } else { - cp = ip_vs_conn_in_get(iph->protocol, - iph->daddr.ip, pptr[1], - iph->saddr.ip, pptr[0]); + cp = ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); } return cp; @@ -62,13 +62,13 @@ udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, return NULL; if (likely(!inverse)) { - cp = ip_vs_conn_out_get(iph->protocol, - iph->saddr.ip, pptr[0], - iph->daddr.ip, pptr[1]); + cp = ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); } else { - cp = ip_vs_conn_out_get(iph->protocol, - iph->daddr.ip, pptr[1], - iph->saddr.ip, pptr[0]); + cp = ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); } return cp; diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 2cf47b2e166..3ce1093e067 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -366,13 +366,17 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) } if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport); + cp = ip_vs_conn_in_get(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport); else - cp = ip_vs_ct_in_get(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport); + cp = ip_vs_ct_in_get(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport); if (!cp) { /* * Find the appropriate destination for the connection. @@ -389,10 +393,13 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) else flags &= ~IP_VS_CONN_F_INACTIVE; } - cp = ip_vs_conn_new(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport, - s->daddr, s->dport, + cp = ip_vs_conn_new(AF_INET, s->protocol, + (union nf_inet_addr *)s->caddr, + s->cport, + (union nf_inet_addr *)s->vaddr, + s->vport, + (union nf_inet_addr *)s->daddr, + s->dport, flags, dest); if (dest) atomic_dec(&dest->refcnt); -- cgit v1.2.3 From 38cdcc9a039b92a9972dca3c954fb3d8b3ef13bf Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:44 +0200 Subject: IPVS: Add IPv6 support to xmit() support functions Add IPv6 support to IP_VS_XMIT() and to the xmit routing cache, introducing a new function __ip_vs_get_out_rt_v6(). Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_xmit.c | 82 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 7 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index 88199c9f2d3..fd8342ec7e1 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -20,6 +20,9 @@ #include #include /* for icmp_send */ #include /* for ip_route_output */ +#include +#include +#include #include #include @@ -47,7 +50,8 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) if (!dst) return NULL; - if ((dst->obsolete || rtos != dest->dst_rtos) && + if ((dst->obsolete + || (dest->af == AF_INET && rtos != dest->dst_rtos)) && dst->ops->check(dst, cookie) == NULL) { dest->dst_cache = NULL; dst_release(dst); @@ -109,6 +113,70 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) return rt; } +#ifdef CONFIG_IP_VS_IPV6 +static struct rt6_info * +__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); + if (!rt) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = dest->addr.in6, + .saddr = { + .s6_addr32 = + { 0, 0, 0, 0 }, + }, + }, + }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, + NULL, &fl); + if (!rt) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip6_route_output error, " + "dest: " NIP6_FMT "\n", + NIP6(dest->addr.in6)); + return NULL; + } + __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n", + NIP6(dest->addr.in6), + atomic_read(&rt->u.dst.__refcnt)); + } + spin_unlock(&dest->dst_lock); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = cp->daddr.in6, + .saddr = { + .s6_addr32 = { 0, 0, 0, 0 }, + }, + }, + }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (!rt) { + IP_VS_DBG_RL("ip6_route_output error, dest: " + NIP6_FMT "\n", NIP6(cp->daddr.in6)); + return NULL; + } + } + + return rt; +} +#endif + /* * Release dest->dst_cache before a dest is removed @@ -123,11 +191,11 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) dst_release(old_dst); } -#define IP_VS_XMIT(skb, rt) \ +#define IP_VS_XMIT(pf, skb, rt) \ do { \ (skb)->ipvs_property = 1; \ skb_forward_csum(skb); \ - NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL, \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ (rt)->u.dst.dev, dst_output); \ } while (0) @@ -200,7 +268,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(skb, rt); + IP_VS_XMIT(PF_INET, skb, rt); LeaveFunction(10); return NF_STOLEN; @@ -276,7 +344,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(skb, rt); + IP_VS_XMIT(PF_INET, skb, rt); LeaveFunction(10); return NF_STOLEN; @@ -467,7 +535,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(skb, rt); + IP_VS_XMIT(PF_INET, skb, rt); LeaveFunction(10); return NF_STOLEN; @@ -540,7 +608,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(skb, rt); + IP_VS_XMIT(PF_INET, skb, rt); rc = NF_STOLEN; goto out; -- cgit v1.2.3 From b3cdd2a73867d309dca288b8e820c09e3b7f1da1 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:45 +0200 Subject: IPVS: Add and bind IPv6 xmit functions Add xmit functions for IPv6. Also add the already needed __ip_vs_get_out_rt_v6() to ip_vs_core.c. Bind the new xmit functions to v6 connections. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_conn.c | 34 +++- net/ipv4/ipvs/ip_vs_core.c | 43 ++++++ net/ipv4/ipvs/ip_vs_xmit.c | 377 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 453 insertions(+), 1 deletion(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 15eec282b17..f5dddad6d5e 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -389,6 +389,33 @@ static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) } } +#ifdef CONFIG_IP_VS_IPV6 +static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit_v6; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit_v6; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit_v6; + break; + } +} +#endif + static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) { @@ -694,7 +721,12 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, cp->timeout = 3*HZ; /* Bind its packet transmitter */ - ip_vs_bind_xmit(cp); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); if (unlikely(pp && atomic_read(&pp->appcnt))) ip_vs_bind_app(cp, pp); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 2d5a4331709..d6f5bf9049a 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -570,6 +570,49 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, "Forwarding altered incoming ICMP"); } +#ifdef CONFIG_IP_VS_IPV6 +void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int icmp_offset = sizeof(struct ipv6hdr); + struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + + icmp_offset); + struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.in6; + ciph->daddr = cp->vaddr.in6; + } else { + iph->daddr = cp->daddr.in6; + ciph->saddr = cp->daddr.in6; + } + + /* the TCP/UDP port */ + if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) { + __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->icmp6_cksum = 0; + /* TODO IPv6: is this correct for ICMPv6? */ + ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); + else + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); +} +#endif + /* * Handle ICMP messages in the inside-to-outside direction (outgoing). * Find any that might be relevant, check against existing connections, diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index fd8342ec7e1..02ddc2b3ce2 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -281,6 +281,70 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, return NF_STOLEN; } +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ipv6hdr *iph = ipv6_hdr(skb); + int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = iph->daddr, + .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, + }; + + EnterFunction(10); + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (!rt) { + IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, " + "dest: " NIP6_FMT "\n", NIP6(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->u.dst); + return NF_STOLEN; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif /* * NAT transmitter (only for outside-to-inside nat forwarding) @@ -360,6 +424,83 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, sizeof(struct ipv6hdr), + sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "ip_vs_nat_xmit_v6(): frag needed for"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto tx_error; + ipv6_hdr(skb)->daddr = cp->daddr.in6; + + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; +tx_error_put: + dst_release(&rt->u.dst); + goto tx_error; +} +#endif + /* * IP Tunneling transmitter @@ -491,6 +632,112 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, return NF_STOLEN; } +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct ipv6hdr *old_iph = ipv6_hdr(skb); + sk_buff_data_t old_transport_header = skb->transport_header; + struct ipv6hdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != htons(ETH_P_IPV6)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, " + "ETH_P_IPV6: %d, skb protocol: %d\n", + htons(ETH_P_IPV6), skb->protocol); + goto tx_error; + } + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr); + /* TODO IPv6: do we need this check in IPv6? */ + if (mtu < 1280) { + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n"); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + dst_release(&rt->u.dst); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n"); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = ipv6_hdr(skb); + } + + skb->transport_header = old_transport_header; + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + iph = ipv6_hdr(skb); + iph->version = 6; + iph->nexthdr = IPPROTO_IPV6; + iph->payload_len = old_iph->payload_len + sizeof(old_iph); + iph->priority = old_iph->priority; + memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); + iph->daddr = rt->rt6i_dst.addr; + iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ + iph->hop_limit = old_iph->hop_limit; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ip6_local_out(skb); + + LeaveFunction(10); + + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + /* * Direct Routing transmitter @@ -548,6 +795,60 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, return NF_STOLEN; } +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->u.dst); + return NF_STOLEN; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + /* * ICMP packet transmitter @@ -625,3 +926,79 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_rt_put(rt); goto tx_error; } + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + int rc; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop the old route when skb is not shared */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + ip_vs_nat_icmp_v6(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + rc = NF_STOLEN; + goto out; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; +out: + LeaveFunction(10); + return rc; +tx_error_put: + dst_release(&rt->u.dst); + goto tx_error; +} +#endif -- cgit v1.2.3 From cd17f9ed099ed27e9b0d298253e5c05e335ac656 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:46 +0200 Subject: IPVS: Extend scheduling functions for IPv6 support Convert ip_vs_schedule() and ip_vs_sched_persist() to support scheduling of IPv6 connections. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 56 +++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 25 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index d6f5bf9049a..8bfd7c2f0eb 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -176,19 +176,25 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct ip_vs_iphdr iph; struct ip_vs_dest *dest; struct ip_vs_conn *ct; - __be16 dport; /* destination port to forward */ + __be16 dport; /* destination port to forward */ union nf_inet_addr snet; /* source network of the client, after masking */ - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); /* Mask saddr with the netmask to adjust template granularity */ - snet.ip = iph.saddr.ip & svc->netmask; +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); + else +#endif + snet.ip = iph.saddr.ip & svc->netmask; - IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " - "mnet %u.%u.%u.%u\n", - NIPQUAD(iph.saddr.ip), ntohs(ports[0]), - NIPQUAD(iph.daddr.ip), ntohs(ports[1]), - NIPQUAD(snet)); + IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " + "mnet %s\n", + IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), + IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), + IP_VS_DBG_ADDR(svc->af, &snet)); /* * As far as we know, FTP is a very complicated network protocol, and @@ -206,10 +212,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc, if (ports[1] == svc->port) { /* Check if a template already exists */ if (svc->port != FTPPORT) - ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, &iph.daddr, ports[1]); else - ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, &iph.daddr, 0); if (!ct || !ip_vs_check_template(ct)) { @@ -230,7 +236,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * for ftp service. */ if (svc->port != FTPPORT) - ct = ip_vs_conn_new(AF_INET, iph.protocol, + ct = ip_vs_conn_new(svc->af, iph.protocol, &snet, 0, &iph.daddr, ports[1], @@ -238,7 +244,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, IP_VS_CONN_F_TEMPLATE, dest); else - ct = ip_vs_conn_new(AF_INET, iph.protocol, + ct = ip_vs_conn_new(svc->af, iph.protocol, &snet, 0, &iph.daddr, 0, &dest->addr, 0, @@ -265,10 +271,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc, .all = { 0, 0, 0, htonl(svc->fwmark) } }; - ct = ip_vs_ct_in_get(AF_INET, IPPROTO_IP, &snet, 0, + ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0, &fwmark, 0); } else - ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, &iph.daddr, 0); if (!ct || !ip_vs_check_template(ct)) { @@ -293,14 +299,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, .all = { 0, 0, 0, htonl(svc->fwmark) } }; - ct = ip_vs_conn_new(AF_INET, IPPROTO_IP, + ct = ip_vs_conn_new(svc->af, IPPROTO_IP, &snet, 0, &fwmark, 0, &dest->addr, 0, IP_VS_CONN_F_TEMPLATE, dest); } else - ct = ip_vs_conn_new(AF_INET, iph.protocol, + ct = ip_vs_conn_new(svc->af, iph.protocol, &snet, 0, &iph.daddr, 0, &dest->addr, 0, @@ -320,7 +326,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - cp = ip_vs_conn_new(AF_INET, iph.protocol, + cp = ip_vs_conn_new(svc->af, iph.protocol, &iph.saddr, ports[0], &iph.daddr, ports[1], &dest->addr, dport, @@ -387,7 +393,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* * Create a connection entry. */ - cp = ip_vs_conn_new(AF_INET, iph.protocol, + cp = ip_vs_conn_new(svc->af, iph.protocol, &iph.saddr, pptr[0], &iph.daddr, pptr[1], &dest->addr, dest->port ? dest->port : pptr[1], @@ -396,13 +402,13 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) if (cp == NULL) return NULL; - IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " - "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", - ip_vs_fwd_tag(cp), - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), - NIPQUAD(cp->daddr.ip), ntohs(cp->dport), - cp->flags, atomic_read(&cp->refcnt)); + IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); return cp; -- cgit v1.2.3 From 2a3b791e6e1169f374224d164738e9f7be703d77 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:47 +0200 Subject: IPVS: Add/adjust Netfilter hook functions and helpers for v6 Add Netfilter hook functions or modify existing ones, if possible, to process IPv6 packets. Some support functions are also added/modified for this. ip_vs_nat_icmp_v6() was already added in the patch that added the v6 xmit functions, as it is called from one of them. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 365 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 329 insertions(+), 36 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 8bfd7c2f0eb..035a511e12f 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -39,6 +39,11 @@ #include #include +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#endif + #include @@ -60,6 +65,7 @@ EXPORT_SYMBOL(ip_vs_get_debug_level); /* ID used in ICMP lookups */ #define icmp_id(icmph) (((icmph)->un).echo.id) +#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) const char *ip_vs_proto_name(unsigned proto) { @@ -74,6 +80,10 @@ const char *ip_vs_proto_name(unsigned proto) return "TCP"; case IPPROTO_ICMP: return "ICMP"; +#ifdef CONFIG_IP_VS_IPV6 + case IPPROTO_ICMPV6: + return "ICMPv6"; +#endif default: sprintf(buf, "IP_%d", proto); return buf; @@ -425,7 +435,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, { __be16 _ports[2], *pptr; struct ip_vs_iphdr iph; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + int unicast; + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) { @@ -433,11 +444,17 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, return NF_DROP; } +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; + else +#endif + unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); + /* if it is fwmark-based service, the cache_bypass sysctl is up - and the destination is RTN_UNICAST (and not local), then create + and the destination is a non-local unicast, then create a cache_bypass connection entry */ - if (sysctl_ip_vs_cache_bypass && svc->fwmark - && (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST)) { + if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { int ret, cs; struct ip_vs_conn *cp; @@ -445,7 +462,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, /* create a new connection entry */ IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); - cp = ip_vs_conn_new(AF_INET, iph.protocol, + cp = ip_vs_conn_new(svc->af, iph.protocol, &iph.saddr, pptr[0], &iph.daddr, pptr[1], 0, 0, @@ -489,7 +506,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * created, the TCP RST packet cannot be sent, instead that * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ */ - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, + skb->dev); + else +#endif + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + return NF_DROP; } @@ -528,6 +552,14 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) return err; } +#ifdef CONFIG_IP_VS_IPV6 +static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) +{ + /* TODO IPv6: Find out what to do here for IPv6 */ + return 0; +} +#endif + /* * Packet has been made sufficiently writable in caller * - inout: 1=in->out, 0=out->in @@ -727,11 +759,117 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) return verdict; } -static inline int is_tcp_reset(const struct sk_buff *skb) +#ifdef CONFIG_IP_VS_IPV6 +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + NIP6(iph->saddr), NIP6(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->nexthdr); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + IP_VS_ERR("shouldn't reach here, because the box is on the " + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) + && ip_vs_checksum_complete(skb, sizeof(struct ipv6hdr))) { + /* Failed checksum! */ + IP_VS_DBG(1, "Forward ICMPv6: failed checksum from " + NIP6_FMT "!\n", + NIP6(iph->saddr)); + goto out; + } + + if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) + offset += 2 * sizeof(__u16); + if (!skb_make_writable(skb, offset)) + goto out; + + ip_vs_nat_icmp_v6(skb, pp, cp, 1); + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->ipvs_property = 1; + verdict = NF_ACCEPT; + +out: + __ip_vs_conn_put(cp); + + return verdict; +} +#endif + +static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) { struct tcphdr _tcph, *th; - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); if (th == NULL) return 0; return th->rst; @@ -750,38 +888,64 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; + int af; EnterFunction(11); + af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6; + if (skb->ipvs_property) return NF_ACCEPT; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); - if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_out_icmp(skb, &related); + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related, verdict = ip_vs_out_icmp_v6(skb, &related); - if (related) - return verdict; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); - } + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_out_icmp(skb, &related); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } pp = ip_vs_proto_get(iph.protocol); if (unlikely(!pp)) return NF_ACCEPT; /* reassemble IP fragments */ - if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && - !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related, verdict = ip_vs_out_icmp_v6(skb, &related); - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); - } + if (related) + return verdict; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && + !pp->dont_defrag)) { + if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(AF_INET, skb, pp, &iph, iph.len, 0); + cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); if (unlikely(!cp)) { if (sysctl_ip_vs_nat_icmp_send && @@ -794,16 +958,26 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, if (pptr == NULL) return NF_ACCEPT; /* Not for me */ if (ip_vs_lookup_real_service(iph.protocol, - iph.saddr.ip, pptr[0])) { + iph.saddr.ip, + pptr[0])) { /* * Notify the real server: there is no * existing entry if it is not RST * packet or not TCP packet. */ if (iph.protocol != IPPROTO_TCP - || !is_tcp_reset(skb)) { - icmp_send(skb,ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); + || !is_tcp_reset(skb, iph.len)) { +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0, skb->dev); + else +#endif + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); return NF_DROP; } } @@ -821,8 +995,16 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, /* mangle the packet */ if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) goto drop; - ip_hdr(skb)->saddr = cp->vaddr.ip; - ip_send_check(ip_hdr(skb)); + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ipv6_hdr(skb)->saddr = cp->vaddr.in6; + else +#endif + { + ip_hdr(skb)->saddr = cp->vaddr.ip; + ip_send_check(ip_hdr(skb)); + } /* For policy routing, packets originating from this * machine itself may be routed differently to packets @@ -830,8 +1012,14 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, * if it came from this machine itself. So re-compute * the routing information. */ - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (ip6_route_me_harder(skb) != 0) + goto drop; + } else +#endif + if (ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); @@ -949,6 +1137,94 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) return verdict; } +#ifdef CONFIG_IP_VS_IPV6 +static int +ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? + IP_DEFRAG_VS_IN : + IP_DEFRAG_VS_FWD)) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + NIP6(iph->saddr), NIP6(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->nexthdr); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + verdict = NF_DROP; + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); + /* do not touch skb anymore */ + + __ip_vs_conn_put(cp); + + return verdict; +} +#endif + + /* * Check if it's for virtual services, look it up, * and send it on its way... @@ -961,9 +1237,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; - int ret, restart; + int ret, restart, af; + + af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); /* * Big tappo: only PACKET_HOST (neither loopback nor mcasts) @@ -974,7 +1252,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", skb->pkt_type, iph.protocol, - IP_VS_DBG_ADDR(AF_INET, &iph.daddr)); + IP_VS_DBG_ADDR(af, &iph.daddr)); return NF_ACCEPT; } @@ -983,7 +1261,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, if (related) return verdict; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } /* Protocol supported? */ @@ -994,12 +1272,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(AF_INET, skb, pp, &iph, iph.len, 0); + cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); if (unlikely(!cp)) { int v; - if (!pp->conn_schedule(AF_INET, skb, pp, &v, &cp)) + if (!pp->conn_schedule(af, skb, pp, &v, &cp)) return v; } @@ -1082,6 +1360,21 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, return ip_vs_in_icmp(skb, &r, hooknum); } +#ifdef CONFIG_IP_VS_IPV6 +static unsigned int +ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + + if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + return NF_ACCEPT; + + return ip_vs_in_icmp_v6(skb, &r, hooknum); +} +#endif + static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After packet filtering, forward packet through VS/DR, VS/TUN, -- cgit v1.2.3 From 7937df1564783806c285d34a1c6fd63d8da29d7a Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:48 +0200 Subject: IPVS: Convert real server lookup functions Convert functions for looking up destinations (real servers) to support IPv6 services/dests. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_conn.c | 5 +-- net/ipv4/ipvs/ip_vs_core.c | 4 +-- net/ipv4/ipvs/ip_vs_ctl.c | 80 ++++++++++++++++++++++++++++++---------------- net/ipv4/ipvs/ip_vs_sync.c | 7 ++-- 4 files changed, 62 insertions(+), 34 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index f5dddad6d5e..c2a42a62433 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -491,8 +491,9 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) struct ip_vs_dest *dest; if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->daddr.ip, cp->dport, - cp->vaddr.ip, cp->vport, cp->protocol); + dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, + &cp->vaddr, cp->vport, + cp->protocol); ip_vs_bind_dest(cp, dest); return dest; } else diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 035a511e12f..27bef1d67aa 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -957,8 +957,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, sizeof(_ports), _ports); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(iph.protocol, - iph.saddr.ip, + if (ip_vs_lookup_real_service(af, iph.protocol, + &iph.saddr, pptr[0])) { /* * Notify the real server: there is no diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 1f3fc66e694..bb0e1e3c885 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -492,11 +492,20 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest) /* * Returns hash value for real service */ -static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port) +static inline unsigned ip_vs_rs_hashkey(int af, + const union nf_inet_addr *addr, + __be16 port) { register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif - return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) + return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) & IP_VS_RTAB_MASK; } @@ -516,7 +525,8 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest) * Hash by proto,addr,port, * which are the parameters of the real service. */ - hash = ip_vs_rs_hashkey(dest->addr.ip, dest->port); + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + list_add(&dest->d_list, &ip_vs_rtable[hash]); return 1; @@ -543,7 +553,9 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest) * Lookup real service by in the real service table. */ struct ip_vs_dest * -ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) +ip_vs_lookup_real_service(int af, __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) { unsigned hash; struct ip_vs_dest *dest; @@ -552,11 +564,12 @@ ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) * Check for "full" addressed entries * Return the first found entry */ - hash = ip_vs_rs_hashkey(daddr, dport); + hash = ip_vs_rs_hashkey(af, daddr, dport); read_lock(&__ip_vs_rs_lock); list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { - if ((dest->addr.ip == daddr) + if ((dest->af == af) + && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->port == dport) && ((dest->protocol == protocol) || dest->vfwmark)) { @@ -574,7 +587,8 @@ ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) * Lookup destination by {addr,port} in the given service */ static struct ip_vs_dest * -ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) +ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) { struct ip_vs_dest *dest; @@ -582,7 +596,9 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) * Find the destination for the given service */ list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->addr.ip == daddr) && (dest->port == dport)) { + if ((dest->af == svc->af) + && ip_vs_addr_equal(svc->af, &dest->addr, daddr) + && (dest->port == dport)) { /* HIT */ return dest; } @@ -601,14 +617,15 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) * ip_vs_lookup_real_service() looked promissing, but * seems not working as expected. */ -struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, - __be32 vaddr, __be16 vport, __u16 protocol) +struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, + __be16 dport, + const union nf_inet_addr *vaddr, + __be16 vport, __u16 protocol) { struct ip_vs_dest *dest; struct ip_vs_service *svc; - union nf_inet_addr _vaddr = { .ip = vaddr }; - svc = ip_vs_service_get(AF_INET, 0, protocol, &_vaddr, vport); + svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); if (!svc) return NULL; dest = ip_vs_lookup_dest(svc, daddr, dport); @@ -629,7 +646,8 @@ struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, * scheduling. */ static struct ip_vs_dest * -ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) +ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) { struct ip_vs_dest *dest, *nxt; @@ -637,17 +655,19 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) * Find the destination in trash */ list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " - "dest->refcnt=%d\n", - dest->vfwmark, - NIPQUAD(dest->addr.ip), ntohs(dest->port), - atomic_read(&dest->refcnt)); - if (dest->addr.ip == daddr && + IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " + "dest->refcnt=%d\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->af == svc->af && + ip_vs_addr_equal(svc->af, &dest->addr, daddr) && dest->port == dport && dest->vfwmark == svc->fwmark && dest->protocol == svc->protocol && (svc->fwmark || - (dest->vaddr.ip == svc->addr.ip && + (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && dest->vport == svc->port))) { /* HIT */ return dest; @@ -657,10 +677,11 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) * Try to purge the destination from trash if not referenced */ if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " - "from trash\n", - dest->vfwmark, - NIPQUAD(dest->addr.ip), ntohs(dest->port)); + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " + "from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); list_del(&dest->n_list); ip_vs_dst_reset(dest); __ip_vs_unbind_svc(dest); @@ -847,7 +868,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Check if the dest already exists in the list */ - dest = ip_vs_lookup_dest(svc, daddr.ip, dport); + dest = ip_vs_lookup_dest(svc, &daddr, dport); + if (dest != NULL) { IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); return -EEXIST; @@ -857,7 +879,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) * Check if the dest already exists in the trash and * is from the same service */ - dest = ip_vs_trash_get_dest(svc, daddr.ip, dport); + dest = ip_vs_trash_get_dest(svc, &daddr, dport); + if (dest != NULL) { IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", @@ -956,7 +979,8 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Lookup the destination list */ - dest = ip_vs_lookup_dest(svc, daddr.ip, dport); + dest = ip_vs_lookup_dest(svc, &daddr, dport); + if (dest == NULL) { IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); return -ENOENT; @@ -1054,7 +1078,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) EnterFunction(2); - dest = ip_vs_lookup_dest(svc, udest->addr.ip, dport); + dest = ip_vs_lookup_dest(svc, &udest->addr, dport); if (dest == NULL) { IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 3ce1093e067..40647edf102 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -383,8 +383,11 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) * If it is not found the connection will remain unbound * but still handled. */ - dest = ip_vs_find_dest(s->daddr, s->dport, - s->vaddr, s->vport, + dest = ip_vs_find_dest(AF_INET, + (union nf_inet_addr *)&s->daddr, + s->dport, + (union nf_inet_addr *)&s->vaddr, + s->vport, s->protocol); /* Set the approprite ativity flag */ if (s->protocol == IPPROTO_TCP) { -- cgit v1.2.3 From 667a5f18162e803e30722af46ade1737e3b93198 Mon Sep 17 00:00:00 2001 From: Vince Busam Date: Tue, 2 Sep 2008 15:55:49 +0200 Subject: IPVS: Convert procfs files for IPv6 entry output Correctly output IPv6 connection/service/dest entries in procfs files. Signed-off-by: Vince Busam Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_conn.c | 37 +++++++++++++++++++++++++++---- net/ipv4/ipvs/ip_vs_ctl.c | 54 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 18 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index c2a42a62433..e7603d749c0 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -815,8 +815,22 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) else { const struct ip_vs_conn *cp = v; - seq_printf(seq, - "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, + "%-3s " NIP6_FMT " %04X " NIP6_FMT + " %04X " NIP6_FMT " %04X %-11s %7lu\n", + ip_vs_proto_name(cp->protocol), + NIP6(cp->caddr.in6), ntohs(cp->cport), + NIP6(cp->vaddr.in6), ntohs(cp->vport), + NIP6(cp->daddr.in6), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X" + " %08X %04X %-11s %7lu\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), @@ -864,8 +878,23 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) else { const struct ip_vs_conn *cp = v; - seq_printf(seq, - "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n", +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, + "%-3s " NIP6_FMT " %04X " NIP6_FMT + " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + NIP6(cp->caddr.in6), ntohs(cp->cport), + NIP6(cp->vaddr.in6), ntohs(cp->vport), + NIP6(cp->daddr.in6), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X " + "%08X %04X %-11s %-6s %7lu\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index bb0e1e3c885..25d9e98e31f 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1793,15 +1793,25 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; - if (iter->table == ip_vs_svc_table) - seq_printf(seq, "%s %08X:%04X %s ", - ip_vs_proto_name(svc->protocol), - ntohl(svc->addr.ip), - ntohs(svc->port), - svc->scheduler->name); - else + if (iter->table == ip_vs_svc_table) { +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ", + ip_vs_proto_name(svc->protocol), + NIP6(svc->addr.in6), + ntohs(svc->port), + svc->scheduler->name); + else +#endif + seq_printf(seq, "%s %08X:%04X %s ", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr.ip), + ntohs(svc->port), + svc->scheduler->name); + } else { seq_printf(seq, "FWM %08X %s ", svc->fwmark, svc->scheduler->name); + } if (svc->flags & IP_VS_SVC_F_PERSISTENT) seq_printf(seq, "persistent %d %08X\n", @@ -1811,13 +1821,29 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) seq_putc(seq, '\n'); list_for_each_entry(dest, &svc->destinations, n_list) { - seq_printf(seq, - " -> %08X:%04X %-7s %-6d %-10d %-10d\n", - ntohl(dest->addr.ip), ntohs(dest->port), - ip_vs_fwd_name(atomic_read(&dest->conn_flags)), - atomic_read(&dest->weight), - atomic_read(&dest->activeconns), - atomic_read(&dest->inactconns)); +#ifdef CONFIG_IP_VS_IPV6 + if (dest->af == AF_INET6) + seq_printf(seq, + " -> [" NIP6_FMT "]:%04X" + " %-7s %-6d %-10d %-10d\n", + NIP6(dest->addr.in6), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + else +#endif + seq_printf(seq, + " -> %08X:%04X " + "%-7s %-6d %-10d %-10d\n", + ntohl(dest->addr.ip), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + } } return 0; -- cgit v1.2.3 From c6883f587341a3ed113856de8769d0992b4bbd85 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:50 +0200 Subject: IVPS: Disable sync daemon for IPv6 connections Disable the sync daemon for IPv6 connections, works only with IPv4 for now. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 27bef1d67aa..5a7a81778b0 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1321,7 +1321,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, * encorage the standby servers to update the connections timeout */ atomic_inc(&cp->in_pkts); - if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && + if (af == AF_INET && + (ip_vs_sync_state & IP_VS_STATE_MASTER) && (((cp->protocol != IPPROTO_TCP || cp->state == IP_VS_TCP_S_ESTABLISHED) && (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] -- cgit v1.2.3 From a0eb662f9ec8962928d937a185ad128db12c4637 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:51 +0200 Subject: IPVS: Turn off FTP application helper for IPv6 Immediately return from FTP application helper and do nothing when dealing with IPv6 packets. IPv6 is not supported by this helper yet. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ftp.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c index 0c3fbe0de5f..2e7dbd8b73a 100644 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -147,6 +147,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, unsigned buf_len; int ret; +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + *diff = 0; /* Only useful for established sessions */ @@ -248,6 +256,14 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, __be16 port; struct ip_vs_conn *n_cp; +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + /* no diff required for incoming packets */ *diff = 0; -- cgit v1.2.3 From 09571c7ae30865adfa79dccd12a822a65d2c4b5a Mon Sep 17 00:00:00 2001 From: Vince Busam Date: Tue, 2 Sep 2008 15:55:52 +0200 Subject: IPVS: Add function to determine if IPv6 address is local Add __ip_vs_addr_is_local_v6() to find out if an IPv6 address belongs to a local interface. Use this function to decide whether to set the IP_VS_CONN_F_LOCALNODE flag for IPv6 destinations. Signed-off-by: Vince Busam Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 56 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 7 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 25d9e98e31f..640203a153c 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -35,6 +35,10 @@ #include #include +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#endif #include #include #include @@ -91,6 +95,26 @@ int ip_vs_get_debug_level(void) } #endif +#ifdef CONFIG_IP_VS_IPV6 +/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ +static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) +{ + struct rt6_info *rt; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = *addr, + .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) + return 1; + + return 0; +} +#endif /* * update_defense_level is called from keventd and from sysctl, * so it needs to protect itself from softirqs @@ -751,10 +775,18 @@ __ip_vs_update_dest(struct ip_vs_service *svc, conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; /* check if local node and update the flags */ - if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + } else +#endif + if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { @@ -803,9 +835,19 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, EnterFunction(2); - atype = inet_addr_type(&init_net, udest->addr.ip); - if (atype != RTN_LOCAL && atype != RTN_UNICAST) - return -EINVAL; +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + atype = ipv6_addr_type(&udest->addr.in6); + if (!(atype & IPV6_ADDR_UNICAST) && + !__ip_vs_addr_is_local_v6(&udest->addr.in6)) + return -EINVAL; + } else +#endif + { + atype = inet_addr_type(&init_net, udest->addr.ip); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + } dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); if (dest == NULL) { -- cgit v1.2.3 From cfc78c5a09241a3a9561466834996a7fb90c4228 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:53 +0200 Subject: IPVS: Adjust various debug outputs to use new macros Adjust various debug outputs to use the new *_BUF macro variants for correct output of v4/v6 addresses. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_conn.c | 57 ++++++++++++++++++++++------------------- net/ipv4/ipvs/ip_vs_ctl.c | 24 +++++++++-------- net/ipv4/ipvs/ip_vs_proto_tcp.c | 45 ++++++++++++++++++-------------- net/ipv4/ipvs/ip_vs_proto_udp.c | 15 ++++++----- 4 files changed, 78 insertions(+), 63 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index e7603d749c0..9a24332fbed 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -449,16 +449,16 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) cp->flags |= atomic_read(&dest->conn_flags); cp->dest = dest; - IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), - NIPQUAD(cp->daddr.ip), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); + IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); /* Update the connection counters */ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { @@ -512,16 +512,16 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) if (!dest) return; - IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), - NIPQUAD(cp->daddr.ip), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); + IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); /* Update the connection counters */ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { @@ -574,13 +574,16 @@ int ip_vs_check_template(struct ip_vs_conn *ct) !(dest->flags & IP_VS_DEST_F_AVAILABLE) || (sysctl_ip_vs_expire_quiescent_template && (atomic_read(&dest->weight) == 0))) { - IP_VS_DBG(9, "check_template: dest not available for " - "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "-> d:%u.%u.%u.%u:%d\n", - ip_vs_proto_name(ct->protocol), - NIPQUAD(ct->caddr.ip), ntohs(ct->cport), - NIPQUAD(ct->vaddr.ip), ntohs(ct->vport), - NIPQUAD(ct->daddr.ip), ntohs(ct->dport)); + IP_VS_DBG_BUF(9, "check_template: dest not available for " + "protocol %s s:%s:%d v:%s:%d " + "-> d:%s:%d\n", + ip_vs_proto_name(ct->protocol), + IP_VS_DBG_ADDR(ct->af, &ct->caddr), + ntohs(ct->cport), + IP_VS_DBG_ADDR(ct->af, &ct->vaddr), + ntohs(ct->vport), + IP_VS_DBG_ADDR(ct->af, &ct->daddr), + ntohs(ct->dport)); /* * Invalidate the connection template diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 640203a153c..6dbc527285f 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -924,13 +924,14 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) dest = ip_vs_trash_get_dest(svc, &daddr, dport); if (dest != NULL) { - IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " - "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", - NIPQUAD(daddr), ntohs(dport), - atomic_read(&dest->refcnt), - dest->vfwmark, - NIPQUAD(dest->vaddr.ip), - ntohs(dest->vport)); + IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " + "dest->refcnt=%d, service %u/%s:%u\n", + IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->vaddr), + ntohs(dest->vport)); + __ip_vs_update_dest(svc, dest, udest); /* @@ -1076,10 +1077,11 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) atomic_dec(&dest->svc->refcnt); kfree(dest); } else { - IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " - "dest->refcnt=%d\n", - NIPQUAD(dest->addr.ip), ntohs(dest->port), - atomic_read(&dest->refcnt)); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " + "dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); list_add(&dest->n_list, &ip_vs_dest_trash); atomic_inc(&dest->refcnt); } diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 3da2bb05ee7..de8ed73997c 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -490,19 +490,23 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, if (new_state != cp->state) { struct ip_vs_dest *dest = cp->dest; - IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" - "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n", - pp->name, - (state_off==TCP_DIR_OUTPUT)?"output ":"input ", - th->syn? 'S' : '.', - th->fin? 'F' : '.', - th->ack? 'A' : '.', - th->rst? 'R' : '.', - NIPQUAD(cp->daddr.ip), ntohs(cp->dport), - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - tcp_state_name(cp->state), - tcp_state_name(new_state), - atomic_read(&cp->refcnt)); + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pp->name, + ((state_off == TCP_DIR_OUTPUT) ? + "output " : "input "), + th->syn ? 'S' : '.', + th->fin ? 'F' : '.', + th->ack ? 'A' : '.', + th->rst ? 'R' : '.', + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + atomic_read(&cp->refcnt)); + if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && (new_state != IP_VS_TCP_S_ESTABLISHED)) { @@ -623,12 +627,15 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) break; spin_unlock(&tcp_app_lock); - IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" - "%u.%u.%u.%u:%u to app %s on port %u\n", - __func__, - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), - inc->name, ntohs(inc->port)); + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index fd8bd934cc0..5f2073e41cf 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -408,12 +408,15 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) break; spin_unlock(&udp_app_lock); - IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" - "%u.%u.%u.%u:%u to app %s on port %u\n", - __func__, - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), - inc->name, ntohs(inc->port)); + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); -- cgit v1.2.3 From 473b23d37b697c66ac0bfcfdcc9badf718e25d2a Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:54 +0200 Subject: IPVS: Activate IPv6 Netfilter hooks Register the previously defined or adapted netfilter hook functions for IPv6 as PF_INET6 hooks. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 5a7a81778b0..7d3de9db5ac 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1413,6 +1413,43 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC-1, }, +#ifdef CONFIG_IP_VS_IPV6 + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_IN, + .priority = 100, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp_v6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_NAT_SRC-1, + }, +#endif }; -- cgit v1.2.3 From f94fd041402e4e70d2b4ed00008b9bb857e6ae87 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:55 +0200 Subject: IPVS: Allow adding IPv6 services from userspace Allow adding IPv6 services through the genetlink interface and add checks to see if the chosen scheduler is supported with IPv6 and whether the supplied prefix length is sane. Make sure the service count exported via the sockopt interface only counts IPv4 services. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 53 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 5 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 6dbc527285f..7f89c588e58 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1177,6 +1177,19 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, goto out_mod_dec; } +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + if (!sched->supports_ipv6) { + ret = -EAFNOSUPPORT; + goto out_err; + } + if ((u->netmask < 1) || (u->netmask > 128)) { + ret = -EINVAL; + goto out_err; + } + } +#endif + svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); if (svc == NULL) { IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); @@ -1214,7 +1227,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, atomic_inc(&ip_vs_nullsvc_counter); ip_vs_new_estimator(&svc->stats); - ip_vs_num_services++; + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ip_vs_num_services++; /* Hash the service into the service table */ write_lock_bh(&__ip_vs_svc_lock); @@ -1265,6 +1281,19 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) } old_sched = sched; +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + if (!sched->supports_ipv6) { + ret = EAFNOSUPPORT; + goto out; + } + if ((u->netmask < 1) || (u->netmask > 128)) { + ret = EINVAL; + goto out; + } + } +#endif + write_lock_bh(&__ip_vs_svc_lock); /* @@ -1329,7 +1358,10 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; - ip_vs_num_services--; + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ip_vs_num_services--; + ip_vs_kill_estimator(&svc->stats); /* Unbind scheduler */ @@ -2212,6 +2244,10 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET) + continue; + if (count >= get->num_services) goto out; memset(&entry, 0, sizeof(entry)); @@ -2227,6 +2263,10 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET) + continue; + if (count >= get->num_services) goto out; memset(&entry, 0, sizeof(entry)); @@ -2584,7 +2624,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, if (!nl_service) return -EMSGSIZE; - NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, AF_INET); + NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); if (svc->fwmark) { NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); @@ -2691,8 +2731,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, return -EINVAL; usvc->af = nla_get_u16(nla_af); - /* For now, only support IPv4 */ - if (nla_get_u16(nla_af) != AF_INET) +#ifdef CONFIG_IP_VS_IPV6 + if (usvc->af != AF_INET && usvc->af != AF_INET6) +#else + if (usvc->af != AF_INET) +#endif return -EAFNOSUPPORT; if (nla_fwmark) { -- cgit v1.2.3 From 4856c84c1358b79852743ac64e50c1e9d5118f05 Mon Sep 17 00:00:00 2001 From: Malcolm Turnbull Date: Fri, 5 Sep 2008 11:17:13 +1000 Subject: ipvs: load balance IPv4 connections from a local process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows IPVS to load balance connections made by a local process. For example a proxy server running locally. External client --> pound:443 -> Local:443 --> IPVS:80 --> RealServer Signed-off-by: Siim Põder Signed-off-by: Malcolm Turnbull Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 224 +++++++++++++++++++++++----------------- net/ipv4/ipvs/ip_vs_proto_tcp.c | 4 +- 2 files changed, 134 insertions(+), 94 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 7d3de9db5ac..26e3d99bbee 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -651,12 +651,53 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, } #endif +/* Handle relevant response ICMP messages - forward to the right + * destination host. Used for NAT and local client. + */ +static int handle_response_icmp(struct sk_buff *skb, struct iphdr *iph, + struct iphdr *cih, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, + unsigned int offset, unsigned int ihl) +{ + unsigned int verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + IP_VS_ERR("shouldn't reach here, because the box is on the " + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, + "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + goto out; + } + + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + offset += 2 * sizeof(__u16); + if (!skb_make_writable(skb, offset)) + goto out; + + ip_vs_nat_icmp(skb, pp, cp, 1); + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->ipvs_property = 1; + verdict = NF_ACCEPT; + +out: + __ip_vs_conn_put(cp); + + return verdict; +} + /* * Handle ICMP messages in the inside-to-outside direction (outgoing). - * Find any that might be relevant, check against existing connections, - * forward to the right destination host if relevant. + * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. - * (Only used in VS/NAT) */ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) { @@ -666,7 +707,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; - unsigned int offset, ihl, verdict; + unsigned int offset, ihl; *related = 1; @@ -725,38 +766,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) if (!cp) return NF_ACCEPT; - verdict = NF_DROP; - - if (IP_VS_FWD_METHOD(cp) != 0) { - IP_VS_ERR("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); - goto out; - } - - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) - offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) - goto out; - - ip_vs_nat_icmp(skb, pp, cp, 1); - - /* do the statistics and put it back */ - ip_vs_out_stats(cp, skb); - - skb->ipvs_property = 1; - verdict = NF_ACCEPT; - - out: - __ip_vs_conn_put(cp); - - return verdict; + return handle_response_icmp(skb, iph, cih, cp, pp, offset, ihl); } #ifdef CONFIG_IP_VS_IPV6 @@ -875,10 +885,76 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) return th->rst; } +/* Handle response packets: rewrite addresses and send away... + * Used for NAT and local client. + */ +static unsigned int +handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int ihl) +{ + IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + + if (!skb_make_writable(skb, ihl)) + goto drop; + + /* mangle the packet */ + if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) + goto drop; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ipv6_hdr(skb)->saddr = cp->vaddr.in6; + else +#endif + { + ip_hdr(skb)->saddr = cp->vaddr.ip; + ip_send_check(ip_hdr(skb)); + } + + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (ip6_route_me_harder(skb) != 0) + goto drop; + } else +#endif + if (ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; + + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ + if (ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; + + IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + ip_vs_conn_put(cp); + + skb->ipvs_property = 1; + + LeaveFunction(11); + return NF_ACCEPT; + +drop: + ip_vs_conn_put(cp); + kfree_skb(skb); + return NF_STOLEN; +} + /* * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. - * Check if outgoing packet belongs to the established ip_vs_conn, - * rewrite addresses of the packet and send it on its way... + * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int ip_vs_out(unsigned int hooknum, struct sk_buff *skb, @@ -987,55 +1063,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, return NF_ACCEPT; } - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); - - if (!skb_make_writable(skb, iph.len)) - goto drop; - - /* mangle the packet */ - if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) - goto drop; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - ipv6_hdr(skb)->saddr = cp->vaddr.in6; - else -#endif - { - ip_hdr(skb)->saddr = cp->vaddr.ip; - ip_send_check(ip_hdr(skb)); - } - - /* For policy routing, packets originating from this - * machine itself may be routed differently to packets - * passing through. We want this packet to be routed as - * if it came from this machine itself. So re-compute - * the routing information. - */ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ip6_route_me_harder(skb) != 0) - goto drop; - } else -#endif - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); - - ip_vs_out_stats(cp, skb); - ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); - ip_vs_conn_put(cp); - - skb->ipvs_property = 1; - - LeaveFunction(11); - return NF_ACCEPT; - - drop: - ip_vs_conn_put(cp); - kfree_skb(skb); - return NF_STOLEN; + return handle_response(af, skb, pp, cp, iph.len); } @@ -1111,8 +1139,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) ip_vs_fill_iphdr(AF_INET, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); - if (!cp) + if (!cp) { + /* The packet could also belong to a local client */ + cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + if (cp) + return handle_response_icmp(skb, iph, cih, cp, pp, + offset, ihl); return NF_ACCEPT; + } verdict = NF_DROP; @@ -1244,11 +1278,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); /* - * Big tappo: only PACKET_HOST (neither loopback nor mcasts) - * ... don't know why 1st test DOES NOT include 2nd (?) + * Big tappo: only PACKET_HOST, including loopback for local client + * Don't handle local packets on IPv6 for now */ - if (unlikely(skb->pkt_type != PACKET_HOST - || skb->dev->flags & IFF_LOOPBACK || skb->sk)) { + if (unlikely(skb->pkt_type != PACKET_HOST || + (af == AF_INET6 || (skb->dev->flags & IFF_LOOPBACK || + skb->sk)))) { IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", skb->pkt_type, iph.protocol, @@ -1277,6 +1312,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, if (unlikely(!cp)) { int v; + /* For local client packets, it could be a response */ + cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); + if (cp) + return handle_response(af, skb, pp, cp, iph.len); + if (!pp->conn_schedule(af, skb, pp, &v, &cp)) return v; } diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index de8ed73997c..808e8be0280 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -166,7 +166,7 @@ tcp_snat_handler(struct sk_buff *skb, tcph->source = cp->vport; /* Adjust TCP checksums */ - if (!cp->app) { + if (!cp->app && (tcph->check != 0)) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); @@ -235,7 +235,7 @@ tcp_dnat_handler(struct sk_buff *skb, /* * Adjust TCP checksums */ - if (!cp->app) { + if (!cp->app && (tcph->check != 0)) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); -- cgit v1.2.3 From f2428ed5e7bc89c7716ead22748cb5d076e204f0 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 5 Sep 2008 11:17:14 +1000 Subject: ipvs: load balance ipv6 connections from a local process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows IPVS to load balance IPv6 connections made by a local process. For example a proxy server running locally. External client --> pound:443 -> Local:443 --> IPVS:80 --> RealServer This is an extenstion to the IPv4 work done in this area by Siim Põder and Malcolm Turnbull. Cc: Siim Põder Cc: Malcolm Turnbull Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 91 +++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 50 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 26e3d99bbee..05797a5ce15 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -654,8 +654,9 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, /* Handle relevant response ICMP messages - forward to the right * destination host. Used for NAT and local client. */ -static int handle_response_icmp(struct sk_buff *skb, struct iphdr *iph, - struct iphdr *cih, struct ip_vs_conn *cp, +static int handle_response_icmp(int af, struct sk_buff *skb, + union nf_inet_addr *snet, + __u8 protocol, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, unsigned int offset, unsigned int ihl) { @@ -669,18 +670,22 @@ static int handle_response_icmp(struct sk_buff *skb, struct iphdr *iph, /* Ensure the checksum is correct */ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { /* Failed checksum! */ - IP_VS_DBG(1, - "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); + IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", + IP_VS_DBG_ADDR(af, snet)); goto out; } - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol) offset += 2 * sizeof(__u16); if (!skb_make_writable(skb, offset)) goto out; - ip_vs_nat_icmp(skb, pp, cp, 1); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_nat_icmp_v6(skb, pp, cp, 1); + else +#endif + ip_vs_nat_icmp(skb, pp, cp, 1); /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); @@ -708,6 +713,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl; + union nf_inet_addr snet; *related = 1; @@ -766,7 +772,9 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) if (!cp) return NF_ACCEPT; - return handle_response_icmp(skb, iph, cih, cp, pp, offset, ihl); + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, + pp, offset, ihl); } #ifdef CONFIG_IP_VS_IPV6 @@ -779,7 +787,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; - unsigned int offset, verdict; + unsigned int offset; + union nf_inet_addr snet; *related = 1; @@ -838,40 +847,9 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) if (!cp) return NF_ACCEPT; - verdict = NF_DROP; - - if (IP_VS_FWD_METHOD(cp) != 0) { - IP_VS_ERR("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) - && ip_vs_checksum_complete(skb, sizeof(struct ipv6hdr))) { - /* Failed checksum! */ - IP_VS_DBG(1, "Forward ICMPv6: failed checksum from " - NIP6_FMT "!\n", - NIP6(iph->saddr)); - goto out; - } - - if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) - offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) - goto out; - - ip_vs_nat_icmp_v6(skb, pp, cp, 1); - - /* do the statistics and put it back */ - ip_vs_out_stats(cp, skb); - - skb->ipvs_property = 1; - verdict = NF_ACCEPT; - -out: - __ip_vs_conn_put(cp); - - return verdict; + snet.in6 = iph->saddr; + return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, + pp, offset, sizeof(struct ipv6hdr)); } #endif @@ -1055,7 +1033,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP; - } + } } } IP_VS_DBG_PKT(12, pp, skb, 0, @@ -1083,6 +1061,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl, verdict; + union nf_inet_addr snet; *related = 1; @@ -1142,9 +1121,12 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (!cp) { /* The packet could also belong to a local client */ cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); - if (cp) - return handle_response_icmp(skb, iph, cih, cp, pp, + if (cp) { + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, + cih->protocol, cp, pp, offset, ihl); + } return NF_ACCEPT; } @@ -1183,6 +1165,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, verdict; + union nf_inet_addr snet; *related = 1; @@ -1240,8 +1223,18 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) ip_vs_fill_iphdr(AF_INET6, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (!cp) + if (!cp) { + /* The packet could also belong to a local client */ + cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (cp) { + snet.in6 = iph->saddr; + return handle_response_icmp(AF_INET6, skb, &snet, + cih->nexthdr, + cp, pp, offset, + sizeof(struct ipv6hdr)); + } return NF_ACCEPT; + } verdict = NF_DROP; @@ -1281,9 +1274,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, * Big tappo: only PACKET_HOST, including loopback for local client * Don't handle local packets on IPv6 for now */ - if (unlikely(skb->pkt_type != PACKET_HOST || - (af == AF_INET6 || (skb->dev->flags & IFF_LOOPBACK || - skb->sk)))) { + if (unlikely(skb->pkt_type != PACKET_HOST)) { IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", skb->pkt_type, iph.protocol, -- cgit v1.2.3 From cd9fe6c4f0afe334862c871bf5d32770daa748ec Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Fri, 5 Sep 2008 13:46:00 +0200 Subject: ipvs: Use pointer to address from sync message We want a pointer to it, not the value casted to a pointer. Signed-off-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_sync.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 40647edf102..28237a5f62e 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -397,11 +397,11 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) flags &= ~IP_VS_CONN_F_INACTIVE; } cp = ip_vs_conn_new(AF_INET, s->protocol, - (union nf_inet_addr *)s->caddr, + (union nf_inet_addr *)&s->caddr, s->cport, - (union nf_inet_addr *)s->vaddr, + (union nf_inet_addr *)&s->vaddr, s->vport, - (union nf_inet_addr *)s->daddr, + (union nf_inet_addr *)&s->daddr, s->dport, flags, dest); if (dest) -- cgit v1.2.3 From a5ba4bf2732c85d8c95e0432966f79aa2b159478 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Fri, 5 Sep 2008 13:47:37 +0200 Subject: ipvs: Return negative error values from ip_vs_edit_service() Like the other code in this function does. Signed-off-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 7f89c588e58..d2dc05a843f 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1284,11 +1284,11 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6) { if (!sched->supports_ipv6) { - ret = EAFNOSUPPORT; + ret = -EAFNOSUPPORT; goto out; } if ((u->netmask < 1) || (u->netmask > 128)) { - ret = EINVAL; + ret = -EINVAL; goto out; } } -- cgit v1.2.3 From 77eb851630bba8ea9962a1b2f01b23bd5d57c58e Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Fri, 5 Sep 2008 14:43:00 +0200 Subject: ipvs: Mark tcp/udp v4 and v6 debug functions static They are only used in this file, so they should be static Signed-off-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_proto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index 50f6215beda..b06da1c3445 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -151,7 +151,7 @@ const char * ip_vs_state_name(__u16 proto, int state) } -void +static void ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, @@ -190,7 +190,7 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, } #ifdef CONFIG_IP_VS_IPV6 -void +static void ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, -- cgit v1.2.3 From 3bfb92f4073aa829f8e67e459d54c79306ddbd73 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Fri, 5 Sep 2008 16:53:49 +0200 Subject: ipvs: Reject ipv6 link-local addresses for destinations We can't use non-local link-local addresses for destinations, without knowing the interface on which we can reach the address. Reject them for now. Signed-off-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index d2dc05a843f..e53efe41f01 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -838,7 +838,8 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); - if (!(atype & IPV6_ADDR_UNICAST) && + if ((!(atype & IPV6_ADDR_UNICAST) || + atype & IPV6_ADDR_LINKLOCAL) && !__ip_vs_addr_is_local_v6(&udest->addr.in6)) return -EINVAL; } else -- cgit v1.2.3 From 5af149cc34143c4e24abcc6355b29b3161eff3b8 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 8 Sep 2008 09:34:45 +1000 Subject: IPVS: fix bogus indentation Sorry, this was my error. Thanks to Julius Volz for pointing it out. Signed-off-by: Simon Horman Acked-by: Julius Volz --- net/ipv4/ipvs/ip_vs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 05797a5ce15..1f4f3b94359 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1033,7 +1033,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP; - } + } } } IP_VS_DBG_PKT(12, pp, skb, 0, -- cgit v1.2.3 From 178f5e494e3c0252d06a9b1473016addff71e01e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 8 Sep 2008 09:34:46 +1000 Subject: IPVS: use ipv6_addr_copy() It is standard to use ipv6_addr_copy() to fill in the in6 element of a union nf_inet_addr snet. Thanks to Julius Volz for pointing this out. Cc: Brian Haley Signed-off-by: Simon Horman Acked-by: Julius Volz --- net/ipv4/ipvs/ip_vs_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 1f4f3b94359..f5180ac56be 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -847,7 +847,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) if (!cp) return NF_ACCEPT; - snet.in6 = iph->saddr; + ipv6_addr_copy(&snet.in6, &iph->saddr); return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, pp, offset, sizeof(struct ipv6hdr)); } @@ -1227,7 +1227,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) /* The packet could also belong to a local client */ cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); if (cp) { - snet.in6 = iph->saddr; + ipv6_addr_copy(&snet.in6, &iph->saddr); return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, pp, offset, -- cgit v1.2.3 From 503e81f65adac596a0275ea0230f2ae1fd64c301 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 8 Sep 2008 12:04:21 +1000 Subject: ipvs: handle PARTIAL_CHECKSUM Now that LVS can load balance locally generated traffic, packets may come from the loopback device and thus may have a partial checksum. The existing code allows for the case where there is no checksum at all for TCP, however Herbert Xu has confirmed that this is not legal. Signed-off-by: Simon Horman Acked-by: Julius Volz --- net/ipv4/ipvs/ip_vs_proto_tcp.c | 37 +++++++++++++++++++++++++++++++++++-- net/ipv4/ipvs/ip_vs_proto_udp.c | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 4 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 808e8be0280..537f616776d 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -134,12 +134,34 @@ tcp_fast_csum_update(int af, struct tcphdr *tcph, } +static inline void +tcp_partial_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(tcph->check)))); +} + + static int tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct tcphdr *tcph; unsigned int tcphoff; + int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -147,6 +169,7 @@ tcp_snat_handler(struct sk_buff *skb, else #endif tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) @@ -166,7 +189,11 @@ tcp_snat_handler(struct sk_buff *skb, tcph->source = cp->vport; /* Adjust TCP checksums */ - if (!cp->app && (tcph->check != 0)) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - tcphoff)); + } else if (!cp->app) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); @@ -204,6 +231,7 @@ tcp_dnat_handler(struct sk_buff *skb, { struct tcphdr *tcph; unsigned int tcphoff; + int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -211,6 +239,7 @@ tcp_dnat_handler(struct sk_buff *skb, else #endif tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) @@ -235,7 +264,11 @@ tcp_dnat_handler(struct sk_buff *skb, /* * Adjust TCP checksums */ - if (!cp->app && (tcph->check != 0)) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - tcphoff)); + } else if (!cp->app) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 5f2073e41cf..e3ee26bd1de 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -141,12 +141,34 @@ udp_fast_csum_update(int af, struct udphdr *uhdr, uhdr->check = CSUM_MANGLED_0; } +static inline void +udp_partial_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(uhdr->check)))); +} + + static int udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct udphdr *udph; unsigned int udphoff; + int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -154,6 +176,7 @@ udp_snat_handler(struct sk_buff *skb, else #endif udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ if (!skb_make_writable(skb, udphoff+sizeof(*udph))) @@ -177,7 +200,11 @@ udp_snat_handler(struct sk_buff *skb, /* * Adjust UDP checksums */ - if (!cp->app && (udph->check != 0)) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - udphoff)); + } else if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); @@ -216,6 +243,7 @@ udp_dnat_handler(struct sk_buff *skb, { struct udphdr *udph; unsigned int udphoff; + int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -223,6 +251,7 @@ udp_dnat_handler(struct sk_buff *skb, else #endif udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ if (!skb_make_writable(skb, udphoff+sizeof(*udph))) @@ -247,7 +276,11 @@ udp_dnat_handler(struct sk_buff *skb, /* * Adjust UDP checksums */ - if (!cp->app && (udph->check != 0)) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - udphoff)); + } else if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); -- cgit v1.2.3 From 9d7f2a2b1aa9e55537a053c68bdbd119fc479dd3 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Mon, 8 Sep 2008 14:55:42 +0200 Subject: IPVS: Remove incorrect ip_route_me_harder(), fix IPv6 Remove an incorrect ip_route_me_harder() that was probably a result of merging my IPv6 patches with the local client patches. With this, IPv6+NAT are working again. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index f5180ac56be..bdc92d73fbe 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -904,15 +904,6 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, if (ip_route_me_harder(skb, RTN_LOCAL) != 0) goto drop; - /* For policy routing, packets originating from this - * machine itself may be routed differently to packets - * passing through. We want this packet to be routed as - * if it came from this machine itself. So re-compute - * the routing information. - */ - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); -- cgit v1.2.3 From 2206a3f5b75be5dadf11541961bd7c924857eb5d Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Mon, 8 Sep 2008 13:38:11 +0200 Subject: ipvs: Restrict connection table size via Kconfig Instead of checking the value in include/net/ip_vs.h, we can just restrict the range in our Kconfig file. This will prevent values outside of the range early. Signed-off-by: Sven Wegener Reviewed-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig index 794cecb249a..de6004de80b 100644 --- a/net/ipv4/ipvs/Kconfig +++ b/net/ipv4/ipvs/Kconfig @@ -41,7 +41,8 @@ config IP_VS_DEBUG config IP_VS_TAB_BITS int "IPVS connection table size (the Nth power of 2)" - default "12" + range 8 20 + default 12 ---help--- The IPVS connection hash table uses the chaining scheme to handle hash collisions. Using a big IPVS connection hash table will greatly -- cgit v1.2.3 From e9c0ce232e7a36daae1ca08282609d7f0c57c567 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Mon, 8 Sep 2008 13:39:04 +0200 Subject: ipvs: Embed user stats structure into kernel stats structure Instead of duplicating the fields, integrate a user stats structure into the kernel stats structure. This is more robust when the members are changed, because they are now automatically kept in sync. Signed-off-by: Sven Wegener Reviewed-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 30 +++++++++++++------------- net/ipv4/ipvs/ip_vs_ctl.c | 53 ++++++++++++++++++---------------------------- net/ipv4/ipvs/ip_vs_est.c | 40 +++++++++++++++++----------------- 3 files changed, 56 insertions(+), 67 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index bdc92d73fbe..80a4fcf33a5 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -102,18 +102,18 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_dest *dest = cp->dest; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { spin_lock(&dest->stats.lock); - dest->stats.inpkts++; - dest->stats.inbytes += skb->len; + dest->stats.ustats.inpkts++; + dest->stats.ustats.inbytes += skb->len; spin_unlock(&dest->stats.lock); spin_lock(&dest->svc->stats.lock); - dest->svc->stats.inpkts++; - dest->svc->stats.inbytes += skb->len; + dest->svc->stats.ustats.inpkts++; + dest->svc->stats.ustats.inbytes += skb->len; spin_unlock(&dest->svc->stats.lock); spin_lock(&ip_vs_stats.lock); - ip_vs_stats.inpkts++; - ip_vs_stats.inbytes += skb->len; + ip_vs_stats.ustats.inpkts++; + ip_vs_stats.ustats.inbytes += skb->len; spin_unlock(&ip_vs_stats.lock); } } @@ -125,18 +125,18 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_dest *dest = cp->dest; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { spin_lock(&dest->stats.lock); - dest->stats.outpkts++; - dest->stats.outbytes += skb->len; + dest->stats.ustats.outpkts++; + dest->stats.ustats.outbytes += skb->len; spin_unlock(&dest->stats.lock); spin_lock(&dest->svc->stats.lock); - dest->svc->stats.outpkts++; - dest->svc->stats.outbytes += skb->len; + dest->svc->stats.ustats.outpkts++; + dest->svc->stats.ustats.outbytes += skb->len; spin_unlock(&dest->svc->stats.lock); spin_lock(&ip_vs_stats.lock); - ip_vs_stats.outpkts++; - ip_vs_stats.outbytes += skb->len; + ip_vs_stats.ustats.outpkts++; + ip_vs_stats.ustats.outbytes += skb->len; spin_unlock(&ip_vs_stats.lock); } } @@ -146,15 +146,15 @@ static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) { spin_lock(&cp->dest->stats.lock); - cp->dest->stats.conns++; + cp->dest->stats.ustats.conns++; spin_unlock(&cp->dest->stats.lock); spin_lock(&svc->stats.lock); - svc->stats.conns++; + svc->stats.ustats.conns++; spin_unlock(&svc->stats.lock); spin_lock(&ip_vs_stats.lock); - ip_vs_stats.conns++; + ip_vs_stats.ustats.conns++; spin_unlock(&ip_vs_stats.lock); } diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index e53efe41f01..993a83fb0d5 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -744,18 +744,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) { spin_lock_bh(&stats->lock); - stats->conns = 0; - stats->inpkts = 0; - stats->outpkts = 0; - stats->inbytes = 0; - stats->outbytes = 0; - - stats->cps = 0; - stats->inpps = 0; - stats->outpps = 0; - stats->inbps = 0; - stats->outbps = 0; - + memset(&stats->ustats, 0, sizeof(stats->ustats)); ip_vs_zero_estimator(stats); spin_unlock_bh(&stats->lock); @@ -1964,20 +1953,20 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) " Conns Packets Packets Bytes Bytes\n"); spin_lock_bh(&ip_vs_stats.lock); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns, - ip_vs_stats.inpkts, ip_vs_stats.outpkts, - (unsigned long long) ip_vs_stats.inbytes, - (unsigned long long) ip_vs_stats.outbytes); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, + ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, + (unsigned long long) ip_vs_stats.ustats.inbytes, + (unsigned long long) ip_vs_stats.ustats.outbytes); /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq,"%8X %8X %8X %16X %16X\n", - ip_vs_stats.cps, - ip_vs_stats.inpps, - ip_vs_stats.outpps, - ip_vs_stats.inbps, - ip_vs_stats.outbps); + ip_vs_stats.ustats.cps, + ip_vs_stats.ustats.inpps, + ip_vs_stats.ustats.outpps, + ip_vs_stats.ustats.inbps, + ip_vs_stats.ustats.outbps); spin_unlock_bh(&ip_vs_stats.lock); return 0; @@ -2215,7 +2204,7 @@ static void ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) { spin_lock_bh(&src->lock); - memcpy(dst, src, (char*)&src->lock - (char*)src); + memcpy(dst, &src->ustats, sizeof(*dst)); spin_unlock_bh(&src->lock); } @@ -2591,16 +2580,16 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, spin_lock_bh(&stats->lock); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->conns); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->inpkts); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->outpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->inbytes); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->outbytes); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->cps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->inpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->outpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->inbps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->outbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); spin_unlock_bh(&stats->lock); diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 4fb620ec208..2eb2860dabb 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -65,37 +65,37 @@ static void estimation_timer(unsigned long arg) s = container_of(e, struct ip_vs_stats, est); spin_lock(&s->lock); - n_conns = s->conns; - n_inpkts = s->inpkts; - n_outpkts = s->outpkts; - n_inbytes = s->inbytes; - n_outbytes = s->outbytes; + n_conns = s->ustats.conns; + n_inpkts = s->ustats.inpkts; + n_outpkts = s->ustats.outpkts; + n_inbytes = s->ustats.inbytes; + n_outbytes = s->ustats.outbytes; /* scaled by 2^10, but divided 2 seconds */ rate = (n_conns - e->last_conns)<<9; e->last_conns = n_conns; e->cps += ((long)rate - (long)e->cps)>>2; - s->cps = (e->cps+0x1FF)>>10; + s->ustats.cps = (e->cps+0x1FF)>>10; rate = (n_inpkts - e->last_inpkts)<<9; e->last_inpkts = n_inpkts; e->inpps += ((long)rate - (long)e->inpps)>>2; - s->inpps = (e->inpps+0x1FF)>>10; + s->ustats.inpps = (e->inpps+0x1FF)>>10; rate = (n_outpkts - e->last_outpkts)<<9; e->last_outpkts = n_outpkts; e->outpps += ((long)rate - (long)e->outpps)>>2; - s->outpps = (e->outpps+0x1FF)>>10; + s->ustats.outpps = (e->outpps+0x1FF)>>10; rate = (n_inbytes - e->last_inbytes)<<4; e->last_inbytes = n_inbytes; e->inbps += ((long)rate - (long)e->inbps)>>2; - s->inbps = (e->inbps+0xF)>>5; + s->ustats.inbps = (e->inbps+0xF)>>5; rate = (n_outbytes - e->last_outbytes)<<4; e->last_outbytes = n_outbytes; e->outbps += ((long)rate - (long)e->outbps)>>2; - s->outbps = (e->outbps+0xF)>>5; + s->ustats.outbps = (e->outbps+0xF)>>5; spin_unlock(&s->lock); } spin_unlock(&est_lock); @@ -108,20 +108,20 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats) INIT_LIST_HEAD(&est->list); - est->last_conns = stats->conns; - est->cps = stats->cps<<10; + est->last_conns = stats->ustats.conns; + est->cps = stats->ustats.cps<<10; - est->last_inpkts = stats->inpkts; - est->inpps = stats->inpps<<10; + est->last_inpkts = stats->ustats.inpkts; + est->inpps = stats->ustats.inpps<<10; - est->last_outpkts = stats->outpkts; - est->outpps = stats->outpps<<10; + est->last_outpkts = stats->ustats.outpkts; + est->outpps = stats->ustats.outpps<<10; - est->last_inbytes = stats->inbytes; - est->inbps = stats->inbps<<5; + est->last_inbytes = stats->ustats.inbytes; + est->inbps = stats->ustats.inbps<<5; - est->last_outbytes = stats->outbytes; - est->outbps = stats->outbps<<5; + est->last_outbytes = stats->ustats.outbytes; + est->outbps = stats->ustats.outbps<<5; spin_lock_bh(&est_lock); list_add(&est->list, &est_list); -- cgit v1.2.3 From 63f2c0464875b6ef2132cecb19b2a5abbf061227 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 12 Sep 2008 23:23:50 -0700 Subject: net: ip_vs_proto_{tcp,udp} build fix Signed-off-by: Stephen Rothwell Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_proto_tcp.c | 1 + net/ipv4/ipvs/ip_vs_proto_udp.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 537f616776d..dd4566ea2bf 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -18,6 +18,7 @@ #include /* for tcphdr */ #include #include /* for csum_tcpudp_magic */ +#include #include #include diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index e3ee26bd1de..6eb6039d634 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -22,6 +22,7 @@ #include #include +#include static struct ip_vs_conn * udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, -- cgit v1.2.3 From 9e691ed68d94ab3047e028736641445b4cf74d67 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 17 Sep 2008 10:10:41 +1000 Subject: ipvs: only unlock in ip_vs_edit_service() if already locked Jumping to out unlocks __ip_vs_svc_lock, but that lock is not taken until after code that may jump to out. This problem was detected by sparse. make C=1 CHECK net/ipv4/ipvs/ip_vs_ctl.c net/ipv4/ipvs/ip_vs_ctl.c:1332:2: warning: context imbalance in 'ip_vs_edit_service' - unexpected unlock Acked-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 993a83fb0d5..60ca24b9ec0 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1305,7 +1305,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) */ if ((ret = ip_vs_unbind_scheduler(svc))) { old_sched = sched; - goto out; + goto out_unlock; } /* @@ -1324,12 +1324,13 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) */ ip_vs_bind_scheduler(svc, old_sched); old_sched = sched; - goto out; + goto out_unlock; } } - out: + out_unlock: write_unlock_bh(&__ip_vs_svc_lock); + out: if (old_sched) ip_vs_scheduler_put(old_sched); -- cgit v1.2.3 From dff630ddad3884b99fae3ad92f5eccbf26618679 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 17 Sep 2008 10:10:42 +1000 Subject: ipvs: supply a valid 0 address to ip_vs_conn_new() ip_vs_conn_new expects a union nf_inet_addr as the type for its address parameters, not a plain integer. This problem was detected by sparse. make C=1 CHECK net/ipv4/ipvs/ip_vs_core.c net/ipv4/ipvs/ip_vs_core.c:469:9: warning: Using plain integer as NULL pointer Acked-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 80a4fcf33a5..ece748dbd0c 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -457,6 +457,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { int ret, cs; struct ip_vs_conn *cp; + union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; ip_vs_service_put(svc); @@ -465,7 +466,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, cp = ip_vs_conn_new(svc->af, iph.protocol, &iph.saddr, pptr[0], &iph.daddr, pptr[1], - 0, 0, + &daddr, 0, IP_VS_CONN_F_BYPASS, NULL); if (cp == NULL) -- cgit v1.2.3 From 563e94f072714657a82a59a3bf81a719a6a25591 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 17 Sep 2008 10:10:42 +1000 Subject: ipvs: add __aquire/__release annotations to ip_vs_info_seq_start/ip_vs_info_seq_stop This teaches sparse that the following are not problems: make C=1 CHECK net/ipv4/ipvs/ip_vs_ctl.c net/ipv4/ipvs/ip_vs_ctl.c:1793:14: warning: context imbalance in 'ip_vs_info_seq_start' - wrong count at exit net/ipv4/ipvs/ip_vs_ctl.c:1842:13: warning: context imbalance in 'ip_vs_info_seq_stop' - unexpected unlock Acked-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 60ca24b9ec0..771551d8fba 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1787,6 +1787,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) } static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) +__acquires(__ip_vs_svc_lock) { read_lock_bh(&__ip_vs_svc_lock); @@ -1840,6 +1841,7 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) +__releases(__ip_vs_svc_lock) { read_unlock_bh(&__ip_vs_svc_lock); } -- cgit v1.2.3 From d286600e199aa2f1058a1f883d234e73626304d2 Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Tue, 16 Sep 2008 11:11:11 -0400 Subject: ipvs: change some __constant_htons() to htons() Change __contant_htons() to htons() in the IPVS code when not in an initializer. -Brian Signed-off-by: Brian Haley Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_proto.c | 2 +- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index b06da1c3445..0791f9e08fe 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -237,7 +237,7 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == __constant_htons(ETH_P_IPV6)) + if (skb->protocol == htons(ETH_P_IPV6)) ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); else #endif diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c index 2b18a78d039..80ab0c8e5b4 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c @@ -167,7 +167,7 @@ ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == __constant_htons(ETH_P_IPV6)) + if (skb->protocol == htons(ETH_P_IPV6)) ah_esp_debug_packet_v6(pp, skb, offset, msg); else #endif -- cgit v1.2.3 From 6067804047b64dde89f4f133fc7eba48ee44107d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 20 Sep 2008 22:20:49 -0700 Subject: net: Use hton[sl]() instead of __constant_hton[sl]() where applicable Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index ece748dbd0c..958abf3e5f8 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -938,7 +938,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, EnterFunction(11); - af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6; + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; if (skb->ipvs_property) return NF_ACCEPT; @@ -1258,7 +1258,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, struct ip_vs_conn *cp; int ret, restart, af; - af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6; + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); -- cgit v1.2.3 From e6f225ebb7c35fe30fdf8608927c5cf8fce6de7d Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Fri, 19 Sep 2008 20:41:56 +0200 Subject: ipvs: Restrict sync message to 255 connections The nr_conns variable in the sync message header is only eight bits wide and will overflow on interfaces with a large MTU. As a result the backup won't parse all connections contained in the sync buffer. On regular ethernet with an MTU of 1500 this isn't a problem, because we can't overflow the value, but consider jumbo frames being used on a cross-over connection between both directors. We now restrict the size of the sync buffer, so that we never put more than 255 connections into a single sync buffer. Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_sync.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 28237a5f62e..de5e7e118ee 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -99,6 +100,7 @@ struct ip_vs_sync_thread_data { */ #define SYNC_MESG_HEADER_LEN 4 +#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ struct ip_vs_sync_mesg { __u8 nr_conns; @@ -516,8 +518,8 @@ static int set_sync_mesg_maxlen(int sync_state) num = (dev->mtu - sizeof(struct iphdr) - sizeof(struct udphdr) - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - sync_send_mesg_maxlen = - SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num; + sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + + SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); IP_VS_DBG(7, "setting the maximum length of sync sending " "message %d.\n", sync_send_mesg_maxlen); } else if (sync_state == IP_VS_STATE_BACKUP) { -- cgit v1.2.3 From 8d5803bf6fbe5264000afc8c34bff08e8ecc023b Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Sat, 20 Sep 2008 11:48:33 +0200 Subject: ipvs: Fix unused label warning Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 771551d8fba..0302cf3e503 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1330,7 +1330,9 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) out_unlock: write_unlock_bh(&__ip_vs_svc_lock); +#ifdef CONFIG_IP_VS_IPV6 out: +#endif if (old_sched) ip_vs_scheduler_put(old_sched); -- cgit v1.2.3 From cb7f6a7b716e801097b564dec3ccb58d330aef56 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Fri, 19 Sep 2008 12:32:57 +0200 Subject: IPVS: Move IPVS to net/netfilter/ipvs Since IPVS now has partial IPv6 support, this patch moves IPVS from net/ipv4/ipvs to net/netfilter/ipvs. It's a result of: $ git mv net/ipv4/ipvs net/netfilter and adapting the relevant Kconfigs/Makefiles to the new path. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/Kconfig | 239 --- net/ipv4/ipvs/Makefile | 33 - net/ipv4/ipvs/ip_vs_app.c | 622 ------- net/ipv4/ipvs/ip_vs_conn.c | 1110 ------------ net/ipv4/ipvs/ip_vs_core.c | 1542 ---------------- net/ipv4/ipvs/ip_vs_ctl.c | 3443 ------------------------------------ net/ipv4/ipvs/ip_vs_est.c | 166 -- net/ipv4/ipvs/ip_vs_ftp.c | 410 ----- net/ipv4/ipvs/ip_vs_lblc.c | 555 ------ net/ipv4/ipvs/ip_vs_lblcr.c | 755 -------- net/ipv4/ipvs/ip_vs_lc.c | 103 -- net/ipv4/ipvs/ip_vs_nq.c | 138 -- net/ipv4/ipvs/ip_vs_proto.c | 288 --- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 235 --- net/ipv4/ipvs/ip_vs_proto_tcp.c | 732 -------- net/ipv4/ipvs/ip_vs_proto_udp.c | 533 ------ net/ipv4/ipvs/ip_vs_rr.c | 112 -- net/ipv4/ipvs/ip_vs_sched.c | 251 --- net/ipv4/ipvs/ip_vs_sed.c | 140 -- net/ipv4/ipvs/ip_vs_sh.c | 258 --- net/ipv4/ipvs/ip_vs_sync.c | 942 ---------- net/ipv4/ipvs/ip_vs_wlc.c | 128 -- net/ipv4/ipvs/ip_vs_wrr.c | 237 --- net/ipv4/ipvs/ip_vs_xmit.c | 1004 ----------- 24 files changed, 13976 deletions(-) delete mode 100644 net/ipv4/ipvs/Kconfig delete mode 100644 net/ipv4/ipvs/Makefile delete mode 100644 net/ipv4/ipvs/ip_vs_app.c delete mode 100644 net/ipv4/ipvs/ip_vs_conn.c delete mode 100644 net/ipv4/ipvs/ip_vs_core.c delete mode 100644 net/ipv4/ipvs/ip_vs_ctl.c delete mode 100644 net/ipv4/ipvs/ip_vs_est.c delete mode 100644 net/ipv4/ipvs/ip_vs_ftp.c delete mode 100644 net/ipv4/ipvs/ip_vs_lblc.c delete mode 100644 net/ipv4/ipvs/ip_vs_lblcr.c delete mode 100644 net/ipv4/ipvs/ip_vs_lc.c delete mode 100644 net/ipv4/ipvs/ip_vs_nq.c delete mode 100644 net/ipv4/ipvs/ip_vs_proto.c delete mode 100644 net/ipv4/ipvs/ip_vs_proto_ah_esp.c delete mode 100644 net/ipv4/ipvs/ip_vs_proto_tcp.c delete mode 100644 net/ipv4/ipvs/ip_vs_proto_udp.c delete mode 100644 net/ipv4/ipvs/ip_vs_rr.c delete mode 100644 net/ipv4/ipvs/ip_vs_sched.c delete mode 100644 net/ipv4/ipvs/ip_vs_sed.c delete mode 100644 net/ipv4/ipvs/ip_vs_sh.c delete mode 100644 net/ipv4/ipvs/ip_vs_sync.c delete mode 100644 net/ipv4/ipvs/ip_vs_wlc.c delete mode 100644 net/ipv4/ipvs/ip_vs_wrr.c delete mode 100644 net/ipv4/ipvs/ip_vs_xmit.c (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig deleted file mode 100644 index de6004de80b..00000000000 --- a/net/ipv4/ipvs/Kconfig +++ /dev/null @@ -1,239 +0,0 @@ -# -# IP Virtual Server configuration -# -menuconfig IP_VS - tristate "IP virtual server support (EXPERIMENTAL)" - depends on NETFILTER - ---help--- - IP Virtual Server support will let you build a high-performance - virtual server based on cluster of two or more real servers. This - option must be enabled for at least one of the clustered computers - that will take care of intercepting incoming connections to a - single IP address and scheduling them to real servers. - - Three request dispatching techniques are implemented, they are - virtual server via NAT, virtual server via tunneling and virtual - server via direct routing. The several scheduling algorithms can - be used to choose which server the connection is directed to, - thus load balancing can be achieved among the servers. For more - information and its administration program, please visit the - following URL: . - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -if IP_VS - -config IP_VS_IPV6 - bool "IPv6 support for IPVS (DANGEROUS)" - depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) - ---help--- - Add IPv6 support to IPVS. This is incomplete and might be dangerous. - - Say N if unsure. - -config IP_VS_DEBUG - bool "IP virtual server debugging" - ---help--- - Say Y here if you want to get additional messages useful in - debugging the IP virtual server code. You can change the debug - level in /proc/sys/net/ipv4/vs/debug_level - -config IP_VS_TAB_BITS - int "IPVS connection table size (the Nth power of 2)" - range 8 20 - default 12 - ---help--- - The IPVS connection hash table uses the chaining scheme to handle - hash collisions. Using a big IPVS connection hash table will greatly - reduce conflicts when there are hundreds of thousands of connections - in the hash table. - - Note the table size must be power of 2. The table size will be the - value of 2 to the your input number power. The number to choose is - from 8 to 20, the default number is 12, which means the table size - is 4096. Don't input the number too small, otherwise you will lose - performance on it. You can adapt the table size yourself, according - to your virtual server application. It is good to set the table size - not far less than the number of connections per second multiplying - average lasting time of connection in the table. For example, your - virtual server gets 200 connections per second, the connection lasts - for 200 seconds in average in the connection table, the table size - should be not far less than 200x200, it is good to set the table - size 32768 (2**15). - - Another note that each connection occupies 128 bytes effectively and - each hash entry uses 8 bytes, so you can estimate how much memory is - needed for your box. - -comment "IPVS transport protocol load balancing support" - -config IP_VS_PROTO_TCP - bool "TCP load balancing support" - ---help--- - This option enables support for load balancing TCP transport - protocol. Say Y if unsure. - -config IP_VS_PROTO_UDP - bool "UDP load balancing support" - ---help--- - This option enables support for load balancing UDP transport - protocol. Say Y if unsure. - -config IP_VS_PROTO_AH_ESP - bool - depends on UNDEFINED - -config IP_VS_PROTO_ESP - bool "ESP load balancing support" - select IP_VS_PROTO_AH_ESP - ---help--- - This option enables support for load balancing ESP (Encapsulation - Security Payload) transport protocol. Say Y if unsure. - -config IP_VS_PROTO_AH - bool "AH load balancing support" - select IP_VS_PROTO_AH_ESP - ---help--- - This option enables support for load balancing AH (Authentication - Header) transport protocol. Say Y if unsure. - -comment "IPVS scheduler" - -config IP_VS_RR - tristate "round-robin scheduling" - ---help--- - The robin-robin scheduling algorithm simply directs network - connections to different real servers in a round-robin manner. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_WRR - tristate "weighted round-robin scheduling" - ---help--- - The weighted robin-robin scheduling algorithm directs network - connections to different real servers based on server weights - in a round-robin manner. Servers with higher weights receive - new connections first than those with less weights, and servers - with higher weights get more connections than those with less - weights and servers with equal weights get equal connections. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LC - tristate "least-connection scheduling" - ---help--- - The least-connection scheduling algorithm directs network - connections to the server with the least number of active - connections. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_WLC - tristate "weighted least-connection scheduling" - ---help--- - The weighted least-connection scheduling algorithm directs network - connections to the server with the least active connections - normalized by the server weight. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LBLC - tristate "locality-based least-connection scheduling" - ---help--- - The locality-based least-connection scheduling algorithm is for - destination IP load balancing. It is usually used in cache cluster. - This algorithm usually directs packet destined for an IP address to - its server if the server is alive and under load. If the server is - overloaded (its active connection numbers is larger than its weight) - and there is a server in its half load, then allocate the weighted - least-connection server to this IP address. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LBLCR - tristate "locality-based least-connection with replication scheduling" - ---help--- - The locality-based least-connection with replication scheduling - algorithm is also for destination IP load balancing. It is - usually used in cache cluster. It differs from the LBLC scheduling - as follows: the load balancer maintains mappings from a target - to a set of server nodes that can serve the target. Requests for - a target are assigned to the least-connection node in the target's - server set. If all the node in the server set are over loaded, - it picks up a least-connection node in the cluster and adds it - in the sever set for the target. If the server set has not been - modified for the specified time, the most loaded node is removed - from the server set, in order to avoid high degree of replication. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_DH - tristate "destination hashing scheduling" - ---help--- - The destination hashing scheduling algorithm assigns network - connections to the servers through looking up a statically assigned - hash table by their destination IP addresses. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_SH - tristate "source hashing scheduling" - ---help--- - The source hashing scheduling algorithm assigns network - connections to the servers through looking up a statically assigned - hash table by their source IP addresses. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_SED - tristate "shortest expected delay scheduling" - ---help--- - The shortest expected delay scheduling algorithm assigns network - connections to the server with the shortest expected delay. The - expected delay that the job will experience is (Ci + 1) / Ui if - sent to the ith server, in which Ci is the number of connections - on the ith server and Ui is the fixed service rate (weight) - of the ith server. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_NQ - tristate "never queue scheduling" - ---help--- - The never queue scheduling algorithm adopts a two-speed model. - When there is an idle server available, the job will be sent to - the idle server, instead of waiting for a fast one. When there - is no idle server available, the job will be sent to the server - that minimize its expected delay (The Shortest Expected Delay - scheduling algorithm). - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -comment 'IPVS application helper' - -config IP_VS_FTP - tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP - ---help--- - FTP is a protocol that transfers IP address and/or port number in - the payload. In the virtual server via Network Address Translation, - the IP address and port number of real servers cannot be sent to - clients in ftp connections directly, so FTP protocol helper is - required for tracking the connection and mangling it back to that of - virtual service. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -endif # IP_VS diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile deleted file mode 100644 index 73a46fe1fe4..00000000000 --- a/net/ipv4/ipvs/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -# -# Makefile for the IPVS modules on top of IPv4. -# - -# IPVS transport protocol load balancing support -ip_vs_proto-objs-y := -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o - -ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ - ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ - ip_vs_est.o ip_vs_proto.o \ - $(ip_vs_proto-objs-y) - - -# IPVS core -obj-$(CONFIG_IP_VS) += ip_vs.o - -# IPVS schedulers -obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o -obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o -obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o -obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o -obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o -obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o -obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o -obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o -obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o -obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o - -# IPVS application helpers -obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c deleted file mode 100644 index 201b8ea3020..00000000000 --- a/net/ipv4/ipvs/ip_vs_app.c +++ /dev/null @@ -1,622 +0,0 @@ -/* - * ip_vs_app.c: Application module support for IPVS - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference - * is that ip_vs_app module handles the reverse direction (incoming requests - * and outgoing responses). - * - * IP_MASQ_APP application masquerading module - * - * Author: Juan Jose Ciarlante, - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -EXPORT_SYMBOL(register_ip_vs_app); -EXPORT_SYMBOL(unregister_ip_vs_app); -EXPORT_SYMBOL(register_ip_vs_app_inc); - -/* ipvs application list head */ -static LIST_HEAD(ip_vs_app_list); -static DEFINE_MUTEX(__ip_vs_app_mutex); - - -/* - * Get an ip_vs_app object - */ -static inline int ip_vs_app_get(struct ip_vs_app *app) -{ - return try_module_get(app->module); -} - - -static inline void ip_vs_app_put(struct ip_vs_app *app) -{ - module_put(app->module); -} - - -/* - * Allocate/initialize app incarnation and register it in proto apps. - */ -static int -ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) -{ - struct ip_vs_protocol *pp; - struct ip_vs_app *inc; - int ret; - - if (!(pp = ip_vs_proto_get(proto))) - return -EPROTONOSUPPORT; - - if (!pp->unregister_app) - return -EOPNOTSUPP; - - inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); - if (!inc) - return -ENOMEM; - INIT_LIST_HEAD(&inc->p_list); - INIT_LIST_HEAD(&inc->incs_list); - inc->app = app; - inc->port = htons(port); - atomic_set(&inc->usecnt, 0); - - if (app->timeouts) { - inc->timeout_table = - ip_vs_create_timeout_table(app->timeouts, - app->timeouts_size); - if (!inc->timeout_table) { - ret = -ENOMEM; - goto out; - } - } - - ret = pp->register_app(inc); - if (ret) - goto out; - - list_add(&inc->a_list, &app->incs_list); - IP_VS_DBG(9, "%s application %s:%u registered\n", - pp->name, inc->name, inc->port); - - return 0; - - out: - kfree(inc->timeout_table); - kfree(inc); - return ret; -} - - -/* - * Release app incarnation - */ -static void -ip_vs_app_inc_release(struct ip_vs_app *inc) -{ - struct ip_vs_protocol *pp; - - if (!(pp = ip_vs_proto_get(inc->protocol))) - return; - - if (pp->unregister_app) - pp->unregister_app(inc); - - IP_VS_DBG(9, "%s App %s:%u unregistered\n", - pp->name, inc->name, inc->port); - - list_del(&inc->a_list); - - kfree(inc->timeout_table); - kfree(inc); -} - - -/* - * Get reference to app inc (only called from softirq) - * - */ -int ip_vs_app_inc_get(struct ip_vs_app *inc) -{ - int result; - - atomic_inc(&inc->usecnt); - if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) - atomic_dec(&inc->usecnt); - return result; -} - - -/* - * Put the app inc (only called from timer or net softirq) - */ -void ip_vs_app_inc_put(struct ip_vs_app *inc) -{ - ip_vs_app_put(inc->app); - atomic_dec(&inc->usecnt); -} - - -/* - * Register an application incarnation in protocol applications - */ -int -register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) -{ - int result; - - mutex_lock(&__ip_vs_app_mutex); - - result = ip_vs_app_inc_new(app, proto, port); - - mutex_unlock(&__ip_vs_app_mutex); - - return result; -} - - -/* - * ip_vs_app registration routine - */ -int register_ip_vs_app(struct ip_vs_app *app) -{ - /* increase the module use count */ - ip_vs_use_count_inc(); - - mutex_lock(&__ip_vs_app_mutex); - - list_add(&app->a_list, &ip_vs_app_list); - - mutex_unlock(&__ip_vs_app_mutex); - - return 0; -} - - -/* - * ip_vs_app unregistration routine - * We are sure there are no app incarnations attached to services - */ -void unregister_ip_vs_app(struct ip_vs_app *app) -{ - struct ip_vs_app *inc, *nxt; - - mutex_lock(&__ip_vs_app_mutex); - - list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { - ip_vs_app_inc_release(inc); - } - - list_del(&app->a_list); - - mutex_unlock(&__ip_vs_app_mutex); - - /* decrease the module use count */ - ip_vs_use_count_dec(); -} - - -/* - * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) - */ -int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) -{ - return pp->app_conn_bind(cp); -} - - -/* - * Unbind cp from application incarnation (called by cp destructor) - */ -void ip_vs_unbind_app(struct ip_vs_conn *cp) -{ - struct ip_vs_app *inc = cp->app; - - if (!inc) - return; - - if (inc->unbind_conn) - inc->unbind_conn(inc, cp); - if (inc->done_conn) - inc->done_conn(inc, cp); - ip_vs_app_inc_put(inc); - cp->app = NULL; -} - - -/* - * Fixes th->seq based on ip_vs_seq info. - */ -static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) -{ - __u32 seq = ntohl(th->seq); - - /* - * Adjust seq with delta-offset for all packets after - * the most recent resized pkt seq and with previous_delta offset - * for all packets before most recent resized pkt seq. - */ - if (vseq->delta || vseq->previous_delta) { - if(after(seq, vseq->init_seq)) { - th->seq = htonl(seq + vseq->delta); - IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", - vseq->delta); - } else { - th->seq = htonl(seq + vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " - "(%d) to seq\n", vseq->previous_delta); - } - } -} - - -/* - * Fixes th->ack_seq based on ip_vs_seq info. - */ -static inline void -vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) -{ - __u32 ack_seq = ntohl(th->ack_seq); - - /* - * Adjust ack_seq with delta-offset for - * the packets AFTER most recent resized pkt has caused a shift - * for packets before most recent resized pkt, use previous_delta - */ - if (vseq->delta || vseq->previous_delta) { - /* since ack_seq is the number of octet that is expected - to receive next, so compare it with init_seq+delta */ - if(after(ack_seq, vseq->init_seq+vseq->delta)) { - th->ack_seq = htonl(ack_seq - vseq->delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " - "(%d) from ack_seq\n", vseq->delta); - - } else { - th->ack_seq = htonl(ack_seq - vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " - "previous_delta (%d) from ack_seq\n", - vseq->previous_delta); - } - } -} - - -/* - * Updates ip_vs_seq if pkt has been resized - * Assumes already checked proto==IPPROTO_TCP and diff!=0. - */ -static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, - unsigned flag, __u32 seq, int diff) -{ - /* spinlock is to keep updating cp->flags atomic */ - spin_lock(&cp->lock); - if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { - vseq->previous_delta = vseq->delta; - vseq->delta += diff; - vseq->init_seq = seq; - cp->flags |= flag; - } - spin_unlock(&cp->lock); -} - -static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) -{ - int diff; - const unsigned int tcp_offset = ip_hdrlen(skb); - struct tcphdr *th; - __u32 seq; - - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) - return 0; - - th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); - - /* - * Remember seq number in case this pkt gets resized - */ - seq = ntohl(th->seq); - - /* - * Fix seq stuff if flagged as so. - */ - if (cp->flags & IP_VS_CONN_F_OUT_SEQ) - vs_fix_seq(&cp->out_seq, th); - if (cp->flags & IP_VS_CONN_F_IN_SEQ) - vs_fix_ack_seq(&cp->in_seq, th); - - /* - * Call private output hook function - */ - if (app->pkt_out == NULL) - return 1; - - if (!app->pkt_out(app, cp, skb, &diff)) - return 0; - - /* - * Update ip_vs seq stuff if len has changed. - */ - if (diff != 0) - vs_seq_update(cp, &cp->out_seq, - IP_VS_CONN_F_OUT_SEQ, seq, diff); - - return 1; -} - -/* - * Output pkt hook. Will call bound ip_vs_app specific function - * called by ipvs packet handler, assumes previously checked cp!=NULL - * returns false if it can't handle packet (oom) - */ -int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_app *app; - - /* - * check if application module is bound to - * this ip_vs_conn. - */ - if ((app = cp->app) == NULL) - return 1; - - /* TCP is complicated */ - if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_out(cp, skb, app); - - /* - * Call private output hook function - */ - if (app->pkt_out == NULL) - return 1; - - return app->pkt_out(app, cp, skb, NULL); -} - - -static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) -{ - int diff; - const unsigned int tcp_offset = ip_hdrlen(skb); - struct tcphdr *th; - __u32 seq; - - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) - return 0; - - th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); - - /* - * Remember seq number in case this pkt gets resized - */ - seq = ntohl(th->seq); - - /* - * Fix seq stuff if flagged as so. - */ - if (cp->flags & IP_VS_CONN_F_IN_SEQ) - vs_fix_seq(&cp->in_seq, th); - if (cp->flags & IP_VS_CONN_F_OUT_SEQ) - vs_fix_ack_seq(&cp->out_seq, th); - - /* - * Call private input hook function - */ - if (app->pkt_in == NULL) - return 1; - - if (!app->pkt_in(app, cp, skb, &diff)) - return 0; - - /* - * Update ip_vs seq stuff if len has changed. - */ - if (diff != 0) - vs_seq_update(cp, &cp->in_seq, - IP_VS_CONN_F_IN_SEQ, seq, diff); - - return 1; -} - -/* - * Input pkt hook. Will call bound ip_vs_app specific function - * called by ipvs packet handler, assumes previously checked cp!=NULL. - * returns false if can't handle packet (oom). - */ -int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_app *app; - - /* - * check if application module is bound to - * this ip_vs_conn. - */ - if ((app = cp->app) == NULL) - return 1; - - /* TCP is complicated */ - if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_in(cp, skb, app); - - /* - * Call private input hook function - */ - if (app->pkt_in == NULL) - return 1; - - return app->pkt_in(app, cp, skb, NULL); -} - - -#ifdef CONFIG_PROC_FS -/* - * /proc/net/ip_vs_app entry function - */ - -static struct ip_vs_app *ip_vs_app_idx(loff_t pos) -{ - struct ip_vs_app *app, *inc; - - list_for_each_entry(app, &ip_vs_app_list, a_list) { - list_for_each_entry(inc, &app->incs_list, a_list) { - if (pos-- == 0) - return inc; - } - } - return NULL; - -} - -static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) -{ - mutex_lock(&__ip_vs_app_mutex); - - return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; -} - -static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ip_vs_app *inc, *app; - struct list_head *e; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_app_idx(0); - - inc = v; - app = inc->app; - - if ((e = inc->a_list.next) != &app->incs_list) - return list_entry(e, struct ip_vs_app, a_list); - - /* go on to next application */ - for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { - app = list_entry(e, struct ip_vs_app, a_list); - list_for_each_entry(inc, &app->incs_list, a_list) { - return inc; - } - } - return NULL; -} - -static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) -{ - mutex_unlock(&__ip_vs_app_mutex); -} - -static int ip_vs_app_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "prot port usecnt name\n"); - else { - const struct ip_vs_app *inc = v; - - seq_printf(seq, "%-3s %-7u %-6d %-17s\n", - ip_vs_proto_name(inc->protocol), - ntohs(inc->port), - atomic_read(&inc->usecnt), - inc->name); - } - return 0; -} - -static const struct seq_operations ip_vs_app_seq_ops = { - .start = ip_vs_app_seq_start, - .next = ip_vs_app_seq_next, - .stop = ip_vs_app_seq_stop, - .show = ip_vs_app_seq_show, -}; - -static int ip_vs_app_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_app_seq_ops); -} - -static const struct file_operations ip_vs_app_fops = { - .owner = THIS_MODULE, - .open = ip_vs_app_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - - -/* - * Replace a segment of data with a new segment - */ -int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, - char *o_buf, int o_len, char *n_buf, int n_len) -{ - int diff; - int o_offset; - int o_left; - - EnterFunction(9); - - diff = n_len - o_len; - o_offset = o_buf - (char *)skb->data; - /* The length of left data after o_buf+o_len in the skb data */ - o_left = skb->len - (o_offset + o_len); - - if (diff <= 0) { - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - skb_trim(skb, skb->len + diff); - } else if (diff <= skb_tailroom(skb)) { - skb_put(skb, diff); - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - } else { - if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) - return -ENOMEM; - skb_put(skb, diff); - memmove(skb->data + o_offset + n_len, - skb->data + o_offset + o_len, o_left); - skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); - } - - /* must update the iph total length here */ - ip_hdr(skb)->tot_len = htons(skb->len); - - LeaveFunction(9); - return 0; -} - - -int __init ip_vs_app_init(void) -{ - /* we will replace it with proc_net_ipvs_create() soon */ - proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); - return 0; -} - - -void ip_vs_app_cleanup(void) -{ - proc_net_remove(&init_net, "ip_vs_app"); -} diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c deleted file mode 100644 index 9a24332fbed..00000000000 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ /dev/null @@ -1,1110 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, - * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms - * and others. Many code here is taken from IP MASQ code of kernel 2.2. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include -#include /* for proc_net_* */ -#include -#include -#include - -#include -#include - - -/* - * Connection hash table: for input and output packets lookups of IPVS - */ -static struct list_head *ip_vs_conn_tab; - -/* SLAB cache for IPVS connections */ -static struct kmem_cache *ip_vs_conn_cachep __read_mostly; - -/* counter for current IPVS connections */ -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); - -/* counter for no client port connections */ -static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); - -/* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd; - -/* - * Fine locking granularity for big connection hash table - */ -#define CT_LOCKARRAY_BITS 4 -#define CT_LOCKARRAY_SIZE (1<ip, (__force u32)port, proto, - ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; -} - - -/* - * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. - * returns bool success. - */ -static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) -{ - unsigned hash; - int ret; - - /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); - - ct_write_lock(hash); - - if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - list_add(&cp->c_list, &ip_vs_conn_tab[hash]); - cp->flags |= IP_VS_CONN_F_HASHED; - atomic_inc(&cp->refcnt); - ret = 1; - } else { - IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - ret = 0; - } - - ct_write_unlock(hash); - - return ret; -} - - -/* - * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. - */ -static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) -{ - unsigned hash; - int ret; - - /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); - - ct_write_lock(hash); - - if (cp->flags & IP_VS_CONN_F_HASHED) { - list_del(&cp->c_list); - cp->flags &= ~IP_VS_CONN_F_HASHED; - atomic_dec(&cp->refcnt); - ret = 1; - } else - ret = 0; - - ct_write_unlock(hash); - - return ret; -} - - -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from OUTside-to-INside. - * s_addr, s_port: pkt source address (foreign host) - * d_addr, d_port: pkt dest address (load balancer) - */ -static inline struct ip_vs_conn *__ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && - ip_vs_addr_equal(af, d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && - ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); - return cp; - } - } - - ct_read_unlock(hash); - - return NULL; -} - -struct ip_vs_conn *ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - struct ip_vs_conn *cp; - - cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); - if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) - cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, - d_port); - - IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), - cp ? "hit" : "not hit"); - - return cp; -} - -/* Get reference to connection template */ -struct ip_vs_conn *ip_vs_ct_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && - ip_vs_addr_equal(af, d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && - cp->flags & IP_VS_CONN_F_TEMPLATE && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - goto out; - } - } - cp = NULL; - - out: - ct_read_unlock(hash); - - IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), - cp ? "hit" : "not hit"); - - return cp; -} - -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from inside-to-OUTside. - * s_addr, s_port: pkt source address (inside host) - * d_addr, d_port: pkt dest address (foreign host) - */ -struct ip_vs_conn *ip_vs_conn_out_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp, *ret=NULL; - - /* - * Check for "full" addressed entries - */ - hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, d_addr, &cp->caddr) && - ip_vs_addr_equal(af, s_addr, &cp->daddr) && - d_port == cp->cport && s_port == cp->dport && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - ret = cp; - break; - } - } - - ct_read_unlock(hash); - - IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), - ret ? "hit" : "not hit"); - - return ret; -} - - -/* - * Put back the conn and restart its timer with its timeout - */ -void ip_vs_conn_put(struct ip_vs_conn *cp) -{ - /* reset it expire in its timeout */ - mod_timer(&cp->timer, jiffies+cp->timeout); - - __ip_vs_conn_put(cp); -} - - -/* - * Fill a no_client_port connection with a client port number - */ -void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) -{ - if (ip_vs_conn_unhash(cp)) { - spin_lock(&cp->lock); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) { - atomic_dec(&ip_vs_conn_no_cport_cnt); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; - } - spin_unlock(&cp->lock); - - /* hash on new dport */ - ip_vs_conn_hash(cp); - } -} - - -/* - * Bind a connection entry with the corresponding packet_xmit. - * Called by ip_vs_conn_new. - */ -static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) -{ - switch (IP_VS_FWD_METHOD(cp)) { - case IP_VS_CONN_F_MASQ: - cp->packet_xmit = ip_vs_nat_xmit; - break; - - case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit; - break; - - case IP_VS_CONN_F_DROUTE: - cp->packet_xmit = ip_vs_dr_xmit; - break; - - case IP_VS_CONN_F_LOCALNODE: - cp->packet_xmit = ip_vs_null_xmit; - break; - - case IP_VS_CONN_F_BYPASS: - cp->packet_xmit = ip_vs_bypass_xmit; - break; - } -} - -#ifdef CONFIG_IP_VS_IPV6 -static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) -{ - switch (IP_VS_FWD_METHOD(cp)) { - case IP_VS_CONN_F_MASQ: - cp->packet_xmit = ip_vs_nat_xmit_v6; - break; - - case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit_v6; - break; - - case IP_VS_CONN_F_DROUTE: - cp->packet_xmit = ip_vs_dr_xmit_v6; - break; - - case IP_VS_CONN_F_LOCALNODE: - cp->packet_xmit = ip_vs_null_xmit; - break; - - case IP_VS_CONN_F_BYPASS: - cp->packet_xmit = ip_vs_bypass_xmit_v6; - break; - } -} -#endif - - -static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) -{ - return atomic_read(&dest->activeconns) - + atomic_read(&dest->inactconns); -} - -/* - * Bind a connection entry with a virtual service destination - * Called just after a new connection entry is created. - */ -static inline void -ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) -{ - /* if dest is NULL, then return directly */ - if (!dest) - return; - - /* Increase the refcnt counter of the dest */ - atomic_inc(&dest->refcnt); - - /* Bind with the destination and its corresponding transmitter */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) - /* if the connection is not template and is created - * by sync, preserve the activity flag. - */ - cp->flags |= atomic_read(&dest->conn_flags) & - (~IP_VS_CONN_F_INACTIVE); - else - cp->flags |= atomic_read(&dest->conn_flags); - cp->dest = dest; - - IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " - "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); - - /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so increase the inactive - connection counter because it is in TCP SYNRECV - state (inactive) or other protocol inacive state */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_INACTIVE))) - atomic_inc(&dest->activeconns); - else - atomic_inc(&dest->inactconns); - } else { - /* It is a persistent connection/template, so increase - the peristent connection counter */ - atomic_inc(&dest->persistconns); - } - - if (dest->u_threshold != 0 && - ip_vs_dest_totalconns(dest) >= dest->u_threshold) - dest->flags |= IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Check if there is a destination for the connection, if so - * bind the connection to the destination. - */ -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) -{ - struct ip_vs_dest *dest; - - if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, - &cp->vaddr, cp->vport, - cp->protocol); - ip_vs_bind_dest(cp, dest); - return dest; - } else - return NULL; -} - - -/* - * Unbind a connection entry with its VS destination - * Called by the ip_vs_conn_expire function. - */ -static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) -{ - struct ip_vs_dest *dest = cp->dest; - - if (!dest) - return; - - IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " - "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); - - /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so decrease the inactconns - or activeconns counter */ - if (cp->flags & IP_VS_CONN_F_INACTIVE) { - atomic_dec(&dest->inactconns); - } else { - atomic_dec(&dest->activeconns); - } - } else { - /* It is a persistent connection/template, so decrease - the peristent connection counter */ - atomic_dec(&dest->persistconns); - } - - if (dest->l_threshold != 0) { - if (ip_vs_dest_totalconns(dest) < dest->l_threshold) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } else if (dest->u_threshold != 0) { - if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } else { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } - - /* - * Simply decrease the refcnt of the dest, because the - * dest will be either in service's destination list - * or in the trash. - */ - atomic_dec(&dest->refcnt); -} - - -/* - * Checking if the destination of a connection template is available. - * If available, return 1, otherwise invalidate this connection - * template and return 0. - */ -int ip_vs_check_template(struct ip_vs_conn *ct) -{ - struct ip_vs_dest *dest = ct->dest; - - /* - * Checking the dest server status. - */ - if ((dest == NULL) || - !(dest->flags & IP_VS_DEST_F_AVAILABLE) || - (sysctl_ip_vs_expire_quiescent_template && - (atomic_read(&dest->weight) == 0))) { - IP_VS_DBG_BUF(9, "check_template: dest not available for " - "protocol %s s:%s:%d v:%s:%d " - "-> d:%s:%d\n", - ip_vs_proto_name(ct->protocol), - IP_VS_DBG_ADDR(ct->af, &ct->caddr), - ntohs(ct->cport), - IP_VS_DBG_ADDR(ct->af, &ct->vaddr), - ntohs(ct->vport), - IP_VS_DBG_ADDR(ct->af, &ct->daddr), - ntohs(ct->dport)); - - /* - * Invalidate the connection template - */ - if (ct->vport != htons(0xffff)) { - if (ip_vs_conn_unhash(ct)) { - ct->dport = htons(0xffff); - ct->vport = htons(0xffff); - ct->cport = 0; - ip_vs_conn_hash(ct); - } - } - - /* - * Simply decrease the refcnt of the template, - * don't restart its timer. - */ - atomic_dec(&ct->refcnt); - return 0; - } - return 1; -} - -static void ip_vs_conn_expire(unsigned long data) -{ - struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - - cp->timeout = 60*HZ; - - /* - * hey, I'm using it - */ - atomic_inc(&cp->refcnt); - - /* - * do I control anybody? - */ - if (atomic_read(&cp->n_control)) - goto expire_later; - - /* - * unhash it if it is hashed in the conn table - */ - if (!ip_vs_conn_unhash(cp)) - goto expire_later; - - /* - * refcnt==1 implies I'm the only one referrer - */ - if (likely(atomic_read(&cp->refcnt) == 1)) { - /* delete the timer if it is activated by other users */ - if (timer_pending(&cp->timer)) - del_timer(&cp->timer); - - /* does anybody control me? */ - if (cp->control) - ip_vs_control_del(cp); - - if (unlikely(cp->app != NULL)) - ip_vs_unbind_app(cp); - ip_vs_unbind_dest(cp); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) - atomic_dec(&ip_vs_conn_no_cport_cnt); - atomic_dec(&ip_vs_conn_count); - - kmem_cache_free(ip_vs_conn_cachep, cp); - return; - } - - /* hash it back to the table */ - ip_vs_conn_hash(cp); - - expire_later: - IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", - atomic_read(&cp->refcnt)-1, - atomic_read(&cp->n_control)); - - ip_vs_conn_put(cp); -} - - -void ip_vs_conn_expire_now(struct ip_vs_conn *cp) -{ - if (del_timer(&cp->timer)) - mod_timer(&cp->timer, jiffies); -} - - -/* - * Create a new connection entry and hash it into the ip_vs_conn_tab - */ -struct ip_vs_conn * -ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, - const union nf_inet_addr *vaddr, __be16 vport, - const union nf_inet_addr *daddr, __be16 dport, unsigned flags, - struct ip_vs_dest *dest) -{ - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); - - cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); - if (cp == NULL) { - IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); - return NULL; - } - - INIT_LIST_HEAD(&cp->c_list); - setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); - cp->af = af; - cp->protocol = proto; - ip_vs_addr_copy(af, &cp->caddr, caddr); - cp->cport = cport; - ip_vs_addr_copy(af, &cp->vaddr, vaddr); - cp->vport = vport; - ip_vs_addr_copy(af, &cp->daddr, daddr); - cp->dport = dport; - cp->flags = flags; - spin_lock_init(&cp->lock); - - /* - * Set the entry is referenced by the current thread before hashing - * it in the table, so that other thread run ip_vs_random_dropentry - * but cannot drop this entry. - */ - atomic_set(&cp->refcnt, 1); - - atomic_set(&cp->n_control, 0); - atomic_set(&cp->in_pkts, 0); - - atomic_inc(&ip_vs_conn_count); - if (flags & IP_VS_CONN_F_NO_CPORT) - atomic_inc(&ip_vs_conn_no_cport_cnt); - - /* Bind the connection with a destination server */ - ip_vs_bind_dest(cp, dest); - - /* Set its state and timeout */ - cp->state = 0; - cp->timeout = 3*HZ; - - /* Bind its packet transmitter */ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - ip_vs_bind_xmit_v6(cp); - else -#endif - ip_vs_bind_xmit(cp); - - if (unlikely(pp && atomic_read(&pp->appcnt))) - ip_vs_bind_app(cp, pp); - - /* Hash it in the ip_vs_conn_tab finally */ - ip_vs_conn_hash(cp); - - return cp; -} - - -/* - * /proc/net/ip_vs_conn entries - */ -#ifdef CONFIG_PROC_FS - -static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) -{ - int idx; - struct ip_vs_conn *cp; - - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - if (pos-- == 0) { - seq->private = &ip_vs_conn_tab[idx]; - return cp; - } - } - ct_read_unlock_bh(idx); - } - - return NULL; -} - -static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) -{ - seq->private = NULL; - return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; -} - -static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ip_vs_conn *cp = v; - struct list_head *e, *l = seq->private; - int idx; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(seq, 0); - - /* more on same hash chain? */ - if ((e = cp->c_list.next) != l) - return list_entry(e, struct ip_vs_conn, c_list); - - idx = l - ip_vs_conn_tab; - ct_read_unlock_bh(idx); - - while (++idx < IP_VS_CONN_TAB_SIZE) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - seq->private = &ip_vs_conn_tab[idx]; - return cp; - } - ct_read_unlock_bh(idx); - } - seq->private = NULL; - return NULL; -} - -static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) -{ - struct list_head *l = seq->private; - - if (l) - ct_read_unlock_bh(l - ip_vs_conn_tab); -} - -static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) -{ - - if (v == SEQ_START_TOKEN) - seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); - else { - const struct ip_vs_conn *cp = v; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - seq_printf(seq, - "%-3s " NIP6_FMT " %04X " NIP6_FMT - " %04X " NIP6_FMT " %04X %-11s %7lu\n", - ip_vs_proto_name(cp->protocol), - NIP6(cp->caddr.in6), ntohs(cp->cport), - NIP6(cp->vaddr.in6), ntohs(cp->vport), - NIP6(cp->daddr.in6), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); - else -#endif - seq_printf(seq, - "%-3s %08X %04X %08X %04X" - " %08X %04X %-11s %7lu\n", - ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr.ip), ntohs(cp->cport), - ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); - } - return 0; -} - -static const struct seq_operations ip_vs_conn_seq_ops = { - .start = ip_vs_conn_seq_start, - .next = ip_vs_conn_seq_next, - .stop = ip_vs_conn_seq_stop, - .show = ip_vs_conn_seq_show, -}; - -static int ip_vs_conn_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_conn_seq_ops); -} - -static const struct file_operations ip_vs_conn_fops = { - .owner = THIS_MODULE, - .open = ip_vs_conn_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const char *ip_vs_origin_name(unsigned flags) -{ - if (flags & IP_VS_CONN_F_SYNC) - return "SYNC"; - else - return "LOCAL"; -} - -static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) -{ - - if (v == SEQ_START_TOKEN) - seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); - else { - const struct ip_vs_conn *cp = v; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - seq_printf(seq, - "%-3s " NIP6_FMT " %04X " NIP6_FMT - " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n", - ip_vs_proto_name(cp->protocol), - NIP6(cp->caddr.in6), ntohs(cp->cport), - NIP6(cp->vaddr.in6), ntohs(cp->vport), - NIP6(cp->daddr.in6), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - ip_vs_origin_name(cp->flags), - (cp->timer.expires-jiffies)/HZ); - else -#endif - seq_printf(seq, - "%-3s %08X %04X %08X %04X " - "%08X %04X %-11s %-6s %7lu\n", - ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr.ip), ntohs(cp->cport), - ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - ip_vs_origin_name(cp->flags), - (cp->timer.expires-jiffies)/HZ); - } - return 0; -} - -static const struct seq_operations ip_vs_conn_sync_seq_ops = { - .start = ip_vs_conn_seq_start, - .next = ip_vs_conn_seq_next, - .stop = ip_vs_conn_seq_stop, - .show = ip_vs_conn_sync_seq_show, -}; - -static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_conn_sync_seq_ops); -} - -static const struct file_operations ip_vs_conn_sync_fops = { - .owner = THIS_MODULE, - .open = ip_vs_conn_sync_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#endif - - -/* - * Randomly drop connection entries before running out of memory - */ -static inline int todrop_entry(struct ip_vs_conn *cp) -{ - /* - * The drop rate array needs tuning for real environments. - * Called from timer bh only => no locking - */ - static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - static char todrop_counter[9] = {0}; - int i; - - /* if the conn entry hasn't lasted for 60 seconds, don't drop it. - This will leave enough time for normal connection to get - through. */ - if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) - return 0; - - /* Don't drop the entry if its number of incoming packets is not - located in [0, 8] */ - i = atomic_read(&cp->in_pkts); - if (i > 8 || i < 0) return 0; - - if (!todrop_rate[i]) return 0; - if (--todrop_counter[i] > 0) return 0; - - todrop_counter[i] = todrop_rate[i]; - return 1; -} - -/* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(void) -{ - int idx; - struct ip_vs_conn *cp; - - /* - * Randomly scan 1/32 of the whole table every second - */ - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; - - /* - * Lock is actually needed in this loop. - */ - ct_write_lock_bh(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; - - if (cp->protocol == IPPROTO_TCP) { - switch(cp->state) { - case IP_VS_TCP_S_SYN_RECV: - case IP_VS_TCP_S_SYNACK: - break; - - case IP_VS_TCP_S_ESTABLISHED: - if (todrop_entry(cp)) - break; - continue; - - default: - continue; - } - } else { - if (!todrop_entry(cp)) - continue; - } - - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); - if (cp->control) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); - } - } - ct_write_unlock_bh(hash); - } -} - - -/* - * Flush all the connection entries in the ip_vs_conn_tab - */ -static void ip_vs_conn_flush(void) -{ - int idx; - struct ip_vs_conn *cp; - - flush_again: - for (idx=0; idxcontrol) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); - } - } - ct_write_unlock_bh(idx); - } - - /* the counter may be not NULL, because maybe some conn entries - are run by slow timer handler or unhashed but still referred */ - if (atomic_read(&ip_vs_conn_count) != 0) { - schedule(); - goto flush_again; - } -} - - -int __init ip_vs_conn_init(void) -{ - int idx; - - /* - * Allocate the connection hash table and initialize its list heads - */ - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); - if (!ip_vs_conn_tab) - return -ENOMEM; - - /* Allocate ip_vs_conn slab cache */ - ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", - sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!ip_vs_conn_cachep) { - vfree(ip_vs_conn_tab); - return -ENOMEM; - } - - IP_VS_INFO("Connection hash table configured " - "(size=%d, memory=%ldKbytes)\n", - IP_VS_CONN_TAB_SIZE, - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); - IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", - sizeof(struct ip_vs_conn)); - - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); - } - - for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); - } - - proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); - proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); - - /* calculate the random value for connection hash */ - get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - - return 0; -} - - -void ip_vs_conn_cleanup(void) -{ - /* flush all the connection entries first */ - ip_vs_conn_flush(); - - /* Release the empty cache */ - kmem_cache_destroy(ip_vs_conn_cachep); - proc_net_remove(&init_net, "ip_vs_conn"); - proc_net_remove(&init_net, "ip_vs_conn_sync"); - vfree(ip_vs_conn_tab); -} diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c deleted file mode 100644 index 958abf3e5f8..00000000000 --- a/net/ipv4/ipvs/ip_vs_core.c +++ /dev/null @@ -1,1542 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, - * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms - * and others. - * - * Changes: - * Paul `Rusty' Russell properly handle non-linear skbs - * Harald Welte don't use nfcache - * - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include /* for icmp_send */ -#include - -#include -#include - -#ifdef CONFIG_IP_VS_IPV6 -#include -#include -#endif - -#include - - -EXPORT_SYMBOL(register_ip_vs_scheduler); -EXPORT_SYMBOL(unregister_ip_vs_scheduler); -EXPORT_SYMBOL(ip_vs_skb_replace); -EXPORT_SYMBOL(ip_vs_proto_name); -EXPORT_SYMBOL(ip_vs_conn_new); -EXPORT_SYMBOL(ip_vs_conn_in_get); -EXPORT_SYMBOL(ip_vs_conn_out_get); -#ifdef CONFIG_IP_VS_PROTO_TCP -EXPORT_SYMBOL(ip_vs_tcp_conn_listen); -#endif -EXPORT_SYMBOL(ip_vs_conn_put); -#ifdef CONFIG_IP_VS_DEBUG -EXPORT_SYMBOL(ip_vs_get_debug_level); -#endif - - -/* ID used in ICMP lookups */ -#define icmp_id(icmph) (((icmph)->un).echo.id) -#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) - -const char *ip_vs_proto_name(unsigned proto) -{ - static char buf[20]; - - switch (proto) { - case IPPROTO_IP: - return "IP"; - case IPPROTO_UDP: - return "UDP"; - case IPPROTO_TCP: - return "TCP"; - case IPPROTO_ICMP: - return "ICMP"; -#ifdef CONFIG_IP_VS_IPV6 - case IPPROTO_ICMPV6: - return "ICMPv6"; -#endif - default: - sprintf(buf, "IP_%d", proto); - return buf; - } -} - -void ip_vs_init_hash_table(struct list_head *table, int rows) -{ - while (--rows >= 0) - INIT_LIST_HEAD(&table[rows]); -} - -static inline void -ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.ustats.inpkts++; - dest->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.inpkts++; - dest->svc->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.inpkts++; - ip_vs_stats.ustats.inbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); - } -} - - -static inline void -ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.ustats.outpkts++; - dest->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.outpkts++; - dest->svc->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.outpkts++; - ip_vs_stats.ustats.outbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); - } -} - - -static inline void -ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) -{ - spin_lock(&cp->dest->stats.lock); - cp->dest->stats.ustats.conns++; - spin_unlock(&cp->dest->stats.lock); - - spin_lock(&svc->stats.lock); - svc->stats.ustats.conns++; - spin_unlock(&svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.conns++; - spin_unlock(&ip_vs_stats.lock); -} - - -static inline int -ip_vs_set_state(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - if (unlikely(!pp->state_transition)) - return 0; - return pp->state_transition(cp, direction, skb, pp); -} - - -/* - * IPVS persistent scheduling function - * It creates a connection entry according to its template if exists, - * or selects a server and creates a connection entry plus a template. - * Locking: we are svc user (svc->refcnt), so we hold all dests too - * Protocols supported: TCP, UDP - */ -static struct ip_vs_conn * -ip_vs_sched_persist(struct ip_vs_service *svc, - const struct sk_buff *skb, - __be16 ports[2]) -{ - struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; - struct ip_vs_dest *dest; - struct ip_vs_conn *ct; - __be16 dport; /* destination port to forward */ - union nf_inet_addr snet; /* source network of the client, - after masking */ - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - - /* Mask saddr with the netmask to adjust template granularity */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); - else -#endif - snet.ip = iph.saddr.ip & svc->netmask; - - IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " - "mnet %s\n", - IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), - IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), - IP_VS_DBG_ADDR(svc->af, &snet)); - - /* - * As far as we know, FTP is a very complicated network protocol, and - * it uses control connection and data connections. For active FTP, - * FTP server initialize data connection to the client, its source port - * is often 20. For passive FTP, FTP server tells the clients the port - * that it passively listens to, and the client issues the data - * connection. In the tunneling or direct routing mode, the load - * balancer is on the client-to-server half of connection, the port - * number is unknown to the load balancer. So, a conn template like - * is created for persistent FTP - * service, and a template like - * is created for other persistent services. - */ - if (ports[1] == svc->port) { - /* Check if a template already exists */ - if (svc->port != FTPPORT) - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, ports[1]); - else - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * No template found or the dest of the connection - * template is not available. - */ - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template like for non-ftp service, - * and - * for ftp service. - */ - if (svc->port != FTPPORT) - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, - ports[1], - &dest->addr, dest->port, - IP_VS_CONN_F_TEMPLATE, - dest); - else - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; - } - dport = dest->port; - } else { - /* - * Note: persistent fwmark-based services and persistent - * port zero service are handled here. - * fwmark template: - * port zero template: - */ - if (svc->fwmark) { - union nf_inet_addr fwmark = { - .all = { 0, 0, 0, htonl(svc->fwmark) } - }; - - ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0, - &fwmark, 0); - } else - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * If it is not persistent port zero, return NULL, - * otherwise create a connection template. - */ - if (svc->port) - return NULL; - - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template according to the service - */ - if (svc->fwmark) { - union nf_inet_addr fwmark = { - .all = { 0, 0, 0, htonl(svc->fwmark) } - }; - - ct = ip_vs_conn_new(svc->af, IPPROTO_IP, - &snet, 0, - &fwmark, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - } else - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; - } - dport = ports[1]; - } - - /* - * Create a new connection according to the template - */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, ports[0], - &iph.daddr, ports[1], - &dest->addr, dport, - 0, - dest); - if (cp == NULL) { - ip_vs_conn_put(ct); - return NULL; - } - - /* - * Add its control - */ - ip_vs_control_add(cp, ct); - ip_vs_conn_put(ct); - - ip_vs_conn_stats(cp, svc); - return cp; -} - - -/* - * IPVS main scheduling function - * It selects a server according to the virtual service, and - * creates a connection entry. - * Protocols supported: TCP, UDP - */ -struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; - struct ip_vs_dest *dest; - __be16 _ports[2], *pptr; - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - /* - * Persistent service - */ - if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, pptr); - - /* - * Non-persistent service - */ - if (!svc->fwmark && pptr[1] != svc->port) { - if (!svc->port) - IP_VS_ERR("Schedule: port zero only supported " - "in persistent services, " - "check your ipvs configuration\n"); - return NULL; - } - - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "Schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a connection entry. - */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &dest->addr, dest->port ? dest->port : pptr[1], - 0, - dest); - if (cp == NULL) - return NULL; - - IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " - "d:%s:%u conn->flags:%X conn->refcnt:%d\n", - ip_vs_fwd_tag(cp), - IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), - cp->flags, atomic_read(&cp->refcnt)); - - ip_vs_conn_stats(cp, svc); - return cp; -} - - -/* - * Pass or drop the packet. - * Called by ip_vs_in, when the virtual service is available but - * no destination is available for a new connection. - */ -int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - __be16 _ports[2], *pptr; - struct ip_vs_iphdr iph; - int unicast; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); - if (pptr == NULL) { - ip_vs_service_put(svc); - return NF_DROP; - } - -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; - else -#endif - unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); - - /* if it is fwmark-based service, the cache_bypass sysctl is up - and the destination is a non-local unicast, then create - a cache_bypass connection entry */ - if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { - int ret, cs; - struct ip_vs_conn *cp; - union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; - - ip_vs_service_put(svc); - - /* create a new connection entry */ - IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &daddr, 0, - IP_VS_CONN_F_BYPASS, - NULL); - if (cp == NULL) - return NF_DROP; - - /* statistics */ - ip_vs_in_stats(cp, skb); - - /* set state */ - cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); - - /* transmit the first SYN packet */ - ret = cp->packet_xmit(skb, cp, pp); - /* do not touch skb anymore */ - - atomic_inc(&cp->in_pkts); - ip_vs_conn_put(cp); - return ret; - } - - /* - * When the virtual ftp service is presented, packets destined - * for other services on the VIP may get here (except services - * listed in the ipvs table), pass the packets, because it is - * not ipvs job to decide to drop the packets. - */ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { - ip_vs_service_put(svc); - return NF_ACCEPT; - } - - ip_vs_service_put(svc); - - /* - * Notify the client that the destination is unreachable, and - * release the socket buffer. - * Since it is in IP layer, the TCP socket is not actually - * created, the TCP RST packet cannot be sent, instead that - * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ - */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, - skb->dev); - else -#endif - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - - return NF_DROP; -} - - -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain, and is used for VS/NAT. - * It detects packets for VS/NAT connections and sends the packets - * immediately. This can avoid that iptable_nat mangles the packets - * for VS/NAT. - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; -} - -__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) -{ - return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); -} - -static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) -{ - int err = ip_defrag(skb, user); - - if (!err) - ip_send_check(ip_hdr(skb)); - - return err; -} - -#ifdef CONFIG_IP_VS_IPV6 -static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) -{ - /* TODO IPv6: Find out what to do here for IPv6 */ - return 0; -} -#endif - -/* - * Packet has been made sufficiently writable in caller - * - inout: 1=in->out, 0=out->in - */ -void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int inout) -{ - struct iphdr *iph = ip_hdr(skb); - unsigned int icmp_offset = iph->ihl*4; - struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + - icmp_offset); - struct iphdr *ciph = (struct iphdr *)(icmph + 1); - - if (inout) { - iph->saddr = cp->vaddr.ip; - ip_send_check(iph); - ciph->daddr = cp->vaddr.ip; - ip_send_check(ciph); - } else { - iph->daddr = cp->daddr.ip; - ip_send_check(iph); - ciph->saddr = cp->daddr.ip; - ip_send_check(ciph); - } - - /* the TCP/UDP port */ - if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { - __be16 *ports = (void *)ciph + ciph->ihl*4; - - if (inout) - ports[1] = cp->vport; - else - ports[0] = cp->dport; - } - - /* And finally the ICMP checksum */ - icmph->checksum = 0; - icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMP"); - else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMP"); -} - -#ifdef CONFIG_IP_VS_IPV6 -void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int inout) -{ - struct ipv6hdr *iph = ipv6_hdr(skb); - unsigned int icmp_offset = sizeof(struct ipv6hdr); - struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + - icmp_offset); - struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); - - if (inout) { - iph->saddr = cp->vaddr.in6; - ciph->daddr = cp->vaddr.in6; - } else { - iph->daddr = cp->daddr.in6; - ciph->saddr = cp->daddr.in6; - } - - /* the TCP/UDP port */ - if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) { - __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); - - if (inout) - ports[1] = cp->vport; - else - ports[0] = cp->dport; - } - - /* And finally the ICMP checksum */ - icmph->icmp6_cksum = 0; - /* TODO IPv6: is this correct for ICMPv6? */ - ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMPv6"); - else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMPv6"); -} -#endif - -/* Handle relevant response ICMP messages - forward to the right - * destination host. Used for NAT and local client. - */ -static int handle_response_icmp(int af, struct sk_buff *skb, - union nf_inet_addr *snet, - __u8 protocol, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, - unsigned int offset, unsigned int ihl) -{ - unsigned int verdict = NF_DROP; - - if (IP_VS_FWD_METHOD(cp) != 0) { - IP_VS_ERR("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", - IP_VS_DBG_ADDR(af, snet)); - goto out; - } - - if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol) - offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) - goto out; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - ip_vs_nat_icmp_v6(skb, pp, cp, 1); - else -#endif - ip_vs_nat_icmp(skb, pp, cp, 1); - - /* do the statistics and put it back */ - ip_vs_out_stats(cp, skb); - - skb->ipvs_property = 1; - verdict = NF_ACCEPT; - -out: - __ip_vs_conn_put(cp); - - return verdict; -} - -/* - * Handle ICMP messages in the inside-to-outside direction (outgoing). - * Find any that might be relevant, check against existing connections. - * Currently handles error types - unreachable, quench, ttl exceeded. - */ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related) -{ - struct iphdr *iph; - struct icmphdr _icmph, *ic; - struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, ihl; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - } - - iph = ip_hdr(skb); - offset = ihl = iph->ihl * 4; - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - ic->type, ntohs(icmp_id(ic)), - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->type != ICMP_DEST_UNREACH) && - (ic->type != ICMP_SOURCE_QUENCH) && - (ic->type != ICMP_TIME_EXCEEDED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->protocol); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & htons(IP_OFFSET) && - pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); - - offset += cih->ihl * 4; - - ip_vs_fill_iphdr(AF_INET, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); - if (!cp) - return NF_ACCEPT; - - snet.ip = iph->saddr; - return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, - pp, offset, ihl); -} - -#ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) -{ - struct ipv6hdr *iph; - struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - NIP6(iph->saddr), NIP6(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->nexthdr); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); - - offset += sizeof(struct ipv6hdr); - - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (!cp) - return NF_ACCEPT; - - ipv6_addr_copy(&snet.in6, &iph->saddr); - return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, - pp, offset, sizeof(struct ipv6hdr)); -} -#endif - -static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) -{ - struct tcphdr _tcph, *th; - - th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - return th->rst; -} - -/* Handle response packets: rewrite addresses and send away... - * Used for NAT and local client. - */ -static unsigned int -handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int ihl) -{ - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); - - if (!skb_make_writable(skb, ihl)) - goto drop; - - /* mangle the packet */ - if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) - goto drop; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - ipv6_hdr(skb)->saddr = cp->vaddr.in6; - else -#endif - { - ip_hdr(skb)->saddr = cp->vaddr.ip; - ip_send_check(ip_hdr(skb)); - } - - /* For policy routing, packets originating from this - * machine itself may be routed differently to packets - * passing through. We want this packet to be routed as - * if it came from this machine itself. So re-compute - * the routing information. - */ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ip6_route_me_harder(skb) != 0) - goto drop; - } else -#endif - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); - - ip_vs_out_stats(cp, skb); - ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); - ip_vs_conn_put(cp); - - skb->ipvs_property = 1; - - LeaveFunction(11); - return NF_ACCEPT; - -drop: - ip_vs_conn_put(cp); - kfree_skb(skb); - return NF_STOLEN; -} - -/* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. - * Check if outgoing packet belongs to the established ip_vs_conn. - */ -static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_vs_iphdr iph; - struct ip_vs_protocol *pp; - struct ip_vs_conn *cp; - int af; - - EnterFunction(11); - - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - - if (skb->ipvs_property) - return NF_ACCEPT; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); - - if (related) - return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - } else -#endif - if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_out_icmp(skb, &related); - - if (related) - return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) - return NF_ACCEPT; - - /* reassemble IP fragments */ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); - - if (related) - return verdict; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - } else -#endif - if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && - !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - - /* - * Check if the packet belongs to an existing entry - */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - - if (unlikely(!cp)) { - if (sysctl_ip_vs_nat_icmp_send && - (pp->protocol == IPPROTO_TCP || - pp->protocol == IPPROTO_UDP)) { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, iph.len, - sizeof(_ports), _ports); - if (pptr == NULL) - return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(af, iph.protocol, - &iph.saddr, - pptr[0])) { - /* - * Notify the real server: there is no - * existing entry if it is not RST - * packet or not TCP packet. - */ - if (iph.protocol != IPPROTO_TCP - || !is_tcp_reset(skb, iph.len)) { -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - icmpv6_send(skb, - ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, - 0, skb->dev); - else -#endif - icmp_send(skb, - ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); - return NF_DROP; - } - } - } - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; - } - - return handle_response(af, skb, pp, cp, iph.len); -} - - -/* - * Handle ICMP messages in the outside-to-inside direction (incoming). - * Find any that might be relevant, check against existing connections, - * forward to the right destination host if relevant. - * Currently handles error types - unreachable, quench, ttl exceeded. - */ -static int -ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) -{ - struct iphdr *iph; - struct icmphdr _icmph, *ic; - struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, ihl, verdict; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) - return NF_STOLEN; - } - - iph = ip_hdr(skb); - offset = ihl = iph->ihl * 4; - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - ic->type, ntohs(icmp_id(ic)), - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->type != ICMP_DEST_UNREACH) && - (ic->type != ICMP_SOURCE_QUENCH) && - (ic->type != ICMP_TIME_EXCEEDED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->protocol); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & htons(IP_OFFSET) && - pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); - - offset += cih->ihl * 4; - - ip_vs_fill_iphdr(AF_INET, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); - if (!cp) { - /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); - if (cp) { - snet.ip = iph->saddr; - return handle_response_icmp(AF_INET, skb, &snet, - cih->protocol, cp, pp, - offset, ihl); - } - return NF_ACCEPT; - } - - verdict = NF_DROP; - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); - goto out; - } - - /* do the statistics and put it back */ - ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) - offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); - /* do not touch skb anymore */ - - out: - __ip_vs_conn_put(cp); - - return verdict; -} - -#ifdef CONFIG_IP_VS_IPV6 -static int -ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) -{ - struct ipv6hdr *iph; - struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, verdict; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : - IP_DEFRAG_VS_FWD)) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - NIP6(iph->saddr), NIP6(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->nexthdr); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); - - offset += sizeof(struct ipv6hdr); - - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (!cp) { - /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (cp) { - ipv6_addr_copy(&snet.in6, &iph->saddr); - return handle_response_icmp(AF_INET6, skb, &snet, - cih->nexthdr, - cp, pp, offset, - sizeof(struct ipv6hdr)); - } - return NF_ACCEPT; - } - - verdict = NF_DROP; - - /* do the statistics and put it back */ - ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) - offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); - /* do not touch skb anymore */ - - __ip_vs_conn_put(cp); - - return verdict; -} -#endif - - -/* - * Check if it's for virtual services, look it up, - * and send it on its way... - */ -static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_vs_iphdr iph; - struct ip_vs_protocol *pp; - struct ip_vs_conn *cp; - int ret, restart, af; - - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - /* - * Big tappo: only PACKET_HOST, including loopback for local client - * Don't handle local packets on IPv6 for now - */ - if (unlikely(skb->pkt_type != PACKET_HOST)) { - IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", - skb->pkt_type, - iph.protocol, - IP_VS_DBG_ADDR(af, &iph.daddr)); - return NF_ACCEPT; - } - - if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); - - if (related) - return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - - /* Protocol supported? */ - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) - return NF_ACCEPT; - - /* - * Check if the packet belongs to an existing connection entry - */ - cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); - - if (unlikely(!cp)) { - int v; - - /* For local client packets, it could be a response */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - if (cp) - return handle_response(af, skb, pp, cp, iph.len); - - if (!pp->conn_schedule(af, skb, pp, &v, &cp)) - return v; - } - - if (unlikely(!cp)) { - /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; - } - - IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); - - /* Check the server status */ - if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { - /* the destination server is not available */ - - if (sysctl_ip_vs_expire_nodest_conn) { - /* try to expire the connection immediately */ - ip_vs_conn_expire_now(cp); - } - /* don't restart its timer, and silently - drop the packet. */ - __ip_vs_conn_put(cp); - return NF_DROP; - } - - ip_vs_in_stats(cp, skb); - restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); - if (cp->packet_xmit) - ret = cp->packet_xmit(skb, cp, pp); - /* do not touch skb anymore */ - else { - IP_VS_DBG_RL("warning: packet_xmit is null"); - ret = NF_ACCEPT; - } - - /* Increase its packet counter and check if it is needed - * to be synchronized - * - * Sync connection if it is about to close to - * encorage the standby servers to update the connections timeout - */ - atomic_inc(&cp->in_pkts); - if (af == AF_INET && - (ip_vs_sync_state & IP_VS_STATE_MASTER) && - (((cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) || - ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && - ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || - (cp->state == IP_VS_TCP_S_TIME_WAIT))))) - ip_vs_sync_conn(cp); - cp->old_state = cp->state; - - ip_vs_conn_put(cp); - return ret; -} - - -/* - * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP - * related packets destined for 0.0.0.0/0. - * When fwmark-based virtual service is used, such as transparent - * cache cluster, TCP packets can be marked and routed to ip_vs_in, - * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and - * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain - * and send them to ip_vs_in_icmp. - */ -static unsigned int -ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - int r; - - if (ip_hdr(skb)->protocol != IPPROTO_ICMP) - return NF_ACCEPT; - - return ip_vs_in_icmp(skb, &r, hooknum); -} - -#ifdef CONFIG_IP_VS_IPV6 -static unsigned int -ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - int r; - - if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) - return NF_ACCEPT; - - return ip_vs_in_icmp_v6(skb, &r, hooknum); -} -#endif - - -static struct nf_hook_ops ip_vs_ops[] __read_mostly = { - /* After packet filtering, forward packet through VS/DR, VS/TUN, - * or VS/NAT(change destination), so that filtering rules can be - * applied to IPVS. */ - { - .hook = ip_vs_in, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, - }, - /* After packet filtering, change source only for VS/NAT */ - { - .hook = ip_vs_out, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 100, - }, - /* After packet filtering (but before ip_vs_out_icmp), catch icmp - * destined for 0.0.0.0/0, which is for incoming IPVS connections */ - { - .hook = ip_vs_forward_icmp, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 99, - }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, - }, -#ifdef CONFIG_IP_VS_IPV6 - /* After packet filtering, forward packet through VS/DR, VS/TUN, - * or VS/NAT(change destination), so that filtering rules can be - * applied to IPVS. */ - { - .hook = ip_vs_in, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, - }, - /* After packet filtering, change source only for VS/NAT */ - { - .hook = ip_vs_out, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 100, - }, - /* After packet filtering (but before ip_vs_out_icmp), catch icmp - * destined for 0.0.0.0/0, which is for incoming IPVS connections */ - { - .hook = ip_vs_forward_icmp_v6, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 99, - }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_NAT_SRC-1, - }, -#endif -}; - - -/* - * Initialize IP Virtual Server - */ -static int __init ip_vs_init(void) -{ - int ret; - - ip_vs_estimator_init(); - - ret = ip_vs_control_init(); - if (ret < 0) { - IP_VS_ERR("can't setup control.\n"); - goto cleanup_estimator; - } - - ip_vs_protocol_init(); - - ret = ip_vs_app_init(); - if (ret < 0) { - IP_VS_ERR("can't setup application helper.\n"); - goto cleanup_protocol; - } - - ret = ip_vs_conn_init(); - if (ret < 0) { - IP_VS_ERR("can't setup connection table.\n"); - goto cleanup_app; - } - - ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - if (ret < 0) { - IP_VS_ERR("can't register hooks.\n"); - goto cleanup_conn; - } - - IP_VS_INFO("ipvs loaded.\n"); - return ret; - - cleanup_conn: - ip_vs_conn_cleanup(); - cleanup_app: - ip_vs_app_cleanup(); - cleanup_protocol: - ip_vs_protocol_cleanup(); - ip_vs_control_cleanup(); - cleanup_estimator: - ip_vs_estimator_cleanup(); - return ret; -} - -static void __exit ip_vs_cleanup(void) -{ - nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - ip_vs_conn_cleanup(); - ip_vs_app_cleanup(); - ip_vs_protocol_cleanup(); - ip_vs_control_cleanup(); - ip_vs_estimator_cleanup(); - IP_VS_INFO("ipvs unloaded.\n"); -} - -module_init(ip_vs_init); -module_exit(ip_vs_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c deleted file mode 100644 index 0302cf3e503..00000000000 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ /dev/null @@ -1,3443 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the NetFilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#ifdef CONFIG_IP_VS_IPV6 -#include -#include -#endif -#include -#include -#include - -#include - -#include - -/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ -static DEFINE_MUTEX(__ip_vs_mutex); - -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - -/* lock for table with the real services */ -static DEFINE_RWLOCK(__ip_vs_rs_lock); - -/* lock for state and timeout tables */ -static DEFINE_RWLOCK(__ip_vs_securetcp_lock); - -/* lock for drop entry handling */ -static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); - -/* lock for drop packet handling */ -static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); - -/* 1/rate drop and drop-entry variables */ -int ip_vs_drop_rate = 0; -int ip_vs_drop_counter = 0; -static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); - -/* number of virtual services */ -static int ip_vs_num_services = 0; - -/* sysctl variables */ -static int sysctl_ip_vs_drop_entry = 0; -static int sysctl_ip_vs_drop_packet = 0; -static int sysctl_ip_vs_secure_tcp = 0; -static int sysctl_ip_vs_amemthresh = 1024; -static int sysctl_ip_vs_am_droprate = 10; -int sysctl_ip_vs_cache_bypass = 0; -int sysctl_ip_vs_expire_nodest_conn = 0; -int sysctl_ip_vs_expire_quiescent_template = 0; -int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; -int sysctl_ip_vs_nat_icmp_send = 0; - - -#ifdef CONFIG_IP_VS_DEBUG -static int sysctl_ip_vs_debug_level = 0; - -int ip_vs_get_debug_level(void) -{ - return sysctl_ip_vs_debug_level; -} -#endif - -#ifdef CONFIG_IP_VS_IPV6 -/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ -static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) -{ - struct rt6_info *rt; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = *addr, - .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) - return 1; - - return 0; -} -#endif -/* - * update_defense_level is called from keventd and from sysctl, - * so it needs to protect itself from softirqs - */ -static void update_defense_level(void) -{ - struct sysinfo i; - static int old_secure_tcp = 0; - int availmem; - int nomem; - int to_change = -1; - - /* we only count free and buffered memory (in pages) */ - si_meminfo(&i); - availmem = i.freeram + i.bufferram; - /* however in linux 2.5 the i.bufferram is total page cache size, - we need adjust it */ - /* si_swapinfo(&i); */ - /* availmem = availmem - (i.totalswap - i.freeswap); */ - - nomem = (availmem < sysctl_ip_vs_amemthresh); - - local_bh_disable(); - - /* drop_entry */ - spin_lock(&__ip_vs_dropentry_lock); - switch (sysctl_ip_vs_drop_entry) { - case 0: - atomic_set(&ip_vs_dropentry, 0); - break; - case 1: - if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - sysctl_ip_vs_drop_entry = 2; - } else { - atomic_set(&ip_vs_dropentry, 0); - } - break; - case 2: - if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - } else { - atomic_set(&ip_vs_dropentry, 0); - sysctl_ip_vs_drop_entry = 1; - }; - break; - case 3: - atomic_set(&ip_vs_dropentry, 1); - break; - } - spin_unlock(&__ip_vs_dropentry_lock); - - /* drop_packet */ - spin_lock(&__ip_vs_droppacket_lock); - switch (sysctl_ip_vs_drop_packet) { - case 0: - ip_vs_drop_rate = 0; - break; - case 1: - if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - sysctl_ip_vs_drop_packet = 2; - } else { - ip_vs_drop_rate = 0; - } - break; - case 2: - if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - } else { - ip_vs_drop_rate = 0; - sysctl_ip_vs_drop_packet = 1; - } - break; - case 3: - ip_vs_drop_rate = sysctl_ip_vs_am_droprate; - break; - } - spin_unlock(&__ip_vs_droppacket_lock); - - /* secure_tcp */ - write_lock(&__ip_vs_securetcp_lock); - switch (sysctl_ip_vs_secure_tcp) { - case 0: - if (old_secure_tcp >= 2) - to_change = 0; - break; - case 1: - if (nomem) { - if (old_secure_tcp < 2) - to_change = 1; - sysctl_ip_vs_secure_tcp = 2; - } else { - if (old_secure_tcp >= 2) - to_change = 0; - } - break; - case 2: - if (nomem) { - if (old_secure_tcp < 2) - to_change = 1; - } else { - if (old_secure_tcp >= 2) - to_change = 0; - sysctl_ip_vs_secure_tcp = 1; - } - break; - case 3: - if (old_secure_tcp < 2) - to_change = 1; - break; - } - old_secure_tcp = sysctl_ip_vs_secure_tcp; - if (to_change >= 0) - ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); - write_unlock(&__ip_vs_securetcp_lock); - - local_bh_enable(); -} - - -/* - * Timer for checking the defense - */ -#define DEFENSE_TIMER_PERIOD 1*HZ -static void defense_work_handler(struct work_struct *work); -static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); - -static void defense_work_handler(struct work_struct *work) -{ - update_defense_level(); - if (atomic_read(&ip_vs_dropentry)) - ip_vs_random_dropentry(); - - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); -} - -int -ip_vs_use_count_inc(void) -{ - return try_module_get(THIS_MODULE); -} - -void -ip_vs_use_count_dec(void) -{ - module_put(THIS_MODULE); -} - - -/* - * Hash table: for virtual service lookups - */ -#define IP_VS_SVC_TAB_BITS 8 -#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) -#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) - -/* the service table hashed by */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; -/* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; - -/* - * Hash table: for real service lookups - */ -#define IP_VS_RTAB_BITS 4 -#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) -#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) - -static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; - -/* - * Trash for destinations - */ -static LIST_HEAD(ip_vs_dest_trash); - -/* - * FTP & NULL virtual service counters - */ -static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); -static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); - - -/* - * Returns hash value for virtual service - */ -static __inline__ unsigned -ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, - __be16 port) -{ - register unsigned porth = ntohs(port); - __be32 addr_fold = addr->ip; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - addr_fold = addr->ip6[0]^addr->ip6[1]^ - addr->ip6[2]^addr->ip6[3]; -#endif - - return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) - & IP_VS_SVC_TAB_MASK; -} - -/* - * Returns hash value of fwmark for virtual service lookup - */ -static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) -{ - return fwmark & IP_VS_SVC_TAB_MASK; -} - -/* - * Hashes a service in the ip_vs_svc_table by - * or in the ip_vs_svc_fwm_table by fwmark. - * Should be called with locked tables. - */ -static int ip_vs_svc_hash(struct ip_vs_service *svc) -{ - unsigned hash; - - if (svc->flags & IP_VS_SVC_F_HASHED) { - IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - if (svc->fwmark == 0) { - /* - * Hash it by in ip_vs_svc_table - */ - hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, - svc->port); - list_add(&svc->s_list, &ip_vs_svc_table[hash]); - } else { - /* - * Hash it by fwmark in ip_vs_svc_fwm_table - */ - hash = ip_vs_svc_fwm_hashkey(svc->fwmark); - list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); - } - - svc->flags |= IP_VS_SVC_F_HASHED; - /* increase its refcnt because it is referenced by the svc table */ - atomic_inc(&svc->refcnt); - return 1; -} - - -/* - * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. - * Should be called with locked tables. - */ -static int ip_vs_svc_unhash(struct ip_vs_service *svc) -{ - if (!(svc->flags & IP_VS_SVC_F_HASHED)) { - IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - if (svc->fwmark == 0) { - /* Remove it from the ip_vs_svc_table table */ - list_del(&svc->s_list); - } else { - /* Remove it from the ip_vs_svc_fwm_table table */ - list_del(&svc->f_list); - } - - svc->flags &= ~IP_VS_SVC_F_HASHED; - atomic_dec(&svc->refcnt); - return 1; -} - - -/* - * Get service by {proto,addr,port} in the service table. - */ -static inline struct ip_vs_service * -__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, - __be16 vport) -{ - unsigned hash; - struct ip_vs_service *svc; - - /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); - - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ - if ((svc->af == af) - && ip_vs_addr_equal(af, &svc->addr, vaddr) - && (svc->port == vport) - && (svc->protocol == protocol)) { - /* HIT */ - atomic_inc(&svc->usecnt); - return svc; - } - } - - return NULL; -} - - -/* - * Get service by {fwmark} in the service table. - */ -static inline struct ip_vs_service * -__ip_vs_svc_fwm_get(int af, __u32 fwmark) -{ - unsigned hash; - struct ip_vs_service *svc; - - /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(fwmark); - - list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark && svc->af == af) { - /* HIT */ - atomic_inc(&svc->usecnt); - return svc; - } - } - - return NULL; -} - -struct ip_vs_service * -ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, - const union nf_inet_addr *vaddr, __be16 vport) -{ - struct ip_vs_service *svc; - - read_lock(&__ip_vs_svc_lock); - - /* - * Check the table hashed by fwmark first - */ - if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) - goto out; - - /* - * Check the table hashed by - * for "full" addressed entries - */ - svc = __ip_vs_service_get(af, protocol, vaddr, vport); - - if (svc == NULL - && protocol == IPPROTO_TCP - && atomic_read(&ip_vs_ftpsvc_counter) - && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { - /* - * Check if ftp service entry exists, the packet - * might belong to FTP data connections. - */ - svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); - } - - if (svc == NULL - && atomic_read(&ip_vs_nullsvc_counter)) { - /* - * Check if the catch-all port (port zero) exists - */ - svc = __ip_vs_service_get(af, protocol, vaddr, 0); - } - - out: - read_unlock(&__ip_vs_svc_lock); - - IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", - fwmark, ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), - svc ? "hit" : "not hit"); - - return svc; -} - - -static inline void -__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - atomic_inc(&svc->refcnt); - dest->svc = svc; -} - -static inline void -__ip_vs_unbind_svc(struct ip_vs_dest *dest) -{ - struct ip_vs_service *svc = dest->svc; - - dest->svc = NULL; - if (atomic_dec_and_test(&svc->refcnt)) - kfree(svc); -} - - -/* - * Returns hash value for real service - */ -static inline unsigned ip_vs_rs_hashkey(int af, - const union nf_inet_addr *addr, - __be16 port) -{ - register unsigned porth = ntohs(port); - __be32 addr_fold = addr->ip; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - addr_fold = addr->ip6[0]^addr->ip6[1]^ - addr->ip6[2]^addr->ip6[3]; -#endif - - return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) - & IP_VS_RTAB_MASK; -} - -/* - * Hashes ip_vs_dest in ip_vs_rtable by . - * should be called with locked tables. - */ -static int ip_vs_rs_hash(struct ip_vs_dest *dest) -{ - unsigned hash; - - if (!list_empty(&dest->d_list)) { - return 0; - } - - /* - * Hash by proto,addr,port, - * which are the parameters of the real service. - */ - hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); - - list_add(&dest->d_list, &ip_vs_rtable[hash]); - - return 1; -} - -/* - * UNhashes ip_vs_dest from ip_vs_rtable. - * should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) -{ - /* - * Remove it from the ip_vs_rtable table. - */ - if (!list_empty(&dest->d_list)) { - list_del(&dest->d_list); - INIT_LIST_HEAD(&dest->d_list); - } - - return 1; -} - -/* - * Lookup real service by in the real service table. - */ -struct ip_vs_dest * -ip_vs_lookup_real_service(int af, __u16 protocol, - const union nf_inet_addr *daddr, - __be16 dport) -{ - unsigned hash; - struct ip_vs_dest *dest; - - /* - * Check for "full" addressed entries - * Return the first found entry - */ - hash = ip_vs_rs_hashkey(af, daddr, dport); - - read_lock(&__ip_vs_rs_lock); - list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { - if ((dest->af == af) - && ip_vs_addr_equal(af, &dest->addr, daddr) - && (dest->port == dport) - && ((dest->protocol == protocol) || - dest->vfwmark)) { - /* HIT */ - read_unlock(&__ip_vs_rs_lock); - return dest; - } - } - read_unlock(&__ip_vs_rs_lock); - - return NULL; -} - -/* - * Lookup destination by {addr,port} in the given service - */ -static struct ip_vs_dest * -ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) -{ - struct ip_vs_dest *dest; - - /* - * Find the destination for the given service - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->af == svc->af) - && ip_vs_addr_equal(svc->af, &dest->addr, daddr) - && (dest->port == dport)) { - /* HIT */ - return dest; - } - } - - return NULL; -} - -/* - * Find destination by {daddr,dport,vaddr,protocol} - * Cretaed to be used in ip_vs_process_message() in - * the backup synchronization daemon. It finds the - * destination to be bound to the received connection - * on the backup. - * - * ip_vs_lookup_real_service() looked promissing, but - * seems not working as expected. - */ -struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, - __be16 dport, - const union nf_inet_addr *vaddr, - __be16 vport, __u16 protocol) -{ - struct ip_vs_dest *dest; - struct ip_vs_service *svc; - - svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); - if (!svc) - return NULL; - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest) - atomic_inc(&dest->refcnt); - ip_vs_service_put(svc); - return dest; -} - -/* - * Lookup dest by {svc,addr,port} in the destination trash. - * The destination trash is used to hold the destinations that are removed - * from the service table but are still referenced by some conn entries. - * The reason to add the destination trash is when the dest is temporary - * down (either by administrator or by monitor program), the dest can be - * picked back from the trash, the remaining connections to the dest can - * continue, and the counting information of the dest is also useful for - * scheduling. - */ -static struct ip_vs_dest * -ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) -{ - struct ip_vs_dest *dest, *nxt; - - /* - * Find the destination in trash - */ - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " - "dest->refcnt=%d\n", - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port), - atomic_read(&dest->refcnt)); - if (dest->af == svc->af && - ip_vs_addr_equal(svc->af, &dest->addr, daddr) && - dest->port == dport && - dest->vfwmark == svc->fwmark && - dest->protocol == svc->protocol && - (svc->fwmark || - (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && - dest->vport == svc->port))) { - /* HIT */ - return dest; - } - - /* - * Try to purge the destination from trash if not referenced - */ - if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " - "from trash\n", - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port)); - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); - } - } - - return NULL; -} - - -/* - * Clean up all the destinations in the trash - * Called by the ip_vs_control_cleanup() - * - * When the ip_vs_control_clearup is activated by ipvs module exit, - * the service tables must have been flushed and all the connections - * are expired, and the refcnt of each destination in the trash must - * be 1, so we simply release them here. - */ -static void ip_vs_trash_cleanup(void) -{ - struct ip_vs_dest *dest, *nxt; - - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); - } -} - - -static void -ip_vs_zero_stats(struct ip_vs_stats *stats) -{ - spin_lock_bh(&stats->lock); - - memset(&stats->ustats, 0, sizeof(stats->ustats)); - ip_vs_zero_estimator(stats); - - spin_unlock_bh(&stats->lock); -} - -/* - * Update a destination in the given service - */ -static void -__ip_vs_update_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) -{ - int conn_flags; - - /* set the weight and the flags */ - atomic_set(&dest->weight, udest->weight); - conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; - - /* check if local node and update the flags */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { - if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - } else -#endif - if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - - /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ - if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { - conn_flags |= IP_VS_CONN_F_NOOUTPUT; - } else { - /* - * Put the real service in ip_vs_rtable if not present. - * For now only for NAT! - */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_hash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - } - atomic_set(&dest->conn_flags, conn_flags); - - /* bind the service */ - if (!dest->svc) { - __ip_vs_bind_svc(dest, svc); - } else { - if (dest->svc != svc) { - __ip_vs_unbind_svc(dest); - ip_vs_zero_stats(&dest->stats); - __ip_vs_bind_svc(dest, svc); - } - } - - /* set the dest status flags */ - dest->flags |= IP_VS_DEST_F_AVAILABLE; - - if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - dest->u_threshold = udest->u_threshold; - dest->l_threshold = udest->l_threshold; -} - - -/* - * Create a destination for the given service - */ -static int -ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, - struct ip_vs_dest **dest_p) -{ - struct ip_vs_dest *dest; - unsigned atype; - - EnterFunction(2); - -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { - atype = ipv6_addr_type(&udest->addr.in6); - if ((!(atype & IPV6_ADDR_UNICAST) || - atype & IPV6_ADDR_LINKLOCAL) && - !__ip_vs_addr_is_local_v6(&udest->addr.in6)) - return -EINVAL; - } else -#endif - { - atype = inet_addr_type(&init_net, udest->addr.ip); - if (atype != RTN_LOCAL && atype != RTN_UNICAST) - return -EINVAL; - } - - dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); - if (dest == NULL) { - IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); - return -ENOMEM; - } - - dest->af = svc->af; - dest->protocol = svc->protocol; - dest->vaddr = svc->addr; - dest->vport = svc->port; - dest->vfwmark = svc->fwmark; - ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); - dest->port = udest->port; - - atomic_set(&dest->activeconns, 0); - atomic_set(&dest->inactconns, 0); - atomic_set(&dest->persistconns, 0); - atomic_set(&dest->refcnt, 0); - - INIT_LIST_HEAD(&dest->d_list); - spin_lock_init(&dest->dst_lock); - spin_lock_init(&dest->stats.lock); - __ip_vs_update_dest(svc, dest, udest); - ip_vs_new_estimator(&dest->stats); - - *dest_p = dest; - - LeaveFunction(2); - return 0; -} - - -/* - * Add a destination into an existing service - */ -static int -ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) -{ - struct ip_vs_dest *dest; - union nf_inet_addr daddr; - __be16 dport = udest->port; - int ret; - - EnterFunction(2); - - if (udest->weight < 0) { - IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); - return -ERANGE; - } - - if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " - "upper threshold\n"); - return -ERANGE; - } - - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - - /* - * Check if the dest already exists in the list - */ - dest = ip_vs_lookup_dest(svc, &daddr, dport); - - if (dest != NULL) { - IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); - return -EEXIST; - } - - /* - * Check if the dest already exists in the trash and - * is from the same service - */ - dest = ip_vs_trash_get_dest(svc, &daddr, dport); - - if (dest != NULL) { - IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " - "dest->refcnt=%d, service %u/%s:%u\n", - IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), - atomic_read(&dest->refcnt), - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->vaddr), - ntohs(dest->vport)); - - __ip_vs_update_dest(svc, dest, udest); - - /* - * Get the destination from the trash - */ - list_del(&dest->n_list); - - ip_vs_new_estimator(&dest->stats); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - return 0; - } - - /* - * Allocate and initialize the dest structure - */ - ret = ip_vs_new_dest(svc, udest, &dest); - if (ret) { - return ret; - } - - /* - * Add the dest entry into the list - */ - atomic_inc(&dest->refcnt); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - LeaveFunction(2); - - return 0; -} - - -/* - * Edit a destination in the given service - */ -static int -ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) -{ - struct ip_vs_dest *dest; - union nf_inet_addr daddr; - __be16 dport = udest->port; - - EnterFunction(2); - - if (udest->weight < 0) { - IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); - return -ERANGE; - } - - if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " - "upper threshold\n"); - return -ERANGE; - } - - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - - /* - * Lookup the destination list - */ - dest = ip_vs_lookup_dest(svc, &daddr, dport); - - if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); - return -ENOENT; - } - - __ip_vs_update_dest(svc, dest, udest); - - write_lock_bh(&__ip_vs_svc_lock); - - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* call the update_service, because server weight may be changed */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - LeaveFunction(2); - - return 0; -} - - -/* - * Delete a destination (must be already unlinked from the service) - */ -static void __ip_vs_del_dest(struct ip_vs_dest *dest) -{ - ip_vs_kill_estimator(&dest->stats); - - /* - * Remove it from the d-linked list with the real services. - */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_unhash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - - /* - * Decrease the refcnt of the dest, and free the dest - * if nobody refers to it (refcnt=0). Otherwise, throw - * the destination into the trash. - */ - if (atomic_dec_and_test(&dest->refcnt)) { - ip_vs_dst_reset(dest); - /* simply decrease svc->refcnt here, let the caller check - and release the service if nobody refers to it. - Only user context can release destination and service, - and only one user context can update virtual service at a - time, so the operation here is OK */ - atomic_dec(&dest->svc->refcnt); - kfree(dest); - } else { - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " - "dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port), - atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ip_vs_dest_trash); - atomic_inc(&dest->refcnt); - } -} - - -/* - * Unlink a destination from the given service - */ -static void __ip_vs_unlink_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, - int svcupd) -{ - dest->flags &= ~IP_VS_DEST_F_AVAILABLE; - - /* - * Remove it from the d-linked destination list. - */ - list_del(&dest->n_list); - svc->num_dests--; - - /* - * Call the update_service function of its scheduler - */ - if (svcupd && svc->scheduler->update_service) - svc->scheduler->update_service(svc); -} - - -/* - * Delete a destination server in the given service - */ -static int -ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) -{ - struct ip_vs_dest *dest; - __be16 dport = udest->port; - - EnterFunction(2); - - dest = ip_vs_lookup_dest(svc, &udest->addr, dport); - - if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); - return -ENOENT; - } - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* - * Unlink dest from the service - */ - __ip_vs_unlink_dest(svc, dest, 1); - - write_unlock_bh(&__ip_vs_svc_lock); - - /* - * Delete the destination - */ - __ip_vs_del_dest(dest); - - LeaveFunction(2); - - return 0; -} - - -/* - * Add a service into the service hash table - */ -static int -ip_vs_add_service(struct ip_vs_service_user_kern *u, - struct ip_vs_service **svc_p) -{ - int ret = 0; - struct ip_vs_scheduler *sched = NULL; - struct ip_vs_service *svc = NULL; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - /* Lookup the scheduler by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); - ret = -ENOENT; - goto out_mod_dec; - } - -#ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6) { - if (!sched->supports_ipv6) { - ret = -EAFNOSUPPORT; - goto out_err; - } - if ((u->netmask < 1) || (u->netmask > 128)) { - ret = -EINVAL; - goto out_err; - } - } -#endif - - svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); - if (svc == NULL) { - IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); - ret = -ENOMEM; - goto out_err; - } - - /* I'm the first user of the service */ - atomic_set(&svc->usecnt, 1); - atomic_set(&svc->refcnt, 0); - - svc->af = u->af; - svc->protocol = u->protocol; - ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); - svc->port = u->port; - svc->fwmark = u->fwmark; - svc->flags = u->flags; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; - - INIT_LIST_HEAD(&svc->destinations); - rwlock_init(&svc->sched_lock); - spin_lock_init(&svc->stats.lock); - - /* Bind the scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) - goto out_err; - sched = NULL; - - /* Update the virtual service counters */ - if (svc->port == FTPPORT) - atomic_inc(&ip_vs_ftpsvc_counter); - else if (svc->port == 0) - atomic_inc(&ip_vs_nullsvc_counter); - - ip_vs_new_estimator(&svc->stats); - - /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) - ip_vs_num_services++; - - /* Hash the service into the service table */ - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_hash(svc); - write_unlock_bh(&__ip_vs_svc_lock); - - *svc_p = svc; - return 0; - - out_err: - if (svc != NULL) { - if (svc->scheduler) - ip_vs_unbind_scheduler(svc); - if (svc->inc) { - local_bh_disable(); - ip_vs_app_inc_put(svc->inc); - local_bh_enable(); - } - kfree(svc); - } - ip_vs_scheduler_put(sched); - - out_mod_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return ret; -} - - -/* - * Edit a service and bind it with a new scheduler - */ -static int -ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) -{ - struct ip_vs_scheduler *sched, *old_sched; - int ret = 0; - - /* - * Lookup the scheduler, by 'u->sched_name' - */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); - return -ENOENT; - } - old_sched = sched; - -#ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6) { - if (!sched->supports_ipv6) { - ret = -EAFNOSUPPORT; - goto out; - } - if ((u->netmask < 1) || (u->netmask > 128)) { - ret = -EINVAL; - goto out; - } - } -#endif - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* - * Set the flags and timeout value - */ - svc->flags = u->flags | IP_VS_SVC_F_HASHED; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; - - old_sched = svc->scheduler; - if (sched != old_sched) { - /* - * Unbind the old scheduler - */ - if ((ret = ip_vs_unbind_scheduler(svc))) { - old_sched = sched; - goto out_unlock; - } - - /* - * Bind the new scheduler - */ - if ((ret = ip_vs_bind_scheduler(svc, sched))) { - /* - * If ip_vs_bind_scheduler fails, restore the old - * scheduler. - * The main reason of failure is out of memory. - * - * The question is if the old scheduler can be - * restored all the time. TODO: if it cannot be - * restored some time, we must delete the service, - * otherwise the system may crash. - */ - ip_vs_bind_scheduler(svc, old_sched); - old_sched = sched; - goto out_unlock; - } - } - - out_unlock: - write_unlock_bh(&__ip_vs_svc_lock); -#ifdef CONFIG_IP_VS_IPV6 - out: -#endif - - if (old_sched) - ip_vs_scheduler_put(old_sched); - - return ret; -} - - -/* - * Delete a service from the service list - * - The service must be unlinked, unlocked and not referenced! - * - We are called under _bh lock - */ -static void __ip_vs_del_service(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest, *nxt; - struct ip_vs_scheduler *old_sched; - - /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) - ip_vs_num_services--; - - ip_vs_kill_estimator(&svc->stats); - - /* Unbind scheduler */ - old_sched = svc->scheduler; - ip_vs_unbind_scheduler(svc); - if (old_sched) - ip_vs_scheduler_put(old_sched); - - /* Unbind app inc */ - if (svc->inc) { - ip_vs_app_inc_put(svc->inc); - svc->inc = NULL; - } - - /* - * Unlink the whole destination list - */ - list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { - __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(dest); - } - - /* - * Update the virtual service counters - */ - if (svc->port == FTPPORT) - atomic_dec(&ip_vs_ftpsvc_counter); - else if (svc->port == 0) - atomic_dec(&ip_vs_nullsvc_counter); - - /* - * Free the service if nobody refers to it - */ - if (atomic_read(&svc->refcnt) == 0) - kfree(svc); - - /* decrease the module use count */ - ip_vs_use_count_dec(); -} - -/* - * Delete a service from the service list - */ -static int ip_vs_del_service(struct ip_vs_service *svc) -{ - if (svc == NULL) - return -EEXIST; - - /* - * Unhash it from the service table - */ - write_lock_bh(&__ip_vs_svc_lock); - - ip_vs_svc_unhash(svc); - - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - __ip_vs_del_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - return 0; -} - - -/* - * Flush all the virtual services - */ -static int ip_vs_flush(void) -{ - int idx; - struct ip_vs_service *svc, *nxt; - - /* - * Flush the service table hashed by - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); - } - } - - /* - * Flush the service table hashed by fwmark - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, - &ip_vs_svc_fwm_table[idx], f_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); - } - } - - return 0; -} - - -/* - * Zero counters in a service or all services - */ -static int ip_vs_zero_service(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - - write_lock_bh(&__ip_vs_svc_lock); - list_for_each_entry(dest, &svc->destinations, n_list) { - ip_vs_zero_stats(&dest->stats); - } - ip_vs_zero_stats(&svc->stats); - write_unlock_bh(&__ip_vs_svc_lock); - return 0; -} - -static int ip_vs_zero_all(void) -{ - int idx; - struct ip_vs_service *svc; - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - ip_vs_zero_service(svc); - } - } - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - ip_vs_zero_service(svc); - } - } - - ip_vs_zero_stats(&ip_vs_stats); - return 0; -} - - -static int -proc_do_defense_mode(ctl_table *table, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val = *valp; - int rc; - - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); - if (write && (*valp != val)) { - if ((*valp < 0) || (*valp > 3)) { - /* Restore the correct value */ - *valp = val; - } else { - update_defense_level(); - } - } - return rc; -} - - -static int -proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val[2]; - int rc; - - /* backup the value first */ - memcpy(val, valp, sizeof(val)); - - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { - /* Restore the correct value */ - memcpy(valp, val, sizeof(val)); - } - return rc; -} - - -/* - * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) - */ - -static struct ctl_table vs_vars[] = { - { - .procname = "amemthresh", - .data = &sysctl_ip_vs_amemthresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#ifdef CONFIG_IP_VS_DEBUG - { - .procname = "debug_level", - .data = &sysctl_ip_vs_debug_level, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .procname = "am_droprate", - .data = &sysctl_ip_vs_am_droprate, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "drop_entry", - .data = &sysctl_ip_vs_drop_entry, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, - { - .procname = "drop_packet", - .data = &sysctl_ip_vs_drop_packet, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, - { - .procname = "secure_tcp", - .data = &sysctl_ip_vs_secure_tcp, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, -#if 0 - { - .procname = "timeout_established", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synsent", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synrecv", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_finwait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_timewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_close", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_closewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_lastack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_listen", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_udp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_icmp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, -#endif - { - .procname = "cache_bypass", - .data = &sysctl_ip_vs_cache_bypass, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "expire_nodest_conn", - .data = &sysctl_ip_vs_expire_nodest_conn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "expire_quiescent_template", - .data = &sysctl_ip_vs_expire_quiescent_template, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "sync_threshold", - .data = &sysctl_ip_vs_sync_threshold, - .maxlen = sizeof(sysctl_ip_vs_sync_threshold), - .mode = 0644, - .proc_handler = &proc_do_sync_threshold, - }, - { - .procname = "nat_icmp_send", - .data = &sysctl_ip_vs_nat_icmp_send, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { .ctl_name = 0 } -}; - -const struct ctl_path net_vs_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "vs", }, - { } -}; -EXPORT_SYMBOL_GPL(net_vs_ctl_path); - -static struct ctl_table_header * sysctl_header; - -#ifdef CONFIG_PROC_FS - -struct ip_vs_iter { - struct list_head *table; - int bucket; -}; - -/* - * Write the contents of the VS rule table to a PROCfs file. - * (It is kept just for backward compatibility) - */ -static inline const char *ip_vs_fwd_name(unsigned flags) -{ - switch (flags & IP_VS_CONN_F_FWD_MASK) { - case IP_VS_CONN_F_LOCALNODE: - return "Local"; - case IP_VS_CONN_F_TUNNEL: - return "Tunnel"; - case IP_VS_CONN_F_DROUTE: - return "Route"; - default: - return "Masq"; - } -} - - -/* Get the Nth entry in the two lists */ -static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) -{ - struct ip_vs_iter *iter = seq->private; - int idx; - struct ip_vs_service *svc; - - /* look in hash by protocol */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (pos-- == 0){ - iter->table = ip_vs_svc_table; - iter->bucket = idx; - return svc; - } - } - } - - /* keep looking in fwmark */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (pos-- == 0) { - iter->table = ip_vs_svc_fwm_table; - iter->bucket = idx; - return svc; - } - } - } - - return NULL; -} - -static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) -__acquires(__ip_vs_svc_lock) -{ - - read_lock_bh(&__ip_vs_svc_lock); - return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; -} - - -static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct list_head *e; - struct ip_vs_iter *iter; - struct ip_vs_service *svc; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_info_array(seq,0); - - svc = v; - iter = seq->private; - - if (iter->table == ip_vs_svc_table) { - /* next service in table hashed by protocol */ - if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, s_list); - - - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], - s_list) { - return svc; - } - } - - iter->table = ip_vs_svc_fwm_table; - iter->bucket = -1; - goto scan_fwmark; - } - - /* next service in hashed by fwmark */ - if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, f_list); - - scan_fwmark: - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], - f_list) - return svc; - } - - return NULL; -} - -static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) -__releases(__ip_vs_svc_lock) -{ - read_unlock_bh(&__ip_vs_svc_lock); -} - - -static int ip_vs_info_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) { - seq_printf(seq, - "IP Virtual Server version %d.%d.%d (size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); - seq_puts(seq, - "Prot LocalAddress:Port Scheduler Flags\n"); - seq_puts(seq, - " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); - } else { - const struct ip_vs_service *svc = v; - const struct ip_vs_iter *iter = seq->private; - const struct ip_vs_dest *dest; - - if (iter->table == ip_vs_svc_table) { -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ", - ip_vs_proto_name(svc->protocol), - NIP6(svc->addr.in6), - ntohs(svc->port), - svc->scheduler->name); - else -#endif - seq_printf(seq, "%s %08X:%04X %s ", - ip_vs_proto_name(svc->protocol), - ntohl(svc->addr.ip), - ntohs(svc->port), - svc->scheduler->name); - } else { - seq_printf(seq, "FWM %08X %s ", - svc->fwmark, svc->scheduler->name); - } - - if (svc->flags & IP_VS_SVC_F_PERSISTENT) - seq_printf(seq, "persistent %d %08X\n", - svc->timeout, - ntohl(svc->netmask)); - else - seq_putc(seq, '\n'); - - list_for_each_entry(dest, &svc->destinations, n_list) { -#ifdef CONFIG_IP_VS_IPV6 - if (dest->af == AF_INET6) - seq_printf(seq, - " -> [" NIP6_FMT "]:%04X" - " %-7s %-6d %-10d %-10d\n", - NIP6(dest->addr.in6), - ntohs(dest->port), - ip_vs_fwd_name(atomic_read(&dest->conn_flags)), - atomic_read(&dest->weight), - atomic_read(&dest->activeconns), - atomic_read(&dest->inactconns)); - else -#endif - seq_printf(seq, - " -> %08X:%04X " - "%-7s %-6d %-10d %-10d\n", - ntohl(dest->addr.ip), - ntohs(dest->port), - ip_vs_fwd_name(atomic_read(&dest->conn_flags)), - atomic_read(&dest->weight), - atomic_read(&dest->activeconns), - atomic_read(&dest->inactconns)); - - } - } - return 0; -} - -static const struct seq_operations ip_vs_info_seq_ops = { - .start = ip_vs_info_seq_start, - .next = ip_vs_info_seq_next, - .stop = ip_vs_info_seq_stop, - .show = ip_vs_info_seq_show, -}; - -static int ip_vs_info_open(struct inode *inode, struct file *file) -{ - return seq_open_private(file, &ip_vs_info_seq_ops, - sizeof(struct ip_vs_iter)); -} - -static const struct file_operations ip_vs_info_fops = { - .owner = THIS_MODULE, - .open = ip_vs_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -#endif - -struct ip_vs_stats ip_vs_stats = { - .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), -}; - -#ifdef CONFIG_PROC_FS -static int ip_vs_stats_show(struct seq_file *seq, void *v) -{ - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ - seq_puts(seq, - " Total Incoming Outgoing Incoming Outgoing\n"); - seq_printf(seq, - " Conns Packets Packets Bytes Bytes\n"); - - spin_lock_bh(&ip_vs_stats.lock); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, - ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, - (unsigned long long) ip_vs_stats.ustats.inbytes, - (unsigned long long) ip_vs_stats.ustats.outbytes); - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ - seq_puts(seq, - " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq,"%8X %8X %8X %16X %16X\n", - ip_vs_stats.ustats.cps, - ip_vs_stats.ustats.inpps, - ip_vs_stats.ustats.outpps, - ip_vs_stats.ustats.inbps, - ip_vs_stats.ustats.outbps); - spin_unlock_bh(&ip_vs_stats.lock); - - return 0; -} - -static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) -{ - return single_open(file, ip_vs_stats_show, NULL); -} - -static const struct file_operations ip_vs_stats_fops = { - .owner = THIS_MODULE, - .open = ip_vs_stats_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -#endif - -/* - * Set timeout values for tcp tcpfin udp in the timeout_table. - */ -static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) -{ - IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", - u->tcp_timeout, - u->tcp_fin_timeout, - u->udp_timeout); - -#ifdef CONFIG_IP_VS_PROTO_TCP - if (u->tcp_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] - = u->tcp_timeout * HZ; - } - - if (u->tcp_fin_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] - = u->tcp_fin_timeout * HZ; - } -#endif - -#ifdef CONFIG_IP_VS_PROTO_UDP - if (u->udp_timeout) { - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] - = u->udp_timeout * HZ; - } -#endif - return 0; -} - - -#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) -#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ - sizeof(struct ip_vs_dest_user)) -#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) -#define MAX_ARG_LEN SVCDEST_ARG_LEN - -static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { - [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, - [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, -}; - -static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, - struct ip_vs_service_user *usvc_compat) -{ - usvc->af = AF_INET; - usvc->protocol = usvc_compat->protocol; - usvc->addr.ip = usvc_compat->addr; - usvc->port = usvc_compat->port; - usvc->fwmark = usvc_compat->fwmark; - - /* Deep copy of sched_name is not needed here */ - usvc->sched_name = usvc_compat->sched_name; - - usvc->flags = usvc_compat->flags; - usvc->timeout = usvc_compat->timeout; - usvc->netmask = usvc_compat->netmask; -} - -static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, - struct ip_vs_dest_user *udest_compat) -{ - udest->addr.ip = udest_compat->addr; - udest->port = udest_compat->port; - udest->conn_flags = udest_compat->conn_flags; - udest->weight = udest_compat->weight; - udest->u_threshold = udest_compat->u_threshold; - udest->l_threshold = udest_compat->l_threshold; -} - -static int -do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) -{ - int ret; - unsigned char arg[MAX_ARG_LEN]; - struct ip_vs_service_user *usvc_compat; - struct ip_vs_service_user_kern usvc; - struct ip_vs_service *svc; - struct ip_vs_dest_user *udest_compat; - struct ip_vs_dest_user_kern udest; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (len != set_arglen[SET_CMDID(cmd)]) { - IP_VS_ERR("set_ctl: len %u != %u\n", - len, set_arglen[SET_CMDID(cmd)]); - return -EINVAL; - } - - if (copy_from_user(arg, user, len) != 0) - return -EFAULT; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - if (mutex_lock_interruptible(&__ip_vs_mutex)) { - ret = -ERESTARTSYS; - goto out_dec; - } - - if (cmd == IP_VS_SO_SET_FLUSH) { - /* Flush the virtual service */ - ret = ip_vs_flush(); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_TIMEOUT) { - /* Set timeout values for (tcp tcpfin udp) */ - ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = stop_sync_thread(dm->state); - goto out_unlock; - } - - usvc_compat = (struct ip_vs_service_user *)arg; - udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); - - /* We only use the new structs internally, so copy userspace compat - * structs to extended internal versions */ - ip_vs_copy_usvc_compat(&usvc, usvc_compat); - ip_vs_copy_udest_compat(&udest, udest_compat); - - if (cmd == IP_VS_SO_SET_ZERO) { - /* if no service address is set, zero counters in all */ - if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { - ret = ip_vs_zero_all(); - goto out_unlock; - } - } - - /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ - if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) { - IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", - usvc.protocol, NIPQUAD(usvc.addr.ip), - ntohs(usvc.port), usvc.sched_name); - ret = -EFAULT; - goto out_unlock; - } - - /* Lookup the exact service by or fwmark */ - if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); - else - svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - - if (cmd != IP_VS_SO_SET_ADD - && (svc == NULL || svc->protocol != usvc.protocol)) { - ret = -ESRCH; - goto out_unlock; - } - - switch (cmd) { - case IP_VS_SO_SET_ADD: - if (svc != NULL) - ret = -EEXIST; - else - ret = ip_vs_add_service(&usvc, &svc); - break; - case IP_VS_SO_SET_EDIT: - ret = ip_vs_edit_service(svc, &usvc); - break; - case IP_VS_SO_SET_DEL: - ret = ip_vs_del_service(svc); - if (!ret) - goto out_unlock; - break; - case IP_VS_SO_SET_ZERO: - ret = ip_vs_zero_service(svc); - break; - case IP_VS_SO_SET_ADDDEST: - ret = ip_vs_add_dest(svc, &udest); - break; - case IP_VS_SO_SET_EDITDEST: - ret = ip_vs_edit_dest(svc, &udest); - break; - case IP_VS_SO_SET_DELDEST: - ret = ip_vs_del_dest(svc, &udest); - break; - default: - ret = -EINVAL; - } - - if (svc) - ip_vs_service_put(svc); - - out_unlock: - mutex_unlock(&__ip_vs_mutex); - out_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return ret; -} - - -static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) -{ - spin_lock_bh(&src->lock); - memcpy(dst, &src->ustats, sizeof(*dst)); - spin_unlock_bh(&src->lock); -} - -static void -ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) -{ - dst->protocol = src->protocol; - dst->addr = src->addr.ip; - dst->port = src->port; - dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); - dst->flags = src->flags; - dst->timeout = src->timeout / HZ; - dst->netmask = src->netmask; - dst->num_dests = src->num_dests; - ip_vs_copy_stats(&dst->stats, &src->stats); -} - -static inline int -__ip_vs_get_service_entries(const struct ip_vs_get_services *get, - struct ip_vs_get_services __user *uptr) -{ - int idx, count=0; - struct ip_vs_service *svc; - struct ip_vs_service_entry entry; - int ret = 0; - - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) - continue; - - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } - - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) - continue; - - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } - out: - return ret; -} - -static inline int -__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, - struct ip_vs_get_dests __user *uptr) -{ - struct ip_vs_service *svc; - union nf_inet_addr addr = { .ip = get->addr }; - int ret = 0; - - if (get->fwmark) - svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); - else - svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, - get->port); - - if (svc) { - int count = 0; - struct ip_vs_dest *dest; - struct ip_vs_dest_entry entry; - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (count >= get->num_dests) - break; - - entry.addr = dest->addr.ip; - entry.port = dest->port; - entry.conn_flags = atomic_read(&dest->conn_flags); - entry.weight = atomic_read(&dest->weight); - entry.u_threshold = dest->u_threshold; - entry.l_threshold = dest->l_threshold; - entry.activeconns = atomic_read(&dest->activeconns); - entry.inactconns = atomic_read(&dest->inactconns); - entry.persistconns = atomic_read(&dest->persistconns); - ip_vs_copy_stats(&entry.stats, &dest->stats); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - break; - } - count++; - } - ip_vs_service_put(svc); - } else - ret = -ESRCH; - return ret; -} - -static inline void -__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) -{ -#ifdef CONFIG_IP_VS_PROTO_TCP - u->tcp_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; - u->tcp_fin_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - u->udp_timeout = - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; -#endif -} - - -#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) -#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) -#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) -#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) -#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) - -static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { - [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, - [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, -}; - -static int -do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - unsigned char arg[128]; - int ret = 0; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (*len < get_arglen[GET_CMDID(cmd)]) { - IP_VS_ERR("get_ctl: len %u < %u\n", - *len, get_arglen[GET_CMDID(cmd)]); - return -EINVAL; - } - - if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) - return -EFAULT; - - if (mutex_lock_interruptible(&__ip_vs_mutex)) - return -ERESTARTSYS; - - switch (cmd) { - case IP_VS_SO_GET_VERSION: - { - char buf[64]; - - sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); - if (copy_to_user(user, buf, strlen(buf)+1) != 0) { - ret = -EFAULT; - goto out; - } - *len = strlen(buf)+1; - } - break; - - case IP_VS_SO_GET_INFO: - { - struct ip_vs_getinfo info; - info.version = IP_VS_VERSION_CODE; - info.size = IP_VS_CONN_TAB_SIZE; - info.num_services = ip_vs_num_services; - if (copy_to_user(user, &info, sizeof(info)) != 0) - ret = -EFAULT; - } - break; - - case IP_VS_SO_GET_SERVICES: - { - struct ip_vs_get_services *get; - int size; - - get = (struct ip_vs_get_services *)arg; - size = sizeof(*get) + - sizeof(struct ip_vs_service_entry) * get->num_services; - if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_service_entries(get, user); - } - break; - - case IP_VS_SO_GET_SERVICE: - { - struct ip_vs_service_entry *entry; - struct ip_vs_service *svc; - union nf_inet_addr addr; - - entry = (struct ip_vs_service_entry *)arg; - addr.ip = entry->addr; - if (entry->fwmark) - svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); - else - svc = __ip_vs_service_get(AF_INET, entry->protocol, - &addr, entry->port); - if (svc) { - ip_vs_copy_service(entry, svc); - if (copy_to_user(user, entry, sizeof(*entry)) != 0) - ret = -EFAULT; - ip_vs_service_put(svc); - } else - ret = -ESRCH; - } - break; - - case IP_VS_SO_GET_DESTS: - { - struct ip_vs_get_dests *get; - int size; - - get = (struct ip_vs_get_dests *)arg; - size = sizeof(*get) + - sizeof(struct ip_vs_dest_entry) * get->num_dests; - if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_dest_entries(get, user); - } - break; - - case IP_VS_SO_GET_TIMEOUT: - { - struct ip_vs_timeout_user t; - - __ip_vs_get_timeouts(&t); - if (copy_to_user(user, &t, sizeof(t)) != 0) - ret = -EFAULT; - } - break; - - case IP_VS_SO_GET_DAEMON: - { - struct ip_vs_daemon_user d[2]; - - memset(&d, 0, sizeof(d)); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) { - d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ip_vs_master_syncid; - } - if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { - d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ip_vs_backup_syncid; - } - if (copy_to_user(user, &d, sizeof(d)) != 0) - ret = -EFAULT; - } - break; - - default: - ret = -EINVAL; - } - - out: - mutex_unlock(&__ip_vs_mutex); - return ret; -} - - -static struct nf_sockopt_ops ip_vs_sockopts = { - .pf = PF_INET, - .set_optmin = IP_VS_BASE_CTL, - .set_optmax = IP_VS_SO_SET_MAX+1, - .set = do_ip_vs_set_ctl, - .get_optmin = IP_VS_BASE_CTL, - .get_optmax = IP_VS_SO_GET_MAX+1, - .get = do_ip_vs_get_ctl, - .owner = THIS_MODULE, -}; - -/* - * Generic Netlink interface - */ - -/* IPVS genetlink family */ -static struct genl_family ip_vs_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = IPVS_GENL_NAME, - .version = IPVS_GENL_VERSION, - .maxattr = IPVS_CMD_MAX, -}; - -/* Policy used for first-level command attributes */ -static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { - [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, - [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, - [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, - [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, - [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, - [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, -}; - -/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ -static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { - [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, - [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, - .len = IP_VS_IFNAME_MAXLEN }, - [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, -}; - -/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ -static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { - [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, - [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, - [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, - .len = sizeof(union nf_inet_addr) }, - [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, - [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, - [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, - .len = IP_VS_SCHEDNAME_MAXLEN }, - [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, - .len = sizeof(struct ip_vs_flags) }, - [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, - [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, - [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, -}; - -/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ -static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { - [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, - .len = sizeof(union nf_inet_addr) }, - [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, - [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, -}; - -static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, - struct ip_vs_stats *stats) -{ - struct nlattr *nl_stats = nla_nest_start(skb, container_type); - if (!nl_stats) - return -EMSGSIZE; - - spin_lock_bh(&stats->lock); - - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); - - spin_unlock_bh(&stats->lock); - - nla_nest_end(skb, nl_stats); - - return 0; - -nla_put_failure: - spin_unlock_bh(&stats->lock); - nla_nest_cancel(skb, nl_stats); - return -EMSGSIZE; -} - -static int ip_vs_genl_fill_service(struct sk_buff *skb, - struct ip_vs_service *svc) -{ - struct nlattr *nl_service; - struct ip_vs_flags flags = { .flags = svc->flags, - .mask = ~0 }; - - nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); - if (!nl_service) - return -EMSGSIZE; - - NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); - - if (svc->fwmark) { - NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); - } else { - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); - NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); - } - - NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); - NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); - - if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) - goto nla_put_failure; - - nla_nest_end(skb, nl_service); - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nl_service); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_service(struct sk_buff *skb, - struct ip_vs_service *svc, - struct netlink_callback *cb) -{ - void *hdr; - - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - &ip_vs_genl_family, NLM_F_MULTI, - IPVS_CMD_NEW_SERVICE); - if (!hdr) - return -EMSGSIZE; - - if (ip_vs_genl_fill_service(skb, svc) < 0) - goto nla_put_failure; - - return genlmsg_end(skb, hdr); - -nla_put_failure: - genlmsg_cancel(skb, hdr); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_services(struct sk_buff *skb, - struct netlink_callback *cb) -{ - int idx = 0, i; - int start = cb->args[0]; - struct ip_vs_service *svc; - - mutex_lock(&__ip_vs_mutex); - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { - if (++idx <= start) - continue; - if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { - idx--; - goto nla_put_failure; - } - } - } - - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { - if (++idx <= start) - continue; - if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { - idx--; - goto nla_put_failure; - } - } - } - -nla_put_failure: - mutex_unlock(&__ip_vs_mutex); - cb->args[0] = idx; - - return skb->len; -} - -static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, - struct nlattr *nla, int full_entry) -{ - struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; - struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; - - /* Parse mandatory identifying service fields first */ - if (nla == NULL || - nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) - return -EINVAL; - - nla_af = attrs[IPVS_SVC_ATTR_AF]; - nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; - nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; - nla_port = attrs[IPVS_SVC_ATTR_PORT]; - nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; - - if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) - return -EINVAL; - - usvc->af = nla_get_u16(nla_af); -#ifdef CONFIG_IP_VS_IPV6 - if (usvc->af != AF_INET && usvc->af != AF_INET6) -#else - if (usvc->af != AF_INET) -#endif - return -EAFNOSUPPORT; - - if (nla_fwmark) { - usvc->protocol = IPPROTO_TCP; - usvc->fwmark = nla_get_u32(nla_fwmark); - } else { - usvc->protocol = nla_get_u16(nla_protocol); - nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); - usvc->port = nla_get_u16(nla_port); - usvc->fwmark = 0; - } - - /* If a full entry was requested, check for the additional fields */ - if (full_entry) { - struct nlattr *nla_sched, *nla_flags, *nla_timeout, - *nla_netmask; - struct ip_vs_flags flags; - struct ip_vs_service *svc; - - nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; - nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; - nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; - nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; - - if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) - return -EINVAL; - - nla_memcpy(&flags, nla_flags, sizeof(flags)); - - /* prefill flags from service if it already exists */ - if (usvc->fwmark) - svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark); - else - svc = __ip_vs_service_get(usvc->af, usvc->protocol, - &usvc->addr, usvc->port); - if (svc) { - usvc->flags = svc->flags; - ip_vs_service_put(svc); - } else - usvc->flags = 0; - - /* set new flags from userland */ - usvc->flags = (usvc->flags & ~flags.mask) | - (flags.flags & flags.mask); - usvc->sched_name = nla_data(nla_sched); - usvc->timeout = nla_get_u32(nla_timeout); - usvc->netmask = nla_get_u32(nla_netmask); - } - - return 0; -} - -static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) -{ - struct ip_vs_service_user_kern usvc; - int ret; - - ret = ip_vs_genl_parse_service(&usvc, nla, 0); - if (ret) - return ERR_PTR(ret); - - if (usvc.fwmark) - return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - else - return __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); -} - -static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) -{ - struct nlattr *nl_dest; - - nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); - if (!nl_dest) - return -EMSGSIZE; - - NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); - NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); - - NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, - atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, - atomic_read(&dest->activeconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, - atomic_read(&dest->inactconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns)); - - if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) - goto nla_put_failure; - - nla_nest_end(skb, nl_dest); - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nl_dest); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, - struct netlink_callback *cb) -{ - void *hdr; - - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - &ip_vs_genl_family, NLM_F_MULTI, - IPVS_CMD_NEW_DEST); - if (!hdr) - return -EMSGSIZE; - - if (ip_vs_genl_fill_dest(skb, dest) < 0) - goto nla_put_failure; - - return genlmsg_end(skb, hdr); - -nla_put_failure: - genlmsg_cancel(skb, hdr); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_dests(struct sk_buff *skb, - struct netlink_callback *cb) -{ - int idx = 0; - int start = cb->args[0]; - struct ip_vs_service *svc; - struct ip_vs_dest *dest; - struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; - - mutex_lock(&__ip_vs_mutex); - - /* Try to find the service for which to dump destinations */ - if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, - IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) - goto out_err; - - svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); - if (IS_ERR(svc) || svc == NULL) - goto out_err; - - /* Dump the destinations */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (++idx <= start) - continue; - if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { - idx--; - goto nla_put_failure; - } - } - -nla_put_failure: - cb->args[0] = idx; - ip_vs_service_put(svc); - -out_err: - mutex_unlock(&__ip_vs_mutex); - - return skb->len; -} - -static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, - struct nlattr *nla, int full_entry) -{ - struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; - struct nlattr *nla_addr, *nla_port; - - /* Parse mandatory identifying destination fields first */ - if (nla == NULL || - nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) - return -EINVAL; - - nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; - nla_port = attrs[IPVS_DEST_ATTR_PORT]; - - if (!(nla_addr && nla_port)) - return -EINVAL; - - nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); - udest->port = nla_get_u16(nla_port); - - /* If a full entry was requested, check for the additional fields */ - if (full_entry) { - struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, - *nla_l_thresh; - - nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; - nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; - nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; - nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; - - if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) - return -EINVAL; - - udest->conn_flags = nla_get_u32(nla_fwd) - & IP_VS_CONN_F_FWD_MASK; - udest->weight = nla_get_u32(nla_weight); - udest->u_threshold = nla_get_u32(nla_u_thresh); - udest->l_threshold = nla_get_u32(nla_l_thresh); - } - - return 0; -} - -static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid) -{ - struct nlattr *nl_daemon; - - nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); - if (!nl_daemon) - return -EMSGSIZE; - - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); - NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); - - nla_nest_end(skb, nl_daemon); - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nl_daemon); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid, - struct netlink_callback *cb) -{ - void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - &ip_vs_genl_family, NLM_F_MULTI, - IPVS_CMD_NEW_DAEMON); - if (!hdr) - return -EMSGSIZE; - - if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) - goto nla_put_failure; - - return genlmsg_end(skb, hdr); - -nla_put_failure: - genlmsg_cancel(skb, hdr); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_daemons(struct sk_buff *skb, - struct netlink_callback *cb) -{ - mutex_lock(&__ip_vs_mutex); - if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { - if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, - ip_vs_master_mcast_ifn, - ip_vs_master_syncid, cb) < 0) - goto nla_put_failure; - - cb->args[0] = 1; - } - - if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { - if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, - ip_vs_backup_mcast_ifn, - ip_vs_backup_syncid, cb) < 0) - goto nla_put_failure; - - cb->args[1] = 1; - } - -nla_put_failure: - mutex_unlock(&__ip_vs_mutex); - - return skb->len; -} - -static int ip_vs_genl_new_daemon(struct nlattr **attrs) -{ - if (!(attrs[IPVS_DAEMON_ATTR_STATE] && - attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && - attrs[IPVS_DAEMON_ATTR_SYNC_ID])) - return -EINVAL; - - return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), - nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), - nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); -} - -static int ip_vs_genl_del_daemon(struct nlattr **attrs) -{ - if (!attrs[IPVS_DAEMON_ATTR_STATE]) - return -EINVAL; - - return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); -} - -static int ip_vs_genl_set_config(struct nlattr **attrs) -{ - struct ip_vs_timeout_user t; - - __ip_vs_get_timeouts(&t); - - if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) - t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); - - if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) - t.tcp_fin_timeout = - nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); - - if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) - t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); - - return ip_vs_set_timeout(&t); -} - -static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) -{ - struct ip_vs_service *svc = NULL; - struct ip_vs_service_user_kern usvc; - struct ip_vs_dest_user_kern udest; - int ret = 0, cmd; - int need_full_svc = 0, need_full_dest = 0; - - cmd = info->genlhdr->cmd; - - mutex_lock(&__ip_vs_mutex); - - if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(); - goto out; - } else if (cmd == IPVS_CMD_SET_CONFIG) { - ret = ip_vs_genl_set_config(info->attrs); - goto out; - } else if (cmd == IPVS_CMD_NEW_DAEMON || - cmd == IPVS_CMD_DEL_DAEMON) { - - struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; - - if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || - nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, - info->attrs[IPVS_CMD_ATTR_DAEMON], - ip_vs_daemon_policy)) { - ret = -EINVAL; - goto out; - } - - if (cmd == IPVS_CMD_NEW_DAEMON) - ret = ip_vs_genl_new_daemon(daemon_attrs); - else - ret = ip_vs_genl_del_daemon(daemon_attrs); - goto out; - } else if (cmd == IPVS_CMD_ZERO && - !info->attrs[IPVS_CMD_ATTR_SERVICE]) { - ret = ip_vs_zero_all(); - goto out; - } - - /* All following commands require a service argument, so check if we - * received a valid one. We need a full service specification when - * adding / editing a service. Only identifying members otherwise. */ - if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) - need_full_svc = 1; - - ret = ip_vs_genl_parse_service(&usvc, - info->attrs[IPVS_CMD_ATTR_SERVICE], - need_full_svc); - if (ret) - goto out; - - /* Lookup the exact service by or fwmark */ - if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); - else - svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - - /* Unless we're adding a new service, the service must already exist */ - if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { - ret = -ESRCH; - goto out; - } - - /* Destination commands require a valid destination argument. For - * adding / editing a destination, we need a full destination - * specification. */ - if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || - cmd == IPVS_CMD_DEL_DEST) { - if (cmd != IPVS_CMD_DEL_DEST) - need_full_dest = 1; - - ret = ip_vs_genl_parse_dest(&udest, - info->attrs[IPVS_CMD_ATTR_DEST], - need_full_dest); - if (ret) - goto out; - } - - switch (cmd) { - case IPVS_CMD_NEW_SERVICE: - if (svc == NULL) - ret = ip_vs_add_service(&usvc, &svc); - else - ret = -EEXIST; - break; - case IPVS_CMD_SET_SERVICE: - ret = ip_vs_edit_service(svc, &usvc); - break; - case IPVS_CMD_DEL_SERVICE: - ret = ip_vs_del_service(svc); - break; - case IPVS_CMD_NEW_DEST: - ret = ip_vs_add_dest(svc, &udest); - break; - case IPVS_CMD_SET_DEST: - ret = ip_vs_edit_dest(svc, &udest); - break; - case IPVS_CMD_DEL_DEST: - ret = ip_vs_del_dest(svc, &udest); - break; - case IPVS_CMD_ZERO: - ret = ip_vs_zero_service(svc); - break; - default: - ret = -EINVAL; - } - -out: - if (svc) - ip_vs_service_put(svc); - mutex_unlock(&__ip_vs_mutex); - - return ret; -} - -static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) -{ - struct sk_buff *msg; - void *reply; - int ret, cmd, reply_cmd; - - cmd = info->genlhdr->cmd; - - if (cmd == IPVS_CMD_GET_SERVICE) - reply_cmd = IPVS_CMD_NEW_SERVICE; - else if (cmd == IPVS_CMD_GET_INFO) - reply_cmd = IPVS_CMD_SET_INFO; - else if (cmd == IPVS_CMD_GET_CONFIG) - reply_cmd = IPVS_CMD_SET_CONFIG; - else { - IP_VS_ERR("unknown Generic Netlink command\n"); - return -EINVAL; - } - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - mutex_lock(&__ip_vs_mutex); - - reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); - if (reply == NULL) - goto nla_put_failure; - - switch (cmd) { - case IPVS_CMD_GET_SERVICE: - { - struct ip_vs_service *svc; - - svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); - if (IS_ERR(svc)) { - ret = PTR_ERR(svc); - goto out_err; - } else if (svc) { - ret = ip_vs_genl_fill_service(msg, svc); - ip_vs_service_put(svc); - if (ret) - goto nla_put_failure; - } else { - ret = -ESRCH; - goto out_err; - } - - break; - } - - case IPVS_CMD_GET_CONFIG: - { - struct ip_vs_timeout_user t; - - __ip_vs_get_timeouts(&t); -#ifdef CONFIG_IP_VS_PROTO_TCP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, - t.tcp_fin_timeout); -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); -#endif - - break; - } - - case IPVS_CMD_GET_INFO: - NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); - NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - IP_VS_CONN_TAB_SIZE); - break; - } - - genlmsg_end(msg, reply); - ret = genlmsg_unicast(msg, info->snd_pid); - goto out; - -nla_put_failure: - IP_VS_ERR("not enough space in Netlink message\n"); - ret = -EMSGSIZE; - -out_err: - nlmsg_free(msg); -out: - mutex_unlock(&__ip_vs_mutex); - - return ret; -} - - -static struct genl_ops ip_vs_genl_ops[] __read_mostly = { - { - .cmd = IPVS_CMD_NEW_SERVICE, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_SET_SERVICE, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_DEL_SERVICE, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_SERVICE, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_get_cmd, - .dumpit = ip_vs_genl_dump_services, - .policy = ip_vs_cmd_policy, - }, - { - .cmd = IPVS_CMD_NEW_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_SET_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_DEL_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .dumpit = ip_vs_genl_dump_dests, - }, - { - .cmd = IPVS_CMD_NEW_DAEMON, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_DEL_DAEMON, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_DAEMON, - .flags = GENL_ADMIN_PERM, - .dumpit = ip_vs_genl_dump_daemons, - }, - { - .cmd = IPVS_CMD_SET_CONFIG, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_CONFIG, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_get_cmd, - }, - { - .cmd = IPVS_CMD_GET_INFO, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_get_cmd, - }, - { - .cmd = IPVS_CMD_ZERO, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_FLUSH, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_set_cmd, - }, -}; - -static int __init ip_vs_genl_register(void) -{ - int ret, i; - - ret = genl_register_family(&ip_vs_genl_family); - if (ret) - return ret; - - for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) { - ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]); - if (ret) - goto err_out; - } - return 0; - -err_out: - genl_unregister_family(&ip_vs_genl_family); - return ret; -} - -static void ip_vs_genl_unregister(void) -{ - genl_unregister_family(&ip_vs_genl_family); -} - -/* End of Generic Netlink interface definitions */ - - -int __init ip_vs_control_init(void) -{ - int ret; - int idx; - - EnterFunction(2); - - ret = nf_register_sockopt(&ip_vs_sockopts); - if (ret) { - IP_VS_ERR("cannot register sockopt.\n"); - return ret; - } - - ret = ip_vs_genl_register(); - if (ret) { - IP_VS_ERR("cannot register Generic Netlink interface.\n"); - nf_unregister_sockopt(&ip_vs_sockopts); - return ret; - } - - proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); - - /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_rtable[idx]); - } - - ip_vs_new_estimator(&ip_vs_stats); - - /* Hook the defense timer */ - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); - - LeaveFunction(2); - return 0; -} - - -void ip_vs_control_cleanup(void) -{ - EnterFunction(2); - ip_vs_trash_cleanup(); - cancel_rearming_delayed_work(&defense_work); - cancel_work_sync(&defense_work.work); - ip_vs_kill_estimator(&ip_vs_stats); - unregister_sysctl_table(sysctl_header); - proc_net_remove(&init_net, "ip_vs_stats"); - proc_net_remove(&init_net, "ip_vs"); - ip_vs_genl_unregister(); - nf_unregister_sockopt(&ip_vs_sockopts); - LeaveFunction(2); -} diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c deleted file mode 100644 index 2eb2860dabb..00000000000 --- a/net/ipv4/ipvs/ip_vs_est.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * ip_vs_est.c: simple rate estimator for IPVS - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - This code is to estimate rate in a shorter interval (such as 8 - seconds) for virtual services and real servers. For measure rate in a - long interval, it is easy to implement a user level daemon which - periodically reads those statistical counters and measure rate. - - Currently, the measurement is activated by slow timer handler. Hope - this measurement will not introduce too much load. - - We measure rate during the last 8 seconds every 2 seconds: - - avgrate = avgrate*(1-W) + rate*W - - where W = 2^(-2) - - NOTES. - - * The stored value for average bps is scaled by 2^5, so that maximal - rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. - - * A lot code is taken from net/sched/estimator.c - */ - - -static void estimation_timer(unsigned long arg); - -static LIST_HEAD(est_list); -static DEFINE_SPINLOCK(est_lock); -static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); - -static void estimation_timer(unsigned long arg) -{ - struct ip_vs_estimator *e; - struct ip_vs_stats *s; - u32 n_conns; - u32 n_inpkts, n_outpkts; - u64 n_inbytes, n_outbytes; - u32 rate; - - spin_lock(&est_lock); - list_for_each_entry(e, &est_list, list) { - s = container_of(e, struct ip_vs_stats, est); - - spin_lock(&s->lock); - n_conns = s->ustats.conns; - n_inpkts = s->ustats.inpkts; - n_outpkts = s->ustats.outpkts; - n_inbytes = s->ustats.inbytes; - n_outbytes = s->ustats.outbytes; - - /* scaled by 2^10, but divided 2 seconds */ - rate = (n_conns - e->last_conns)<<9; - e->last_conns = n_conns; - e->cps += ((long)rate - (long)e->cps)>>2; - s->ustats.cps = (e->cps+0x1FF)>>10; - - rate = (n_inpkts - e->last_inpkts)<<9; - e->last_inpkts = n_inpkts; - e->inpps += ((long)rate - (long)e->inpps)>>2; - s->ustats.inpps = (e->inpps+0x1FF)>>10; - - rate = (n_outpkts - e->last_outpkts)<<9; - e->last_outpkts = n_outpkts; - e->outpps += ((long)rate - (long)e->outpps)>>2; - s->ustats.outpps = (e->outpps+0x1FF)>>10; - - rate = (n_inbytes - e->last_inbytes)<<4; - e->last_inbytes = n_inbytes; - e->inbps += ((long)rate - (long)e->inbps)>>2; - s->ustats.inbps = (e->inbps+0xF)>>5; - - rate = (n_outbytes - e->last_outbytes)<<4; - e->last_outbytes = n_outbytes; - e->outbps += ((long)rate - (long)e->outbps)>>2; - s->ustats.outbps = (e->outbps+0xF)>>5; - spin_unlock(&s->lock); - } - spin_unlock(&est_lock); - mod_timer(&est_timer, jiffies + 2*HZ); -} - -void ip_vs_new_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - INIT_LIST_HEAD(&est->list); - - est->last_conns = stats->ustats.conns; - est->cps = stats->ustats.cps<<10; - - est->last_inpkts = stats->ustats.inpkts; - est->inpps = stats->ustats.inpps<<10; - - est->last_outpkts = stats->ustats.outpkts; - est->outpps = stats->ustats.outpps<<10; - - est->last_inbytes = stats->ustats.inbytes; - est->inbps = stats->ustats.inbps<<5; - - est->last_outbytes = stats->ustats.outbytes; - est->outbps = stats->ustats.outbps<<5; - - spin_lock_bh(&est_lock); - list_add(&est->list, &est_list); - spin_unlock_bh(&est_lock); -} - -void ip_vs_kill_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - spin_lock_bh(&est_lock); - list_del(&est->list); - spin_unlock_bh(&est_lock); -} - -void ip_vs_zero_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - /* set counters zero, caller must hold the stats->lock lock */ - est->last_inbytes = 0; - est->last_outbytes = 0; - est->last_conns = 0; - est->last_inpkts = 0; - est->last_outpkts = 0; - est->cps = 0; - est->inpps = 0; - est->outpps = 0; - est->inbps = 0; - est->outbps = 0; -} - -int __init ip_vs_estimator_init(void) -{ - mod_timer(&est_timer, jiffies + 2 * HZ); - return 0; -} - -void ip_vs_estimator_cleanup(void) -{ - del_timer_sync(&est_timer); -} diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c deleted file mode 100644 index 2e7dbd8b73a..00000000000 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ /dev/null @@ -1,410 +0,0 @@ -/* - * ip_vs_ftp.c: IPVS ftp application module - * - * Authors: Wensong Zhang - * - * Changes: - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference - * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. - * - * IP_MASQ_FTP ftp masquerading module - * - * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 - * - * Author: Wouter Gadeyne - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - -#define SERVER_STRING "227 Entering Passive Mode (" -#define CLIENT_STRING "PORT " - - -/* - * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper - * First port is set to the default port. - */ -static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; -module_param_array(ports, ushort, NULL, 0); -MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); - - -/* Dummy variable */ -static int ip_vs_ftp_pasv; - - -static int -ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) -{ - return 0; -} - - -static int -ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) -{ - return 0; -} - - -/* - * Get from the string "xxx.xxx.xxx.xxx,ppp,ppp", started - * with the "pattern" and terminated with the "term" character. - * is in network order. - */ -static int ip_vs_ftp_get_addrport(char *data, char *data_limit, - const char *pattern, size_t plen, char term, - __be32 *addr, __be16 *port, - char **start, char **end) -{ - unsigned char p[6]; - int i = 0; - - if (data_limit - data < plen) { - /* check if there is partial match */ - if (strnicmp(data, pattern, data_limit - data) == 0) - return -1; - else - return 0; - } - - if (strnicmp(data, pattern, plen) != 0) { - return 0; - } - *start = data + plen; - - for (data = *start; *data != term; data++) { - if (data == data_limit) - return -1; - } - *end = data; - - memset(p, 0, sizeof(p)); - for (data = *start; data != *end; data++) { - if (*data >= '0' && *data <= '9') { - p[i] = p[i]*10 + *data - '0'; - } else if (*data == ',' && i < 5) { - i++; - } else { - /* unexpected character */ - return -1; - } - } - - if (i != 5) - return -1; - - *addr = get_unaligned((__be32 *)p); - *port = get_unaligned((__be16 *)(p + 4)); - return 1; -} - - -/* - * Look at outgoing ftp packets to catch the response to a PASV command - * from the server (inside-to-outside). - * When we see one, we build a connection entry with the client address, - * client port 0 (unknown at the moment), the server address and the - * server port. Mark the current connection entry as a control channel - * of the new entry. All this work is just to make the data connection - * can be scheduled to the right server later. - * - * The outgoing packet should be something like - * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". - * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. - */ -static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) -{ - struct iphdr *iph; - struct tcphdr *th; - char *data, *data_limit; - char *start, *end; - union nf_inet_addr from; - __be16 port; - struct ip_vs_conn *n_cp; - char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ - unsigned buf_len; - int ret; - -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - - *diff = 0; - - /* Only useful for established sessions */ - if (cp->state != IP_VS_TCP_S_ESTABLISHED) - return 1; - - /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) - return 0; - - if (cp->app_data == &ip_vs_ftp_pasv) { - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - data = (char *)th + (th->doff << 2); - data_limit = skb_tail_pointer(skb); - - if (ip_vs_ftp_get_addrport(data, data_limit, - SERVER_STRING, - sizeof(SERVER_STRING)-1, ')', - &from.ip, &port, - &start, &end) != 1) - return 1; - - IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " - "%u.%u.%u.%u:%d detected\n", - NIPQUAD(from.ip), ntohs(port), - NIPQUAD(cp->caddr.ip), 0); - - /* - * Now update or create an connection entry for it - */ - n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, - &cp->caddr, 0); - if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &cp->caddr, 0, - &cp->vaddr, port, - &from, port, - IP_VS_CONN_F_NO_CPORT, - cp->dest); - if (!n_cp) - return 0; - - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } - - /* - * Replace the old passive address with the new one - */ - from.ip = n_cp->vaddr.ip; - port = n_cp->vport; - sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip), - (ntohs(port)>>8)&255, ntohs(port)&255); - buf_len = strlen(buf); - - /* - * Calculate required delta-offset to keep TCP happy - */ - *diff = buf_len - (end-start); - - if (*diff == 0) { - /* simply replace it with new passive address */ - memcpy(start, buf, buf_len); - ret = 1; - } else { - ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, - end-start, buf, buf_len); - } - - cp->app_data = NULL; - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - return ret; - } - return 1; -} - - -/* - * Look at incoming ftp packets to catch the PASV/PORT command - * (outside-to-inside). - * - * The incoming packet having the PORT command should be something like - * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". - * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. - * In this case, we create a connection entry using the client address and - * port, so that the active ftp data connection from the server can reach - * the client. - */ -static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) -{ - struct iphdr *iph; - struct tcphdr *th; - char *data, *data_start, *data_limit; - char *start, *end; - union nf_inet_addr to; - __be16 port; - struct ip_vs_conn *n_cp; - -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - - /* no diff required for incoming packets */ - *diff = 0; - - /* Only useful for established sessions */ - if (cp->state != IP_VS_TCP_S_ESTABLISHED) - return 1; - - /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) - return 0; - - /* - * Detecting whether it is passive - */ - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - - /* Since there may be OPTIONS in the TCP packet and the HLEN is - the length of the header in 32-bit multiples, it is accurate - to calculate data address by th+HLEN*4 */ - data = data_start = (char *)th + (th->doff << 2); - data_limit = skb_tail_pointer(skb); - - while (data <= data_limit - 6) { - if (strnicmp(data, "PASV\r\n", 6) == 0) { - /* Passive mode on */ - IP_VS_DBG(7, "got PASV at %td of %td\n", - data - data_start, - data_limit - data_start); - cp->app_data = &ip_vs_ftp_pasv; - return 1; - } - data++; - } - - /* - * To support virtual FTP server, the scenerio is as follows: - * FTP client ----> Load Balancer ----> FTP server - * First detect the port number in the application data, - * then create a new connection entry for the coming data - * connection. - */ - if (ip_vs_ftp_get_addrport(data_start, data_limit, - CLIENT_STRING, sizeof(CLIENT_STRING)-1, - '\r', &to.ip, &port, - &start, &end) != 1) - return 1; - - IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", - NIPQUAD(to.ip), ntohs(port)); - - /* Passive mode off */ - cp->app_data = NULL; - - /* - * Now update or create a connection entry for it - */ - IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", - ip_vs_proto_name(iph->protocol), - NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); - - n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, - &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1)); - if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1), - &cp->daddr, htons(ntohs(cp->dport)-1), - 0, - cp->dest); - if (!n_cp) - return 0; - - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } - - /* - * Move tunnel to listen state - */ - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - - return 1; -} - - -static struct ip_vs_app ip_vs_ftp = { - .name = "ftp", - .type = IP_VS_APP_TYPE_FTP, - .protocol = IPPROTO_TCP, - .module = THIS_MODULE, - .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), - .init_conn = ip_vs_ftp_init_conn, - .done_conn = ip_vs_ftp_done_conn, - .bind_conn = NULL, - .unbind_conn = NULL, - .pkt_out = ip_vs_ftp_out, - .pkt_in = ip_vs_ftp_in, -}; - - -/* - * ip_vs_ftp initialization - */ -static int __init ip_vs_ftp_init(void) -{ - int i, ret; - struct ip_vs_app *app = &ip_vs_ftp; - - ret = register_ip_vs_app(app); - if (ret) - return ret; - - for (i=0; iprotocol, ports[i]); - if (ret) - break; - IP_VS_INFO("%s: loaded support on port[%d] = %d\n", - app->name, i, ports[i]); - } - - if (ret) - unregister_ip_vs_app(app); - - return ret; -} - - -/* - * ip_vs_ftp finish. - */ -static void __exit ip_vs_ftp_exit(void) -{ - unregister_ip_vs_app(&ip_vs_ftp); -} - - -module_init(ip_vs_ftp_init); -module_exit(ip_vs_ftp_exit); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c deleted file mode 100644 index 6ecef3518ca..00000000000 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ /dev/null @@ -1,555 +0,0 @@ -/* - * IPVS: Locality-Based Least-Connection scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Martin Hamilton : fixed the terrible locking bugs - * *lock(tbl->lock) ==> *lock(&tbl->lock) - * Wensong Zhang : fixed the uninitilized tbl->lock bug - * Wensong Zhang : added doing full expiration check to - * collect stale entries of 24+ hours when - * no partial expire check in a half hour - * Julian Anastasov : replaced del_timer call with del_timer_sync - * to avoid the possible race between timer - * handler and del_timer thread in SMP - * - */ - -/* - * The lblc algorithm is as follows (pseudo code): - * - * if cachenode[dest_ip] is null then - * n, cachenode[dest_ip] <- {weighted least-conn node}; - * else - * n <- cachenode[dest_ip]; - * if (n is dead) OR - * (n.conns>n.weight AND - * there is a node m with m.conns -#include -#include -#include -#include - -/* for sysctl */ -#include -#include - -#include - - -/* - * It is for garbage collection of stale IPVS lblc entries, - * when the table is full. - */ -#define CHECK_EXPIRE_INTERVAL (60*HZ) -#define ENTRY_TIMEOUT (6*60*HZ) - -/* - * It is for full expiration check. - * When there is no partial expiration check (garbage collection) - * in a half hour, do a full expiration check to collect stale - * entries that haven't been touched for a day. - */ -#define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; - - -/* - * for IPVS lblc entry hash table - */ -#ifndef CONFIG_IP_VS_LBLC_TAB_BITS -#define CONFIG_IP_VS_LBLC_TAB_BITS 10 -#endif -#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS -#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) -#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) - - -/* - * IPVS lblc entry represents an association between destination - * IP address and its destination server - */ -struct ip_vs_lblc_entry { - struct list_head list; - __be32 addr; /* destination IP address */ - struct ip_vs_dest *dest; /* real server (cache) */ - unsigned long lastuse; /* last used time */ -}; - - -/* - * IPVS lblc hash table - */ -struct ip_vs_lblc_table { - struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ - atomic_t entries; /* number of entries */ - int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ - int rover; /* rover for expire check */ - int counter; /* counter for no expire */ -}; - - -/* - * IPVS LBLC sysctl table - */ - -static ctl_table vs_vars_table[] = { - { - .procname = "lblc_expiration", - .data = &sysctl_ip_vs_lblc_expiration, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header * sysctl_header; - -static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) -{ - list_del(&en->list); - /* - * We don't kfree dest because it is refered either by its service - * or the trash dest list. - */ - atomic_dec(&en->dest->refcnt); - kfree(en); -} - - -/* - * Returns hash value for IPVS LBLC entry - */ -static inline unsigned ip_vs_lblc_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; -} - - -/* - * Hash an entry in the ip_vs_lblc_table. - * returns bool success. - */ -static void -ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) -{ - unsigned hash = ip_vs_lblc_hashkey(en->addr); - - list_add(&en->list, &tbl->bucket[hash]); - atomic_inc(&tbl->entries); -} - - -/* - * Get ip_vs_lblc_entry associated with supplied parameters. Called under read - * lock - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) -{ - unsigned hash = ip_vs_lblc_hashkey(addr); - struct ip_vs_lblc_entry *en; - - list_for_each_entry(en, &tbl->bucket[hash], list) - if (en->addr == addr) - return en; - - return NULL; -} - - -/* - * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP - * address to a server. Called under write lock. - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr, - struct ip_vs_dest *dest) -{ - struct ip_vs_lblc_entry *en; - - en = ip_vs_lblc_get(tbl, daddr); - if (!en) { - en = kmalloc(sizeof(*en), GFP_ATOMIC); - if (!en) { - IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); - return NULL; - } - - en->addr = daddr; - en->lastuse = jiffies; - - atomic_inc(&dest->refcnt); - en->dest = dest; - - ip_vs_lblc_hash(tbl, en); - } else if (en->dest != dest) { - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; - } - - return en; -} - - -/* - * Flush all the entries of the specified table. - */ -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) -{ - struct ip_vs_lblc_entry *en, *nxt; - int i; - - for (i=0; ibucket[i], list) { - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - } - } -} - - -static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) -{ - struct ip_vs_lblc_table *tbl = svc->sched_data; - struct ip_vs_lblc_entry *en, *nxt; - unsigned long now = jiffies; - int i, j; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, - en->lastuse + sysctl_ip_vs_lblc_expiration)) - continue; - - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&svc->sched_lock); - } - tbl->rover = j; -} - - -/* - * Periodical timer handler for IPVS lblc table - * It is used to collect stale entries when the number of entries - * exceeds the maximum size of the table. - * - * Fixme: we probably need more complicated algorithm to collect - * entries that have not been used for a long time even - * if the number of entries doesn't exceed the maximum size - * of the table. - * The full expiration check is for this purpose now. - */ -static void ip_vs_lblc_check_expire(unsigned long data) -{ - struct ip_vs_service *svc = (struct ip_vs_service *) data; - struct ip_vs_lblc_table *tbl = svc->sched_data; - unsigned long now = jiffies; - int goal; - int i, j; - struct ip_vs_lblc_entry *en, *nxt; - - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { - /* do full expiration check */ - ip_vs_lblc_full_check(svc); - tbl->counter = 1; - goto out; - } - - if (atomic_read(&tbl->entries) <= tbl->max_size) { - tbl->counter++; - goto out; - } - - goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; - if (goal > tbl->max_size/2) - goal = tbl->max_size/2; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) - continue; - - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - goal--; - } - write_unlock(&svc->sched_lock); - if (goal <= 0) - break; - } - tbl->rover = j; - - out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); -} - - -static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) -{ - int i; - struct ip_vs_lblc_table *tbl; - - /* - * Allocate the ip_vs_lblc_table for this service - */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " - "current service\n", sizeof(*tbl)); - - /* - * Initialize the hash buckets - */ - for (i=0; ibucket[i]); - } - tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; - tbl->rover = 0; - tbl->counter = 1; - - /* - * Hook periodic timer for garbage collection - */ - setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, - (unsigned long)svc); - mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); - - return 0; -} - - -static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_lblc_table *tbl = svc->sched_data; - - /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); - - /* got to clean up table entries here */ - ip_vs_lblc_flush(tbl); - - /* release the table itself */ - kfree(tbl); - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", - sizeof(*tbl)); - - return 0; -} - - -static inline struct ip_vs_dest * -__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) -{ - struct ip_vs_dest *dest, *least; - int loh, doh; - - /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connection. - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - if (atomic_read(&dest->weight) > 0) { - least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -/* - * If this destination server is overloaded and there is a less loaded - * server, then return true. - */ -static inline int -is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { - struct ip_vs_dest *d; - - list_for_each_entry(d, &svc->destinations, n_list) { - if (atomic_read(&d->activeconns)*2 - < atomic_read(&d->weight)) { - return 1; - } - } - } - return 0; -} - - -/* - * Locality-Based (weighted) Least-Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_lblc_table *tbl = svc->sched_data; - struct iphdr *iph = ip_hdr(skb); - struct ip_vs_dest *dest = NULL; - struct ip_vs_lblc_entry *en; - - IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); - - /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblc_get(tbl, iph->daddr); - if (en) { - /* We only hold a read lock, but this is atomic */ - en->lastuse = jiffies; - - /* - * If the destination is not available, i.e. it's in the trash, - * we must ignore it, as it may be removed from under our feet, - * if someone drops our reference count. Our caller only makes - * sure that destinations, that are not in the trash, are not - * moved to the trash, while we are scheduling. But anyone can - * free up entries from the trash at any time. - */ - - if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) - dest = en->dest; - } - read_unlock(&svc->sched_lock); - - /* If the destination has a weight and is not overloaded, use it */ - if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) - goto out; - - /* No cache entry or it is invalid, time to schedule */ - dest = __ip_vs_lblc_schedule(svc, iph); - if (!dest) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - - /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblc_new(tbl, iph->daddr, dest); - write_unlock(&svc->sched_lock); - -out: - IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->daddr), - NIPQUAD(dest->addr.ip), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS LBLC Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ - .name = "lblc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 0, -#endif - .init_service = ip_vs_lblc_init_svc, - .done_service = ip_vs_lblc_done_svc, - .schedule = ip_vs_lblc_schedule, -}; - - -static int __init ip_vs_lblc_init(void) -{ - int ret; - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); - ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); - if (ret) - unregister_sysctl_table(sysctl_header); - return ret; -} - - -static void __exit ip_vs_lblc_cleanup(void) -{ - unregister_sysctl_table(sysctl_header); - unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); -} - - -module_init(ip_vs_lblc_init); -module_exit(ip_vs_lblc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c deleted file mode 100644 index 1f75ea83bcf..00000000000 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ /dev/null @@ -1,755 +0,0 @@ -/* - * IPVS: Locality-Based Least-Connection with Replication scheduler - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Julian Anastasov : Added the missing (dest->weight>0) - * condition in the ip_vs_dest_set_max. - * - */ - -/* - * The lblc/r algorithm is as follows (pseudo code): - * - * if serverSet[dest_ip] is null then - * n, serverSet[dest_ip] <- {weighted least-conn node}; - * else - * n <- {least-conn (alive) node in serverSet[dest_ip]}; - * if (n is null) OR - * (n.conns>n.weight AND - * there is a node m with m.conns 1 AND - * now - serverSet[dest_ip].lastMod > T then - * m <- {most conn node in serverSet[dest_ip]}; - * remove m from serverSet[dest_ip]; - * if serverSet[dest_ip] changed then - * serverSet[dest_ip].lastMod <- now; - * - * return n; - * - */ - -#include -#include -#include -#include -#include - -/* for sysctl */ -#include -#include -#include - -#include - - -/* - * It is for garbage collection of stale IPVS lblcr entries, - * when the table is full. - */ -#define CHECK_EXPIRE_INTERVAL (60*HZ) -#define ENTRY_TIMEOUT (6*60*HZ) - -/* - * It is for full expiration check. - * When there is no partial expiration check (garbage collection) - * in a half hour, do a full expiration check to collect stale - * entries that haven't been touched for a day. - */ -#define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; - - -/* - * for IPVS lblcr entry hash table - */ -#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS -#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 -#endif -#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS -#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) -#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) - - -/* - * IPVS destination set structure and operations - */ -struct ip_vs_dest_list { - struct ip_vs_dest_list *next; /* list link */ - struct ip_vs_dest *dest; /* destination server */ -}; - -struct ip_vs_dest_set { - atomic_t size; /* set size */ - unsigned long lastmod; /* last modified time */ - struct ip_vs_dest_list *list; /* destination list */ - rwlock_t lock; /* lock for this list */ -}; - - -static struct ip_vs_dest_list * -ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) -{ - struct ip_vs_dest_list *e; - - for (e=set->list; e!=NULL; e=e->next) { - if (e->dest == dest) - /* already existed */ - return NULL; - } - - e = kmalloc(sizeof(*e), GFP_ATOMIC); - if (e == NULL) { - IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); - return NULL; - } - - atomic_inc(&dest->refcnt); - e->dest = dest; - - /* link it to the list */ - e->next = set->list; - set->list = e; - atomic_inc(&set->size); - - set->lastmod = jiffies; - return e; -} - -static void -ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) -{ - struct ip_vs_dest_list *e, **ep; - - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { - if (e->dest == dest) { - /* HIT */ - *ep = e->next; - atomic_dec(&set->size); - set->lastmod = jiffies; - atomic_dec(&e->dest->refcnt); - kfree(e); - break; - } - ep = &e->next; - } -} - -static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) -{ - struct ip_vs_dest_list *e, **ep; - - write_lock(&set->lock); - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { - *ep = e->next; - /* - * We don't kfree dest because it is refered either - * by its service or by the trash dest list. - */ - atomic_dec(&e->dest->refcnt); - kfree(e); - } - write_unlock(&set->lock); -} - -/* get weighted least-connection node in the destination set */ -static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) -{ - register struct ip_vs_dest_list *e; - struct ip_vs_dest *dest, *least; - int loh, doh; - - if (set == NULL) - return NULL; - - /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { - least = e->dest; - if (least->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - if ((atomic_read(&least->weight) > 0) - && (least->flags & IP_VS_DEST_F_AVAILABLE)) { - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* find the destination with the weighted least load */ - nextstage: - for (e=e->next; e!=NULL; e=e->next) { - dest = e->dest; - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if ((loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) - && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - return least; -} - - -/* get weighted most-connection node in the destination set */ -static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) -{ - register struct ip_vs_dest_list *e; - struct ip_vs_dest *dest, *most; - int moh, doh; - - if (set == NULL) - return NULL; - - /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { - most = e->dest; - if (atomic_read(&most->weight) > 0) { - moh = atomic_read(&most->activeconns) * 50 - + atomic_read(&most->inactconns); - goto nextstage; - } - } - return NULL; - - /* find the destination with the weighted most load */ - nextstage: - for (e=e->next; e!=NULL; e=e->next) { - dest = e->dest; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ - if ((moh * atomic_read(&dest->weight) < - doh * atomic_read(&most->weight)) - && (atomic_read(&dest->weight) > 0)) { - most = dest; - moh = doh; - } - } - - IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(most->addr.ip), ntohs(most->port), - atomic_read(&most->activeconns), - atomic_read(&most->refcnt), - atomic_read(&most->weight), moh); - return most; -} - - -/* - * IPVS lblcr entry represents an association between destination - * IP address and its destination server set - */ -struct ip_vs_lblcr_entry { - struct list_head list; - __be32 addr; /* destination IP address */ - struct ip_vs_dest_set set; /* destination server set */ - unsigned long lastuse; /* last used time */ -}; - - -/* - * IPVS lblcr hash table - */ -struct ip_vs_lblcr_table { - struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ - atomic_t entries; /* number of entries */ - int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ - int rover; /* rover for expire check */ - int counter; /* counter for no expire */ -}; - - -/* - * IPVS LBLCR sysctl table - */ - -static ctl_table vs_vars_table[] = { - { - .procname = "lblcr_expiration", - .data = &sysctl_ip_vs_lblcr_expiration, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header * sysctl_header; - -static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) -{ - list_del(&en->list); - ip_vs_dest_set_eraseall(&en->set); - kfree(en); -} - - -/* - * Returns hash value for IPVS LBLCR entry - */ -static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; -} - - -/* - * Hash an entry in the ip_vs_lblcr_table. - * returns bool success. - */ -static void -ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) -{ - unsigned hash = ip_vs_lblcr_hashkey(en->addr); - - list_add(&en->list, &tbl->bucket[hash]); - atomic_inc(&tbl->entries); -} - - -/* - * Get ip_vs_lblcr_entry associated with supplied parameters. Called under - * read lock. - */ -static inline struct ip_vs_lblcr_entry * -ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) -{ - unsigned hash = ip_vs_lblcr_hashkey(addr); - struct ip_vs_lblcr_entry *en; - - list_for_each_entry(en, &tbl->bucket[hash], list) - if (en->addr == addr) - return en; - - return NULL; -} - - -/* - * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. Called under write lock. - */ -static inline struct ip_vs_lblcr_entry * -ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr, - struct ip_vs_dest *dest) -{ - struct ip_vs_lblcr_entry *en; - - en = ip_vs_lblcr_get(tbl, daddr); - if (!en) { - en = kmalloc(sizeof(*en), GFP_ATOMIC); - if (!en) { - IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); - return NULL; - } - - en->addr = daddr; - en->lastuse = jiffies; - - /* initilize its dest set */ - atomic_set(&(en->set.size), 0); - en->set.list = NULL; - rwlock_init(&en->set.lock); - - ip_vs_lblcr_hash(tbl, en); - } - - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); - - return en; -} - - -/* - * Flush all the entries of the specified table. - */ -static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) -{ - int i; - struct ip_vs_lblcr_entry *en, *nxt; - - /* No locking required, only called during cleanup. */ - for (i=0; ibucket[i], list) { - ip_vs_lblcr_free(en); - } - } -} - - -static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) -{ - struct ip_vs_lblcr_table *tbl = svc->sched_data; - unsigned long now = jiffies; - int i, j; - struct ip_vs_lblcr_entry *en, *nxt; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, - now)) - continue; - - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&svc->sched_lock); - } - tbl->rover = j; -} - - -/* - * Periodical timer handler for IPVS lblcr table - * It is used to collect stale entries when the number of entries - * exceeds the maximum size of the table. - * - * Fixme: we probably need more complicated algorithm to collect - * entries that have not been used for a long time even - * if the number of entries doesn't exceed the maximum size - * of the table. - * The full expiration check is for this purpose now. - */ -static void ip_vs_lblcr_check_expire(unsigned long data) -{ - struct ip_vs_service *svc = (struct ip_vs_service *) data; - struct ip_vs_lblcr_table *tbl = svc->sched_data; - unsigned long now = jiffies; - int goal; - int i, j; - struct ip_vs_lblcr_entry *en, *nxt; - - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { - /* do full expiration check */ - ip_vs_lblcr_full_check(svc); - tbl->counter = 1; - goto out; - } - - if (atomic_read(&tbl->entries) <= tbl->max_size) { - tbl->counter++; - goto out; - } - - goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; - if (goal > tbl->max_size/2) - goal = tbl->max_size/2; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) - continue; - - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - goal--; - } - write_unlock(&svc->sched_lock); - if (goal <= 0) - break; - } - tbl->rover = j; - - out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); -} - -static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) -{ - int i; - struct ip_vs_lblcr_table *tbl; - - /* - * Allocate the ip_vs_lblcr_table for this service - */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " - "current service\n", sizeof(*tbl)); - - /* - * Initialize the hash buckets - */ - for (i=0; ibucket[i]); - } - tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; - tbl->rover = 0; - tbl->counter = 1; - - /* - * Hook periodic timer for garbage collection - */ - setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, - (unsigned long)svc); - mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); - - return 0; -} - - -static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_lblcr_table *tbl = svc->sched_data; - - /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); - - /* got to clean up table entries here */ - ip_vs_lblcr_flush(tbl); - - /* release the table itself */ - kfree(tbl); - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", - sizeof(*tbl)); - - return 0; -} - - -static inline struct ip_vs_dest * -__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) -{ - struct ip_vs_dest *dest, *least; - int loh, doh; - - /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connection. - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - if (atomic_read(&dest->weight) > 0) { - least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -/* - * If this destination server is overloaded and there is a less loaded - * server, then return true. - */ -static inline int -is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { - struct ip_vs_dest *d; - - list_for_each_entry(d, &svc->destinations, n_list) { - if (atomic_read(&d->activeconns)*2 - < atomic_read(&d->weight)) { - return 1; - } - } - } - return 0; -} - - -/* - * Locality-Based (weighted) Least-Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_lblcr_table *tbl = svc->sched_data; - struct iphdr *iph = ip_hdr(skb); - struct ip_vs_dest *dest = NULL; - struct ip_vs_lblcr_entry *en; - - IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); - - /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblcr_get(tbl, iph->daddr); - if (en) { - /* We only hold a read lock, but this is atomic */ - en->lastuse = jiffies; - - /* Get the least loaded destination */ - read_lock(&en->set.lock); - dest = ip_vs_dest_set_min(&en->set); - read_unlock(&en->set.lock); - - /* More than one destination + enough time passed by, cleanup */ - if (atomic_read(&en->set.size) > 1 && - time_after(jiffies, en->set.lastmod + - sysctl_ip_vs_lblcr_expiration)) { - struct ip_vs_dest *m; - - write_lock(&en->set.lock); - m = ip_vs_dest_set_max(&en->set); - if (m) - ip_vs_dest_set_erase(&en->set, m); - write_unlock(&en->set.lock); - } - - /* If the destination is not overloaded, use it */ - if (dest && !is_overloaded(dest, svc)) { - read_unlock(&svc->sched_lock); - goto out; - } - - /* The cache entry is invalid, time to schedule */ - dest = __ip_vs_lblcr_schedule(svc, iph); - if (!dest) { - IP_VS_DBG(1, "no destination available\n"); - read_unlock(&svc->sched_lock); - return NULL; - } - - /* Update our cache entry */ - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); - } - read_unlock(&svc->sched_lock); - - if (dest) - goto out; - - /* No cache entry, time to schedule */ - dest = __ip_vs_lblcr_schedule(svc, iph); - if (!dest) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - - /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblcr_new(tbl, iph->daddr, dest); - write_unlock(&svc->sched_lock); - -out: - IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->daddr), - NIPQUAD(dest->addr.ip), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS LBLCR Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_lblcr_scheduler = -{ - .name = "lblcr", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 0, -#endif - .init_service = ip_vs_lblcr_init_svc, - .done_service = ip_vs_lblcr_done_svc, - .schedule = ip_vs_lblcr_schedule, -}; - - -static int __init ip_vs_lblcr_init(void) -{ - int ret; - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); - ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); - if (ret) - unregister_sysctl_table(sysctl_header); - return ret; -} - - -static void __exit ip_vs_lblcr_cleanup(void) -{ - unregister_sysctl_table(sysctl_header); - unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); -} - - -module_init(ip_vs_lblcr_init); -module_exit(ip_vs_lblcr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c deleted file mode 100644 index b69f808ac46..00000000000 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * IPVS: Least-Connection Scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : added the ip_vs_lc_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. (This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - -/* - * Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; - - IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); - - /* - * Simply select the server with the least number of - * (activeconns<<5) + inactconns - * Except whose weight is equal to zero. - * If the weight is equal to zero, it means that the server is - * quiesced, the existing connections to the server still get - * served, but no new connection is assigned to the server. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || - atomic_read(&dest->weight) == 0) - continue; - doh = ip_vs_lc_dest_overhead(dest); - if (!least || doh < loh) { - least = dest; - loh = doh; - } - } - - if (least) - IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->inactconns)); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_lc_scheduler = { - .name = "lc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_lc_schedule, -}; - - -static int __init ip_vs_lc_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; -} - -static void __exit ip_vs_lc_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); -} - -module_init(ip_vs_lc_init); -module_exit(ip_vs_lc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c deleted file mode 100644 index 9a2d8033f08..00000000000 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - * IPVS: Never Queue scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The NQ algorithm adopts a two-speed model. When there is an idle server - * available, the job will be sent to the idle server, instead of waiting - * for a fast one. When there is no idle server available, the job will be - * sent to the server that minimize its expected delay (The Shortest - * Expected Delay scheduling algorithm). - * - * See the following paper for more information: - * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing - * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, - * pages 986-994, 1988. - * - * Thanks must go to Marko Buuri for talking NQ to me. - * - * The difference between NQ and SED is that NQ can improve overall - * system utilization. - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We only use the active connection number in the cost - * calculation here. - */ - return atomic_read(&dest->activeconns) + 1; -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; - - IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (server expected overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - - if (dest->flags & IP_VS_DEST_F_OVERLOAD || - !atomic_read(&dest->weight)) - continue; - - doh = ip_vs_nq_dest_overhead(dest); - - /* return the server directly if it is idle */ - if (atomic_read(&dest->activeconns) == 0) { - least = dest; - loh = doh; - goto out; - } - - if (!least || - (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight))) { - least = dest; - loh = doh; - } - } - - if (!least) - return NULL; - - out: - IP_VS_DBG_BUF(6, "NQ: server %s:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_nq_scheduler = -{ - .name = "nq", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_nq_schedule, -}; - - -static int __init ip_vs_nq_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_nq_scheduler); -} - -static void __exit ip_vs_nq_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); -} - -module_init(ip_vs_nq_init); -module_exit(ip_vs_nq_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c deleted file mode 100644 index 0791f9e08fe..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ /dev/null @@ -1,288 +0,0 @@ -/* - * ip_vs_proto.c: transport protocol load balancing support for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - -/* - * IPVS protocols can only be registered/unregistered when the ipvs - * module is loaded/unloaded, so no lock is needed in accessing the - * ipvs protocol table. - */ - -#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ -#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) - -static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; - - -/* - * register an ipvs protocol - */ -static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) -{ - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); - - pp->next = ip_vs_proto_table[hash]; - ip_vs_proto_table[hash] = pp; - - if (pp->init != NULL) - pp->init(pp); - - return 0; -} - - -/* - * unregister an ipvs protocol - */ -static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) -{ - struct ip_vs_protocol **pp_p; - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); - - pp_p = &ip_vs_proto_table[hash]; - for (; *pp_p; pp_p = &(*pp_p)->next) { - if (*pp_p == pp) { - *pp_p = pp->next; - if (pp->exit != NULL) - pp->exit(pp); - return 0; - } - } - - return -ESRCH; -} - - -/* - * get ip_vs_protocol object by its proto. - */ -struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) -{ - struct ip_vs_protocol *pp; - unsigned hash = IP_VS_PROTO_HASH(proto); - - for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { - if (pp->protocol == proto) - return pp; - } - - return NULL; -} - - -/* - * Propagate event for state change to all protocols - */ -void ip_vs_protocol_timeout_change(int flags) -{ - struct ip_vs_protocol *pp; - int i; - - for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { - if (pp->timeout_change) - pp->timeout_change(pp, flags); - } - } -} - - -int * -ip_vs_create_timeout_table(int *table, int size) -{ - return kmemdup(table, size, GFP_ATOMIC); -} - - -/* - * Set timeout value for state specified by name - */ -int -ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) -{ - int i; - - if (!table || !name || !to) - return -EINVAL; - - for (i = 0; i < num; i++) { - if (strcmp(names[i], name)) - continue; - table[i] = to * HZ; - return 0; - } - return -ENOENT; -} - - -const char * ip_vs_state_name(__u16 proto, int state) -{ - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); - - if (pp == NULL || pp->state_name == NULL) - return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; - return pp->state_name(state); -} - - -static void -ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) -{ - char buf[128]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else if (ih->frag_off & htons(IP_OFFSET)) - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - else { - __be16 _ports[2], *pptr -; - pptr = skb_header_pointer(skb, offset + ih->ihl*4, - sizeof(_ports), _ports); - if (pptr == NULL) - sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, - NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - else - sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", - pp->name, - NIPQUAD(ih->saddr), - ntohs(pptr[0]), - NIPQUAD(ih->daddr), - ntohs(pptr[1])); - } - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - -#ifdef CONFIG_IP_VS_IPV6 -static void -ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) -{ - char buf[192]; - struct ipv6hdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else if (ih->nexthdr == IPPROTO_FRAGMENT) - sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag", - pp->name, NIP6(ih->saddr), - NIP6(ih->daddr)); - else { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), - sizeof(_ports), _ports); - if (pptr == NULL) - sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT, - pp->name, - NIP6(ih->saddr), - NIP6(ih->daddr)); - else - sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u", - pp->name, - NIP6(ih->saddr), - ntohs(pptr[0]), - NIP6(ih->daddr), - ntohs(pptr[1])); - } - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} -#endif - - -void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) - ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); - else -#endif - ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); -} - - -int __init ip_vs_protocol_init(void) -{ - char protocols[64]; -#define REGISTER_PROTOCOL(p) \ - do { \ - register_ip_vs_protocol(p); \ - strcat(protocols, ", "); \ - strcat(protocols, (p)->name); \ - } while (0) - - protocols[0] = '\0'; - protocols[2] = '\0'; -#ifdef CONFIG_IP_VS_PROTO_TCP - REGISTER_PROTOCOL(&ip_vs_protocol_tcp); -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - REGISTER_PROTOCOL(&ip_vs_protocol_udp); -#endif -#ifdef CONFIG_IP_VS_PROTO_AH - REGISTER_PROTOCOL(&ip_vs_protocol_ah); -#endif -#ifdef CONFIG_IP_VS_PROTO_ESP - REGISTER_PROTOCOL(&ip_vs_protocol_esp); -#endif - IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); - - return 0; -} - - -void ip_vs_protocol_cleanup(void) -{ - struct ip_vs_protocol *pp; - int i; - - /* unregister all the ipvs protocols */ - for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - while ((pp = ip_vs_proto_table[i]) != NULL) - unregister_ip_vs_protocol(pp); - } -} diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c deleted file mode 100644 index 80ab0c8e5b4..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS - * - * Authors: Julian Anastasov , February 2002 - * Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation; - * - */ - -#include -#include -#include -#include -#include -#include - -#include - - -/* TODO: - -struct isakmp_hdr { - __u8 icookie[8]; - __u8 rcookie[8]; - __u8 np; - __u8 version; - __u8 xchgtype; - __u8 flags; - __u32 msgid; - __u32 length; -}; - -*/ - -#define PORT_ISAKMP 500 - - -static struct ip_vs_conn * -ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - /* - * We are not sure if the packet is from our - * service, so our conn_schedule hook should return NF_ACCEPT - */ - IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " - "%s%s %s->%s\n", - inverse ? "ICMP+" : "", - pp->name, - IP_VS_DBG_ADDR(af, &iph->saddr), - IP_VS_DBG_ADDR(af, &iph->daddr)); - } - - return cp; -} - - -static struct ip_vs_conn * -ah_esp_conn_out_get(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " - "%s%s %s->%s\n", - inverse ? "ICMP+" : "", - pp->name, - IP_VS_DBG_ADDR(af, &iph->saddr), - IP_VS_DBG_ADDR(af, &iph->daddr)); - } - - return cp; -} - - -static int -ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - /* - * AH/ESP is only related traffic. Pass the packet to IP stack. - */ - *verdict = NF_ACCEPT; - return 0; -} - - -static void -ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - -#ifdef CONFIG_IP_VS_IPV6 -static void -ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct ipv6hdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT, - pp->name, NIP6(ih->saddr), - NIP6(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} -#endif - -static void -ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) - ah_esp_debug_packet_v6(pp, skb, offset, msg); - else -#endif - ah_esp_debug_packet_v4(pp, skb, offset, msg); -} - - -static void ah_esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void ah_esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -#ifdef CONFIG_IP_VS_PROTO_AH -struct ip_vs_protocol ip_vs_protocol_ah = { - .name = "AH", - .protocol = IPPROTO_AH, - .num_states = 1, - .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, - .conn_schedule = ah_esp_conn_schedule, - .conn_in_get = ah_esp_conn_in_get, - .conn_out_get = ah_esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ - .set_state_timeout = NULL, -}; -#endif - -#ifdef CONFIG_IP_VS_PROTO_ESP -struct ip_vs_protocol ip_vs_protocol_esp = { - .name = "ESP", - .protocol = IPPROTO_ESP, - .num_states = 1, - .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, - .conn_schedule = ah_esp_conn_schedule, - .conn_in_get = ah_esp_conn_in_get, - .conn_out_get = ah_esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ -}; -#endif diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c deleted file mode 100644 index dd4566ea2bf..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ /dev/null @@ -1,732 +0,0 @@ -/* - * ip_vs_proto_tcp.c: TCP load balancing support for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include /* for tcphdr */ -#include -#include /* for csum_tcpudp_magic */ -#include -#include -#include - -#include - - -static struct ip_vs_conn * -tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - -static struct ip_vs_conn * -tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - - -static int -tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - struct ip_vs_service *svc; - struct tcphdr _tcph, *th; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); - if (th == NULL) { - *verdict = NF_DROP; - return 0; - } - - if (th->syn && - (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, - th->dest))) { - if (ip_vs_todrop()) { - /* - * It seems that we are very loaded. - * We have to drop this packet :( - */ - ip_vs_service_put(svc); - *verdict = NF_DROP; - return 0; - } - - /* - * Let the virtual server select a real server for the - * incoming connection, and create a connection entry. - */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); - return 0; - } - ip_vs_service_put(svc); - } - return 1; -} - - -static inline void -tcp_fast_csum_update(int af, struct tcphdr *tcph, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldport, __be16 newport) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - tcph->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(tcph->check)))); - else -#endif - tcph->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(tcph->check)))); -} - - -static inline void -tcp_partial_csum_update(int af, struct tcphdr *tcph, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldlen, __be16 newlen) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - tcph->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); - else -#endif - tcph->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); -} - - -static int -tcp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct tcphdr *tcph; - unsigned int tcphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else -#endif - tcphoff = ip_hdrlen(skb); - oldlen = skb->len - tcphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) - return 0; - } - - tcph = (void *)skb_network_header(skb) + tcphoff; - tcph->source = cp->vport; - - /* Adjust TCP checksums */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - tcphoff)); - } else if (!cp->app) { - /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, - cp->dport, cp->vport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - tcph->check = 0; - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcph->check = csum_ipv6_magic(&cp->vaddr.in6, - &cp->caddr.in6, - skb->len - tcphoff, - cp->protocol, skb->csum); - else -#endif - tcph->check = csum_tcpudp_magic(cp->vaddr.ip, - cp->caddr.ip, - skb->len - tcphoff, - cp->protocol, - skb->csum); - - IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", - pp->name, tcph->check, - (char*)&(tcph->check) - (char*)tcph); - } - return 1; -} - - -static int -tcp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct tcphdr *tcph; - unsigned int tcphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else -#endif - tcphoff = ip_hdrlen(skb); - oldlen = skb->len - tcphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* - * Attempt ip_vs_app call. - * It will fix ip_vs_conn and iph ack_seq stuff - */ - if (!ip_vs_app_pkt_in(cp, skb)) - return 0; - } - - tcph = (void *)skb_network_header(skb) + tcphoff; - tcph->dest = cp->dport; - - /* - * Adjust TCP checksums - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - tcphoff)); - } else if (!cp->app) { - /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, - cp->vport, cp->dport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - tcph->check = 0; - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcph->check = csum_ipv6_magic(&cp->caddr.in6, - &cp->daddr.in6, - skb->len - tcphoff, - cp->protocol, skb->csum); - else -#endif - tcph->check = csum_tcpudp_magic(cp->caddr.ip, - cp->daddr.ip, - skb->len - tcphoff, - cp->protocol, - skb->csum); - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return 1; -} - - -static int -tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) -{ - unsigned int tcphoff; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else -#endif - tcphoff = ip_hdrlen(skb); - - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - case CHECKSUM_COMPLETE: -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, - skb->len - tcphoff, - ipv6_hdr(skb)->nexthdr, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - } else -#endif - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, - skb->len - tcphoff, - ip_hdr(skb)->protocol, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - break; - default: - /* No need to checksum. */ - break; - } - - return 1; -} - - -#define TCP_DIR_INPUT 0 -#define TCP_DIR_OUTPUT 4 -#define TCP_DIR_INPUT_ONLY 8 - -static const int tcp_state_off[IP_VS_DIR_LAST] = { - [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, - [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, - [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, -}; - -/* - * Timeout table[state] - */ -static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = 2*HZ, - [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, - [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, - [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, - [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, - [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, - [IP_VS_TCP_S_CLOSE] = 10*HZ, - [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, - [IP_VS_TCP_S_LAST_ACK] = 30*HZ, - [IP_VS_TCP_S_LISTEN] = 2*60*HZ, - [IP_VS_TCP_S_SYNACK] = 120*HZ, - [IP_VS_TCP_S_LAST] = 2*HZ, -}; - -static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = "NONE", - [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", - [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", - [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", - [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", - [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", - [IP_VS_TCP_S_CLOSE] = "CLOSE", - [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", - [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", - [IP_VS_TCP_S_LISTEN] = "LISTEN", - [IP_VS_TCP_S_SYNACK] = "SYNACK", - [IP_VS_TCP_S_LAST] = "BUG!", -}; - -#define sNO IP_VS_TCP_S_NONE -#define sES IP_VS_TCP_S_ESTABLISHED -#define sSS IP_VS_TCP_S_SYN_SENT -#define sSR IP_VS_TCP_S_SYN_RECV -#define sFW IP_VS_TCP_S_FIN_WAIT -#define sTW IP_VS_TCP_S_TIME_WAIT -#define sCL IP_VS_TCP_S_CLOSE -#define sCW IP_VS_TCP_S_CLOSE_WAIT -#define sLA IP_VS_TCP_S_LAST_ACK -#define sLI IP_VS_TCP_S_LISTEN -#define sSA IP_VS_TCP_S_SYNACK - -struct tcp_states_t { - int next_state[IP_VS_TCP_S_LAST]; -}; - -static const char * tcp_state_name(int state) -{ - if (state >= IP_VS_TCP_S_LAST) - return "ERR!"; - return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; -} - -static struct tcp_states_t tcp_states [] = { -/* INPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, -/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, - -/* OUTPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, -/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, -/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, -/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, - -/* INPUT-ONLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, -/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, -}; - -static struct tcp_states_t tcp_states_dos [] = { -/* INPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, -/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, -/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, - -/* OUTPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, -/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, -/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, -/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, - -/* INPUT-ONLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, -/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, -}; - -static struct tcp_states_t *tcp_state_table = tcp_states; - - -static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) -{ - int on = (flags & 1); /* secure_tcp */ - - /* - ** FIXME: change secure_tcp to independent sysctl var - ** or make it per-service or per-app because it is valid - ** for most if not for all of the applications. Something - ** like "capabilities" (flags) for each object. - */ - tcp_state_table = (on? tcp_states_dos : tcp_states); -} - -static int -tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, - tcp_state_name_table, sname, to); -} - -static inline int tcp_state_idx(struct tcphdr *th) -{ - if (th->rst) - return 3; - if (th->syn) - return 0; - if (th->fin) - return 1; - if (th->ack) - return 2; - return -1; -} - -static inline void -set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, - int direction, struct tcphdr *th) -{ - int state_idx; - int new_state = IP_VS_TCP_S_CLOSE; - int state_off = tcp_state_off[direction]; - - /* - * Update state offset to INPUT_ONLY if necessary - * or delete NO_OUTPUT flag if output packet detected - */ - if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { - if (state_off == TCP_DIR_OUTPUT) - cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; - else - state_off = TCP_DIR_INPUT_ONLY; - } - - if ((state_idx = tcp_state_idx(th)) < 0) { - IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); - goto tcp_state_out; - } - - new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; - - tcp_state_out: - if (new_state != cp->state) { - struct ip_vs_dest *dest = cp->dest; - - IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" - "%s:%d state: %s->%s conn->refcnt:%d\n", - pp->name, - ((state_off == TCP_DIR_OUTPUT) ? - "output " : "input "), - th->syn ? 'S' : '.', - th->fin ? 'F' : '.', - th->ack ? 'A' : '.', - th->rst ? 'R' : '.', - IP_VS_DBG_ADDR(cp->af, &cp->daddr), - ntohs(cp->dport), - IP_VS_DBG_ADDR(cp->af, &cp->caddr), - ntohs(cp->cport), - tcp_state_name(cp->state), - tcp_state_name(new_state), - atomic_read(&cp->refcnt)); - - if (dest) { - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } - } - - cp->timeout = pp->timeout_table[cp->state = new_state]; -} - - -/* - * Handle state transitions - */ -static int -tcp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - struct tcphdr _tcph, *th; - -#ifdef CONFIG_IP_VS_IPV6 - int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); -#else - int ihl = ip_hdrlen(skb); -#endif - - th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - - spin_lock(&cp->lock); - set_tcp_state(pp, cp, direction, th); - spin_unlock(&cp->lock); - - return 1; -} - - -/* - * Hash table for TCP application incarnations - */ -#define TCP_APP_TAB_BITS 4 -#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) -#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) - -static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(tcp_app_lock); - -static inline __u16 tcp_app_hashkey(__be16 port) -{ - return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) - & TCP_APP_TAB_MASK; -} - - -static int tcp_register_app(struct ip_vs_app *inc) -{ - struct ip_vs_app *i; - __u16 hash; - __be16 port = inc->port; - int ret = 0; - - hash = tcp_app_hashkey(port); - - spin_lock_bh(&tcp_app_lock); - list_for_each_entry(i, &tcp_apps[hash], p_list) { - if (i->port == port) { - ret = -EEXIST; - goto out; - } - } - list_add(&inc->p_list, &tcp_apps[hash]); - atomic_inc(&ip_vs_protocol_tcp.appcnt); - - out: - spin_unlock_bh(&tcp_app_lock); - return ret; -} - - -static void -tcp_unregister_app(struct ip_vs_app *inc) -{ - spin_lock_bh(&tcp_app_lock); - atomic_dec(&ip_vs_protocol_tcp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&tcp_app_lock); -} - - -static int -tcp_app_conn_bind(struct ip_vs_conn *cp) -{ - int hash; - struct ip_vs_app *inc; - int result = 0; - - /* Default binding: bind app only for NAT */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - return 0; - - /* Lookup application incarnations and bind the right one */ - hash = tcp_app_hashkey(cp->vport); - - spin_lock(&tcp_app_lock); - list_for_each_entry(inc, &tcp_apps[hash], p_list) { - if (inc->port == cp->vport) { - if (unlikely(!ip_vs_app_inc_get(inc))) - break; - spin_unlock(&tcp_app_lock); - - IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" - "%s:%u to app %s on port %u\n", - __func__, - IP_VS_DBG_ADDR(cp->af, &cp->caddr), - ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), - ntohs(cp->vport), - inc->name, ntohs(inc->port)); - - cp->app = inc; - if (inc->init_conn) - result = inc->init_conn(inc, cp); - goto out; - } - } - spin_unlock(&tcp_app_lock); - - out: - return result; -} - - -/* - * Set LISTEN timeout. (ip_vs_conn_put will setup timer) - */ -void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) -{ - spin_lock(&cp->lock); - cp->state = IP_VS_TCP_S_LISTEN; - cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; - spin_unlock(&cp->lock); -} - - -static void ip_vs_tcp_init(struct ip_vs_protocol *pp) -{ - IP_VS_INIT_HASH_TABLE(tcp_apps); - pp->timeout_table = tcp_timeouts; -} - - -static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) -{ -} - - -struct ip_vs_protocol ip_vs_protocol_tcp = { - .name = "TCP", - .protocol = IPPROTO_TCP, - .num_states = IP_VS_TCP_S_LAST, - .dont_defrag = 0, - .appcnt = ATOMIC_INIT(0), - .init = ip_vs_tcp_init, - .exit = ip_vs_tcp_exit, - .register_app = tcp_register_app, - .unregister_app = tcp_unregister_app, - .conn_schedule = tcp_conn_schedule, - .conn_in_get = tcp_conn_in_get, - .conn_out_get = tcp_conn_out_get, - .snat_handler = tcp_snat_handler, - .dnat_handler = tcp_dnat_handler, - .csum_check = tcp_csum_check, - .state_name = tcp_state_name, - .state_transition = tcp_state_transition, - .app_conn_bind = tcp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = tcp_timeout_change, - .set_state_timeout = tcp_set_state_timeout, -}; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c deleted file mode 100644 index 6eb6039d634..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ /dev/null @@ -1,533 +0,0 @@ -/* - * ip_vs_proto_udp.c: UDP load balancing support for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -static struct ip_vs_conn * -udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - -static struct ip_vs_conn * -udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - -static int -udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - struct ip_vs_service *svc; - struct udphdr _udph, *uh; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); - if (uh == NULL) { - *verdict = NF_DROP; - return 0; - } - - svc = ip_vs_service_get(af, skb->mark, iph.protocol, - &iph.daddr, uh->dest); - if (svc) { - if (ip_vs_todrop()) { - /* - * It seems that we are very loaded. - * We have to drop this packet :( - */ - ip_vs_service_put(svc); - *verdict = NF_DROP; - return 0; - } - - /* - * Let the virtual server select a real server for the - * incoming connection, and create a connection entry. - */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); - return 0; - } - ip_vs_service_put(svc); - } - return 1; -} - - -static inline void -udp_fast_csum_update(int af, struct udphdr *uhdr, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldport, __be16 newport) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - uhdr->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(uhdr->check)))); - else -#endif - uhdr->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(uhdr->check)))); - if (!uhdr->check) - uhdr->check = CSUM_MANGLED_0; -} - -static inline void -udp_partial_csum_update(int af, struct udphdr *uhdr, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldlen, __be16 newlen) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - uhdr->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); - else -#endif - uhdr->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); -} - - -static int -udp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct udphdr *udph; - unsigned int udphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else -#endif - udphoff = ip_hdrlen(skb); - oldlen = skb->len - udphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* - * Call application helper if needed - */ - if (!ip_vs_app_pkt_out(cp, skb)) - return 0; - } - - udph = (void *)skb_network_header(skb) + udphoff; - udph->source = cp->vport; - - /* - * Adjust UDP checksums - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { - /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, - cp->dport, cp->vport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - udph->check = 0; - skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udph->check = csum_ipv6_magic(&cp->vaddr.in6, - &cp->caddr.in6, - skb->len - udphoff, - cp->protocol, skb->csum); - else -#endif - udph->check = csum_tcpudp_magic(cp->vaddr.ip, - cp->caddr.ip, - skb->len - udphoff, - cp->protocol, - skb->csum); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", - pp->name, udph->check, - (char*)&(udph->check) - (char*)udph); - } - return 1; -} - - -static int -udp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct udphdr *udph; - unsigned int udphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else -#endif - udphoff = ip_hdrlen(skb); - oldlen = skb->len - udphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* - * Attempt ip_vs_app call. - * It will fix ip_vs_conn - */ - if (!ip_vs_app_pkt_in(cp, skb)) - return 0; - } - - udph = (void *)skb_network_header(skb) + udphoff; - udph->dest = cp->dport; - - /* - * Adjust UDP checksums - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { - /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, - cp->vport, cp->dport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - udph->check = 0; - skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udph->check = csum_ipv6_magic(&cp->caddr.in6, - &cp->daddr.in6, - skb->len - udphoff, - cp->protocol, skb->csum); - else -#endif - udph->check = csum_tcpudp_magic(cp->caddr.ip, - cp->daddr.ip, - skb->len - udphoff, - cp->protocol, - skb->csum); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return 1; -} - - -static int -udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) -{ - struct udphdr _udph, *uh; - unsigned int udphoff; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else -#endif - udphoff = ip_hdrlen(skb); - - uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); - if (uh == NULL) - return 0; - - if (uh->check != 0) { - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = skb_checksum(skb, udphoff, - skb->len - udphoff, 0); - case CHECKSUM_COMPLETE: -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, - skb->len - udphoff, - ipv6_hdr(skb)->nexthdr, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - } else -#endif - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, - skb->len - udphoff, - ip_hdr(skb)->protocol, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - break; - default: - /* No need to checksum. */ - break; - } - } - return 1; -} - - -/* - * Note: the caller guarantees that only one of register_app, - * unregister_app or app_conn_bind is called each time. - */ - -#define UDP_APP_TAB_BITS 4 -#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) -#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) - -static struct list_head udp_apps[UDP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(udp_app_lock); - -static inline __u16 udp_app_hashkey(__be16 port) -{ - return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) - & UDP_APP_TAB_MASK; -} - - -static int udp_register_app(struct ip_vs_app *inc) -{ - struct ip_vs_app *i; - __u16 hash; - __be16 port = inc->port; - int ret = 0; - - hash = udp_app_hashkey(port); - - - spin_lock_bh(&udp_app_lock); - list_for_each_entry(i, &udp_apps[hash], p_list) { - if (i->port == port) { - ret = -EEXIST; - goto out; - } - } - list_add(&inc->p_list, &udp_apps[hash]); - atomic_inc(&ip_vs_protocol_udp.appcnt); - - out: - spin_unlock_bh(&udp_app_lock); - return ret; -} - - -static void -udp_unregister_app(struct ip_vs_app *inc) -{ - spin_lock_bh(&udp_app_lock); - atomic_dec(&ip_vs_protocol_udp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&udp_app_lock); -} - - -static int udp_app_conn_bind(struct ip_vs_conn *cp) -{ - int hash; - struct ip_vs_app *inc; - int result = 0; - - /* Default binding: bind app only for NAT */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - return 0; - - /* Lookup application incarnations and bind the right one */ - hash = udp_app_hashkey(cp->vport); - - spin_lock(&udp_app_lock); - list_for_each_entry(inc, &udp_apps[hash], p_list) { - if (inc->port == cp->vport) { - if (unlikely(!ip_vs_app_inc_get(inc))) - break; - spin_unlock(&udp_app_lock); - - IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" - "%s:%u to app %s on port %u\n", - __func__, - IP_VS_DBG_ADDR(cp->af, &cp->caddr), - ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), - ntohs(cp->vport), - inc->name, ntohs(inc->port)); - - cp->app = inc; - if (inc->init_conn) - result = inc->init_conn(inc, cp); - goto out; - } - } - spin_unlock(&udp_app_lock); - - out: - return result; -} - - -static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { - [IP_VS_UDP_S_NORMAL] = 5*60*HZ, - [IP_VS_UDP_S_LAST] = 2*HZ, -}; - -static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { - [IP_VS_UDP_S_NORMAL] = "UDP", - [IP_VS_UDP_S_LAST] = "BUG!", -}; - - -static int -udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, - udp_state_name_table, sname, to); -} - -static const char * udp_state_name(int state) -{ - if (state >= IP_VS_UDP_S_LAST) - return "ERR!"; - return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; -} - -static int -udp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; - return 1; -} - -static void udp_init(struct ip_vs_protocol *pp) -{ - IP_VS_INIT_HASH_TABLE(udp_apps); - pp->timeout_table = udp_timeouts; -} - -static void udp_exit(struct ip_vs_protocol *pp) -{ -} - - -struct ip_vs_protocol ip_vs_protocol_udp = { - .name = "UDP", - .protocol = IPPROTO_UDP, - .num_states = IP_VS_UDP_S_LAST, - .dont_defrag = 0, - .init = udp_init, - .exit = udp_exit, - .conn_schedule = udp_conn_schedule, - .conn_in_get = udp_conn_in_get, - .conn_out_get = udp_conn_out_get, - .snat_handler = udp_snat_handler, - .dnat_handler = udp_dnat_handler, - .csum_check = udp_csum_check, - .state_transition = udp_state_transition, - .state_name = udp_state_name, - .register_app = udp_register_app, - .unregister_app = udp_unregister_app, - .app_conn_bind = udp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = NULL, - .set_state_timeout = udp_set_state_timeout, -}; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c deleted file mode 100644 index a22195f68ac..00000000000 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * IPVS: Round-Robin Scheduling module - * - * Authors: Wensong Zhang - * Peter Kese - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Fixes/Changes: - * Wensong Zhang : changed the ip_vs_rr_schedule to return dest - * Julian Anastasov : fixed the NULL pointer access bug in debugging - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_rr_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include -#include - -#include - - -static int ip_vs_rr_init_svc(struct ip_vs_service *svc) -{ - svc->sched_data = &svc->destinations; - return 0; -} - - -static int ip_vs_rr_update_svc(struct ip_vs_service *svc) -{ - svc->sched_data = &svc->destinations; - return 0; -} - - -/* - * Round-Robin Scheduling - */ -static struct ip_vs_dest * -ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct list_head *p, *q; - struct ip_vs_dest *dest; - - IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); - - write_lock(&svc->sched_lock); - p = (struct list_head *)svc->sched_data; - p = p->next; - q = p; - do { - /* skip list head */ - if (q == &svc->destinations) { - q = q->next; - continue; - } - - dest = list_entry(q, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) - /* HIT */ - goto out; - q = q->next; - } while (q != p); - write_unlock(&svc->sched_lock); - return NULL; - - out: - svc->sched_data = q; - write_unlock(&svc->sched_lock); - IP_VS_DBG_BUF(6, "RR: server %s:%u " - "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), atomic_read(&dest->weight)); - - return dest; -} - - -static struct ip_vs_scheduler ip_vs_rr_scheduler = { - .name = "rr", /* name */ - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .init_service = ip_vs_rr_init_svc, - .update_service = ip_vs_rr_update_svc, - .schedule = ip_vs_rr_schedule, -}; - -static int __init ip_vs_rr_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_rr_scheduler); -} - -static void __exit ip_vs_rr_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); -} - -module_init(ip_vs_rr_init); -module_exit(ip_vs_rr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c deleted file mode 100644 index a46ad9e3501..00000000000 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include - -#include - -/* - * IPVS scheduler list - */ -static LIST_HEAD(ip_vs_schedulers); - -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_sched_lock); - - -/* - * Bind a service with a scheduler - */ -int ip_vs_bind_scheduler(struct ip_vs_service *svc, - struct ip_vs_scheduler *scheduler) -{ - int ret; - - if (svc == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - if (scheduler == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); - return -EINVAL; - } - - svc->scheduler = scheduler; - - if (scheduler->init_service) { - ret = scheduler->init_service(svc); - if (ret) { - IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); - return ret; - } - } - - return 0; -} - - -/* - * Unbind a service with its scheduler - */ -int ip_vs_unbind_scheduler(struct ip_vs_service *svc) -{ - struct ip_vs_scheduler *sched; - - if (svc == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - - sched = svc->scheduler; - if (sched == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); - return -EINVAL; - } - - if (sched->done_service) { - if (sched->done_service(svc) != 0) { - IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); - return -EINVAL; - } - } - - svc->scheduler = NULL; - return 0; -} - - -/* - * Get scheduler in the scheduler list by name - */ -static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) -{ - struct ip_vs_scheduler *sched; - - IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", - sched_name); - - read_lock_bh(&__ip_vs_sched_lock); - - list_for_each_entry(sched, &ip_vs_schedulers, n_list) { - /* - * Test and get the modules atomically - */ - if (sched->module && !try_module_get(sched->module)) { - /* - * This scheduler is just deleted - */ - continue; - } - if (strcmp(sched_name, sched->name)==0) { - /* HIT */ - read_unlock_bh(&__ip_vs_sched_lock); - return sched; - } - if (sched->module) - module_put(sched->module); - } - - read_unlock_bh(&__ip_vs_sched_lock); - return NULL; -} - - -/* - * Lookup scheduler and try to load it if it doesn't exist - */ -struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) -{ - struct ip_vs_scheduler *sched; - - /* - * Search for the scheduler by sched_name - */ - sched = ip_vs_sched_getbyname(sched_name); - - /* - * If scheduler not found, load the module and search again - */ - if (sched == NULL) { - request_module("ip_vs_%s", sched_name); - sched = ip_vs_sched_getbyname(sched_name); - } - - return sched; -} - -void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) -{ - if (scheduler->module) - module_put(scheduler->module); -} - - -/* - * Register a scheduler in the scheduler list - */ -int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) -{ - struct ip_vs_scheduler *sched; - - if (!scheduler) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); - return -EINVAL; - } - - if (!scheduler->name) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); - return -EINVAL; - } - - /* increase the module use count */ - ip_vs_use_count_inc(); - - write_lock_bh(&__ip_vs_sched_lock); - - if (!list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); - ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already linked\n", scheduler->name); - return -EINVAL; - } - - /* - * Make sure that the scheduler with this name doesn't exist - * in the scheduler list. - */ - list_for_each_entry(sched, &ip_vs_schedulers, n_list) { - if (strcmp(scheduler->name, sched->name) == 0) { - write_unlock_bh(&__ip_vs_sched_lock); - ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already existed in the system\n", - scheduler->name); - return -EINVAL; - } - } - /* - * Add it into the d-linked scheduler list - */ - list_add(&scheduler->n_list, &ip_vs_schedulers); - write_unlock_bh(&__ip_vs_sched_lock); - - IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); - - return 0; -} - - -/* - * Unregister a scheduler from the scheduler list - */ -int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) -{ - if (!scheduler) { - IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); - return -EINVAL; - } - - write_lock_bh(&__ip_vs_sched_lock); - if (list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); - IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " - "is not in the list. failed\n", scheduler->name); - return -EINVAL; - } - - /* - * Remove it from the d-linked scheduler list - */ - list_del(&scheduler->n_list); - write_unlock_bh(&__ip_vs_sched_lock); - - /* decrease the module use count */ - ip_vs_use_count_dec(); - - IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); - - return 0; -} diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c deleted file mode 100644 index 7d2f22f04b8..00000000000 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * IPVS: Shortest Expected Delay scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The SED algorithm attempts to minimize each job's expected delay until - * completion. The expected delay that the job will experience is - * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of - * jobs on the ith server and Ui is the fixed service rate (weight) of - * the ith server. The SED algorithm adopts a greedy policy that each does - * what is in its own best interest, i.e. to join the queue which would - * minimize its expected delay of completion. - * - * See the following paper for more information: - * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing - * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, - * pages 986-994, 1988. - * - * Thanks must go to Marko Buuri for talking SED to me. - * - * The difference between SED and WLC is that SED includes the incoming - * job in the cost function (the increment of 1). SED may outperform - * WLC, while scheduling big jobs under larger heterogeneous systems - * (the server weight varies a lot). - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We only use the active connection number in the cost - * calculation here. - */ - return atomic_read(&dest->activeconns) + 1; -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least; - unsigned int loh, doh; - - IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (server expected overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) { - least = dest; - loh = ip_vs_sed_dest_overhead(least); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - doh = ip_vs_sed_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG_BUF(6, "SED: server %s:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_sed_scheduler = -{ - .name = "sed", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_sed_schedule, -}; - - -static int __init ip_vs_sed_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_sed_scheduler); -} - -static void __exit ip_vs_sed_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); -} - -module_init(ip_vs_sed_init); -module_exit(ip_vs_sed_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c deleted file mode 100644 index 1d96de27fef..00000000000 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ /dev/null @@ -1,258 +0,0 @@ -/* - * IPVS: Source Hashing scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The sh algorithm is to select server by the hash key of source IP - * address. The pseudo code is as follows: - * - * n <- servernode[src_ip]; - * if (n is dead) OR - * (n is overloaded) or (n.weight <= 0) then - * return NULL; - * - * return n; - * - * Notes that servernode is a 256-bucket hash table that maps the hash - * index derived from packet source IP address to the current server - * array. If the sh scheduler is used in cache cluster, it is good to - * combine it with cache_bypass feature. When the statically assigned - * server is dead or overloaded, the load balancer can bypass the cache - * server and send requests to the original server directly. - * - */ - -#include -#include -#include -#include - -#include - - -/* - * IPVS SH bucket - */ -struct ip_vs_sh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ -}; - -/* - * for IPVS SH entry hash table - */ -#ifndef CONFIG_IP_VS_SH_TAB_BITS -#define CONFIG_IP_VS_SH_TAB_BITS 8 -#endif -#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS -#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) -#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) - - -/* - * Returns hash value for IPVS SH entry - */ -static inline unsigned ip_vs_sh_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; -} - - -/* - * Get ip_vs_dest associated with supplied parameters. - */ -static inline struct ip_vs_dest * -ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr) -{ - return (tbl[ip_vs_sh_hashkey(addr)]).dest; -} - - -/* - * Assign all the hash buckets of the specified table with the service. - */ -static int -ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) -{ - int i; - struct ip_vs_sh_bucket *b; - struct list_head *p; - struct ip_vs_dest *dest; - - b = tbl; - p = &svc->destinations; - for (i=0; idest = NULL; - } else { - if (p == &svc->destinations) - p = p->next; - - dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; - - p = p->next; - } - b++; - } - return 0; -} - - -/* - * Flush all the hash buckets of the specified table. - */ -static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) -{ - int i; - struct ip_vs_sh_bucket *b; - - b = tbl; - for (i=0; idest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; - } - b++; - } -} - - -static int ip_vs_sh_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl; - - /* allocate the SH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); - - return 0; -} - - -static int ip_vs_sh_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - return 0; -} - - -static int ip_vs_sh_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); - - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); - - return 0; -} - - -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. - */ -static inline int is_overloaded(struct ip_vs_dest *dest) -{ - return dest->flags & IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Source Hashing scheduling - */ -static struct ip_vs_dest * -ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_sh_bucket *tbl; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_sh_bucket *)svc->sched_data; - dest = ip_vs_sh_get(tbl, iph->saddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { - return NULL; - } - - IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->saddr), - NIPQUAD(dest->addr.ip), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS SH Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_sh_scheduler = -{ - .name = "sh", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 0, -#endif - .init_service = ip_vs_sh_init_svc, - .done_service = ip_vs_sh_done_svc, - .update_service = ip_vs_sh_update_svc, - .schedule = ip_vs_sh_schedule, -}; - - -static int __init ip_vs_sh_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_sh_scheduler); -} - - -static void __exit ip_vs_sh_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); -} - - -module_init(ip_vs_sh_init); -module_exit(ip_vs_sh_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c deleted file mode 100644 index de5e7e118ee..00000000000 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ /dev/null @@ -1,942 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the NetFilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * - * ip_vs_sync: sync connection info from master load balancer to backups - * through multicast - * - * Changes: - * Alexandre Cassen : Added master & backup support at a time. - * Alexandre Cassen : Added SyncID support for incoming sync - * messages filtering. - * Justin Ossevoort : Fix endian problem on sync message size. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for ip_mc_join_group */ -#include -#include -#include -#include -#include - -#include -#include - -#include - -#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ -#define IP_VS_SYNC_PORT 8848 /* multicast port */ - - -/* - * IPVS sync connection entry - */ -struct ip_vs_sync_conn { - __u8 reserved; - - /* Protocol, addresses and port numbers */ - __u8 protocol; /* Which protocol (TCP/UDP) */ - __be16 cport; - __be16 vport; - __be16 dport; - __be32 caddr; /* client address */ - __be32 vaddr; /* virtual address */ - __be32 daddr; /* destination address */ - - /* Flags and state transition */ - __be16 flags; /* status flags */ - __be16 state; /* state info */ - - /* The sequence options start here */ -}; - -struct ip_vs_sync_conn_options { - struct ip_vs_seq in_seq; /* incoming seq. struct */ - struct ip_vs_seq out_seq; /* outgoing seq. struct */ -}; - -struct ip_vs_sync_thread_data { - struct socket *sock; - char *buf; -}; - -#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) -#define FULL_CONN_SIZE \ -(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) - - -/* - The master mulitcasts messages to the backup load balancers in the - following format. - - 0 1 2 3 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Count Conns | SyncID | Size | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (1) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | . | - | . | - | . | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (n) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -*/ - -#define SYNC_MESG_HEADER_LEN 4 -#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ - -struct ip_vs_sync_mesg { - __u8 nr_conns; - __u8 syncid; - __u16 size; - - /* ip_vs_sync_conn entries start here */ -}; - -/* the maximum length of sync (sending/receiving) message */ -static int sync_send_mesg_maxlen; -static int sync_recv_mesg_maxlen; - -struct ip_vs_sync_buff { - struct list_head list; - unsigned long firstuse; - - /* pointers for the message data */ - struct ip_vs_sync_mesg *mesg; - unsigned char *head; - unsigned char *end; -}; - - -/* the sync_buff list head and the lock */ -static LIST_HEAD(ip_vs_sync_queue); -static DEFINE_SPINLOCK(ip_vs_sync_lock); - -/* current sync_buff for accepting new conn entries */ -static struct ip_vs_sync_buff *curr_sb = NULL; -static DEFINE_SPINLOCK(curr_sb_lock); - -/* ipvs sync daemon state */ -volatile int ip_vs_sync_state = IP_VS_STATE_NONE; -volatile int ip_vs_master_syncid = 0; -volatile int ip_vs_backup_syncid = 0; - -/* multicast interface name */ -char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - -/* sync daemon tasks */ -static struct task_struct *sync_master_thread; -static struct task_struct *sync_backup_thread; - -/* multicast addr */ -static struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = __constant_htons(IP_VS_SYNC_PORT), - .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), -}; - - -static inline struct ip_vs_sync_buff *sb_dequeue(void) -{ - struct ip_vs_sync_buff *sb; - - spin_lock_bh(&ip_vs_sync_lock); - if (list_empty(&ip_vs_sync_queue)) { - sb = NULL; - } else { - sb = list_entry(ip_vs_sync_queue.next, - struct ip_vs_sync_buff, - list); - list_del(&sb->list); - } - spin_unlock_bh(&ip_vs_sync_lock); - - return sb; -} - -static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) -{ - struct ip_vs_sync_buff *sb; - - if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) - return NULL; - - if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { - kfree(sb); - return NULL; - } - sb->mesg->nr_conns = 0; - sb->mesg->syncid = ip_vs_master_syncid; - sb->mesg->size = 4; - sb->head = (unsigned char *)sb->mesg + 4; - sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; - sb->firstuse = jiffies; - return sb; -} - -static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) -{ - kfree(sb->mesg); - kfree(sb); -} - -static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) -{ - spin_lock(&ip_vs_sync_lock); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) - list_add_tail(&sb->list, &ip_vs_sync_queue); - else - ip_vs_sync_buff_release(sb); - spin_unlock(&ip_vs_sync_lock); -} - -/* - * Get the current sync buffer if it has been created for more - * than the specified time or the specified time is zero. - */ -static inline struct ip_vs_sync_buff * -get_curr_sync_buff(unsigned long time) -{ - struct ip_vs_sync_buff *sb; - - spin_lock_bh(&curr_sb_lock); - if (curr_sb && (time == 0 || - time_before(jiffies - curr_sb->firstuse, time))) { - sb = curr_sb; - curr_sb = NULL; - } else - sb = NULL; - spin_unlock_bh(&curr_sb_lock); - return sb; -} - - -/* - * Add an ip_vs_conn information into the current sync_buff. - * Called by ip_vs_in. - */ -void ip_vs_sync_conn(struct ip_vs_conn *cp) -{ - struct ip_vs_sync_mesg *m; - struct ip_vs_sync_conn *s; - int len; - - spin_lock(&curr_sb_lock); - if (!curr_sb) { - if (!(curr_sb=ip_vs_sync_buff_create())) { - spin_unlock(&curr_sb_lock); - IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); - return; - } - } - - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; - m = curr_sb->mesg; - s = (struct ip_vs_sync_conn *)curr_sb->head; - - /* copy members */ - s->protocol = cp->protocol; - s->cport = cp->cport; - s->vport = cp->vport; - s->dport = cp->dport; - s->caddr = cp->caddr.ip; - s->vaddr = cp->vaddr.ip; - s->daddr = cp->daddr.ip; - s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); - s->state = htons(cp->state); - if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { - struct ip_vs_sync_conn_options *opt = - (struct ip_vs_sync_conn_options *)&s[1]; - memcpy(opt, &cp->in_seq, sizeof(*opt)); - } - - m->nr_conns++; - m->size += len; - curr_sb->head += len; - - /* check if there is a space for next one */ - if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { - sb_queue_tail(curr_sb); - curr_sb = NULL; - } - spin_unlock(&curr_sb_lock); - - /* synchronize its controller if it has */ - if (cp->control) - ip_vs_sync_conn(cp->control); -} - - -/* - * Process received multicast message and create the corresponding - * ip_vs_conn entries. - */ -static void ip_vs_process_message(const char *buffer, const size_t buflen) -{ - struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; - struct ip_vs_sync_conn *s; - struct ip_vs_sync_conn_options *opt; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - struct ip_vs_dest *dest; - char *p; - int i; - - if (buflen < sizeof(struct ip_vs_sync_mesg)) { - IP_VS_ERR_RL("sync message header too short\n"); - return; - } - - /* Convert size back to host byte order */ - m->size = ntohs(m->size); - - if (buflen != m->size) { - IP_VS_ERR_RL("bogus sync message size\n"); - return; - } - - /* SyncID sanity check */ - if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { - IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", - m->syncid); - return; - } - - p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); - for (i=0; inr_conns; i++) { - unsigned flags, state; - - if (p + SIMPLE_CONN_SIZE > buffer+buflen) { - IP_VS_ERR_RL("bogus conn in sync message\n"); - return; - } - s = (struct ip_vs_sync_conn *) p; - flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; - flags &= ~IP_VS_CONN_F_HASHED; - if (flags & IP_VS_CONN_F_SEQ_MASK) { - opt = (struct ip_vs_sync_conn_options *)&s[1]; - p += FULL_CONN_SIZE; - if (p > buffer+buflen) { - IP_VS_ERR_RL("bogus conn options in sync message\n"); - return; - } - } else { - opt = NULL; - p += SIMPLE_CONN_SIZE; - } - - state = ntohs(s->state); - if (!(flags & IP_VS_CONN_F_TEMPLATE)) { - pp = ip_vs_proto_get(s->protocol); - if (!pp) { - IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", - s->protocol); - continue; - } - if (state >= pp->num_states) { - IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", - pp->name, state); - continue; - } - } else { - /* protocol in templates is not used for state/timeout */ - pp = NULL; - if (state > 0) { - IP_VS_DBG(2, "Invalid template state %u in sync msg\n", - state); - state = 0; - } - } - - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); - else - cp = ip_vs_ct_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); - if (!cp) { - /* - * Find the appropriate destination for the connection. - * If it is not found the connection will remain unbound - * but still handled. - */ - dest = ip_vs_find_dest(AF_INET, - (union nf_inet_addr *)&s->daddr, - s->dport, - (union nf_inet_addr *)&s->vaddr, - s->vport, - s->protocol); - /* Set the approprite ativity flag */ - if (s->protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } - cp = ip_vs_conn_new(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport, - (union nf_inet_addr *)&s->daddr, - s->dport, - flags, dest); - if (dest) - atomic_dec(&dest->refcnt); - if (!cp) { - IP_VS_ERR("ip_vs_conn_new failed\n"); - return; - } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } - - if (opt) - memcpy(&cp->in_seq, opt, sizeof(*opt)); - atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->state = state; - cp->old_state = cp->state; - /* - * We can not recover the right timeout for templates - * in all cases, we can not find the right fwmark - * virtual service. If needed, we can do it for - * non-fwmark persistent services. - */ - if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) - cp->timeout = pp->timeout_table[state]; - else - cp->timeout = (3*60*HZ); - ip_vs_conn_put(cp); - } -} - - -/* - * Setup loopback of outgoing multicasts on a sending socket - */ -static void set_mcast_loop(struct sock *sk, u_char loop) -{ - struct inet_sock *inet = inet_sk(sk); - - /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ - lock_sock(sk); - inet->mc_loop = loop ? 1 : 0; - release_sock(sk); -} - -/* - * Specify TTL for outgoing multicasts on a sending socket - */ -static void set_mcast_ttl(struct sock *sk, u_char ttl) -{ - struct inet_sock *inet = inet_sk(sk); - - /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ - lock_sock(sk); - inet->mc_ttl = ttl; - release_sock(sk); -} - -/* - * Specifiy default interface for outgoing multicasts - */ -static int set_mcast_if(struct sock *sk, char *ifname) -{ - struct net_device *dev; - struct inet_sock *inet = inet_sk(sk); - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - - if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) - return -EINVAL; - - lock_sock(sk); - inet->mc_index = dev->ifindex; - /* inet->mc_addr = 0; */ - release_sock(sk); - - return 0; -} - - -/* - * Set the maximum length of sync message according to the - * specified interface's MTU. - */ -static int set_sync_mesg_maxlen(int sync_state) -{ - struct net_device *dev; - int num; - - if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) - return -ENODEV; - - num = (dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr) - - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + - SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); - IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", sync_send_mesg_maxlen); - } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) - return -ENODEV; - - sync_recv_mesg_maxlen = dev->mtu - - sizeof(struct iphdr) - sizeof(struct udphdr); - IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", sync_recv_mesg_maxlen); - } - - return 0; -} - - -/* - * Join a multicast group. - * the group is specified by a class D multicast address 224.0.0.0/8 - * in the in_addr structure passed in as a parameter. - */ -static int -join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) -{ - struct ip_mreqn mreq; - struct net_device *dev; - int ret; - - memset(&mreq, 0, sizeof(mreq)); - memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) - return -EINVAL; - - mreq.imr_ifindex = dev->ifindex; - - lock_sock(sk); - ret = ip_mc_join_group(sk, &mreq); - release_sock(sk); - - return ret; -} - - -static int bind_mcastif_addr(struct socket *sock, char *ifname) -{ - struct net_device *dev; - __be32 addr; - struct sockaddr_in sin; - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - - addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); - if (!addr) - IP_VS_ERR("You probably need to specify IP address on " - "multicast interface.\n"); - - IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", - ifname, NIPQUAD(addr)); - - /* Now bind the socket with the address of multicast interface */ - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr; - sin.sin_port = 0; - - return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); -} - -/* - * Set up sending multicast socket over UDP - */ -static struct socket * make_send_sock(void) -{ - struct socket *sock; - int result; - - /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (result < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); - return ERR_PTR(result); - } - - result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error setting outbound mcast interface\n"); - goto error; - } - - set_mcast_loop(sock->sk, 0); - set_mcast_ttl(sock->sk, 1); - - result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error binding address of the mcast interface\n"); - goto error; - } - - result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr), 0); - if (result < 0) { - IP_VS_ERR("Error connecting to the multicast addr\n"); - goto error; - } - - return sock; - - error: - sock_release(sock); - return ERR_PTR(result); -} - - -/* - * Set up receiving multicast socket over UDP - */ -static struct socket * make_receive_sock(void) -{ - struct socket *sock; - int result; - - /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (result < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); - return ERR_PTR(result); - } - - /* it is equivalent to the REUSEADDR option in user-space */ - sock->sk->sk_reuse = 1; - - result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr)); - if (result < 0) { - IP_VS_ERR("Error binding to the multicast addr\n"); - goto error; - } - - /* join the multicast group */ - result = join_mcast_group(sock->sk, - (struct in_addr *) &mcast_addr.sin_addr, - ip_vs_backup_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error joining to the multicast group\n"); - goto error; - } - - return sock; - - error: - sock_release(sock); - return ERR_PTR(result); -} - - -static int -ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) -{ - struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; - struct kvec iov; - int len; - - EnterFunction(7); - iov.iov_base = (void *)buffer; - iov.iov_len = length; - - len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); - - LeaveFunction(7); - return len; -} - -static void -ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) -{ - int msize; - - msize = msg->size; - - /* Put size in network byte order */ - msg->size = htons(msg->size); - - if (ip_vs_send_async(sock, (char *)msg, msize) != msize) - IP_VS_ERR("ip_vs_send_async error\n"); -} - -static int -ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) -{ - struct msghdr msg = {NULL,}; - struct kvec iov; - int len; - - EnterFunction(7); - - /* Receive a packet */ - iov.iov_base = buffer; - iov.iov_len = (size_t)buflen; - - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); - - if (len < 0) - return -1; - - LeaveFunction(7); - return len; -} - - -static int sync_thread_master(void *data) -{ - struct ip_vs_sync_thread_data *tinfo = data; - struct ip_vs_sync_buff *sb; - - IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_master_mcast_ifn, ip_vs_master_syncid); - - while (!kthread_should_stop()) { - while ((sb = sb_dequeue())) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); - } - - /* check if entries stay in curr_sb for 2 seconds */ - sb = get_curr_sync_buff(2 * HZ); - if (sb) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); - } - - schedule_timeout_interruptible(HZ); - } - - /* clean up the sync_buff queue */ - while ((sb=sb_dequeue())) { - ip_vs_sync_buff_release(sb); - } - - /* clean up the current sync_buff */ - if ((sb = get_curr_sync_buff(0))) { - ip_vs_sync_buff_release(sb); - } - - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo); - - return 0; -} - - -static int sync_thread_backup(void *data) -{ - struct ip_vs_sync_thread_data *tinfo = data; - int len; - - IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); - - while (!kthread_should_stop()) { - wait_event_interruptible(*tinfo->sock->sk->sk_sleep, - !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) - || kthread_should_stop()); - - /* do we have data now? */ - while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { - len = ip_vs_receive(tinfo->sock, tinfo->buf, - sync_recv_mesg_maxlen); - if (len <= 0) { - IP_VS_ERR("receiving message error\n"); - break; - } - - /* disable bottom half, because it accesses the data - shared by softirq while getting/creating conns */ - local_bh_disable(); - ip_vs_process_message(tinfo->buf, len); - local_bh_enable(); - } - } - - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo->buf); - kfree(tinfo); - - return 0; -} - - -int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) -{ - struct ip_vs_sync_thread_data *tinfo; - struct task_struct **realtask, *task; - struct socket *sock; - char *name, *buf = NULL; - int (*threadfn)(void *data); - int result = -ENOMEM; - - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", - sizeof(struct ip_vs_sync_conn)); - - if (state == IP_VS_STATE_MASTER) { - if (sync_master_thread) - return -EEXIST; - - strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, - sizeof(ip_vs_master_mcast_ifn)); - ip_vs_master_syncid = syncid; - realtask = &sync_master_thread; - name = "ipvs_syncmaster"; - threadfn = sync_thread_master; - sock = make_send_sock(); - } else if (state == IP_VS_STATE_BACKUP) { - if (sync_backup_thread) - return -EEXIST; - - strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, - sizeof(ip_vs_backup_mcast_ifn)); - ip_vs_backup_syncid = syncid; - realtask = &sync_backup_thread; - name = "ipvs_syncbackup"; - threadfn = sync_thread_backup; - sock = make_receive_sock(); - } else { - return -EINVAL; - } - - if (IS_ERR(sock)) { - result = PTR_ERR(sock); - goto out; - } - - set_sync_mesg_maxlen(state); - if (state == IP_VS_STATE_BACKUP) { - buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); - if (!buf) - goto outsocket; - } - - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - goto outbuf; - - tinfo->sock = sock; - tinfo->buf = buf; - - task = kthread_run(threadfn, tinfo, name); - if (IS_ERR(task)) { - result = PTR_ERR(task); - goto outtinfo; - } - - /* mark as active */ - *realtask = task; - ip_vs_sync_state |= state; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - return 0; - -outtinfo: - kfree(tinfo); -outbuf: - kfree(buf); -outsocket: - sock_release(sock); -out: - return result; -} - - -int stop_sync_thread(int state) -{ - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - - if (state == IP_VS_STATE_MASTER) { - if (!sync_master_thread) - return -ESRCH; - - IP_VS_INFO("stopping master sync thread %d ...\n", - task_pid_nr(sync_master_thread)); - - /* - * The lock synchronizes with sb_queue_tail(), so that we don't - * add sync buffers to the queue, when we are already in - * progress of stopping the master sync daemon. - */ - - spin_lock_bh(&ip_vs_sync_lock); - ip_vs_sync_state &= ~IP_VS_STATE_MASTER; - spin_unlock_bh(&ip_vs_sync_lock); - kthread_stop(sync_master_thread); - sync_master_thread = NULL; - } else if (state == IP_VS_STATE_BACKUP) { - if (!sync_backup_thread) - return -ESRCH; - - IP_VS_INFO("stopping backup sync thread %d ...\n", - task_pid_nr(sync_backup_thread)); - - ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; - kthread_stop(sync_backup_thread); - sync_backup_thread = NULL; - } else { - return -EINVAL; - } - - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return 0; -} diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c deleted file mode 100644 index 8c596e71259..00000000000 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * IPVS: Weighted Least-Connection Scheduling module - * - * Authors: Wensong Zhang - * Peter Kese - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest - * Wensong Zhang : changed to use the inactconns in scheduling - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_wlc_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. (This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least; - unsigned int loh, doh; - - IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) { - least = dest; - loh = ip_vs_wlc_dest_overhead(least); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - doh = ip_vs_wlc_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG_BUF(6, "WLC: server %s:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_wlc_scheduler = -{ - .name = "wlc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_wlc_schedule, -}; - - -static int __init ip_vs_wlc_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); -} - -static void __exit ip_vs_wlc_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); -} - -module_init(ip_vs_wlc_init); -module_exit(ip_vs_wlc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c deleted file mode 100644 index 7ea92fed50b..00000000000 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * IPVS: Weighted Round-Robin Scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_wrr_update_svc - * Julian Anastasov : fixed the bug of returning destination - * with weight 0 when all weights are zero - * - */ - -#include -#include -#include - -#include - -/* - * current destination pointer for weighted round-robin scheduling - */ -struct ip_vs_wrr_mark { - struct list_head *cl; /* current list head */ - int cw; /* current weight */ - int mw; /* maximum weight */ - int di; /* decreasing interval */ -}; - - -/* - * Get the gcd of server weights - */ -static int gcd(int a, int b) -{ - int c; - - while ((c = a % b)) { - a = b; - b = c; - } - return b; -} - -static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - int weight; - int g = 0; - - list_for_each_entry(dest, &svc->destinations, n_list) { - weight = atomic_read(&dest->weight); - if (weight > 0) { - if (g > 0) - g = gcd(weight, g); - else - g = weight; - } - } - return g ? g : 1; -} - - -/* - * Get the maximum weight of the service destinations. - */ -static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - int weight = 0; - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (atomic_read(&dest->weight) > weight) - weight = atomic_read(&dest->weight); - } - - return weight; -} - - -static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_wrr_mark *mark; - - /* - * Allocate the mark variable for WRR scheduling - */ - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); - if (mark == NULL) { - IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); - return -ENOMEM; - } - mark->cl = &svc->destinations; - mark->cw = 0; - mark->mw = ip_vs_wrr_max_weight(svc); - mark->di = ip_vs_wrr_gcd_weight(svc); - svc->sched_data = mark; - - return 0; -} - - -static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) -{ - /* - * Release the mark variable - */ - kfree(svc->sched_data); - - return 0; -} - - -static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_wrr_mark *mark = svc->sched_data; - - mark->cl = &svc->destinations; - mark->mw = ip_vs_wrr_max_weight(svc); - mark->di = ip_vs_wrr_gcd_weight(svc); - if (mark->cw > mark->mw) - mark->cw = 0; - return 0; -} - - -/* - * Weighted Round-Robin Scheduling - */ -static struct ip_vs_dest * -ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_wrr_mark *mark = svc->sched_data; - struct list_head *p; - - IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); - - /* - * This loop will always terminate, because mark->cw in (0, max_weight] - * and at least one server has its weight equal to max_weight. - */ - write_lock(&svc->sched_lock); - p = mark->cl; - while (1) { - if (mark->cl == &svc->destinations) { - /* it is at the head of the destination list */ - - if (mark->cl == mark->cl->next) { - /* no dest entry */ - dest = NULL; - goto out; - } - - mark->cl = svc->destinations.next; - mark->cw -= mark->di; - if (mark->cw <= 0) { - mark->cw = mark->mw; - /* - * Still zero, which means no available servers. - */ - if (mark->cw == 0) { - mark->cl = &svc->destinations; - IP_VS_ERR_RL("ip_vs_wrr_schedule(): " - "no available servers\n"); - dest = NULL; - goto out; - } - } - } else - mark->cl = mark->cl->next; - - if (mark->cl != &svc->destinations) { - /* not at the head of the list */ - dest = list_entry(mark->cl, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) >= mark->cw) { - /* got it */ - break; - } - } - - if (mark->cl == p && mark->cw == mark->di) { - /* back to the start, and no dest is found. - It is only possible when all dests are OVERLOADED */ - dest = NULL; - goto out; - } - } - - IP_VS_DBG_BUF(6, "WRR: server %s:%u " - "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), - atomic_read(&dest->weight)); - - out: - write_unlock(&svc->sched_lock); - return dest; -} - - -static struct ip_vs_scheduler ip_vs_wrr_scheduler = { - .name = "wrr", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .init_service = ip_vs_wrr_init_svc, - .done_service = ip_vs_wrr_done_svc, - .update_service = ip_vs_wrr_update_svc, - .schedule = ip_vs_wrr_schedule, -}; - -static int __init ip_vs_wrr_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; -} - -static void __exit ip_vs_wrr_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); -} - -module_init(ip_vs_wrr_init); -module_exit(ip_vs_wrr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c deleted file mode 100644 index 02ddc2b3ce2..00000000000 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ /dev/null @@ -1,1004 +0,0 @@ -/* - * ip_vs_xmit.c: various packet transmitters for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include /* for tcphdr */ -#include -#include /* for csum_tcpudp_magic */ -#include -#include /* for icmp_send */ -#include /* for ip_route_output */ -#include -#include -#include -#include -#include - -#include - - -/* - * Destination cache to speed up outgoing route lookup - */ -static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) -{ - struct dst_entry *old_dst; - - old_dst = dest->dst_cache; - dest->dst_cache = dst; - dest->dst_rtos = rtos; - dst_release(old_dst); -} - -static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) -{ - struct dst_entry *dst = dest->dst_cache; - - if (!dst) - return NULL; - if ((dst->obsolete - || (dest->af == AF_INET && rtos != dest->dst_rtos)) && - dst->ops->check(dst, cookie) == NULL) { - dest->dst_cache = NULL; - dst_release(dst); - return NULL; - } - dst_hold(dst); - return dst; -} - -static struct rtable * -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) -{ - struct rtable *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; - - if (dest) { - spin_lock(&dest->dst_lock); - if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos, 0))) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = dest->addr.ip, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip_route_output error, " - "dest: %u.%u.%u.%u\n", - NIPQUAD(dest->addr.ip)); - return NULL; - } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); - IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", - NIPQUAD(dest->addr.ip), - atomic_read(&rt->u.dst.__refcnt), rtos); - } - spin_unlock(&dest->dst_lock); - } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = cp->daddr.ip, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_route_output error, dest: " - "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip)); - return NULL; - } - } - - return rt; -} - -#ifdef CONFIG_IP_VS_IPV6 -static struct rt6_info * -__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) -{ - struct rt6_info *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; - - if (dest) { - spin_lock(&dest->dst_lock); - rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); - if (!rt) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = dest->addr.in6, - .saddr = { - .s6_addr32 = - { 0, 0, 0, 0 }, - }, - }, - }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, - NULL, &fl); - if (!rt) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip6_route_output error, " - "dest: " NIP6_FMT "\n", - NIP6(dest->addr.in6)); - return NULL; - } - __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst)); - IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n", - NIP6(dest->addr.in6), - atomic_read(&rt->u.dst.__refcnt)); - } - spin_unlock(&dest->dst_lock); - } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = cp->daddr.in6, - .saddr = { - .s6_addr32 = { 0, 0, 0, 0 }, - }, - }, - }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("ip6_route_output error, dest: " - NIP6_FMT "\n", NIP6(cp->daddr.in6)); - return NULL; - } - } - - return rt; -} -#endif - - -/* - * Release dest->dst_cache before a dest is removed - */ -void -ip_vs_dst_reset(struct ip_vs_dest *dest) -{ - struct dst_entry *old_dst; - - old_dst = dest->dst_cache; - dest->dst_cache = NULL; - dst_release(old_dst); -} - -#define IP_VS_XMIT(pf, skb, rt) \ -do { \ - (skb)->ipvs_property = 1; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - (rt)->u.dst.dev, dst_output); \ -} while (0) - - -/* - * NULL transmitter (do nothing except return NF_ACCEPT) - */ -int -ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - /* we do not touch skb and do not need pskb ptr */ - return NF_ACCEPT; -} - - -/* - * Bypass transmitter - * Let packets bypass the destination when the destination is not - * available, it may be only used in transparent cache cluster. - */ -int -ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - u8 tos = iph->tos; - int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = iph->daddr, - .saddr = 0, - .tos = RT_TOS(tos), } }, - }; - - EnterFunction(10); - - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " - "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); - goto tx_error_icmp; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - struct ipv6hdr *iph = ipv6_hdr(skb); - int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = iph->daddr, - .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, - }; - - EnterFunction(10); - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, " - "dest: " NIP6_FMT "\n", NIP6(iph->daddr)); - goto tx_error_icmp; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); - return NF_STOLEN; - } - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} -#endif - -/* - * NAT transmitter (only for outside-to-inside nat forwarding) - * Not used for related ICMP - */ -int -ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - int mtu; - struct iphdr *iph = ip_hdr(skb); - - EnterFunction(10); - - /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { - __be16 _pt, *p; - p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); - if (p == NULL) - goto tx_error; - ip_vs_conn_fill_cport(cp, *p); - IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); - } - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error; - ip_hdr(skb)->daddr = cp->daddr.ip; - ip_send_check(ip_hdr(skb)); - - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - - /* FIXME: when application helper enlarges the packet and the length - is larger than the MTU of outgoing device, there will be still - MTU problem. */ - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - LeaveFunction(10); - kfree_skb(skb); - return NF_STOLEN; - tx_error_put: - ip_rt_put(rt); - goto tx_error; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - int mtu; - - EnterFunction(10); - - /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { - __be16 _pt, *p; - p = skb_header_pointer(skb, sizeof(struct ipv6hdr), - sizeof(_pt), &_pt); - if (p == NULL) - goto tx_error; - ip_vs_conn_fill_cport(cp, *p); - IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); - } - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "ip_vs_nat_xmit_v6(): frag needed for"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error; - ipv6_hdr(skb)->daddr = cp->daddr.in6; - - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - - /* FIXME: when application helper enlarges the packet and the length - is larger than the MTU of outgoing device, there will be still - MTU problem. */ - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - LeaveFunction(10); - kfree_skb(skb); - return NF_STOLEN; -tx_error_put: - dst_release(&rt->u.dst); - goto tx_error; -} -#endif - - -/* - * IP Tunneling transmitter - * - * This function encapsulates the packet in a new IP packet, its - * destination will be set to cp->daddr. Most code of this function - * is taken from ipip.c. - * - * It is used in VS/TUN cluster. The load balancer selects a real - * server from a cluster based on a scheduling algorithm, - * encapsulates the request packet and forwards it to the selected - * server. For example, all real servers are configured with - * "ifconfig tunl0 up". When the server receives - * the encapsulated packet, it will decapsulate the packet, processe - * the request and return the response packets directly to the client - * without passing the load balancer. This can greatly increase the - * scalability of virtual server. - * - * Used for ANY protocol - */ -int -ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); - u8 tos = old_iph->tos; - __be16 df = old_iph->frag_off; - sk_buff_data_t old_transport_header = skb->transport_header; - struct iphdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - int mtu; - - EnterFunction(10); - - if (skb->protocol != htons(ETH_P_IP)) { - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " - "ETH_P_IP: %d, skb protocol: %d\n", - htons(ETH_P_IP), skb->protocol); - goto tx_error; - } - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) - goto tx_error_icmp; - - tdev = rt->u.dst.dev; - - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); - if (mtu < 68) { - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); - goto tx_error; - } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); - - df |= (old_iph->frag_off & htons(IP_DF)); - - if ((old_iph->frag_off & htons(IP_DF)) - && mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Okay, now see if we can stuff it in the buffer as-is. - */ - max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - kfree_skb(skb); - IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); - return NF_STOLEN; - } - kfree_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); - } - - skb->transport_header = old_transport_header; - - /* fix old IP header checksum */ - ip_send_check(old_iph); - - skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* - * Push down and install the IPIP header. - */ - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr)>>2; - iph->frag_off = df; - iph->protocol = IPPROTO_IPIP; - iph->tos = tos; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; - iph->ttl = old_iph->ttl; - ip_select_ident(iph, &rt->u.dst, NULL); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - ip_local_out(skb); - - LeaveFunction(10); - - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct ipv6hdr *old_iph = ipv6_hdr(skb); - sk_buff_data_t old_transport_header = skb->transport_header; - struct ipv6hdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - int mtu; - - EnterFunction(10); - - if (skb->protocol != htons(ETH_P_IPV6)) { - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, " - "ETH_P_IPV6: %d, skb protocol: %d\n", - htons(ETH_P_IPV6), skb->protocol); - goto tx_error; - } - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - tdev = rt->u.dst.dev; - - mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr); - /* TODO IPv6: do we need this check in IPv6? */ - if (mtu < 1280) { - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n"); - goto tx_error; - } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); - - if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n"); - goto tx_error; - } - - /* - * Okay, now see if we can stuff it in the buffer as-is. - */ - max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); - - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - dst_release(&rt->u.dst); - kfree_skb(skb); - IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n"); - return NF_STOLEN; - } - kfree_skb(skb); - skb = new_skb; - old_iph = ipv6_hdr(skb); - } - - skb->transport_header = old_transport_header; - - skb_push(skb, sizeof(struct ipv6hdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* - * Push down and install the IPIP header. - */ - iph = ipv6_hdr(skb); - iph->version = 6; - iph->nexthdr = IPPROTO_IPV6; - iph->payload_len = old_iph->payload_len + sizeof(old_iph); - iph->priority = old_iph->priority; - memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); - iph->daddr = rt->rt6i_dst.addr; - iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ - iph->hop_limit = old_iph->hop_limit; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - ip6_local_out(skb); - - LeaveFunction(10); - - return NF_STOLEN; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} -#endif - - -/* - * Direct Routing transmitter - * Used for ANY protocol - */ -int -ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - int mtu; - - EnterFunction(10); - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - int mtu; - - EnterFunction(10); - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); - return NF_STOLEN; - } - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} -#endif - - -/* - * ICMP packet transmitter - * called by the ip_vs_in_icmp - */ -int -ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset) -{ - struct rtable *rt; /* Route to the other host */ - int mtu; - int rc; - - EnterFunction(10); - - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be - forwarded directly here, because there is no need to - translate address/port back */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); - else - rc = NF_ACCEPT; - /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); - goto out; - } - - /* - * mangle and send the packet here (only for VS/NAT) - */ - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - ip_vs_nat_icmp(skb, pp, cp, 0); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - rc = NF_STOLEN; - goto out; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - dev_kfree_skb(skb); - rc = NF_STOLEN; - out: - LeaveFunction(10); - return rc; - tx_error_put: - ip_rt_put(rt); - goto tx_error; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset) -{ - struct rt6_info *rt; /* Route to the other host */ - int mtu; - int rc; - - EnterFunction(10); - - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be - forwarded directly here, because there is no need to - translate address/port back */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); - else - rc = NF_ACCEPT; - /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); - goto out; - } - - /* - * mangle and send the packet here (only for VS/NAT) - */ - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - ip_vs_nat_icmp_v6(skb, pp, cp, 0); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - rc = NF_STOLEN; - goto out; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - dev_kfree_skb(skb); - rc = NF_STOLEN; -out: - LeaveFunction(10); - return rc; -tx_error_put: - dst_release(&rt->u.dst); - goto tx_error; -} -#endif -- cgit v1.2.3 From 071d7ab6649eb34a873a53e71635186e9117101d Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 8 Oct 2008 14:41:35 -0700 Subject: ipvs: Remove stray file left over from ipvs move Commit cb7f6a7b716e801097b564dec3ccb58d330aef56 ("IPVS: Move IPVS to net/netfilter/ipvs") has left a stray file in the old location of ipvs. Signed-off-by: Sven Wegener Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_dh.c | 261 ----------------------------------------------- 1 file changed, 261 deletions(-) delete mode 100644 net/ipv4/ipvs/ip_vs_dh.c (limited to 'net/ipv4/ipvs') diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c deleted file mode 100644 index a16943fd72f..00000000000 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ /dev/null @@ -1,261 +0,0 @@ -/* - * IPVS: Destination Hashing scheduling module - * - * Authors: Wensong Zhang - * - * Inspired by the consistent hashing scheduler patch from - * Thomas Proell - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The dh algorithm is to select server by the hash key of destination IP - * address. The pseudo code is as follows: - * - * n <- servernode[dest_ip]; - * if (n is dead) OR - * (n is overloaded) OR (n.weight <= 0) then - * return NULL; - * - * return n; - * - * Notes that servernode is a 256-bucket hash table that maps the hash - * index derived from packet destination IP address to the current server - * array. If the dh scheduler is used in cache cluster, it is good to - * combine it with cache_bypass feature. When the statically assigned - * server is dead or overloaded, the load balancer can bypass the cache - * server and send requests to the original server directly. - * - */ - -#include -#include -#include -#include - -#include - - -/* - * IPVS DH bucket - */ -struct ip_vs_dh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ -}; - -/* - * for IPVS DH entry hash table - */ -#ifndef CONFIG_IP_VS_DH_TAB_BITS -#define CONFIG_IP_VS_DH_TAB_BITS 8 -#endif -#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS -#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) -#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) - - -/* - * Returns hash value for IPVS DH entry - */ -static inline unsigned ip_vs_dh_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; -} - - -/* - * Get ip_vs_dest associated with supplied parameters. - */ -static inline struct ip_vs_dest * -ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr) -{ - return (tbl[ip_vs_dh_hashkey(addr)]).dest; -} - - -/* - * Assign all the hash buckets of the specified table with the service. - */ -static int -ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) -{ - int i; - struct ip_vs_dh_bucket *b; - struct list_head *p; - struct ip_vs_dest *dest; - - b = tbl; - p = &svc->destinations; - for (i=0; idest = NULL; - } else { - if (p == &svc->destinations) - p = p->next; - - dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; - - p = p->next; - } - b++; - } - return 0; -} - - -/* - * Flush all the hash buckets of the specified table. - */ -static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) -{ - int i; - struct ip_vs_dh_bucket *b; - - b = tbl; - for (i=0; idest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; - } - b++; - } -} - - -static int ip_vs_dh_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl; - - /* allocate the DH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); - - return 0; -} - - -static int ip_vs_dh_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - return 0; -} - - -static int ip_vs_dh_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); - - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); - - return 0; -} - - -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. - */ -static inline int is_overloaded(struct ip_vs_dest *dest) -{ - return dest->flags & IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Destination hashing scheduling - */ -static struct ip_vs_dest * -ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_dh_bucket *tbl; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_dh_bucket *)svc->sched_data; - dest = ip_vs_dh_get(tbl, iph->daddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { - return NULL; - } - - IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->daddr), - NIPQUAD(dest->addr.ip), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS DH Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_dh_scheduler = -{ - .name = "dh", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 0, -#endif - .init_service = ip_vs_dh_init_svc, - .done_service = ip_vs_dh_done_svc, - .update_service = ip_vs_dh_update_svc, - .schedule = ip_vs_dh_schedule, -}; - - -static int __init ip_vs_dh_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_dh_scheduler); -} - - -static void __exit ip_vs_dh_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); -} - - -module_init(ip_vs_dh_init); -module_exit(ip_vs_dh_cleanup); -MODULE_LICENSE("GPL"); -- cgit v1.2.3