summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c2
-rw-r--r--net/bluetooth/hci_core.c9
-rw-r--r--net/bridge/br_mdb.c2
-rw-r--r--net/bridge/br_multicast.c2
-rw-r--r--net/bridge/br_netfilter.c27
-rw-r--r--net/bridge/br_netlink.c4
-rw-r--r--net/bridge/br_private.h2
-rw-r--r--net/bridge/br_stp_timer.c2
-rw-r--r--net/bridge/netfilter/ebtables.c4
-rw-r--r--net/caif/caif_socket.c8
-rw-r--r--net/ceph/osd_client.c33
-rw-r--r--net/core/dev.c14
-rw-r--r--net/core/ethtool.c10
-rw-r--r--net/core/net_namespace.c2
-rw-r--r--net/core/rtnetlink.c15
-rw-r--r--net/core/skbuff.c30
-rw-r--r--net/core/sock.c2
-rw-r--r--net/dccp/ipv4.c3
-rw-r--r--net/dccp/ipv6.c3
-rw-r--r--net/dccp/minisocks.c3
-rw-r--r--net/dsa/dsa.c2
-rw-r--r--net/ieee802154/Makefile4
-rw-r--r--net/ieee802154/nl-phy.c5
-rw-r--r--net/ieee802154/nl802154.c2
-rw-r--r--net/ieee802154/rdev-ops.h85
-rw-r--r--net/ieee802154/trace.c7
-rw-r--r--net/ieee802154/trace.h247
-rw-r--r--net/ipv4/fib_trie.c3
-rw-r--r--net/ipv4/inet_connection_sock.c34
-rw-r--r--net/ipv4/inet_diag.c8
-rw-r--r--net/ipv4/netfilter/arp_tables.c6
-rw-r--r--net/ipv4/netfilter/ip_tables.c6
-rw-r--r--net/ipv4/ping.c1
-rw-r--r--net/ipv4/route.c9
-rw-r--r--net/ipv4/tcp.c31
-rw-r--r--net/ipv4/tcp_dctcp.c20
-rw-r--r--net/ipv4/tcp_fastopen.c5
-rw-r--r--net/ipv4/tcp_illinois.c21
-rw-r--r--net/ipv4/tcp_input.c55
-rw-r--r--net/ipv4/tcp_ipv4.c3
-rw-r--r--net/ipv4/tcp_minisocks.c9
-rw-r--r--net/ipv4/tcp_output.c64
-rw-r--r--net/ipv4/tcp_vegas.c19
-rw-r--r--net/ipv4/tcp_vegas.h3
-rw-r--r--net/ipv4/tcp_westwood.c15
-rw-r--r--net/ipv6/ip6_fib.c39
-rw-r--r--net/ipv6/ip6_gre.c9
-rw-r--r--net/ipv6/ip6_output.c43
-rw-r--r--net/ipv6/netfilter/ip6_tables.c6
-rw-r--r--net/ipv6/route.c19
-rw-r--r--net/ipv6/tcp_ipv6.c5
-rw-r--r--net/ipv6/udp.c4
-rw-r--r--net/mac80211/cfg.c59
-rw-r--r--net/mac80211/ieee80211_i.h9
-rw-r--r--net/mac80211/iface.c18
-rw-r--r--net/mac80211/key.c82
-rw-r--r--net/mac80211/key.h1
-rw-r--r--net/mac80211/rx.c5
-rw-r--r--net/mac80211/sta_info.c19
-rw-r--r--net/mac80211/util.c3
-rw-r--r--net/mac80211/wep.c6
-rw-r--r--net/mac802154/cfg.c9
-rw-r--r--net/mac802154/ieee802154_i.h3
-rw-r--r--net/mac802154/iface.c5
-rw-r--r--net/mac802154/llsec.c4
-rw-r--r--net/mac802154/main.c7
-rw-r--r--net/mpls/af_mpls.c141
-rw-r--r--net/mpls/internal.h16
-rw-r--r--net/netfilter/Kconfig2
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c3
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c35
-rw-r--r--net/netfilter/nf_tables_api.c7
-rw-r--r--net/netfilter/nfnetlink_log.c19
-rw-r--r--net/netfilter/nfnetlink_queue_core.c18
-rw-r--r--net/netfilter/nft_reject.c2
-rw-r--r--net/netfilter/nft_reject_inet.c2
-rw-r--r--net/netlink/af_netlink.c10
-rw-r--r--net/packet/af_packet.c9
-rw-r--r--net/rds/connection.c17
-rw-r--r--net/rds/ib_cm.c13
-rw-r--r--net/rds/tcp_connect.c1
-rw-r--r--net/rds/tcp_listen.c46
-rw-r--r--net/sched/act_connmark.c2
-rw-r--r--net/sched/cls_api.c12
-rw-r--r--net/sched/sch_api.c10
-rw-r--r--net/sched/sch_codel.c2
-rw-r--r--net/sched/sch_fq_codel.c2
-rw-r--r--net/sched/sch_gred.c4
-rw-r--r--net/socket.c6
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c23
-rw-r--r--net/sunrpc/rpc_pipe.c32
-rw-r--r--net/sunrpc/sched.c4
-rw-r--r--net/sunrpc/xprt.c22
-rw-r--r--net/sunrpc/xprtrdma/Makefile3
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c208
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c353
-rw-r--r--net/sunrpc/xprtrdma/physical_ops.c94
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c87
-rw-r--r--net/sunrpc/xprtrdma/transport.c61
-rw-r--r--net/sunrpc/xprtrdma/verbs.c699
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h90
-rw-r--r--net/switchdev/switchdev.c6
-rw-r--r--net/tipc/bearer.c17
-rw-r--r--net/tipc/link.c16
-rw-r--r--net/tipc/server.c9
-rw-r--r--net/tipc/socket.c3
-rw-r--r--net/unix/af_unix.c16
-rw-r--r--net/unix/diag.c2
-rw-r--r--net/unix/garbage.c70
109 files changed, 2204 insertions, 1186 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 98a30a5b8664..59555f0f8fc8 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -443,7 +443,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
case NETDEV_UP:
/* Put all VLANs for this dev in the up state too. */
vlan_group_for_each_dev(grp, i, vlandev) {
- flgs = vlandev->flags;
+ flgs = dev_get_flags(vlandev);
if (flgs & IFF_UP)
continue;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 476709bd068a..c4802f3bd4c5 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1557,7 +1557,8 @@ static int hci_dev_do_close(struct hci_dev *hdev)
{
BT_DBG("%s %p", hdev->name, hdev);
- if (!hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) &&
+ test_bit(HCI_UP, &hdev->flags)) {
/* Execute vendor specific shutdown routine */
if (hdev->shutdown)
hdev->shutdown(hdev);
@@ -2853,9 +2854,11 @@ static void le_scan_disable_work_complete(struct hci_dev *hdev, u8 status,
* state. If we were running both LE and BR/EDR inquiry
* simultaneously, and BR/EDR inquiry is already
* finished, stop discovery, otherwise BR/EDR inquiry
- * will stop discovery when finished.
+ * will stop discovery when finished. If we will resolve
+ * remote device name, do not change discovery state.
*/
- if (!test_bit(HCI_INQUIRY, &hdev->flags))
+ if (!test_bit(HCI_INQUIRY, &hdev->flags) &&
+ hdev->discovery.state != DISCOVERY_RESOLVING)
hci_discovery_set_state(hdev,
DISCOVERY_STOPPED);
} else {
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 409608960899..e29ad70b3000 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -170,7 +170,7 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
struct br_port_msg *bpm;
struct nlattr *nest, *nest2;
- nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI);
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0);
if (!nlh)
return -EMSGSIZE;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 4b6722f8f179..a3abe6ed111e 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1072,7 +1072,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
err = br_ip6_multicast_add_group(br, port, &grec->grec_mca,
vid);
- if (!err)
+ if (err)
break;
}
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index ab55e2472beb..60ddfbeb47f5 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -37,10 +37,6 @@
#include <net/route.h>
#include <net/netfilter/br_netfilter.h>
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-#include <net/netfilter/nf_conntrack.h>
-#endif
-
#include <asm/uaccess.h>
#include "br_private.h"
#ifdef CONFIG_SYSCTL
@@ -350,24 +346,15 @@ free_skb:
return 0;
}
-static bool dnat_took_place(const struct sk_buff *skb)
+static bool daddr_was_changed(const struct sk_buff *skb,
+ const struct nf_bridge_info *nf_bridge)
{
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || nf_ct_is_untracked(ct))
- return false;
-
- return test_bit(IPS_DST_NAT_BIT, &ct->status);
-#else
- return false;
-#endif
+ return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
}
/* This requires some explaining. If DNAT has taken place,
* we will need to fix up the destination Ethernet address.
+ * This is also true when SNAT takes place (for the reply direction).
*
* There are two cases to consider:
* 1. The packet was DNAT'ed to a device in the same bridge
@@ -421,7 +408,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
nf_bridge->pkt_otherhost = false;
}
nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
- if (dnat_took_place(skb)) {
+ if (daddr_was_changed(skb, nf_bridge)) {
if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -632,6 +619,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
+ struct nf_bridge_info *nf_bridge;
struct net_bridge_port *p;
struct net_bridge *br;
__u32 len = nf_bridge_encap_header_len(skb);
@@ -669,6 +657,9 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
if (!setup_pre_routing(skb))
return NF_DROP;
+ nf_bridge = nf_bridge_info_get(skb);
+ nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr;
+
skb->protocol = htons(ETH_P_IP);
NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb,
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 0e4ddb81610d..4b5c236998ff 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -394,7 +394,7 @@ errout:
* Dump information about all ports, in response to GETLINK
*/
int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
- struct net_device *dev, u32 filter_mask)
+ struct net_device *dev, u32 filter_mask, int nlflags)
{
struct net_bridge_port *port = br_port_get_rtnl(dev);
@@ -402,7 +402,7 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED))
return 0;
- return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, NLM_F_MULTI,
+ return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags,
filter_mask, dev);
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 6ca0251cb478..3362c29400f1 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -828,7 +828,7 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port);
int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev,
- u32 filter_mask);
+ u32 filter_mask, int nlflags);
#ifdef CONFIG_SYSFS
/* br_sysfs_if.c */
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index 4fcaa67750fd..7caf7fae2d5b 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -97,7 +97,9 @@ static void br_forward_delay_timer_expired(unsigned long arg)
netif_carrier_on(br->dev);
}
br_log_state(p);
+ rcu_read_lock();
br_ifinfo_notify(RTM_NEWLINK, p);
+ rcu_read_unlock();
spin_unlock(&br->lock);
}
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 91180a7fc943..24c7c96bf5f8 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1117,6 +1117,8 @@ static int do_replace(struct net *net, const void __user *user,
return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
tmp.name[sizeof(tmp.name) - 1] = 0;
@@ -2159,6 +2161,8 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl,
return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
memcpy(repl, &tmp, offsetof(struct ebt_replace, hook_entry));
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 4ec0c803aef1..112ad784838a 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -330,6 +330,10 @@ static long caif_stream_data_wait(struct sock *sk, long timeo)
release_sock(sk);
timeo = schedule_timeout(timeo);
lock_sock(sk);
+
+ if (sock_flag(sk, SOCK_DEAD))
+ break;
+
clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
}
@@ -373,6 +377,10 @@ static int caif_stream_recvmsg(struct socket *sock, struct msghdr *msg,
struct sk_buff *skb;
lock_sock(sk);
+ if (sock_flag(sk, SOCK_DEAD)) {
+ err = -ECONNRESET;
+ goto unlock;
+ }
skb = skb_dequeue(&sk->sk_receive_queue);
caif_check_flow_release(sk);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 41a4abc7e98e..c4ec9239249a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1306,8 +1306,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
if (list_empty(&req->r_osd_item))
req->r_osd = NULL;
}
-
- list_del_init(&req->r_req_lru_item); /* can be on notarget */
ceph_osdc_put_request(req);
}
@@ -2017,20 +2015,29 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
err = __map_request(osdc, req,
force_resend || force_resend_writes);
dout("__map_request returned %d\n", err);
- if (err == 0)
- continue; /* no change and no osd was specified */
if (err < 0)
continue; /* hrm! */
- if (req->r_osd == NULL) {
- dout("tid %llu maps to no valid osd\n", req->r_tid);
- needmap++; /* request a newer map */
- continue;
- }
+ if (req->r_osd == NULL || err > 0) {
+ if (req->r_osd == NULL) {
+ dout("lingering %p tid %llu maps to no osd\n",
+ req, req->r_tid);
+ /*
+ * A homeless lingering request makes
+ * no sense, as it's job is to keep
+ * a particular OSD connection open.
+ * Request a newer map and kick the
+ * request, knowing that it won't be
+ * resent until we actually get a map
+ * that can tell us where to send it.
+ */
+ needmap++;
+ }
- dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
- __register_request(osdc, req);
- __unregister_linger_request(osdc, req);
+ dout("kicking lingering %p tid %llu osd%d\n", req,
+ req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
+ __register_request(osdc, req);
+ __unregister_linger_request(osdc, req);
+ }
}
reset_changed_osds(osdc);
mutex_unlock(&osdc->request_mutex);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1796cef55ab5..2c1c67fad64d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3079,7 +3079,7 @@ static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
{
- if (next_cpu != RPS_NO_CPU) {
+ if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
@@ -3184,7 +3184,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
* If the desired CPU (where last recvmsg was done) is
* different from current CPU (one in the rx-queue flow
* table entry), switch if one of the following holds:
- * - Current CPU is unset (equal to RPS_NO_CPU).
+ * - Current CPU is unset (>= nr_cpu_ids).
* - Current CPU is offline.
* - The current CPU's queue tail has advanced beyond the
* last packet that was enqueued using this table entry.
@@ -3192,14 +3192,14 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
* have been dequeued, thus preserving in order delivery.
*/
if (unlikely(tcpu != next_cpu) &&
- (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+ (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
}
- if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+ if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
goto done;
@@ -3240,14 +3240,14 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
- int cpu;
+ unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id <= flow_table->mask) {
rflow = &flow_table->flows[flow_id];
cpu = ACCESS_ONCE(rflow->cpu);
- if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+ if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
((int)(per_cpu(softnet_data, cpu).input_queue_head -
rflow->last_qtail) <
(int)(10 * flow_table->mask)))
@@ -5209,7 +5209,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
return -EBUSY;
- if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
+ if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
return -EEXIST;
if (master && netdev_master_upper_dev_get(dev))
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 1d00b8922902..1347e11f5cc9 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -359,7 +359,15 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
int err;
struct ethtool_cmd cmd;
- err = __ethtool_get_settings(dev, &cmd);
+ if (!dev->ethtool_ops->get_settings)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+ return -EFAULT;
+
+ cmd.cmd = ETHTOOL_GSET;
+
+ err = dev->ethtool_ops->get_settings(dev, &cmd);
if (err < 0)
return err;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 78fc04ad36fc..572af0011997 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -601,7 +601,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
}
err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
- RTM_GETNSID, net, peer, -1);
+ RTM_NEWNSID, net, peer, -1);
if (err < 0)
goto err_out;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 358d52a38533..8de36824018d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2416,6 +2416,9 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
{
struct sk_buff *skb;
+ if (dev->reg_state != NETREG_REGISTERED)
+ return;
+
skb = rtmsg_ifinfo_build_skb(type, dev, change, flags);
if (skb)
rtmsg_ifinfo_send(skb, dev, flags);
@@ -2854,7 +2857,7 @@ static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
struct net_device *dev, u16 mode,
- u32 flags, u32 mask)
+ u32 flags, u32 mask, int nlflags)
{
struct nlmsghdr *nlh;
struct ifinfomsg *ifm;
@@ -2863,7 +2866,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
- nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI);
+ nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
if (nlh == NULL)
return -EMSGSIZE;
@@ -2969,7 +2972,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
if (idx >= cb->args[0] &&
br_dev->netdev_ops->ndo_bridge_getlink(
- skb, portid, seq, dev, filter_mask) < 0)
+ skb, portid, seq, dev, filter_mask,
+ NLM_F_MULTI) < 0)
break;
idx++;
}
@@ -2977,7 +2981,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
if (ops->ndo_bridge_getlink) {
if (idx >= cb->args[0] &&
ops->ndo_bridge_getlink(skb, portid, seq, dev,
- filter_mask) < 0)
+ filter_mask,
+ NLM_F_MULTI) < 0)
break;
idx++;
}
@@ -3018,7 +3023,7 @@ static int rtnl_bridge_notify(struct net_device *dev)
goto errout;
}
- err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0);
+ err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0);
if (err < 0)
goto errout;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d1967dab9cc6..3cfff2a3d651 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -280,13 +280,14 @@ nodata:
EXPORT_SYMBOL(__alloc_skb);
/**
- * build_skb - build a network buffer
+ * __build_skb - build a network buffer
* @data: data buffer provided by caller
- * @frag_size: size of fragment, or 0 if head was kmalloced
+ * @frag_size: size of data, or 0 if head was kmalloced
*
* Allocate a new &sk_buff. Caller provides space holding head and
* skb_shared_info. @data must have been allocated by kmalloc() only if
- * @frag_size is 0, otherwise data should come from the page allocator.
+ * @frag_size is 0, otherwise data should come from the page allocator
+ * or vmalloc()
* The return is the new skb buffer.
* On a failure the return is %NULL, and @data is not freed.
* Notes :
@@ -297,7 +298,7 @@ EXPORT_SYMBOL(__alloc_skb);
* before giving packet to stack.
* RX rings only contains data buffers, not full skbs.
*/
-struct sk_buff *build_skb(void *data, unsigned int frag_size)
+struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
struct skb_shared_info *shinfo;
struct sk_buff *skb;
@@ -311,7 +312,6 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->truesize = SKB_TRUESIZE(size);
- skb->head_frag = frag_size != 0;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
@@ -328,6 +328,23 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
return skb;
}
+
+/* build_skb() is wrapper over __build_skb(), that specifically
+ * takes care of skb->head and skb->pfmemalloc
+ * This means that if @frag_size is not zero, then @data must be backed
+ * by a page fragment, not kmalloc() or vmalloc()
+ */
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb = __build_skb(data, frag_size);
+
+ if (skb && frag_size) {
+ skb->head_frag = 1;
+ if (virt_to_head_page(data)->pfmemalloc)
+ skb->pfmemalloc = 1;
+ }
+ return skb;
+}
EXPORT_SYMBOL(build_skb);
struct netdev_alloc_cache {
@@ -348,7 +365,8 @@ static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
gfp_t gfp = gfp_mask;
if (order) {
- gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
+ gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
+ __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
nc->frag.size = PAGE_SIZE << (page ? order : 0);
}
diff --git a/net/core/sock.c b/net/core/sock.c
index e891bcf325ca..292f42228bfb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1474,8 +1474,8 @@ void sk_release_kernel(struct sock *sk)
return;
sock_hold(sk);
- sock_net_set(sk, get_net(&init_net));
sock_release(sk->sk_socket);
+ sock_net_set(sk, get_net(&init_net));
sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 2b4f21d34df6..ccf4c5629b3c 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -453,7 +453,8 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
iph->saddr, iph->daddr);
if (req) {
nsk = dccp_check_req(sk, skb, req);
- reqsk_put(req);
+ if (!nsk)
+ reqsk_put(req);
return nsk;
}
nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 9d0551092c6c..5165571f397a 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -301,7 +301,8 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
&iph->daddr, inet6_iif(skb));
if (req) {
nsk = dccp_check_req(sk, skb, req);
- reqsk_put(req);
+ if (!nsk)
+ reqsk_put(req);
return nsk;
}
nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 5f566663e47f..30addee2dd03 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -186,8 +186,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
if (child == NULL)
goto listen_overflow;
- inet_csk_reqsk_queue_unlink(sk, req);
- inet_csk_reqsk_queue_removed(sk, req);
+ inet_csk_reqsk_queue_drop(sk, req);
inet_csk_reqsk_queue_add(sk, req, child);
out:
return child;
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 079a224471e7..e6f6cc3a1bcf 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -633,7 +633,7 @@ static int dsa_of_probe(struct device *dev)
if (cd->sw_addr > PHY_MAX_ADDR)
continue;
- if (!of_property_read_u32(np, "eeprom-length", &eeprom_len))
+ if (!of_property_read_u32(child, "eeprom-length", &eeprom_len))
cd->eeprom_len = eeprom_len;
for_each_available_child_of_node(child, port) {
diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile
index 05dab2957cd4..4adfd4d5471b 100644
--- a/net/ieee802154/Makefile
+++ b/net/ieee802154/Makefile
@@ -3,7 +3,9 @@ obj-$(CONFIG_IEEE802154_SOCKET) += ieee802154_socket.o
obj-y += 6lowpan/
ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \
- header_ops.o sysfs.o nl802154.o
+ header_ops.o sysfs.o nl802154.o trace.o
ieee802154_socket-y := socket.o
+CFLAGS_trace.o := -I$(src)
+
ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 1b9d25f6e898..346c6665d25e 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -175,6 +175,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
int rc = -ENOBUFS;
struct net_device *dev;
int type = __IEEE802154_DEV_INVALID;
+ unsigned char name_assign_type;
pr_debug("%s\n", __func__);
@@ -190,8 +191,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1]
!= '\0')
return -EINVAL; /* phy name should be null-terminated */
+ name_assign_type = NET_NAME_USER;
} else {
devname = "wpan%d";
+ name_assign_type = NET_NAME_ENUM;
}
if (strlen(devname) >= IFNAMSIZ)
@@ -221,7 +224,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
}
dev = rdev_add_virtual_intf_deprecated(wpan_phy_to_rdev(phy), devname,
- type);
+ name_assign_type, type);
if (IS_ERR(dev)) {
rc = PTR_ERR(dev);
goto nla_put_failure;
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index a4daf91b8d0a..f3c12f6a4a39 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -589,7 +589,7 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info)
return rdev_add_virtual_intf(rdev,
nla_data(info->attrs[NL802154_ATTR_IFNAME]),
- type, extended_addr);
+ NET_NAME_USER, type, extended_addr);
}
static int nl802154_del_interface(struct sk_buff *skb, struct genl_info *info)
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
index 7c46732fad2b..7b5a9dd94fe5 100644
--- a/net/ieee802154/rdev-ops.h
+++ b/net/ieee802154/rdev-ops.h
@@ -4,13 +4,16 @@
#include <net/cfg802154.h>
#include "core.h"
+#include "trace.h"
static inline struct net_device *
rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
- const char *name, int type)
+ const char *name,
+ unsigned char name_assign_type,
+ int type)
{
return rdev->ops->add_virtual_intf_deprecated(&rdev->wpan_phy, name,
- type);
+ name_assign_type, type);
}
static inline void
@@ -22,75 +25,131 @@ rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
static inline int
rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name,
+ unsigned char name_assign_type,
enum nl802154_iftype type, __le64 extended_addr)
{
- return rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, type,
+ int ret;
+
+ trace_802154_rdev_add_virtual_intf(&rdev->wpan_phy, name, type,
extended_addr);
+ ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name,
+ name_assign_type, type,
+ extended_addr);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
}
static inline int
rdev_del_virtual_intf(struct cfg802154_registered_device *rdev,
struct wpan_dev *wpan_dev)
{
- return rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev);
+ int ret;
+
+ trace_802154_rdev_del_virtual_intf(&rdev->wpan_phy, wpan_dev);
+ ret = rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
}
static inline int
rdev_set_channel(struct cfg802154_registered_device *rdev, u8 page, u8 channel)
{
- return rdev->ops->set_channel(&rdev->wpan_phy, page, channel);
+ int ret;
+
+ trace_802154_rdev_set_channel(&rdev->wpan_phy, page, channel);
+ ret = rdev->ops->set_channel(&rdev->wpan_phy, page, channel);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
}
static inline int
rdev_set_cca_mode(struct cfg802154_registered_device *rdev,
const struct wpan_phy_cca *cca)
{
- return rdev->ops->set_cca_mode(&rdev->wpan_phy, cca);
+ int ret;
+
+ trace_802154_rdev_set_cca_mode(&rdev->wpan_phy, cca);
+ ret = rdev->ops->set_cca_mode(&rdev->wpan_phy, cca);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
}
static inline int
rdev_set_pan_id(struct cfg802154_registered_device *rdev,
struct wpan_dev *wpan_dev, __le16 pan_id)
{
- return rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id);
+ int ret;
+
+ trace_802154_rdev_set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id);
+ ret = rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
}
static inline int
rdev_set_short_addr(struct cfg802154_registered_device *rdev,
struct wpan_dev *wpan_dev, __le16 short_addr)
{
- return rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr);
+ int ret;
+
+ trace_802154_rdev_set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr);
+ ret = rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
}
static inline int
rdev_set_backoff_exponent(struct cfg802154_registered_device *rdev,
struct wpan_dev *wpan_dev, u8 min_be, u8 max_be)
{
- return rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev,
+ int ret;
+
+ trace_802154_rdev_set_backoff_exponent(&rdev->wpan_phy, wpan_dev,
min_be, max_be);
+ ret = rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev,
+ min_be, max_be);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
}
static inline int
rdev_set_max_csma_backoffs(struct cfg802154_registered_device *rdev,
struct wpan_dev *wpan_dev, u8 max_csma_backoffs)
{
- return rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev,
- max_csma_backoffs);
+ int ret;
+
+ trace_802154_rdev_set_csma_backoffs(&rdev->wpan_phy, wpan_dev,
+ max_csma_backoffs);
+ ret = rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev,
+ max_csma_backoffs);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
}
static inline int
rdev_set_max_frame_retries(struct cfg802154_registered_device *rdev,
struct wpan_dev *wpan_dev, s8 max_frame_retries)
{
- return rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev,
+ int ret;
+
+ trace_802154_rdev_set_max_frame_retries(&rdev->wpan_phy, wpan_dev,
max_frame_retries);
+ ret = rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev,
+ max_frame_retries);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
}
static inline int
rdev_set_lbt_mode(struct cfg802154_registered_device *rdev,
struct wpan_dev *wpan_dev, bool mode)
{
- return rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode);
+ int ret;
+
+ trace_802154_rdev_set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode);
+ ret = rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
}
#endif /* __CFG802154_RDEV_OPS */
diff --git a/net/ieee802154/trace.c b/net/ieee802154/trace.c
new file mode 100644
index 000000000000..95f997fad755
--- /dev/null
+++ b/net/ieee802154/trace.c
@@ -0,0 +1,7 @@
+#include <linux/module.h>
+
+#ifndef __CHECKER__
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#endif
diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h
new file mode 100644
index 000000000000..5ac25eb6ed17
--- /dev/null
+++ b/net/ieee802154/trace.h
@@ -0,0 +1,247 @@
+/* Based on net/wireless/tracing.h */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cfg802154
+
+#if !defined(__RDEV_CFG802154_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ)
+#define __RDEV_CFG802154_OPS_TRACE
+
+#include <linux/tracepoint.h>
+
+#include <net/cfg802154.h>
+
+#define MAXNAME 32
+#define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME)
+#define WPAN_PHY_ASSIGN strlcpy(__entry->wpan_phy_name, \
+ wpan_phy_name(wpan_phy), \
+ MAXNAME)
+#define WPAN_PHY_PR_FMT "%s"
+#define WPAN_PHY_PR_ARG __entry->wpan_phy_name
+
+#define WPAN_DEV_ENTRY __field(u32, identifier)
+#define WPAN_DEV_ASSIGN (__entry->identifier) = (!IS_ERR_OR_NULL(wpan_dev) \
+ ? wpan_dev->identifier : 0)
+#define WPAN_DEV_PR_FMT "wpan_dev(%u)"
+#define WPAN_DEV_PR_ARG (__entry->identifier)
+
+#define WPAN_CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \
+ __field(enum nl802154_cca_opts, cca_opt)
+#define WPAN_CCA_ASSIGN \
+ do { \
+ (__entry->cca_mode) = cca->mode; \
+ (__entry->cca_opt) = cca->opt; \
+ } while (0)
+#define WPAN_CCA_PR_FMT "cca_mode: %d, cca_opt: %d"
+#define WPAN_CCA_PR_ARG __entry->cca_mode, __entry->cca_opt
+
+#define BOOL_TO_STR(bo) (bo) ? "true" : "false"
+
+/*************************************************************
+ * rdev->ops traces *
+ *************************************************************/
+
+TRACE_EVENT(802154_rdev_add_virtual_intf,
+ TP_PROTO(struct wpan_phy *wpan_phy, char *name,
+ enum nl802154_iftype type, __le64 extended_addr),
+ TP_ARGS(wpan_phy, name, type, extended_addr),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __string(vir_intf_name, name ? name : "<noname>")
+ __field(enum nl802154_iftype, type)
+ __field(__le64, extended_addr)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __assign_str(vir_intf_name, name ? name : "<noname>");
+ __entry->type = type;
+ __entry->extended_addr = extended_addr;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, ea %llx",
+ WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type,
+ __le64_to_cpu(__entry->extended_addr))
+);
+
+TRACE_EVENT(802154_rdev_del_virtual_intf,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev),
+ TP_ARGS(wpan_phy, wpan_dev),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT, WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG)
+);
+
+TRACE_EVENT(802154_rdev_set_channel,
+ TP_PROTO(struct wpan_phy *wpan_phy, u8 page, u8 channel),
+ TP_ARGS(wpan_phy, page, channel),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(u8, page)
+ __field(u8, channel)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->page = page;
+ __entry->channel = channel;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", page: %d, channel: %d", WPAN_PHY_PR_ARG,
+ __entry->page, __entry->channel)
+);
+
+TRACE_EVENT(802154_rdev_set_cca_mode,
+ TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca),
+ TP_ARGS(wpan_phy, cca),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_CCA_ENTRY
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_CCA_ASSIGN;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_CCA_PR_FMT, WPAN_PHY_PR_ARG,
+ WPAN_CCA_PR_ARG)
+);
+
+DECLARE_EVENT_CLASS(802154_le16_template,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le16 le16arg),
+ TP_ARGS(wpan_phy, wpan_dev, le16arg),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(__le16, le16arg)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->le16arg = le16arg;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", pan id: 0x%04x",
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG,
+ __le16_to_cpu(__entry->le16arg))
+);
+
+DEFINE_EVENT(802154_le16_template, 802154_rdev_set_pan_id,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le16 le16arg),
+ TP_ARGS(wpan_phy, wpan_dev, le16arg)
+);
+
+DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le16 le16arg),
+ TP_ARGS(wpan_phy, wpan_dev, le16arg),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", sa: 0x%04x",
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG,
+ __le16_to_cpu(__entry->le16arg))
+);
+
+TRACE_EVENT(802154_rdev_set_backoff_exponent,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ u8 min_be, u8 max_be),
+ TP_ARGS(wpan_phy, wpan_dev, min_be, max_be),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(u8, min_be)
+ __field(u8, max_be)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->min_be = min_be;
+ __entry->max_be = max_be;
+ ),
+
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", min be: %d, max_be: %d", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be)
+);
+
+TRACE_EVENT(802154_rdev_set_csma_backoffs,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ u8 max_csma_backoffs),
+ TP_ARGS(wpan_phy, wpan_dev, max_csma_backoffs),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(u8, max_csma_backoffs)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->max_csma_backoffs = max_csma_backoffs;
+ ),
+
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", max csma backoffs: %d", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, __entry->max_csma_backoffs)
+);
+
+TRACE_EVENT(802154_rdev_set_max_frame_retries,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ s8 max_frame_retries),
+ TP_ARGS(wpan_phy, wpan_dev, max_frame_retries),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(s8, max_frame_retries)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->max_frame_retries = max_frame_retries;
+ ),
+
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", max frame retries: %d", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, __entry->max_frame_retries)
+);
+
+TRACE_EVENT(802154_rdev_set_lbt_mode,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ bool mode),
+ TP_ARGS(wpan_phy, wpan_dev, mode),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(bool, mode)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->mode = mode;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", lbt mode: %s", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode))
+);
+
+TRACE_EVENT(802154_rdev_return_int,
+ TP_PROTO(struct wpan_phy *wpan_phy, int ret),
+ TP_ARGS(wpan_phy, ret),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(int, ret)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->ret = ret;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", returned: %d", WPAN_PHY_PR_ARG,
+ __entry->ret)
+);
+
+#endif /* !__RDEV_CFG802154_OPS_TRACE || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e13fcc602da2..09b62e17dd8c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1164,6 +1164,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
state = fa->fa_state;
new_fa->fa_state = state & ~FA_S_ACCESSED;
new_fa->fa_slen = fa->fa_slen;
+ new_fa->tb_id = tb->tb_id;
err = netdev_switch_fib_ipv4_add(key, plen, fi,
new_fa->fa_tos,
@@ -1764,7 +1765,7 @@ void fib_table_flush_external(struct fib_table *tb)
/* record local slen */
slen = fa->fa_slen;
- if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL))
+ if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
continue;
netdev_switch_fib_ipv4_del(n->key,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 5c3dd6267ed3..8976ca423a07 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -564,6 +564,40 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
}
EXPORT_SYMBOL(inet_rtx_syn_ack);
+/* return true if req was found in the syn_table[] */
+static bool reqsk_queue_unlink(struct request_sock_queue *queue,
+ struct request_sock *req)
+{
+ struct listen_sock *lopt = queue->listen_opt;
+ struct request_sock **prev;
+ bool found = false;
+
+ spin_lock(&queue->syn_wait_lock);
+
+ for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
+ prev = &(*prev)->dl_next) {
+ if (*prev == req) {
+ *prev = req->dl_next;
+ found = true;
+ break;
+ }
+ }
+
+ spin_unlock(&queue->syn_wait_lock);
+ if (del_timer(&req->rsk_timer))
+ reqsk_put(req);
+ return found;
+}
+
+void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
+{
+ if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ reqsk_put(req);
+ }
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
+
static void reqsk_timer_handler(unsigned long data)
{
struct request_sock *req = (struct request_sock *)data;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index bb77ebdae3b3..4d32262c7502 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -224,14 +224,16 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
handler->idiag_get_info(sk, r, info);
if (sk->sk_state < TCP_TIME_WAIT) {
- int err = 0;
+ union tcp_cc_info info;
+ size_t sz = 0;
+ int attr;
rcu_read_lock();
ca_ops = READ_ONCE(icsk->icsk_ca_ops);
if (ca_ops && ca_ops->get_info)
- err = ca_ops->get_info(sk, ext, skb);
+ sz = ca_ops->get_info(sk, ext, &attr, &info);
rcu_read_unlock();
- if (err < 0)
+ if (sz && nla_put(skb, attr, sz, &info) < 0)
goto errout;
}
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 13bfe84bf3ca..a61200754f4b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1075,6 +1075,9 @@ static int do_replace(struct net *net, const void __user *user,
/* overflow check */
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
@@ -1499,6 +1502,9 @@ static int compat_do_replace(struct net *net, void __user *user,
return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c69db7fa25ee..2d0e265fef6e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1262,6 +1262,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
/* overflow check */
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
@@ -1809,6 +1812,9 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index a93f260cf24c..05ff44b758df 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -158,6 +158,7 @@ void ping_unhash(struct sock *sk)
if (sk_hashed(sk)) {
write_lock_bh(&ping_table.lock);
hlist_nulls_del(&sk->sk_nulls_node);
+ sk_nulls_node_init(&sk->sk_nulls_node);
sock_put(sk);
isk->inet_num = 0;
isk->inet_sport = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a78540f28276..f45f2a12f37b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -902,6 +902,10 @@ static int ip_error(struct sk_buff *skb)
bool send;
int code;
+ /* IP on this device is disabled. */
+ if (!in_dev)
+ goto out;
+
net = dev_net(rt->dst.dev);
if (!IN_DEV_FORWARD(in_dev)) {
switch (rt->dst.error) {
@@ -962,10 +966,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
if (dst_metric_locked(dst, RTAX_MTU))
return;
- if (dst->dev->mtu < mtu)
- return;
-
- if (rt->rt_pmtu && rt->rt_pmtu < mtu)
+ if (ipv4_mtu(dst) < mtu)
return;
if (mtu < ip_rt_min_pmtu)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c5cd9efebbc..f1377f2a0472 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -252,6 +252,7 @@
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
+#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
@@ -401,6 +402,7 @@ void tcp_init_sock(struct sock *sk)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = TCP_MSS_DEFAULT;
+ u64_stats_init(&tp->syncp);
tp->reordering = sysctl_tcp_reordering;
tcp_enable_early_retrans(tp);
@@ -2592,11 +2594,12 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif
/* Return information about state of tcp endpoint in API format. */
-void tcp_get_info(const struct sock *sk, struct tcp_info *info)
+void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
+ unsigned int start;
u32 rate;
memset(info, 0, sizeof(*info));
@@ -2663,6 +2666,12 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
rate = READ_ONCE(sk->sk_max_pacing_rate);
info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&tp->syncp);
+ info->tcpi_bytes_acked = tp->bytes_acked;
+ info->tcpi_bytes_received = tp->bytes_received;
+ } while (u64_stats_fetch_retry_irq(&tp->syncp, start));
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2734,6 +2743,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
return 0;
}
+ case TCP_CC_INFO: {
+ const struct tcp_congestion_ops *ca_ops;
+ union tcp_cc_info info;
+ size_t sz = 0;
+ int attr;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ ca_ops = icsk->icsk_ca_ops;
+ if (ca_ops && ca_ops->get_info)
+ sz = ca_ops->get_info(sk, ~0U, &attr, &info);
+
+ len = min_t(unsigned int, len, sz);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &info, len))
+ return -EFAULT;
+ return 0;
+ }
case TCP_QUICKACK:
val = !icsk->icsk_ack.pingpong;
break;
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 4376016f7fa5..4c41c1287197 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -277,7 +277,8 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
}
}
-static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
{
const struct dctcp *ca = inet_csk_ca(sk);
@@ -286,18 +287,17 @@ static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
*/
if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- struct tcp_dctcp_info info;
-
- memset(&info, 0, sizeof(info));
+ memset(info, 0, sizeof(struct tcp_dctcp_info));
if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
- info.dctcp_enabled = 1;
- info.dctcp_ce_state = (u16) ca->ce_state;
- info.dctcp_alpha = ca->dctcp_alpha;
- info.dctcp_ab_ecn = ca->acked_bytes_ecn;
- info.dctcp_ab_tot = ca->acked_bytes_total;
+ info->dctcp.dctcp_enabled = 1;
+ info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
+ info->dctcp.dctcp_alpha = ca->dctcp_alpha;
+ info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn;
+ info->dctcp.dctcp_ab_tot = ca->acked_bytes_total;
}
- return nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
+ *attr = INET_DIAG_DCTCPINFO;
+ return sizeof(*info);
}
return 0;
}
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index e3d87aca6be8..46b087a27503 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -206,6 +206,11 @@ static bool tcp_fastopen_create_child(struct sock *sk,
skb_set_owner_r(skb2, child);
__skb_queue_tail(&child->sk_receive_queue, skb2);
tp->syn_data_acked = 1;
+
+ /* u64_stats_update_begin(&tp->syncp) not needed here,
+ * as we certainly are not changing upper 32bit value (0)
+ */
+ tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1;
} else {
end_seq = TCP_SKB_CB(skb)->seq + 1;
}
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 67476f085e48..f71002e4db0b 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -300,24 +300,25 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
}
/* Extract info for Tcp socket info provided via netlink. */
-static int tcp_illinois_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
{
const struct illinois *ca = inet_csk_ca(sk);
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- struct tcpvegas_info info = {
- .tcpv_enabled = 1,
- .tcpv_rttcnt = ca->cnt_rtt,
- .tcpv_minrtt = ca->base_rtt,
- };
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = ca->cnt_rtt;
+ info->vegas.tcpv_minrtt = ca->base_rtt;
+ info->vegas.tcpv_rtt = 0;
- if (info.tcpv_rttcnt > 0) {
+ if (info->vegas.tcpv_rttcnt > 0) {
u64 t = ca->sum_rtt;
- do_div(t, info.tcpv_rttcnt);
- info.tcpv_rtt = t;
+ do_div(t, info->vegas.tcpv_rttcnt);
+ info->vegas.tcpv_rtt = t;
}
- return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
}
return 0;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3a4d9b34bed4..c9ab964189a0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1820,14 +1820,12 @@ advance_sp:
for (j = 0; j < used_sacks; j++)
tp->recv_sack_cache[i++] = sp[j];
- tcp_mark_lost_retrans(sk);
-
- tcp_verify_left_out(tp);
-
if ((state.reord < tp->fackets_out) &&
((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
+ tcp_mark_lost_retrans(sk);
+ tcp_verify_left_out(tp);
out:
#if FASTRETRANS_DEBUG > 0
@@ -2700,16 +2698,21 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
+ if ((flag & FLAG_SND_UNA_ADVANCED) &&
+ tcp_try_undo_loss(sk, false))
+ return;
+
if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
/* Step 3.b. A timeout is spurious if not all data are
* lost, i.e., never-retransmitted data are (s)acked.
*/
- if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED))
+ if ((flag & FLAG_ORIG_SACK_ACKED) &&
+ tcp_try_undo_loss(sk, true))
return;
- if (after(tp->snd_nxt, tp->high_seq) &&
- (flag & FLAG_DATA_SACKED || is_dupack)) {
- tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
+ if (after(tp->snd_nxt, tp->high_seq)) {
+ if (flag & FLAG_DATA_SACKED || is_dupack)
+ tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
__tcp_push_pending_frames(sk, tcp_current_mss(sk),
@@ -2734,8 +2737,6 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
- if (tcp_try_undo_loss(sk, false))
- return;
tcp_xmit_retransmit_queue(sk);
}
@@ -3280,6 +3281,28 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}
+/* If we update tp->snd_una, also update tp->bytes_acked */
+static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
+{
+ u32 delta = ack - tp->snd_una;
+
+ u64_stats_update_begin(&tp->syncp);
+ tp->bytes_acked += delta;
+ u64_stats_update_end(&tp->syncp);
+ tp->snd_una = ack;
+}
+
+/* If we update tp->rcv_nxt, also update tp->bytes_received */
+static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
+{
+ u32 delta = seq - tp->rcv_nxt;
+
+ u64_stats_update_begin(&tp->syncp);
+ tp->bytes_received += delta;
+ u64_stats_update_end(&tp->syncp);
+ tp->rcv_nxt = seq;
+}
+
/* Update our send window.
*
* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
@@ -3315,7 +3338,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
}
}
- tp->snd_una = ack;
+ tcp_snd_una_update(tp, ack);
return flag;
}
@@ -3497,7 +3520,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
tcp_update_wl(tp, ack_seq);
- tp->snd_una = ack;
+ tcp_snd_una_update(tp, ack);
flag |= FLAG_WIN_UPDATE;
tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
@@ -4236,7 +4259,7 @@ static void tcp_ofo_queue(struct sock *sk)
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (!eaten)
__skb_queue_tail(&sk->sk_receive_queue, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4404,7 +4427,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
__skb_pull(skb, hdrlen);
eaten = (tail &&
tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
- tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
if (!eaten) {
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
@@ -4497,7 +4520,7 @@ queue_and_out:
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -5245,7 +5268,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_rcv_rtt_measure_ts(sk, skb);
__skb_pull(skb, tcp_header_len);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
eaten = 1;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3571f2be4470..fc1c658ec6c1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1348,7 +1348,8 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
if (req) {
nsk = tcp_check_req(sk, skb, req, false);
- reqsk_put(req);
+ if (!nsk)
+ reqsk_put(req);
return nsk;
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 63d6311b5365..b5732a54f2ad 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -300,7 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_v6_daddr = sk->sk_v6_daddr;
tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_tclass = np->tclass;
- tw->tw_flowlabel = np->flow_label >> 12;
+ tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
tw->tw_ipv6only = sk->sk_ipv6only;
}
#endif
@@ -755,10 +755,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!child)
goto listen_overflow;
- inet_csk_reqsk_queue_unlink(sk, req);
- inet_csk_reqsk_queue_removed(sk, req);
-
+ inet_csk_reqsk_queue_drop(sk, req);
inet_csk_reqsk_queue_add(sk, req, child);
+ /* Warning: caller must not call reqsk_put(req);
+ * child stole last reference on it.
+ */
return child;
listen_overflow:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8c8d7e06b72f..a369e8a70b2c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2812,39 +2812,65 @@ begin_fwd:
}
}
-/* Send a fin. The caller locks the socket for us. This cannot be
- * allowed to fail queueing a FIN frame under any circumstances.
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ */
+static void sk_forced_wmem_schedule(struct sock *sk, int size)
+{
+ int amt, status;
+
+ if (size <= sk->sk_forward_alloc)
+ return;
+ amt = sk_mem_pages(size);
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+ sk_memory_allocated_add(sk, amt, &status);
+}
+
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
*/
void tcp_send_fin(struct sock *sk)
{
+ struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = tcp_write_queue_tail(sk);
- int mss_now;
- /* Optimization, tack on the FIN if we have a queue of
- * unsent frames. But be careful about outgoing SACKS
- * and IP options.
+ /* Optimization, tack on the FIN if we have one skb in write queue and
+ * this skb was not yet sent, or we are under memory pressure.
+ * Note: in the latter case, FIN packet will be sent after a timeout,
+ * as TCP stack thinks it has already been transmitted.
*/
- mss_now = tcp_current_mss(sk);
-
- if (tcp_send_head(sk)) {
- TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
- TCP_SKB_CB(skb)->end_seq++;
+ if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+coalesce:
+ TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+ TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
+ if (!tcp_send_head(sk)) {
+ /* This means tskb was already sent.
+ * Pretend we included the FIN on previous transmit.
+ * We need to set tp->snd_nxt to the value it would have
+ * if FIN had been sent. This is because retransmit path
+ * does not change tp->snd_nxt.
+ */
+ tp->snd_nxt++;
+ return;
+ }
} else {
- /* Socket is locked, keep trying until memory is available. */
- for (;;) {
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
- if (skb)
- break;
- yield();
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ if (tskb)
+ goto coalesce;
+ return;
}
+ skb_reserve(skb, MAX_TCP_HEADER);
+ sk_forced_wmem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index c71a1b8f7bde..a6cea1d5e20d 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -286,18 +286,19 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
}
/* Extract info for Tcp socket info provided via netlink. */
-int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
{
const struct vegas *ca = inet_csk_ca(sk);
+
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- struct tcpvegas_info info = {
- .tcpv_enabled = ca->doing_vegas_now,
- .tcpv_rttcnt = ca->cntRTT,
- .tcpv_rtt = ca->baseRTT,
- .tcpv_minrtt = ca->minRTT,
- };
-
- return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+ info->vegas.tcpv_enabled = ca->doing_vegas_now,
+ info->vegas.tcpv_rttcnt = ca->cntRTT,
+ info->vegas.tcpv_rtt = ca->baseRTT,
+ info->vegas.tcpv_minrtt = ca->minRTT,
+
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
}
return 0;
}
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index e8a6b33cc61d..ef9da5306c68 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -19,6 +19,7 @@ void tcp_vegas_init(struct sock *sk);
void tcp_vegas_state(struct sock *sk, u8 ca_state);
void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
-int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info);
#endif /* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index b3c57cceb990..c10732e39837 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -256,18 +256,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
}
/* Extract info for Tcp socket info provided via netlink. */
-static int tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
{
const struct westwood *ca = inet_csk_ca(sk);
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- struct tcpvegas_info info = {
- .tcpv_enabled = 1,
- .tcpv_rtt = jiffies_to_usecs(ca->rtt),
- .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
- };
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = 0;
+ info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt),
+ info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
- return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
}
return 0;
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 96dbffff5a24..bde57b113009 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -693,6 +693,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
{
struct rt6_info *iter = NULL;
struct rt6_info **ins;
+ struct rt6_info **fallback_ins = NULL;
int replace = (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_REPLACE));
int add = (!info->nlh ||
@@ -716,8 +717,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
(info->nlh->nlmsg_flags & NLM_F_EXCL))
return -EEXIST;
if (replace) {
- found++;
- break;
+ if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
+ found++;
+ break;
+ }
+ if (rt_can_ecmp)
+ fallback_ins = fallback_ins ?: ins;
+ goto next_iter;
}
if (iter->dst.dev == rt->dst.dev &&
@@ -753,9 +759,17 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
if (iter->rt6i_metric > rt->rt6i_metric)
break;
+next_iter:
ins = &iter->dst.rt6_next;
}
+ if (fallback_ins && !found) {
+ /* No ECMP-able route found, replace first non-ECMP one */
+ ins = fallback_ins;
+ iter = *ins;
+ found++;
+ }
+
/* Reset round-robin state, if necessary */
if (ins == &fn->leaf)
fn->rr_ptr = NULL;
@@ -815,6 +829,8 @@ add:
}
} else {
+ int nsiblings;
+
if (!found) {
if (add)
goto add;
@@ -835,8 +851,27 @@ add:
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO;
}
+ nsiblings = iter->rt6i_nsiblings;
fib6_purge_rt(iter, fn, info->nl_net);
rt6_release(iter);
+
+ if (nsiblings) {
+ /* Replacing an ECMP route, remove all siblings */
+ ins = &rt->dst.rt6_next;
+ iter = *ins;
+ while (iter) {
+ if (rt6_qualify_for_ecmp(iter)) {
+ *ins = iter->dst.rt6_next;
+ fib6_purge_rt(iter, fn, info->nl_net);
+ rt6_release(iter);
+ nsiblings--;
+ } else {
+ ins = &iter->dst.rt6_next;
+ }
+ iter = *ins;
+ }
+ WARN_ON(nsiblings != 0);
+ }
}
return 0;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index b5e6cc1d4a73..a38d3ac0f18f 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1246,7 +1246,6 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
static int ip6gre_tunnel_init(struct net_device *dev)
{
struct ip6_tnl *tunnel;
- int i;
tunnel = netdev_priv(dev);
@@ -1260,16 +1259,10 @@ static int ip6gre_tunnel_init(struct net_device *dev)
if (ipv6_addr_any(&tunnel->parms.raddr))
dev->header_ops = &ip6gre_header_ops;
- dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
- for_each_possible_cpu(i) {
- struct pcpu_sw_netstats *ip6gre_tunnel_stats;
- ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i);
- u64_stats_init(&ip6gre_tunnel_stats->syncp);
- }
-
return 0;
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7fde1f265c90..bc09cb97b840 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -886,22 +886,45 @@ static int ip6_dst_lookup_tail(struct sock *sk,
#endif
int err;
- if (!*dst)
- *dst = ip6_route_output(net, sk, fl6);
-
- err = (*dst)->error;
- if (err)
- goto out_err_release;
+ /* The correct way to handle this would be to do
+ * ip6_route_get_saddr, and then ip6_route_output; however,
+ * the route-specific preferred source forces the
+ * ip6_route_output call _before_ ip6_route_get_saddr.
+ *
+ * In source specific routing (no src=any default route),
+ * ip6_route_output will fail given src=any saddr, though, so
+ * that's why we try it again later.
+ */
+ if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
+ struct rt6_info *rt;
+ bool had_dst = *dst != NULL;
- if (ipv6_addr_any(&fl6->saddr)) {
- struct rt6_info *rt = (struct rt6_info *) *dst;
+ if (!had_dst)
+ *dst = ip6_route_output(net, sk, fl6);
+ rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
err = ip6_route_get_saddr(net, rt, &fl6->daddr,
sk ? inet6_sk(sk)->srcprefs : 0,
&fl6->saddr);
if (err)
goto out_err_release;
+
+ /* If we had an erroneous initial result, pretend it
+ * never existed and let the SA-enabled version take
+ * over.
+ */
+ if (!had_dst && (*dst)->error) {
+ dst_release(*dst);
+ *dst = NULL;
+ }
}
+ if (!*dst)
+ *dst = ip6_route_output(net, sk, fl6);
+
+ err = (*dst)->error;
+ if (err)
+ goto out_err_release;
+
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
/*
* Here if the dst entry we've looked up
@@ -1277,8 +1300,10 @@ emsgsize:
/* If this is the first and only packet and device
* supports checksum offloading, let's use it.
+ * Use transhdrlen, same as IPv4, because partial
+ * sums only work when transhdrlen is set.
*/
- if (!skb && sk->sk_protocol == IPPROTO_UDP &&
+ if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
length + fragheaderlen < mtu &&
rt->dst.dev->features & NETIF_F_V6_CSUM &&
!exthdrlen)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 1a732a1d3c8e..62f5b0d0bc9b 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1275,6 +1275,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
/* overflow check */
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
@@ -1822,6 +1825,9 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5c48293ff062..c73ae5039e46 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2245,9 +2245,10 @@ int ip6_route_get_saddr(struct net *net,
unsigned int prefs,
struct in6_addr *saddr)
{
- struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt);
+ struct inet6_dev *idev =
+ rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
int err = 0;
- if (rt->rt6i_prefsrc.plen)
+ if (rt && rt->rt6i_prefsrc.plen)
*saddr = rt->rt6i_prefsrc.addr;
else
err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
@@ -2503,9 +2504,9 @@ static int ip6_route_multipath(struct fib6_config *cfg, int add)
int attrlen;
int err = 0, last_err = 0;
+ remaining = cfg->fc_mp_len;
beginning:
rtnh = (struct rtnexthop *)cfg->fc_mp;
- remaining = cfg->fc_mp_len;
/* Parse a Multipath Entry */
while (rtnh_ok(rtnh, remaining)) {
@@ -2535,15 +2536,19 @@ beginning:
* next hops that have been already added.
*/
add = 0;
+ remaining = cfg->fc_mp_len - remaining;
goto beginning;
}
}
/* Because each route is added like a single route we remove
- * this flag after the first nexthop (if there is a collision,
- * we have already fail to add the first nexthop:
- * fib6_add_rt2node() has reject it).
+ * these flags after the first nexthop: if there is a collision,
+ * we have already failed to add the first nexthop:
+ * fib6_add_rt2node() has rejected it; when replacing, old
+ * nexthops have been replaced by first new, the rest should
+ * be added to it.
*/
- cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL;
+ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
+ NLM_F_REPLACE);
rtnh = rtnh_next(rtnh, &remaining);
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ad51df85aa00..3adffb300238 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -914,7 +914,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
- tw->tw_tclass, (tw->tw_flowlabel << 12));
+ tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel));
inet_twsk_put(tw);
}
@@ -946,7 +946,8 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
&ipv6_hdr(skb)->daddr, tcp_v6_iif(skb));
if (req) {
nsk = tcp_check_req(sk, skb, req, false);
- reqsk_put(req);
+ if (!nsk)
+ reqsk_put(req);
return nsk;
}
nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3477c919fcc8..c2ec41617a35 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -731,7 +731,9 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
(inet->inet_dport && inet->inet_dport != rmt_port) ||
(!ipv6_addr_any(&sk->sk_v6_daddr) &&
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
- (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+ (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)))
return false;
if (!inet6_mc_check(sk, loc_addr, rmt_addr))
return false;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 265e42721a66..ff347a0eebd4 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2495,51 +2495,22 @@ static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local,
struct ieee80211_roc_work *new_roc,
struct ieee80211_roc_work *cur_roc)
{
- unsigned long j = jiffies;
- unsigned long cur_roc_end = cur_roc->hw_start_time +
- msecs_to_jiffies(cur_roc->duration);
- struct ieee80211_roc_work *next_roc;
- int new_dur;
+ unsigned long now = jiffies;
+ unsigned long remaining = cur_roc->hw_start_time +
+ msecs_to_jiffies(cur_roc->duration) -
+ now;
if (WARN_ON(!cur_roc->started || !cur_roc->hw_begun))
return false;
- if (time_after(j + IEEE80211_ROC_MIN_LEFT, cur_roc_end))
+ /* if it doesn't fit entirely, schedule a new one */
+ if (new_roc->duration > jiffies_to_msecs(remaining))
return false;
ieee80211_handle_roc_started(new_roc);
- new_dur = new_roc->duration - jiffies_to_msecs(cur_roc_end - j);
-
- /* cur_roc is long enough - add new_roc to the dependents list. */
- if (new_dur <= 0) {
- list_add_tail(&new_roc->list, &cur_roc->dependents);
- return true;
- }
-
- new_roc->duration = new_dur;
-
- /*
- * if cur_roc was already coalesced before, we might
- * want to extend the next roc instead of adding
- * a new one.
- */
- next_roc = list_entry(cur_roc->list.next,
- struct ieee80211_roc_work, list);
- if (&next_roc->list != &local->roc_list &&
- next_roc->chan == new_roc->chan &&
- next_roc->sdata == new_roc->sdata &&
- !WARN_ON(next_roc->started)) {
- list_add_tail(&new_roc->list, &next_roc->dependents);
- next_roc->duration = max(next_roc->duration,
- new_roc->duration);
- next_roc->type = max(next_roc->type, new_roc->type);
- return true;
- }
-
- /* add right after cur_roc */
- list_add(&new_roc->list, &cur_roc->list);
-
+ /* add to dependents so we send the expired event properly */
+ list_add_tail(&new_roc->list, &cur_roc->dependents);
return true;
}
@@ -2652,17 +2623,9 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local,
* In the offloaded ROC case, if it hasn't begun, add
* this new one to the dependent list to be handled
* when the master one begins. If it has begun,
- * check that there's still a minimum time left and
- * if so, start this one, transmitting the frame, but
- * add it to the list directly after this one with
- * a reduced time so we'll ask the driver to execute
- * it right after finishing the previous one, in the
- * hope that it'll also be executed right afterwards,
- * effectively extending the old one.
- * If there's no minimum time left, just add it to the
- * normal list.
- * TODO: the ROC type is ignored here, assuming that it
- * is better to immediately use the current ROC.
+ * check if it fits entirely within the existing one,
+ * in which case it will just be dependent as well.
+ * Otherwise, schedule it by itself.
*/
if (!tmp->hw_begun) {
list_add_tail(&roc->list, &tmp->dependents);
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index ab46ab4a7249..c0a9187bc3a9 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -205,6 +205,8 @@ enum ieee80211_packet_rx_flags {
* @IEEE80211_RX_CMNTR: received on cooked monitor already
* @IEEE80211_RX_BEACON_REPORTED: This frame was already reported
* to cfg80211_report_obss_beacon().
+ * @IEEE80211_RX_REORDER_TIMER: this frame is released by the
+ * reorder buffer timeout timer, not the normal RX path
*
* These flags are used across handling multiple interfaces
* for a single frame.
@@ -212,6 +214,7 @@ enum ieee80211_packet_rx_flags {
enum ieee80211_rx_flags {
IEEE80211_RX_CMNTR = BIT(0),
IEEE80211_RX_BEACON_REPORTED = BIT(1),
+ IEEE80211_RX_REORDER_TIMER = BIT(2),
};
struct ieee80211_rx_data {
@@ -325,12 +328,6 @@ struct mesh_preq_queue {
u8 flags;
};
-#if HZ/100 == 0
-#define IEEE80211_ROC_MIN_LEFT 1
-#else
-#define IEEE80211_ROC_MIN_LEFT (HZ/100)
-#endif
-
struct ieee80211_roc_work {
struct list_head list;
struct list_head dependents;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index b4ac596a7cb7..84cef600c573 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -522,6 +522,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
memcpy(sdata->vif.hw_queue, master->vif.hw_queue,
sizeof(sdata->vif.hw_queue));
sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef;
+
+ mutex_lock(&local->key_mtx);
+ sdata->crypto_tx_tailroom_needed_cnt +=
+ master->crypto_tx_tailroom_needed_cnt;
+ mutex_unlock(&local->key_mtx);
+
break;
}
case NL80211_IFTYPE_AP:
@@ -819,13 +825,15 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
* (because if we remove a STA after ops->remove_interface()
* the driver will have removed the vif info already!)
*
- * This is relevant only in WDS mode, in all other modes we've
- * already removed all stations when disconnecting or similar,
- * so warn otherwise.
+ * In WDS mode a station must exist here and be flushed, for
+ * AP_VLANs stations may exist since there's nothing else that
+ * would have removed them, but in other modes there shouldn't
+ * be any stations.
*/
flushed = sta_info_flush(sdata);
- WARN_ON_ONCE((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) ||
- (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1));
+ WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+ ((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) ||
+ (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1)));
/* don't count this interface for promisc/allmulti while it is down */
if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 2291cd730091..a907f2d5c12d 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -58,6 +58,22 @@ static void assert_key_lock(struct ieee80211_local *local)
lockdep_assert_held(&local->key_mtx);
}
+static void
+update_vlan_tailroom_need_count(struct ieee80211_sub_if_data *sdata, int delta)
+{
+ struct ieee80211_sub_if_data *vlan;
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP)
+ return;
+
+ mutex_lock(&sdata->local->mtx);
+
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ vlan->crypto_tx_tailroom_needed_cnt += delta;
+
+ mutex_unlock(&sdata->local->mtx);
+}
+
static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata)
{
/*
@@ -79,6 +95,8 @@ static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata)
* http://mid.gmane.org/1308590980.4322.19.camel@jlt3.sipsolutions.net
*/
+ update_vlan_tailroom_need_count(sdata, 1);
+
if (!sdata->crypto_tx_tailroom_needed_cnt++) {
/*
* Flush all XMIT packets currently using HW encryption or no
@@ -88,6 +106,15 @@ static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata)
}
}
+static void decrease_tailroom_need_count(struct ieee80211_sub_if_data *sdata,
+ int delta)
+{
+ WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt < delta);
+
+ update_vlan_tailroom_need_count(sdata, -delta);
+ sdata->crypto_tx_tailroom_needed_cnt -= delta;
+}
+
static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
{
struct ieee80211_sub_if_data *sdata;
@@ -144,7 +171,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) ||
(key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM)))
- sdata->crypto_tx_tailroom_needed_cnt--;
+ decrease_tailroom_need_count(sdata, 1);
WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) &&
(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV));
@@ -541,7 +568,7 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key,
schedule_delayed_work(&sdata->dec_tailroom_needed_wk,
HZ/2);
} else {
- sdata->crypto_tx_tailroom_needed_cnt--;
+ decrease_tailroom_need_count(sdata, 1);
}
}
@@ -631,6 +658,7 @@ void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom)
void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_key *key;
+ struct ieee80211_sub_if_data *vlan;
ASSERT_RTNL();
@@ -639,7 +667,14 @@ void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata)
mutex_lock(&sdata->local->key_mtx);
- sdata->crypto_tx_tailroom_needed_cnt = 0;
+ WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt ||
+ sdata->crypto_tx_tailroom_pending_dec);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt ||
+ vlan->crypto_tx_tailroom_pending_dec);
+ }
list_for_each_entry(key, &sdata->key_list, list) {
increment_tailroom_need_count(sdata);
@@ -649,6 +684,22 @@ void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata)
mutex_unlock(&sdata->local->key_mtx);
}
+void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_sub_if_data *vlan;
+
+ mutex_lock(&sdata->local->key_mtx);
+
+ sdata->crypto_tx_tailroom_needed_cnt = 0;
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ vlan->crypto_tx_tailroom_needed_cnt = 0;
+ }
+
+ mutex_unlock(&sdata->local->key_mtx);
+}
+
void ieee80211_iter_keys(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
void (*iter)(struct ieee80211_hw *hw,
@@ -688,8 +739,8 @@ static void ieee80211_free_keys_iface(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_key *key, *tmp;
- sdata->crypto_tx_tailroom_needed_cnt -=
- sdata->crypto_tx_tailroom_pending_dec;
+ decrease_tailroom_need_count(sdata,
+ sdata->crypto_tx_tailroom_pending_dec);
sdata->crypto_tx_tailroom_pending_dec = 0;
ieee80211_debugfs_key_remove_mgmt_default(sdata);
@@ -709,6 +760,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_sub_if_data *vlan;
+ struct ieee80211_sub_if_data *master;
struct ieee80211_key *key, *tmp;
LIST_HEAD(keys);
@@ -728,8 +780,20 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata,
list_for_each_entry_safe(key, tmp, &keys, list)
__ieee80211_key_destroy(key, false);
- WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt ||
- sdata->crypto_tx_tailroom_pending_dec);
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ if (sdata->bss) {
+ master = container_of(sdata->bss,
+ struct ieee80211_sub_if_data,
+ u.ap);
+
+ WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt !=
+ master->crypto_tx_tailroom_needed_cnt);
+ }
+ } else {
+ WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt ||
+ sdata->crypto_tx_tailroom_pending_dec);
+ }
+
if (sdata->vif.type == NL80211_IFTYPE_AP) {
list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt ||
@@ -793,8 +857,8 @@ void ieee80211_delayed_tailroom_dec(struct work_struct *wk)
*/
mutex_lock(&sdata->local->key_mtx);
- sdata->crypto_tx_tailroom_needed_cnt -=
- sdata->crypto_tx_tailroom_pending_dec;
+ decrease_tailroom_need_count(sdata,
+ sdata->crypto_tx_tailroom_pending_dec);
sdata->crypto_tx_tailroom_pending_dec = 0;
mutex_unlock(&sdata->local->key_mtx);
}
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index c5a31835be0e..96557dd1e77d 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -161,6 +161,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata,
void ieee80211_free_sta_keys(struct ieee80211_local *local,
struct sta_info *sta);
void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata);
+void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata);
#define key_mtx_dereference(local, ref) \
rcu_dereference_protected(ref, lockdep_is_held(&((local)->key_mtx)))
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 260eed45b6d2..5793f75c5ffd 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2121,7 +2121,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
/* deliver to local stack */
skb->protocol = eth_type_trans(skb, dev);
memset(skb->cb, 0, sizeof(skb->cb));
- if (rx->local->napi)
+ if (!(rx->flags & IEEE80211_RX_REORDER_TIMER) &&
+ rx->local->napi)
napi_gro_receive(rx->local->napi, skb);
else
netif_receive_skb(skb);
@@ -3231,7 +3232,7 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
/* This is OK -- must be QoS data frame */
.security_idx = tid,
.seqno_idx = tid,
- .flags = 0,
+ .flags = IEEE80211_RX_REORDER_TIMER,
};
struct tid_ampdu_rx *tid_agg_rx;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 12971b71d0fa..2880f2ae99ab 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -66,6 +66,7 @@
static const struct rhashtable_params sta_rht_params = {
.nelem_hint = 3, /* start small */
+ .automatic_shrinking = true,
.head_offset = offsetof(struct sta_info, hash_node),
.key_offset = offsetof(struct sta_info, sta.addr),
.key_len = ETH_ALEN,
@@ -157,8 +158,24 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
const u8 *addr)
{
struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ struct rhash_head *tmp;
+ const struct bucket_table *tbl;
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
- return rhashtable_lookup_fast(&local->sta_hash, addr, sta_rht_params);
+ for_each_sta_info(local, tbl, addr, sta, tmp) {
+ if (sta->sdata == sdata) {
+ rcu_read_unlock();
+ /* this is safe as the caller must already hold
+ * another rcu read section or the mutex
+ */
+ return sta;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
}
/*
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 79412f16b61d..b864ebc6ab8f 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2023,6 +2023,9 @@ int ieee80211_reconfig(struct ieee80211_local *local)
/* add back keys */
list_for_each_entry(sdata, &local->interfaces, list)
+ ieee80211_reset_crypto_tx_tailroom(sdata);
+
+ list_for_each_entry(sdata, &local->interfaces, list)
if (ieee80211_sdata_running(sdata))
ieee80211_enable_keys(sdata);
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index a4220e92f0cc..efa3f48f1ec5 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -98,8 +98,7 @@ static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local,
hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
- if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN ||
- skb_headroom(skb) < IEEE80211_WEP_IV_LEN))
+ if (WARN_ON(skb_headroom(skb) < IEEE80211_WEP_IV_LEN))
return NULL;
hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -167,6 +166,9 @@ int ieee80211_wep_encrypt(struct ieee80211_local *local,
size_t len;
u8 rc4key[3 + WLAN_KEY_LEN_WEP104];
+ if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN))
+ return -1;
+
iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx);
if (!iv)
return -1;
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index 5d9f68c75e5f..70be9c799f8a 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -22,13 +22,14 @@
static struct net_device *
ieee802154_add_iface_deprecated(struct wpan_phy *wpan_phy,
- const char *name, int type)
+ const char *name,
+ unsigned char name_assign_type, int type)
{
struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
struct net_device *dev;
rtnl_lock();
- dev = ieee802154_if_add(local, name, type,
+ dev = ieee802154_if_add(local, name, name_assign_type, type,
cpu_to_le64(0x0000000000000000ULL));
rtnl_unlock();
@@ -45,12 +46,14 @@ static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy,
static int
ieee802154_add_iface(struct wpan_phy *phy, const char *name,
+ unsigned char name_assign_type,
enum nl802154_iftype type, __le64 extended_addr)
{
struct ieee802154_local *local = wpan_phy_priv(phy);
struct net_device *err;
- err = ieee802154_if_add(local, name, type, extended_addr);
+ err = ieee802154_if_add(local, name, name_assign_type, type,
+ extended_addr);
return PTR_ERR_OR_ZERO(err);
}
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index bebd70ffc7a3..127ba18386fc 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -182,7 +182,8 @@ void ieee802154_iface_exit(void);
void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata);
struct net_device *
ieee802154_if_add(struct ieee802154_local *local, const char *name,
- enum nl802154_iftype type, __le64 extended_addr);
+ unsigned char name_assign_type, enum nl802154_iftype type,
+ __le64 extended_addr);
void ieee802154_remove_interfaces(struct ieee802154_local *local);
#endif /* __IEEE802154_I_H */
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 38b56f9d9386..91b75abbd1a1 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -522,7 +522,8 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
struct net_device *
ieee802154_if_add(struct ieee802154_local *local, const char *name,
- enum nl802154_iftype type, __le64 extended_addr)
+ unsigned char name_assign_type, enum nl802154_iftype type,
+ __le64 extended_addr)
{
struct net_device *ndev = NULL;
struct ieee802154_sub_if_data *sdata = NULL;
@@ -531,7 +532,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
ASSERT_RTNL();
ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name,
- NET_NAME_UNKNOWN, ieee802154_if_setup);
+ name_assign_type, ieee802154_if_setup);
if (!ndev)
return ERR_PTR(-ENOMEM);
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index dcf73958133a..5b2be12832e6 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -134,7 +134,7 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template)
for (i = 0; i < ARRAY_SIZE(key->tfm); i++) {
key->tfm[i] = crypto_alloc_aead("ccm(aes)", 0,
CRYPTO_ALG_ASYNC);
- if (!key->tfm[i])
+ if (IS_ERR(key->tfm[i]))
goto err_tfm;
if (crypto_aead_setkey(key->tfm[i], template->key,
IEEE802154_LLSEC_KEY_SIZE))
@@ -144,7 +144,7 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template)
}
key->tfm0 = crypto_alloc_blkcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
- if (!key->tfm0)
+ if (IS_ERR(key->tfm0))
goto err_tfm;
if (crypto_blkcipher_setkey(key->tfm0, template->key,
diff --git a/net/mac802154/main.c b/net/mac802154/main.c
index 8500378c8318..08cb32dc8fd3 100644
--- a/net/mac802154/main.c
+++ b/net/mac802154/main.c
@@ -161,18 +161,21 @@ int ieee802154_register_hw(struct ieee802154_hw *hw)
rtnl_lock();
- dev = ieee802154_if_add(local, "wpan%d", NL802154_IFTYPE_NODE,
+ dev = ieee802154_if_add(local, "wpan%d", NET_NAME_ENUM,
+ NL802154_IFTYPE_NODE,
cpu_to_le64(0x0000000000000000ULL));
if (IS_ERR(dev)) {
rtnl_unlock();
rc = PTR_ERR(dev);
- goto out_wq;
+ goto out_phy;
}
rtnl_unlock();
return 0;
+out_phy:
+ wpan_phy_unregister(local->phy);
out_wq:
destroy_workqueue(local->workqueue);
out:
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index db8a2ea6d4de..7b3f732269e4 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -53,6 +53,11 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
return rt;
}
+static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
+{
+ return rcu_dereference_rtnl(dev->mpls_ptr);
+}
+
static bool mpls_output_possible(const struct net_device *dev)
{
return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
@@ -136,6 +141,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
struct mpls_route *rt;
struct mpls_entry_decoded dec;
struct net_device *out_dev;
+ struct mpls_dev *mdev;
unsigned int hh_len;
unsigned int new_header_size;
unsigned int mtu;
@@ -143,6 +149,10 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
/* Careful this entire function runs inside of an rcu critical section */
+ mdev = mpls_dev_get(dev);
+ if (!mdev || !mdev->input_enabled)
+ goto drop;
+
if (skb->pkt_type != PACKET_HOST)
goto drop;
@@ -352,9 +362,9 @@ static int mpls_route_add(struct mpls_route_config *cfg)
if (!dev)
goto errout;
- /* For now just support ethernet devices */
+ /* Ensure this is a supported device */
err = -EINVAL;
- if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK))
+ if (!mpls_dev_get(dev))
goto errout;
err = -EINVAL;
@@ -428,10 +438,89 @@ errout:
return err;
}
+#define MPLS_PERDEV_SYSCTL_OFFSET(field) \
+ (&((struct mpls_dev *)0)->field)
+
+static const struct ctl_table mpls_dev_table[] = {
+ {
+ .procname = "input",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled),
+ },
+ { }
+};
+
+static int mpls_dev_sysctl_register(struct net_device *dev,
+ struct mpls_dev *mdev)
+{
+ char path[sizeof("net/mpls/conf/") + IFNAMSIZ];
+ struct ctl_table *table;
+ int i;
+
+ table = kmemdup(&mpls_dev_table, sizeof(mpls_dev_table), GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ /* Table data contains only offsets relative to the base of
+ * the mdev at this point, so make them absolute.
+ */
+ for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++)
+ table[i].data = (char *)mdev + (uintptr_t)table[i].data;
+
+ snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name);
+
+ mdev->sysctl = register_net_sysctl(dev_net(dev), path, table);
+ if (!mdev->sysctl)
+ goto free;
+
+ return 0;
+
+free:
+ kfree(table);
+out:
+ return -ENOBUFS;
+}
+
+static void mpls_dev_sysctl_unregister(struct mpls_dev *mdev)
+{
+ struct ctl_table *table;
+
+ table = mdev->sysctl->ctl_table_arg;
+ unregister_net_sysctl_table(mdev->sysctl);
+ kfree(table);
+}
+
+static struct mpls_dev *mpls_add_dev(struct net_device *dev)
+{
+ struct mpls_dev *mdev;
+ int err = -ENOMEM;
+
+ ASSERT_RTNL();
+
+ mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+ if (!mdev)
+ return ERR_PTR(err);
+
+ err = mpls_dev_sysctl_register(dev, mdev);
+ if (err)
+ goto free;
+
+ rcu_assign_pointer(dev->mpls_ptr, mdev);
+
+ return mdev;
+
+free:
+ kfree(mdev);
+ return ERR_PTR(err);
+}
+
static void mpls_ifdown(struct net_device *dev)
{
struct mpls_route __rcu **platform_label;
struct net *net = dev_net(dev);
+ struct mpls_dev *mdev;
unsigned index;
platform_label = rtnl_dereference(net->mpls.platform_label);
@@ -443,14 +532,35 @@ static void mpls_ifdown(struct net_device *dev)
continue;
rt->rt_dev = NULL;
}
+
+ mdev = mpls_dev_get(dev);
+ if (!mdev)
+ return;
+
+ mpls_dev_sysctl_unregister(mdev);
+
+ RCU_INIT_POINTER(dev->mpls_ptr, NULL);
+
+ kfree(mdev);
}
static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct mpls_dev *mdev;
switch(event) {
+ case NETDEV_REGISTER:
+ /* For now just support ethernet devices */
+ if ((dev->type == ARPHRD_ETHER) ||
+ (dev->type == ARPHRD_LOOPBACK)) {
+ mdev = mpls_add_dev(dev);
+ if (IS_ERR(mdev))
+ return notifier_from_errno(PTR_ERR(mdev));
+ }
+ break;
+
case NETDEV_UNREGISTER:
mpls_ifdown(dev);
break;
@@ -536,6 +646,15 @@ int nla_get_labels(const struct nlattr *nla,
if ((dec.bos != bos) || dec.ttl || dec.tc)
return -EINVAL;
+ switch (dec.label) {
+ case MPLS_LABEL_IMPLNULL:
+ /* RFC3032: This is a label that an LSR may
+ * assign and distribute, but which never
+ * actually appears in the encapsulation.
+ */
+ return -EINVAL;
+ }
+
label[i] = dec.label;
}
*labels = nla_labels;
@@ -816,7 +935,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
}
/* In case the predefined labels need to be populated */
- if (limit > LABEL_IPV4_EXPLICIT_NULL) {
+ if (limit > MPLS_LABEL_IPV4NULL) {
struct net_device *lo = net->loopback_dev;
rt0 = mpls_rt_alloc(lo->addr_len);
if (!rt0)
@@ -826,7 +945,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
rt0->rt_via_table = NEIGH_LINK_TABLE;
memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len);
}
- if (limit > LABEL_IPV6_EXPLICIT_NULL) {
+ if (limit > MPLS_LABEL_IPV6NULL) {
struct net_device *lo = net->loopback_dev;
rt2 = mpls_rt_alloc(lo->addr_len);
if (!rt2)
@@ -854,15 +973,15 @@ static int resize_platform_label_table(struct net *net, size_t limit)
memcpy(labels, old, cp_size);
/* If needed set the predefined labels */
- if ((old_limit <= LABEL_IPV6_EXPLICIT_NULL) &&
- (limit > LABEL_IPV6_EXPLICIT_NULL)) {
- RCU_INIT_POINTER(labels[LABEL_IPV6_EXPLICIT_NULL], rt2);
+ if ((old_limit <= MPLS_LABEL_IPV6NULL) &&
+ (limit > MPLS_LABEL_IPV6NULL)) {
+ RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6NULL], rt2);
rt2 = NULL;
}
- if ((old_limit <= LABEL_IPV4_EXPLICIT_NULL) &&
- (limit > LABEL_IPV4_EXPLICIT_NULL)) {
- RCU_INIT_POINTER(labels[LABEL_IPV4_EXPLICIT_NULL], rt0);
+ if ((old_limit <= MPLS_LABEL_IPV4NULL) &&
+ (limit > MPLS_LABEL_IPV4NULL)) {
+ RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4NULL], rt0);
rt0 = NULL;
}
@@ -912,7 +1031,7 @@ static int mpls_platform_labels(struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table mpls_table[] = {
+static const struct ctl_table mpls_table[] = {
{
.procname = "platform_labels",
.data = NULL,
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index fb6de92052c4..b064c345042c 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -1,16 +1,6 @@
#ifndef MPLS_INTERNAL_H
#define MPLS_INTERNAL_H
-#define LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */
-#define LABEL_ROUTER_ALERT_LABEL 1 /* RFC3032 */
-#define LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */
-#define LABEL_IMPLICIT_NULL 3 /* RFC3032 */
-#define LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */
-#define LABEL_GAL 13 /* RFC5586 */
-#define LABEL_OAM_ALERT 14 /* RFC3429 */
-#define LABEL_EXTENSION 15 /* RFC7274 */
-
-
struct mpls_shim_hdr {
__be32 label_stack_entry;
};
@@ -22,6 +12,12 @@ struct mpls_entry_decoded {
u8 bos;
};
+struct mpls_dev {
+ int input_enabled;
+
+ struct ctl_table_header *sysctl;
+};
+
struct sk_buff;
static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f70e34a68f70..a0f3e6a3c7d1 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -863,6 +863,7 @@ config NETFILTER_XT_TARGET_TPROXY
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
depends on (IPV6 || IPV6=n)
+ depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
depends on IP_NF_MANGLE
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
@@ -1356,6 +1357,7 @@ config NETFILTER_XT_MATCH_SOCKET
depends on NETFILTER_ADVANCED
depends on !NF_CONNTRACK || NF_CONNTRACK
depends on (IPV6 || IPV6=n)
+ depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
help
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 49532672f66d..285eae3a1454 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3823,6 +3823,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
ip_vs_stop_estimator(net, &ipvs->tot_stats);
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->sysctl_tbl);
}
#else
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 5caa0c41bf26..70383de72054 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -202,7 +202,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sES -> sES :-)
* sFW -> sCW Normal close request answered by ACK.
* sCW -> sCW
- * sLA -> sTW Last ACK detected.
+ * sLA -> sTW Last ACK detected (RFC5961 challenged)
* sTW -> sTW Retransmitted last ACK. Remain in the same state.
* sCL -> sCL
*/
@@ -261,7 +261,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sES -> sES :-)
* sFW -> sCW Normal close request answered by ACK.
* sCW -> sCW
- * sLA -> sTW Last ACK detected.
+ * sLA -> sTW Last ACK detected (RFC5961 challenged)
* sTW -> sTW Retransmitted last ACK.
* sCL -> sCL
*/
@@ -906,6 +906,7 @@ static int tcp_packet(struct nf_conn *ct,
1 : ct->proto.tcp.last_win;
ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
ct->proto.tcp.last_wscale;
+ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
ct->proto.tcp.last_flags;
memset(&ct->proto.tcp.seen[dir], 0,
@@ -923,7 +924,9 @@ static int tcp_packet(struct nf_conn *ct,
* may be in sync but we are not. In that case, we annotate
* the TCP options and let the packet go through. If it is a
* valid SYN packet, the server will reply with a SYN/ACK, and
- * then we'll get in sync. Otherwise, the server ignores it. */
+ * then we'll get in sync. Otherwise, the server potentially
+ * responds with a challenge ACK if implementing RFC5961.
+ */
if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
struct ip_ct_tcp_state seen = {};
@@ -939,6 +942,13 @@ static int tcp_packet(struct nf_conn *ct,
ct->proto.tcp.last_flags |=
IP_CT_TCP_FLAG_SACK_PERM;
}
+ /* Mark the potential for RFC5961 challenge ACK,
+ * this pose a special problem for LAST_ACK state
+ * as ACK is intrepretated as ACKing last FIN.
+ */
+ if (old_state == TCP_CONNTRACK_LAST_ACK)
+ ct->proto.tcp.last_flags |=
+ IP_CT_EXP_CHALLENGE_ACK;
}
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_TCP))
@@ -970,6 +980,25 @@ static int tcp_packet(struct nf_conn *ct,
nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid state ");
return -NF_ACCEPT;
+ case TCP_CONNTRACK_TIME_WAIT:
+ /* RFC5961 compliance cause stack to send "challenge-ACK"
+ * e.g. in response to spurious SYNs. Conntrack MUST
+ * not believe this ACK is acking last FIN.
+ */
+ if (old_state == TCP_CONNTRACK_LAST_ACK &&
+ index == TCP_ACK_SET &&
+ ct->proto.tcp.last_dir != dir &&
+ ct->proto.tcp.last_index == TCP_SYN_SET &&
+ (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
+ /* Detected RFC5961 challenge ACK */
+ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: challenge-ACK ignored ");
+ return NF_ACCEPT; /* Don't change state */
+ }
+ break;
case TCP_CONNTRACK_CLOSE:
if (index == TCP_RST_SET
&& (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 78af83bc9c8e..34ded09317e7 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4340,7 +4340,6 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
case NFT_CONTINUE:
case NFT_BREAK:
case NFT_RETURN:
- desc->len = sizeof(data->verdict);
break;
case NFT_JUMP:
case NFT_GOTO:
@@ -4355,10 +4354,10 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
chain->use++;
data->verdict.chain = chain;
- desc->len = sizeof(data);
break;
}
+ desc->len = sizeof(data->verdict);
desc->type = NFT_DATA_VERDICT;
return 0;
}
@@ -4473,9 +4472,9 @@ EXPORT_SYMBOL_GPL(nft_data_init);
*/
void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
{
- switch (type) {
- case NFT_DATA_VALUE:
+ if (type < NFT_DATA_VERDICT)
return;
+ switch (type) {
case NFT_DATA_VERDICT:
return nft_verdict_uninit(data);
default:
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 3ad91266c821..4ef1fae8445e 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1073,7 +1073,13 @@ static struct pernet_operations nfnl_log_net_ops = {
static int __init nfnetlink_log_init(void)
{
- int status = -ENOMEM;
+ int status;
+
+ status = register_pernet_subsys(&nfnl_log_net_ops);
+ if (status < 0) {
+ pr_err("failed to register pernet ops\n");
+ goto out;
+ }
netlink_register_notifier(&nfulnl_rtnl_notifier);
status = nfnetlink_subsys_register(&nfulnl_subsys);
@@ -1088,28 +1094,23 @@ static int __init nfnetlink_log_init(void)
goto cleanup_subsys;
}
- status = register_pernet_subsys(&nfnl_log_net_ops);
- if (status < 0) {
- pr_err("failed to register pernet ops\n");
- goto cleanup_logger;
- }
return status;
-cleanup_logger:
- nf_log_unregister(&nfulnl_logger);
cleanup_subsys:
nfnetlink_subsys_unregister(&nfulnl_subsys);
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfulnl_rtnl_notifier);
+ unregister_pernet_subsys(&nfnl_log_net_ops);
+out:
return status;
}
static void __exit nfnetlink_log_fini(void)
{
- unregister_pernet_subsys(&nfnl_log_net_ops);
nf_log_unregister(&nfulnl_logger);
nfnetlink_subsys_unregister(&nfulnl_subsys);
netlink_unregister_notifier(&nfulnl_rtnl_notifier);
+ unregister_pernet_subsys(&nfnl_log_net_ops);
}
MODULE_DESCRIPTION("netfilter userspace logging");
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 0b98c7420239..11c7682fa0ea 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -1317,7 +1317,13 @@ static struct pernet_operations nfnl_queue_net_ops = {
static int __init nfnetlink_queue_init(void)
{
- int status = -ENOMEM;
+ int status;
+
+ status = register_pernet_subsys(&nfnl_queue_net_ops);
+ if (status < 0) {
+ pr_err("nf_queue: failed to register pernet ops\n");
+ goto out;
+ }
netlink_register_notifier(&nfqnl_rtnl_notifier);
status = nfnetlink_subsys_register(&nfqnl_subsys);
@@ -1326,19 +1332,13 @@ static int __init nfnetlink_queue_init(void)
goto cleanup_netlink_notifier;
}
- status = register_pernet_subsys(&nfnl_queue_net_ops);
- if (status < 0) {
- pr_err("nf_queue: failed to register pernet ops\n");
- goto cleanup_subsys;
- }
register_netdevice_notifier(&nfqnl_dev_notifier);
nf_register_queue_handler(&nfqh);
return status;
-cleanup_subsys:
- nfnetlink_subsys_unregister(&nfqnl_subsys);
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+out:
return status;
}
@@ -1346,9 +1346,9 @@ static void __exit nfnetlink_queue_fini(void)
{
nf_unregister_queue_handler();
unregister_netdevice_notifier(&nfqnl_dev_notifier);
- unregister_pernet_subsys(&nfnl_queue_net_ops);
nfnetlink_subsys_unregister(&nfqnl_subsys);
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+ unregister_pernet_subsys(&nfnl_queue_net_ops);
rcu_barrier(); /* Wait for completion of call_rcu()'s */
}
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 57d3e1af5630..0522fc9bfb0a 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -63,6 +63,8 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
goto nla_put_failure;
break;
+ default:
+ break;
}
return 0;
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index 62cabee42fbe..635dbba93d01 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -108,6 +108,8 @@ static int nft_reject_inet_dump(struct sk_buff *skb,
if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
goto nla_put_failure;
break;
+ default:
+ break;
}
return 0;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 19909d0786a2..bf6e76643f78 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -89,7 +89,7 @@ static inline int netlink_is_kernel(struct sock *sk)
return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET;
}
-struct netlink_table *nl_table;
+struct netlink_table *nl_table __read_mostly;
EXPORT_SYMBOL_GPL(nl_table);
static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
@@ -1081,6 +1081,7 @@ static int netlink_insert(struct sock *sk, u32 portid)
if (err) {
if (err == -EEXIST)
err = -EADDRINUSE;
+ nlk_sk(sk)->portid = 0;
sock_put(sk);
}
@@ -1629,13 +1630,11 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
if (data == NULL)
return NULL;
- skb = build_skb(data, size);
+ skb = __build_skb(data, size);
if (skb == NULL)
vfree(data);
- else {
- skb->head_frag = 0;
+ else
skb->destructor = netlink_skb_destructor;
- }
return skb;
}
@@ -3141,7 +3140,6 @@ static const struct rhashtable_params netlink_rhashtable_params = {
.key_len = netlink_compare_arg_len,
.obj_hashfn = netlink_hash,
.obj_cmpfn = netlink_compare,
- .max_size = 65536,
.automatic_shrinking = true,
};
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 5102c3cc4eec..b5989c6ee551 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2311,11 +2311,14 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
tlen = dev->needed_tailroom;
skb = sock_alloc_send_skb(&po->sk,
hlen + tlen + sizeof(struct sockaddr_ll),
- 0, &err);
+ !need_wait, &err);
- if (unlikely(skb == NULL))
+ if (unlikely(skb == NULL)) {
+ /* we assume the socket was initially writeable ... */
+ if (likely(len_sum > 0))
+ err = len_sum;
goto out_status;
-
+ }
tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
addr, hlen);
if (tp_len > dev->mtu + dev->hard_header_len) {
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 14f041398ca1..da6da57e5f36 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *loop_trans;
unsigned long flags;
int ret;
+ struct rds_transport *otrans = trans;
+ if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
+ goto new_conn;
rcu_read_lock();
conn = rds_conn_lookup(head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
@@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
if (conn)
goto out;
+new_conn:
conn = kmem_cache_zalloc(rds_conn_slab, gfp);
if (!conn) {
conn = ERR_PTR(-ENOMEM);
@@ -230,13 +234,22 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
/* Creating normal conn */
struct rds_connection *found;
- found = rds_conn_lookup(head, laddr, faddr, trans);
+ if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
+ found = NULL;
+ else
+ found = rds_conn_lookup(head, laddr, faddr, trans);
if (found) {
trans->conn_free(conn->c_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
- hlist_add_head_rcu(&conn->c_hash_node, head);
+ if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
+ (otrans->t_type != RDS_TRANS_TCP)) {
+ /* Only the active side should be added to
+ * reconnect list for TCP.
+ */
+ hlist_add_head_rcu(&conn->c_hash_node, head);
+ }
rds_cong_add_conn(conn);
rds_conn_count++;
}
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 31b74f5e61ad..8a09ee7db3c1 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -183,8 +183,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
- if (dp && dp->dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+ if (dp) {
+ /* dp structure start is not guaranteed to be 8 bytes aligned.
+ * Since dp_ack_seq is 64-bit extended load operations can be
+ * used so go through get_unaligned to avoid unaligned errors.
+ */
+ __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
+
+ if (dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
+ NULL);
+ }
rds_connect_complete(conn);
}
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index f9f564a6c960..973109c7b8e8 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_ESTABLISHED:
rds_connect_complete(conn);
break;
+ case TCP_CLOSE_WAIT:
case TCP_CLOSE:
rds_conn_drop(conn);
default:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 23ab4dcd1d9f..0da49e34495f 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work);
static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
static struct socket *rds_tcp_listen_sock;
+static int rds_tcp_keepalive(struct socket *sock)
+{
+ /* values below based on xs_udp_default_timeout */
+ int keepidle = 5; /* send a probe 'keepidle' secs after last data */
+ int keepcnt = 5; /* number of unack'ed probes before declaring dead */
+ int keepalive = 1;
+ int ret = 0;
+
+ ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&keepalive, sizeof(keepalive));
+ if (ret < 0)
+ goto bail;
+
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
+ (char *)&keepcnt, sizeof(keepcnt));
+ if (ret < 0)
+ goto bail;
+
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
+ (char *)&keepidle, sizeof(keepidle));
+ if (ret < 0)
+ goto bail;
+
+ /* KEEPINTVL is the interval between successive probes. We follow
+ * the model in xs_tcp_finish_connecting() and re-use keepidle.
+ */
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
+ (char *)&keepidle, sizeof(keepidle));
+bail:
+ return ret;
+}
+
static int rds_tcp_accept_one(struct socket *sock)
{
struct socket *new_sock = NULL;
struct rds_connection *conn;
int ret;
struct inet_sock *inet;
+ struct rds_tcp_connection *rs_tcp;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
@@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock)
if (ret < 0)
goto out;
+ ret = rds_tcp_keepalive(new_sock);
+ if (ret < 0)
+ goto out;
+
rds_tcp_tune(new_sock);
inet = inet_sk(new_sock->sk);
@@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock)
ret = PTR_ERR(conn);
goto out;
}
+ /* An incoming SYN request came in, and TCP just accepted it.
+ * We always create a new conn for listen side of TCP, and do not
+ * add it to the c_hash_list.
+ *
+ * If the client reboots, this conn will need to be cleaned up.
+ * rds_tcp_state_change() will do that cleanup
+ */
+ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
+ WARN_ON(!rs_tcp || rs_tcp->t_sock);
/*
* see the comment above rds_queue_delayed_reconnect()
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 8e472518f9f6..295d14bd6c67 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -63,7 +63,6 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
skb->mark = c->mark;
/* using overlimits stats to count how many packets marked */
ca->tcf_qstats.overlimits++;
- nf_ct_put(c);
goto out;
}
@@ -82,7 +81,6 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
nf_ct_put(c);
out:
- skb->nfct = NULL;
spin_unlock(&ca->tcf_lock);
return ca->tcf_action;
}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 8b0470e418dc..a75864d93142 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -81,6 +81,11 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
struct tcf_proto_ops *t;
int rc = -ENOENT;
+ /* Wait for outstanding call_rcu()s, if any, from a
+ * tcf_proto_ops's destroy() handler.
+ */
+ rcu_barrier();
+
write_lock(&cls_mod_lock);
list_for_each_entry(t, &tcf_proto_base, head) {
if (t == ops) {
@@ -308,12 +313,11 @@ replay:
case RTM_DELTFILTER:
err = tp->ops->delete(tp, fh);
if (err == 0) {
- tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
- if (tcf_destroy(tp, false)) {
- struct tcf_proto *next = rtnl_dereference(tp->next);
+ struct tcf_proto *next = rtnl_dereference(tp->next);
+ tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
+ if (tcf_destroy(tp, false))
RCU_INIT_POINTER(*back, next);
- }
}
goto errout;
case RTM_GETTFILTER:
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ad9eed70bc8f..1e1c89e51a11 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -815,10 +815,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if (dev->flags & IFF_UP)
dev_deactivate(dev);
- if (new && new->ops->attach) {
- new->ops->attach(new);
- num_q = 0;
- }
+ if (new && new->ops->attach)
+ goto skip;
for (i = 0; i < num_q; i++) {
struct netdev_queue *dev_queue = dev_ingress_queue(dev);
@@ -834,12 +832,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
qdisc_destroy(old);
}
+skip:
if (!ingress) {
notify_and_destroy(net, skb, n, classid,
dev->qdisc, new);
if (new && !new->ops->attach)
atomic_inc(&new->refcnt);
dev->qdisc = new ? : &noop_qdisc;
+
+ if (new && new->ops->attach)
+ new->ops->attach(new);
} else {
notify_and_destroy(net, skb, n, classid, old, new);
}
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index de28f8e968e8..7a0bdb16ac92 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -164,7 +164,7 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt)
sch->limit = DEFAULT_CODEL_LIMIT;
- codel_params_init(&q->params);
+ codel_params_init(&q->params, sch);
codel_vars_init(&q->vars);
codel_stats_init(&q->stats);
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 1e52decb7b59..c244c45b78d7 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -391,7 +391,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
q->perturbation = prandom_u32();
INIT_LIST_HEAD(&q->new_flows);
INIT_LIST_HEAD(&q->old_flows);
- codel_params_init(&q->cparams);
+ codel_params_init(&q->cparams, sch);
codel_stats_init(&q->cstats);
q->cparams.ecn = true;
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index a4ca4517cdc8..634529e0ce6b 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -229,7 +229,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
break;
}
- if (q->backlog + qdisc_pkt_len(skb) <= q->limit) {
+ if (gred_backlog(t, q, sch) + qdisc_pkt_len(skb) <= q->limit) {
q->backlog += qdisc_pkt_len(skb);
return qdisc_enqueue_tail(skb, sch);
}
@@ -553,7 +553,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.limit = q->limit;
opt.DP = q->DP;
- opt.backlog = q->backlog;
+ opt.backlog = gred_backlog(table, q, sch);
opt.prio = q->prio;
opt.qth_min = q->parms.qth_min >> q->parms.Wlog;
opt.qth_max = q->parms.qth_max >> q->parms.Wlog;
diff --git a/net/socket.c b/net/socket.c
index 3e33959f3ce5..884e32997698 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -312,7 +312,7 @@ static const struct super_operations sockfs_ops = {
static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
- dentry->d_inode->i_ino);
+ d_inode(dentry)->i_ino);
}
static const struct dentry_operations sockfs_dentry_operations = {
@@ -375,7 +375,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
&socket_file_ops);
if (unlikely(IS_ERR(file))) {
/* drop dentry, keep inode */
- ihold(path.dentry->d_inode);
+ ihold(d_inode(path.dentry));
path_put(&path);
return file;
}
@@ -497,7 +497,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
ssize_t len;
ssize_t used = 0;
- len = security_inode_listsecurity(dentry->d_inode, buffer, size);
+ len = security_inode_listsecurity(d_inode(dentry), buffer, size);
if (len < 0)
return len;
used += len;
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 1ec19f6f0c2b..eeeba5adee6d 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -793,20 +793,26 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
{
u32 value_follows;
int err;
+ struct page *scratch;
+
+ scratch = alloc_page(GFP_KERNEL);
+ if (!scratch)
+ return -ENOMEM;
+ xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE);
/* res->status */
err = gssx_dec_status(xdr, &res->status);
if (err)
- return err;
+ goto out_free;
/* res->context_handle */
err = gssx_dec_bool(xdr, &value_follows);
if (err)
- return err;
+ goto out_free;
if (value_follows) {
err = gssx_dec_ctx(xdr, res->context_handle);
if (err)
- return err;
+ goto out_free;
} else {
res->context_handle = NULL;
}
@@ -814,11 +820,11 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
/* res->output_token */
err = gssx_dec_bool(xdr, &value_follows);
if (err)
- return err;
+ goto out_free;
if (value_follows) {
err = gssx_dec_buffer(xdr, res->output_token);
if (err)
- return err;
+ goto out_free;
} else {
res->output_token = NULL;
}
@@ -826,14 +832,17 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
/* res->delegated_cred_handle */
err = gssx_dec_bool(xdr, &value_follows);
if (err)
- return err;
+ goto out_free;
if (value_follows) {
/* we do not support upcall servers sending this data. */
- return -EINVAL;
+ err = -EINVAL;
+ goto out_free;
}
/* res->options */
err = gssx_dec_option_array(xdr, &res->options);
+out_free:
+ __free_page(scratch);
return err;
}
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 2d12b76b5a64..d81186d34558 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -94,7 +94,7 @@ rpc_timeout_upcall_queue(struct work_struct *work)
}
dentry = dget(pipe->dentry);
spin_unlock(&pipe->lock);
- rpc_purge_list(dentry ? &RPC_I(dentry->d_inode)->waitq : NULL,
+ rpc_purge_list(dentry ? &RPC_I(d_inode(dentry))->waitq : NULL,
&free_list, destroy_msg, -ETIMEDOUT);
dput(dentry);
}
@@ -152,7 +152,7 @@ rpc_queue_upcall(struct rpc_pipe *pipe, struct rpc_pipe_msg *msg)
dentry = dget(pipe->dentry);
spin_unlock(&pipe->lock);
if (dentry) {
- wake_up(&RPC_I(dentry->d_inode)->waitq);
+ wake_up(&RPC_I(d_inode(dentry))->waitq);
dput(dentry);
}
return res;
@@ -591,7 +591,7 @@ static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry,
err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private);
if (err)
return err;
- rpci = RPC_I(dentry->d_inode);
+ rpci = RPC_I(d_inode(dentry));
rpci->private = private;
rpci->pipe = pipe;
fsnotify_create(dir, dentry);
@@ -616,7 +616,7 @@ int rpc_rmdir(struct dentry *dentry)
int error;
parent = dget_parent(dentry);
- dir = parent->d_inode;
+ dir = d_inode(parent);
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
error = __rpc_rmdir(dir, dentry);
mutex_unlock(&dir->i_mutex);
@@ -638,7 +638,7 @@ static int __rpc_unlink(struct inode *dir, struct dentry *dentry)
static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
rpc_close_pipes(inode);
return __rpc_unlink(dir, dentry);
@@ -654,7 +654,7 @@ static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
if (!dentry)
return ERR_PTR(-ENOMEM);
}
- if (dentry->d_inode == NULL)
+ if (d_really_is_negative(dentry))
return dentry;
dput(dentry);
return ERR_PTR(-EEXIST);
@@ -667,7 +667,7 @@ static void __rpc_depopulate(struct dentry *parent,
const struct rpc_filelist *files,
int start, int eof)
{
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
struct dentry *dentry;
struct qstr name;
int i;
@@ -679,9 +679,9 @@ static void __rpc_depopulate(struct dentry *parent,
if (dentry == NULL)
continue;
- if (dentry->d_inode == NULL)
+ if (d_really_is_negative(dentry))
goto next;
- switch (dentry->d_inode->i_mode & S_IFMT) {
+ switch (d_inode(dentry)->i_mode & S_IFMT) {
default:
BUG();
case S_IFREG:
@@ -699,7 +699,7 @@ static void rpc_depopulate(struct dentry *parent,
const struct rpc_filelist *files,
int start, int eof)
{
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD);
__rpc_depopulate(parent, files, start, eof);
@@ -711,7 +711,7 @@ static int rpc_populate(struct dentry *parent,
int start, int eof,
void *private)
{
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
struct dentry *dentry;
int i, err;
@@ -754,7 +754,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent,
int (*populate)(struct dentry *, void *), void *args_populate)
{
struct dentry *dentry;
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
int error;
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
@@ -787,7 +787,7 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
int error;
parent = dget_parent(dentry);
- dir = parent->d_inode;
+ dir = d_inode(parent);
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
if (depopulate != NULL)
depopulate(dentry);
@@ -819,7 +819,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
void *private, struct rpc_pipe *pipe)
{
struct dentry *dentry;
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR;
int err;
@@ -864,7 +864,7 @@ rpc_unlink(struct dentry *dentry)
int error = 0;
parent = dget_parent(dentry);
- dir = parent->d_inode;
+ dir = d_inode(parent);
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
error = __rpc_rmpipe(dir, dentry);
mutex_unlock(&dir->i_mutex);
@@ -1375,7 +1375,7 @@ rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry)
struct dentry *clnt_dir = pipe_dentry->d_parent;
struct dentry *gssd_dir = clnt_dir->d_parent;
- __rpc_rmpipe(clnt_dir->d_inode, pipe_dentry);
+ __rpc_rmpipe(d_inode(clnt_dir), pipe_dentry);
__rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1);
__rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1);
dput(pipe_dentry);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index b91fd9c597b4..337ca851a350 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -89,8 +89,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
if (!task->tk_timeout)
return;
- dprintk("RPC: %5u setting alarm for %lu ms\n",
- task->tk_pid, task->tk_timeout * 1000 / HZ);
+ dprintk("RPC: %5u setting alarm for %u ms\n",
+ task->tk_pid, jiffies_to_msecs(task->tk_timeout));
task->u.tk_wait.expires = jiffies + task->tk_timeout;
if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires))
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 9949722d99ce..1d4fe24af06a 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -326,6 +326,15 @@ out_unlock:
xprt_clear_locked(xprt);
}
+static void xprt_task_clear_bytes_sent(struct rpc_task *task)
+{
+ if (task != NULL) {
+ struct rpc_rqst *req = task->tk_rqstp;
+ if (req != NULL)
+ req->rq_bytes_sent = 0;
+ }
+}
+
/**
* xprt_release_xprt - allow other requests to use a transport
* @xprt: transport with other tasks potentially waiting
@@ -336,11 +345,7 @@ out_unlock:
void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
{
if (xprt->snd_task == task) {
- if (task != NULL) {
- struct rpc_rqst *req = task->tk_rqstp;
- if (req != NULL)
- req->rq_bytes_sent = 0;
- }
+ xprt_task_clear_bytes_sent(task);
xprt_clear_locked(xprt);
__xprt_lock_write_next(xprt);
}
@@ -358,11 +363,7 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt);
void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
{
if (xprt->snd_task == task) {
- if (task != NULL) {
- struct rpc_rqst *req = task->tk_rqstp;
- if (req != NULL)
- req->rq_bytes_sent = 0;
- }
+ xprt_task_clear_bytes_sent(task);
xprt_clear_locked(xprt);
__xprt_lock_write_next_cong(xprt);
}
@@ -700,6 +701,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt,
goto out;
if (xprt->snd_task != task)
goto out;
+ xprt_task_clear_bytes_sent(task);
xprt->snd_task = cookie;
ret = true;
out:
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index da5136fd5694..579f72bbcf4b 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,6 +1,7 @@
obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
-xprtrdma-y := transport.o rpc_rdma.o verbs.o
+xprtrdma-y := transport.o rpc_rdma.o verbs.o \
+ fmr_ops.o frwr_ops.o physical_ops.o
obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
new file mode 100644
index 000000000000..302d4ebf6fbf
--- /dev/null
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* Lightweight memory registration using Fast Memory Regions (FMR).
+ * Referred to sometimes as MTHCAFMR mode.
+ *
+ * FMR uses synchronous memory registration and deregistration.
+ * FMR registration is known to be fast, but FMR deregistration
+ * can take tens of usecs to complete.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+/* Maximum scatter/gather per FMR */
+#define RPCRDMA_MAX_FMR_SGES (64)
+
+static int
+fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+ struct rpcrdma_create_data_internal *cdata)
+{
+ return 0;
+}
+
+/* FMR mode conveys up to 64 pages of payload per chunk segment.
+ */
+static size_t
+fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+ return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+ rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+}
+
+static int
+fmr_op_init(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
+ struct ib_fmr_attr fmr_attr = {
+ .max_pages = RPCRDMA_MAX_FMR_SGES,
+ .max_maps = 1,
+ .page_shift = PAGE_SHIFT
+ };
+ struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+ struct rpcrdma_mw *r;
+ int i, rc;
+
+ INIT_LIST_HEAD(&buf->rb_mws);
+ INIT_LIST_HEAD(&buf->rb_all);
+
+ i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+ dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
+
+ while (i--) {
+ r = kzalloc(sizeof(*r), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
+ if (IS_ERR(r->r.fmr))
+ goto out_fmr_err;
+
+ list_add(&r->mw_list, &buf->rb_mws);
+ list_add(&r->mw_all, &buf->rb_all);
+ }
+ return 0;
+
+out_fmr_err:
+ rc = PTR_ERR(r->r.fmr);
+ dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
+ kfree(r);
+ return rc;
+}
+
+/* Use the ib_map_phys_fmr() verb to register a memory region
+ * for remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+ int nsegs, bool writing)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct ib_device *device = ia->ri_id->device;
+ enum dma_data_direction direction = rpcrdma_data_dir(writing);
+ struct rpcrdma_mr_seg *seg1 = seg;
+ struct rpcrdma_mw *mw = seg1->rl_mw;
+ u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
+ int len, pageoff, i, rc;
+
+ pageoff = offset_in_page(seg1->mr_offset);
+ seg1->mr_offset -= pageoff; /* start of page */
+ seg1->mr_len += pageoff;
+ len = -pageoff;
+ if (nsegs > RPCRDMA_MAX_FMR_SGES)
+ nsegs = RPCRDMA_MAX_FMR_SGES;
+ for (i = 0; i < nsegs;) {
+ rpcrdma_map_one(device, seg, direction);
+ physaddrs[i] = seg->mr_dma;
+ len += seg->mr_len;
+ ++seg;
+ ++i;
+ /* Check for holes */
+ if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+ offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+ break;
+ }
+
+ rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
+ if (rc)
+ goto out_maperr;
+
+ seg1->mr_rkey = mw->r.fmr->rkey;
+ seg1->mr_base = seg1->mr_dma + pageoff;
+ seg1->mr_nsegs = i;
+ seg1->mr_len = len;
+ return i;
+
+out_maperr:
+ dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
+ __func__, len, (unsigned long long)seg1->mr_dma,
+ pageoff, i, rc);
+ while (i--)
+ rpcrdma_unmap_one(device, --seg);
+ return rc;
+}
+
+/* Use the ib_unmap_fmr() verb to prevent further remote
+ * access via RDMA READ or RDMA WRITE.
+ */
+static int
+fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_mr_seg *seg1 = seg;
+ struct ib_device *device;
+ int rc, nsegs = seg->mr_nsegs;
+ LIST_HEAD(l);
+
+ list_add(&seg1->rl_mw->r.fmr->list, &l);
+ rc = ib_unmap_fmr(&l);
+ read_lock(&ia->ri_qplock);
+ device = ia->ri_id->device;
+ while (seg1->mr_nsegs--)
+ rpcrdma_unmap_one(device, seg++);
+ read_unlock(&ia->ri_qplock);
+ if (rc)
+ goto out_err;
+ return nsegs;
+
+out_err:
+ dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
+ return nsegs;
+}
+
+/* After a disconnect, unmap all FMRs.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_fmr_external().
+ */
+static void
+fmr_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_mw *r;
+ LIST_HEAD(list);
+ int rc;
+
+ list_for_each_entry(r, &buf->rb_all, mw_all)
+ list_add(&r->r.fmr->list, &list);
+
+ rc = ib_unmap_fmr(&list);
+ if (rc)
+ dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
+ __func__, rc);
+}
+
+static void
+fmr_op_destroy(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mw *r;
+ int rc;
+
+ while (!list_empty(&buf->rb_all)) {
+ r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+ list_del(&r->mw_all);
+ rc = ib_dealloc_fmr(r->r.fmr);
+ if (rc)
+ dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
+ __func__, rc);
+ kfree(r);
+ }
+}
+
+const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
+ .ro_map = fmr_op_map,
+ .ro_unmap = fmr_op_unmap,
+ .ro_open = fmr_op_open,
+ .ro_maxpages = fmr_op_maxpages,
+ .ro_init = fmr_op_init,
+ .ro_reset = fmr_op_reset,
+ .ro_destroy = fmr_op_destroy,
+ .ro_displayname = "fmr",
+};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
new file mode 100644
index 000000000000..dff0481dbcf8
--- /dev/null
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* Lightweight memory registration using Fast Registration Work
+ * Requests (FRWR). Also referred to sometimes as FRMR mode.
+ *
+ * FRWR features ordered asynchronous registration and deregistration
+ * of arbitrarily sized memory regions. This is the fastest and safest
+ * but most complex memory registration mode.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+static int
+__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
+ unsigned int depth)
+{
+ struct rpcrdma_frmr *f = &r->r.frmr;
+ int rc;
+
+ f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+ if (IS_ERR(f->fr_mr))
+ goto out_mr_err;
+ f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
+ if (IS_ERR(f->fr_pgl))
+ goto out_list_err;
+ return 0;
+
+out_mr_err:
+ rc = PTR_ERR(f->fr_mr);
+ dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n",
+ __func__, rc);
+ return rc;
+
+out_list_err:
+ rc = PTR_ERR(f->fr_pgl);
+ dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n",
+ __func__, rc);
+ ib_dereg_mr(f->fr_mr);
+ return rc;
+}
+
+static void
+__frwr_release(struct rpcrdma_mw *r)
+{
+ int rc;
+
+ rc = ib_dereg_mr(r->r.frmr.fr_mr);
+ if (rc)
+ dprintk("RPC: %s: ib_dereg_mr status %i\n",
+ __func__, rc);
+ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+}
+
+static int
+frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+ struct rpcrdma_create_data_internal *cdata)
+{
+ struct ib_device_attr *devattr = &ia->ri_devattr;
+ int depth, delta;
+
+ ia->ri_max_frmr_depth =
+ min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+ devattr->max_fast_reg_page_list_len);
+ dprintk("RPC: %s: device's max FR page list len = %u\n",
+ __func__, ia->ri_max_frmr_depth);
+
+ /* Add room for frmr register and invalidate WRs.
+ * 1. FRMR reg WR for head
+ * 2. FRMR invalidate WR for head
+ * 3. N FRMR reg WRs for pagelist
+ * 4. N FRMR invalidate WRs for pagelist
+ * 5. FRMR reg WR for tail
+ * 6. FRMR invalidate WR for tail
+ * 7. The RDMA_SEND WR
+ */
+ depth = 7;
+
+ /* Calculate N if the device max FRMR depth is smaller than
+ * RPCRDMA_MAX_DATA_SEGS.
+ */
+ if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
+ delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth;
+ do {
+ depth += 2; /* FRMR reg + invalidate */
+ delta -= ia->ri_max_frmr_depth;
+ } while (delta > 0);
+ }
+
+ ep->rep_attr.cap.max_send_wr *= depth;
+ if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
+ cdata->max_requests = devattr->max_qp_wr / depth;
+ if (!cdata->max_requests)
+ return -EINVAL;
+ ep->rep_attr.cap.max_send_wr = cdata->max_requests *
+ depth;
+ }
+
+ return 0;
+}
+
+/* FRWR mode conveys a list of pages per chunk segment. The
+ * maximum length of that list is the FRWR page list depth.
+ */
+static size_t
+frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+ return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+ rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
+}
+
+/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */
+static void
+frwr_sendcompletion(struct ib_wc *wc)
+{
+ struct rpcrdma_mw *r;
+
+ if (likely(wc->status == IB_WC_SUCCESS))
+ return;
+
+ /* WARNING: Only wr_id and status are reliable at this point */
+ r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+ dprintk("RPC: %s: frmr %p (stale), status %d\n",
+ __func__, r, wc->status);
+ r->r.frmr.fr_state = FRMR_IS_STALE;
+}
+
+static int
+frwr_op_init(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+ unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+ struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+ int i;
+
+ INIT_LIST_HEAD(&buf->rb_mws);
+ INIT_LIST_HEAD(&buf->rb_all);
+
+ i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+ dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
+
+ while (i--) {
+ struct rpcrdma_mw *r;
+ int rc;
+
+ r = kzalloc(sizeof(*r), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ rc = __frwr_init(r, pd, device, depth);
+ if (rc) {
+ kfree(r);
+ return rc;
+ }
+
+ list_add(&r->mw_list, &buf->rb_mws);
+ list_add(&r->mw_all, &buf->rb_all);
+ r->mw_sendcompletion = frwr_sendcompletion;
+ }
+
+ return 0;
+}
+
+/* Post a FAST_REG Work Request to register a memory region
+ * for remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+ int nsegs, bool writing)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct ib_device *device = ia->ri_id->device;
+ enum dma_data_direction direction = rpcrdma_data_dir(writing);
+ struct rpcrdma_mr_seg *seg1 = seg;
+ struct rpcrdma_mw *mw = seg1->rl_mw;
+ struct rpcrdma_frmr *frmr = &mw->r.frmr;
+ struct ib_mr *mr = frmr->fr_mr;
+ struct ib_send_wr fastreg_wr, *bad_wr;
+ u8 key;
+ int len, pageoff;
+ int i, rc;
+ int seg_len;
+ u64 pa;
+ int page_no;
+
+ pageoff = offset_in_page(seg1->mr_offset);
+ seg1->mr_offset -= pageoff; /* start of page */
+ seg1->mr_len += pageoff;
+ len = -pageoff;
+ if (nsegs > ia->ri_max_frmr_depth)
+ nsegs = ia->ri_max_frmr_depth;
+ for (page_no = i = 0; i < nsegs;) {
+ rpcrdma_map_one(device, seg, direction);
+ pa = seg->mr_dma;
+ for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
+ frmr->fr_pgl->page_list[page_no++] = pa;
+ pa += PAGE_SIZE;
+ }
+ len += seg->mr_len;
+ ++seg;
+ ++i;
+ /* Check for holes */
+ if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+ offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+ break;
+ }
+ dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
+ __func__, mw, i, len);
+
+ frmr->fr_state = FRMR_IS_VALID;
+
+ memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+ fastreg_wr.wr_id = (unsigned long)(void *)mw;
+ fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+ fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
+ fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
+ fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ fastreg_wr.wr.fast_reg.page_list_len = page_no;
+ fastreg_wr.wr.fast_reg.length = len;
+ fastreg_wr.wr.fast_reg.access_flags = writing ?
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+ IB_ACCESS_REMOTE_READ;
+ key = (u8)(mr->rkey & 0x000000FF);
+ ib_update_fast_reg_key(mr, ++key);
+ fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+
+ DECR_CQCOUNT(&r_xprt->rx_ep);
+ rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
+ if (rc)
+ goto out_senderr;
+
+ seg1->mr_rkey = mr->rkey;
+ seg1->mr_base = seg1->mr_dma + pageoff;
+ seg1->mr_nsegs = i;
+ seg1->mr_len = len;
+ return i;
+
+out_senderr:
+ dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
+ ib_update_fast_reg_key(mr, --key);
+ frmr->fr_state = FRMR_IS_INVALID;
+ while (i--)
+ rpcrdma_unmap_one(device, --seg);
+ return rc;
+}
+
+/* Post a LOCAL_INV Work Request to prevent further remote access
+ * via RDMA READ or RDMA WRITE.
+ */
+static int
+frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+ struct rpcrdma_mr_seg *seg1 = seg;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct ib_send_wr invalidate_wr, *bad_wr;
+ int rc, nsegs = seg->mr_nsegs;
+ struct ib_device *device;
+
+ seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
+
+ memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+ invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
+ invalidate_wr.opcode = IB_WR_LOCAL_INV;
+ invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
+ DECR_CQCOUNT(&r_xprt->rx_ep);
+
+ read_lock(&ia->ri_qplock);
+ device = ia->ri_id->device;
+ while (seg1->mr_nsegs--)
+ rpcrdma_unmap_one(device, seg++);
+ rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+ read_unlock(&ia->ri_qplock);
+ if (rc)
+ goto out_err;
+ return nsegs;
+
+out_err:
+ /* Force rpcrdma_buffer_get() to retry */
+ seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
+ dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
+ return nsegs;
+}
+
+/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
+ * an unusable state. Find FRMRs in this state and dereg / reg
+ * each. FRMRs that are VALID and attached to an rpcrdma_req are
+ * also torn down.
+ *
+ * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_frmr_external().
+ */
+static void
+frwr_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+ unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+ struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+ struct rpcrdma_mw *r;
+ int rc;
+
+ list_for_each_entry(r, &buf->rb_all, mw_all) {
+ if (r->r.frmr.fr_state == FRMR_IS_INVALID)
+ continue;
+
+ __frwr_release(r);
+ rc = __frwr_init(r, pd, device, depth);
+ if (rc) {
+ dprintk("RPC: %s: mw %p left %s\n",
+ __func__, r,
+ (r->r.frmr.fr_state == FRMR_IS_STALE ?
+ "stale" : "valid"));
+ continue;
+ }
+
+ r->r.frmr.fr_state = FRMR_IS_INVALID;
+ }
+}
+
+static void
+frwr_op_destroy(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mw *r;
+
+ while (!list_empty(&buf->rb_all)) {
+ r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+ list_del(&r->mw_all);
+ __frwr_release(r);
+ kfree(r);
+ }
+}
+
+const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
+ .ro_map = frwr_op_map,
+ .ro_unmap = frwr_op_unmap,
+ .ro_open = frwr_op_open,
+ .ro_maxpages = frwr_op_maxpages,
+ .ro_init = frwr_op_init,
+ .ro_reset = frwr_op_reset,
+ .ro_destroy = frwr_op_destroy,
+ .ro_displayname = "frwr",
+};
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
new file mode 100644
index 000000000000..ba518af16787
--- /dev/null
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* No-op chunk preparation. All client memory is pre-registered.
+ * Sometimes referred to as ALLPHYSICAL mode.
+ *
+ * Physical registration is simple because all client memory is
+ * pre-registered and never deregistered. This mode is good for
+ * adapter bring up, but is considered not safe: the server is
+ * trusted not to abuse its access to client memory not involved
+ * in RDMA I/O.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+static int
+physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+ struct rpcrdma_create_data_internal *cdata)
+{
+ return 0;
+}
+
+/* PHYSICAL memory registration conveys one page per chunk segment.
+ */
+static size_t
+physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+ return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+ rpcrdma_max_segments(r_xprt));
+}
+
+static int
+physical_op_init(struct rpcrdma_xprt *r_xprt)
+{
+ return 0;
+}
+
+/* The client's physical memory is already exposed for
+ * remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+ int nsegs, bool writing)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+ rpcrdma_map_one(ia->ri_id->device, seg,
+ rpcrdma_data_dir(writing));
+ seg->mr_rkey = ia->ri_bind_mem->rkey;
+ seg->mr_base = seg->mr_dma;
+ seg->mr_nsegs = 1;
+ return 1;
+}
+
+/* Unmap a memory region, but leave it registered.
+ */
+static int
+physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+ read_lock(&ia->ri_qplock);
+ rpcrdma_unmap_one(ia->ri_id->device, seg);
+ read_unlock(&ia->ri_qplock);
+
+ return 1;
+}
+
+static void
+physical_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+}
+
+static void
+physical_op_destroy(struct rpcrdma_buffer *buf)
+{
+}
+
+const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
+ .ro_map = physical_op_map,
+ .ro_unmap = physical_op_unmap,
+ .ro_open = physical_op_open,
+ .ro_maxpages = physical_op_maxpages,
+ .ro_init = physical_op_init,
+ .ro_reset = physical_op_reset,
+ .ro_destroy = physical_op_destroy,
+ .ro_displayname = "physical",
+};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 91ffde82fa0c..2c53ea9e1b83 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,6 +53,14 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
+enum rpcrdma_chunktype {
+ rpcrdma_noch = 0,
+ rpcrdma_readch,
+ rpcrdma_areadch,
+ rpcrdma_writech,
+ rpcrdma_replych
+};
+
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static const char transfertypes[][12] = {
"pure inline", /* no chunks */
@@ -179,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
struct rpcrdma_write_array *warray = NULL;
struct rpcrdma_write_chunk *cur_wchunk = NULL;
__be32 *iptr = headerp->rm_body.rm_chunks;
+ int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
if (type == rpcrdma_readch || type == rpcrdma_areadch) {
/* a read chunk - server will RDMA Read our memory */
@@ -201,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
if (nsegs < 0)
return nsegs;
+ map = r_xprt->rx_ia.ri_ops->ro_map;
do {
- n = rpcrdma_register_external(seg, nsegs,
- cur_wchunk != NULL, r_xprt);
+ n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
if (n <= 0)
goto out;
if (cur_rchunk) { /* read */
@@ -275,34 +284,13 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
return (unsigned char *)iptr - (unsigned char *)headerp;
out:
- if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
- for (pos = 0; nchunks--;)
- pos += rpcrdma_deregister_external(
- &req->rl_segments[pos], r_xprt);
- }
- return n;
-}
+ if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+ return n;
-/*
- * Marshal chunks. This routine returns the header length
- * consumed by marshaling.
- *
- * Returns positive RPC/RDMA header size, or negative errno.
- */
-
-ssize_t
-rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
-{
- struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf);
-
- if (req->rl_rtype != rpcrdma_noch)
- result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
- headerp, req->rl_rtype);
- else if (req->rl_wtype != rpcrdma_noch)
- result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
- headerp, req->rl_wtype);
- return result;
+ for (pos = 0; nchunks--;)
+ pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
+ &req->rl_segments[pos]);
+ return n;
}
/*
@@ -397,6 +385,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
char *base;
size_t rpclen, padlen;
ssize_t hdrlen;
+ enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
/*
@@ -433,13 +422,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* into pages; otherwise use reply chunks.
*/
if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
- req->rl_wtype = rpcrdma_noch;
+ wtype = rpcrdma_noch;
else if (rqst->rq_rcv_buf.page_len == 0)
- req->rl_wtype = rpcrdma_replych;
+ wtype = rpcrdma_replych;
else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
- req->rl_wtype = rpcrdma_writech;
+ wtype = rpcrdma_writech;
else
- req->rl_wtype = rpcrdma_replych;
+ wtype = rpcrdma_replych;
/*
* Chunks needed for arguments?
@@ -456,16 +445,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* TBD check NFSv4 setacl
*/
if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
- req->rl_rtype = rpcrdma_noch;
+ rtype = rpcrdma_noch;
else if (rqst->rq_snd_buf.page_len == 0)
- req->rl_rtype = rpcrdma_areadch;
+ rtype = rpcrdma_areadch;
else
- req->rl_rtype = rpcrdma_readch;
+ rtype = rpcrdma_readch;
/* The following simplification is not true forever */
- if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
- req->rl_wtype = rpcrdma_noch;
- if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
+ if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+ wtype = rpcrdma_noch;
+ if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
@@ -479,7 +468,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* When padding is in use and applies to the transfer, insert
* it and change the message type.
*/
- if (req->rl_rtype == rpcrdma_noch) {
+ if (rtype == rpcrdma_noch) {
padlen = rpcrdma_inline_pullup(rqst,
RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -494,7 +483,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
- if (req->rl_wtype != rpcrdma_noch) {
+ if (wtype != rpcrdma_noch) {
dprintk("RPC: %s: invalid chunk list\n",
__func__);
return -EIO;
@@ -515,18 +504,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* on receive. Therefore, we request a reply chunk
* for non-writes wherever feasible and efficient.
*/
- if (req->rl_wtype == rpcrdma_noch)
- req->rl_wtype = rpcrdma_replych;
+ if (wtype == rpcrdma_noch)
+ wtype = rpcrdma_replych;
}
}
- hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
+ if (rtype != rpcrdma_noch) {
+ hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
+ headerp, rtype);
+ wtype = rtype; /* simplify dprintk */
+
+ } else if (wtype != rpcrdma_noch) {
+ hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
+ headerp, wtype);
+ }
if (hdrlen < 0)
return hdrlen;
dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
" headerp 0x%p base 0x%p lkey 0x%x\n",
- __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
+ __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
headerp, base, rdmab_lkey(req->rl_rdmabuf));
/*
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2e192baa59f3..54f23b1be986 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -157,12 +157,47 @@ static struct ctl_table sunrpc_table[] = {
static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
static void
+xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ char buf[20];
+
+ snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
+ xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+ xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA;
+}
+
+static void
+xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ char buf[40];
+
+ snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
+ xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+ xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
+}
+
+static void
xprt_rdma_format_addresses(struct rpc_xprt *xprt)
{
struct sockaddr *sap = (struct sockaddr *)
&rpcx_to_rdmad(xprt).addr;
- struct sockaddr_in *sin = (struct sockaddr_in *)sap;
- char buf[64];
+ char buf[128];
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ xprt_rdma_format_addresses4(xprt, sap);
+ break;
+ case AF_INET6:
+ xprt_rdma_format_addresses6(xprt, sap);
+ break;
+ default:
+ pr_err("rpcrdma: Unrecognized address family\n");
+ return;
+ }
(void)rpc_ntop(sap, buf, sizeof(buf));
xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
@@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt)
snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
- xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
-
- snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
- xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
-
snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
- /* netid */
- xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
+ xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
}
static void
@@ -377,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args)
xprt_rdma_connect_worker);
xprt_rdma_format_addresses(xprt);
- xprt->max_payload = rpcrdma_max_payload(new_xprt);
+ xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
+ if (xprt->max_payload == 0)
+ goto out4;
+ xprt->max_payload <<= PAGE_SHIFT;
dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
__func__, xprt->max_payload);
@@ -552,8 +584,8 @@ xprt_rdma_free(void *buffer)
for (i = 0; req->rl_nchunks;) {
--req->rl_nchunks;
- i += rpcrdma_deregister_external(
- &req->rl_segments[i], r_xprt);
+ i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
+ &req->rl_segments[i]);
}
rpcrdma_buffer_put(req);
@@ -579,10 +611,7 @@ xprt_rdma_send_request(struct rpc_task *task)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int rc = 0;
- if (req->rl_niovs == 0)
- rc = rpcrdma_marshal_req(rqst);
- else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL)
- rc = rpcrdma_marshal_chunks(rqst, 0);
+ rc = rpcrdma_marshal_req(rqst);
if (rc < 0)
goto failed_marshal;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e28909fddd30..4870d272e006 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -50,6 +50,7 @@
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
+#include <linux/sunrpc/addr.h>
#include <asm/bitops.h>
#include "xprt_rdma.h"
@@ -62,9 +63,6 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
-static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
-
/*
* internal functions
*/
@@ -188,7 +186,7 @@ static const char * const wc_status[] = {
"remote access error",
"remote operation error",
"transport retry counter exceeded",
- "RNR retrycounter exceeded",
+ "RNR retry counter exceeded",
"local RDD violation error",
"remove invalid RD request",
"operation aborted",
@@ -206,21 +204,17 @@ static const char * const wc_status[] = {
static void
rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
- if (likely(wc->status == IB_WC_SUCCESS))
- return;
-
/* WARNING: Only wr_id and status are reliable at this point */
- if (wc->wr_id == 0ULL) {
- if (wc->status != IB_WC_WR_FLUSH_ERR)
+ if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
+ if (wc->status != IB_WC_SUCCESS &&
+ wc->status != IB_WC_WR_FLUSH_ERR)
pr_err("RPC: %s: SEND: %s\n",
__func__, COMPLETION_MSG(wc->status));
} else {
struct rpcrdma_mw *r;
r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
- r->r.frmr.fr_state = FRMR_IS_STALE;
- pr_err("RPC: %s: frmr %p (stale): %s\n",
- __func__, r, COMPLETION_MSG(wc->status));
+ r->mw_sendcompletion(wc);
}
}
@@ -424,7 +418,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
struct rpcrdma_ia *ia = &xprt->rx_ia;
struct rpcrdma_ep *ep = &xprt->rx_ep;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
+ struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
#endif
struct ib_qp_attr *attr = &ia->ri_qp_attr;
struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
@@ -480,9 +474,8 @@ connected:
wake_up_all(&ep->rep_connect_wait);
/*FALLTHROUGH*/
default:
- dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n",
- __func__, &addr->sin_addr.s_addr,
- ntohs(addr->sin_port), ep,
+ dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
+ __func__, sap, rpc_get_port(sap), ep,
CONNECTION_MSG(event->event));
break;
}
@@ -491,19 +484,16 @@ connected:
if (connstate == 1) {
int ird = attr->max_dest_rd_atomic;
int tird = ep->rep_remote_cma.responder_resources;
- printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
- "on %s, memreg %d slots %d ird %d%s\n",
- &addr->sin_addr.s_addr,
- ntohs(addr->sin_port),
+
+ pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
+ sap, rpc_get_port(sap),
ia->ri_id->device->name,
- ia->ri_memreg_strategy,
+ ia->ri_ops->ro_displayname,
xprt->rx_buf.rb_max_requests,
ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
} else if (connstate < 0) {
- printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
- &addr->sin_addr.s_addr,
- ntohs(addr->sin_port),
- connstate);
+ pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
+ sap, rpc_get_port(sap), connstate);
}
#endif
@@ -621,17 +611,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if (memreg == RPCRDMA_FRMR) {
/* Requires both frmr reg and local dma lkey */
- if ((devattr->device_cap_flags &
+ if (((devattr->device_cap_flags &
(IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
- (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
+ (devattr->max_fast_reg_page_list_len == 0)) {
dprintk("RPC: %s: FRMR registration "
"not supported by HCA\n", __func__);
memreg = RPCRDMA_MTHCAFMR;
- } else {
- /* Mind the ia limit on FRMR page list depth */
- ia->ri_max_frmr_depth = min_t(unsigned int,
- RPCRDMA_MAX_DATA_SEGS,
- devattr->max_fast_reg_page_list_len);
}
}
if (memreg == RPCRDMA_MTHCAFMR) {
@@ -652,13 +638,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
*/
switch (memreg) {
case RPCRDMA_FRMR:
+ ia->ri_ops = &rpcrdma_frwr_memreg_ops;
break;
case RPCRDMA_ALLPHYSICAL:
+ ia->ri_ops = &rpcrdma_physical_memreg_ops;
mem_priv = IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_REMOTE_READ;
goto register_setup;
case RPCRDMA_MTHCAFMR:
+ ia->ri_ops = &rpcrdma_fmr_memreg_ops;
if (ia->ri_have_dma_lkey)
break;
mem_priv = IB_ACCESS_LOCAL_WRITE;
@@ -678,8 +667,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
rc = -ENOMEM;
goto out3;
}
- dprintk("RPC: %s: memory registration strategy is %d\n",
- __func__, memreg);
+ dprintk("RPC: %s: memory registration strategy is '%s'\n",
+ __func__, ia->ri_ops->ro_displayname);
/* Else will do memory reg/dereg for each chunk */
ia->ri_memreg_strategy = memreg;
@@ -743,49 +732,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
ep->rep_attr.qp_context = ep;
- /* send_cq and recv_cq initialized below */
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR: {
- int depth = 7;
-
- /* Add room for frmr register and invalidate WRs.
- * 1. FRMR reg WR for head
- * 2. FRMR invalidate WR for head
- * 3. N FRMR reg WRs for pagelist
- * 4. N FRMR invalidate WRs for pagelist
- * 5. FRMR reg WR for tail
- * 6. FRMR invalidate WR for tail
- * 7. The RDMA_SEND WR
- */
-
- /* Calculate N if the device max FRMR depth is smaller than
- * RPCRDMA_MAX_DATA_SEGS.
- */
- if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
- int delta = RPCRDMA_MAX_DATA_SEGS -
- ia->ri_max_frmr_depth;
-
- do {
- depth += 2; /* FRMR reg + invalidate */
- delta -= ia->ri_max_frmr_depth;
- } while (delta > 0);
-
- }
- ep->rep_attr.cap.max_send_wr *= depth;
- if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
- cdata->max_requests = devattr->max_qp_wr / depth;
- if (!cdata->max_requests)
- return -EINVAL;
- ep->rep_attr.cap.max_send_wr = cdata->max_requests *
- depth;
- }
- break;
- }
- default:
- break;
- }
+ rc = ia->ri_ops->ro_open(ia, ep, cdata);
+ if (rc)
+ return rc;
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
ep->rep_attr.cap.max_recv_sge = 1;
@@ -944,21 +895,9 @@ retry:
rpcrdma_ep_disconnect(ep, ia);
rpcrdma_flush_cqs(ep);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- rpcrdma_reset_frmrs(ia);
- break;
- case RPCRDMA_MTHCAFMR:
- rpcrdma_reset_fmrs(ia);
- break;
- case RPCRDMA_ALLPHYSICAL:
- break;
- default:
- rc = -EIO;
- goto out;
- }
-
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+ ia->ri_ops->ro_reset(xprt);
+
id = rpcrdma_create_id(xprt, ia,
(struct sockaddr *)&xprt->rx_data.addr);
if (IS_ERR(id)) {
@@ -1123,91 +1062,6 @@ out:
return ERR_PTR(rc);
}
-static int
-rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
-{
- int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
- struct ib_fmr_attr fmr_attr = {
- .max_pages = RPCRDMA_MAX_DATA_SEGS,
- .max_maps = 1,
- .page_shift = PAGE_SHIFT
- };
- struct rpcrdma_mw *r;
- int i, rc;
-
- i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
- dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
-
- while (i--) {
- r = kzalloc(sizeof(*r), GFP_KERNEL);
- if (r == NULL)
- return -ENOMEM;
-
- r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
- if (IS_ERR(r->r.fmr)) {
- rc = PTR_ERR(r->r.fmr);
- dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
- __func__, rc);
- goto out_free;
- }
-
- list_add(&r->mw_list, &buf->rb_mws);
- list_add(&r->mw_all, &buf->rb_all);
- }
- return 0;
-
-out_free:
- kfree(r);
- return rc;
-}
-
-static int
-rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_frmr *f;
- struct rpcrdma_mw *r;
- int i, rc;
-
- i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
- dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
-
- while (i--) {
- r = kzalloc(sizeof(*r), GFP_KERNEL);
- if (r == NULL)
- return -ENOMEM;
- f = &r->r.frmr;
-
- f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
- ia->ri_max_frmr_depth);
- if (IS_ERR(f->fr_mr)) {
- rc = PTR_ERR(f->fr_mr);
- dprintk("RPC: %s: ib_alloc_fast_reg_mr "
- "failed %i\n", __func__, rc);
- goto out_free;
- }
-
- f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
- ia->ri_max_frmr_depth);
- if (IS_ERR(f->fr_pgl)) {
- rc = PTR_ERR(f->fr_pgl);
- dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
- "failed %i\n", __func__, rc);
-
- ib_dereg_mr(f->fr_mr);
- goto out_free;
- }
-
- list_add(&r->mw_list, &buf->rb_mws);
- list_add(&r->mw_all, &buf->rb_all);
- }
-
- return 0;
-
-out_free:
- kfree(r);
- return rc;
-}
-
int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
@@ -1244,22 +1098,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
- INIT_LIST_HEAD(&buf->rb_mws);
- INIT_LIST_HEAD(&buf->rb_all);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- rc = rpcrdma_init_frmrs(ia, buf);
- if (rc)
- goto out;
- break;
- case RPCRDMA_MTHCAFMR:
- rc = rpcrdma_init_fmrs(ia, buf);
- if (rc)
- goto out;
- break;
- default:
- break;
- }
+ rc = ia->ri_ops->ro_init(r_xprt);
+ if (rc)
+ goto out;
for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_req *req;
@@ -1311,47 +1152,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
kfree(req);
}
-static void
-rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_mw *r;
- int rc;
-
- while (!list_empty(&buf->rb_all)) {
- r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
- list_del(&r->mw_all);
- list_del(&r->mw_list);
-
- rc = ib_dealloc_fmr(r->r.fmr);
- if (rc)
- dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
- __func__, rc);
-
- kfree(r);
- }
-}
-
-static void
-rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_mw *r;
- int rc;
-
- while (!list_empty(&buf->rb_all)) {
- r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
- list_del(&r->mw_all);
- list_del(&r->mw_list);
-
- rc = ib_dereg_mr(r->r.frmr.fr_mr);
- if (rc)
- dprintk("RPC: %s: ib_dereg_mr failed %i\n",
- __func__, rc);
- ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-
- kfree(r);
- }
-}
-
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
@@ -1372,104 +1172,11 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
}
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- rpcrdma_destroy_frmrs(buf);
- break;
- case RPCRDMA_MTHCAFMR:
- rpcrdma_destroy_fmrs(buf);
- break;
- default:
- break;
- }
+ ia->ri_ops->ro_destroy(buf);
kfree(buf->rb_pool);
}
-/* After a disconnect, unmap all FMRs.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_fmr_external().
- */
-static void
-rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
-{
- struct rpcrdma_xprt *r_xprt =
- container_of(ia, struct rpcrdma_xprt, rx_ia);
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct list_head *pos;
- struct rpcrdma_mw *r;
- LIST_HEAD(l);
- int rc;
-
- list_for_each(pos, &buf->rb_all) {
- r = list_entry(pos, struct rpcrdma_mw, mw_all);
-
- INIT_LIST_HEAD(&l);
- list_add(&r->r.fmr->list, &l);
- rc = ib_unmap_fmr(&l);
- if (rc)
- dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
- __func__, rc);
- }
-}
-
-/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
- * an unusable state. Find FRMRs in this state and dereg / reg
- * each. FRMRs that are VALID and attached to an rpcrdma_req are
- * also torn down.
- *
- * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_frmr_external().
- */
-static void
-rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
-{
- struct rpcrdma_xprt *r_xprt =
- container_of(ia, struct rpcrdma_xprt, rx_ia);
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct list_head *pos;
- struct rpcrdma_mw *r;
- int rc;
-
- list_for_each(pos, &buf->rb_all) {
- r = list_entry(pos, struct rpcrdma_mw, mw_all);
-
- if (r->r.frmr.fr_state == FRMR_IS_INVALID)
- continue;
-
- rc = ib_dereg_mr(r->r.frmr.fr_mr);
- if (rc)
- dprintk("RPC: %s: ib_dereg_mr failed %i\n",
- __func__, rc);
- ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-
- r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
- ia->ri_max_frmr_depth);
- if (IS_ERR(r->r.frmr.fr_mr)) {
- rc = PTR_ERR(r->r.frmr.fr_mr);
- dprintk("RPC: %s: ib_alloc_fast_reg_mr"
- " failed %i\n", __func__, rc);
- continue;
- }
- r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
- ia->ri_id->device,
- ia->ri_max_frmr_depth);
- if (IS_ERR(r->r.frmr.fr_pgl)) {
- rc = PTR_ERR(r->r.frmr.fr_pgl);
- dprintk("RPC: %s: "
- "ib_alloc_fast_reg_page_list "
- "failed %i\n", __func__, rc);
-
- ib_dereg_mr(r->r.frmr.fr_mr);
- continue;
- }
- r->r.frmr.fr_state = FRMR_IS_INVALID;
- }
-}
-
/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
* some req segments uninitialized.
*/
@@ -1509,7 +1216,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
}
}
-/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
+/* rpcrdma_unmap_one() was already done during deregistration.
* Redo only the ib_post_send().
*/
static void
@@ -1729,6 +1436,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
* Wrappers for internal-use kmalloc memory registration, used by buffer code.
*/
+void
+rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
+{
+ dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
+ seg->mr_offset,
+ (unsigned long long)seg->mr_dma, seg->mr_dmalen);
+}
+
static int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
struct ib_mr **mrp, struct ib_sge *iov)
@@ -1854,287 +1569,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
}
/*
- * Wrappers for chunk registration, shared by read/write chunk code.
- */
-
-static void
-rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
-{
- seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
- seg->mr_dmalen = seg->mr_len;
- if (seg->mr_page)
- seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
- seg->mr_page, offset_in_page(seg->mr_offset),
- seg->mr_dmalen, seg->mr_dir);
- else
- seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
- seg->mr_offset,
- seg->mr_dmalen, seg->mr_dir);
- if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
- dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
- __func__,
- (unsigned long long)seg->mr_dma,
- seg->mr_offset, seg->mr_dmalen);
- }
-}
-
-static void
-rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
-{
- if (seg->mr_page)
- ib_dma_unmap_page(ia->ri_id->device,
- seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
- else
- ib_dma_unmap_single(ia->ri_id->device,
- seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-}
-
-static int
-rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
- int *nsegs, int writing, struct rpcrdma_ia *ia,
- struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- struct rpcrdma_frmr *frmr = &mw->r.frmr;
- struct ib_mr *mr = frmr->fr_mr;
- struct ib_send_wr fastreg_wr, *bad_wr;
- u8 key;
- int len, pageoff;
- int i, rc;
- int seg_len;
- u64 pa;
- int page_no;
-
- pageoff = offset_in_page(seg1->mr_offset);
- seg1->mr_offset -= pageoff; /* start of page */
- seg1->mr_len += pageoff;
- len = -pageoff;
- if (*nsegs > ia->ri_max_frmr_depth)
- *nsegs = ia->ri_max_frmr_depth;
- for (page_no = i = 0; i < *nsegs;) {
- rpcrdma_map_one(ia, seg, writing);
- pa = seg->mr_dma;
- for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
- frmr->fr_pgl->page_list[page_no++] = pa;
- pa += PAGE_SIZE;
- }
- len += seg->mr_len;
- ++seg;
- ++i;
- /* Check for holes */
- if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
- offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
- break;
- }
- dprintk("RPC: %s: Using frmr %p to map %d segments\n",
- __func__, mw, i);
-
- frmr->fr_state = FRMR_IS_VALID;
-
- memset(&fastreg_wr, 0, sizeof(fastreg_wr));
- fastreg_wr.wr_id = (unsigned long)(void *)mw;
- fastreg_wr.opcode = IB_WR_FAST_REG_MR;
- fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
- fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
- fastreg_wr.wr.fast_reg.page_list_len = page_no;
- fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
- if (fastreg_wr.wr.fast_reg.length < len) {
- rc = -EIO;
- goto out_err;
- }
-
- /* Bump the key */
- key = (u8)(mr->rkey & 0x000000FF);
- ib_update_fast_reg_key(mr, ++key);
-
- fastreg_wr.wr.fast_reg.access_flags = (writing ?
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
- IB_ACCESS_REMOTE_READ);
- fastreg_wr.wr.fast_reg.rkey = mr->rkey;
- DECR_CQCOUNT(&r_xprt->rx_ep);
-
- rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
- if (rc) {
- dprintk("RPC: %s: failed ib_post_send for register,"
- " status %i\n", __func__, rc);
- ib_update_fast_reg_key(mr, --key);
- goto out_err;
- } else {
- seg1->mr_rkey = mr->rkey;
- seg1->mr_base = seg1->mr_dma + pageoff;
- seg1->mr_nsegs = i;
- seg1->mr_len = len;
- }
- *nsegs = i;
- return 0;
-out_err:
- frmr->fr_state = FRMR_IS_INVALID;
- while (i--)
- rpcrdma_unmap_one(ia, --seg);
- return rc;
-}
-
-static int
-rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
- struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_mr_seg *seg1 = seg;
- struct ib_send_wr invalidate_wr, *bad_wr;
- int rc;
-
- seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
-
- memset(&invalidate_wr, 0, sizeof invalidate_wr);
- invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
- invalidate_wr.opcode = IB_WR_LOCAL_INV;
- invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
- DECR_CQCOUNT(&r_xprt->rx_ep);
-
- read_lock(&ia->ri_qplock);
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
- rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
- read_unlock(&ia->ri_qplock);
- if (rc) {
- /* Force rpcrdma_buffer_get() to retry */
- seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
- dprintk("RPC: %s: failed ib_post_send for invalidate,"
- " status %i\n", __func__, rc);
- }
- return rc;
-}
-
-static int
-rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
- int *nsegs, int writing, struct rpcrdma_ia *ia)
-{
- struct rpcrdma_mr_seg *seg1 = seg;
- u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
- int len, pageoff, i, rc;
-
- pageoff = offset_in_page(seg1->mr_offset);
- seg1->mr_offset -= pageoff; /* start of page */
- seg1->mr_len += pageoff;
- len = -pageoff;
- if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
- *nsegs = RPCRDMA_MAX_DATA_SEGS;
- for (i = 0; i < *nsegs;) {
- rpcrdma_map_one(ia, seg, writing);
- physaddrs[i] = seg->mr_dma;
- len += seg->mr_len;
- ++seg;
- ++i;
- /* Check for holes */
- if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
- offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
- break;
- }
- rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
- if (rc) {
- dprintk("RPC: %s: failed ib_map_phys_fmr "
- "%u@0x%llx+%i (%d)... status %i\n", __func__,
- len, (unsigned long long)seg1->mr_dma,
- pageoff, i, rc);
- while (i--)
- rpcrdma_unmap_one(ia, --seg);
- } else {
- seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
- seg1->mr_base = seg1->mr_dma + pageoff;
- seg1->mr_nsegs = i;
- seg1->mr_len = len;
- }
- *nsegs = i;
- return rc;
-}
-
-static int
-rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
- struct rpcrdma_ia *ia)
-{
- struct rpcrdma_mr_seg *seg1 = seg;
- LIST_HEAD(l);
- int rc;
-
- list_add(&seg1->rl_mw->r.fmr->list, &l);
- rc = ib_unmap_fmr(&l);
- read_lock(&ia->ri_qplock);
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
- read_unlock(&ia->ri_qplock);
- if (rc)
- dprintk("RPC: %s: failed ib_unmap_fmr,"
- " status %i\n", __func__, rc);
- return rc;
-}
-
-int
-rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
- int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- int rc = 0;
-
- switch (ia->ri_memreg_strategy) {
-
- case RPCRDMA_ALLPHYSICAL:
- rpcrdma_map_one(ia, seg, writing);
- seg->mr_rkey = ia->ri_bind_mem->rkey;
- seg->mr_base = seg->mr_dma;
- seg->mr_nsegs = 1;
- nsegs = 1;
- break;
-
- /* Registration using frmr registration */
- case RPCRDMA_FRMR:
- rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
- break;
-
- /* Registration using fmr memory registration */
- case RPCRDMA_MTHCAFMR:
- rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
- break;
-
- default:
- return -EIO;
- }
- if (rc)
- return rc;
-
- return nsegs;
-}
-
-int
-rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
- struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- int nsegs = seg->mr_nsegs, rc;
-
- switch (ia->ri_memreg_strategy) {
-
- case RPCRDMA_ALLPHYSICAL:
- read_lock(&ia->ri_qplock);
- rpcrdma_unmap_one(ia, seg);
- read_unlock(&ia->ri_qplock);
- break;
-
- case RPCRDMA_FRMR:
- rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
- break;
-
- case RPCRDMA_MTHCAFMR:
- rc = rpcrdma_deregister_fmr_external(seg, ia);
- break;
-
- default:
- break;
- }
- return nsegs;
-}
-
-/*
* Prepost any receive buffer, then post send.
*
* Receive buffer is donated to hardware, reclaimed upon recv completion.
@@ -2156,7 +1590,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
}
send_wr.next = NULL;
- send_wr.wr_id = 0ULL; /* no send cookie */
+ send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
send_wr.sg_list = req->rl_send_iov;
send_wr.num_sge = req->rl_niovs;
send_wr.opcode = IB_WR_SEND;
@@ -2215,43 +1649,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
return rc;
}
-/* Physical mapping means one Read/Write list entry per-page.
- * All list entries must fit within an inline buffer
- *
- * NB: The server must return a Write list for NFS READ,
- * which has the same constraint. Factor in the inline
- * rsize as well.
+/* How many chunk list items fit within our inline buffers?
*/
-static size_t
-rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+unsigned int
+rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- unsigned int inline_size, pages;
+ int bytes, segments;
- inline_size = min_t(unsigned int,
- cdata->inline_wsize, cdata->inline_rsize);
- inline_size -= RPCRDMA_HDRLEN_MIN;
- pages = inline_size / sizeof(struct rpcrdma_segment);
- return pages << PAGE_SHIFT;
-}
-
-static size_t
-rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
-{
- return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
-}
-
-size_t
-rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
-{
- size_t result;
-
- switch (r_xprt->rx_ia.ri_memreg_strategy) {
- case RPCRDMA_ALLPHYSICAL:
- result = rpcrdma_physical_max_payload(r_xprt);
- break;
- default:
- result = rpcrdma_mr_max_payload(r_xprt);
+ bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
+ bytes -= RPCRDMA_HDRLEN_MIN;
+ if (bytes < sizeof(struct rpcrdma_segment) * 2) {
+ pr_warn("RPC: %s: inline threshold too small\n",
+ __func__);
+ return 0;
}
- return result;
+
+ segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
+ dprintk("RPC: %s: max chunk list size = %d segments\n",
+ __func__, segments);
+ return segments;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0a16fb6f0885..78e0b8beaa36 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -60,6 +60,7 @@
* Interface Adapter -- one per transport instance
*/
struct rpcrdma_ia {
+ const struct rpcrdma_memreg_ops *ri_ops;
rwlock_t ri_qplock;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
@@ -105,6 +106,10 @@ struct rpcrdma_ep {
#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
+/* Force completion handler to ignore the signal
+ */
+#define RPCRDMA_IGNORE_COMPLETION (0ULL)
+
/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
*
* The below structure appears at the front of a large region of kmalloc'd
@@ -143,14 +148,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
return (struct rpcrdma_msg *)rb->rg_base;
}
-enum rpcrdma_chunktype {
- rpcrdma_noch = 0,
- rpcrdma_readch,
- rpcrdma_areadch,
- rpcrdma_writech,
- rpcrdma_replych
-};
-
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
* and complete a reply, asychronously. It needs several pieces of
@@ -213,6 +210,7 @@ struct rpcrdma_mw {
struct ib_fmr *fmr;
struct rpcrdma_frmr frmr;
} r;
+ void (*mw_sendcompletion)(struct ib_wc *);
struct list_head mw_list;
struct list_head mw_all;
};
@@ -258,7 +256,6 @@ struct rpcrdma_req {
unsigned int rl_niovs; /* 0, 2 or 4 */
unsigned int rl_nchunks; /* non-zero if chunks */
unsigned int rl_connect_cookie; /* retry detection */
- enum rpcrdma_chunktype rl_rtype, rl_wtype;
struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
struct ib_sge rl_send_iov[4]; /* for active requests */
@@ -340,6 +337,29 @@ struct rpcrdma_stats {
};
/*
+ * Per-registration mode operations
+ */
+struct rpcrdma_xprt;
+struct rpcrdma_memreg_ops {
+ int (*ro_map)(struct rpcrdma_xprt *,
+ struct rpcrdma_mr_seg *, int, bool);
+ int (*ro_unmap)(struct rpcrdma_xprt *,
+ struct rpcrdma_mr_seg *);
+ int (*ro_open)(struct rpcrdma_ia *,
+ struct rpcrdma_ep *,
+ struct rpcrdma_create_data_internal *);
+ size_t (*ro_maxpages)(struct rpcrdma_xprt *);
+ int (*ro_init)(struct rpcrdma_xprt *);
+ void (*ro_reset)(struct rpcrdma_xprt *);
+ void (*ro_destroy)(struct rpcrdma_buffer *);
+ const char *ro_displayname;
+};
+
+extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
+
+/*
* RPCRDMA transport -- encapsulates the structures above for
* integration with RPC.
*
@@ -398,16 +418,56 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
-int rpcrdma_register_external(struct rpcrdma_mr_seg *,
- int, int, struct rpcrdma_xprt *);
-int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
- struct rpcrdma_xprt *);
-
struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
size_t, gfp_t);
void rpcrdma_free_regbuf(struct rpcrdma_ia *,
struct rpcrdma_regbuf *);
+unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
+
+/*
+ * Wrappers for chunk registration, shared by read/write chunk code.
+ */
+
+void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
+
+static inline enum dma_data_direction
+rpcrdma_data_dir(bool writing)
+{
+ return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+}
+
+static inline void
+rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
+ enum dma_data_direction direction)
+{
+ seg->mr_dir = direction;
+ seg->mr_dmalen = seg->mr_len;
+
+ if (seg->mr_page)
+ seg->mr_dma = ib_dma_map_page(device,
+ seg->mr_page, offset_in_page(seg->mr_offset),
+ seg->mr_dmalen, seg->mr_dir);
+ else
+ seg->mr_dma = ib_dma_map_single(device,
+ seg->mr_offset,
+ seg->mr_dmalen, seg->mr_dir);
+
+ if (ib_dma_mapping_error(device, seg->mr_dma))
+ rpcrdma_mapping_error(seg);
+}
+
+static inline void
+rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
+{
+ if (seg->mr_page)
+ ib_dma_unmap_page(device,
+ seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+ else
+ ib_dma_unmap_single(device,
+ seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+}
+
/*
* RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
*/
@@ -418,9 +478,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
/*
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
*/
-ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
int rpcrdma_marshal_req(struct rpc_rqst *);
-size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
/* Temporary NFS request map cache. Created in svc_rdma.c */
extern struct kmem_cache *svc_rdma_map_cachep;
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 46568b85c333..055453d48668 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -338,7 +338,7 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
fi, tos, type, nlflags,
tb_id);
if (!err)
- fi->fib_flags |= RTNH_F_EXTERNAL;
+ fi->fib_flags |= RTNH_F_OFFLOAD;
}
return err;
@@ -364,7 +364,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
const struct swdev_ops *ops;
int err = 0;
- if (!(fi->fib_flags & RTNH_F_EXTERNAL))
+ if (!(fi->fib_flags & RTNH_F_OFFLOAD))
return 0;
dev = netdev_switch_get_dev_by_nhs(fi);
@@ -376,7 +376,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len,
fi, tos, type, tb_id);
if (!err)
- fi->fib_flags &= ~RTNH_F_EXTERNAL;
+ fi->fib_flags &= ~RTNH_F_OFFLOAD;
}
return err;
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 3613e72e858e..70e3dacbf84a 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -591,14 +591,14 @@ void tipc_bearer_stop(struct net *net)
/* Caller should hold rtnl_lock to protect the bearer */
static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
- struct tipc_bearer *bearer)
+ struct tipc_bearer *bearer, int nlflags)
{
void *hdr;
struct nlattr *attrs;
struct nlattr *prop;
hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
- NLM_F_MULTI, TIPC_NL_BEARER_GET);
+ nlflags, TIPC_NL_BEARER_GET);
if (!hdr)
return -EMSGSIZE;
@@ -657,7 +657,7 @@ int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (!bearer)
continue;
- err = __tipc_nl_add_bearer(&msg, bearer);
+ err = __tipc_nl_add_bearer(&msg, bearer, NLM_F_MULTI);
if (err)
break;
}
@@ -705,7 +705,7 @@ int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info)
goto err_out;
}
- err = __tipc_nl_add_bearer(&msg, bearer);
+ err = __tipc_nl_add_bearer(&msg, bearer, 0);
if (err)
goto err_out;
rtnl_unlock();
@@ -857,14 +857,14 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
}
static int __tipc_nl_add_media(struct tipc_nl_msg *msg,
- struct tipc_media *media)
+ struct tipc_media *media, int nlflags)
{
void *hdr;
struct nlattr *attrs;
struct nlattr *prop;
hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
- NLM_F_MULTI, TIPC_NL_MEDIA_GET);
+ nlflags, TIPC_NL_MEDIA_GET);
if (!hdr)
return -EMSGSIZE;
@@ -916,7 +916,8 @@ int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb)
rtnl_lock();
for (; media_info_array[i] != NULL; i++) {
- err = __tipc_nl_add_media(&msg, media_info_array[i]);
+ err = __tipc_nl_add_media(&msg, media_info_array[i],
+ NLM_F_MULTI);
if (err)
break;
}
@@ -963,7 +964,7 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info)
goto err_out;
}
- err = __tipc_nl_add_media(&msg, media);
+ err = __tipc_nl_add_media(&msg, media, 0);
if (err)
goto err_out;
rtnl_unlock();
diff --git a/net/tipc/link.c b/net/tipc/link.c
index a6b30df6ec02..43a515dc97b0 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1145,11 +1145,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr)
}
/* Synchronize with parallel link if applicable */
if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) {
- link_handle_out_of_seq_msg(l_ptr, skb);
- if (link_synch(l_ptr))
- link_retrieve_defq(l_ptr, &head);
- skb = NULL;
- goto unlock;
+ if (!link_synch(l_ptr))
+ goto unlock;
}
l_ptr->next_in_no++;
if (unlikely(!skb_queue_empty(&l_ptr->deferdq)))
@@ -2013,7 +2010,7 @@ msg_full:
/* Caller should hold appropriate locks to protect the link */
static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
- struct tipc_link *link)
+ struct tipc_link *link, int nlflags)
{
int err;
void *hdr;
@@ -2022,7 +2019,7 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
struct tipc_net *tn = net_generic(net, tipc_net_id);
hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
- NLM_F_MULTI, TIPC_NL_LINK_GET);
+ nlflags, TIPC_NL_LINK_GET);
if (!hdr)
return -EMSGSIZE;
@@ -2095,7 +2092,7 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg,
if (!node->links[i])
continue;
- err = __tipc_nl_add_link(net, msg, node->links[i]);
+ err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI);
if (err)
return err;
}
@@ -2143,7 +2140,6 @@ int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb)
err = __tipc_nl_add_node_links(net, &msg, node,
&prev_link);
tipc_node_unlock(node);
- tipc_node_put(node);
if (err)
goto out;
@@ -2210,7 +2206,7 @@ int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info)
goto err_out;
}
- err = __tipc_nl_add_link(net, &msg, link);
+ err = __tipc_nl_add_link(net, &msg, link, 0);
if (err)
goto err_out;
diff --git a/net/tipc/server.c b/net/tipc/server.c
index ab6183cdb121..77ff03ed1e18 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -102,7 +102,7 @@ static void tipc_conn_kref_release(struct kref *kref)
}
saddr->scope = -TIPC_NODE_SCOPE;
kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr));
- sk_release_kernel(sk);
+ sock_release(sock);
con->sock = NULL;
}
@@ -321,12 +321,9 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con)
struct socket *sock = NULL;
int ret;
- ret = sock_create_kern(AF_TIPC, SOCK_SEQPACKET, 0, &sock);
+ ret = __sock_create(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock, 1);
if (ret < 0)
return NULL;
-
- sk_change_net(sock->sk, s->net);
-
ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE,
(char *)&s->imp, sizeof(s->imp));
if (ret < 0)
@@ -376,7 +373,7 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con)
create_err:
kernel_sock_shutdown(sock, SHUT_RDWR);
- sk_release_kernel(sock->sk);
+ sock_release(sock);
return NULL;
}
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index ee90d74d7516..9074b5cede38 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1764,13 +1764,14 @@ static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
{
u32 dnode, dport = 0;
- int err = -TIPC_ERR_NO_PORT;
+ int err;
struct sk_buff *skb;
struct tipc_sock *tsk;
struct tipc_net *tn;
struct sock *sk;
while (skb_queue_len(inputq)) {
+ err = -TIPC_ERR_NO_PORT;
skb = NULL;
dport = tipc_skb_peek_port(inputq, dport);
tsk = tipc_sk_lookup(net, dport);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 433f287ee548..06430598cf51 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -305,7 +305,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
&unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
struct dentry *dentry = unix_sk(s)->path.dentry;
- if (dentry && dentry->d_inode == i) {
+ if (dentry && d_backing_inode(dentry) == i) {
sock_hold(s);
goto found;
}
@@ -778,7 +778,7 @@ static struct sock *unix_find_other(struct net *net,
err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
if (err)
goto fail;
- inode = path.dentry->d_inode;
+ inode = d_backing_inode(path.dentry);
err = inode_permission(inode, MAY_WRITE);
if (err)
goto put_fail;
@@ -839,7 +839,7 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
*/
err = security_path_mknod(&path, dentry, mode, 0);
if (!err) {
- err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
+ err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
if (!err) {
res->mnt = mntget(path.mnt);
res->dentry = dget(dentry);
@@ -905,7 +905,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out_up;
}
addr->hash = UNIX_HASH_SIZE;
- hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
+ hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
spin_lock(&unix_table_lock);
u->path = path;
list = &unix_socket_table[hash];
@@ -1880,6 +1880,10 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
unix_state_unlock(sk);
timeo = freezable_schedule_timeout(timeo);
unix_state_lock(sk);
+
+ if (sock_flag(sk, SOCK_DEAD))
+ break;
+
clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
}
@@ -1939,6 +1943,10 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
struct sk_buff *skb, *last;
unix_state_lock(sk);
+ if (sock_flag(sk, SOCK_DEAD)) {
+ err = -ECONNRESET;
+ goto unlock;
+ }
last = skb = skb_peek(&sk->sk_receive_queue);
again:
if (skb == NULL) {
diff --git a/net/unix/diag.c b/net/unix/diag.c
index ef542fbca9fe..c512f64d5287 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -25,7 +25,7 @@ static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb)
if (dentry) {
struct unix_diag_vfs uv = {
- .udiag_vfs_ino = dentry->d_inode->i_ino,
+ .udiag_vfs_ino = d_backing_inode(dentry)->i_ino,
.udiag_vfs_dev = dentry->d_sb->s_dev,
};
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 99f7012b23b9..a73a226f2d33 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -95,39 +95,36 @@ static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
unsigned int unix_tot_inflight;
-
struct sock *unix_get_socket(struct file *filp)
{
struct sock *u_sock = NULL;
struct inode *inode = file_inode(filp);
- /*
- * Socket ?
- */
+ /* Socket ? */
if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
struct socket *sock = SOCKET_I(inode);
struct sock *s = sock->sk;
- /*
- * PF_UNIX ?
- */
+ /* PF_UNIX ? */
if (s && sock->ops && sock->ops->family == PF_UNIX)
u_sock = s;
}
return u_sock;
}
-/*
- * Keep the number of times in flight count for the file
- * descriptor if it is for an AF_UNIX socket.
+/* Keep the number of times in flight count for the file
+ * descriptor if it is for an AF_UNIX socket.
*/
void unix_inflight(struct file *fp)
{
struct sock *s = unix_get_socket(fp);
+
if (s) {
struct unix_sock *u = unix_sk(s);
+
spin_lock(&unix_gc_lock);
+
if (atomic_long_inc_return(&u->inflight) == 1) {
BUG_ON(!list_empty(&u->link));
list_add_tail(&u->link, &gc_inflight_list);
@@ -142,10 +139,13 @@ void unix_inflight(struct file *fp)
void unix_notinflight(struct file *fp)
{
struct sock *s = unix_get_socket(fp);
+
if (s) {
struct unix_sock *u = unix_sk(s);
+
spin_lock(&unix_gc_lock);
BUG_ON(list_empty(&u->link));
+
if (atomic_long_dec_and_test(&u->inflight))
list_del_init(&u->link);
unix_tot_inflight--;
@@ -161,32 +161,27 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
spin_lock(&x->sk_receive_queue.lock);
skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
- /*
- * Do we have file descriptors ?
- */
+ /* Do we have file descriptors ? */
if (UNIXCB(skb).fp) {
bool hit = false;
- /*
- * Process the descriptors of this socket
- */
+ /* Process the descriptors of this socket */
int nfd = UNIXCB(skb).fp->count;
struct file **fp = UNIXCB(skb).fp->fp;
+
while (nfd--) {
- /*
- * Get the socket the fd matches
- * if it indeed does so
- */
+ /* Get the socket the fd matches if it indeed does so */
struct sock *sk = unix_get_socket(*fp++);
+
if (sk) {
struct unix_sock *u = unix_sk(sk);
- /*
- * Ignore non-candidates, they could
+ /* Ignore non-candidates, they could
* have been added to the queues after
* starting the garbage collection
*/
if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
hit = true;
+
func(u);
}
}
@@ -203,24 +198,22 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
struct sk_buff_head *hitlist)
{
- if (x->sk_state != TCP_LISTEN)
+ if (x->sk_state != TCP_LISTEN) {
scan_inflight(x, func, hitlist);
- else {
+ } else {
struct sk_buff *skb;
struct sk_buff *next;
struct unix_sock *u;
LIST_HEAD(embryos);
- /*
- * For a listening socket collect the queued embryos
+ /* For a listening socket collect the queued embryos
* and perform a scan on them as well.
*/
spin_lock(&x->sk_receive_queue.lock);
skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
u = unix_sk(skb->sk);
- /*
- * An embryo cannot be in-flight, so it's safe
+ /* An embryo cannot be in-flight, so it's safe
* to use the list link.
*/
BUG_ON(!list_empty(&u->link));
@@ -249,8 +242,7 @@ static void inc_inflight(struct unix_sock *usk)
static void inc_inflight_move_tail(struct unix_sock *u)
{
atomic_long_inc(&u->inflight);
- /*
- * If this still might be part of a cycle, move it to the end
+ /* If this still might be part of a cycle, move it to the end
* of the list, so that it's checked even if it was already
* passed over
*/
@@ -263,8 +255,7 @@ static bool gc_in_progress;
void wait_for_unix_gc(void)
{
- /*
- * If number of inflight sockets is insane,
+ /* If number of inflight sockets is insane,
* force a garbage collect right now.
*/
if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress)
@@ -288,8 +279,7 @@ void unix_gc(void)
goto out;
gc_in_progress = true;
- /*
- * First, select candidates for garbage collection. Only
+ /* First, select candidates for garbage collection. Only
* in-flight sockets are considered, and from those only ones
* which don't have any external reference.
*
@@ -320,15 +310,13 @@ void unix_gc(void)
}
}
- /*
- * Now remove all internal in-flight reference to children of
+ /* Now remove all internal in-flight reference to children of
* the candidates.
*/
list_for_each_entry(u, &gc_candidates, link)
scan_children(&u->sk, dec_inflight, NULL);
- /*
- * Restore the references for children of all candidates,
+ /* Restore the references for children of all candidates,
* which have remaining references. Do this recursively, so
* only those remain, which form cyclic references.
*
@@ -350,8 +338,7 @@ void unix_gc(void)
}
list_del(&cursor);
- /*
- * not_cycle_list contains those sockets which do not make up a
+ /* not_cycle_list contains those sockets which do not make up a
* cycle. Restore these to the inflight list.
*/
while (!list_empty(&not_cycle_list)) {
@@ -360,8 +347,7 @@ void unix_gc(void)
list_move_tail(&u->link, &gc_inflight_list);
}
- /*
- * Now gc_candidates contains only garbage. Restore original
+ /* Now gc_candidates contains only garbage. Restore original
* inflight counters for these as well, and remove the skbuffs
* which are creating the cycle(s).
*/