summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c3
-rw-r--r--net/ipv4/esp4.c36
-rw-r--r--net/ipv4/esp4_offload.c73
-rw-r--r--net/ipv4/inet_diag.c8
-rw-r--r--net/ipv4/ip_gre.c5
-rw-r--r--net/ipv4/netfilter.c62
-rw-r--r--net/ipv4/netfilter/Kconfig10
-rw-r--r--net/ipv4/netfilter/Makefile3
-rw-r--r--net/ipv4/netfilter/arp_tables.c26
-rw-r--r--net/ipv4/netfilter/ip_tables.c26
-rw-r--r--net/ipv4/netfilter/iptable_filter.c6
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c5
-rw-r--r--net/ipv4/netfilter/iptable_nat.c4
-rw-r--r--net/ipv4/netfilter/iptable_raw.c6
-rw-r--r--net/ipv4/netfilter/iptable_security.c6
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c7
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c4
-rw-r--r--net/ipv4/netfilter/nf_flow_table_ipv4.c284
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c10
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c13
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c36
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c3
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c8
-rw-r--r--net/ipv4/tcp.c30
-rw-r--r--net/ipv4/tcp_input.c3
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_output.c3
-rw-r--r--net/ipv4/tcp_probe.c301
-rw-r--r--net/ipv4/udp.c2
-rw-r--r--net/ipv4/xfrm4_input.c12
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c5
32 files changed, 448 insertions, 555 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index c6c8ad1d4b6d..47a0a6649a9d 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,7 +43,6 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
-obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index bab98a4fedad..54cccdd8b1e3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -790,7 +790,8 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int addr_len = 0;
int err;
- sock_rps_record_flow(sk);
+ if (likely(!(flags & MSG_ERRQUEUE)))
+ sock_rps_record_flow(sk);
err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index d57aa64fa7c7..6f00e43120a8 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -121,14 +121,32 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
+ struct xfrm_offload *xo = xfrm_offload(skb);
void *tmp;
- struct dst_entry *dst = skb_dst(skb);
- struct xfrm_state *x = dst->xfrm;
+ struct xfrm_state *x;
+
+ if (xo && (xo->flags & XFRM_DEV_RESUME))
+ x = skb->sp->xvec[skb->sp->len - 1];
+ else
+ x = skb_dst(skb)->xfrm;
tmp = ESP_SKB_CB(skb)->tmp;
esp_ssg_unref(x, tmp);
kfree(tmp);
- xfrm_output_resume(skb, err);
+
+ if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+ if (err) {
+ XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+ kfree_skb(skb);
+ return;
+ }
+
+ skb_push(skb, skb->data - skb_mac_header(skb));
+ secpath_reset(skb);
+ xfrm_dev_resume(skb);
+ } else {
+ xfrm_output_resume(skb, err);
+ }
}
/* Move ESP header back into place. */
@@ -825,17 +843,13 @@ static int esp_init_aead(struct xfrm_state *x)
char aead_name[CRYPTO_MAX_ALG_NAME];
struct crypto_aead *aead;
int err;
- u32 mask = 0;
err = -ENAMETOOLONG;
if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
goto error;
- if (x->xso.offload_handle)
- mask |= CRYPTO_ALG_ASYNC;
-
- aead = crypto_alloc_aead(aead_name, 0, mask);
+ aead = crypto_alloc_aead(aead_name, 0, 0);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
@@ -865,7 +879,6 @@ static int esp_init_authenc(struct xfrm_state *x)
char authenc_name[CRYPTO_MAX_ALG_NAME];
unsigned int keylen;
int err;
- u32 mask = 0;
err = -EINVAL;
if (!x->ealg)
@@ -891,10 +904,7 @@ static int esp_init_authenc(struct xfrm_state *x)
goto error;
}
- if (x->xso.offload_handle)
- mask |= CRYPTO_ALG_ASYNC;
-
- aead = crypto_alloc_aead(authenc_name, 0, mask);
+ aead = crypto_alloc_aead(authenc_name, 0, 0);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index f8b918c766b0..c359f3cfeec3 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -108,75 +108,36 @@ static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
- __u32 seq;
- int err = 0;
- struct sk_buff *skb2;
struct xfrm_state *x;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
- struct sk_buff *segs = ERR_PTR(-EINVAL);
netdev_features_t esp_features = features;
struct xfrm_offload *xo = xfrm_offload(skb);
if (!xo)
- goto out;
-
- seq = xo->seq.low;
+ return ERR_PTR(-EINVAL);
x = skb->sp->xvec[skb->sp->len - 1];
aead = x->data;
esph = ip_esp_hdr(skb);
if (esph->spi != x->id.spi)
- goto out;
+ return ERR_PTR(-EINVAL);
if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
- goto out;
+ return ERR_PTR(-EINVAL);
__skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead));
skb->encap_hdr_csum = 1;
- if (!(features & NETIF_F_HW_ESP))
+ if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
+ (x->xso.dev != skb->dev))
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
- segs = x->outer_mode->gso_segment(x, skb, esp_features);
- if (IS_ERR_OR_NULL(segs))
- goto out;
-
- __skb_pull(skb, skb->data - skb_mac_header(skb));
-
- skb2 = segs;
- do {
- struct sk_buff *nskb = skb2->next;
-
- xo = xfrm_offload(skb2);
- xo->flags |= XFRM_GSO_SEGMENT;
- xo->seq.low = seq;
- xo->seq.hi = xfrm_replay_seqhi(x, seq);
+ xo->flags |= XFRM_GSO_SEGMENT;
- if(!(features & NETIF_F_HW_ESP))
- xo->flags |= CRYPTO_FALLBACK;
-
- x->outer_mode->xmit(x, skb2);
-
- err = x->type_offload->xmit(x, skb2, esp_features);
- if (err) {
- kfree_skb_list(segs);
- return ERR_PTR(err);
- }
-
- if (!skb_is_gso(skb2))
- seq++;
- else
- seq += skb_shinfo(skb2)->gso_segs;
-
- skb_push(skb2, skb2->mac_len);
- skb2 = nskb;
- } while (skb2);
-
-out:
- return segs;
+ return x->outer_mode->gso_segment(x, skb, esp_features);
}
static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
@@ -203,6 +164,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
struct crypto_aead *aead;
struct esp_info esp;
bool hw_offload = true;
+ __u32 seq;
esp.inplace = true;
@@ -241,23 +203,30 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
return esp.nfrags;
}
+ seq = xo->seq.low;
+
esph = esp.esph;
esph->spi = x->id.spi;
skb_push(skb, -skb_network_offset(skb));
if (xo->flags & XFRM_GSO_SEGMENT) {
- esph->seq_no = htonl(xo->seq.low);
- } else {
- ip_hdr(skb)->tot_len = htons(skb->len);
- ip_send_check(ip_hdr(skb));
+ esph->seq_no = htonl(seq);
+
+ if (!skb_is_gso(skb))
+ xo->seq.low++;
+ else
+ xo->seq.low += skb_shinfo(skb)->gso_segs;
}
+ esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32));
+
+ ip_hdr(skb)->tot_len = htons(skb->len);
+ ip_send_check(ip_hdr(skb));
+
if (hw_offload)
return 0;
- esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
-
err = esp_output_tail(x, skb, &esp);
if (err)
return err;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c9c35b61a027..a383f299ce24 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -564,12 +564,18 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
case INET_DIAG_BC_JMP:
yes = 0;
break;
+ case INET_DIAG_BC_S_EQ:
+ yes = entry->sport == op[1].no;
+ break;
case INET_DIAG_BC_S_GE:
yes = entry->sport >= op[1].no;
break;
case INET_DIAG_BC_S_LE:
yes = entry->sport <= op[1].no;
break;
+ case INET_DIAG_BC_D_EQ:
+ yes = entry->dport == op[1].no;
+ break;
case INET_DIAG_BC_D_GE:
yes = entry->dport >= op[1].no;
break;
@@ -802,8 +808,10 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
if (!valid_devcond(bc, len, &min_len))
return -EINVAL;
break;
+ case INET_DIAG_BC_S_EQ:
case INET_DIAG_BC_S_GE:
case INET_DIAG_BC_S_LE:
+ case INET_DIAG_BC_D_EQ:
case INET_DIAG_BC_D_GE:
case INET_DIAG_BC_D_LE:
if (!valid_port_comparison(bc, len, &min_len))
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 78365094f56c..b61f2285816d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -313,11 +313,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
return PACKET_REJECT;
md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
- if (!md) {
- dst_release((struct dst_entry *)tun_dst);
- return PACKET_REJECT;
- }
-
memcpy(md, pkt_md, sizeof(*md));
md->version = ver;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c0cc6aa8cfaa..e6774ccb7731 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -80,35 +80,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
}
EXPORT_SYMBOL(ip_route_me_harder);
-/*
- * Extra routing may needed on local out, as the QUEUE target never
- * returns control to the table.
- */
-
-struct ip_rt_info {
- __be32 daddr;
- __be32 saddr;
- u_int8_t tos;
- u_int32_t mark;
-};
-
-static void nf_ip_saveroute(const struct sk_buff *skb,
- struct nf_queue_entry *entry)
-{
- struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
-
- if (entry->state.hook == NF_INET_LOCAL_OUT) {
- const struct iphdr *iph = ip_hdr(skb);
-
- rt_info->tos = iph->tos;
- rt_info->daddr = iph->daddr;
- rt_info->saddr = iph->saddr;
- rt_info->mark = skb->mark;
- }
-}
-
-static int nf_ip_reroute(struct net *net, struct sk_buff *skb,
- const struct nf_queue_entry *entry)
+int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
{
const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
@@ -119,10 +91,12 @@ static int nf_ip_reroute(struct net *net, struct sk_buff *skb,
skb->mark == rt_info->mark &&
iph->daddr == rt_info->daddr &&
iph->saddr == rt_info->saddr))
- return ip_route_me_harder(net, skb, RTN_UNSPEC);
+ return ip_route_me_harder(entry->state.net, skb,
+ RTN_UNSPEC);
}
return 0;
}
+EXPORT_SYMBOL_GPL(nf_ip_reroute);
__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
unsigned int dataoff, u_int8_t protocol)
@@ -155,9 +129,9 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
}
EXPORT_SYMBOL(nf_ip_checksum);
-static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, unsigned int len,
- u_int8_t protocol)
+__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, unsigned int len,
+ u_int8_t protocol)
{
const struct iphdr *iph = ip_hdr(skb);
__sum16 csum = 0;
@@ -175,9 +149,10 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
}
return csum;
}
+EXPORT_SYMBOL_GPL(nf_ip_checksum_partial);
-static int nf_ip_route(struct net *net, struct dst_entry **dst,
- struct flowi *fl, bool strict __always_unused)
+int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
+ bool strict __always_unused)
{
struct rtable *rt = ip_route_output_key(net, &fl->u.ip4);
if (IS_ERR(rt))
@@ -185,19 +160,4 @@ static int nf_ip_route(struct net *net, struct dst_entry **dst,
*dst = &rt->dst;
return 0;
}
-
-static const struct nf_afinfo nf_ip_afinfo = {
- .family = AF_INET,
- .checksum = nf_ip_checksum,
- .checksum_partial = nf_ip_checksum_partial,
- .route = nf_ip_route,
- .saveroute = nf_ip_saveroute,
- .reroute = nf_ip_reroute,
- .route_key_size = sizeof(struct ip_rt_info),
-};
-
-static int __init ipv4_netfilter_init(void)
-{
- return nf_register_afinfo(&nf_ip_afinfo);
-}
-subsys_initcall(ipv4_netfilter_init);
+EXPORT_SYMBOL_GPL(nf_ip_route);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index c11eb1744ab1..7d5d444964aa 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -72,11 +72,20 @@ endif # NF_TABLES_IPV4
config NF_TABLES_ARP
tristate "ARP nf_tables support"
+ select NETFILTER_FAMILY_ARP
help
This option enables the ARP support for nf_tables.
endif # NF_TABLES
+config NF_FLOW_TABLE_IPV4
+ select NF_FLOW_TABLE
+ tristate "Netfilter flow table IPv4 module"
+ help
+ This option adds the flow table IPv4 support.
+
+ To compile it as a module, choose M here.
+
config NF_DUP_IPV4
tristate "Netfilter IPv4 packet duplication to alternate destination"
depends on !NF_CONNTRACK || NF_CONNTRACK
@@ -392,6 +401,7 @@ endif # IP_NF_IPTABLES
config IP_NF_ARPTABLES
tristate "ARP tables support"
select NETFILTER_XTABLES
+ select NETFILTER_FAMILY_ARP
depends on NETFILTER_ADVANCED
help
arptables is a general, extensible packet identification framework.
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index adcdae358365..8bb1f0c7a375 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -43,6 +43,9 @@ obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
+# flow table support
+obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o
+
# generic IP tables
obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 0c3c944a7b72..bf8a5340f15e 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -810,9 +810,8 @@ static int get_info(struct net *net, void __user *user,
if (compat)
xt_compat_lock(NFPROTO_ARP);
#endif
- t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
- "arptable_%s", name);
- if (t) {
+ t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
+ if (!IS_ERR(t)) {
struct arpt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -841,7 +840,7 @@ static int get_info(struct net *net, void __user *user,
xt_table_unlock(t);
module_put(t->me);
} else
- ret = -ENOENT;
+ ret = PTR_ERR(t);
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_unlock(NFPROTO_ARP);
@@ -866,7 +865,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (t) {
+ if (!IS_ERR(t)) {
const struct xt_table_info *private = t->private;
if (get.size == private->size)
@@ -878,7 +877,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = -ENOENT;
+ ret = PTR_ERR(t);
return ret;
}
@@ -903,10 +902,9 @@ static int __do_replace(struct net *net, const char *name,
goto out;
}
- t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
- "arptable_%s", name);
- if (!t) {
- ret = -ENOENT;
+ t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
goto free_newinfo_counters_untrans;
}
@@ -1020,8 +1018,8 @@ static int do_add_counters(struct net *net, const void __user *user,
return PTR_ERR(paddc);
t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name);
- if (!t) {
- ret = -ENOENT;
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
goto free;
}
@@ -1408,7 +1406,7 @@ static int compat_get_entries(struct net *net,
xt_compat_lock(NFPROTO_ARP);
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (t) {
+ if (!IS_ERR(t)) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
@@ -1423,7 +1421,7 @@ static int compat_get_entries(struct net *net,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = -ENOENT;
+ ret = PTR_ERR(t);
xt_compat_unlock(NFPROTO_ARP);
return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 2e0d339028bb..0b975aa2d363 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -973,9 +973,8 @@ static int get_info(struct net *net, void __user *user,
if (compat)
xt_compat_lock(AF_INET);
#endif
- t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
- "iptable_%s", name);
- if (t) {
+ t = xt_request_find_table_lock(net, AF_INET, name);
+ if (!IS_ERR(t)) {
struct ipt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -1005,7 +1004,7 @@ static int get_info(struct net *net, void __user *user,
xt_table_unlock(t);
module_put(t->me);
} else
- ret = -ENOENT;
+ ret = PTR_ERR(t);
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_unlock(AF_INET);
@@ -1030,7 +1029,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET, get.name);
- if (t) {
+ if (!IS_ERR(t)) {
const struct xt_table_info *private = t->private;
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
@@ -1041,7 +1040,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = -ENOENT;
+ ret = PTR_ERR(t);
return ret;
}
@@ -1064,10 +1063,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
goto out;
}
- t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
- "iptable_%s", name);
- if (!t) {
- ret = -ENOENT;
+ t = xt_request_find_table_lock(net, AF_INET, name);
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
goto free_newinfo_counters_untrans;
}
@@ -1181,8 +1179,8 @@ do_add_counters(struct net *net, const void __user *user,
return PTR_ERR(paddc);
t = xt_find_table_lock(net, AF_INET, tmp.name);
- if (!t) {
- ret = -ENOENT;
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
goto free;
}
@@ -1625,7 +1623,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
xt_compat_lock(AF_INET);
t = xt_find_table_lock(net, AF_INET, get.name);
- if (t) {
+ if (!IS_ERR(t)) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
ret = compat_table_info(private, &info);
@@ -1639,7 +1637,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = -ENOENT;
+ ret = PTR_ERR(t);
xt_compat_unlock(AF_INET);
return ret;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 7667f223d7f8..9ac92ea7b93c 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -38,12 +38,6 @@ static unsigned int
iptable_filter_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (state->hook == NF_INET_LOCAL_OUT &&
- (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr)))
- /* root is playing with raw sockets. */
- return NF_ACCEPT;
-
return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
}
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index aebdb337fd7e..dea138ca8925 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -49,11 +49,6 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
u_int32_t mark;
int err;
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
- return NF_ACCEPT;
-
/* Save things which could affect route */
mark = skb->mark;
iph = ip_hdr(skb);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a1a07b338ccf..0f7255cc65ee 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -72,6 +72,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
{
.hook = iptable_nat_ipv4_in,
.pf = NFPROTO_IPV4,
+ .nat_hook = true,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
},
@@ -79,6 +80,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
{
.hook = iptable_nat_ipv4_out,
.pf = NFPROTO_IPV4,
+ .nat_hook = true,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
},
@@ -86,6 +88,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
{
.hook = iptable_nat_ipv4_local_fn,
.pf = NFPROTO_IPV4,
+ .nat_hook = true,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
},
@@ -93,6 +96,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
{
.hook = iptable_nat_ipv4_fn,
.pf = NFPROTO_IPV4,
+ .nat_hook = true,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
},
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 2642ecd2645c..a869d1fea7d9 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -26,12 +26,6 @@ static unsigned int
iptable_raw_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (state->hook == NF_INET_LOCAL_OUT &&
- (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr)))
- /* root is playing with raw sockets. */
- return NF_ACCEPT;
-
return ipt_do_table(skb, state, state->net->ipv4.iptable_raw);
}
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index ff226596e4b5..e5379fe57b64 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -43,12 +43,6 @@ static unsigned int
iptable_security_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (state->hook == NF_INET_LOCAL_OUT &&
- (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr)))
- /* Somebody is playing with raw sockets. */
- return NF_ACCEPT;
-
return ipt_do_table(skb, state, state->net->ipv4.iptable_security);
}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 89af9d88ca21..de213a397ea8 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -154,11 +154,6 @@ static unsigned int ipv4_conntrack_local(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
- return NF_ACCEPT;
-
if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */
return NF_ACCEPT;
@@ -368,7 +363,7 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
MODULE_ALIAS("ip_conntrack");
MODULE_LICENSE("GPL");
-static struct nf_conntrack_l4proto *builtin_l4proto4[] = {
+static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = {
&nf_conntrack_l4proto_tcp4,
&nf_conntrack_l4proto_udp4,
&nf_conntrack_l4proto_icmp,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 1849fedd9b81..5c15beafa711 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -22,7 +22,7 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_log.h>
-static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
+static const unsigned int nf_ct_icmp_timeout = 30*HZ;
static inline struct nf_icmp_net *icmp_pernet(struct net *net)
{
@@ -351,7 +351,7 @@ static struct nf_proto_net *icmp_get_net_proto(struct net *net)
return &net->ct.nf_ct_proto.icmp.pn;
}
-struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
{
.l3proto = PF_INET,
.l4proto = IPPROTO_ICMP,
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
new file mode 100644
index 000000000000..b2d01eb25f2c
--- /dev/null
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -0,0 +1,284 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/neighbour.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_tables.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
+{
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ return -1;
+
+ tcph = (void *)(skb_network_header(skb) + thoff);
+ inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
+
+ return 0;
+}
+
+static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
+{
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ return -1;
+
+ udph = (void *)(skb_network_header(skb) + thoff);
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace4(&udph->check, skb, addr,
+ new_addr, true);
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return 0;
+}
+
+static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, __be32 addr,
+ __be32 new_addr)
+{
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ case IPPROTO_UDP:
+ if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ }
+
+ return 0;
+}
+
+static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ struct iphdr *iph, unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ __be32 addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = iph->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+ iph->saddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = iph->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+ iph->daddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+ csum_replace4(&iph->check, addr, new_addr);
+
+ return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ struct iphdr *iph, unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ __be32 addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = iph->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
+ iph->daddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = iph->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
+ iph->saddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ enum flow_offload_tuple_dir dir)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ unsigned int thoff = iph->ihl * 4;
+
+ if (flow->flags & FLOW_OFFLOAD_SNAT &&
+ (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+ nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
+ return -1;
+ if (flow->flags & FLOW_OFFLOAD_DNAT &&
+ (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+ nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
+ return -1;
+
+ return 0;
+}
+
+static bool ip_has_options(unsigned int thoff)
+{
+ return thoff != sizeof(struct iphdr);
+}
+
+static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
+ struct flow_offload_tuple *tuple)
+{
+ struct flow_ports *ports;
+ unsigned int thoff;
+ struct iphdr *iph;
+
+ if (!pskb_may_pull(skb, sizeof(*iph)))
+ return -1;
+
+ iph = ip_hdr(skb);
+ thoff = iph->ihl * 4;
+
+ if (ip_is_fragment(iph) ||
+ unlikely(ip_has_options(thoff)))
+ return -1;
+
+ if (iph->protocol != IPPROTO_TCP &&
+ iph->protocol != IPPROTO_UDP)
+ return -1;
+
+ thoff = iph->ihl * 4;
+ if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ return -1;
+
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+ tuple->src_v4.s_addr = iph->saddr;
+ tuple->dst_v4.s_addr = iph->daddr;
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ tuple->l3proto = AF_INET;
+ tuple->l4proto = iph->protocol;
+ tuple->iifidx = dev->ifindex;
+
+ return 0;
+}
+
+/* Based on ip_exceeds_mtu(). */
+static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+ return false;
+
+ return true;
+}
+
+static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
+{
+ u32 mtu;
+
+ mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+ if (__nf_flow_exceeds_mtu(skb, mtu))
+ return true;
+
+ return false;
+}
+
+unsigned int
+nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple tuple = {};
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ struct net_device *outdev;
+ const struct rtable *rt;
+ struct iphdr *iph;
+ __be32 nexthop;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ return NF_ACCEPT;
+
+ if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+ return NF_ACCEPT;
+
+ tuplehash = flow_offload_lookup(flow_table, &tuple);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+ if (!outdev)
+ return NF_ACCEPT;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+
+ rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+ if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+ return NF_ACCEPT;
+
+ if (skb_try_make_writable(skb, sizeof(*iph)))
+ return NF_DROP;
+
+ if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+ nf_flow_nat_ip(flow, skb, dir) < 0)
+ return NF_DROP;
+
+ flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+ iph = ip_hdr(skb);
+ ip_decrease_ttl(iph);
+
+ skb->dev = outdev;
+ nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+ neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+
+ return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
+
+static struct nf_flowtable_type flowtable_ipv4 = {
+ .family = NFPROTO_IPV4,
+ .params = &nf_flow_offload_rhash_params,
+ .gc = nf_flow_offload_work_gc,
+ .hook = nf_flow_offload_ip_hook,
+ .owner = THIS_MODULE,
+};
+
+static int __init nf_flow_ipv4_module_init(void)
+{
+ nft_register_flowtable_type(&flowtable_ipv4);
+
+ return 0;
+}
+
+static void __exit nf_flow_ipv4_module_exit(void)
+{
+ nft_unregister_flowtable_type(&flowtable_ipv4);
+}
+
+module_init(nf_flow_ipv4_module_init);
+module_exit(nf_flow_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 0443ca4120b0..f7ff6a364d7b 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -356,11 +356,6 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
#endif
unsigned int ret;
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
- return NF_ACCEPT;
-
ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
@@ -396,11 +391,6 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
unsigned int ret;
int err;
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
- return NF_ACCEPT;
-
ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 4bbc273b45e8..f84c17763f6f 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -21,7 +21,8 @@ nft_do_chain_arp(void *priv,
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_unspec(&pkt, skb, state);
+ nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_unspec(&pkt, skb);
return nft_do_chain(&pkt, priv);
}
@@ -30,12 +31,6 @@ static struct nft_af_info nft_af_arp __read_mostly = {
.family = NFPROTO_ARP,
.nhooks = NF_ARP_NUMHOOKS,
.owner = THIS_MODULE,
- .nops = 1,
- .hooks = {
- [NF_ARP_IN] = nft_do_chain_arp,
- [NF_ARP_OUT] = nft_do_chain_arp,
- [NF_ARP_FORWARD] = nft_do_chain_arp,
- },
};
static int nf_tables_arp_init_net(struct net *net)
@@ -73,6 +68,10 @@ static const struct nf_chain_type filter_arp = {
.owner = THIS_MODULE,
.hook_mask = (1 << NF_ARP_IN) |
(1 << NF_ARP_OUT),
+ .hooks = {
+ [NF_ARP_IN] = nft_do_chain_arp,
+ [NF_ARP_OUT] = nft_do_chain_arp,
+ },
};
static int __init nf_tables_arp_init(void)
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 2840a29b2e04..f4675253f1e6 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -24,40 +24,17 @@ static unsigned int nft_do_chain_ipv4(void *priv,
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_ipv4(&pkt, skb, state);
+ nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb);
return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_ipv4_output(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- if (unlikely(skb->len < sizeof(struct iphdr) ||
- ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
- if (net_ratelimit())
- pr_info("nf_tables_ipv4: ignoring short SOCK_RAW "
- "packet\n");
- return NF_ACCEPT;
- }
-
- return nft_do_chain_ipv4(priv, skb, state);
-}
-
-struct nft_af_info nft_af_ipv4 __read_mostly = {
+static struct nft_af_info nft_af_ipv4 __read_mostly = {
.family = NFPROTO_IPV4,
.nhooks = NF_INET_NUMHOOKS,
.owner = THIS_MODULE,
- .nops = 1,
- .hooks = {
- [NF_INET_LOCAL_IN] = nft_do_chain_ipv4,
- [NF_INET_LOCAL_OUT] = nft_ipv4_output,
- [NF_INET_FORWARD] = nft_do_chain_ipv4,
- [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4,
- [NF_INET_POST_ROUTING] = nft_do_chain_ipv4,
- },
};
-EXPORT_SYMBOL_GPL(nft_af_ipv4);
static int nf_tables_ipv4_init_net(struct net *net)
{
@@ -97,6 +74,13 @@ static const struct nf_chain_type filter_ipv4 = {
(1 << NF_INET_FORWARD) |
(1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_POST_ROUTING),
+ .hooks = {
+ [NF_INET_LOCAL_IN] = nft_do_chain_ipv4,
+ [NF_INET_LOCAL_OUT] = nft_do_chain_ipv4,
+ [NF_INET_FORWARD] = nft_do_chain_ipv4,
+ [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4,
+ [NF_INET_POST_ROUTING] = nft_do_chain_ipv4,
+ },
};
static int __init nf_tables_ipv4_init(void)
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index f5c66a7a4bf2..f2a490981594 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -33,7 +33,8 @@ static unsigned int nft_nat_do_chain(void *priv,
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_ipv4(&pkt, skb, state);
+ nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb);
return nft_do_chain(&pkt, priv);
}
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 30493beb611a..d965c225b9f6 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -33,12 +33,8 @@ static unsigned int nf_route_table_hook(void *priv,
const struct iphdr *iph;
int err;
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
- return NF_ACCEPT;
-
- nft_set_pktinfo_ipv4(&pkt, skb, state);
+ nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb);
mark = skb->mark;
iph = ip_hdr(skb);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 67d39b79c801..d7cf861bf699 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -498,8 +498,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
const struct tcp_sock *tp = tcp_sk(sk);
int state;
- sock_rps_record_flow(sk);
-
sock_poll_wait(file, sk_sleep(sk), wait);
state = inet_sk_state_load(sk);
@@ -1104,12 +1102,15 @@ static int linear_payload_sz(bool first_skb)
return 0;
}
-static int select_size(const struct sock *sk, bool sg, bool first_skb)
+static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc)
{
const struct tcp_sock *tp = tcp_sk(sk);
int tmp = tp->mss_cache;
if (sg) {
+ if (zc)
+ return 0;
+
if (sk_can_gso(sk)) {
tmp = linear_payload_sz(first_skb);
} else {
@@ -1186,7 +1187,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
bool process_backlog = false;
- bool sg;
+ bool sg, zc = false;
long timeo;
flags = msg->msg_flags;
@@ -1204,7 +1205,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
goto out_err;
}
- if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
+ zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG;
+ if (!zc)
uarg->zerocopy = 0;
}
@@ -1281,6 +1283,7 @@ restart:
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
bool first_skb;
+ int linear;
new_segment:
/* Allocate new segment. If the interface is SG,
@@ -1294,9 +1297,8 @@ new_segment:
goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
- skb = sk_stream_alloc_skb(sk,
- select_size(sk, sg, first_skb),
- sk->sk_allocation,
+ linear = select_size(sk, sg, first_skb, zc);
+ skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
first_skb);
if (!skb)
goto wait_for_memory;
@@ -1325,13 +1327,13 @@ new_segment:
copy = msg_data_left(msg);
/* Where to copy to? */
- if (skb_availroom(skb) > 0) {
+ if (skb_availroom(skb) > 0 && !zc) {
/* We have some space in skb head. Superb! */
copy = min_t(int, copy, skb_availroom(skb));
err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
if (err)
goto do_fault;
- } else if (!uarg || !uarg->zerocopy) {
+ } else if (!zc) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
@@ -1371,8 +1373,10 @@ new_segment:
pfrag->offset += copy;
} else {
err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
- if (err == -EMSGSIZE || err == -EEXIST)
+ if (err == -EMSGSIZE || err == -EEXIST) {
+ tcp_mark_push(tp, skb);
goto new_segment;
+ }
if (err < 0)
goto do_error;
copy = err;
@@ -1729,8 +1733,8 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb,
}
/* Similar to __sock_recv_timestamp, but does not require an skb */
-void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
- struct scm_timestamping *tss)
+static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+ struct scm_timestamping *tss)
{
struct timeval tv;
bool has_timestamping = false;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4d55c4b338ee..ff71b18d9682 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5299,6 +5299,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
unsigned int len = skb->len;
struct tcp_sock *tp = tcp_sk(sk);
+ /* TCP congestion window tracking */
+ trace_tcp_probe(sk, skb);
+
tcp_mstamp_refresh(tp);
if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dd945b114215..5d203248123e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1911,7 +1911,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
/* Clean up the MD5 key list, if any */
if (tp->md5sig_info) {
tcp_clear_md5_list(sk);
- kfree_rcu(tp->md5sig_info, rcu);
+ kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
tp->md5sig_info = NULL;
}
#endif
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 04be9f833927..95461f02ac9a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1944,7 +1944,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
in_flight = tcp_packets_in_flight(tp);
- BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
+ BUG_ON(tcp_skb_pcount(skb) <= 1);
+ BUG_ON(tp->snd_cwnd <= in_flight);
send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
deleted file mode 100644
index 697f4c67b2e3..000000000000
--- a/net/ipv4/tcp_probe.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * tcpprobe - Observe the TCP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/tcp.h>
-#include <linux/slab.h>
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/ktime.h>
-#include <linux/time.h>
-#include <net/net_namespace.h>
-
-#include <net/tcp.h>
-
-MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>");
-MODULE_DESCRIPTION("TCP cwnd snooper");
-MODULE_LICENSE("GPL");
-MODULE_VERSION("1.1");
-
-static int port __read_mostly;
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-static unsigned int bufsize __read_mostly = 4096;
-MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
-module_param(bufsize, uint, 0);
-
-static unsigned int fwmark __read_mostly;
-MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
-module_param(fwmark, uint, 0);
-
-static int full __read_mostly;
-MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
-module_param(full, int, 0);
-
-static const char procname[] = "tcpprobe";
-
-struct tcp_log {
- ktime_t tstamp;
- union {
- struct sockaddr raw;
- struct sockaddr_in v4;
- struct sockaddr_in6 v6;
- } src, dst;
- u16 length;
- u32 snd_nxt;
- u32 snd_una;
- u32 snd_wnd;
- u32 rcv_wnd;
- u32 snd_cwnd;
- u32 ssthresh;
- u32 srtt;
-};
-
-static struct {
- spinlock_t lock;
- wait_queue_head_t wait;
- ktime_t start;
- u32 lastcwnd;
-
- unsigned long head, tail;
- struct tcp_log *log;
-} tcp_probe;
-
-static inline int tcp_probe_used(void)
-{
- return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
-}
-
-static inline int tcp_probe_avail(void)
-{
- return bufsize - tcp_probe_used() - 1;
-}
-
-#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \
- do { \
- si4.sin_family = AF_INET; \
- si4.sin_port = inet->inet_##mem##port; \
- si4.sin_addr.s_addr = inet->inet_##mem##addr; \
- } while (0) \
-
-/*
- * Hook inserted to be called before each receive packet.
- * Note: arguments must match tcp_rcv_established()!
- */
-static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th)
-{
- unsigned int len = skb->len;
- const struct tcp_sock *tp = tcp_sk(sk);
- const struct inet_sock *inet = inet_sk(sk);
-
- /* Only update if port or skb mark matches */
- if (((port == 0 && fwmark == 0) ||
- ntohs(inet->inet_dport) == port ||
- ntohs(inet->inet_sport) == port ||
- (fwmark > 0 && skb->mark == fwmark)) &&
- (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
-
- spin_lock(&tcp_probe.lock);
- /* If log fills, just silently drop */
- if (tcp_probe_avail() > 1) {
- struct tcp_log *p = tcp_probe.log + tcp_probe.head;
-
- p->tstamp = ktime_get();
- switch (sk->sk_family) {
- case AF_INET:
- tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
- tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
- break;
- case AF_INET6:
- memset(&p->src.v6, 0, sizeof(p->src.v6));
- memset(&p->dst.v6, 0, sizeof(p->dst.v6));
-#if IS_ENABLED(CONFIG_IPV6)
- p->src.v6.sin6_family = AF_INET6;
- p->src.v6.sin6_port = inet->inet_sport;
- p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
-
- p->dst.v6.sin6_family = AF_INET6;
- p->dst.v6.sin6_port = inet->inet_dport;
- p->dst.v6.sin6_addr = sk->sk_v6_daddr;
-#endif
- break;
- default:
- BUG();
- }
-
- p->length = len;
- p->snd_nxt = tp->snd_nxt;
- p->snd_una = tp->snd_una;
- p->snd_cwnd = tp->snd_cwnd;
- p->snd_wnd = tp->snd_wnd;
- p->rcv_wnd = tp->rcv_wnd;
- p->ssthresh = tcp_current_ssthresh(sk);
- p->srtt = tp->srtt_us >> 3;
-
- tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
- }
- tcp_probe.lastcwnd = tp->snd_cwnd;
- spin_unlock(&tcp_probe.lock);
-
- wake_up(&tcp_probe.wait);
- }
-
- jprobe_return();
-}
-
-static struct jprobe tcp_jprobe = {
- .kp = {
- .symbol_name = "tcp_rcv_established",
- },
- .entry = jtcp_rcv_established,
-};
-
-static int tcpprobe_open(struct inode *inode, struct file *file)
-{
- /* Reset (empty) log */
- spin_lock_bh(&tcp_probe.lock);
- tcp_probe.head = tcp_probe.tail = 0;
- tcp_probe.start = ktime_get();
- spin_unlock_bh(&tcp_probe.lock);
-
- return 0;
-}
-
-static int tcpprobe_sprint(char *tbuf, int n)
-{
- const struct tcp_log *p
- = tcp_probe.log + tcp_probe.tail;
- struct timespec64 ts
- = ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start));
-
- return scnprintf(tbuf, n,
- "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
- (unsigned long)ts.tv_sec,
- (unsigned long)ts.tv_nsec,
- &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
- p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
-}
-
-static ssize_t tcpprobe_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
-{
- int error = 0;
- size_t cnt = 0;
-
- if (!buf)
- return -EINVAL;
-
- while (cnt < len) {
- char tbuf[256];
- int width;
-
- /* Wait for data in buffer */
- error = wait_event_interruptible(tcp_probe.wait,
- tcp_probe_used() > 0);
- if (error)
- break;
-
- spin_lock_bh(&tcp_probe.lock);
- if (tcp_probe.head == tcp_probe.tail) {
- /* multiple readers race? */
- spin_unlock_bh(&tcp_probe.lock);
- continue;
- }
-
- width = tcpprobe_sprint(tbuf, sizeof(tbuf));
-
- if (cnt + width < len)
- tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1);
-
- spin_unlock_bh(&tcp_probe.lock);
-
- /* if record greater than space available
- return partial buffer (so far) */
- if (cnt + width >= len)
- break;
-
- if (copy_to_user(buf + cnt, tbuf, width))
- return -EFAULT;
- cnt += width;
- }
-
- return cnt == 0 ? error : cnt;
-}
-
-static const struct file_operations tcpprobe_fops = {
- .owner = THIS_MODULE,
- .open = tcpprobe_open,
- .read = tcpprobe_read,
- .llseek = noop_llseek,
-};
-
-static __init int tcpprobe_init(void)
-{
- int ret = -ENOMEM;
-
- /* Warning: if the function signature of tcp_rcv_established,
- * has been changed, you also have to change the signature of
- * jtcp_rcv_established, otherwise you end up right here!
- */
- BUILD_BUG_ON(__same_type(tcp_rcv_established,
- jtcp_rcv_established) == 0);
-
- init_waitqueue_head(&tcp_probe.wait);
- spin_lock_init(&tcp_probe.lock);
-
- if (bufsize == 0)
- return -EINVAL;
-
- bufsize = roundup_pow_of_two(bufsize);
- tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
- if (!tcp_probe.log)
- goto err0;
-
- if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops))
- goto err0;
-
- ret = register_jprobe(&tcp_jprobe);
- if (ret)
- goto err1;
-
- pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
- port, fwmark, bufsize);
- return 0;
- err1:
- remove_proc_entry(procname, init_net.proc_net);
- err0:
- kfree(tcp_probe.log);
- return ret;
-}
-module_init(tcpprobe_init);
-
-static __exit void tcpprobe_exit(void)
-{
- remove_proc_entry(procname, init_net.proc_net);
- unregister_jprobe(&tcp_jprobe);
- kfree(tcp_probe.log);
-}
-module_exit(tcpprobe_exit);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e9c0d1e1772e..db72619e07e4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2490,8 +2490,6 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
mask |= POLLIN | POLLRDNORM;
- sock_rps_record_flow(sk);
-
/* Check for false positives due to checksum errors */
if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
!(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index e50b7fea57ee..bcfc00e88756 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -23,6 +23,12 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
return xfrm4_extract_header(skb);
}
+static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ return dst_input(skb);
+}
+
static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
@@ -33,7 +39,11 @@ static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
iph->tos, skb->dev))
goto drop;
}
- return dst_input(skb);
+
+ if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2))
+ goto drop;
+
+ return 0;
drop:
kfree_skb(skb);
return NET_RX_DROP;
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 7d885a44dc9d..8affc6d83d58 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -105,18 +105,15 @@ static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x,
{
__skb_push(skb, skb->mac_len);
return skb_mac_gso_segment(skb, features);
-
}
static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb)
{
struct xfrm_offload *xo = xfrm_offload(skb);
- if (xo->flags & XFRM_GSO_SEGMENT) {
- skb->network_header = skb->network_header - x->props.header_len;
+ if (xo->flags & XFRM_GSO_SEGMENT)
skb->transport_header = skb->network_header +
sizeof(struct iphdr);
- }
skb_reset_mac_len(skb);
pskb_pull(skb, skb->mac_len + x->props.header_len);