From 876c27314ce51fe7e7e2aeb24a6448da1a26c78f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 3 Apr 2018 12:10:09 +0200 Subject: netfilter: nf_conntrack_sip: allow duplicate SDP expectations Callum Sinclair reported SIP IP Phone errors that he tracked down to such phones sending session descriptions for different media types but with same port numbers. The expect core will only 'refresh' existing expectation if it is from same master AND same expectation class (media type). As expectation class is different, we get an error. The SIP connection tracking code will then 1). drop the SDP packet 2). if an rtp expectation was already installed successfully, error on rtcp expectation will cancel the rtp one. Make the expect core report back to caller when the conflict is due to different expectation class and have SIP tracker ignore soft-error. Reported-by: Callum Sinclair Tested-by: Callum Sinclair Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_expect.c | 5 ++++- net/netfilter/nf_conntrack_sip.c | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 8ef21d9f9a00..4b2b3d53acfc 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -252,7 +252,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { - return a->master == b->master && a->class == b->class && + return a->master == b->master && nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && @@ -421,6 +421,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (expect_matches(i, expect)) { + if (i->class != expect->class) + return -EALREADY; + if (nf_ct_remove_expect(i)) break; } else if (expect_clash(i, expect)) { diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 4dbb5bad4363..908e51e2dc2b 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -938,11 +938,19 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, datalen, rtp_exp, rtcp_exp, mediaoff, medialen, daddr); else { - if (nf_ct_expect_related(rtp_exp) == 0) { - if (nf_ct_expect_related(rtcp_exp) != 0) - nf_ct_unexpect_related(rtp_exp); - else + /* -EALREADY handling works around end-points that send + * SDP messages with identical port but different media type, + * we pretend expectation was set up. + */ + int errp = nf_ct_expect_related(rtp_exp); + + if (errp == 0 || errp == -EALREADY) { + int errcp = nf_ct_expect_related(rtcp_exp); + + if (errcp == 0 || errcp == -EALREADY) ret = NF_ACCEPT; + else if (errp == 0) + nf_ct_unexpect_related(rtp_exp); } } nf_ct_expect_put(rtcp_exp); -- cgit v1.2.1 From 5c64576a77894a50be80be0024bed27171b55989 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 7 Apr 2018 15:50:47 +0300 Subject: ipvs: fix rtnl_lock lockups caused by start_sync_thread syzkaller reports for wrong rtnl_lock usage in sync code [1] and [2] We have 2 problems in start_sync_thread if error path is taken, eg. on memory allocation error or failure to configure sockets for mcast group or addr/port binding: 1. recursive locking: holding rtnl_lock while calling sock_release which in turn calls again rtnl_lock in ip_mc_drop_socket to leave the mcast group, as noticed by Florian Westphal. Additionally, sock_release can not be called while holding sync_mutex (ABBA deadlock). 2. task hung: holding rtnl_lock while calling kthread_stop to stop the running kthreads. As the kthreads do the same to leave the mcast group (sock_release -> ip_mc_drop_socket -> rtnl_lock) they hang. Fix the problems by calling rtnl_unlock early in the error path, now sock_release is called after unlocking both mutexes. Problem 3 (task hung reported by syzkaller [2]) is variant of problem 2: use _trylock to prevent one user to call rtnl_lock and then while waiting for sync_mutex to block kthreads that execute sock_release when they are stopped by stop_sync_thread. [1] IPVS: stopping backup sync thread 4500 ... WARNING: possible recursive locking detected 4.16.0-rc7+ #3 Not tainted -------------------------------------------- syzkaller688027/4497 is trying to acquire lock: (rtnl_mutex){+.+.}, at: [<00000000bb14d7fb>] rtnl_lock+0x17/0x20 net/core/rtnetlink.c:74 but task is already holding lock: IPVS: stopping backup sync thread 4495 ... (rtnl_mutex){+.+.}, at: [<00000000bb14d7fb>] rtnl_lock+0x17/0x20 net/core/rtnetlink.c:74 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(rtnl_mutex); lock(rtnl_mutex); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by syzkaller688027/4497: #0: (rtnl_mutex){+.+.}, at: [<00000000bb14d7fb>] rtnl_lock+0x17/0x20 net/core/rtnetlink.c:74 #1: (ipvs->sync_mutex){+.+.}, at: [<00000000703f78e3>] do_ip_vs_set_ctl+0x10f8/0x1cc0 net/netfilter/ipvs/ip_vs_ctl.c:2388 stack backtrace: CPU: 1 PID: 4497 Comm: syzkaller688027 Not tainted 4.16.0-rc7+ #3 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x24d lib/dump_stack.c:53 print_deadlock_bug kernel/locking/lockdep.c:1761 [inline] check_deadlock kernel/locking/lockdep.c:1805 [inline] validate_chain kernel/locking/lockdep.c:2401 [inline] __lock_acquire+0xe8f/0x3e00 kernel/locking/lockdep.c:3431 lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920 __mutex_lock_common kernel/locking/mutex.c:756 [inline] __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908 rtnl_lock+0x17/0x20 net/core/rtnetlink.c:74 ip_mc_drop_socket+0x88/0x230 net/ipv4/igmp.c:2643 inet_release+0x4e/0x1c0 net/ipv4/af_inet.c:413 sock_release+0x8d/0x1e0 net/socket.c:595 start_sync_thread+0x2213/0x2b70 net/netfilter/ipvs/ip_vs_sync.c:1924 do_ip_vs_set_ctl+0x1139/0x1cc0 net/netfilter/ipvs/ip_vs_ctl.c:2389 nf_sockopt net/netfilter/nf_sockopt.c:106 [inline] nf_setsockopt+0x67/0xc0 net/netfilter/nf_sockopt.c:115 ip_setsockopt+0x97/0xa0 net/ipv4/ip_sockglue.c:1261 udp_setsockopt+0x45/0x80 net/ipv4/udp.c:2406 sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2975 SYSC_setsockopt net/socket.c:1849 [inline] SyS_setsockopt+0x189/0x360 net/socket.c:1828 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 RIP: 0033:0x446a69 RSP: 002b:00007fa1c3a64da8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000446a69 RDX: 000000000000048b RSI: 0000000000000000 RDI: 0000000000000003 RBP: 00000000006e29fc R08: 0000000000000018 R09: 0000000000000000 R10: 00000000200000c0 R11: 0000000000000246 R12: 00000000006e29f8 R13: 00676e697279656b R14: 00007fa1c3a659c0 R15: 00000000006e2b60 [2] IPVS: sync thread started: state = BACKUP, mcast_ifn = syz_tun, syncid = 4, id = 0 IPVS: stopping backup sync thread 25415 ... INFO: task syz-executor7:25421 blocked for more than 120 seconds. Not tainted 4.16.0-rc6+ #284 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D23688 25421 4408 0x00000004 Call Trace: context_switch kernel/sched/core.c:2862 [inline] __schedule+0x8fb/0x1ec0 kernel/sched/core.c:3440 schedule+0xf5/0x430 kernel/sched/core.c:3499 schedule_timeout+0x1a3/0x230 kernel/time/timer.c:1777 do_wait_for_common kernel/sched/completion.c:86 [inline] __wait_for_common kernel/sched/completion.c:107 [inline] wait_for_common kernel/sched/completion.c:118 [inline] wait_for_completion+0x415/0x770 kernel/sched/completion.c:139 kthread_stop+0x14a/0x7a0 kernel/kthread.c:530 stop_sync_thread+0x3d9/0x740 net/netfilter/ipvs/ip_vs_sync.c:1996 do_ip_vs_set_ctl+0x2b1/0x1cc0 net/netfilter/ipvs/ip_vs_ctl.c:2394 nf_sockopt net/netfilter/nf_sockopt.c:106 [inline] nf_setsockopt+0x67/0xc0 net/netfilter/nf_sockopt.c:115 ip_setsockopt+0x97/0xa0 net/ipv4/ip_sockglue.c:1253 sctp_setsockopt+0x2ca/0x63e0 net/sctp/socket.c:4154 sock_common_setsockopt+0x95/0xd0 net/core/sock.c:3039 SYSC_setsockopt net/socket.c:1850 [inline] SyS_setsockopt+0x189/0x360 net/socket.c:1829 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 RIP: 0033:0x454889 RSP: 002b:00007fc927626c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00007fc9276276d4 RCX: 0000000000454889 RDX: 000000000000048c RSI: 0000000000000000 RDI: 0000000000000017 RBP: 000000000072bf58 R08: 0000000000000018 R09: 0000000000000000 R10: 0000000020000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 000000000000051c R14: 00000000006f9b40 R15: 0000000000000001 Showing all locks held in the system: 2 locks held by khungtaskd/868: #0: (rcu_read_lock){....}, at: [<00000000a1a8f002>] check_hung_uninterruptible_tasks kernel/hung_task.c:175 [inline] #0: (rcu_read_lock){....}, at: [<00000000a1a8f002>] watchdog+0x1c5/0xd60 kernel/hung_task.c:249 #1: (tasklist_lock){.+.+}, at: [<0000000037c2f8f9>] debug_show_all_locks+0xd3/0x3d0 kernel/locking/lockdep.c:4470 1 lock held by rsyslogd/4247: #0: (&f->f_pos_lock){+.+.}, at: [<000000000d8d6983>] __fdget_pos+0x12b/0x190 fs/file.c:765 2 locks held by getty/4338: #0: (&tty->ldisc_sem){++++}, at: [<00000000bee98654>] ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365 #1: (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>] n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131 2 locks held by getty/4339: #0: (&tty->ldisc_sem){++++}, at: [<00000000bee98654>] ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365 #1: (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>] n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131 2 locks held by getty/4340: #0: (&tty->ldisc_sem){++++}, at: [<00000000bee98654>] ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365 #1: (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>] n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131 2 locks held by getty/4341: #0: (&tty->ldisc_sem){++++}, at: [<00000000bee98654>] ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365 #1: (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>] n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131 2 locks held by getty/4342: #0: (&tty->ldisc_sem){++++}, at: [<00000000bee98654>] ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365 #1: (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>] n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131 2 locks held by getty/4343: #0: (&tty->ldisc_sem){++++}, at: [<00000000bee98654>] ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365 #1: (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>] n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131 2 locks held by getty/4344: #0: (&tty->ldisc_sem){++++}, at: [<00000000bee98654>] ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365 #1: (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>] n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131 3 locks held by kworker/0:5/6494: #0: ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at: [<00000000a062b18e>] work_static include/linux/workqueue.h:198 [inline] #0: ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at: [<00000000a062b18e>] set_work_data kernel/workqueue.c:619 [inline] #0: ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at: [<00000000a062b18e>] set_work_pool_and_clear_pending kernel/workqueue.c:646 [inline] #0: ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at: [<00000000a062b18e>] process_one_work+0xb12/0x1bb0 kernel/workqueue.c:2084 #1: ((addr_chk_work).work){+.+.}, at: [<00000000278427d5>] process_one_work+0xb89/0x1bb0 kernel/workqueue.c:2088 #2: (rtnl_mutex){+.+.}, at: [<00000000066e35ac>] rtnl_lock+0x17/0x20 net/core/rtnetlink.c:74 1 lock held by syz-executor7/25421: #0: (ipvs->sync_mutex){+.+.}, at: [<00000000d414a689>] do_ip_vs_set_ctl+0x277/0x1cc0 net/netfilter/ipvs/ip_vs_ctl.c:2393 2 locks held by syz-executor7/25427: #0: (rtnl_mutex){+.+.}, at: [<00000000066e35ac>] rtnl_lock+0x17/0x20 net/core/rtnetlink.c:74 #1: (ipvs->sync_mutex){+.+.}, at: [<00000000e6d48489>] do_ip_vs_set_ctl+0x10f8/0x1cc0 net/netfilter/ipvs/ip_vs_ctl.c:2388 1 lock held by syz-executor7/25435: #0: (rtnl_mutex){+.+.}, at: [<00000000066e35ac>] rtnl_lock+0x17/0x20 net/core/rtnetlink.c:74 1 lock held by ipvs-b:2:0/25415: #0: (rtnl_mutex){+.+.}, at: [<00000000066e35ac>] rtnl_lock+0x17/0x20 net/core/rtnetlink.c:74 Reported-and-tested-by: syzbot+a46d6abf9d56b1365a72@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+5fe074c01b2032ce9618@syzkaller.appspotmail.com Fixes: e0b26cc997d5 ("ipvs: call rtnl_lock early") Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 8 --- net/netfilter/ipvs/ip_vs_sync.c | 155 +++++++++++++++++++++------------------- 2 files changed, 80 insertions(+), 83 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5ebde4b15810..f36098887ad0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2384,11 +2384,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) strlcpy(cfg.mcast_ifn, dm->mcast_ifn, sizeof(cfg.mcast_ifn)); cfg.syncid = dm->syncid; - rtnl_lock(); - mutex_lock(&ipvs->sync_mutex); ret = start_sync_thread(ipvs, &cfg, dm->state); - mutex_unlock(&ipvs->sync_mutex); - rtnl_unlock(); } else { mutex_lock(&ipvs->sync_mutex); ret = stop_sync_thread(ipvs, dm->state); @@ -3481,12 +3477,8 @@ static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) if (ipvs->mixed_address_family_dests > 0) return -EINVAL; - rtnl_lock(); - mutex_lock(&ipvs->sync_mutex); ret = start_sync_thread(ipvs, &c, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); - mutex_unlock(&ipvs->sync_mutex); - rtnl_unlock(); return ret; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index fbaf3bd05b2e..001501e25625 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -49,6 +49,7 @@ #include #include #include +#include #include /* Used for ntoh_seq and hton_seq */ @@ -1360,15 +1361,9 @@ static void set_mcast_pmtudisc(struct sock *sk, int val) /* * Specifiy default interface for outgoing multicasts */ -static int set_mcast_if(struct sock *sk, char *ifname) +static int set_mcast_if(struct sock *sk, struct net_device *dev) { - struct net_device *dev; struct inet_sock *inet = inet_sk(sk); - struct net *net = sock_net(sk); - - dev = __dev_get_by_name(net, ifname); - if (!dev) - return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) return -EINVAL; @@ -1396,19 +1391,14 @@ static int set_mcast_if(struct sock *sk, char *ifname) * in the in_addr structure passed in as a parameter. */ static int -join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) +join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev) { - struct net *net = sock_net(sk); struct ip_mreqn mreq; - struct net_device *dev; int ret; memset(&mreq, 0, sizeof(mreq)); memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - dev = __dev_get_by_name(net, ifname); - if (!dev) - return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) return -EINVAL; @@ -1423,15 +1413,10 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) #ifdef CONFIG_IP_VS_IPV6 static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, - char *ifname) + struct net_device *dev) { - struct net *net = sock_net(sk); - struct net_device *dev; int ret; - dev = __dev_get_by_name(net, ifname); - if (!dev) - return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) return -EINVAL; @@ -1443,24 +1428,18 @@ static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, } #endif -static int bind_mcastif_addr(struct socket *sock, char *ifname) +static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) { - struct net *net = sock_net(sock->sk); - struct net_device *dev; __be32 addr; struct sockaddr_in sin; - dev = __dev_get_by_name(net, ifname); - if (!dev) - return -ENODEV; - addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); if (!addr) pr_err("You probably need to specify IP address on " "multicast interface.\n"); IP_VS_DBG(7, "binding socket with (%s) %pI4\n", - ifname, &addr); + dev->name, &addr); /* Now bind the socket with the address of multicast interface */ sin.sin_family = AF_INET; @@ -1493,7 +1472,8 @@ static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, /* * Set up sending multicast socket over UDP */ -static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id) +static int make_send_sock(struct netns_ipvs *ipvs, int id, + struct net_device *dev, struct socket **sock_ret) { /* multicast addr */ union ipvs_sockaddr mcast_addr; @@ -1505,9 +1485,10 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id) IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); - return ERR_PTR(result); + goto error; } - result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn); + *sock_ret = sock; + result = set_mcast_if(sock->sk, dev); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); goto error; @@ -1522,7 +1503,7 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id) set_sock_size(sock->sk, 1, result); if (AF_INET == ipvs->mcfg.mcast_af) - result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); + result = bind_mcastif_addr(sock, dev); else result = 0; if (result < 0) { @@ -1538,19 +1519,18 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id) goto error; } - return sock; + return 0; error: - sock_release(sock); - return ERR_PTR(result); + return result; } /* * Set up receiving multicast socket over UDP */ -static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id, - int ifindex) +static int make_receive_sock(struct netns_ipvs *ipvs, int id, + struct net_device *dev, struct socket **sock_ret) { /* multicast addr */ union ipvs_sockaddr mcast_addr; @@ -1562,8 +1542,9 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); - return ERR_PTR(result); + goto error; } + *sock_ret = sock; /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = SK_CAN_REUSE; result = sysctl_sync_sock_size(ipvs); @@ -1571,7 +1552,7 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id, set_sock_size(sock->sk, 0, result); get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); - sock->sk->sk_bound_dev_if = ifindex; + sock->sk->sk_bound_dev_if = dev->ifindex; result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); @@ -1582,21 +1563,20 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id, #ifdef CONFIG_IP_VS_IPV6 if (ipvs->bcfg.mcast_af == AF_INET6) result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, - ipvs->bcfg.mcast_ifn); + dev); else #endif result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, - ipvs->bcfg.mcast_ifn); + dev); if (result < 0) { pr_err("Error joining to the multicast group\n"); goto error; } - return sock; + return 0; error: - sock_release(sock); - return ERR_PTR(result); + return result; } @@ -1778,13 +1758,12 @@ static int sync_thread_backup(void *data) int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, int state) { - struct ip_vs_sync_thread_data *tinfo; + struct ip_vs_sync_thread_data *tinfo = NULL; struct task_struct **array = NULL, *task; - struct socket *sock; struct net_device *dev; char *name; int (*threadfn)(void *data); - int id, count, hlen; + int id = 0, count, hlen; int result = -ENOMEM; u16 mtu, min_mtu; @@ -1792,6 +1771,18 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); + /* Do not hold one mutex and then to block on another */ + for (;;) { + rtnl_lock(); + if (mutex_trylock(&ipvs->sync_mutex)) + break; + rtnl_unlock(); + mutex_lock(&ipvs->sync_mutex); + if (rtnl_trylock()) + break; + mutex_unlock(&ipvs->sync_mutex); + } + if (!ipvs->sync_state) { count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); ipvs->threads_mask = count - 1; @@ -1810,7 +1801,8 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); if (!dev) { pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); - return -ENODEV; + result = -ENODEV; + goto out_early; } hlen = (AF_INET6 == c->mcast_af) ? sizeof(struct ipv6hdr) + sizeof(struct udphdr) : @@ -1827,26 +1819,30 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, c->sync_maxlen = mtu - hlen; if (state == IP_VS_STATE_MASTER) { + result = -EEXIST; if (ipvs->ms) - return -EEXIST; + goto out_early; ipvs->mcfg = *c; name = "ipvs-m:%d:%d"; threadfn = sync_thread_master; } else if (state == IP_VS_STATE_BACKUP) { + result = -EEXIST; if (ipvs->backup_threads) - return -EEXIST; + goto out_early; ipvs->bcfg = *c; name = "ipvs-b:%d:%d"; threadfn = sync_thread_backup; } else { - return -EINVAL; + result = -EINVAL; + goto out_early; } if (state == IP_VS_STATE_MASTER) { struct ipvs_master_sync_state *ms; + result = -ENOMEM; ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL); if (!ipvs->ms) goto out; @@ -1862,39 +1858,38 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, } else { array = kcalloc(count, sizeof(struct task_struct *), GFP_KERNEL); + result = -ENOMEM; if (!array) goto out; } - tinfo = NULL; for (id = 0; id < count; id++) { - if (state == IP_VS_STATE_MASTER) - sock = make_send_sock(ipvs, id); - else - sock = make_receive_sock(ipvs, id, dev->ifindex); - if (IS_ERR(sock)) { - result = PTR_ERR(sock); - goto outtinfo; - } + result = -ENOMEM; tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) - goto outsocket; + goto out; tinfo->ipvs = ipvs; - tinfo->sock = sock; + tinfo->sock = NULL; if (state == IP_VS_STATE_BACKUP) { tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, GFP_KERNEL); if (!tinfo->buf) - goto outtinfo; + goto out; } else { tinfo->buf = NULL; } tinfo->id = id; + if (state == IP_VS_STATE_MASTER) + result = make_send_sock(ipvs, id, dev, &tinfo->sock); + else + result = make_receive_sock(ipvs, id, dev, &tinfo->sock); + if (result < 0) + goto out; task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); if (IS_ERR(task)) { result = PTR_ERR(task); - goto outtinfo; + goto out; } tinfo = NULL; if (state == IP_VS_STATE_MASTER) @@ -1911,20 +1906,20 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, ipvs->sync_state |= state; spin_unlock_bh(&ipvs->sync_buff_lock); + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + /* increase the module use count */ ip_vs_use_count_inc(); return 0; -outsocket: - sock_release(sock); - -outtinfo: - if (tinfo) { - sock_release(tinfo->sock); - kfree(tinfo->buf); - kfree(tinfo); - } +out: + /* We do not need RTNL lock anymore, release it here so that + * sock_release below and in the kthreads can use rtnl_lock + * to leave the mcast group. + */ + rtnl_unlock(); count = id; while (count-- > 0) { if (state == IP_VS_STATE_MASTER) @@ -1932,13 +1927,23 @@ outtinfo: else kthread_stop(array[count]); } - kfree(array); - -out: if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { kfree(ipvs->ms); ipvs->ms = NULL; } + mutex_unlock(&ipvs->sync_mutex); + if (tinfo) { + if (tinfo->sock) + sock_release(tinfo->sock); + kfree(tinfo->buf); + kfree(tinfo); + } + kfree(array); + return result; + +out_early: + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); return result; } -- cgit v1.2.1 From 3f1e53abff84cf40b1adb3455d480dd295bf42e8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Apr 2018 21:13:30 +0200 Subject: netfilter: ebtables: don't attempt to allocate 0-sized compat array Dmitry reports 32bit ebtables on 64bit kernel got broken by a recent change that returns -EINVAL when ruleset has no entries. ebtables however only counts user-defined chains, so for the initial table nentries will be 0. Don't try to allocate the compat array in this case, as no user defined rules exist no rule will need 64bit translation. Reported-by: Dmitry Vyukov Fixes: 7d7d7e02111e9 ("netfilter: compat: reject huge allocation requests") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 032e0fe45940..28a4c3490359 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1825,13 +1825,14 @@ static int compat_table_info(const struct ebt_table_info *info, { unsigned int size = info->entries_size; const void *entries = info->entries; - int ret; newinfo->entries_size = size; - - ret = xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries); - if (ret) - return ret; + if (info->nentries) { + int ret = xt_compat_init_offsets(NFPROTO_BRIDGE, + info->nentries); + if (ret) + return ret; + } return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info, entries, newinfo); -- cgit v1.2.1 From 0abf854d7cbbb405e39e0f93d5c1da98dca24bc0 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 10 Apr 2018 14:24:21 +0200 Subject: selftests: bpf: update .gitignore with missing generated files Signed-off-by: Anders Roxell Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/.gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 9cf83f895d98..5e1ab2f0eb79 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -12,3 +12,6 @@ test_tcpbpf_user test_verifier_log feature test_libbpf_open +test_sock +test_sock_addr +urandom_read -- cgit v1.2.1 From 3a38bb98d9abdc3856f26b5ed4332803065cd7cf Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 10 Apr 2018 09:37:32 -0700 Subject: bpf/tracing: fix a deadlock in perf_event_detach_bpf_prog syzbot reported a possible deadlock in perf_event_detach_bpf_prog. The error details: ====================================================== WARNING: possible circular locking dependency detected 4.16.0-rc7+ #3 Not tainted ------------------------------------------------------ syz-executor7/24531 is trying to acquire lock: (bpf_event_mutex){+.+.}, at: [<000000008a849b07>] perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854 but task is already holding lock: (&mm->mmap_sem){++++}, at: [<0000000038768f87>] vm_mmap_pgoff+0x198/0x280 mm/util.c:353 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&mm->mmap_sem){++++}: __might_fault+0x13a/0x1d0 mm/memory.c:4571 _copy_to_user+0x2c/0xc0 lib/usercopy.c:25 copy_to_user include/linux/uaccess.h:155 [inline] bpf_prog_array_copy_info+0xf2/0x1c0 kernel/bpf/core.c:1694 perf_event_query_prog_array+0x1c7/0x2c0 kernel/trace/bpf_trace.c:891 _perf_ioctl kernel/events/core.c:4750 [inline] perf_ioctl+0x3e1/0x1480 kernel/events/core.c:4770 vfs_ioctl fs/ioctl.c:46 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:686 SYSC_ioctl fs/ioctl.c:701 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 -> #0 (bpf_event_mutex){+.+.}: lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920 __mutex_lock_common kernel/locking/mutex.c:756 [inline] __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908 perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854 perf_event_free_bpf_prog kernel/events/core.c:8147 [inline] _free_event+0xbdb/0x10f0 kernel/events/core.c:4116 put_event+0x24/0x30 kernel/events/core.c:4204 perf_mmap_close+0x60d/0x1010 kernel/events/core.c:5172 remove_vma+0xb4/0x1b0 mm/mmap.c:172 remove_vma_list mm/mmap.c:2490 [inline] do_munmap+0x82a/0xdf0 mm/mmap.c:2731 mmap_region+0x59e/0x15a0 mm/mmap.c:1646 do_mmap+0x6c0/0xe00 mm/mmap.c:1483 do_mmap_pgoff include/linux/mm.h:2223 [inline] vm_mmap_pgoff+0x1de/0x280 mm/util.c:355 SYSC_mmap_pgoff mm/mmap.c:1533 [inline] SyS_mmap_pgoff+0x462/0x5f0 mm/mmap.c:1491 SYSC_mmap arch/x86/kernel/sys_x86_64.c:100 [inline] SyS_mmap+0x16/0x20 arch/x86/kernel/sys_x86_64.c:91 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(bpf_event_mutex); lock(&mm->mmap_sem); lock(bpf_event_mutex); *** DEADLOCK *** ====================================================== The bug is introduced by Commit f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") where copy_to_user, which requires mm->mmap_sem, is called inside bpf_event_mutex lock. At the same time, during perf_event file descriptor close, mm->mmap_sem is held first and then subsequent perf_event_detach_bpf_prog needs bpf_event_mutex lock. Such a senario caused a deadlock. As suggested by Daniel, moving copy_to_user out of the bpf_event_mutex lock should fix the problem. Fixes: f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") Reported-by: syzbot+dc5ca0e4c9bfafaf2bae@syzkaller.appspotmail.com Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 4 ++-- kernel/bpf/core.c | 45 +++++++++++++++++++++++++++++---------------- kernel/trace/bpf_trace.c | 25 +++++++++++++++++++++---- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 95a7abd0ee92..486e65e3db26 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -339,8 +339,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, struct bpf_prog *old_prog); int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, - __u32 __user *prog_ids, u32 request_cnt, - __u32 __user *prog_cnt); + u32 *prog_ids, u32 request_cnt, + u32 *prog_cnt); int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d315b393abdd..ba03ec39efb3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1572,13 +1572,32 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) return cnt; } +static bool bpf_prog_array_copy_core(struct bpf_prog **prog, + u32 *prog_ids, + u32 request_cnt) +{ + int i = 0; + + for (; *prog; prog++) { + if (*prog == &dummy_bpf_prog.prog) + continue; + prog_ids[i] = (*prog)->aux->id; + if (++i == request_cnt) { + prog++; + break; + } + } + + return !!(*prog); +} + int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt) { struct bpf_prog **prog; unsigned long err = 0; - u32 i = 0, *ids; bool nospc; + u32 *ids; /* users of this function are doing: * cnt = bpf_prog_array_length(); @@ -1595,16 +1614,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return -ENOMEM; rcu_read_lock(); prog = rcu_dereference(progs)->progs; - for (; *prog; prog++) { - if (*prog == &dummy_bpf_prog.prog) - continue; - ids[i] = (*prog)->aux->id; - if (++i == cnt) { - prog++; - break; - } - } - nospc = !!(*prog); + nospc = bpf_prog_array_copy_core(prog, ids, cnt); rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); @@ -1683,22 +1693,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, } int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, - __u32 __user *prog_ids, u32 request_cnt, - __u32 __user *prog_cnt) + u32 *prog_ids, u32 request_cnt, + u32 *prog_cnt) { + struct bpf_prog **prog; u32 cnt = 0; if (array) cnt = bpf_prog_array_length(array); - if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) - return -EFAULT; + *prog_cnt = cnt; /* return early if user requested only program count or nothing to copy */ if (!request_cnt || !cnt) return 0; - return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); + /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ + prog = rcu_dereference_check(array, 1)->progs; + return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC + : 0; } static void bpf_prog_free_deferred(struct work_struct *work) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d88e96d4e12c..56ba0f2a01db 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -977,6 +977,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; + u32 *ids, prog_cnt, ids_len; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -985,16 +986,32 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return -EINVAL; if (copy_from_user(&query, uquery, sizeof(query))) return -EFAULT; - if (query.ids_len > BPF_TRACE_MAX_PROGS) + + ids_len = query.ids_len; + if (ids_len > BPF_TRACE_MAX_PROGS) return -E2BIG; + ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN); + if (!ids) + return -ENOMEM; + /* + * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which + * is required when user only wants to check for uquery->prog_cnt. + * There is no need to check for it since the case is handled + * gracefully in bpf_prog_array_copy_info. + */ mutex_lock(&bpf_event_mutex); ret = bpf_prog_array_copy_info(event->tp_event->prog_array, - uquery->ids, - query.ids_len, - &uquery->prog_cnt); + ids, + ids_len, + &prog_cnt); mutex_unlock(&bpf_event_mutex); + if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || + copy_to_user(uquery->ids, ids, ids_len * sizeof(u32))) + ret = -EFAULT; + + kfree(ids); return ret; } -- cgit v1.2.1 From cf43ae63c024971e6df94665e829c01c22202a19 Mon Sep 17 00:00:00 2001 From: Jack Ma Date: Fri, 6 Apr 2018 15:45:16 +1200 Subject: netfilter: xt_connmark: Add bit mapping for bit-shift operation. With the addition of bit-shift operations, we are able to shift ct/skbmark based on user requirements. However, this change might also cause the most left/right hand- side mark to be accidentially lost during shift operations. This patch adds the ability to 'grep' certain bits based on ctmask or nfmask out of the original mark. Then, apply shift operations to achieve a new mapping between ctmark and skb->mark. For example: If someone would like save the fourth F bits of ctmark 0xFFF(F)000F into the seventh hexadecimal (0) skb->mark 0xABC000(0)E. new_targetmark = (ctmark & ctmask) >> 12; (new) skb->mark = (skb->mark &~nfmask) ^ new_targetmark; This will preserve the other bits that are not related to this operation. Fixes: 472a73e00757 ("netfilter: xt_conntrack: Support bit-shifting for CONNMARK & MARK targets.") Reviewed-by: Florian Westphal Signed-off-by: Jack Ma Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_connmark.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 773da82190dc..4b424e6caf3e 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -41,6 +41,7 @@ connmark_tg_shift(struct sk_buff *skb, u8 shift_bits, u8 shift_dir) { enum ip_conntrack_info ctinfo; + u_int32_t new_targetmark; struct nf_conn *ct; u_int32_t newmark; @@ -61,24 +62,26 @@ connmark_tg_shift(struct sk_buff *skb, } break; case XT_CONNMARK_SAVE: - newmark = (ct->mark & ~info->ctmask) ^ - (skb->mark & info->nfmask); + new_targetmark = (skb->mark & info->nfmask); if (shift_dir == D_SHIFT_RIGHT) - newmark >>= shift_bits; + new_targetmark >>= shift_bits; else - newmark <<= shift_bits; + new_targetmark <<= shift_bits; + newmark = (ct->mark & ~info->ctmask) ^ + new_targetmark; if (ct->mark != newmark) { ct->mark = newmark; nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_RESTORE: - newmark = (skb->mark & ~info->nfmask) ^ - (ct->mark & info->ctmask); + new_targetmark = (ct->mark & info->ctmask); if (shift_dir == D_SHIFT_RIGHT) - newmark >>= shift_bits; + new_targetmark >>= shift_bits; else - newmark <<= shift_bits; + new_targetmark <<= shift_bits; + newmark = (skb->mark & ~info->nfmask) ^ + new_targetmark; skb->mark = newmark; break; } -- cgit v1.2.1 From 114aa35d06d4920c537b72f9fa935de5dd205260 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 30 Mar 2018 13:22:06 -0700 Subject: netfilter: conntrack: silent a memory leak warning The following memory leak is false postive: unreferenced object 0xffff8f37f156fb38 (size 128): comm "softirq", pid 0, jiffies 4294899665 (age 11.292s) hex dump (first 32 bytes): 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk 00 00 00 00 30 00 20 00 48 6b 6b 6b 6b 6b 6b 6b ....0. .Hkkkkkkk backtrace: [<000000004fda266a>] __kmalloc_track_caller+0x10d/0x141 [<000000007b0a7e3c>] __krealloc+0x45/0x62 [<00000000d08e0bfb>] nf_ct_ext_add+0xdc/0x133 [<0000000099b47fd8>] init_conntrack+0x1b1/0x392 [<0000000086dc36ec>] nf_conntrack_in+0x1ee/0x34b [<00000000940592de>] nf_hook_slow+0x36/0x95 [<00000000d1bd4da7>] nf_hook.constprop.43+0x1c3/0x1dd [<00000000c3673266>] __ip_local_out+0xae/0xb4 [<000000003e4192a6>] ip_local_out+0x17/0x33 [<00000000b64356de>] igmp_ifc_timer_expire+0x23e/0x26f [<000000006a8f3032>] call_timer_fn+0x14c/0x2a5 [<00000000650c1725>] __run_timers.part.34+0x150/0x182 [<0000000090e6946e>] run_timer_softirq+0x2a/0x4c [<000000004d1e7293>] __do_softirq+0x1d1/0x3c2 [<000000004643557d>] irq_exit+0x53/0xa2 [<0000000029ddee8f>] smp_apic_timer_interrupt+0x22a/0x235 because __krealloc() is not supposed to release the old memory and it is released later via kfree_rcu(). Since this is the only external user of __krealloc(), just mark it as not leak here. Signed-off-by: Cong Wang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_extend.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 9fe0ddc333fb..bd71a828ebde 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -71,6 +71,7 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) rcu_read_unlock(); alloc = max(newlen, NF_CT_EXT_PREALLOC); + kmemleak_not_leak(old); new = __krealloc(old, alloc, gfp); if (!new) return NULL; -- cgit v1.2.1 From a6615743704fdc179e227f84b7903edd1f0b4241 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Apr 2018 12:53:12 +0200 Subject: netfilter: fix CONFIG_NF_REJECT_IPV6=m link error We get a new link error with CONFIG_NFT_REJECT_INET=y and CONFIG_NF_REJECT_IPV6=m after larger parts of the nftables modules are linked together: net/netfilter/nft_reject_inet.o: In function `nft_reject_inet_eval': nft_reject_inet.c:(.text+0x17c): undefined reference to `nf_send_unreach6' nft_reject_inet.c:(.text+0x190): undefined reference to `nf_send_reset6' The problem is that with NF_TABLES_INET set, we implicitly try to use the ipv6 version as well for NFT_REJECT, but when CONFIG_IPV6 is set to a loadable module, it's impossible to reach that. The best workaround I found is to express the above as a Kconfig dependency, forcing NFT_REJECT itself to be 'm' in that particular configuration. Fixes: 02c7b25e5f54 ("netfilter: nf_tables: build-in filter chain type") Signed-off-by: Arnd Bergmann Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 704b3832dbad..44d8a55e9721 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -594,6 +594,7 @@ config NFT_QUOTA config NFT_REJECT default m if NETFILTER_ADVANCED=n tristate "Netfilter nf_tables reject support" + depends on !NF_TABLES_INET || (IPV6!=m || m) help This option adds the "reject" expression that you can use to explicitly deny and notify via TCP reset/ICMP informational errors -- cgit v1.2.1 From 569ccae68b38654f04b6842b034aa33857f605fe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 10 Apr 2018 09:30:27 +0200 Subject: netfilter: nf_tables: can't fail after linking rule into active rule list rules in nftables a free'd using kfree, but protected by rcu, i.e. we must wait for a grace period to elapse. Normal removal patch does this, but nf_tables_newrule() doesn't obey this rule during error handling. It calls nft_trans_rule_add() *after* linking rule, and, if that fails to allocate memory, it unlinks the rule and then kfree() it -- this is unsafe. Switch order -- first add rule to transaction list, THEN link it to public list. Note: nft_trans_rule_add() uses GFP_KERNEL; it will not fail so this is not a problem in practice (spotted only during code review). Fixes: 0628b123c96d12 ("netfilter: nfnetlink: add batch support and use it from nf_tables") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 59 +++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9134cc429ad4..b1984f8f7253 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2361,41 +2361,46 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } if (nlh->nlmsg_flags & NLM_F_REPLACE) { - if (nft_is_active_next(net, old_rule)) { - trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, - old_rule); - if (trans == NULL) { - err = -ENOMEM; - goto err2; - } - nft_deactivate_next(net, old_rule); - chain->use--; - list_add_tail_rcu(&rule->list, &old_rule->list); - } else { + if (!nft_is_active_next(net, old_rule)) { err = -ENOENT; goto err2; } - } else if (nlh->nlmsg_flags & NLM_F_APPEND) - if (old_rule) - list_add_rcu(&rule->list, &old_rule->list); - else - list_add_tail_rcu(&rule->list, &chain->rules); - else { - if (old_rule) - list_add_tail_rcu(&rule->list, &old_rule->list); - else - list_add_rcu(&rule->list, &chain->rules); - } + trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, + old_rule); + if (trans == NULL) { + err = -ENOMEM; + goto err2; + } + nft_deactivate_next(net, old_rule); + chain->use--; - if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { - err = -ENOMEM; - goto err3; + if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { + err = -ENOMEM; + goto err2; + } + + list_add_tail_rcu(&rule->list, &old_rule->list); + } else { + if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { + err = -ENOMEM; + goto err2; + } + + if (nlh->nlmsg_flags & NLM_F_APPEND) { + if (old_rule) + list_add_rcu(&rule->list, &old_rule->list); + else + list_add_tail_rcu(&rule->list, &chain->rules); + } else { + if (old_rule) + list_add_tail_rcu(&rule->list, &old_rule->list); + else + list_add_rcu(&rule->list, &chain->rules); + } } chain->use++; return 0; -err3: - list_del_rcu(&rule->list); err2: nf_tables_rule_destroy(&ctx, rule); err1: -- cgit v1.2.1 From 2f6adf481527c8ab8033c601f55bfb5b3712b2ac Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 10 Apr 2018 09:00:24 +0200 Subject: netfilter: nf_tables: free set name in error path set->name must be free'd here in case ops->init fails. Fixes: 387454901bd6 ("netfilter: nf_tables: Allow set names of up to 255 chars") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b1984f8f7253..102ad873acb4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3212,18 +3212,20 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, err = ops->init(set, &desc, nla); if (err < 0) - goto err2; + goto err3; err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) - goto err3; + goto err4; list_add_tail_rcu(&set->list, &table->sets); table->use++; return 0; -err3: +err4: ops->destroy(set); +err3: + kfree(set->name); err2: kvfree(set); err1: -- cgit v1.2.1 From 765cca91b895c8b747bca0b5fa54d1dc85c867a7 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 17 Apr 2018 09:28:59 +1000 Subject: netfilter: conntrack: include kmemleak.h for kmemleak_not_leak() After merging the netfilter tree, today's linux-next build (powerpc ppc64_defconfig) failed like this: net/netfilter/nf_conntrack_extend.c: In function 'nf_ct_ext_add': net/netfilter/nf_conntrack_extend.c:74:2: error: implicit declaration of function 'kmemleak_not_leak' [-Werror=implicit-function-declaration] kmemleak_not_leak(old); ^~~~~~~~~~~~~~~~~ cc1: some warnings being treated as errors Fixes: 114aa35d06d4 ("netfilter: conntrack: silent a memory leak warning") Signed-off-by: Stephen Rothwell Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_extend.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index bd71a828ebde..277bbfe26478 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include +#include #include #include #include -- cgit v1.2.1 From 0a0a7e00a250c117f0c7ad8e1184abd98e7c098a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 18 Apr 2018 10:49:12 -0700 Subject: tools/bpf: fix test_sock and test_sock_addr.sh failure The bpf selftests test_sock and test_sock_addr.sh failed in my test machine. The failure looks like: $ ./test_sock Test case: bind4 load with invalid access: src_ip6 .. [PASS] Test case: bind4 load with invalid access: mark .. [PASS] Test case: bind6 load with invalid access: src_ip4 .. [PASS] Test case: sock_create load with invalid access: src_port .. [PASS] Test case: sock_create load w/o expected_attach_type (compat mode) .. [FAIL] Test case: sock_create load w/ expected_attach_type .. [FAIL] Test case: attach type mismatch bind4 vs bind6 .. [FAIL] ... Summary: 4 PASSED, 12 FAILED $ ./test_sock_addr.sh Wait for testing IPv4/IPv6 to become available ..... ERROR: Timeout waiting for test IP to become available. In test_sock, bpf program loads failed due to hitting memlock limits. In test_sock_addr.sh, my test machine is a ipv6 only test box and using "ping" without specifying address family for an ipv6 address does not work. This patch fixed the issue by including header bpf_rlimit.h in test_sock.c and test_sock_addr.c, and specifying address family for ping command. Cc: Andrey Ignatov Signed-off-by: Yonghong Song Acked-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_sock.c | 1 + tools/testing/selftests/bpf/test_sock_addr.c | 1 + tools/testing/selftests/bpf/test_sock_addr.sh | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index 73bb20cfb9b7..f4d99fabc56d 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -13,6 +13,7 @@ #include #include "cgroup_helpers.h" +#include "bpf_rlimit.h" #ifndef ARRAY_SIZE # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index d488f20926e8..2950f80ba7fb 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -15,6 +15,7 @@ #include #include "cgroup_helpers.h" +#include "bpf_rlimit.h" #define CG_PATH "/foo" #define CONNECT4_PROG_PATH "./connect4_prog.o" diff --git a/tools/testing/selftests/bpf/test_sock_addr.sh b/tools/testing/selftests/bpf/test_sock_addr.sh index c6e1dcf992c4..9832a875a828 100755 --- a/tools/testing/selftests/bpf/test_sock_addr.sh +++ b/tools/testing/selftests/bpf/test_sock_addr.sh @@ -4,7 +4,7 @@ set -eu ping_once() { - ping -q -c 1 -W 1 ${1%%/*} >/dev/null 2>&1 + ping -${1} -q -c 1 -W 1 ${2%%/*} >/dev/null 2>&1 } wait_for_ip() @@ -13,7 +13,7 @@ wait_for_ip() echo -n "Wait for testing IPv4/IPv6 to become available " for _i in $(seq ${MAX_PING_TRIES}); do echo -n "." - if ping_once ${TEST_IPv4} && ping_once ${TEST_IPv6}; then + if ping_once 4 ${TEST_IPv4} && ping_once 6 ${TEST_IPv6}; then echo " OK" return fi -- cgit v1.2.1 From 39f2ff0816e5421476c2bc538b68b4bb0708a78e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Apr 2018 12:23:39 +0200 Subject: netfilter: nf_tables: NAT chain and extensions require NF_TABLES Move these options inside the scope of the 'if' NF_TABLES and NF_TABLES_IPV6 dependencies. This patch fixes: net/ipv6/netfilter/nft_chain_nat_ipv6.o: In function `nft_nat_do_chain': >> net/ipv6/netfilter/nft_chain_nat_ipv6.c:37: undefined reference to `nft_do_chain' net/ipv6/netfilter/nft_chain_nat_ipv6.o: In function `nft_chain_nat_ipv6_exit': >> net/ipv6/netfilter/nft_chain_nat_ipv6.c:94: undefined reference to `nft_unregister_chain_type' net/ipv6/netfilter/nft_chain_nat_ipv6.o: In function `nft_chain_nat_ipv6_init': >> net/ipv6/netfilter/nft_chain_nat_ipv6.c:87: undefined reference to `nft_register_chain_type' that happens with: CONFIG_NF_TABLES=m CONFIG_NFT_CHAIN_NAT_IPV6=y Fixes: 02c7b25e5f54 ("netfilter: nf_tables: build-in filter chain type") Reported-by: kbuild test robot Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/Kconfig | 55 +++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index ccbfa83e4bb0..ce77bcc2490c 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -48,6 +48,34 @@ config NFT_CHAIN_ROUTE_IPV6 fields such as the source, destination, flowlabel, hop-limit and the packet mark. +if NF_NAT_IPV6 + +config NFT_CHAIN_NAT_IPV6 + tristate "IPv6 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv6 in nf_tables. This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. + +config NFT_MASQ_IPV6 + tristate "IPv6 masquerade support for nf_tables" + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV6 + help + This is the expression that provides IPv4 masquerading support for + nf_tables. + +config NFT_REDIR_IPV6 + tristate "IPv6 redirect support for nf_tables" + depends on NFT_REDIR + select NF_NAT_REDIRECT + help + This is the expression that provides IPv4 redirect support for + nf_tables. + +endif # NF_NAT_IPV6 + config NFT_REJECT_IPV6 select NF_REJECT_IPV6 default NFT_REJECT @@ -107,39 +135,12 @@ config NF_NAT_IPV6 if NF_NAT_IPV6 -config NFT_CHAIN_NAT_IPV6 - depends on NF_TABLES_IPV6 - tristate "IPv6 nf_tables nat chain support" - help - This option enables the "nat" chain for IPv6 in nf_tables. This - chain type is used to perform Network Address Translation (NAT) - packet transformations such as the source, destination address and - source and destination ports. - config NF_NAT_MASQUERADE_IPV6 tristate "IPv6 masquerade support" help This is the kernel functionality to provide NAT in the masquerade flavour (automatic source address selection) for IPv6. -config NFT_MASQ_IPV6 - tristate "IPv6 masquerade support for nf_tables" - depends on NF_TABLES_IPV6 - depends on NFT_MASQ - select NF_NAT_MASQUERADE_IPV6 - help - This is the expression that provides IPv4 masquerading support for - nf_tables. - -config NFT_REDIR_IPV6 - tristate "IPv6 redirect support for nf_tables" - depends on NF_TABLES_IPV6 - depends on NFT_REDIR - select NF_NAT_REDIRECT - help - This is the expression that provides IPv4 redirect support for - nf_tables. - endif # NF_NAT_IPV6 config IP6_NF_IPTABLES -- cgit v1.2.1 From d71efb599ad42ef1e564c652d8084252bdc85edf Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 18 Apr 2018 23:35:34 +0900 Subject: netfilter: nf_tables: fix out-of-bounds in nft_chain_commit_update When chain name is changed, nft_chain_commit_update is called. In the nft_chain_commit_update, trans->ctx.chain->name has old chain name and nft_trans_chain_name(trans) has new chain name. If new chain name is longer than old chain name, KASAN warns slab-out-of-bounds. [ 175.015012] BUG: KASAN: slab-out-of-bounds in strcpy+0x9e/0xb0 [ 175.022735] Write of size 1 at addr ffff880114e022da by task iptables-compat/1458 [ 175.031353] CPU: 0 PID: 1458 Comm: iptables-compat Not tainted 4.16.0-rc7+ #146 [ 175.031353] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [ 175.031353] Call Trace: [ 175.031353] dump_stack+0x68/0xa0 [ 175.031353] print_address_description+0xd0/0x260 [ 175.031353] ? strcpy+0x9e/0xb0 [ 175.031353] kasan_report+0x234/0x350 [ 175.031353] __asan_report_store1_noabort+0x1c/0x20 [ 175.031353] strcpy+0x9e/0xb0 [ 175.031353] nf_tables_commit+0x1ccc/0x2990 [ 175.031353] nfnetlink_rcv+0x141e/0x16c0 [ 175.031353] ? nfnetlink_net_init+0x150/0x150 [ 175.031353] ? lock_acquire+0x370/0x370 [ 175.031353] ? lock_acquire+0x370/0x370 [ 175.031353] netlink_unicast+0x444/0x640 [ 175.031353] ? netlink_attachskb+0x700/0x700 [ 175.031353] ? _copy_from_iter_full+0x180/0x740 [ 175.031353] ? kasan_check_write+0x14/0x20 [ 175.031353] ? _copy_from_user+0x9b/0xd0 [ 175.031353] netlink_sendmsg+0x845/0xc70 [ ... ] Steps to reproduce: iptables-compat -N 1 iptables-compat -E 1 aaaaaaaaa Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 102ad873acb4..04d4e3772584 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5745,7 +5745,7 @@ static void nft_chain_commit_update(struct nft_trans *trans) struct nft_base_chain *basechain; if (nft_trans_chain_name(trans)) - strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans)); + swap(trans->ctx.chain->name, nft_trans_chain_name(trans)); if (!nft_is_base_chain(trans->ctx.chain)) return; -- cgit v1.2.1 From 5a786232eb69a1f870ddc0cfd69d5bdef241a2ea Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 19 Apr 2018 16:17:14 +0200 Subject: netfilter: xt_connmark: do not cast xt_connmark_tginfo1 to xt_connmark_tginfo2 These structures have different layout, fill xt_connmark_tginfo2 with old fields in xt_connmark_tginfo1. Based on patch from Jack Ma. Fixes: 472a73e00757 ("netfilter: xt_conntrack: Support bit-shifting for CONNMARK & MARK targets.") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_connmark.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 4b424e6caf3e..94df000abb92 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -36,9 +36,7 @@ MODULE_ALIAS("ipt_connmark"); MODULE_ALIAS("ip6t_connmark"); static unsigned int -connmark_tg_shift(struct sk_buff *skb, - const struct xt_connmark_tginfo1 *info, - u8 shift_bits, u8 shift_dir) +connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) { enum ip_conntrack_info ctinfo; u_int32_t new_targetmark; @@ -52,10 +50,11 @@ connmark_tg_shift(struct sk_buff *skb, switch (info->mode) { case XT_CONNMARK_SET: newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; - if (shift_dir == D_SHIFT_RIGHT) - newmark >>= shift_bits; + if (info->shift_dir == D_SHIFT_RIGHT) + newmark >>= info->shift_bits; else - newmark <<= shift_bits; + newmark <<= info->shift_bits; + if (ct->mark != newmark) { ct->mark = newmark; nf_conntrack_event_cache(IPCT_MARK, ct); @@ -63,10 +62,11 @@ connmark_tg_shift(struct sk_buff *skb, break; case XT_CONNMARK_SAVE: new_targetmark = (skb->mark & info->nfmask); - if (shift_dir == D_SHIFT_RIGHT) - new_targetmark >>= shift_bits; + if (info->shift_dir == D_SHIFT_RIGHT) + new_targetmark >>= info->shift_bits; else - new_targetmark <<= shift_bits; + new_targetmark <<= info->shift_bits; + newmark = (ct->mark & ~info->ctmask) ^ new_targetmark; if (ct->mark != newmark) { @@ -76,10 +76,11 @@ connmark_tg_shift(struct sk_buff *skb, break; case XT_CONNMARK_RESTORE: new_targetmark = (ct->mark & info->ctmask); - if (shift_dir == D_SHIFT_RIGHT) - new_targetmark >>= shift_bits; + if (info->shift_dir == D_SHIFT_RIGHT) + new_targetmark >>= info->shift_bits; else - new_targetmark <<= shift_bits; + new_targetmark <<= info->shift_bits; + newmark = (skb->mark & ~info->nfmask) ^ new_targetmark; skb->mark = newmark; @@ -92,8 +93,14 @@ static unsigned int connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_connmark_tginfo1 *info = par->targinfo; - - return connmark_tg_shift(skb, info, 0, 0); + const struct xt_connmark_tginfo2 info2 = { + .ctmark = info->ctmark, + .ctmask = info->ctmask, + .nfmask = info->nfmask, + .mode = info->mode, + }; + + return connmark_tg_shift(skb, &info2); } static unsigned int @@ -101,8 +108,7 @@ connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_connmark_tginfo2 *info = par->targinfo; - return connmark_tg_shift(skb, (const struct xt_connmark_tginfo1 *)info, - info->shift_bits, info->shift_dir); + return connmark_tg_shift(skb, info); } static int connmark_tg_check(const struct xt_tgchk_param *par) -- cgit v1.2.1 From 6ab690aa439803347743c0d899ac422774fdd5e7 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 20 Apr 2018 18:16:30 +0200 Subject: bpf: sockmap remove dead check Remove dead code that bails on `attr->value_size > KMALLOC_MAX_SIZE` - the previous check already bails on `attr->value_size != 4`. Signed-off-by: Jann Horn Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 8dd9210d7db7..a3b21385e947 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1442,9 +1442,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - if (attr->value_size > KMALLOC_MAX_SIZE) - return ERR_PTR(-E2BIG); - err = bpf_tcp_ulp_register(); if (err && err != -EEXIST) return ERR_PTR(err); -- cgit v1.2.1 From 686c97ee29c886ee07d17987d0059874c5c3b5af Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 19 Apr 2018 12:52:06 +0200 Subject: s390/qeth: fix error handling in adapter command callbacks Make sure to check both return code fields before(!) processing the command response. Otherwise we risk operating on invalid data. This matches an earlier fix for SETASSPARMS commands, see commit ad3cbf613329 ("s390/qeth: fix error handling in checksum cmd callback"). Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 85 +++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 48 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 04fefa5bb08d..36bc94088de1 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -3033,28 +3033,23 @@ static int qeth_send_startlan(struct qeth_card *card) return rc; } -static int qeth_default_setadapterparms_cb(struct qeth_card *card, - struct qeth_reply *reply, unsigned long data) +static int qeth_setadpparms_inspect_rc(struct qeth_ipa_cmd *cmd) { - struct qeth_ipa_cmd *cmd; - - QETH_CARD_TEXT(card, 4, "defadpcb"); - - cmd = (struct qeth_ipa_cmd *) data; - if (cmd->hdr.return_code == 0) + if (!cmd->hdr.return_code) cmd->hdr.return_code = cmd->data.setadapterparms.hdr.return_code; - return 0; + return cmd->hdr.return_code; } static int qeth_query_setadapterparms_cb(struct qeth_card *card, struct qeth_reply *reply, unsigned long data) { - struct qeth_ipa_cmd *cmd; + struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *) data; QETH_CARD_TEXT(card, 3, "quyadpcb"); + if (qeth_setadpparms_inspect_rc(cmd)) + return 0; - cmd = (struct qeth_ipa_cmd *) data; if (cmd->data.setadapterparms.data.query_cmds_supp.lan_type & 0x7f) { card->info.link_type = cmd->data.setadapterparms.data.query_cmds_supp.lan_type; @@ -3062,7 +3057,7 @@ static int qeth_query_setadapterparms_cb(struct qeth_card *card, } card->options.adp.supported_funcs = cmd->data.setadapterparms.data.query_cmds_supp.supported_cmds; - return qeth_default_setadapterparms_cb(card, reply, (unsigned long)cmd); + return 0; } static struct qeth_cmd_buffer *qeth_get_adapter_cmd(struct qeth_card *card, @@ -3154,22 +3149,20 @@ EXPORT_SYMBOL_GPL(qeth_query_ipassists); static int qeth_query_switch_attributes_cb(struct qeth_card *card, struct qeth_reply *reply, unsigned long data) { - struct qeth_ipa_cmd *cmd; - struct qeth_switch_info *sw_info; + struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *) data; struct qeth_query_switch_attributes *attrs; + struct qeth_switch_info *sw_info; QETH_CARD_TEXT(card, 2, "qswiatcb"); - cmd = (struct qeth_ipa_cmd *) data; - sw_info = (struct qeth_switch_info *)reply->param; - if (cmd->data.setadapterparms.hdr.return_code == 0) { - attrs = &cmd->data.setadapterparms.data.query_switch_attributes; - sw_info->capabilities = attrs->capabilities; - sw_info->settings = attrs->settings; - QETH_CARD_TEXT_(card, 2, "%04x%04x", sw_info->capabilities, - sw_info->settings); - } - qeth_default_setadapterparms_cb(card, reply, (unsigned long) cmd); + if (qeth_setadpparms_inspect_rc(cmd)) + return 0; + sw_info = (struct qeth_switch_info *)reply->param; + attrs = &cmd->data.setadapterparms.data.query_switch_attributes; + sw_info->capabilities = attrs->capabilities; + sw_info->settings = attrs->settings; + QETH_CARD_TEXT_(card, 2, "%04x%04x", sw_info->capabilities, + sw_info->settings); return 0; } @@ -4207,16 +4200,13 @@ EXPORT_SYMBOL_GPL(qeth_do_send_packet); static int qeth_setadp_promisc_mode_cb(struct qeth_card *card, struct qeth_reply *reply, unsigned long data) { - struct qeth_ipa_cmd *cmd; + struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *) data; struct qeth_ipacmd_setadpparms *setparms; QETH_CARD_TEXT(card, 4, "prmadpcb"); - cmd = (struct qeth_ipa_cmd *) data; setparms = &(cmd->data.setadapterparms); - - qeth_default_setadapterparms_cb(card, reply, (unsigned long)cmd); - if (cmd->hdr.return_code) { + if (qeth_setadpparms_inspect_rc(cmd)) { QETH_CARD_TEXT_(card, 4, "prmrc%x", cmd->hdr.return_code); setparms->data.mode = SET_PROMISC_MODE_OFF; } @@ -4286,18 +4276,18 @@ EXPORT_SYMBOL_GPL(qeth_get_stats); static int qeth_setadpparms_change_macaddr_cb(struct qeth_card *card, struct qeth_reply *reply, unsigned long data) { - struct qeth_ipa_cmd *cmd; + struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *) data; QETH_CARD_TEXT(card, 4, "chgmaccb"); + if (qeth_setadpparms_inspect_rc(cmd)) + return 0; - cmd = (struct qeth_ipa_cmd *) data; if (!card->options.layer2 || !(card->info.mac_bits & QETH_LAYER2_MAC_READ)) { ether_addr_copy(card->dev->dev_addr, cmd->data.setadapterparms.data.change_addr.addr); card->info.mac_bits |= QETH_LAYER2_MAC_READ; } - qeth_default_setadapterparms_cb(card, reply, (unsigned long) cmd); return 0; } @@ -4328,13 +4318,15 @@ EXPORT_SYMBOL_GPL(qeth_setadpparms_change_macaddr); static int qeth_setadpparms_set_access_ctrl_cb(struct qeth_card *card, struct qeth_reply *reply, unsigned long data) { - struct qeth_ipa_cmd *cmd; + struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *) data; struct qeth_set_access_ctrl *access_ctrl_req; int fallback = *(int *)reply->param; QETH_CARD_TEXT(card, 4, "setaccb"); + if (cmd->hdr.return_code) + return 0; + qeth_setadpparms_inspect_rc(cmd); - cmd = (struct qeth_ipa_cmd *) data; access_ctrl_req = &cmd->data.setadapterparms.data.set_access_ctrl; QETH_DBF_TEXT_(SETUP, 2, "setaccb"); QETH_DBF_TEXT_(SETUP, 2, "%s", card->gdev->dev.kobj.name); @@ -4407,7 +4399,6 @@ static int qeth_setadpparms_set_access_ctrl_cb(struct qeth_card *card, card->options.isolation = card->options.prev_isolation; break; } - qeth_default_setadapterparms_cb(card, reply, (unsigned long) cmd); return 0; } @@ -4695,14 +4686,15 @@ out: static int qeth_setadpparms_query_oat_cb(struct qeth_card *card, struct qeth_reply *reply, unsigned long data) { - struct qeth_ipa_cmd *cmd; + struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *)data; struct qeth_qoat_priv *priv; char *resdata; int resdatalen; QETH_CARD_TEXT(card, 3, "qoatcb"); + if (qeth_setadpparms_inspect_rc(cmd)) + return 0; - cmd = (struct qeth_ipa_cmd *)data; priv = (struct qeth_qoat_priv *)reply->param; resdatalen = cmd->data.setadapterparms.hdr.cmdlength; resdata = (char *)data + 28; @@ -4796,21 +4788,18 @@ out: static int qeth_query_card_info_cb(struct qeth_card *card, struct qeth_reply *reply, unsigned long data) { - struct qeth_ipa_cmd *cmd; + struct carrier_info *carrier_info = (struct carrier_info *)reply->param; + struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *)data; struct qeth_query_card_info *card_info; - struct carrier_info *carrier_info; QETH_CARD_TEXT(card, 2, "qcrdincb"); - carrier_info = (struct carrier_info *)reply->param; - cmd = (struct qeth_ipa_cmd *)data; - card_info = &cmd->data.setadapterparms.data.card_info; - if (cmd->data.setadapterparms.hdr.return_code == 0) { - carrier_info->card_type = card_info->card_type; - carrier_info->port_mode = card_info->port_mode; - carrier_info->port_speed = card_info->port_speed; - } + if (qeth_setadpparms_inspect_rc(cmd)) + return 0; - qeth_default_setadapterparms_cb(card, reply, (unsigned long) cmd); + card_info = &cmd->data.setadapterparms.data.card_info; + carrier_info->card_type = card_info->card_type; + carrier_info->port_mode = card_info->port_mode; + carrier_info->port_speed = card_info->port_speed; return 0; } -- cgit v1.2.1 From 901e3f49facbd31b2b3d1786637b4a35e1022e9b Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 19 Apr 2018 12:52:07 +0200 Subject: s390/qeth: avoid control IO completion stalls For control IO, qeth currently tracks the index of the buffer that it expects to complete the next IO on each qeth_channel. If the channel presents an IRQ while this buffer has not yet completed, no completion processing for _any_ completed buffer takes place. So if the 'next buffer' is skipped for any sort of reason* (eg. when it is released due to error conditions, before the IO is started), the buffer obviously won't switch to PROCESSED until it is eventually allocated for a _different_ IO and completes. Until this happens, all completion processing on that channel stalls and pending requests possibly time out. As a fix, remove the whole 'next buffer' logic and simply process any IO buffer right when it completes. A channel will never have more than one IO pending, so there's no risk of processing out-of-sequence. *Note: currently just one location in the code really handles this problem, by advancing the 'next' index manually. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core.h | 2 -- drivers/s390/net/qeth_core_main.c | 22 +++++----------------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 4326715dc13e..78b98b3e7efa 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -557,7 +557,6 @@ enum qeth_prot_versions { enum qeth_cmd_buffer_state { BUF_STATE_FREE, BUF_STATE_LOCKED, - BUF_STATE_PROCESSED, }; enum qeth_cq { @@ -601,7 +600,6 @@ struct qeth_channel { struct qeth_cmd_buffer iob[QETH_CMD_BUFFER_NO]; atomic_t irq_pending; int io_buf_no; - int buf_no; }; /** diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 36bc94088de1..5ec47c6ebaa6 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -818,7 +818,6 @@ void qeth_clear_cmd_buffers(struct qeth_channel *channel) for (cnt = 0; cnt < QETH_CMD_BUFFER_NO; cnt++) qeth_release_buffer(channel, &channel->iob[cnt]); - channel->buf_no = 0; channel->io_buf_no = 0; } EXPORT_SYMBOL_GPL(qeth_clear_cmd_buffers); @@ -924,7 +923,6 @@ static int qeth_setup_channel(struct qeth_channel *channel) kfree(channel->iob[cnt].data); return -ENOMEM; } - channel->buf_no = 0; channel->io_buf_no = 0; atomic_set(&channel->irq_pending, 0); spin_lock_init(&channel->iob_lock); @@ -1100,11 +1098,9 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, { int rc; int cstat, dstat; - struct qeth_cmd_buffer *buffer; struct qeth_channel *channel; struct qeth_card *card; struct qeth_cmd_buffer *iob; - __u8 index; if (__qeth_check_irb_error(cdev, intparm, irb)) return; @@ -1182,25 +1178,18 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, channel->state = CH_STATE_RCD_DONE; goto out; } - if (intparm) { - buffer = (struct qeth_cmd_buffer *) __va((addr_t)intparm); - buffer->state = BUF_STATE_PROCESSED; - } if (channel == &card->data) return; if (channel == &card->read && channel->state == CH_STATE_UP) __qeth_issue_next_read(card); - iob = channel->iob; - index = channel->buf_no; - while (iob[index].state == BUF_STATE_PROCESSED) { - if (iob[index].callback != NULL) - iob[index].callback(channel, iob + index); - - index = (index + 1) % QETH_CMD_BUFFER_NO; + if (intparm) { + iob = (struct qeth_cmd_buffer *) __va((addr_t)intparm); + if (iob->callback) + iob->callback(iob->channel, iob); } - channel->buf_no = index; + out: wake_up(&card->wait_q); return; @@ -2214,7 +2203,6 @@ time_err: error: atomic_set(&card->write.irq_pending, 0); qeth_release_buffer(iob->channel, iob); - card->write.buf_no = (card->write.buf_no + 1) % QETH_CMD_BUFFER_NO; rc = reply->rc; qeth_put_reply(reply); return rc; -- cgit v1.2.1 From a936b1ef37ce1e996533878f4b23944f9444dcdf Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 19 Apr 2018 12:52:08 +0200 Subject: s390/qeth: handle failure on workqueue creation Creating the global workqueue during driver init may fail, deal with it. Also, destroy the created workqueue on any subsequent error. Fixes: 0f54761d167f ("qeth: Support VEPA mode") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 5ec47c6ebaa6..9a08b545d018 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -6540,10 +6540,14 @@ static int __init qeth_core_init(void) mutex_init(&qeth_mod_mutex); qeth_wq = create_singlethread_workqueue("qeth_wq"); + if (!qeth_wq) { + rc = -ENOMEM; + goto out_err; + } rc = qeth_register_dbf_views(); if (rc) - goto out_err; + goto dbf_err; qeth_core_root_dev = root_device_register("qeth"); rc = PTR_ERR_OR_ZERO(qeth_core_root_dev); if (rc) @@ -6580,6 +6584,8 @@ slab_err: root_device_unregister(qeth_core_root_dev); register_err: qeth_unregister_dbf_views(); +dbf_err: + destroy_workqueue(qeth_wq); out_err: pr_err("Initializing the qeth device driver failed\n"); return rc; -- cgit v1.2.1 From bcacfcbc82b4235d280ed9b067aa4567f4a0c756 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 19 Apr 2018 12:52:09 +0200 Subject: s390/qeth: fix MAC address update sequence When changing the MAC address on a L2 qeth device, current code first unregisters the old address, then registers the new one. If HW rejects the new address (or the IO fails), the device ends up with no operable address at all. Re-order the code flow so that the old address only gets dropped if the new address was registered successfully. While at it, add logic to catch some corner-cases. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l2_main.c | 55 +++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 2ad6f12f3d49..36f9b74848fe 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -121,13 +121,10 @@ static int qeth_l2_send_setmac(struct qeth_card *card, __u8 *mac) QETH_CARD_TEXT(card, 2, "L2Setmac"); rc = qeth_l2_send_setdelmac(card, mac, IPA_CMD_SETVMAC); if (rc == 0) { - card->info.mac_bits |= QETH_LAYER2_MAC_REGISTERED; - ether_addr_copy(card->dev->dev_addr, mac); dev_info(&card->gdev->dev, - "MAC address %pM successfully registered on device %s\n", - card->dev->dev_addr, card->dev->name); + "MAC address %pM successfully registered on device %s\n", + mac, card->dev->name); } else { - card->info.mac_bits &= ~QETH_LAYER2_MAC_REGISTERED; switch (rc) { case -EEXIST: dev_warn(&card->gdev->dev, @@ -142,19 +139,6 @@ static int qeth_l2_send_setmac(struct qeth_card *card, __u8 *mac) return rc; } -static int qeth_l2_send_delmac(struct qeth_card *card, __u8 *mac) -{ - int rc; - - QETH_CARD_TEXT(card, 2, "L2Delmac"); - if (!(card->info.mac_bits & QETH_LAYER2_MAC_REGISTERED)) - return 0; - rc = qeth_l2_send_setdelmac(card, mac, IPA_CMD_DELVMAC); - if (rc == 0) - card->info.mac_bits &= ~QETH_LAYER2_MAC_REGISTERED; - return rc; -} - static int qeth_l2_write_mac(struct qeth_card *card, u8 *mac) { enum qeth_ipa_cmds cmd = is_multicast_ether_addr_64bits(mac) ? @@ -519,6 +503,7 @@ static int qeth_l2_set_mac_address(struct net_device *dev, void *p) { struct sockaddr *addr = p; struct qeth_card *card = dev->ml_priv; + u8 old_addr[ETH_ALEN]; int rc = 0; QETH_CARD_TEXT(card, 3, "setmac"); @@ -530,14 +515,35 @@ static int qeth_l2_set_mac_address(struct net_device *dev, void *p) return -EOPNOTSUPP; } QETH_CARD_HEX(card, 3, addr->sa_data, ETH_ALEN); + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + if (qeth_wait_for_threads(card, QETH_RECOVER_THREAD)) { QETH_CARD_TEXT(card, 3, "setmcREC"); return -ERESTARTSYS; } - rc = qeth_l2_send_delmac(card, &card->dev->dev_addr[0]); - if (!rc || (rc == -ENOENT)) - rc = qeth_l2_send_setmac(card, addr->sa_data); - return rc ? -EINVAL : 0; + + if (!qeth_card_hw_is_reachable(card)) { + ether_addr_copy(dev->dev_addr, addr->sa_data); + return 0; + } + + /* don't register the same address twice */ + if (ether_addr_equal_64bits(dev->dev_addr, addr->sa_data) && + (card->info.mac_bits & QETH_LAYER2_MAC_REGISTERED)) + return 0; + + /* add the new address, switch over, drop the old */ + rc = qeth_l2_send_setmac(card, addr->sa_data); + if (rc) + return rc; + ether_addr_copy(old_addr, dev->dev_addr); + ether_addr_copy(dev->dev_addr, addr->sa_data); + + if (card->info.mac_bits & QETH_LAYER2_MAC_REGISTERED) + qeth_l2_remove_mac(card, old_addr); + card->info.mac_bits |= QETH_LAYER2_MAC_REGISTERED; + return 0; } static void qeth_promisc_to_bridge(struct qeth_card *card) @@ -1067,8 +1073,9 @@ static int __qeth_l2_set_online(struct ccwgroup_device *gdev, int recovery_mode) goto out_remove; } - if (card->info.type != QETH_CARD_TYPE_OSN) - qeth_l2_send_setmac(card, &card->dev->dev_addr[0]); + if (card->info.type != QETH_CARD_TYPE_OSN && + !qeth_l2_send_setmac(card, card->dev->dev_addr)) + card->info.mac_bits |= QETH_LAYER2_MAC_REGISTERED; if (qeth_is_diagass_supported(card, QETH_DIAGS_CMD_TRAP)) { if (card->info.hwtrap && -- cgit v1.2.1 From db71bbbd11a4d314f0fa3fbf3369b71cf33ce33c Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 19 Apr 2018 12:52:10 +0200 Subject: s390/qeth: fix request-side race during cmd IO timeout Submitting a cmd IO request (usually on the WRITE device, but for IDX also on the READ device) is currently done with ccw_device_start() and a manual timeout in the caller. On timeout, the caller cleans up the related resources (eg. IO buffer). But 1) the IO might still be active and utilize those resources, and 2) when the IO completes, qeth_irq() will attempt to clean up the same resources again. Instead of introducing additional resource locking, switch to ccw_device_start_timeout() to ensure IO termination after timeout, and let the IRQ handler alone deal with cleaning up after a request. This also removes a stray write->irq_pending reset from clear_ipacmd_list(). The routine doesn't terminate any pending IO on the WRITE device, so this should be handled properly via IO timeout in the IRQ handler. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 51 ++++++++++++++++++++------------------- drivers/s390/net/qeth_core_mpc.h | 12 +++++++++ drivers/s390/net/qeth_l2_main.c | 4 +-- 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 9a08b545d018..9b22d5d496ae 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -706,7 +706,6 @@ void qeth_clear_ipacmd_list(struct qeth_card *card) qeth_put_reply(reply); } spin_unlock_irqrestore(&card->lock, flags); - atomic_set(&card->write.irq_pending, 0); } EXPORT_SYMBOL_GPL(qeth_clear_ipacmd_list); @@ -1098,14 +1097,9 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, { int rc; int cstat, dstat; + struct qeth_cmd_buffer *iob = NULL; struct qeth_channel *channel; struct qeth_card *card; - struct qeth_cmd_buffer *iob; - - if (__qeth_check_irb_error(cdev, intparm, irb)) - return; - cstat = irb->scsw.cmd.cstat; - dstat = irb->scsw.cmd.dstat; card = CARD_FROM_CDEV(cdev); if (!card) @@ -1123,6 +1117,19 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, channel = &card->data; QETH_CARD_TEXT(card, 5, "data"); } + + if (qeth_intparm_is_iob(intparm)) + iob = (struct qeth_cmd_buffer *) __va((addr_t)intparm); + + if (__qeth_check_irb_error(cdev, intparm, irb)) { + /* IO was terminated, free its resources. */ + if (iob) + qeth_release_buffer(iob->channel, iob); + atomic_set(&channel->irq_pending, 0); + wake_up(&card->wait_q); + return; + } + atomic_set(&channel->irq_pending, 0); if (irb->scsw.cmd.fctl & (SCSW_FCTL_CLEAR_FUNC)) @@ -1146,6 +1153,10 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, /* we don't have to handle this further */ intparm = 0; } + + cstat = irb->scsw.cmd.cstat; + dstat = irb->scsw.cmd.dstat; + if ((dstat & DEV_STAT_UNIT_EXCEP) || (dstat & DEV_STAT_UNIT_CHECK) || (cstat)) { @@ -1184,11 +1195,8 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, channel->state == CH_STATE_UP) __qeth_issue_next_read(card); - if (intparm) { - iob = (struct qeth_cmd_buffer *) __va((addr_t)intparm); - if (iob->callback) - iob->callback(iob->channel, iob); - } + if (iob && iob->callback) + iob->callback(iob->channel, iob); out: wake_up(&card->wait_q); @@ -1859,8 +1867,8 @@ static int qeth_idx_activate_get_answer(struct qeth_channel *channel, atomic_cmpxchg(&channel->irq_pending, 0, 1) == 0); QETH_DBF_TEXT(SETUP, 6, "noirqpnd"); spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - rc = ccw_device_start(channel->ccwdev, - &channel->ccw, (addr_t) iob, 0, 0); + rc = ccw_device_start_timeout(channel->ccwdev, &channel->ccw, + (addr_t) iob, 0, 0, QETH_TIMEOUT); spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); if (rc) { @@ -1877,7 +1885,6 @@ static int qeth_idx_activate_get_answer(struct qeth_channel *channel, if (channel->state != CH_STATE_UP) { rc = -ETIME; QETH_DBF_TEXT_(SETUP, 2, "3err%d", rc); - qeth_clear_cmd_buffers(channel); } else rc = 0; return rc; @@ -1931,8 +1938,8 @@ static int qeth_idx_activate_channel(struct qeth_channel *channel, atomic_cmpxchg(&channel->irq_pending, 0, 1) == 0); QETH_DBF_TEXT(SETUP, 6, "noirqpnd"); spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - rc = ccw_device_start(channel->ccwdev, - &channel->ccw, (addr_t) iob, 0, 0); + rc = ccw_device_start_timeout(channel->ccwdev, &channel->ccw, + (addr_t) iob, 0, 0, QETH_TIMEOUT); spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); if (rc) { @@ -1953,7 +1960,6 @@ static int qeth_idx_activate_channel(struct qeth_channel *channel, QETH_DBF_MESSAGE(2, "%s IDX activate timed out\n", dev_name(&channel->ccwdev->dev)); QETH_DBF_TEXT_(SETUP, 2, "2err%d", -ETIME); - qeth_clear_cmd_buffers(channel); return -ETIME; } return qeth_idx_activate_get_answer(channel, idx_reply_cb); @@ -2155,8 +2161,8 @@ int qeth_send_control_data(struct qeth_card *card, int len, QETH_CARD_TEXT(card, 6, "noirqpnd"); spin_lock_irqsave(get_ccwdev_lock(card->write.ccwdev), flags); - rc = ccw_device_start(card->write.ccwdev, &card->write.ccw, - (addr_t) iob, 0, 0); + rc = ccw_device_start_timeout(CARD_WDEV(card), &card->write.ccw, + (addr_t) iob, 0, 0, event_timeout); spin_unlock_irqrestore(get_ccwdev_lock(card->write.ccwdev), flags); if (rc) { QETH_DBF_MESSAGE(2, "%s qeth_send_control_data: " @@ -2188,8 +2194,6 @@ int qeth_send_control_data(struct qeth_card *card, int len, } } - if (reply->rc == -EIO) - goto error; rc = reply->rc; qeth_put_reply(reply); return rc; @@ -2200,9 +2204,6 @@ time_err: list_del_init(&reply->list); spin_unlock_irqrestore(&reply->card->lock, flags); atomic_inc(&reply->received); -error: - atomic_set(&card->write.irq_pending, 0); - qeth_release_buffer(iob->channel, iob); rc = reply->rc; qeth_put_reply(reply); return rc; diff --git a/drivers/s390/net/qeth_core_mpc.h b/drivers/s390/net/qeth_core_mpc.h index 619f897b4bb0..f4d1ec0b8f5a 100644 --- a/drivers/s390/net/qeth_core_mpc.h +++ b/drivers/s390/net/qeth_core_mpc.h @@ -35,6 +35,18 @@ extern unsigned char IPA_PDU_HEADER[]; #define QETH_HALT_CHANNEL_PARM -11 #define QETH_RCD_PARM -12 +static inline bool qeth_intparm_is_iob(unsigned long intparm) +{ + switch (intparm) { + case QETH_CLEAR_CHANNEL_PARM: + case QETH_HALT_CHANNEL_PARM: + case QETH_RCD_PARM: + case 0: + return false; + } + return true; +} + /*****************************************************************************/ /* IP Assist related definitions */ /*****************************************************************************/ diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 36f9b74848fe..b8079f2a65b3 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1345,8 +1345,8 @@ static int qeth_osn_send_control_data(struct qeth_card *card, int len, qeth_prepare_control_data(card, len, iob); QETH_CARD_TEXT(card, 6, "osnoirqp"); spin_lock_irqsave(get_ccwdev_lock(card->write.ccwdev), flags); - rc = ccw_device_start(card->write.ccwdev, &card->write.ccw, - (addr_t) iob, 0, 0); + rc = ccw_device_start_timeout(CARD_WDEV(card), &card->write.ccw, + (addr_t) iob, 0, 0, QETH_IPA_TIMEOUT); spin_unlock_irqrestore(get_ccwdev_lock(card->write.ccwdev), flags); if (rc) { QETH_DBF_MESSAGE(2, "qeth_osn_send_control_data: " -- cgit v1.2.1 From b7493e91c11a757cf0f8ab26989642ee4bb2c642 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 19 Apr 2018 12:52:11 +0200 Subject: s390/qeth: use Read device to query hypervisor for MAC For z/VM NICs, qeth needs to consider which of the three CCW devices in an MPC group it uses for requesting a managed MAC address. On the Base device, the hypervisor returns a default MAC which is pre-assigned when creating the NIC (this MAC is also returned by the READ MAC primitive). Querying any other device results in the allocation of an additional MAC address. For consistency with READ MAC and to avoid using up more addresses than necessary, it is preferable to use the NIC's default MAC. So switch the the diag26c over to using a NIC's Read device, which should always be identical to the Base device. Fixes: ec61bd2fd2a2 ("s390/qeth: use diag26c to get MAC address on L2") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 9b22d5d496ae..dffd820731f2 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -4835,7 +4835,7 @@ int qeth_vm_request_mac(struct qeth_card *card) goto out; } - ccw_device_get_id(CARD_DDEV(card), &id); + ccw_device_get_id(CARD_RDEV(card), &id); request->resp_buf_len = sizeof(*response); request->resp_version = DIAG26C_VERSION2; request->op_code = DIAG26C_GET_MAC; -- cgit v1.2.1 From 5411b6187adf62909e3b998ac782e722904c7487 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 19 Apr 2018 16:20:48 +0200 Subject: l2tp: fix {pppol2tp, l2tp_dfs}_seq_stop() in case of seq_file overflow Commit 0e0c3fee3a59 ("l2tp: hold reference on tunnels printed in pppol2tp proc file") assumed that if pppol2tp_seq_stop() was called with non-NULL private data (the 'v' pointer), then pppol2tp_seq_start() would not be called again. It turns out that this isn't guaranteed, and overflowing the seq_file's buffer in pppol2tp_seq_show() is a way to get into this situation. Therefore, pppol2tp_seq_stop() needs to reset pd->tunnel, so that pppol2tp_seq_start() won't drop a reference again if it gets called. We also have to clear pd->session, because the rest of the code expects a non-NULL tunnel when pd->session is set. The l2tp_debugfs module has the same issue. Fix it in the same way. Fixes: 0e0c3fee3a59 ("l2tp: hold reference on tunnels printed in pppol2tp proc file") Fixes: f726214d9b23 ("l2tp: hold reference on tunnels printed in l2tp/tunnels debugfs file") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_debugfs.c | 5 ++++- net/l2tp/l2tp_ppp.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index b8f9d45bfeb1..7f1e842ef05a 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -106,8 +106,11 @@ static void l2tp_dfs_seq_stop(struct seq_file *p, void *v) return; /* Drop reference taken by last invocation of l2tp_dfs_next_tunnel() */ - if (pd->tunnel) + if (pd->tunnel) { l2tp_tunnel_dec_refcount(pd->tunnel); + pd->tunnel = NULL; + pd->session = NULL; + } } static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 7d0c963680e6..1404bc1c1bb7 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1618,8 +1618,11 @@ static void pppol2tp_seq_stop(struct seq_file *p, void *v) return; /* Drop reference taken by last invocation of pppol2tp_next_tunnel() */ - if (pd->tunnel) + if (pd->tunnel) { l2tp_tunnel_dec_refcount(pd->tunnel); + pd->tunnel = NULL; + pd->session = NULL; + } } static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v) -- cgit v1.2.1 From b905ef9ab90115d001c1658259af4b1c65088779 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 19 Apr 2018 12:25:38 -0700 Subject: llc: delete timers synchronously in llc_sk_free() The connection timers of an llc sock could be still flying after we delete them in llc_sk_free(), and even possibly after we free the sock. We could just wait synchronously here in case of troubles. Note, I leave other call paths as they are, since they may not have to wait, at least we can change them to synchronously when needed. Also, move the code to net/llc/llc_conn.c, which is apparently a better place. Reported-by: Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/llc_conn.h | 1 + net/llc/llc_c_ac.c | 9 +-------- net/llc/llc_conn.c | 22 +++++++++++++++++++++- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h index 5c40f118c0fa..df528a623548 100644 --- a/include/net/llc_conn.h +++ b/include/net/llc_conn.h @@ -97,6 +97,7 @@ static __inline__ char llc_backlog_type(struct sk_buff *skb) struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); +void llc_sk_stop_all_timers(struct sock *sk, bool sync); void llc_sk_free(struct sock *sk); void llc_sk_reset(struct sock *sk); diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index 163121192aca..4d78375f9872 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -1099,14 +1099,7 @@ int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb) int llc_conn_ac_stop_all_timers(struct sock *sk, struct sk_buff *skb) { - struct llc_sock *llc = llc_sk(sk); - - del_timer(&llc->pf_cycle_timer.timer); - del_timer(&llc->ack_timer.timer); - del_timer(&llc->rej_sent_timer.timer); - del_timer(&llc->busy_state_timer.timer); - llc->ack_must_be_send = 0; - llc->ack_pf = 0; + llc_sk_stop_all_timers(sk, false); return 0; } diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 110e32bcb399..c0ac522b48a1 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -961,6 +961,26 @@ out: return sk; } +void llc_sk_stop_all_timers(struct sock *sk, bool sync) +{ + struct llc_sock *llc = llc_sk(sk); + + if (sync) { + del_timer_sync(&llc->pf_cycle_timer.timer); + del_timer_sync(&llc->ack_timer.timer); + del_timer_sync(&llc->rej_sent_timer.timer); + del_timer_sync(&llc->busy_state_timer.timer); + } else { + del_timer(&llc->pf_cycle_timer.timer); + del_timer(&llc->ack_timer.timer); + del_timer(&llc->rej_sent_timer.timer); + del_timer(&llc->busy_state_timer.timer); + } + + llc->ack_must_be_send = 0; + llc->ack_pf = 0; +} + /** * llc_sk_free - Frees a LLC socket * @sk - socket to free @@ -973,7 +993,7 @@ void llc_sk_free(struct sock *sk) llc->state = LLC_CONN_OUT_OF_SVC; /* Stop all (possibly) running timers */ - llc_conn_ac_stop_all_timers(sk, NULL); + llc_sk_stop_all_timers(sk, true); #ifdef DEBUG_LLC_CONN_ALLOC printk(KERN_INFO "%s: unackq=%d, txq=%d\n", __func__, skb_queue_len(&llc->pdu_unack_q), -- cgit v1.2.1 From 5e391dc5a8d801a2410d0032ad4a428d1d61800c Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Thu, 19 Apr 2018 22:49:09 +0300 Subject: net: ethernet: ti: cpsw: fix tx vlan priority mapping The CPDMA_TX_PRIORITY_MAP in real is vlan pcp field priority mapping register and basically replaces vlan pcp field for tagged packets. So, set it to be 1:1 mapping. Otherwise, it will cause unexpected change of egress vlan tagged packets, like prio 2 -> prio 5. Fixes: e05107e6b747 ("net: ethernet: ti: cpsw: add multi queue support") Reviewed-by: Grygorii Strashko Signed-off-by: Ivan Khoronzhuk Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 30371274409d..74f828412055 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -129,7 +129,7 @@ do { \ #define RX_PRIORITY_MAPPING 0x76543210 #define TX_PRIORITY_MAPPING 0x33221100 -#define CPDMA_TX_PRIORITY_MAP 0x01234567 +#define CPDMA_TX_PRIORITY_MAP 0x76543210 #define CPSW_VLAN_AWARE BIT(1) #define CPSW_RX_VLAN_ENCAP BIT(2) -- cgit v1.2.1 From 3a04ce7130a7e5dad4e78d45d50313747f8c830f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 19 Apr 2018 21:54:34 -0700 Subject: llc: fix NULL pointer deref for SOCK_ZAPPED For SOCK_ZAPPED socket, we don't need to care about llc->sap, so we should just skip these refcount functions in this case. Fixes: f7e43672683b ("llc: hold llc_sap before release_sock()") Reported-by: kernel test robot Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/llc/af_llc.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 6d29b2b94e84..cb80ebb38311 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -189,7 +189,6 @@ static int llc_ui_release(struct socket *sock) { struct sock *sk = sock->sk; struct llc_sock *llc; - struct llc_sap *sap; if (unlikely(sk == NULL)) goto out; @@ -200,15 +199,19 @@ static int llc_ui_release(struct socket *sock) llc->laddr.lsap, llc->daddr.lsap); if (!llc_send_disc(sk)) llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); - sap = llc->sap; - /* Hold this for release_sock(), so that llc_backlog_rcv() could still - * use it. - */ - llc_sap_hold(sap); - if (!sock_flag(sk, SOCK_ZAPPED)) + if (!sock_flag(sk, SOCK_ZAPPED)) { + struct llc_sap *sap = llc->sap; + + /* Hold this for release_sock(), so that llc_backlog_rcv() + * could still use it. + */ + llc_sap_hold(sap); llc_sap_remove_socket(llc->sap, sk); - release_sock(sk); - llc_sap_put(sap); + release_sock(sk); + llc_sap_put(sap); + } else { + release_sock(sk); + } if (llc->dev) dev_put(llc->dev); sock_put(sk); -- cgit v1.2.1 From a957fa190aa9d9168b33d460a5241a6d088c6265 Mon Sep 17 00:00:00 2001 From: Ahmed Abdelsalam Date: Fri, 20 Apr 2018 15:58:05 +0200 Subject: ipv6: sr: fix NULL pointer dereference in seg6_do_srh_encap()- v4 pkts In case of seg6 in encap mode, seg6_do_srh_encap() calls set_tun_src() in order to set the src addr of outer IPv6 header. The net_device is required for set_tun_src(). However calling ip6_dst_idev() on dst_entry in case of IPv4 traffic results on the following bug. Using just dst->dev should fix this BUG. [ 196.242461] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [ 196.242975] PGD 800000010f076067 P4D 800000010f076067 PUD 10f060067 PMD 0 [ 196.243329] Oops: 0000 [#1] SMP PTI [ 196.243468] Modules linked in: nfsd auth_rpcgss nfs_acl nfs lockd grace fscache sunrpc crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd input_leds glue_helper led_class pcspkr serio_raw mac_hid video autofs4 hid_generic usbhid hid e1000 i2c_piix4 ahci pata_acpi libahci [ 196.244362] CPU: 2 PID: 1089 Comm: ping Not tainted 4.16.0+ #1 [ 196.244606] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 196.244968] RIP: 0010:seg6_do_srh_encap+0x1ac/0x300 [ 196.245236] RSP: 0018:ffffb2ce00b23a60 EFLAGS: 00010202 [ 196.245464] RAX: 0000000000000000 RBX: ffff8c7f53eea300 RCX: 0000000000000000 [ 196.245742] RDX: 0000f10000000000 RSI: ffff8c7f52085a6c RDI: ffff8c7f41166850 [ 196.246018] RBP: ffffb2ce00b23aa8 R08: 00000000000261e0 R09: ffff8c7f41166800 [ 196.246294] R10: ffffdce5040ac780 R11: ffff8c7f41166828 R12: ffff8c7f41166808 [ 196.246570] R13: ffff8c7f52085a44 R14: ffffffffb73211c0 R15: ffff8c7e69e44200 [ 196.246846] FS: 00007fc448789700(0000) GS:ffff8c7f59d00000(0000) knlGS:0000000000000000 [ 196.247286] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 196.247526] CR2: 0000000000000000 CR3: 000000010f05a000 CR4: 00000000000406e0 [ 196.247804] Call Trace: [ 196.247972] seg6_do_srh+0x15b/0x1c0 [ 196.248156] seg6_output+0x3c/0x220 [ 196.248341] ? prandom_u32+0x14/0x20 [ 196.248526] ? ip_idents_reserve+0x6c/0x80 [ 196.248723] ? __ip_select_ident+0x90/0x100 [ 196.248923] ? ip_append_data.part.50+0x6c/0xd0 [ 196.249133] lwtunnel_output+0x44/0x70 [ 196.249328] ip_send_skb+0x15/0x40 [ 196.249515] raw_sendmsg+0x8c3/0xac0 [ 196.249701] ? _copy_from_user+0x2e/0x60 [ 196.249897] ? rw_copy_check_uvector+0x53/0x110 [ 196.250106] ? _copy_from_user+0x2e/0x60 [ 196.250299] ? copy_msghdr_from_user+0xce/0x140 [ 196.250508] sock_sendmsg+0x36/0x40 [ 196.250690] ___sys_sendmsg+0x292/0x2a0 [ 196.250881] ? _cond_resched+0x15/0x30 [ 196.251074] ? copy_termios+0x1e/0x70 [ 196.251261] ? _copy_to_user+0x22/0x30 [ 196.251575] ? tty_mode_ioctl+0x1c3/0x4e0 [ 196.251782] ? _cond_resched+0x15/0x30 [ 196.251972] ? mutex_lock+0xe/0x30 [ 196.252152] ? vvar_fault+0xd2/0x110 [ 196.252337] ? __do_fault+0x1f/0xc0 [ 196.252521] ? __handle_mm_fault+0xc1f/0x12d0 [ 196.252727] ? __sys_sendmsg+0x63/0xa0 [ 196.252919] __sys_sendmsg+0x63/0xa0 [ 196.253107] do_syscall_64+0x72/0x200 [ 196.253305] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 196.253530] RIP: 0033:0x7fc4480b0690 [ 196.253715] RSP: 002b:00007ffde9f252f8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 196.254053] RAX: ffffffffffffffda RBX: 0000000000000040 RCX: 00007fc4480b0690 [ 196.254331] RDX: 0000000000000000 RSI: 000000000060a360 RDI: 0000000000000003 [ 196.254608] RBP: 00007ffde9f253f0 R08: 00000000002d1e81 R09: 0000000000000002 [ 196.254884] R10: 00007ffde9f250c0 R11: 0000000000000246 R12: 0000000000b22070 [ 196.255205] R13: 20c49ba5e353f7cf R14: 431bde82d7b634db R15: 00007ffde9f278fe [ 196.255484] Code: a5 0f b6 45 c0 41 88 41 28 41 0f b6 41 2c 48 c1 e0 04 49 8b 54 01 38 49 8b 44 01 30 49 89 51 20 49 89 41 18 48 8b 83 b0 00 00 00 <48> 8b 30 49 8b 86 08 0b 00 00 48 8b 40 20 48 8b 50 08 48 0b 10 [ 196.256190] RIP: seg6_do_srh_encap+0x1ac/0x300 RSP: ffffb2ce00b23a60 [ 196.256445] CR2: 0000000000000000 [ 196.256676] ---[ end trace 71af7d093603885c ]--- Fixes: 8936ef7604c11 ("ipv6: sr: fix NULL pointer dereference when setting encap source address") Signed-off-by: Ahmed Abdelsalam Acked-by: David Lebrun Signed-off-by: David S. Miller --- net/ipv6/seg6_iptunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index f343e6f0fc95..5fe139484919 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -136,7 +136,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) isrh->nexthdr = proto; hdr->daddr = isrh->segments[isrh->first_segment]; - set_tun_src(net, ip6_dst_idev(dst)->dev, &hdr->daddr, &hdr->saddr); + set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { -- cgit v1.2.1 From 7c5aba211dd61f41d737a2c51729eb9fdcd3edf4 Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Fri, 20 Apr 2018 12:11:11 -0700 Subject: strparser: Do not call mod_delayed_work with a timeout of LONG_MAX struct sock's sk_rcvtimeo is initialized to LONG_MAX/MAX_SCHEDULE_TIMEOUT in sock_init_data. Calling mod_delayed_work with a timeout of LONG_MAX causes spurious execution of the work function. timer->expires is set equal to jiffies + LONG_MAX. When timer_base->clk falls behind the current value of jiffies, the delta between timer_base->clk and jiffies + LONG_MAX causes the expiration to be in the past. Returning early from strp_start_timer if timeo == LONG_MAX solves this problem. Found while testing net/tls_sw recv path. Fixes: 43a0c6751a322847 ("strparser: Stream parser for messages") Reviewed-by: Tejun Heo Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. Miller --- net/strparser/strparser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 805b139756db..092bebc70048 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -67,7 +67,7 @@ static void strp_abort_strp(struct strparser *strp, int err) static void strp_start_timer(struct strparser *strp, long timeo) { - if (timeo) + if (timeo && timeo != LONG_MAX) mod_delayed_work(strp_wq, &strp->msg_timer_work, timeo); } -- cgit v1.2.1 From f6cd14537ff9919081be19b9c53b9b19c0d3ea97 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 20 Apr 2018 15:15:03 -0400 Subject: net: sched: ife: signal not finding metaid We need to record stats for received metadata that we dont know how to process. Have find_decode_metaid() return -ENOENT to capture this. Signed-off-by: Alexander Aring Reviewed-by: Yotam Gigi Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_ife.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index a5994cf0512b..49b8ab551fbe 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -652,7 +652,7 @@ static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, } } - return 0; + return -ENOENT; } static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, -- cgit v1.2.1 From cc74eddd0ff325d57373cea99f642b787d7f76f5 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 20 Apr 2018 15:15:04 -0400 Subject: net: sched: ife: handle malformed tlv length There is currently no handling to check on a invalid tlv length. This patch adds such handling to avoid killing the kernel with a malformed ife packet. Signed-off-by: Alexander Aring Reviewed-by: Yotam Gigi Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/ife.h | 3 ++- net/ife/ife.c | 35 +++++++++++++++++++++++++++++++++-- net/sched/act_ife.c | 7 ++++++- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/include/net/ife.h b/include/net/ife.h index 44b9c00f7223..e117617e3c34 100644 --- a/include/net/ife.h +++ b/include/net/ife.h @@ -12,7 +12,8 @@ void *ife_encode(struct sk_buff *skb, u16 metalen); void *ife_decode(struct sk_buff *skb, u16 *metalen); -void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen); +void *ife_tlv_meta_decode(void *skbdata, const void *ifehdr_end, u16 *attrtype, + u16 *dlen, u16 *totlen); int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval); diff --git a/net/ife/ife.c b/net/ife/ife.c index 7d1ec76e7f43..7fbe70a0af4b 100644 --- a/net/ife/ife.c +++ b/net/ife/ife.c @@ -92,12 +92,43 @@ struct meta_tlvhdr { __be16 len; }; +static bool __ife_tlv_meta_valid(const unsigned char *skbdata, + const unsigned char *ifehdr_end) +{ + const struct meta_tlvhdr *tlv; + u16 tlvlen; + + if (unlikely(skbdata + sizeof(*tlv) > ifehdr_end)) + return false; + + tlv = (const struct meta_tlvhdr *)skbdata; + tlvlen = ntohs(tlv->len); + + /* tlv length field is inc header, check on minimum */ + if (tlvlen < NLA_HDRLEN) + return false; + + /* overflow by NLA_ALIGN check */ + if (NLA_ALIGN(tlvlen) < tlvlen) + return false; + + if (unlikely(skbdata + NLA_ALIGN(tlvlen) > ifehdr_end)) + return false; + + return true; +} + /* Caller takes care of presenting data in network order */ -void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen) +void *ife_tlv_meta_decode(void *skbdata, const void *ifehdr_end, u16 *attrtype, + u16 *dlen, u16 *totlen) { - struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata; + struct meta_tlvhdr *tlv; + + if (!__ife_tlv_meta_valid(skbdata, ifehdr_end)) + return NULL; + tlv = (struct meta_tlvhdr *)skbdata; *dlen = ntohs(tlv->len) - NLA_HDRLEN; *attrtype = ntohs(tlv->type); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 49b8ab551fbe..8527cfdc446d 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -682,7 +682,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, u16 mtype; u16 dlen; - curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL); + curr_data = ife_tlv_meta_decode(tlv_data, ifehdr_end, &mtype, + &dlen, NULL); + if (!curr_data) { + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); + return TC_ACT_SHOT; + } if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) { /* abuse overlimits to count when we receive metadata -- cgit v1.2.1 From d57493d6d1be26c8ac8516a4463bfe24956978eb Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 20 Apr 2018 15:15:05 -0400 Subject: net: sched: ife: check on metadata length This patch checks if sk buffer is available to dererence ife header. If not then NULL will returned to signal an malformed ife packet. This avoids to crashing the kernel from outside. Signed-off-by: Alexander Aring Reviewed-by: Yotam Gigi Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/ife/ife.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ife/ife.c b/net/ife/ife.c index 7fbe70a0af4b..13bbf8cb6a39 100644 --- a/net/ife/ife.c +++ b/net/ife/ife.c @@ -69,6 +69,9 @@ void *ife_decode(struct sk_buff *skb, u16 *metalen) int total_pull; u16 ifehdrln; + if (!pskb_may_pull(skb, skb->dev->hard_header_len + IFE_METAHDRLEN)) + return NULL; + ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len); ifehdrln = ntohs(ifehdr->metalen); total_pull = skb->dev->hard_header_len + ifehdrln; -- cgit v1.2.1 From 660e309ddd6aa99bb4d2a859c4a0b56965e744ef Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Fri, 20 Apr 2018 14:25:32 -0500 Subject: ibmvnic: Clean actual number of RX or TX pools Avoid using value stored in the login response buffer when cleaning TX and RX buffer pools since these could be inconsistent depending on the device state. Instead use the field in the driver's private data that tracks the number of active pools. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 2df01ad98df7..6e8d6a6f6aaf 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1128,7 +1128,7 @@ static void clean_rx_pools(struct ibmvnic_adapter *adapter) if (!adapter->rx_pool) return; - rx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs); + rx_scrqs = adapter->num_active_rx_pools; rx_entries = adapter->req_rx_add_entries_per_subcrq; /* Free any remaining skbs in the rx buffer pools */ @@ -1177,7 +1177,7 @@ static void clean_tx_pools(struct ibmvnic_adapter *adapter) if (!adapter->tx_pool || !adapter->tso_pool) return; - tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs); + tx_scrqs = adapter->num_active_tx_pools; /* Free any remaining skbs in the tx buffer pools */ for (i = 0; i < tx_scrqs; i++) { -- cgit v1.2.1 From 7e5a206ab686f098367b61aca989f5cdfa8114a3 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 20 Apr 2018 15:57:30 +0200 Subject: tcp: don't read out-of-bounds opsize The old code reads the "opsize" variable from out-of-bounds memory (first byte behind the segment) if a broken TCP segment ends directly after an opcode that is neither EOL nor NOP. The result of the read isn't used for anything, so the worst thing that could theoretically happen is a pagefault; and since the physmap is usually mostly contiguous, even that seems pretty unlikely. The following C reproducer triggers the uninitialized read - however, you can't actually see anything happen unless you put something like a pr_warn() in tcp_parse_md5sig_option() to print the opsize. ==================================== #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void systemf(const char *command, ...) { char *full_command; va_list ap; va_start(ap, command); if (vasprintf(&full_command, command, ap) == -1) err(1, "vasprintf"); va_end(ap); printf("systemf: <<<%s>>>\n", full_command); system(full_command); } char *devname; int tun_alloc(char *name) { int fd = open("/dev/net/tun", O_RDWR); if (fd == -1) err(1, "open tun dev"); static struct ifreq req = { .ifr_flags = IFF_TUN|IFF_NO_PI }; strcpy(req.ifr_name, name); if (ioctl(fd, TUNSETIFF, &req)) err(1, "TUNSETIFF"); devname = req.ifr_name; printf("device name: %s\n", devname); return fd; } #define IPADDR(a,b,c,d) (((a)<<0)+((b)<<8)+((c)<<16)+((d)<<24)) void sum_accumulate(unsigned int *sum, void *data, int len) { assert((len&2)==0); for (int i=0; i> 16) + (sum & 0xffff); sum = (sum >> 16) + (sum & 0xffff); return htons(~sum); } void fix_ip_sum(struct iphdr *ip) { unsigned int sum = 0; sum_accumulate(&sum, ip, sizeof(*ip)); ip->check = sum_final(sum); } void fix_tcp_sum(struct iphdr *ip, struct tcphdr *tcp) { unsigned int sum = 0; struct { unsigned int saddr; unsigned int daddr; unsigned char pad; unsigned char proto_num; unsigned short tcp_len; } fakehdr = { .saddr = ip->saddr, .daddr = ip->daddr, .proto_num = ip->protocol, .tcp_len = htons(ntohs(ip->tot_len) - ip->ihl*4) }; sum_accumulate(&sum, &fakehdr, sizeof(fakehdr)); sum_accumulate(&sum, tcp, tcp->doff*4); tcp->check = sum_final(sum); } int main(void) { int tun_fd = tun_alloc("inject_dev%d"); systemf("ip link set %s up", devname); systemf("ip addr add 192.168.42.1/24 dev %s", devname); struct { struct iphdr ip; struct tcphdr tcp; unsigned char tcp_opts[20]; } __attribute__((packed)) syn_packet = { .ip = { .ihl = sizeof(struct iphdr)/4, .version = 4, .tot_len = htons(sizeof(syn_packet)), .ttl = 30, .protocol = IPPROTO_TCP, /* FIXUP check */ .saddr = IPADDR(192,168,42,2), .daddr = IPADDR(192,168,42,1) }, .tcp = { .source = htons(1), .dest = htons(1337), .seq = 0x12345678, .doff = (sizeof(syn_packet.tcp)+sizeof(syn_packet.tcp_opts))/4, .syn = 1, .window = htons(64), .check = 0 /*FIXUP*/ }, .tcp_opts = { /* INVALID: trailing MD5SIG opcode after NOPs */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 19 } }; fix_ip_sum(&syn_packet.ip); fix_tcp_sum(&syn_packet.ip, &syn_packet.tcp); while (1) { int write_res = write(tun_fd, &syn_packet, sizeof(syn_packet)); if (write_res != sizeof(syn_packet)) err(1, "packet write failed"); } } ==================================== Fixes: cfb6eeb4c860 ("[TCP]: MD5 Signature Option (RFC2385) support.") Signed-off-by: Jann Horn Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 367def6ddeda..e51c644484dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3868,11 +3868,8 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) int length = (th->doff << 2) - sizeof(*th); const u8 *ptr = (const u8 *)(th + 1); - /* If the TCP option is too short, we can short cut */ - if (length < TCPOLEN_MD5SIG) - return NULL; - - while (length > 0) { + /* If not enough data remaining, we can short cut */ + while (length >= TCPOLEN_MD5SIG) { int opcode = *ptr++; int opsize; -- cgit v1.2.1 From ddea788c63094f7c483783265563dd5b50052e28 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 22 Apr 2018 19:11:50 +0800 Subject: bonding: do not set slave_dev npinfo before slave_enable_netpoll in bond_enslave After Commit 8a8efa22f51b ("bonding: sync netpoll code with bridge"), it would set slave_dev npinfo in slave_enable_netpoll when enslaving a dev if bond->dev->npinfo was set. However now slave_dev npinfo is set with bond->dev->npinfo before calling slave_enable_netpoll. With slave_dev npinfo set, __netpoll_setup called in slave_enable_netpoll will not call slave dev's .ndo_netpoll_setup(). It causes that the lower dev of this slave dev can't set its npinfo. One way to reproduce it: # modprobe bonding # brctl addbr br0 # brctl addif br0 eth1 # ifconfig bond0 192.168.122.1/24 up # ifenslave bond0 eth2 # systemctl restart netconsole # ifenslave bond0 br0 # ifconfig eth2 down # systemctl restart netconsole The netpoll won't really work. This patch is to remove that slave_dev npinfo setting in bond_enslave(). Fixes: 8a8efa22f51b ("bonding: sync netpoll code with bridge") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b7b113018853..718e4914e3a0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1660,8 +1660,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, } /* switch(bond_mode) */ #ifdef CONFIG_NET_POLL_CONTROLLER - slave_dev->npinfo = bond->dev->npinfo; - if (slave_dev->npinfo) { + if (bond->dev->npinfo) { if (slave_enable_netpoll(new_slave)) { netdev_info(bond_dev, "master_dev is using netpoll, but new slave device does not support netpoll\n"); res = -EBUSY; -- cgit v1.2.1 From aa8f8778493c85fff480cdf8b349b1e1dcb5f243 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Apr 2018 18:29:23 -0700 Subject: ipv6: add RTA_TABLE and RTA_PREFSRC to rtm_ipv6_policy KMSAN reported use of uninit-value that I tracked to lack of proper size check on RTA_TABLE attribute. I also believe RTA_PREFSRC lacks a similar check. Fixes: 86872cb57925 ("[IPv6] route: FIB6 configuration using struct fib6_config") Fixes: c3968a857a6b ("ipv6: RTA_PREFSRC support for ipv6 route source address selection") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 49b954d6d0fa..cde7d8251377 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3975,6 +3975,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu) static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, + [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, @@ -3986,6 +3987,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_EXPIRES] = { .type = NLA_U32 }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, + [RTA_TABLE] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.1 From b6a930fa88083b41d26ddf1cab95cbd740936c22 Mon Sep 17 00:00:00 2001 From: Jingju Hou Date: Mon, 23 Apr 2018 15:22:49 +0800 Subject: net: phy: marvell: clear wol event before setting it If WOL event happened once, the LED[2] interrupt pin will not be cleared unless we read the CSISR register. If interrupts are in use, the normal interrupt handling will clear the WOL event. Let's clear the WOL event before enabling it if !phy_interrupt_is_valid(). Signed-off-by: Jingju Hou Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index c22e8e383247..25e2a099b71c 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1393,6 +1393,15 @@ static int m88e1318_set_wol(struct phy_device *phydev, if (err < 0) goto error; + /* If WOL event happened once, the LED[2] interrupt pin + * will not be cleared unless we reading the interrupt status + * register. If interrupts are in use, the normal interrupt + * handling will clear the WOL event. Clear the WOL event + * before enabling it if !phy_interrupt_is_valid() + */ + if (!phy_interrupt_is_valid(phydev)) + phy_read(phydev, MII_M1011_IEVENT); + /* Enable the WOL interrupt */ err = __phy_modify(phydev, MII_88E1318S_PHY_CSIER, 0, MII_88E1318S_PHY_CSIER_WOL_EIE); -- cgit v1.2.1 From eb1c28c05894a4b1f6b56c5bf072205e64cfa280 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 23 Apr 2018 16:15:14 +0200 Subject: l2tp: check sockaddr length in pppol2tp_connect() Check sockaddr_len before dereferencing sp->sa_protocol, to ensure that it actually points to valid data. Fixes: fd558d186df2 ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts") Reported-by: syzbot+a70ac890b23b1bf29f5c@syzkaller.appspotmail.com Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 1404bc1c1bb7..1fd9e145076a 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -619,6 +619,13 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, lock_sock(sk); error = -EINVAL; + + if (sockaddr_len != sizeof(struct sockaddr_pppol2tp) && + sockaddr_len != sizeof(struct sockaddr_pppol2tpv3) && + sockaddr_len != sizeof(struct sockaddr_pppol2tpin6) && + sockaddr_len != sizeof(struct sockaddr_pppol2tpv3in6)) + goto end; + if (sp->sa_protocol != PX_PROTO_OL2TP) goto end; -- cgit v1.2.1 From a49e2f5d5fb141884452ddb428f551b123d436b5 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 23 Apr 2018 16:38:27 +0200 Subject: pppoe: check sockaddr length in pppoe_connect() We must validate sockaddr_len, otherwise userspace can pass fewer data than we expect and we end up accessing invalid data. Fixes: 224cf5ad14c0 ("ppp: Move the PPP drivers") Reported-by: syzbot+4f03bdf92fdf9ef5ddab@syzkaller.appspotmail.com Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- drivers/net/ppp/pppoe.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 1483bc7b01e1..7df07337d69c 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -620,6 +620,10 @@ static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr, lock_sock(sk); error = -EINVAL; + + if (sockaddr_len != sizeof(struct sockaddr_pppox)) + goto end; + if (sp->sa_protocol != PX_PROTO_OE) goto end; -- cgit v1.2.1 From 4d945663a6a0acf3cbe45940503f2eb9584bfee7 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 23 Apr 2018 11:43:08 -0500 Subject: amd-xgbe: Add pre/post auto-negotiation phy hooks Add hooks to the driver auto-negotiation (AN) flow to allow the different phy implementations to perform any steps necessary to improve AN. Signed-off-by: Tom Lendacky Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-mdio.c | 16 ++++++++++++++-- drivers/net/ethernet/amd/xgbe/xgbe.h | 5 +++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c index 072b9f664597..e3d361e242aa 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c @@ -437,6 +437,9 @@ static void xgbe_an73_disable(struct xgbe_prv_data *pdata) static void xgbe_an_restart(struct xgbe_prv_data *pdata) { + if (pdata->phy_if.phy_impl.an_pre) + pdata->phy_if.phy_impl.an_pre(pdata); + switch (pdata->an_mode) { case XGBE_AN_MODE_CL73: case XGBE_AN_MODE_CL73_REDRV: @@ -453,6 +456,9 @@ static void xgbe_an_restart(struct xgbe_prv_data *pdata) static void xgbe_an_disable(struct xgbe_prv_data *pdata) { + if (pdata->phy_if.phy_impl.an_post) + pdata->phy_if.phy_impl.an_post(pdata); + switch (pdata->an_mode) { case XGBE_AN_MODE_CL73: case XGBE_AN_MODE_CL73_REDRV: @@ -637,11 +643,11 @@ static enum xgbe_an xgbe_an73_incompat_link(struct xgbe_prv_data *pdata) return XGBE_AN_NO_LINK; } - xgbe_an73_disable(pdata); + xgbe_an_disable(pdata); xgbe_switch_mode(pdata); - xgbe_an73_restart(pdata); + xgbe_an_restart(pdata); return XGBE_AN_INCOMPAT_LINK; } @@ -820,6 +826,9 @@ static void xgbe_an37_state_machine(struct xgbe_prv_data *pdata) pdata->an_result = pdata->an_state; pdata->an_state = XGBE_AN_READY; + if (pdata->phy_if.phy_impl.an_post) + pdata->phy_if.phy_impl.an_post(pdata); + netif_dbg(pdata, link, pdata->netdev, "CL37 AN result: %s\n", xgbe_state_as_string(pdata->an_result)); } @@ -903,6 +912,9 @@ again: pdata->kx_state = XGBE_RX_BPA; pdata->an_start = 0; + if (pdata->phy_if.phy_impl.an_post) + pdata->phy_if.phy_impl.an_post(pdata); + netif_dbg(pdata, link, pdata->netdev, "CL73 AN result: %s\n", xgbe_state_as_string(pdata->an_result)); } diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index ad102c8bac7b..fa0b51ea1b95 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -833,6 +833,7 @@ struct xgbe_hw_if { /* This structure represents implementation specific routines for an * implementation of a PHY. All routines are required unless noted below. * Optional routines: + * an_pre, an_post * kr_training_pre, kr_training_post */ struct xgbe_phy_impl_if { @@ -875,6 +876,10 @@ struct xgbe_phy_impl_if { /* Process results of auto-negotiation */ enum xgbe_mode (*an_outcome)(struct xgbe_prv_data *); + /* Pre/Post auto-negotiation support */ + void (*an_pre)(struct xgbe_prv_data *); + void (*an_post)(struct xgbe_prv_data *); + /* Pre/Post KR training enablement support */ void (*kr_training_pre)(struct xgbe_prv_data *); void (*kr_training_post)(struct xgbe_prv_data *); -- cgit v1.2.1 From 96f4d430c507ed4856048c2dc9c1a2ea5b5e74e4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 23 Apr 2018 11:43:17 -0500 Subject: amd-xgbe: Improve KR auto-negotiation and training Update xgbe-phy-v2.c to make use of the auto-negotiation (AN) phy hooks to improve the ability to successfully complete Clause 73 AN when running at 10gbps. Hardware can sometimes have issues with CDR lock when the AN DME page exchange is being performed. The AN and KR training hooks are used as follows: - The pre AN hook is used to disable CDR tracking in the PHY so that the DME page exchange can be successfully and consistently completed. - The post KR training hook is used to re-enable the CDR tracking so that KR training can successfully complete. - The post AN hook is used to check for an unsuccessful AN which will increase a CDR tracking enablement delay (up to a maximum value). Add two debugfs entries to allow control over use of the CDR tracking workaround. The debugfs entries allow the CDR tracking workaround to be disabled and determine whether to re-enable CDR tracking before or after link training has been initiated. Also, with these changes the receiver reset cycle that is performed during the link status check can be performed less often. Signed-off-by: Tom Lendacky Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-common.h | 8 ++ drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c | 16 ++++ drivers/net/ethernet/amd/xgbe/xgbe-main.c | 1 + drivers/net/ethernet/amd/xgbe/xgbe-mdio.c | 8 +- drivers/net/ethernet/amd/xgbe/xgbe-pci.c | 2 + drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 125 ++++++++++++++++++++++++++- drivers/net/ethernet/amd/xgbe/xgbe.h | 4 + 7 files changed, 160 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h index 7ea72ef11a55..d272dc6984ac 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h @@ -1321,6 +1321,10 @@ #define MDIO_VEND2_AN_STAT 0x8002 #endif +#ifndef MDIO_VEND2_PMA_CDR_CONTROL +#define MDIO_VEND2_PMA_CDR_CONTROL 0x8056 +#endif + #ifndef MDIO_CTRL1_SPEED1G #define MDIO_CTRL1_SPEED1G (MDIO_CTRL1_SPEED10G & ~BMCR_SPEED100) #endif @@ -1369,6 +1373,10 @@ #define XGBE_AN_CL37_TX_CONFIG_MASK 0x08 #define XGBE_AN_CL37_MII_CTRL_8BIT 0x0100 +#define XGBE_PMA_CDR_TRACK_EN_MASK 0x01 +#define XGBE_PMA_CDR_TRACK_EN_OFF 0x00 +#define XGBE_PMA_CDR_TRACK_EN_ON 0x01 + /* Bit setting and getting macros * The get macro will extract the current bit field value from within * the variable diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c index 7d128be61310..b91143947ed2 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c @@ -519,6 +519,22 @@ void xgbe_debugfs_init(struct xgbe_prv_data *pdata) "debugfs_create_file failed\n"); } + if (pdata->vdata->an_cdr_workaround) { + pfile = debugfs_create_bool("an_cdr_workaround", 0600, + pdata->xgbe_debugfs, + &pdata->debugfs_an_cdr_workaround); + if (!pfile) + netdev_err(pdata->netdev, + "debugfs_create_bool failed\n"); + + pfile = debugfs_create_bool("an_cdr_track_early", 0600, + pdata->xgbe_debugfs, + &pdata->debugfs_an_cdr_track_early); + if (!pfile) + netdev_err(pdata->netdev, + "debugfs_create_bool failed\n"); + } + kfree(buf); } diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c index 795e556d4a3f..441d0973957b 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c @@ -349,6 +349,7 @@ int xgbe_config_netdev(struct xgbe_prv_data *pdata) XGMAC_SET_BITS(pdata->rss_options, MAC_RSSCR, UDP4TE, 1); /* Call MDIO/PHY initialization routine */ + pdata->debugfs_an_cdr_workaround = pdata->vdata->an_cdr_workaround; ret = pdata->phy_if.phy_init(pdata); if (ret) return ret; diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c index e3d361e242aa..1b45cd73a258 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c @@ -432,6 +432,8 @@ static void xgbe_an73_disable(struct xgbe_prv_data *pdata) xgbe_an73_set(pdata, false, false); xgbe_an73_disable_interrupts(pdata); + pdata->an_start = 0; + netif_dbg(pdata, link, pdata->netdev, "CL73 AN disabled\n"); } @@ -511,11 +513,11 @@ static enum xgbe_an xgbe_an73_tx_training(struct xgbe_prv_data *pdata, XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg); - if (pdata->phy_if.phy_impl.kr_training_post) - pdata->phy_if.phy_impl.kr_training_post(pdata); - netif_dbg(pdata, link, pdata->netdev, "KR training initiated\n"); + + if (pdata->phy_if.phy_impl.kr_training_post) + pdata->phy_if.phy_impl.kr_training_post(pdata); } return XGBE_AN_PAGE_RECEIVED; diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c index eb23f9ba1a9a..82d1f416ee2a 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c @@ -456,6 +456,7 @@ static const struct xgbe_version_data xgbe_v2a = { .irq_reissue_support = 1, .tx_desc_prefetch = 5, .rx_desc_prefetch = 5, + .an_cdr_workaround = 1, }; static const struct xgbe_version_data xgbe_v2b = { @@ -470,6 +471,7 @@ static const struct xgbe_version_data xgbe_v2b = { .irq_reissue_support = 1, .tx_desc_prefetch = 5, .rx_desc_prefetch = 5, + .an_cdr_workaround = 1, }; static const struct pci_device_id xgbe_pci_table[] = { diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index 3304a291aa96..b48efc04c4da 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -147,6 +147,14 @@ /* Rate-change complete wait/retry count */ #define XGBE_RATECHANGE_COUNT 500 +/* CDR delay values for KR support (in usec) */ +#define XGBE_CDR_DELAY_INIT 10000 +#define XGBE_CDR_DELAY_INC 10000 +#define XGBE_CDR_DELAY_MAX 100000 + +/* RRC frequency during link status check */ +#define XGBE_RRC_FREQUENCY 10 + enum xgbe_port_mode { XGBE_PORT_MODE_RSVD = 0, XGBE_PORT_MODE_BACKPLANE, @@ -355,6 +363,10 @@ struct xgbe_phy_data { unsigned int redrv_addr; unsigned int redrv_lane; unsigned int redrv_model; + + /* KR AN support */ + unsigned int phy_cdr_notrack; + unsigned int phy_cdr_delay; }; /* I2C, MDIO and GPIO lines are muxed, so only one device at a time */ @@ -2361,7 +2373,7 @@ static int xgbe_phy_link_status(struct xgbe_prv_data *pdata, int *an_restart) return 1; /* No link, attempt a receiver reset cycle */ - if (phy_data->rrc_count++) { + if (phy_data->rrc_count++ > XGBE_RRC_FREQUENCY) { phy_data->rrc_count = 0; xgbe_phy_rrc(pdata); } @@ -2669,6 +2681,103 @@ static bool xgbe_phy_port_enabled(struct xgbe_prv_data *pdata) return true; } +static void xgbe_phy_cdr_track(struct xgbe_prv_data *pdata) +{ + struct xgbe_phy_data *phy_data = pdata->phy_data; + + if (!pdata->debugfs_an_cdr_workaround) + return; + + if (!phy_data->phy_cdr_notrack) + return; + + usleep_range(phy_data->phy_cdr_delay, + phy_data->phy_cdr_delay + 500); + + XMDIO_WRITE_BITS(pdata, MDIO_MMD_PMAPMD, MDIO_VEND2_PMA_CDR_CONTROL, + XGBE_PMA_CDR_TRACK_EN_MASK, + XGBE_PMA_CDR_TRACK_EN_ON); + + phy_data->phy_cdr_notrack = 0; +} + +static void xgbe_phy_cdr_notrack(struct xgbe_prv_data *pdata) +{ + struct xgbe_phy_data *phy_data = pdata->phy_data; + + if (!pdata->debugfs_an_cdr_workaround) + return; + + if (phy_data->phy_cdr_notrack) + return; + + XMDIO_WRITE_BITS(pdata, MDIO_MMD_PMAPMD, MDIO_VEND2_PMA_CDR_CONTROL, + XGBE_PMA_CDR_TRACK_EN_MASK, + XGBE_PMA_CDR_TRACK_EN_OFF); + + xgbe_phy_rrc(pdata); + + phy_data->phy_cdr_notrack = 1; +} + +static void xgbe_phy_kr_training_post(struct xgbe_prv_data *pdata) +{ + if (!pdata->debugfs_an_cdr_track_early) + xgbe_phy_cdr_track(pdata); +} + +static void xgbe_phy_kr_training_pre(struct xgbe_prv_data *pdata) +{ + if (pdata->debugfs_an_cdr_track_early) + xgbe_phy_cdr_track(pdata); +} + +static void xgbe_phy_an_post(struct xgbe_prv_data *pdata) +{ + struct xgbe_phy_data *phy_data = pdata->phy_data; + + switch (pdata->an_mode) { + case XGBE_AN_MODE_CL73: + case XGBE_AN_MODE_CL73_REDRV: + if (phy_data->cur_mode != XGBE_MODE_KR) + break; + + xgbe_phy_cdr_track(pdata); + + switch (pdata->an_result) { + case XGBE_AN_READY: + case XGBE_AN_COMPLETE: + break; + default: + if (phy_data->phy_cdr_delay < XGBE_CDR_DELAY_MAX) + phy_data->phy_cdr_delay += XGBE_CDR_DELAY_INC; + else + phy_data->phy_cdr_delay = XGBE_CDR_DELAY_INIT; + break; + } + break; + default: + break; + } +} + +static void xgbe_phy_an_pre(struct xgbe_prv_data *pdata) +{ + struct xgbe_phy_data *phy_data = pdata->phy_data; + + switch (pdata->an_mode) { + case XGBE_AN_MODE_CL73: + case XGBE_AN_MODE_CL73_REDRV: + if (phy_data->cur_mode != XGBE_MODE_KR) + break; + + xgbe_phy_cdr_notrack(pdata); + break; + default: + break; + } +} + static void xgbe_phy_stop(struct xgbe_prv_data *pdata) { struct xgbe_phy_data *phy_data = pdata->phy_data; @@ -2680,6 +2789,9 @@ static void xgbe_phy_stop(struct xgbe_prv_data *pdata) xgbe_phy_sfp_reset(phy_data); xgbe_phy_sfp_mod_absent(pdata); + /* Reset CDR support */ + xgbe_phy_cdr_track(pdata); + /* Power off the PHY */ xgbe_phy_power_off(pdata); @@ -2712,6 +2824,9 @@ static int xgbe_phy_start(struct xgbe_prv_data *pdata) /* Start in highest supported mode */ xgbe_phy_set_mode(pdata, phy_data->start_mode); + /* Reset CDR support */ + xgbe_phy_cdr_track(pdata); + /* After starting the I2C controller, we can check for an SFP */ switch (phy_data->port_mode) { case XGBE_PORT_MODE_SFP: @@ -3019,6 +3134,8 @@ static int xgbe_phy_init(struct xgbe_prv_data *pdata) } } + phy_data->phy_cdr_delay = XGBE_CDR_DELAY_INIT; + /* Register for driving external PHYs */ mii = devm_mdiobus_alloc(pdata->dev); if (!mii) { @@ -3071,4 +3188,10 @@ void xgbe_init_function_ptrs_phy_v2(struct xgbe_phy_if *phy_if) phy_impl->an_advertising = xgbe_phy_an_advertising; phy_impl->an_outcome = xgbe_phy_an_outcome; + + phy_impl->an_pre = xgbe_phy_an_pre; + phy_impl->an_post = xgbe_phy_an_post; + + phy_impl->kr_training_pre = xgbe_phy_kr_training_pre; + phy_impl->kr_training_post = xgbe_phy_kr_training_post; } diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index fa0b51ea1b95..95d4b56448c6 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -994,6 +994,7 @@ struct xgbe_version_data { unsigned int irq_reissue_support; unsigned int tx_desc_prefetch; unsigned int rx_desc_prefetch; + unsigned int an_cdr_workaround; }; struct xgbe_vxlan_data { @@ -1262,6 +1263,9 @@ struct xgbe_prv_data { unsigned int debugfs_xprop_reg; unsigned int debugfs_xi2c_reg; + + bool debugfs_an_cdr_workaround; + bool debugfs_an_cdr_track_early; }; /* Function prototypes*/ -- cgit v1.2.1 From 117df655f8ed51adb6e6b163812a06ebeae9f453 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 23 Apr 2018 11:43:34 -0500 Subject: amd-xgbe: Only use the SFP supported transceiver signals The SFP eeprom indicates the transceiver signals (Rx LOS, Tx Fault, etc.) that it supports. Update the driver to include checking the eeprom data when deciding whether to use a transceiver signal. Signed-off-by: Tom Lendacky Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 71 ++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index b48efc04c4da..aac884314000 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -253,6 +253,10 @@ enum xgbe_sfp_speed { #define XGBE_SFP_BASE_VENDOR_SN 4 #define XGBE_SFP_BASE_VENDOR_SN_LEN 16 +#define XGBE_SFP_EXTD_OPT1 1 +#define XGBE_SFP_EXTD_OPT1_RX_LOS BIT(1) +#define XGBE_SFP_EXTD_OPT1_TX_FAULT BIT(3) + #define XGBE_SFP_EXTD_DIAG 28 #define XGBE_SFP_EXTD_DIAG_ADDR_CHANGE BIT(2) @@ -332,6 +336,7 @@ struct xgbe_phy_data { unsigned int sfp_gpio_address; unsigned int sfp_gpio_mask; + unsigned int sfp_gpio_inputs; unsigned int sfp_gpio_rx_los; unsigned int sfp_gpio_tx_fault; unsigned int sfp_gpio_mod_absent; @@ -986,6 +991,49 @@ static void xgbe_phy_sfp_external_phy(struct xgbe_prv_data *pdata) phy_data->sfp_phy_avail = 1; } +static bool xgbe_phy_check_sfp_rx_los(struct xgbe_phy_data *phy_data) +{ + u8 *sfp_extd = phy_data->sfp_eeprom.extd; + + if (!(sfp_extd[XGBE_SFP_EXTD_OPT1] & XGBE_SFP_EXTD_OPT1_RX_LOS)) + return false; + + if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_RX_LOS) + return false; + + if (phy_data->sfp_gpio_inputs & (1 << phy_data->sfp_gpio_rx_los)) + return true; + + return false; +} + +static bool xgbe_phy_check_sfp_tx_fault(struct xgbe_phy_data *phy_data) +{ + u8 *sfp_extd = phy_data->sfp_eeprom.extd; + + if (!(sfp_extd[XGBE_SFP_EXTD_OPT1] & XGBE_SFP_EXTD_OPT1_TX_FAULT)) + return false; + + if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_TX_FAULT) + return false; + + if (phy_data->sfp_gpio_inputs & (1 << phy_data->sfp_gpio_tx_fault)) + return true; + + return false; +} + +static bool xgbe_phy_check_sfp_mod_absent(struct xgbe_phy_data *phy_data) +{ + if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_MOD_ABSENT) + return false; + + if (phy_data->sfp_gpio_inputs & (1 << phy_data->sfp_gpio_mod_absent)) + return true; + + return false; +} + static bool xgbe_phy_belfuse_parse_quirks(struct xgbe_prv_data *pdata) { struct xgbe_phy_data *phy_data = pdata->phy_data; @@ -1031,6 +1079,10 @@ static void xgbe_phy_sfp_parse_eeprom(struct xgbe_prv_data *pdata) if (sfp_base[XGBE_SFP_BASE_EXT_ID] != XGBE_SFP_EXT_ID_SFP) return; + /* Update transceiver signals (eeprom extd/options) */ + phy_data->sfp_tx_fault = xgbe_phy_check_sfp_tx_fault(phy_data); + phy_data->sfp_rx_los = xgbe_phy_check_sfp_rx_los(phy_data); + if (xgbe_phy_sfp_parse_quirks(pdata)) return; @@ -1196,7 +1248,6 @@ put: static void xgbe_phy_sfp_signals(struct xgbe_prv_data *pdata) { struct xgbe_phy_data *phy_data = pdata->phy_data; - unsigned int gpio_input; u8 gpio_reg, gpio_ports[2]; int ret; @@ -1211,23 +1262,9 @@ static void xgbe_phy_sfp_signals(struct xgbe_prv_data *pdata) return; } - gpio_input = (gpio_ports[1] << 8) | gpio_ports[0]; - - if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_MOD_ABSENT) { - /* No GPIO, just assume the module is present for now */ - phy_data->sfp_mod_absent = 0; - } else { - if (!(gpio_input & (1 << phy_data->sfp_gpio_mod_absent))) - phy_data->sfp_mod_absent = 0; - } - - if (!(phy_data->sfp_gpio_mask & XGBE_GPIO_NO_RX_LOS) && - (gpio_input & (1 << phy_data->sfp_gpio_rx_los))) - phy_data->sfp_rx_los = 1; + phy_data->sfp_gpio_inputs = (gpio_ports[1] << 8) | gpio_ports[0]; - if (!(phy_data->sfp_gpio_mask & XGBE_GPIO_NO_TX_FAULT) && - (gpio_input & (1 << phy_data->sfp_gpio_tx_fault))) - phy_data->sfp_tx_fault = 1; + phy_data->sfp_mod_absent = xgbe_phy_check_sfp_mod_absent(phy_data); } static void xgbe_phy_sfp_mod_absent(struct xgbe_prv_data *pdata) -- cgit v1.2.1 From 9cf2f437ca5b39828984064fad213e68fc17ef11 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 24 Apr 2018 14:33:37 +0800 Subject: team: fix netconsole setup over team MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The same fix in Commit dbe173079ab5 ("bridge: fix netconsole setup over bridge") is also needed for team driver. While at it, remove the unnecessary parameter *team from team_port_enable_netpoll(). v1->v2: - fix it in a better way, as does bridge. Fixes: 0fb52a27a04a ("team: cleanup netpoll clode") Reported-by: João Avelino Bellomo Filho Signed-off-by: Xin Long Signed-off-by: David S. Miller --- drivers/net/team/team.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index acbe84967834..ddb6bf85a59c 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1072,14 +1072,11 @@ static void team_port_leave(struct team *team, struct team_port *port) } #ifdef CONFIG_NET_POLL_CONTROLLER -static int team_port_enable_netpoll(struct team *team, struct team_port *port) +static int __team_port_enable_netpoll(struct team_port *port) { struct netpoll *np; int err; - if (!team->dev->npinfo) - return 0; - np = kzalloc(sizeof(*np), GFP_KERNEL); if (!np) return -ENOMEM; @@ -1093,6 +1090,14 @@ static int team_port_enable_netpoll(struct team *team, struct team_port *port) return err; } +static int team_port_enable_netpoll(struct team_port *port) +{ + if (!port->team->dev->npinfo) + return 0; + + return __team_port_enable_netpoll(port); +} + static void team_port_disable_netpoll(struct team_port *port) { struct netpoll *np = port->np; @@ -1107,7 +1112,7 @@ static void team_port_disable_netpoll(struct team_port *port) kfree(np); } #else -static int team_port_enable_netpoll(struct team *team, struct team_port *port) +static int team_port_enable_netpoll(struct team_port *port) { return 0; } @@ -1221,7 +1226,7 @@ static int team_port_add(struct team *team, struct net_device *port_dev, goto err_vids_add; } - err = team_port_enable_netpoll(team, port); + err = team_port_enable_netpoll(port); if (err) { netdev_err(dev, "Failed to enable netpoll on device %s\n", portname); @@ -1918,7 +1923,7 @@ static int team_netpoll_setup(struct net_device *dev, mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { - err = team_port_enable_netpoll(team, port); + err = __team_port_enable_netpoll(port); if (err) { __team_netpoll_cleanup(team); break; -- cgit v1.2.1 From 39035bfdc3f18987aba04165060bfbfa10ffc1cd Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 27 Mar 2018 15:21:48 +0100 Subject: ixgbevf: ensure xdp_ring resources are free'd on error exit The current error handling for failed resource setup for xdp_ring data is a break out of the loop and returning 0 indicated everything was OK, when in fact it is not. Fix this by exiting via the error exit label err_setup_tx that will clean up the resources correctly and return and error status. Detected by CoverityScan, CID#1466879 ("Logically dead code") Fixes: 21092e9ce8b1 ("ixgbevf: Add support for XDP_TX action") Signed-off-by: Colin Ian King Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 3d9033f26eff..e3d04f226d57 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -3420,7 +3420,7 @@ static int ixgbevf_setup_all_tx_resources(struct ixgbevf_adapter *adapter) if (!err) continue; hw_dbg(&adapter->hw, "Allocation for XDP Queue %u failed\n", j); - break; + goto err_setup_tx; } return 0; -- cgit v1.2.1 From 2707df9773cd2cb8b0f35b8592431b301da9d352 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Fri, 30 Mar 2018 17:06:52 -0700 Subject: igb: Fix the transmission mode of queue 0 for Qav mode When Qav mode is enabled, queue 0 should be kept on Stream Reservation mode. From the i210 datasheet, section 8.12.19: "Note: Queue0 QueueMode must be set to 1b when TransmitMode is set to Qav." ("QueueMode 1b" represents the Stream Reservation mode) The solution is to give queue 0 the all the credits it might need, so it has priority over queue 1. A situation where this can happen is when cbs is "installed" only on queue 1, leaving queue 0 alone. For example: $ tc qdisc replace dev enp2s0 handle 100: parent root mqprio num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0 $ tc qdisc replace dev enp2s0 parent 100:2 cbs locredit -1470 \ hicredit 30 sendslope -980000 idleslope 20000 offload 1 Signed-off-by: Vinicius Costa Gomes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index c1c0bc30a16d..cce7ada89255 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -1700,7 +1700,22 @@ static void igb_configure_cbs(struct igb_adapter *adapter, int queue, WARN_ON(hw->mac.type != e1000_i210); WARN_ON(queue < 0 || queue > 1); - if (enable) { + if (enable || queue == 0) { + /* i210 does not allow the queue 0 to be in the Strict + * Priority mode while the Qav mode is enabled, so, + * instead of disabling strict priority mode, we give + * queue 0 the maximum of credits possible. + * + * See section 8.12.19 of the i210 datasheet, "Note: + * Queue0 QueueMode must be set to 1b when + * TransmitMode is set to Qav." + */ + if (queue == 0 && !enable) { + /* max "linkspeed" idleslope in kbps */ + idleslope = 1000000; + hicredit = ETH_FRAME_LEN; + } + set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH); set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION); -- cgit v1.2.1 From d332a38c9519e0208f04da465bc88427db3485f6 Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Tue, 10 Apr 2018 10:49:49 -0700 Subject: ice: Fix initialization for num_nodes_added ice_sched_add_nodes_to_layer is used recursively, and so we start with num_nodes_added being 0. This way, in case of an error or if num_nodes is NULL, the function just returns 0 to indicate that no nodes were added. Fixes: 5513b920a4f7 ("ice: Update Tx scheduler tree for VSI multi-Tx queue support") Signed-off-by: Anirudh Venkataramanan Tested-by: Tony Brelinski Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c index f16ff3e4a840..2e6c1d92cc88 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.c +++ b/drivers/net/ethernet/intel/ice/ice_sched.c @@ -751,14 +751,14 @@ ice_sched_add_nodes_to_layer(struct ice_port_info *pi, u16 num_added = 0; u32 temp; + *num_nodes_added = 0; + if (!num_nodes) return status; if (!parent || layer < hw->sw_entry_point_layer) return ICE_ERR_PARAM; - *num_nodes_added = 0; - /* max children per node per layer */ max_child_nodes = le16_to_cpu(hw->layer_info[parent->tx_sched_layer].max_children); -- cgit v1.2.1 From 34357a90d5ca8228df4f88b21197f970285b209b Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Wed, 11 Apr 2018 10:41:47 -0700 Subject: ice: Fix incorrect comment for action type Action type 5 defines large action generic values. Fix comment to reflect that better. Signed-off-by: Anirudh Venkataramanan Tested-by: Tony Brelinski Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 5b13ca1bd85f..7dc5f045e969 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -586,7 +586,7 @@ struct ice_sw_rule_lg_act { #define ICE_LG_ACT_MIRROR_VSI_ID_S 3 #define ICE_LG_ACT_MIRROR_VSI_ID_M (0x3FF << ICE_LG_ACT_MIRROR_VSI_ID_S) - /* Action type = 5 - Large Action */ + /* Action type = 5 - Generic Value */ #define ICE_LG_ACT_GENERIC 0x5 #define ICE_LG_ACT_GENERIC_VALUE_S 3 #define ICE_LG_ACT_GENERIC_VALUE_M (0xFFFF << ICE_LG_ACT_GENERIC_VALUE_S) -- cgit v1.2.1 From 30d84397affb0fcb11beaf049caabfcb1dac65a6 Mon Sep 17 00:00:00 2001 From: Ben Shelton Date: Wed, 11 Apr 2018 12:21:33 -0700 Subject: ice: Do not check INTEVENT bit for OICR interrupts According to the hardware spec, checking the INTEVENT bit isn't a reliable way to detect if an OICR interrupt has occurred. This is because this bit can be cleared by the hardware/firmware before the interrupt service routine has run. So instead, just check for OICR events every time. Fixes: 940b61af02f4 ("ice: Initialize PF and setup miscellaneous interrupt") Signed-off-by: Ben Shelton Signed-off-by: Anirudh Venkataramanan Tested-by: Tony Brelinski Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_hw_autogen.h | 2 -- drivers/net/ethernet/intel/ice/ice_main.c | 4 ---- 2 files changed, 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h index 1b9e2ef48a9d..499904874b3f 100644 --- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h +++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h @@ -121,8 +121,6 @@ #define PFINT_FW_CTL_CAUSE_ENA_S 30 #define PFINT_FW_CTL_CAUSE_ENA_M BIT(PFINT_FW_CTL_CAUSE_ENA_S) #define PFINT_OICR 0x0016CA00 -#define PFINT_OICR_INTEVENT_S 0 -#define PFINT_OICR_INTEVENT_M BIT(PFINT_OICR_INTEVENT_S) #define PFINT_OICR_HLP_RDY_S 14 #define PFINT_OICR_HLP_RDY_M BIT(PFINT_OICR_HLP_RDY_S) #define PFINT_OICR_CPM_RDY_S 15 diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 210b7910f1cd..5299caf55a7f 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -1722,9 +1722,6 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data) oicr = rd32(hw, PFINT_OICR); ena_mask = rd32(hw, PFINT_OICR_ENA); - if (!(oicr & PFINT_OICR_INTEVENT_M)) - goto ena_intr; - if (oicr & PFINT_OICR_GRST_M) { u32 reset; /* we have a reset warning */ @@ -1782,7 +1779,6 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data) } ret = IRQ_HANDLED; -ena_intr: /* re-enable interrupt causes that are not handled during this pass */ wr32(hw, PFINT_OICR_ENA, ena_mask); if (!test_bit(__ICE_DOWN, pf->state)) { -- cgit v1.2.1 From a6361f0ca4b25460f2cdf3235ebe8115f622901e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 23 Apr 2018 17:37:03 -0400 Subject: packet: fix bitfield update race Updates to the bitfields in struct packet_sock are not atomic. Serialize these read-modify-write cycles. Move po->running into a separate variable. Its writes are protected by po->bind_lock (except for one startup case at packet_create). Also replace a textual precondition warning with lockdep annotation. All others are set only in packet_setsockopt. Serialize these updates by holding the socket lock. Analogous to other field updates, also hold the lock when testing whether a ring is active (pg_vec). Fixes: 8dc419447415 ("[PACKET]: Add optional checksum computation for recvmsg") Reported-by: DaeRyong Jeong Reported-by: Byoungyoung Lee Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 60 ++++++++++++++++++++++++++++++++++++-------------- net/packet/internal.h | 10 ++++----- 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c31b0687396a..01f3515cada0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -329,11 +329,11 @@ static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) skb_set_queue_mapping(skb, queue_index); } -/* register_prot_hook must be invoked with the po->bind_lock held, +/* __register_prot_hook must be invoked through register_prot_hook * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). */ -static void register_prot_hook(struct sock *sk) +static void __register_prot_hook(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); @@ -348,8 +348,13 @@ static void register_prot_hook(struct sock *sk) } } -/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock - * held. If the sync parameter is true, we will temporarily drop +static void register_prot_hook(struct sock *sk) +{ + lockdep_assert_held_once(&pkt_sk(sk)->bind_lock); + __register_prot_hook(sk); +} + +/* If the sync parameter is true, we will temporarily drop * the po->bind_lock and do a synchronize_net to make sure no * asynchronous packet processing paths still refer to the elements * of po->prot_hook. If the sync parameter is false, it is the @@ -359,6 +364,8 @@ static void __unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); + lockdep_assert_held_once(&po->bind_lock); + po->running = 0; if (po->fanout) @@ -3252,7 +3259,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, if (proto) { po->prot_hook.type = proto; - register_prot_hook(sk); + __register_prot_hook(sk); } mutex_lock(&net->packet.sklist_lock); @@ -3732,12 +3739,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) - return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; - po->tp_loss = !!val; - return 0; + + lock_sock(sk); + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { + ret = -EBUSY; + } else { + po->tp_loss = !!val; + ret = 0; + } + release_sock(sk); + return ret; } case PACKET_AUXDATA: { @@ -3748,7 +3761,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; + lock_sock(sk); po->auxdata = !!val; + release_sock(sk); return 0; } case PACKET_ORIGDEV: @@ -3760,7 +3775,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; + lock_sock(sk); po->origdev = !!val; + release_sock(sk); return 0; } case PACKET_VNET_HDR: @@ -3769,15 +3786,20 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (sock->type != SOCK_RAW) return -EINVAL; - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) - return -EBUSY; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; - po->has_vnet_hdr = !!val; - return 0; + lock_sock(sk); + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { + ret = -EBUSY; + } else { + po->has_vnet_hdr = !!val; + ret = 0; + } + release_sock(sk); + return ret; } case PACKET_TIMESTAMP: { @@ -3815,11 +3837,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) - return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; - po->tp_tx_has_off = !!val; + + lock_sock(sk); + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { + ret = -EBUSY; + } else { + po->tp_tx_has_off = !!val; + ret = 0; + } + release_sock(sk); return 0; } case PACKET_QDISC_BYPASS: diff --git a/net/packet/internal.h b/net/packet/internal.h index a1d2b2319ae9..3bb7c5fb3bff 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -112,10 +112,12 @@ struct packet_sock { int copy_thresh; spinlock_t bind_lock; struct mutex pg_vec_lock; - unsigned int running:1, /* prot_hook is attached*/ - auxdata:1, + unsigned int running; /* bind_lock must be held */ + unsigned int auxdata:1, /* writer must hold sock lock */ origdev:1, - has_vnet_hdr:1; + has_vnet_hdr:1, + tp_loss:1, + tp_tx_has_off:1; int pressure; int ifindex; /* bound device */ __be16 num; @@ -125,8 +127,6 @@ struct packet_sock { enum tpacket_versions tp_version; unsigned int tp_hdrlen; unsigned int tp_reserve; - unsigned int tp_loss:1; - unsigned int tp_tx_has_off:1; unsigned int tp_tstamp; struct net_device __rcu *cached_dev; int (*xmit)(struct sk_buff *skb); -- cgit v1.2.1 From d805c5209350ae725e3a1ee0204ba27d9e75ce3e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 23 Apr 2018 15:51:38 -0700 Subject: net: ethtool: Add missing kernel doc for FEC parameters While adding support for ethtool::get_fecparam and set_fecparam, kernel doc for these functions was missed, add those. Fixes: 1a5f3da20bd9 ("net: ethtool: add support for forward error correction modes") Signed-off-by: Florian Fainelli Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index ebe41811ed34..b32cd2062f18 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -310,6 +310,8 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS * instead of the latter), any change to them will be overwritten * by kernel. Returns a negative error code or zero. + * @get_fecparam: Get the network device Forward Error Correction parameters. + * @set_fecparam: Set the network device Forward Error Correction parameters. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must -- cgit v1.2.1 From f8d6203780b73c07dc49ee421fedae8edb76b6e4 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 24 Apr 2018 17:09:30 +0100 Subject: sfc: ARFS filter IDs Associate an arbitrary ID with each ARFS filter, allowing to properly query for expiry. The association is maintained in a hash table, which is protected by a spinlock. v3: fix build warnings when CONFIG_RFS_ACCEL is disabled (thanks lkp-robot). v2: fixed uninitialised variable (thanks davem and lkp-robot). Fixes: 3af0f34290f6 ("sfc: replace asynchronous filter operations") Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef10.c | 80 +++++++++++-------- drivers/net/ethernet/sfc/efx.c | 143 ++++++++++++++++++++++++++++++++++ drivers/net/ethernet/sfc/efx.h | 21 +++++ drivers/net/ethernet/sfc/farch.c | 41 ++++++++-- drivers/net/ethernet/sfc/net_driver.h | 36 +++++++++ drivers/net/ethernet/sfc/rx.c | 62 +++++++++++++-- 6 files changed, 337 insertions(+), 46 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 83ce229f4eb7..63036d9bf3e6 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -3999,29 +3999,6 @@ static void efx_ef10_prepare_flr(struct efx_nic *efx) atomic_set(&efx->active_queues, 0); } -static bool efx_ef10_filter_equal(const struct efx_filter_spec *left, - const struct efx_filter_spec *right) -{ - if ((left->match_flags ^ right->match_flags) | - ((left->flags ^ right->flags) & - (EFX_FILTER_FLAG_RX | EFX_FILTER_FLAG_TX))) - return false; - - return memcmp(&left->outer_vid, &right->outer_vid, - sizeof(struct efx_filter_spec) - - offsetof(struct efx_filter_spec, outer_vid)) == 0; -} - -static unsigned int efx_ef10_filter_hash(const struct efx_filter_spec *spec) -{ - BUILD_BUG_ON(offsetof(struct efx_filter_spec, outer_vid) & 3); - return jhash2((const u32 *)&spec->outer_vid, - (sizeof(struct efx_filter_spec) - - offsetof(struct efx_filter_spec, outer_vid)) / 4, - 0); - /* XXX should we randomise the initval? */ -} - /* Decide whether a filter should be exclusive or else should allow * delivery to additional recipients. Currently we decide that * filters for specific local unicast MAC and IP addresses are @@ -4346,7 +4323,7 @@ static s32 efx_ef10_filter_insert(struct efx_nic *efx, goto out_unlock; match_pri = rc; - hash = efx_ef10_filter_hash(spec); + hash = efx_filter_spec_hash(spec); is_mc_recip = efx_filter_is_mc_recipient(spec); if (is_mc_recip) bitmap_zero(mc_rem_map, EFX_EF10_FILTER_SEARCH_LIMIT); @@ -4378,7 +4355,7 @@ static s32 efx_ef10_filter_insert(struct efx_nic *efx, if (!saved_spec) { if (ins_index < 0) ins_index = i; - } else if (efx_ef10_filter_equal(spec, saved_spec)) { + } else if (efx_filter_spec_equal(spec, saved_spec)) { if (spec->priority < saved_spec->priority && spec->priority != EFX_FILTER_PRI_AUTO) { rc = -EPERM; @@ -4762,27 +4739,62 @@ static s32 efx_ef10_filter_get_rx_ids(struct efx_nic *efx, static bool efx_ef10_filter_rfs_expire_one(struct efx_nic *efx, u32 flow_id, unsigned int filter_idx) { + struct efx_filter_spec *spec, saved_spec; struct efx_ef10_filter_table *table; - struct efx_filter_spec *spec; - bool ret; + struct efx_arfs_rule *rule = NULL; + bool ret = true, force = false; + u16 arfs_id; down_read(&efx->filter_sem); table = efx->filter_state; down_write(&table->lock); spec = efx_ef10_filter_entry_spec(table, filter_idx); - if (!spec || spec->priority != EFX_FILTER_PRI_HINT) { - ret = true; + if (!spec || spec->priority != EFX_FILTER_PRI_HINT) goto out_unlock; - } - if (!rps_may_expire_flow(efx->net_dev, spec->dmaq_id, flow_id, 0)) { - ret = false; - goto out_unlock; + spin_lock_bh(&efx->rps_hash_lock); + if (!efx->rps_hash_table) { + /* In the absence of the table, we always return 0 to ARFS. */ + arfs_id = 0; + } else { + rule = efx_rps_hash_find(efx, spec); + if (!rule) + /* ARFS table doesn't know of this filter, so remove it */ + goto expire; + arfs_id = rule->arfs_id; + ret = efx_rps_check_rule(rule, filter_idx, &force); + if (force) + goto expire; + if (!ret) { + spin_unlock_bh(&efx->rps_hash_lock); + goto out_unlock; + } } - + if (!rps_may_expire_flow(efx->net_dev, spec->dmaq_id, flow_id, arfs_id)) + ret = false; + else if (rule) + rule->filter_id = EFX_ARFS_FILTER_ID_REMOVING; +expire: + saved_spec = *spec; /* remove operation will kfree spec */ + spin_unlock_bh(&efx->rps_hash_lock); + /* At this point (since we dropped the lock), another thread might queue + * up a fresh insertion request (but the actual insertion will be held + * up by our possession of the filter table lock). In that case, it + * will set rule->filter_id to EFX_ARFS_FILTER_ID_PENDING, meaning that + * the rule is not removed by efx_rps_hash_del() below. + */ ret = efx_ef10_filter_remove_internal(efx, 1U << spec->priority, filter_idx, true) == 0; + /* While we can't safely dereference rule (we dropped the lock), we can + * still test it for NULL. + */ + if (ret && rule) { + /* Expiring, so remove entry from ARFS table */ + spin_lock_bh(&efx->rps_hash_lock); + efx_rps_hash_del(efx, &saved_spec); + spin_unlock_bh(&efx->rps_hash_lock); + } out_unlock: up_write(&table->lock); up_read(&efx->filter_sem); diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 692dd729ee2a..a4ebd8715494 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -3027,6 +3027,10 @@ static int efx_init_struct(struct efx_nic *efx, mutex_init(&efx->mac_lock); #ifdef CONFIG_RFS_ACCEL mutex_init(&efx->rps_mutex); + spin_lock_init(&efx->rps_hash_lock); + /* Failure to allocate is not fatal, but may degrade ARFS performance */ + efx->rps_hash_table = kcalloc(EFX_ARFS_HASH_TABLE_SIZE, + sizeof(*efx->rps_hash_table), GFP_KERNEL); #endif efx->phy_op = &efx_dummy_phy_operations; efx->mdio.dev = net_dev; @@ -3070,6 +3074,10 @@ static void efx_fini_struct(struct efx_nic *efx) { int i; +#ifdef CONFIG_RFS_ACCEL + kfree(efx->rps_hash_table); +#endif + for (i = 0; i < EFX_MAX_CHANNELS; i++) kfree(efx->channel[i]); @@ -3092,6 +3100,141 @@ void efx_update_sw_stats(struct efx_nic *efx, u64 *stats) stats[GENERIC_STAT_rx_noskb_drops] = atomic_read(&efx->n_rx_noskb_drops); } +bool efx_filter_spec_equal(const struct efx_filter_spec *left, + const struct efx_filter_spec *right) +{ + if ((left->match_flags ^ right->match_flags) | + ((left->flags ^ right->flags) & + (EFX_FILTER_FLAG_RX | EFX_FILTER_FLAG_TX))) + return false; + + return memcmp(&left->outer_vid, &right->outer_vid, + sizeof(struct efx_filter_spec) - + offsetof(struct efx_filter_spec, outer_vid)) == 0; +} + +u32 efx_filter_spec_hash(const struct efx_filter_spec *spec) +{ + BUILD_BUG_ON(offsetof(struct efx_filter_spec, outer_vid) & 3); + return jhash2((const u32 *)&spec->outer_vid, + (sizeof(struct efx_filter_spec) - + offsetof(struct efx_filter_spec, outer_vid)) / 4, + 0); +} + +#ifdef CONFIG_RFS_ACCEL +bool efx_rps_check_rule(struct efx_arfs_rule *rule, unsigned int filter_idx, + bool *force) +{ + if (rule->filter_id == EFX_ARFS_FILTER_ID_PENDING) { + /* ARFS is currently updating this entry, leave it */ + return false; + } + if (rule->filter_id == EFX_ARFS_FILTER_ID_ERROR) { + /* ARFS tried and failed to update this, so it's probably out + * of date. Remove the filter and the ARFS rule entry. + */ + rule->filter_id = EFX_ARFS_FILTER_ID_REMOVING; + *force = true; + return true; + } else if (WARN_ON(rule->filter_id != filter_idx)) { /* can't happen */ + /* ARFS has moved on, so old filter is not needed. Since we did + * not mark the rule with EFX_ARFS_FILTER_ID_REMOVING, it will + * not be removed by efx_rps_hash_del() subsequently. + */ + *force = true; + return true; + } + /* Remove it iff ARFS wants to. */ + return true; +} + +struct hlist_head *efx_rps_hash_bucket(struct efx_nic *efx, + const struct efx_filter_spec *spec) +{ + u32 hash = efx_filter_spec_hash(spec); + + WARN_ON(!spin_is_locked(&efx->rps_hash_lock)); + if (!efx->rps_hash_table) + return NULL; + return &efx->rps_hash_table[hash % EFX_ARFS_HASH_TABLE_SIZE]; +} + +struct efx_arfs_rule *efx_rps_hash_find(struct efx_nic *efx, + const struct efx_filter_spec *spec) +{ + struct efx_arfs_rule *rule; + struct hlist_head *head; + struct hlist_node *node; + + head = efx_rps_hash_bucket(efx, spec); + if (!head) + return NULL; + hlist_for_each(node, head) { + rule = container_of(node, struct efx_arfs_rule, node); + if (efx_filter_spec_equal(spec, &rule->spec)) + return rule; + } + return NULL; +} + +struct efx_arfs_rule *efx_rps_hash_add(struct efx_nic *efx, + const struct efx_filter_spec *spec, + bool *new) +{ + struct efx_arfs_rule *rule; + struct hlist_head *head; + struct hlist_node *node; + + head = efx_rps_hash_bucket(efx, spec); + if (!head) + return NULL; + hlist_for_each(node, head) { + rule = container_of(node, struct efx_arfs_rule, node); + if (efx_filter_spec_equal(spec, &rule->spec)) { + *new = false; + return rule; + } + } + rule = kmalloc(sizeof(*rule), GFP_ATOMIC); + *new = true; + if (rule) { + memcpy(&rule->spec, spec, sizeof(rule->spec)); + hlist_add_head(&rule->node, head); + } + return rule; +} + +void efx_rps_hash_del(struct efx_nic *efx, const struct efx_filter_spec *spec) +{ + struct efx_arfs_rule *rule; + struct hlist_head *head; + struct hlist_node *node; + + head = efx_rps_hash_bucket(efx, spec); + if (WARN_ON(!head)) + return; + hlist_for_each(node, head) { + rule = container_of(node, struct efx_arfs_rule, node); + if (efx_filter_spec_equal(spec, &rule->spec)) { + /* Someone already reused the entry. We know that if + * this check doesn't fire (i.e. filter_id == REMOVING) + * then the REMOVING mark was put there by our caller, + * because caller is holding a lock on filter table and + * only holders of that lock set REMOVING. + */ + if (rule->filter_id != EFX_ARFS_FILTER_ID_REMOVING) + return; + hlist_del(node); + kfree(rule); + return; + } + } + /* We didn't find it. */ + WARN_ON(1); +} +#endif + /* RSS contexts. We're using linked lists and crappy O(n) algorithms, because * (a) this is an infrequent control-plane operation and (b) n is small (max 64) */ diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index a3140e16fcef..3f759ebdcf10 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -186,6 +186,27 @@ static inline void efx_filter_rfs_expire(struct work_struct *data) {} #endif bool efx_filter_is_mc_recipient(const struct efx_filter_spec *spec); +bool efx_filter_spec_equal(const struct efx_filter_spec *left, + const struct efx_filter_spec *right); +u32 efx_filter_spec_hash(const struct efx_filter_spec *spec); + +#ifdef CONFIG_RFS_ACCEL +bool efx_rps_check_rule(struct efx_arfs_rule *rule, unsigned int filter_idx, + bool *force); + +struct efx_arfs_rule *efx_rps_hash_find(struct efx_nic *efx, + const struct efx_filter_spec *spec); + +/* @new is written to indicate if entry was newly added (true) or if an old + * entry was found and returned (false). + */ +struct efx_arfs_rule *efx_rps_hash_add(struct efx_nic *efx, + const struct efx_filter_spec *spec, + bool *new); + +void efx_rps_hash_del(struct efx_nic *efx, const struct efx_filter_spec *spec); +#endif + /* RSS contexts */ struct efx_rss_context *efx_alloc_rss_context_entry(struct efx_nic *efx); struct efx_rss_context *efx_find_rss_context_entry(struct efx_nic *efx, u32 id); diff --git a/drivers/net/ethernet/sfc/farch.c b/drivers/net/ethernet/sfc/farch.c index 7174ef5e5c5e..c72adf8b52ea 100644 --- a/drivers/net/ethernet/sfc/farch.c +++ b/drivers/net/ethernet/sfc/farch.c @@ -2905,18 +2905,45 @@ bool efx_farch_filter_rfs_expire_one(struct efx_nic *efx, u32 flow_id, { struct efx_farch_filter_state *state = efx->filter_state; struct efx_farch_filter_table *table; - bool ret = false; + bool ret = false, force = false; + u16 arfs_id; down_write(&state->lock); + spin_lock_bh(&efx->rps_hash_lock); table = &state->table[EFX_FARCH_FILTER_TABLE_RX_IP]; if (test_bit(index, table->used_bitmap) && - table->spec[index].priority == EFX_FILTER_PRI_HINT && - rps_may_expire_flow(efx->net_dev, table->spec[index].dmaq_id, - flow_id, 0)) { - efx_farch_filter_table_clear_entry(efx, table, index); - ret = true; + table->spec[index].priority == EFX_FILTER_PRI_HINT) { + struct efx_arfs_rule *rule = NULL; + struct efx_filter_spec spec; + + efx_farch_filter_to_gen_spec(&spec, &table->spec[index]); + if (!efx->rps_hash_table) { + /* In the absence of the table, we always returned 0 to + * ARFS, so use the same to query it. + */ + arfs_id = 0; + } else { + rule = efx_rps_hash_find(efx, &spec); + if (!rule) { + /* ARFS table doesn't know of this filter, remove it */ + force = true; + } else { + arfs_id = rule->arfs_id; + if (!efx_rps_check_rule(rule, index, &force)) + goto out_unlock; + } + } + if (force || rps_may_expire_flow(efx->net_dev, spec.dmaq_id, + flow_id, arfs_id)) { + if (rule) + rule->filter_id = EFX_ARFS_FILTER_ID_REMOVING; + efx_rps_hash_del(efx, &spec); + efx_farch_filter_table_clear_entry(efx, table, index); + ret = true; + } } - +out_unlock: + spin_unlock_bh(&efx->rps_hash_lock); up_write(&state->lock); return ret; } diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index eea3808b3f25..65568925c3ef 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -734,6 +734,35 @@ struct efx_rss_context { }; #ifdef CONFIG_RFS_ACCEL +/* Order of these is important, since filter_id >= %EFX_ARFS_FILTER_ID_PENDING + * is used to test if filter does or will exist. + */ +#define EFX_ARFS_FILTER_ID_PENDING -1 +#define EFX_ARFS_FILTER_ID_ERROR -2 +#define EFX_ARFS_FILTER_ID_REMOVING -3 +/** + * struct efx_arfs_rule - record of an ARFS filter and its IDs + * @node: linkage into hash table + * @spec: details of the filter (used as key for hash table). Use efx->type to + * determine which member to use. + * @rxq_index: channel to which the filter will steer traffic. + * @arfs_id: filter ID which was returned to ARFS + * @filter_id: index in software filter table. May be + * %EFX_ARFS_FILTER_ID_PENDING if filter was not inserted yet, + * %EFX_ARFS_FILTER_ID_ERROR if filter insertion failed, or + * %EFX_ARFS_FILTER_ID_REMOVING if expiry is currently removing the filter. + */ +struct efx_arfs_rule { + struct hlist_node node; + struct efx_filter_spec spec; + u16 rxq_index; + u16 arfs_id; + s32 filter_id; +}; + +/* Size chosen so that the table is one page (4kB) */ +#define EFX_ARFS_HASH_TABLE_SIZE 512 + /** * struct efx_async_filter_insertion - Request to asynchronously insert a filter * @net_dev: Reference to the netdevice @@ -873,6 +902,10 @@ struct efx_async_filter_insertion { * @rps_expire_channel's @rps_flow_id * @rps_slot_map: bitmap of in-flight entries in @rps_slot * @rps_slot: array of ARFS insertion requests for efx_filter_rfs_work() + * @rps_hash_lock: Protects ARFS filter mapping state (@rps_hash_table and + * @rps_next_id). + * @rps_hash_table: Mapping between ARFS filters and their various IDs + * @rps_next_id: next arfs_id for an ARFS filter * @active_queues: Count of RX and TX queues that haven't been flushed and drained. * @rxq_flush_pending: Count of number of receive queues that need to be flushed. * Decremented when the efx_flush_rx_queue() is called. @@ -1029,6 +1062,9 @@ struct efx_nic { unsigned int rps_expire_index; unsigned long rps_slot_map; struct efx_async_filter_insertion rps_slot[EFX_RPS_MAX_IN_FLIGHT]; + spinlock_t rps_hash_lock; + struct hlist_head *rps_hash_table; + u32 rps_next_id; #endif atomic_t active_queues; diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 9c593c661cbf..64a94f242027 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -834,9 +834,29 @@ static void efx_filter_rfs_work(struct work_struct *data) struct efx_nic *efx = netdev_priv(req->net_dev); struct efx_channel *channel = efx_get_channel(efx, req->rxq_index); int slot_idx = req - efx->rps_slot; + struct efx_arfs_rule *rule; + u16 arfs_id = 0; int rc; rc = efx->type->filter_insert(efx, &req->spec, true); + if (efx->rps_hash_table) { + spin_lock_bh(&efx->rps_hash_lock); + rule = efx_rps_hash_find(efx, &req->spec); + /* The rule might have already gone, if someone else's request + * for the same spec was already worked and then expired before + * we got around to our work. In that case we have nothing + * tying us to an arfs_id, meaning that as soon as the filter + * is considered for expiry it will be removed. + */ + if (rule) { + if (rc < 0) + rule->filter_id = EFX_ARFS_FILTER_ID_ERROR; + else + rule->filter_id = rc; + arfs_id = rule->arfs_id; + } + spin_unlock_bh(&efx->rps_hash_lock); + } if (rc >= 0) { /* Remember this so we can check whether to expire the filter * later. @@ -848,18 +868,18 @@ static void efx_filter_rfs_work(struct work_struct *data) if (req->spec.ether_type == htons(ETH_P_IP)) netif_info(efx, rx_status, efx->net_dev, - "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d]\n", + "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d id %u]\n", (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP", req->spec.rem_host, ntohs(req->spec.rem_port), req->spec.loc_host, ntohs(req->spec.loc_port), - req->rxq_index, req->flow_id, rc); + req->rxq_index, req->flow_id, rc, arfs_id); else netif_info(efx, rx_status, efx->net_dev, - "steering %s [%pI6]:%u:[%pI6]:%u to queue %u [flow %u filter %d]\n", + "steering %s [%pI6]:%u:[%pI6]:%u to queue %u [flow %u filter %d id %u]\n", (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP", req->spec.rem_host, ntohs(req->spec.rem_port), req->spec.loc_host, ntohs(req->spec.loc_port), - req->rxq_index, req->flow_id, rc); + req->rxq_index, req->flow_id, rc, arfs_id); } /* Release references */ @@ -872,8 +892,10 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, { struct efx_nic *efx = netdev_priv(net_dev); struct efx_async_filter_insertion *req; + struct efx_arfs_rule *rule; struct flow_keys fk; int slot_idx; + bool new; int rc; /* find a free slot */ @@ -926,12 +948,42 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, req->spec.rem_port = fk.ports.src; req->spec.loc_port = fk.ports.dst; + if (efx->rps_hash_table) { + /* Add it to ARFS hash table */ + spin_lock(&efx->rps_hash_lock); + rule = efx_rps_hash_add(efx, &req->spec, &new); + if (!rule) { + rc = -ENOMEM; + goto out_unlock; + } + if (new) + rule->arfs_id = efx->rps_next_id++ % RPS_NO_FILTER; + rc = rule->arfs_id; + /* Skip if existing or pending filter already does the right thing */ + if (!new && rule->rxq_index == rxq_index && + rule->filter_id >= EFX_ARFS_FILTER_ID_PENDING) + goto out_unlock; + rule->rxq_index = rxq_index; + rule->filter_id = EFX_ARFS_FILTER_ID_PENDING; + spin_unlock(&efx->rps_hash_lock); + } else { + /* Without an ARFS hash table, we just use arfs_id 0 for all + * filters. This means if multiple flows hash to the same + * flow_id, all but the most recently touched will be eligible + * for expiry. + */ + rc = 0; + } + + /* Queue the request */ dev_hold(req->net_dev = net_dev); INIT_WORK(&req->work, efx_filter_rfs_work); req->rxq_index = rxq_index; req->flow_id = flow_id; schedule_work(&req->work); - return 0; + return rc; +out_unlock: + spin_unlock(&efx->rps_hash_lock); out_clear: clear_bit(slot_idx, &efx->rps_slot_map); return rc; -- cgit v1.2.1 From d6fef10c750e64f248543d2eee7c86a4a019f7ec Mon Sep 17 00:00:00 2001 From: Md Fahad Iqbal Polash Date: Mon, 16 Apr 2018 10:07:03 -0700 Subject: ice: Fix insufficient memory issue in ice_aq_manage_mac_read For the MAC read operation, the device can return up to two (LAN and WoL) MAC addresses. Without access to adequate memory, the device will return an error. Fixed this by allocating the right amount of memory. Also, logic to detect and copy the LAN MAC address into the port_info structure has been added. Note that the WoL MAC address is ignored currently as the WoL feature isn't supported yet. Fixes: dc49c7723676 ("ice: Get MAC/PHY/link info and scheduler topology") Signed-off-by: Md Fahad Iqbal Polash Signed-off-by: Anirudh Venkataramanan Tested-by: Tony Brelinski Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_common.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 21977ec984c4..71d032cc5fa7 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -78,6 +78,7 @@ ice_aq_manage_mac_read(struct ice_hw *hw, void *buf, u16 buf_size, struct ice_aq_desc desc; enum ice_status status; u16 flags; + u8 i; cmd = &desc.params.mac_read; @@ -98,8 +99,16 @@ ice_aq_manage_mac_read(struct ice_hw *hw, void *buf, u16 buf_size, return ICE_ERR_CFG; } - ether_addr_copy(hw->port_info->mac.lan_addr, resp->mac_addr); - ether_addr_copy(hw->port_info->mac.perm_addr, resp->mac_addr); + /* A single port can report up to two (LAN and WoL) addresses */ + for (i = 0; i < cmd->num_addr; i++) + if (resp[i].addr_type == ICE_AQC_MAN_MAC_ADDR_TYPE_LAN) { + ether_addr_copy(hw->port_info->mac.lan_addr, + resp[i].mac_addr); + ether_addr_copy(hw->port_info->mac.perm_addr, + resp[i].mac_addr); + break; + } + return 0; } @@ -464,9 +473,12 @@ enum ice_status ice_init_hw(struct ice_hw *hw) if (status) goto err_unroll_sched; - /* Get port MAC information */ - mac_buf_len = sizeof(struct ice_aqc_manage_mac_read_resp); - mac_buf = devm_kzalloc(ice_hw_to_dev(hw), mac_buf_len, GFP_KERNEL); + /* Get MAC information */ + /* A single port can report up to two (LAN and WoL) addresses */ + mac_buf = devm_kcalloc(ice_hw_to_dev(hw), 2, + sizeof(struct ice_aqc_manage_mac_read_resp), + GFP_KERNEL); + mac_buf_len = 2 * sizeof(struct ice_aqc_manage_mac_read_resp); if (!mac_buf) { status = ICE_ERR_NO_MEMORY; -- cgit v1.2.1