2006-01-03 02:04:38 +08:00
|
|
|
/*
|
|
|
|
* net/tipc/bcast.c: TIPC broadcast code
|
2007-02-09 22:25:21 +08:00
|
|
|
*
|
2015-02-05 21:36:43 +08:00
|
|
|
* Copyright (c) 2004-2006, 2014-2015, Ericsson AB
|
2006-01-03 02:04:38 +08:00
|
|
|
* Copyright (c) 2004, Intel Corporation.
|
2011-01-08 02:00:11 +08:00
|
|
|
* Copyright (c) 2005, 2010-2011, Wind River Systems
|
2006-01-03 02:04:38 +08:00
|
|
|
* All rights reserved.
|
|
|
|
*
|
2006-01-11 20:30:43 +08:00
|
|
|
* Redistribution and use in source and binary forms, with or without
|
2006-01-03 02:04:38 +08:00
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
*
|
2006-01-11 20:30:43 +08:00
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. Neither the names of the copyright holders nor the names of its
|
|
|
|
* contributors may be used to endorse or promote products derived from
|
|
|
|
* this software without specific prior written permission.
|
2006-01-03 02:04:38 +08:00
|
|
|
*
|
2006-01-11 20:30:43 +08:00
|
|
|
* Alternatively, this software may be distributed under the terms of the
|
|
|
|
* GNU General Public License ("GPL") version 2 as published by the Free
|
|
|
|
* Software Foundation.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
2006-01-03 02:04:38 +08:00
|
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
2015-10-22 20:51:33 +08:00
|
|
|
#include <linux/tipc_config.h>
|
2014-07-17 08:41:00 +08:00
|
|
|
#include "socket.h"
|
|
|
|
#include "msg.h"
|
2006-01-03 02:04:38 +08:00
|
|
|
#include "bcast.h"
|
2011-04-08 02:57:53 +08:00
|
|
|
#include "name_distr.h"
|
2015-10-22 20:51:33 +08:00
|
|
|
#include "link.h"
|
|
|
|
#include "node.h"
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2014-03-27 12:54:35 +08:00
|
|
|
#define MAX_PKT_DEFAULT_MCAST 1500 /* bcast link max packet size (fixed) */

/*
 * Broadcast link window sizes, counted in packets.
 *
 * The default used to be 20 packets. With that window the backlog limit
 * for LOW importance messages came out as 30 packets, while one maximum
 * sized message (66000 bytes) on a 1500 byte MTU network is 46 packets.
 * A sender of such a message could be put on the wakeup queue and never
 * be woken, because its pending chain size (46) always exceeded the
 * wakeup limit (30). Raising the queue limits lets the wakeup criteria
 * be satisfied; 32 is the smallest window that always avoids the stall.
 *
 * NOTE(review): the change that introduced BCLINK_WIN_MIN states that a
 * user-requested window below the minimum is raised to the minimum; the
 * code enforcing that is not part of this section.
 */
#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */
#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2010-05-11 22:30:07 +08:00
|
|
|
const char tipc_bclink_name[] = "broadcast-link";
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2015-10-22 20:51:33 +08:00
|
|
|
/**
 * struct tipc_bcbearer_pair - two bearers the broadcast link may use together
 * @primary: pointer to the primary bearer of the pair
 * @secondary: pointer to the secondary bearer of the pair
 *
 * Two bearers can only be paired when they have the same priority and
 * reach the same set of destinations.
 */
struct tipc_bcbearer_pair {
	struct tipc_bearer *primary;
	struct tipc_bearer *secondary;
};
|
|
|
|
|
|
|
|
#define BCBEARER MAX_BEARERS
|
|
|
|
|
|
|
|
/**
|
|
|
|
* struct tipc_bcbearer - bearer used by broadcast link
|
|
|
|
* @bearer: (non-standard) broadcast bearer structure
|
|
|
|
* @media: (non-standard) broadcast media structure
|
|
|
|
* @bpairs: array of bearer pairs
|
|
|
|
* @bpairs_temp: temporary array of bearer pairs used by tipc_bcbearer_sort()
|
|
|
|
* @remains: temporary node map used by tipc_bcbearer_send()
|
|
|
|
* @remains_new: temporary node map used tipc_bcbearer_send()
|
|
|
|
*
|
|
|
|
* Note: The fields labelled "temporary" are incorporated into the bearer
|
|
|
|
* to avoid consuming potentially limited stack space through the use of
|
|
|
|
* large local variables within multicast routines. Concurrent access is
|
|
|
|
* prevented through use of the spinlock "bcast_lock".
|
|
|
|
*/
|
|
|
|
struct tipc_bcbearer {
|
|
|
|
struct tipc_bearer bearer;
|
|
|
|
struct tipc_media media;
|
|
|
|
struct tipc_bcbearer_pair bpairs[MAX_BEARERS];
|
|
|
|
struct tipc_bcbearer_pair bpairs_temp[TIPC_MAX_LINK_PRI + 1];
|
|
|
|
struct tipc_node_map remains;
|
|
|
|
struct tipc_node_map remains_new;
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* struct tipc_bc_base - link used for broadcast messages
|
|
|
|
* @link: (non-standard) broadcast link structure
|
|
|
|
* @node: (non-standard) node structure representing b'cast link's peer node
|
|
|
|
* @bcast_nodes: map of broadcast-capable nodes
|
|
|
|
* @retransmit_to: node that most recently requested a retransmit
|
|
|
|
*
|
|
|
|
* Handles sequence numbering, fragmentation, bundling, etc.
|
|
|
|
*/
|
|
|
|
struct tipc_bc_base {
|
|
|
|
struct tipc_link link;
|
|
|
|
struct tipc_node node;
|
|
|
|
struct sk_buff_head arrvq;
|
|
|
|
struct sk_buff_head inputq;
|
|
|
|
struct tipc_node_map bcast_nodes;
|
|
|
|
struct tipc_node *retransmit_to;
|
|
|
|
};
|
|
|
|
|
2015-10-22 20:51:35 +08:00
|
|
|
static struct tipc_bc_base *tipc_bc_base(struct net *net)
|
|
|
|
{
|
|
|
|
return tipc_net(net)->bcbase;
|
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:33 +08:00
|
|
|
/**
|
|
|
|
* tipc_nmap_equal - test for equality of node maps
|
|
|
|
*/
|
|
|
|
static int tipc_nmap_equal(struct tipc_node_map *nm_a,
|
|
|
|
struct tipc_node_map *nm_b)
|
|
|
|
{
|
|
|
|
return !memcmp(nm_a, nm_b, sizeof(*nm_a));
|
|
|
|
}
|
|
|
|
|
2010-10-13 21:20:35 +08:00
|
|
|
static void tipc_nmap_diff(struct tipc_node_map *nm_a,
|
|
|
|
struct tipc_node_map *nm_b,
|
|
|
|
struct tipc_node_map *nm_diff);
|
2014-04-21 10:55:51 +08:00
|
|
|
static void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node);
|
|
|
|
static void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node);
|
2015-01-09 15:27:07 +08:00
|
|
|
/* tipc_bclink_lock - take the broadcast lock of @net via tipc_bcast_lock() */
static void tipc_bclink_lock(struct net *net)
{
	tipc_bcast_lock(net);
}
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
/* tipc_bclink_unlock - release the broadcast lock of @net via tipc_bcast_unlock() */
static void tipc_bclink_unlock(struct net *net)
{
	tipc_bcast_unlock(net);
}
|
|
|
|
|
2015-02-05 21:36:44 +08:00
|
|
|
void tipc_bclink_input(struct net *net)
|
|
|
|
{
|
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
|
|
|
|
2015-10-22 20:51:33 +08:00
|
|
|
tipc_sk_mcast_rcv(net, &tn->bcbase->arrvq, &tn->bcbase->inputq);
|
2015-02-05 21:36:44 +08:00
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:33 +08:00
|
|
|
uint tipc_bcast_get_mtu(void)
|
2014-07-17 08:41:00 +08:00
|
|
|
{
|
|
|
|
return MAX_PKT_DEFAULT_MCAST;
|
|
|
|
}
|
|
|
|
|
2006-03-21 14:37:04 +08:00
|
|
|
static u32 bcbuf_acks(struct sk_buff *buf)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2006-01-13 05:22:32 +08:00
|
|
|
return (u32)(unsigned long)TIPC_SKB_CB(buf)->handle;
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2006-03-21 14:37:04 +08:00
|
|
|
/*
 * bcbuf_set_acks - store @acks as the ack counter of a broadcast buffer
 *
 * The integer is widened to unsigned long and kept in the pointer-typed
 * handle field of the buffer's TIPC control block.
 */
static void bcbuf_set_acks(struct sk_buff *buf, u32 acks)
{
	unsigned long raw = acks;

	TIPC_SKB_CB(buf)->handle = (void *)raw;
}
|
|
|
|
|
2006-03-21 14:37:04 +08:00
|
|
|
static void bcbuf_decr_acks(struct sk_buff *buf)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
|
|
|
bcbuf_set_acks(buf, bcbuf_acks(buf) - 1);
|
|
|
|
}
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
/*
 * tipc_bclink_add_node - add @addr to the map of broadcast-capable nodes
 *
 * The map is updated while holding the broadcast link lock.
 */
void tipc_bclink_add_node(struct net *net, u32 addr)
{
	struct tipc_net *tn = net_generic(net, tipc_net_id);
	struct tipc_node_map *nodes;

	tipc_bclink_lock(net);
	nodes = &tn->bcbase->bcast_nodes;
	tipc_nmap_add(nodes, addr);
	tipc_bclink_unlock(net);
}
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
/*
 * tipc_bclink_remove_node - remove @addr from the map of broadcast-capable
 * nodes
 *
 * Runs under the broadcast link lock. If the map's count is zero after
 * the removal, the backlog queue of the broadcast link is purged.
 */
void tipc_bclink_remove_node(struct net *net, u32 addr)
{
	struct tipc_net *tn = net_generic(net, tipc_net_id);
	struct tipc_bc_base *bb;

	tipc_bclink_lock(net);
	bb = tn->bcbase;
	tipc_nmap_remove(&bb->bcast_nodes, addr);

	/* Last node? => reset backlog queue */
	if (bb->bcast_nodes.count == 0)
		tipc_link_purge_backlog(&bb->link);

	tipc_bclink_unlock(net);
}
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
static void bclink_set_last_sent(struct net *net)
|
2010-08-17 19:00:09 +08:00
|
|
|
{
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
|
|
|
struct tipc_link *bcl = tn->bcl;
|
|
|
|
|
2015-05-14 22:46:18 +08:00
|
|
|
bcl->silent_intv_cnt = mod(bcl->snd_nxt - 1);
|
2010-08-17 19:00:09 +08:00
|
|
|
}
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
u32 tipc_bclink_get_last_sent(struct net *net)
|
2010-08-17 19:00:09 +08:00
|
|
|
{
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
|
|
|
|
2015-05-14 22:46:15 +08:00
|
|
|
return tn->bcl->silent_intv_cnt;
|
2010-08-17 19:00:09 +08:00
|
|
|
}
|
|
|
|
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
/*
 * bclink_update_last_sent - move @node's bclink.last_sent forward to @seqno
 *
 * The stored value is replaced only when less_eq() reports that it does
 * not exceed @seqno; otherwise it is left as it is.
 */
static void bclink_update_last_sent(struct tipc_node *node, u32 seqno)
{
	if (less_eq(node->bclink.last_sent, seqno))
		node->bclink.last_sent = seqno;
}
|
|
|
|
|
2012-07-10 18:55:09 +08:00
|
|
|
/**
|
2011-01-19 02:53:16 +08:00
|
|
|
* tipc_bclink_retransmit_to - get most recent node to request retransmission
|
|
|
|
*
|
2014-05-05 08:56:15 +08:00
|
|
|
* Called with bclink_lock locked
|
2011-01-19 02:53:16 +08:00
|
|
|
*/
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_node *tipc_bclink_retransmit_to(struct net *net)
|
2011-01-19 02:53:16 +08:00
|
|
|
{
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
|
|
|
|
2015-10-22 20:51:33 +08:00
|
|
|
return tn->bcbase->retransmit_to;
|
2011-01-19 02:53:16 +08:00
|
|
|
}
|
|
|
|
|
2007-02-09 22:25:21 +08:00
|
|
|
/**
|
2006-01-03 02:04:38 +08:00
|
|
|
* bclink_retransmit_pkt - retransmit broadcast packets
|
|
|
|
* @after: sequence number of last packet to *not* retransmit
|
|
|
|
* @to: sequence number of last packet to retransmit
|
2007-02-09 22:25:21 +08:00
|
|
|
*
|
2014-05-05 08:56:15 +08:00
|
|
|
* Called with bclink_lock locked
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
2015-01-09 15:27:07 +08:00
|
|
|
static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2014-11-26 11:41:52 +08:00
|
|
|
struct sk_buff *skb;
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_link *bcl = tn->bcl;
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2015-03-14 04:08:10 +08:00
|
|
|
skb_queue_walk(&bcl->transmq, skb) {
|
2015-01-09 15:26:58 +08:00
|
|
|
if (more(buf_seqno(skb), after)) {
|
|
|
|
tipc_link_retransmit(bcl, skb, mod(to - after));
|
2014-11-26 11:41:52 +08:00
|
|
|
break;
|
2015-01-09 15:26:58 +08:00
|
|
|
}
|
2014-11-26 11:41:52 +08:00
|
|
|
}
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2015-09-07 17:05:48 +08:00
|
|
|
/**
|
|
|
|
* bclink_prepare_wakeup - prepare users for wakeup after congestion
|
|
|
|
* @bcl: broadcast link
|
|
|
|
* @resultq: queue for users which can be woken up
|
|
|
|
* Move a number of waiting users, as permitted by available space in
|
|
|
|
* the send queue, from link wait queue to specified queue for wakeup
|
|
|
|
*/
|
|
|
|
static void bclink_prepare_wakeup(struct tipc_link *bcl, struct sk_buff_head *resultq)
|
|
|
|
{
|
|
|
|
int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
|
|
|
|
int imp, lim;
|
|
|
|
struct sk_buff *skb, *tmp;
|
|
|
|
|
|
|
|
skb_queue_walk_safe(&bcl->wakeupq, skb, tmp) {
|
|
|
|
imp = TIPC_SKB_CB(skb)->chain_imp;
|
|
|
|
lim = bcl->window + bcl->backlog[imp].limit;
|
|
|
|
pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
|
|
|
|
if ((pnd[imp] + bcl->backlog[imp].len) >= lim)
|
|
|
|
continue;
|
|
|
|
skb_unlink(skb, &bcl->wakeupq);
|
|
|
|
skb_queue_tail(resultq, skb);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
tipc: fix bug in multicast congestion handling
One aim of commit 50100a5e39461b2a61d6040e73c384766c29975d ("tipc:
use pseudo message to wake up sockets after link congestion") was
to handle link congestion abatement in a uniform way for both unicast
and multicast transmit. However, the latter doesn't work correctly,
and has been broken since the referenced commit was applied.
If a user now sends a burst of multicast messages that is big
enough to cause broadcast link congestion, it will be put to sleep,
and not be waked up when the congestion abates as it should be.
This has two reasons. First, the flag that is used, TIPC_WAKEUP_USERS,
is set correctly, but in the wrong field. Instead of setting it in the
'action_flags' field of the arrival node struct, it is by mistake set
in the dummy node struct that is owned by the broadcast link, where it
will never be tested for. Second, we cannot use the same flag for waking
up unicast and multicast users, since the function tipc_node_unlock()
needs to pick the wakeup pseudo messages to deliver from different
queues. It must hence be able to distinguish between the two cases.
This commit solves this problem by adding a new flag
TIPC_WAKEUP_BCAST_USERS, and a new function tipc_bclink_wakeup_user().
The latter is to be called by tipc_node_unlock() when the named flag,
now set in the correct field, is encountered.
v2: using explicit 'unsigned int' declaration instead of 'uint', as
per comment from David Miller.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-10-08 02:12:34 +08:00
|
|
|
/**
|
|
|
|
* tipc_bclink_wakeup_users - wake up pending users
|
|
|
|
*
|
|
|
|
* Called with no locks taken
|
|
|
|
*/
|
2015-01-09 15:27:05 +08:00
|
|
|
void tipc_bclink_wakeup_users(struct net *net)
|
tipc: fix bug in multicast congestion handling
One aim of commit 50100a5e39461b2a61d6040e73c384766c29975d ("tipc:
use pseudo message to wake up sockets after link congestion") was
to handle link congestion abatement in a uniform way for both unicast
and multicast transmit. However, the latter doesn't work correctly,
and has been broken since the referenced commit was applied.
If a user now sends a burst of multicast messages that is big
enough to cause broadcast link congestion, it will be put to sleep,
and not be waked up when the congestion abates as it should be.
This has two reasons. First, the flag that is used, TIPC_WAKEUP_USERS,
is set correctly, but in the wrong field. Instead of setting it in the
'action_flags' field of the arrival node struct, it is by mistake set
in the dummy node struct that is owned by the broadcast link, where it
will never tested for. Second, we cannot use the same flag for waking
up unicast and multicast users, since the function tipc_node_unlock()
needs to pick the wakeup pseudo messages to deliver from different
queues. It must hence be able to distinguish between the two cases.
This commit solves this problem by adding a new flag
TIPC_WAKEUP_BCAST_USERS, and a new function tipc_bclink_wakeup_user().
The latter is to be called by tipc_node_unlock() when the named flag,
now set in the correct field, is encountered.
v2: using explicit 'unsigned int' declaration instead of 'uint', as
per comment from David Miller.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-10-08 02:12:34 +08:00
|
|
|
{
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
2015-09-07 17:05:48 +08:00
|
|
|
struct tipc_link *bcl = tn->bcl;
|
|
|
|
struct sk_buff_head resultq;
|
2014-12-03 23:58:40 +08:00
|
|
|
|
2015-09-07 17:05:48 +08:00
|
|
|
skb_queue_head_init(&resultq);
|
|
|
|
bclink_prepare_wakeup(bcl, &resultq);
|
|
|
|
tipc_sk_rcv(net, &resultq);
|
tipc: fix bug in multicast congestion handling
One aim of commit 50100a5e39461b2a61d6040e73c384766c29975d ("tipc:
use pseudo message to wake up sockets after link congestion") was
to handle link congestion abatement in a uniform way for both unicast
and multicast transmit. However, the latter doesn't work correctly,
and has been broken since the referenced commit was applied.
If a user now sends a burst of multicast messages that is big
enough to cause broadcast link congestion, it will be put to sleep,
and not be waked up when the congestion abates as it should be.
This has two reasons. First, the flag that is used, TIPC_WAKEUP_USERS,
is set correctly, but in the wrong field. Instead of setting it in the
'action_flags' field of the arrival node struct, it is by mistake set
in the dummy node struct that is owned by the broadcast link, where it
will never tested for. Second, we cannot use the same flag for waking
up unicast and multicast users, since the function tipc_node_unlock()
needs to pick the wakeup pseudo messages to deliver from different
queues. It must hence be able to distinguish between the two cases.
This commit solves this problem by adding a new flag
TIPC_WAKEUP_BCAST_USERS, and a new function tipc_bclink_wakeup_user().
The latter is to be called by tipc_node_unlock() when the named flag,
now set in the correct field, is encountered.
v2: using explicit 'unsigned int' declaration instead of 'uint', as
per comment from David Miller.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-10-08 02:12:34 +08:00
|
|
|
}
|
|
|
|
|
2007-02-09 22:25:21 +08:00
|
|
|
/**
|
2006-01-18 07:38:21 +08:00
|
|
|
* tipc_bclink_acknowledge - handle acknowledgement of broadcast packets
|
2006-01-03 02:04:38 +08:00
|
|
|
* @n_ptr: node that sent acknowledgement info
|
|
|
|
* @acked: broadcast sequence # that has been acknowledged
|
2007-02-09 22:25:21 +08:00
|
|
|
*
|
2014-05-05 08:56:15 +08:00
|
|
|
* Node is locked, bclink_lock unlocked.
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
2008-09-03 14:38:32 +08:00
|
|
|
void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2014-11-26 11:41:52 +08:00
|
|
|
struct sk_buff *skb, *tmp;
|
2006-01-03 02:04:38 +08:00
|
|
|
unsigned int released = 0;
|
2015-01-09 15:27:07 +08:00
|
|
|
struct net *net = n_ptr->net;
|
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2015-03-14 04:08:09 +08:00
|
|
|
if (unlikely(!n_ptr->bclink.recv_permitted))
|
|
|
|
return;
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2015-03-14 04:08:09 +08:00
|
|
|
|
2011-10-25 03:26:24 +08:00
|
|
|
/* Bail out if tx queue is empty (no clean up is required) */
|
2015-03-14 04:08:10 +08:00
|
|
|
skb = skb_peek(&tn->bcl->transmq);
|
2014-11-26 11:41:52 +08:00
|
|
|
if (!skb)
|
2011-10-25 03:26:24 +08:00
|
|
|
goto exit;
|
|
|
|
|
|
|
|
/* Determine which messages need to be acknowledged */
|
|
|
|
if (acked == INVALID_LINK_SEQ) {
|
|
|
|
/*
|
|
|
|
* Contact with specified node has been lost, so need to
|
|
|
|
* acknowledge sent messages only (if other nodes still exist)
|
|
|
|
* or both sent and unsent messages (otherwise)
|
|
|
|
*/
|
2015-10-22 20:51:33 +08:00
|
|
|
if (tn->bcbase->bcast_nodes.count)
|
2015-05-14 22:46:15 +08:00
|
|
|
acked = tn->bcl->silent_intv_cnt;
|
2011-10-25 03:26:24 +08:00
|
|
|
else
|
2015-05-14 22:46:15 +08:00
|
|
|
acked = tn->bcl->snd_nxt;
|
2011-10-25 03:26:24 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Bail out if specified sequence number does not correspond
|
|
|
|
* to a message that has been sent and not yet acknowledged
|
|
|
|
*/
|
2014-11-26 11:41:52 +08:00
|
|
|
if (less(acked, buf_seqno(skb)) ||
|
2015-05-14 22:46:15 +08:00
|
|
|
less(tn->bcl->silent_intv_cnt, acked) ||
|
2011-10-25 03:26:24 +08:00
|
|
|
less_eq(acked, n_ptr->bclink.acked))
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Skip over packets that node has previously acknowledged */
|
2015-03-14 04:08:10 +08:00
|
|
|
skb_queue_walk(&tn->bcl->transmq, skb) {
|
2014-11-26 11:41:52 +08:00
|
|
|
if (more(buf_seqno(skb), n_ptr->bclink.acked))
|
|
|
|
break;
|
|
|
|
}
|
2006-01-03 02:04:38 +08:00
|
|
|
|
|
|
|
/* Update packets that node is now acknowledging */
|
2015-03-14 04:08:10 +08:00
|
|
|
skb_queue_walk_from_safe(&tn->bcl->transmq, skb, tmp) {
|
2014-11-26 11:41:52 +08:00
|
|
|
if (more(buf_seqno(skb), acked))
|
|
|
|
break;
|
2015-03-14 04:08:10 +08:00
|
|
|
bcbuf_decr_acks(skb);
|
|
|
|
bclink_set_last_sent(net);
|
2014-11-26 11:41:52 +08:00
|
|
|
if (bcbuf_acks(skb) == 0) {
|
2015-03-14 04:08:10 +08:00
|
|
|
__skb_unlink(skb, &tn->bcl->transmq);
|
2014-11-26 11:41:52 +08:00
|
|
|
kfree_skb(skb);
|
2006-01-03 02:04:38 +08:00
|
|
|
released = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
n_ptr->bclink.acked = acked;
|
|
|
|
|
|
|
|
/* Try resolving broadcast link congestion, if necessary */
|
2015-03-14 04:08:10 +08:00
|
|
|
if (unlikely(skb_peek(&tn->bcl->backlogq))) {
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_link_push_packets(tn->bcl);
|
|
|
|
bclink_set_last_sent(net);
|
2010-08-17 19:00:09 +08:00
|
|
|
}
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
if (unlikely(released && !skb_queue_empty(&tn->bcl->wakeupq)))
|
tipc: fix bug in multicast congestion handling
One aim of commit 50100a5e39461b2a61d6040e73c384766c29975d ("tipc:
use pseudo message to wake up sockets after link congestion") was
to handle link congestion abatement in a uniform way for both unicast
and multicast transmit. However, the latter doesn't work correctly,
and has been broken since the referenced commit was applied.
If a user now sends a burst of multicast messages that is big
enough to cause broadcast link congestion, it will be put to sleep,
and not be waked up when the congestion abates as it should be.
This has two reasons. First, the flag that is used, TIPC_WAKEUP_USERS,
is set correctly, but in the wrong field. Instead of setting it in the
'action_flags' field of the arrival node struct, it is by mistake set
in the dummy node struct that is owned by the broadcast link, where it
will never be tested for. Second, we cannot use the same flag for waking
up unicast and multicast users, since the function tipc_node_unlock()
needs to pick the wakeup pseudo messages to deliver from different
queues. It must hence be able to distinguish between the two cases.
This commit solves this problem by adding a new flag
TIPC_WAKEUP_BCAST_USERS, and a new function tipc_bclink_wakeup_user().
The latter is to be called by tipc_node_unlock() when the named flag,
now set in the correct field, is encountered.
v2: using explicit 'unsigned int' declaration instead of 'uint', as
per comment from David Miller.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-10-08 02:12:34 +08:00
|
|
|
n_ptr->action_flags |= TIPC_WAKEUP_BCAST_USERS;
|
2011-10-25 03:26:24 +08:00
|
|
|
exit:
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2012-07-10 18:55:09 +08:00
|
|
|
/**
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
* tipc_bclink_update_link_state - update broadcast link state
|
2007-02-09 22:25:21 +08:00
|
|
|
*
|
tipc: purge tipc_net_lock lock
Now tipc routing hierarchy comprises the structures 'node', 'link'and
'bearer'. The whole hierarchy is protected by a big read/write lock,
tipc_net_lock, to ensure that nothing is added or removed while code
is accessing any of these structures. Obviously the locking policy
makes node, link and bearer components closely bound together so that
their relationship becomes unnecessarily complex. In the worst case,
such locking policy not only has a negative influence on performance,
but also it's prone to lead to deadlock occasionally.
In order o decouple the complex relationship between bearer and node
as well as link, the locking policy is adjusted as follows:
- Bearer level
RTNL lock is used on update side, and RCU is used on read side.
Meanwhile, all bearer instances including broadcast bearer are
saved into bearer_list array.
- Node and link level
All node instances are saved into two tipc_node_list and node_htable
lists. The two lists are protected by node_list_lock on write side,
and they are guarded with RCU lock on read side. All members in node
structure including link instances are protected by node spin lock.
- The relationship between bearer and node
When link accesses bearer, it first needs to find the bearer with
its bearer identity from the bearer_list array. When bearer accesses
node, it can iterate the node_htable hash list with the node
address to find the corresponding node.
In the new locking policy, every component has its private locking
solution and the relationship between bearer and node is very simple,
that is, they can find each other with node address or bearer identity
from node_htable hash list or bearer_list array.
Until now above all changes have been done, so tipc_net_lock can be
removed safely.
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Tested-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-21 10:55:48 +08:00
|
|
|
* RCU and node lock set
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
2015-02-05 21:36:36 +08:00
|
|
|
void tipc_bclink_update_link_state(struct tipc_node *n_ptr,
|
2015-01-09 15:27:04 +08:00
|
|
|
u32 last_sent)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
struct sk_buff *buf;
|
2015-02-05 21:36:36 +08:00
|
|
|
struct net *net = n_ptr->net;
|
2015-01-09 15:27:04 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
2006-01-03 02:04:38 +08:00
|
|
|
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
/* Ignore "stale" link state info */
|
|
|
|
if (less_eq(last_sent, n_ptr->bclink.last_in))
|
|
|
|
return;
|
2006-01-03 02:04:38 +08:00
|
|
|
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
/* Update link synchronization state; quit if in sync */
|
|
|
|
bclink_update_last_sent(n_ptr, last_sent);
|
|
|
|
|
|
|
|
if (n_ptr->bclink.last_sent == n_ptr->bclink.last_in)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Update out-of-sync state; quit if loss is still unconfirmed */
|
|
|
|
if ((++n_ptr->bclink.oos_state) == 1) {
|
|
|
|
if (n_ptr->bclink.deferred_size < (TIPC_MIN_LINK_WIN / 2))
|
|
|
|
return;
|
|
|
|
n_ptr->bclink.oos_state++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Don't NACK if one has been recently sent (or seen) */
|
|
|
|
if (n_ptr->bclink.oos_state & 0x1)
|
2006-01-03 02:04:38 +08:00
|
|
|
return;
|
|
|
|
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
/* Send NACK */
|
2010-10-13 21:20:35 +08:00
|
|
|
buf = tipc_buf_acquire(INT_H_SIZE);
|
2006-01-03 02:04:38 +08:00
|
|
|
if (buf) {
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
struct tipc_msg *msg = buf_msg(buf);
|
2015-03-14 04:08:10 +08:00
|
|
|
struct sk_buff *skb = skb_peek(&n_ptr->bclink.deferdq);
|
2014-11-26 11:41:53 +08:00
|
|
|
u32 to = skb ? buf_seqno(skb) - 1 : n_ptr->bclink.last_sent;
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
|
2015-02-05 21:36:36 +08:00
|
|
|
tipc_msg_init(tn->own_addr, msg, BCAST_PROTOCOL, STATE_MSG,
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
INT_H_SIZE, n_ptr->addr);
|
2011-01-26 05:12:39 +08:00
|
|
|
msg_set_non_seq(msg, 1);
|
2015-01-09 15:27:04 +08:00
|
|
|
msg_set_mc_netid(msg, tn->net_id);
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
msg_set_bcast_ack(msg, n_ptr->bclink.last_in);
|
|
|
|
msg_set_bcgap_after(msg, n_ptr->bclink.last_in);
|
2014-11-26 11:41:53 +08:00
|
|
|
msg_set_bcgap_to(msg, to);
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2015-01-09 15:27:06 +08:00
|
|
|
tipc_bearer_send(net, MAX_BEARERS, buf, NULL);
|
2015-01-09 15:27:07 +08:00
|
|
|
tn->bcl->stats.sent_nacks++;
|
|
|
|
tipc_bclink_unlock(net);
|
2011-11-05 01:24:29 +08:00
|
|
|
kfree_skb(buf);
|
2006-01-03 02:04:38 +08:00
|
|
|
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
n_ptr->bclink.oos_state++;
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/transmits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
void tipc_bclink_sync_state(struct tipc_node *n, struct tipc_msg *hdr)
|
|
|
|
{
|
|
|
|
u16 last = msg_last_bcast(hdr);
|
|
|
|
int mtyp = msg_type(hdr);
|
|
|
|
|
|
|
|
if (unlikely(msg_user(hdr) != LINK_PROTOCOL))
|
|
|
|
return;
|
|
|
|
if (mtyp == STATE_MSG) {
|
|
|
|
tipc_bclink_update_link_state(n, last);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* Compatibility: older nodes don't know BCAST_PROTOCOL synchronization,
|
|
|
|
* and transfer synch info in LINK_PROTOCOL messages.
|
|
|
|
*/
|
|
|
|
if (tipc_node_is_up(n))
|
|
|
|
return;
|
|
|
|
if ((mtyp != RESET_MSG) && (mtyp != ACTIVATE_MSG))
|
|
|
|
return;
|
|
|
|
n->bclink.last_sent = last;
|
|
|
|
n->bclink.last_in = last;
|
|
|
|
n->bclink.oos_state = 0;
|
|
|
|
}
|
|
|
|
|
2012-07-10 18:55:09 +08:00
|
|
|
/**
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
* bclink_peek_nack - monitor retransmission requests sent by other nodes
|
2006-01-03 02:04:38 +08:00
|
|
|
*
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
* Delay any upcoming NACK by this node if another node has already
|
|
|
|
* requested the first message this node is going to ask for.
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
2015-01-09 15:27:05 +08:00
|
|
|
static void bclink_peek_nack(struct net *net, struct tipc_msg *msg)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2015-01-09 15:27:05 +08:00
|
|
|
struct tipc_node *n_ptr = tipc_node_find(net, msg_destnode(msg));
|
2006-01-03 02:04:38 +08:00
|
|
|
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
if (unlikely(!n_ptr))
|
2006-01-03 02:04:38 +08:00
|
|
|
return;
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
|
2006-01-18 07:38:21 +08:00
|
|
|
tipc_node_lock(n_ptr);
|
2012-11-16 13:51:30 +08:00
|
|
|
if (n_ptr->bclink.recv_permitted &&
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
(n_ptr->bclink.last_in != n_ptr->bclink.last_sent) &&
|
|
|
|
(n_ptr->bclink.last_in == msg_bcgap_after(msg)))
|
|
|
|
n_ptr->bclink.oos_state = 2;
|
2006-01-18 07:38:21 +08:00
|
|
|
tipc_node_unlock(n_ptr);
|
2015-03-26 18:10:24 +08:00
|
|
|
tipc_node_put(n_ptr);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:33 +08:00
|
|
|
/* tipc_bcast_xmit - deliver buffer chain to all nodes in cluster
|
2014-07-17 08:41:03 +08:00
|
|
|
* and to identified node local sockets
|
2015-01-09 15:27:05 +08:00
|
|
|
* @net: the applicable net namespace
|
2014-11-26 11:41:55 +08:00
|
|
|
* @list: chain of buffers containing message
|
2014-07-17 08:41:00 +08:00
|
|
|
* Consumes the buffer chain, except when returning -ELINKCONG
|
|
|
|
* Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
|
|
|
|
*/
|
2015-10-22 20:51:33 +08:00
|
|
|
int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
|
2014-07-17 08:41:00 +08:00
|
|
|
{
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
|
|
|
struct tipc_link *bcl = tn->bcl;
|
2015-10-22 20:51:33 +08:00
|
|
|
struct tipc_bc_base *bclink = tn->bcbase;
|
2014-07-17 08:41:00 +08:00
|
|
|
int rc = 0;
|
|
|
|
int bc = 0;
|
2014-11-26 11:41:55 +08:00
|
|
|
struct sk_buff *skb;
|
2015-02-05 21:36:44 +08:00
|
|
|
struct sk_buff_head arrvq;
|
|
|
|
struct sk_buff_head inputq;
|
2014-07-17 08:41:00 +08:00
|
|
|
|
|
|
|
/* Prepare clone of message for local node */
|
2014-11-26 11:41:55 +08:00
|
|
|
skb = tipc_msg_reassemble(list);
|
2015-07-17 04:54:23 +08:00
|
|
|
if (unlikely(!skb))
|
2014-07-17 08:41:00 +08:00
|
|
|
return -EHOSTUNREACH;
|
2015-07-17 04:54:23 +08:00
|
|
|
|
2015-02-05 21:36:44 +08:00
|
|
|
/* Broadcast to all nodes */
|
2014-07-17 08:41:00 +08:00
|
|
|
if (likely(bclink)) {
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2014-07-17 08:41:00 +08:00
|
|
|
if (likely(bclink->bcast_nodes.count)) {
|
2015-01-09 15:27:06 +08:00
|
|
|
rc = __tipc_link_xmit(net, bcl, list);
|
2014-07-17 08:41:00 +08:00
|
|
|
if (likely(!rc)) {
|
2015-03-14 04:08:10 +08:00
|
|
|
u32 len = skb_queue_len(&bcl->transmq);
|
2014-11-26 11:41:52 +08:00
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
bclink_set_last_sent(net);
|
2014-07-17 08:41:00 +08:00
|
|
|
bcl->stats.queue_sz_counts++;
|
2014-11-26 11:41:52 +08:00
|
|
|
bcl->stats.accu_queue_sz += len;
|
2014-07-17 08:41:00 +08:00
|
|
|
}
|
|
|
|
bc = 1;
|
|
|
|
}
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2014-07-17 08:41:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(!bc))
|
2014-11-26 11:41:55 +08:00
|
|
|
__skb_queue_purge(list);
|
2014-07-17 08:41:00 +08:00
|
|
|
|
2015-02-05 21:36:44 +08:00
|
|
|
if (unlikely(rc)) {
|
2014-11-26 11:41:55 +08:00
|
|
|
kfree_skb(skb);
|
2015-02-05 21:36:44 +08:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
/* Deliver message clone */
|
|
|
|
__skb_queue_head_init(&arrvq);
|
|
|
|
skb_queue_head_init(&inputq);
|
|
|
|
__skb_queue_tail(&arrvq, skb);
|
|
|
|
tipc_sk_mcast_rcv(net, &arrvq, &inputq);
|
2014-07-17 08:41:00 +08:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2012-07-10 18:55:09 +08:00
|
|
|
/**
|
2011-10-28 04:43:09 +08:00
|
|
|
* bclink_accept_pkt - accept an incoming, in-sequence broadcast packet
|
|
|
|
*
|
2014-05-05 08:56:15 +08:00
|
|
|
* Called with both sending node's lock and bclink_lock taken.
|
2011-10-28 04:43:09 +08:00
|
|
|
*/
|
|
|
|
static void bclink_accept_pkt(struct tipc_node *node, u32 seqno)
|
|
|
|
{
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_net *tn = net_generic(node->net, tipc_net_id);
|
|
|
|
|
2011-10-28 04:43:09 +08:00
|
|
|
bclink_update_last_sent(node, seqno);
|
|
|
|
node->bclink.last_in = seqno;
|
|
|
|
node->bclink.oos_state = 0;
|
2015-01-09 15:27:07 +08:00
|
|
|
tn->bcl->stats.recv_info++;
|
2011-10-28 04:43:09 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Unicast an ACK periodically, ensuring that
|
|
|
|
* all nodes in the cluster don't ACK at the same time
|
|
|
|
*/
|
2015-01-09 15:27:10 +08:00
|
|
|
if (((seqno - tn->own_addr) % TIPC_MIN_LINK_WIN) == 0) {
|
2015-07-17 04:54:19 +08:00
|
|
|
tipc_link_proto_xmit(node_active_link(node, node->addr),
|
tipc: simplify link mtu negotiation
When a link is being established, the two endpoints advertise their
respective interface MTU in the transmitted RESET and ACTIVATE messages.
If there is any difference, the lower of the two MTUs will be selected
for use by both endpoints.
However, as a remnant of earlier attempts to introduce TIPC level
routing. there also exists an MTU discovery mechanism. If an intermediate
node has a lower MTU than the two endpoints, they will discover this
through a bisectional approach, and finally adopt this MTU for common use.
Since there is no TIPC level routing, and probably never will be,
this mechanism doesn't make any sense, and only serves to make the
link level protocol unecessarily complex.
In this commit, we eliminate the MTU discovery algorithm,and fall back
to the simple MTU advertising approach. This change is fully backwards
compatible.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-02 21:33:02 +08:00
|
|
|
STATE_MSG, 0, 0, 0, 0);
|
2015-01-09 15:27:07 +08:00
|
|
|
tn->bcl->stats.sent_acks++;
|
2011-10-28 04:43:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-07-10 18:55:09 +08:00
|
|
|
/**
|
2014-02-18 16:06:46 +08:00
|
|
|
* tipc_bclink_rcv - receive a broadcast packet, and deliver upwards
|
2007-02-09 22:25:21 +08:00
|
|
|
*
|
tipc: purge tipc_net_lock lock
Now tipc routing hierarchy comprises the structures 'node', 'link' and
'bearer'. The whole hierarchy is protected by a big read/write lock,
tipc_net_lock, to ensure that nothing is added or removed while code
is accessing any of these structures. Obviously the locking policy
makes node, link and bearer components closely bound together so that
their relationship becomes unnecessarily complex. In the worst case,
such locking policy not only has a negative influence on performance,
but also it's prone to lead to deadlock occasionally.
In order to decouple the complex relationship between bearer and node
as well as link, the locking policy is adjusted as follows:
- Bearer level
RTNL lock is used on update side, and RCU is used on read side.
Meanwhile, all bearer instances including broadcast bearer are
saved into bearer_list array.
- Node and link level
All node instances are saved into two tipc_node_list and node_htable
lists. The two lists are protected by node_list_lock on write side,
and they are guarded with RCU lock on read side. All members in node
structure including link instances are protected by node spin lock.
- The relationship between bearer and node
When link accesses bearer, it first needs to find the bearer with
its bearer identity from the bearer_list array. When bearer accesses
node, it can iterate the node_htable hash list with the node
address to find the corresponding node.
In the new locking policy, every component has its private locking
solution and the relationship between bearer and node is very simple,
that is, they can find each other with node address or bearer identity
from node_htable hash list or bearer_list array.
Until now above all changes have been done, so tipc_net_lock can be
removed safely.
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Tested-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-21 10:55:48 +08:00
|
|
|
* RCU is locked, no other locks set
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
2015-01-09 15:27:04 +08:00
|
|
|
void tipc_bclink_rcv(struct net *net, struct sk_buff *buf)
|
2006-06-26 14:40:01 +08:00
|
|
|
{
|
2015-01-09 15:27:04 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_link *bcl = tn->bcl;
|
2006-01-03 02:04:38 +08:00
|
|
|
struct tipc_msg *msg = buf_msg(buf);
|
2011-04-08 01:57:25 +08:00
|
|
|
struct tipc_node *node;
|
2006-01-03 02:04:38 +08:00
|
|
|
u32 next_in;
|
|
|
|
u32 seqno;
|
2014-07-17 08:41:01 +08:00
|
|
|
int deferred = 0;
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
int pos = 0;
|
|
|
|
struct sk_buff *iskb;
|
2015-02-05 21:36:44 +08:00
|
|
|
struct sk_buff_head *arrvq, *inputq;
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2011-04-08 01:57:25 +08:00
|
|
|
/* Screen out unwanted broadcast messages */
|
2015-01-09 15:27:04 +08:00
|
|
|
if (msg_mc_netid(msg) != tn->net_id)
|
2011-04-08 01:57:25 +08:00
|
|
|
goto exit;
|
|
|
|
|
2015-01-09 15:27:05 +08:00
|
|
|
node = tipc_node_find(net, msg_prevnode(msg));
|
2011-04-08 01:57:25 +08:00
|
|
|
if (unlikely(!node))
|
|
|
|
goto exit;
|
|
|
|
|
|
|
|
tipc_node_lock(node);
|
2012-11-16 13:51:30 +08:00
|
|
|
if (unlikely(!node->bclink.recv_permitted))
|
2011-04-08 01:57:25 +08:00
|
|
|
goto unlock;
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2011-10-27 03:33:44 +08:00
|
|
|
/* Handle broadcast protocol message */
|
2006-01-03 02:04:38 +08:00
|
|
|
if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) {
|
2011-04-08 02:57:53 +08:00
|
|
|
if (msg_type(msg) != STATE_MSG)
|
|
|
|
goto unlock;
|
2015-01-09 15:27:10 +08:00
|
|
|
if (msg_destnode(msg) == tn->own_addr) {
|
2006-01-18 07:38:21 +08:00
|
|
|
tipc_bclink_acknowledge(node, msg_bcast_ack(msg));
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2006-01-03 02:04:38 +08:00
|
|
|
bcl->stats.recv_nacks++;
|
2015-10-22 20:51:33 +08:00
|
|
|
tn->bcbase->retransmit_to = node;
|
2015-01-09 15:27:07 +08:00
|
|
|
bclink_retransmit_pkt(tn, msg_bcgap_after(msg),
|
2006-01-03 02:04:38 +08:00
|
|
|
msg_bcgap_to(msg));
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
tipc: fix potential deadlock when all links are reset
[ 60.988363] ======================================================
[ 60.988754] [ INFO: possible circular locking dependency detected ]
[ 60.989152] 3.19.0+ #194 Not tainted
[ 60.989377] -------------------------------------------------------
[ 60.989781] swapper/3/0 is trying to acquire lock:
[ 60.990079] (&(&n_ptr->lock)->rlock){+.-...}, at: [<ffffffffa0006dca>] tipc_link_retransmit+0x1aa/0x240 [tipc]
[ 60.990743]
[ 60.990743] but task is already holding lock:
[ 60.991106] (&(&bclink->lock)->rlock){+.-...}, at: [<ffffffffa00004be>] tipc_bclink_lock+0x8e/0xa0 [tipc]
[ 60.991738]
[ 60.991738] which lock already depends on the new lock.
[ 60.991738]
[ 60.992174]
[ 60.992174] the existing dependency chain (in reverse order) is:
[ 60.992174]
-> #1 (&(&bclink->lock)->rlock){+.-...}:
[ 60.992174] [<ffffffff810a9c0c>] lock_acquire+0x9c/0x140
[ 60.992174] [<ffffffff8179c41f>] _raw_spin_lock_bh+0x3f/0x50
[ 60.992174] [<ffffffffa00004be>] tipc_bclink_lock+0x8e/0xa0 [tipc]
[ 60.992174] [<ffffffffa0000f57>] tipc_bclink_add_node+0x97/0xf0 [tipc]
[ 60.992174] [<ffffffffa0011815>] tipc_node_link_up+0xf5/0x110 [tipc]
[ 60.992174] [<ffffffffa0007783>] link_state_event+0x2b3/0x4f0 [tipc]
[ 60.992174] [<ffffffffa00193c0>] tipc_link_proto_rcv+0x24c/0x418 [tipc]
[ 60.992174] [<ffffffffa0008857>] tipc_rcv+0x827/0xac0 [tipc]
[ 60.992174] [<ffffffffa0002ca3>] tipc_l2_rcv_msg+0x73/0xd0 [tipc]
[ 60.992174] [<ffffffff81646e66>] __netif_receive_skb_core+0x746/0x980
[ 60.992174] [<ffffffff816470c1>] __netif_receive_skb+0x21/0x70
[ 60.992174] [<ffffffff81647295>] netif_receive_skb_internal+0x35/0x130
[ 60.992174] [<ffffffff81648218>] napi_gro_receive+0x158/0x1d0
[ 60.992174] [<ffffffff81559e05>] e1000_clean_rx_irq+0x155/0x490
[ 60.992174] [<ffffffff8155c1b7>] e1000_clean+0x267/0x990
[ 60.992174] [<ffffffff81647b60>] net_rx_action+0x150/0x360
[ 60.992174] [<ffffffff8105ec43>] __do_softirq+0x123/0x360
[ 60.992174] [<ffffffff8105f12e>] irq_exit+0x8e/0xb0
[ 60.992174] [<ffffffff8179f9f5>] do_IRQ+0x65/0x110
[ 60.992174] [<ffffffff8179da6f>] ret_from_intr+0x0/0x13
[ 60.992174] [<ffffffff8100de9f>] arch_cpu_idle+0xf/0x20
[ 60.992174] [<ffffffff8109dfa6>] cpu_startup_entry+0x2f6/0x3f0
[ 60.992174] [<ffffffff81033cda>] start_secondary+0x13a/0x150
[ 60.992174]
-> #0 (&(&n_ptr->lock)->rlock){+.-...}:
[ 60.992174] [<ffffffff810a8f7d>] __lock_acquire+0x163d/0x1ca0
[ 60.992174] [<ffffffff810a9c0c>] lock_acquire+0x9c/0x140
[ 60.992174] [<ffffffff8179c41f>] _raw_spin_lock_bh+0x3f/0x50
[ 60.992174] [<ffffffffa0006dca>] tipc_link_retransmit+0x1aa/0x240 [tipc]
[ 60.992174] [<ffffffffa0001e11>] tipc_bclink_rcv+0x611/0x640 [tipc]
[ 60.992174] [<ffffffffa0008646>] tipc_rcv+0x616/0xac0 [tipc]
[ 60.992174] [<ffffffffa0002ca3>] tipc_l2_rcv_msg+0x73/0xd0 [tipc]
[ 60.992174] [<ffffffff81646e66>] __netif_receive_skb_core+0x746/0x980
[ 60.992174] [<ffffffff816470c1>] __netif_receive_skb+0x21/0x70
[ 60.992174] [<ffffffff81647295>] netif_receive_skb_internal+0x35/0x130
[ 60.992174] [<ffffffff81648218>] napi_gro_receive+0x158/0x1d0
[ 60.992174] [<ffffffff81559e05>] e1000_clean_rx_irq+0x155/0x490
[ 60.992174] [<ffffffff8155c1b7>] e1000_clean+0x267/0x990
[ 60.992174] [<ffffffff81647b60>] net_rx_action+0x150/0x360
[ 60.992174] [<ffffffff8105ec43>] __do_softirq+0x123/0x360
[ 60.992174] [<ffffffff8105f12e>] irq_exit+0x8e/0xb0
[ 60.992174] [<ffffffff8179f9f5>] do_IRQ+0x65/0x110
[ 60.992174] [<ffffffff8179da6f>] ret_from_intr+0x0/0x13
[ 60.992174] [<ffffffff8100de9f>] arch_cpu_idle+0xf/0x20
[ 60.992174] [<ffffffff8109dfa6>] cpu_startup_entry+0x2f6/0x3f0
[ 60.992174] [<ffffffff81033cda>] start_secondary+0x13a/0x150
[ 60.992174]
[ 60.992174] other info that might help us debug this:
[ 60.992174]
[ 60.992174] Possible unsafe locking scenario:
[ 60.992174]
[ 60.992174] CPU0 CPU1
[ 60.992174] ---- ----
[ 60.992174] lock(&(&bclink->lock)->rlock);
[ 60.992174] lock(&(&n_ptr->lock)->rlock);
[ 60.992174] lock(&(&bclink->lock)->rlock);
[ 60.992174] lock(&(&n_ptr->lock)->rlock);
[ 60.992174]
[ 60.992174] *** DEADLOCK ***
[ 60.992174]
[ 60.992174] 3 locks held by swapper/3/0:
[ 60.992174] #0: (rcu_read_lock){......}, at: [<ffffffff81646791>] __netif_receive_skb_core+0x71/0x980
[ 60.992174] #1: (rcu_read_lock){......}, at: [<ffffffffa0002c35>] tipc_l2_rcv_msg+0x5/0xd0 [tipc]
[ 60.992174] #2: (&(&bclink->lock)->rlock){+.-...}, at: [<ffffffffa00004be>] tipc_bclink_lock+0x8e/0xa0 [tipc]
[ 60.992174]
The correct sequence for grabbing n_ptr->lock and bclink->lock is that
the former is taken first and the latter is taken afterwards, which is
exactly what happened on CPU1. However, when retransmission on the
broadcast link fails, bclink->lock is taken first in
tipc_bclink_rcv(), and n_ptr->lock is subsequently taken in
link_retransmit_failure(), called by tipc_link_retransmit(), as
demonstrated on CPU0. As a result, a deadlock occurs.
If the order of holding the two locks happening on CPU0 is reversed, the
deadlock risk will be relieved. Therefore, the node lock taken in
link_retransmit_failure() originally is moved to tipc_bclink_rcv()
so that it's obtained before bclink lock. But the precondition of
the adjustment of node lock is that responding to bclink reset event
must be moved from tipc_bclink_unlock() to tipc_node_unlock().
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-26 18:10:23 +08:00
|
|
|
tipc_node_unlock(node);
|
2006-01-03 02:04:38 +08:00
|
|
|
} else {
|
2011-04-08 01:57:25 +08:00
|
|
|
tipc_node_unlock(node);
|
2015-01-09 15:27:05 +08:00
|
|
|
bclink_peek_nack(net, msg);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
2015-03-26 18:10:24 +08:00
|
|
|
tipc_node_put(node);
|
2011-04-08 01:57:25 +08:00
|
|
|
goto exit;
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2011-04-08 01:57:25 +08:00
|
|
|
/* Handle in-sequence broadcast message */
|
2006-01-03 02:04:38 +08:00
|
|
|
seqno = msg_seqno(msg);
|
2011-10-27 03:33:44 +08:00
|
|
|
next_in = mod(node->bclink.last_in + 1);
|
2015-10-22 20:51:33 +08:00
|
|
|
arrvq = &tn->bcbase->arrvq;
|
|
|
|
inputq = &tn->bcbase->inputq;
|
2006-01-03 02:04:38 +08:00
|
|
|
|
|
|
|
if (likely(seqno == next_in)) {
|
2011-10-27 03:33:44 +08:00
|
|
|
receive:
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
/* Deliver message to destination */
|
2006-01-03 02:04:38 +08:00
|
|
|
if (likely(msg_isdata(msg))) {
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2011-10-28 04:43:09 +08:00
|
|
|
bclink_accept_pkt(node, seqno);
|
2015-02-05 21:36:44 +08:00
|
|
|
spin_lock_bh(&inputq->lock);
|
|
|
|
__skb_queue_tail(arrvq, buf);
|
|
|
|
spin_unlock_bh(&inputq->lock);
|
|
|
|
node->action_flags |= TIPC_BCAST_MSG_EVT;
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2006-01-18 07:38:21 +08:00
|
|
|
tipc_node_unlock(node);
|
2006-01-03 02:04:38 +08:00
|
|
|
} else if (msg_user(msg) == MSG_BUNDLER) {
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2011-10-28 04:43:09 +08:00
|
|
|
bclink_accept_pkt(node, seqno);
|
2006-01-03 02:04:38 +08:00
|
|
|
bcl->stats.recv_bundles++;
|
|
|
|
bcl->stats.recv_bundled += msg_msgcnt(msg);
|
2015-02-05 21:36:44 +08:00
|
|
|
pos = 0;
|
|
|
|
while (tipc_msg_extract(buf, &iskb, &pos)) {
|
|
|
|
spin_lock_bh(&inputq->lock);
|
|
|
|
__skb_queue_tail(arrvq, iskb);
|
|
|
|
spin_unlock_bh(&inputq->lock);
|
|
|
|
}
|
|
|
|
node->action_flags |= TIPC_BCAST_MSG_EVT;
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2006-01-18 07:38:21 +08:00
|
|
|
tipc_node_unlock(node);
|
2006-01-03 02:04:38 +08:00
|
|
|
} else if (msg_user(msg) == MSG_FRAGMENTER) {
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2011-10-28 04:43:09 +08:00
|
|
|
bclink_accept_pkt(node, seqno);
|
2015-03-25 18:09:40 +08:00
|
|
|
tipc_buf_append(&node->bclink.reasm_buf, &buf);
|
|
|
|
if (unlikely(!buf && !node->bclink.reasm_buf)) {
|
|
|
|
tipc_bclink_unlock(net);
|
|
|
|
goto unlock;
|
|
|
|
}
|
2006-01-03 02:04:38 +08:00
|
|
|
bcl->stats.recv_fragments++;
|
2014-05-14 17:39:12 +08:00
|
|
|
if (buf) {
|
2006-01-03 02:04:38 +08:00
|
|
|
bcl->stats.recv_fragmented++;
|
tipc: message reassembly using fragment chain
When the first fragment of a long data message is received on a link, a
reassembly buffer large enough to hold the data from this and all subsequent
fragments of the message is allocated. The payload of each new fragment is
copied into this buffer upon arrival. When the last fragment is received, the
reassembled message is delivered upwards to the port/socket layer.
Not only is this an inefficient approach, but it may also cause bursts of
reassembly failures in low memory situations, since we may fail to allocate
the necessary large buffer in the first place. Furthermore, after 100 subsequent
such failures the link will be reset, something that in reality aggravates the
situation.
To remedy this problem, this patch introduces a different approach. Instead of
allocating a big reassembly buffer, we now append the arriving fragments
to a reassembly chain on the link, and deliver the whole chain up to the
socket layer once the last fragment has been received. This is safe because
the retransmission layer of a TIPC link always delivers packets in strict
uninterrupted order, to the reassembly layer as to all other upper layers.
Hence there can never be more than one fragment chain pending reassembly at
any given time in a link, and we can trust (but still verify) that the
fragments will be chained up in the correct order.
Signed-off-by: Erik Hugne <erik.hugne@ericsson.com>
Reviewed-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-11-06 16:28:06 +08:00
|
|
|
msg = buf_msg(buf);
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2013-11-06 16:28:05 +08:00
|
|
|
goto receive;
|
|
|
|
}
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2006-01-18 07:38:21 +08:00
|
|
|
tipc_node_unlock(node);
|
2006-01-03 02:04:38 +08:00
|
|
|
} else {
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2011-10-28 04:43:09 +08:00
|
|
|
bclink_accept_pkt(node, seqno);
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2006-01-18 07:38:21 +08:00
|
|
|
tipc_node_unlock(node);
|
2011-11-05 01:24:29 +08:00
|
|
|
kfree_skb(buf);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
2011-04-08 01:57:25 +08:00
|
|
|
buf = NULL;
|
2011-10-27 03:33:44 +08:00
|
|
|
|
|
|
|
/* Determine new synchronization state */
|
2011-04-08 01:57:25 +08:00
|
|
|
tipc_node_lock(node);
|
2011-10-27 03:33:44 +08:00
|
|
|
if (unlikely(!tipc_node_is_up(node)))
|
|
|
|
goto unlock;
|
|
|
|
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
if (node->bclink.last_in == node->bclink.last_sent)
|
2011-10-27 03:33:44 +08:00
|
|
|
goto unlock;
|
|
|
|
|
2015-03-14 04:08:10 +08:00
|
|
|
if (skb_queue_empty(&node->bclink.deferdq)) {
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
node->bclink.oos_state = 1;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
2015-03-14 04:08:10 +08:00
|
|
|
msg = buf_msg(skb_peek(&node->bclink.deferdq));
|
2011-10-27 03:33:44 +08:00
|
|
|
seqno = msg_seqno(msg);
|
|
|
|
next_in = mod(next_in + 1);
|
|
|
|
if (seqno != next_in)
|
|
|
|
goto unlock;
|
|
|
|
|
|
|
|
/* Take in-sequence message from deferred queue & deliver it */
|
2015-03-14 04:08:10 +08:00
|
|
|
buf = __skb_dequeue(&node->bclink.deferdq);
|
2011-10-27 03:33:44 +08:00
|
|
|
goto receive;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Handle out-of-sequence broadcast message */
|
|
|
|
if (less(next_in, seqno)) {
|
2015-03-14 04:08:10 +08:00
|
|
|
deferred = tipc_link_defer_pkt(&node->bclink.deferdq,
|
2011-10-27 03:33:44 +08:00
|
|
|
buf);
|
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-28 02:17:53 +08:00
|
|
|
bclink_update_last_sent(node, seqno);
|
2011-04-08 01:57:25 +08:00
|
|
|
buf = NULL;
|
2014-07-17 08:41:01 +08:00
|
|
|
}
|
2011-10-27 03:33:44 +08:00
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2011-10-27 04:13:35 +08:00
|
|
|
|
2011-10-27 03:33:44 +08:00
|
|
|
if (deferred)
|
|
|
|
bcl->stats.deferred_recv++;
|
2011-10-27 03:57:26 +08:00
|
|
|
else
|
|
|
|
bcl->stats.duplicates++;
|
2011-10-27 03:33:44 +08:00
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2011-10-27 04:13:35 +08:00
|
|
|
|
2011-04-08 01:57:25 +08:00
|
|
|
unlock:
|
2006-01-18 07:38:21 +08:00
|
|
|
tipc_node_unlock(node);
|
2015-03-26 18:10:24 +08:00
|
|
|
tipc_node_put(node);
|
2011-04-08 01:57:25 +08:00
|
|
|
exit:
|
2011-11-05 01:24:29 +08:00
|
|
|
kfree_skb(buf);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2008-09-03 14:38:32 +08:00
|
|
|
u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2012-11-16 13:51:30 +08:00
|
|
|
return (n_ptr->bclink.recv_permitted &&
|
2015-01-09 15:27:07 +08:00
|
|
|
(tipc_bclink_get_last_sent(n_ptr->net) != n_ptr->bclink.acked));
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
2006-01-18 07:38:21 +08:00
|
|
|
* tipc_bcbearer_send - send a packet through the broadcast pseudo-bearer
|
2007-02-09 22:25:21 +08:00
|
|
|
*
|
2011-04-07 22:44:54 +08:00
|
|
|
* Send packet over as many bearers as necessary to reach all nodes
|
|
|
|
* that have joined the broadcast link.
|
2007-02-09 22:25:21 +08:00
|
|
|
*
|
2011-04-07 22:44:54 +08:00
|
|
|
* Returns 0 (packet sent successfully) under all circumstances,
|
|
|
|
* since the broadcast link's pseudo-bearer never blocks
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
2015-01-09 15:27:07 +08:00
|
|
|
static int tipc_bcbearer_send(struct net *net, struct sk_buff *buf,
|
|
|
|
struct tipc_bearer *unused1,
|
2006-03-21 14:37:52 +08:00
|
|
|
struct tipc_media_addr *unused2)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
|
|
|
int bp_index;
|
tipc: ensure sequential message delivery across dual bearers
When we run broadcast packets over dual bearers/interfaces, the
current transmission code is flipping bearers between each sent
packet, with the purpose of leveraging the double bandwidth
available. The receiving bclink is resequencing the packets if
needed, so all messages are delivered upwards from the broadcast
link in the correct order, even if they may arrive in concurrent
interrupts.
However, at the moment of delivery upwards to the socket, we release
all spinlocks (bclink_lock, node_lock), so it is still possible
that arriving messages bypass each other before they reach the socket
queue.
We fix this by applying the same technique we are using for unicast
traffic. We use a link selector (i.e., the last bit of sending port
number) to ensure that messages from the same sender socket always are
sent over the same bearer. This guarantees sequential delivery between
socket pairs, which is sufficient to satisfy the protocol spec, as well
as all known user requirements.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-17 08:41:04 +08:00
|
|
|
struct tipc_msg *msg = buf_msg(buf);
|
2015-01-09 15:27:04 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_bcbearer *bcbearer = tn->bcbearer;
|
2015-10-22 20:51:33 +08:00
|
|
|
struct tipc_bc_base *bclink = tn->bcbase;
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2013-05-01 20:04:44 +08:00
|
|
|
/* Prepare broadcast link message for reliable transmission,
|
2011-04-07 22:44:54 +08:00
|
|
|
* if first time trying to send it;
|
|
|
|
* preparation is skipped for broadcast link protocol messages
|
|
|
|
* since they are sent in an unreliable manner and don't need it
|
|
|
|
*/
|
2006-01-03 02:04:38 +08:00
|
|
|
if (likely(!msg_non_seq(buf_msg(buf)))) {
|
2011-10-24 23:18:12 +08:00
|
|
|
bcbuf_set_acks(buf, bclink->bcast_nodes.count);
|
2008-06-05 08:54:48 +08:00
|
|
|
msg_set_non_seq(msg, 1);
|
2015-01-09 15:27:04 +08:00
|
|
|
msg_set_mc_netid(msg, tn->net_id);
|
2015-01-09 15:27:07 +08:00
|
|
|
tn->bcl->stats.sent_info++;
|
2011-10-24 23:18:12 +08:00
|
|
|
if (WARN_ON(!bclink->bcast_nodes.count)) {
|
2011-05-24 01:14:18 +08:00
|
|
|
dump_stack();
|
|
|
|
return 0;
|
|
|
|
}
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Send buffer over bearers until all targets reached */
|
2011-10-24 23:18:12 +08:00
|
|
|
bcbearer->remains = bclink->bcast_nodes;
|
2006-01-03 02:04:38 +08:00
|
|
|
|
|
|
|
for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) {
|
2011-01-08 02:00:11 +08:00
|
|
|
struct tipc_bearer *p = bcbearer->bpairs[bp_index].primary;
|
|
|
|
struct tipc_bearer *s = bcbearer->bpairs[bp_index].secondary;
|
tipc: ensure sequential message delivery across dual bearers
When we run broadcast packets over dual bearers/interfaces, the
current transmission code is flipping bearers between each sent
packet, with the purpose of leveraging the double bandwidth
available. The receiving bclink is resequencing the packets if
needed, so all messages are delivered upwards from the broadcast
link in the correct order, even if they may arrive in concurrent
interrupts.
However, at the moment of delivery upwards to the socket, we release
all spinlocks (bclink_lock, node_lock), so it is still possible
that arriving messages bypass each other before they reach the socket
queue.
We fix this by applying the same technique we are using for unicast
traffic. We use a link selector (i.e., the last bit of sending port
number) to ensure that messages from the same sender socket always are
sent over the same bearer. This guarantees sequential delivery between
socket pairs, which is sufficient to satisfy the protocol spec, as well
as all known user requirements.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-17 08:41:04 +08:00
|
|
|
struct tipc_bearer *bp[2] = {p, s};
|
|
|
|
struct tipc_bearer *b = bp[msg_link_selector(msg)];
|
tipc: pskb_copy() buffers when sending on more than one bearer
When sending packets, TIPC bearers use skb_clone() before writing their
hardware header. This will however NOT copy the data buffer.
So when the same packet is sent over multiple bearers (to reach multiple
nodes), the same socket buffer data will be treated by multiple
tipc_media drivers which will write their own hardware header through
dev_hard_header().
Most of the time this is not a problem, because by the time the
packet is processed by the second media, it has already been sent over
the first one. However, when the first transmission is delayed (e.g.
because of insufficient bandwidth or through a shaper), the next bearer
will overwrite the hardware header, resulting in the packet being sent:
a) with the wrong source address, when bearers of the same type,
e.g. ethernet, are involved
b) with a completely corrupt header, or even dropped, when bearers of
different types are involved.
So when the same socket buffer is to be sent multiple times, send a
pskb_copy() instead (from the second instance on), and release it
afterwards (the bearer will skb_clone() it anyway).
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-05-01 20:04:46 +08:00
|
|
|
struct sk_buff *tbuf;
|
2006-01-03 02:04:38 +08:00
|
|
|
|
|
|
|
if (!p)
|
2013-05-01 20:04:44 +08:00
|
|
|
break; /* No more bearers to try */
|
tipc: ensure sequential message delivery across dual bearers
When we run broadcast packets over dual bearers/interfaces, the
current transmission code is flipping bearers between each sent
packet, with the purpose of leveraging the double bandwidth
available. The receiving bclink is resequencing the packets if
needed, so all messages are delivered upwards from the broadcast
link in the correct order, even if they may arrive in concurrent
interrupts.
However, at the moment of delivery upwards to the socket, we release
all spinlocks (bclink_lock, node_lock), so it is still possible
that arriving messages bypass each other before they reach the socket
queue.
We fix this by applying the same technique we are using for unicast
traffic. We use a link selector (i.e., the last bit of sending port
number) to ensure that messages from the same sender socket always are
sent over the same bearer. This guarantees sequential delivery between
socket pairs, which is sufficient to satisfy the protocol spec, as well
as all known user requirements.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-17 08:41:04 +08:00
|
|
|
if (!b)
|
|
|
|
b = p;
|
2013-05-01 20:04:45 +08:00
|
|
|
tipc_nmap_diff(&bcbearer->remains, &b->nodes,
|
2013-05-01 20:04:44 +08:00
|
|
|
&bcbearer->remains_new);
|
2006-06-26 14:53:20 +08:00
|
|
|
if (bcbearer->remains_new.count == bcbearer->remains.count)
|
2013-05-01 20:04:44 +08:00
|
|
|
continue; /* Nothing added by bearer pair */
|
2006-01-03 02:04:38 +08:00
|
|
|
|
tipc: pskb_copy() buffers when sending on more than one bearer
When sending packets, TIPC bearers use skb_clone() before writing their
hardware header. This will however NOT copy the data buffer.
So when the same packet is sent over multiple bearers (to reach multiple
nodes), the same socket buffer data will be treated by multiple
tipc_media drivers which will write their own hardware header through
dev_hard_header().
Most of the time this is not a problem, because by the time the
packet is processed by the second media, it has already been sent over
the first one. However, when the first transmission is delayed (e.g.
because of insufficient bandwidth or through a shaper), the next bearer
will overwrite the hardware header, resulting in the packet being sent:
a) with the wrong source address, when bearers of the same type,
e.g. ethernet, are involved
b) with a completely corrupt header, or even dropped, when bearers of
different types are involved.
So when the same socket buffer is to be sent multiple times, send a
pskb_copy() instead (from the second instance on), and release it
afterwards (the bearer will skb_clone() it anyway).
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-05-01 20:04:46 +08:00
|
|
|
if (bp_index == 0) {
|
|
|
|
/* Use original buffer for first bearer */
|
2015-01-09 15:27:06 +08:00
|
|
|
tipc_bearer_send(net, b->identity, buf, &b->bcast_addr);
|
tipc: pskb_copy() buffers when sending on more than one bearer
When sending packets, TIPC bearers use skb_clone() before writing their
hardware header. This will however NOT copy the data buffer.
So when the same packet is sent over multiple bearers (to reach multiple
nodes), the same socket buffer data will be treated by multiple
tipc_media drivers which will write their own hardware header through
dev_hard_header().
Most of the time this is not a problem, because by the time the
packet is processed by the second media, it has already been sent over
the first one. However, when the first transmission is delayed (e.g.
because of insufficient bandwidth or through a shaper), the next bearer
will overwrite the hardware header, resulting in the packet being sent:
a) with the wrong source address, when bearers of the same type,
e.g. ethernet, are involved
b) with a completely corrupt header, or even dropped, when bearers of
different types are involved.
So when the same socket buffer is to be sent multiple times, send a
pskb_copy() instead (from the second instance on), and release it
afterwards (the bearer will skb_clone() it anyway).
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-05-01 20:04:46 +08:00
|
|
|
} else {
|
|
|
|
/* Avoid concurrent buffer access */
|
2014-06-12 06:36:26 +08:00
|
|
|
tbuf = pskb_copy_for_clone(buf, GFP_ATOMIC);
|
tipc: pskb_copy() buffers when sending on more than one bearer
When sending packets, TIPC bearers use skb_clone() before writing their
hardware header. This will however NOT copy the data buffer.
So when the same packet is sent over multiple bearers (to reach multiple
nodes), the same socket buffer data will be treated by multiple
tipc_media drivers which will write their own hardware header through
dev_hard_header().
Most of the time this is not a problem, because by the time the
packet is processed by the second media, it has already been sent over
the first one. However, when the first transmission is delayed (e.g.
because of insufficient bandwidth or through a shaper), the next bearer
will overwrite the hardware header, resulting in the packet being sent:
a) with the wrong source address, when bearers of the same type,
e.g. ethernet, are involved
b) with a completely corrupt header, or even dropped, when bearers of
different types are involved.
So when the same socket buffer is to be sent multiple times, send a
pskb_copy() instead (from the second instance on), and release it
afterwards (the bearer will skb_clone() it anyway).
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-05-01 20:04:46 +08:00
|
|
|
if (!tbuf)
|
|
|
|
break;
|
2015-01-09 15:27:06 +08:00
|
|
|
tipc_bearer_send(net, b->identity, tbuf,
|
|
|
|
&b->bcast_addr);
|
tipc: pskb_copy() buffers when sending on more than one bearer
When sending packets, TIPC bearers use skb_clone() before writing their
hardware header. This will however NOT copy the data buffer.
So when the same packet is sent over multiple bearers (to reach multiple
nodes), the same socket buffer data will be treated by multiple
tipc_media drivers which will write their own hardware header through
dev_hard_header().
Most of the time this is not a problem, because by the time the
packet is processed by the second media, it has already been sent over
the first one. However, when the first transmission is delayed (e.g.
because of insufficient bandwidth or through a shaper), the next bearer
will overwrite the hardware header, resulting in the packet being sent:
a) with the wrong source address, when bearers of the same type,
e.g. ethernet, are involved
b) with a completely corrupt header, or even dropped, when bearers of
different types are involved.
So when the same socket buffer is to be sent multiple times, send a
pskb_copy() instead (from the second instance on), and release it
afterwards (the bearer will skb_clone() it anyway).
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-05-01 20:04:46 +08:00
|
|
|
kfree_skb(tbuf); /* Bearer keeps a clone */
|
|
|
|
}
|
2006-06-26 14:53:20 +08:00
|
|
|
if (bcbearer->remains_new.count == 0)
|
2013-05-01 20:04:44 +08:00
|
|
|
break; /* All targets reached */
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2006-06-26 14:53:20 +08:00
|
|
|
bcbearer->remains = bcbearer->remains_new;
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
2007-02-09 22:25:21 +08:00
|
|
|
|
2011-04-07 22:44:54 +08:00
|
|
|
return 0;
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2006-01-18 07:38:21 +08:00
|
|
|
* tipc_bcbearer_sort - create sets of bearer pairs used by broadcast bearer
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
2015-01-09 15:27:06 +08:00
|
|
|
void tipc_bcbearer_sort(struct net *net, struct tipc_node_map *nm_ptr,
|
|
|
|
u32 node, bool action)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2015-01-09 15:27:06 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_bcbearer *bcbearer = tn->bcbearer;
|
2011-12-30 09:55:27 +08:00
|
|
|
struct tipc_bcbearer_pair *bp_temp = bcbearer->bpairs_temp;
|
|
|
|
struct tipc_bcbearer_pair *bp_curr;
|
2014-04-21 10:55:45 +08:00
|
|
|
struct tipc_bearer *b;
|
2006-01-03 02:04:38 +08:00
|
|
|
int b_index;
|
|
|
|
int pri;
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2014-04-21 10:55:51 +08:00
|
|
|
if (action)
|
|
|
|
tipc_nmap_add(nm_ptr, node);
|
|
|
|
else
|
|
|
|
tipc_nmap_remove(nm_ptr, node);
|
|
|
|
|
2006-01-03 02:04:38 +08:00
|
|
|
/* Group bearers by priority (can assume max of two per priority) */
|
|
|
|
memset(bp_temp, 0, sizeof(bcbearer->bpairs_temp));
|
|
|
|
|
2014-04-21 10:55:45 +08:00
|
|
|
rcu_read_lock();
|
2006-01-03 02:04:38 +08:00
|
|
|
for (b_index = 0; b_index < MAX_BEARERS; b_index++) {
|
2015-01-09 15:27:06 +08:00
|
|
|
b = rcu_dereference_rtnl(tn->bearer_list[b_index]);
|
2014-03-27 12:54:34 +08:00
|
|
|
if (!b || !b->nodes.count)
|
2006-01-03 02:04:38 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!bp_temp[b->priority].primary)
|
|
|
|
bp_temp[b->priority].primary = b;
|
|
|
|
else
|
|
|
|
bp_temp[b->priority].secondary = b;
|
|
|
|
}
|
2014-04-21 10:55:45 +08:00
|
|
|
rcu_read_unlock();
|
2006-01-03 02:04:38 +08:00
|
|
|
|
|
|
|
/* Create array of bearer pairs for broadcasting */
|
|
|
|
bp_curr = bcbearer->bpairs;
|
|
|
|
memset(bcbearer->bpairs, 0, sizeof(bcbearer->bpairs));
|
|
|
|
|
2006-01-14 05:22:22 +08:00
|
|
|
for (pri = TIPC_MAX_LINK_PRI; pri >= 0; pri--) {
|
2006-01-03 02:04:38 +08:00
|
|
|
|
|
|
|
if (!bp_temp[pri].primary)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
bp_curr->primary = bp_temp[pri].primary;
|
|
|
|
|
|
|
|
if (bp_temp[pri].secondary) {
|
2006-01-18 07:38:21 +08:00
|
|
|
if (tipc_nmap_equal(&bp_temp[pri].primary->nodes,
|
|
|
|
&bp_temp[pri].secondary->nodes)) {
|
2006-01-03 02:04:38 +08:00
|
|
|
bp_curr->secondary = bp_temp[pri].secondary;
|
|
|
|
} else {
|
|
|
|
bp_curr++;
|
|
|
|
bp_curr->primary = bp_temp[pri].secondary;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bp_curr++;
|
|
|
|
}
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2014-11-24 18:10:29 +08:00
|
|
|
static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb,
|
|
|
|
struct tipc_stats *stats)
|
2014-11-20 17:29:12 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct nlattr *nest;
|
|
|
|
|
|
|
|
struct nla_map {
|
|
|
|
__u32 key;
|
|
|
|
__u32 val;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct nla_map map[] = {
|
|
|
|
{TIPC_NLA_STATS_RX_INFO, stats->recv_info},
|
|
|
|
{TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments},
|
|
|
|
{TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented},
|
|
|
|
{TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles},
|
|
|
|
{TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled},
|
|
|
|
{TIPC_NLA_STATS_TX_INFO, stats->sent_info},
|
|
|
|
{TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments},
|
|
|
|
{TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented},
|
|
|
|
{TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles},
|
|
|
|
{TIPC_NLA_STATS_TX_BUNDLED, stats->sent_bundled},
|
|
|
|
{TIPC_NLA_STATS_RX_NACKS, stats->recv_nacks},
|
|
|
|
{TIPC_NLA_STATS_RX_DEFERRED, stats->deferred_recv},
|
|
|
|
{TIPC_NLA_STATS_TX_NACKS, stats->sent_nacks},
|
|
|
|
{TIPC_NLA_STATS_TX_ACKS, stats->sent_acks},
|
|
|
|
{TIPC_NLA_STATS_RETRANSMITTED, stats->retransmitted},
|
|
|
|
{TIPC_NLA_STATS_DUPLICATES, stats->duplicates},
|
|
|
|
{TIPC_NLA_STATS_LINK_CONGS, stats->link_congs},
|
|
|
|
{TIPC_NLA_STATS_MAX_QUEUE, stats->max_queue_sz},
|
|
|
|
{TIPC_NLA_STATS_AVG_QUEUE, stats->queue_sz_counts ?
|
|
|
|
(stats->accu_queue_sz / stats->queue_sz_counts) : 0}
|
|
|
|
};
|
|
|
|
|
|
|
|
nest = nla_nest_start(skb, TIPC_NLA_LINK_STATS);
|
|
|
|
if (!nest)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(map); i++)
|
|
|
|
if (nla_put_u32(skb, map[i].key, map[i].val))
|
|
|
|
goto msg_full;
|
|
|
|
|
|
|
|
nla_nest_end(skb, nest);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
msg_full:
|
|
|
|
nla_nest_cancel(skb, nest);
|
|
|
|
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg)
|
2014-11-20 17:29:12 +08:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
void *hdr;
|
|
|
|
struct nlattr *attrs;
|
|
|
|
struct nlattr *prop;
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
|
|
|
struct tipc_link *bcl = tn->bcl;
|
2014-11-20 17:29:12 +08:00
|
|
|
|
|
|
|
if (!bcl)
|
|
|
|
return 0;
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2014-11-20 17:29:12 +08:00
|
|
|
|
2015-02-09 16:50:03 +08:00
|
|
|
hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
|
2014-11-20 17:29:12 +08:00
|
|
|
NLM_F_MULTI, TIPC_NL_LINK_GET);
|
|
|
|
if (!hdr)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
|
|
|
attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK);
|
|
|
|
if (!attrs)
|
|
|
|
goto msg_full;
|
|
|
|
|
|
|
|
/* The broadcast link is always up */
|
|
|
|
if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP))
|
|
|
|
goto attr_msg_full;
|
|
|
|
|
|
|
|
if (nla_put_flag(msg->skb, TIPC_NLA_LINK_BROADCAST))
|
|
|
|
goto attr_msg_full;
|
|
|
|
if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name))
|
|
|
|
goto attr_msg_full;
|
2015-05-14 22:46:15 +08:00
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt))
|
2014-11-20 17:29:12 +08:00
|
|
|
goto attr_msg_full;
|
2015-05-14 22:46:15 +08:00
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt))
|
2014-11-20 17:29:12 +08:00
|
|
|
goto attr_msg_full;
|
|
|
|
|
|
|
|
prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP);
|
|
|
|
if (!prop)
|
|
|
|
goto attr_msg_full;
|
tipc: introduce starvation free send algorithm
Currently, we only use a single counter; the length of the backlog
queue, to determine whether a message should be accepted to the queue
or not. Each time a message is being sent, the queue length is compared
to a threshold value for the message's importance priority. If the queue
length is beyond this threshold, the message is rejected. This algorithm
implies a risk of starvation of low importance senders during very high
load, because it may take a long time before the backlog queue has
decreased enough to accept a lower level message.
We now eliminate this risk by introducing a counter for each importance
priority. When a message is sent, we check only the queue level for that
particular message's priority. If that is ok, the message can be added
to the backlog, irrespective of the queue level for other priorities.
This way, each level is guaranteed a certain portion of the total
bandwidth, and any risk of starvation is eliminated.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-26 00:07:24 +08:00
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window))
|
2014-11-20 17:29:12 +08:00
|
|
|
goto prop_msg_full;
|
|
|
|
nla_nest_end(msg->skb, prop);
|
|
|
|
|
|
|
|
err = __tipc_nl_add_bc_link_stat(msg->skb, &bcl->stats);
|
|
|
|
if (err)
|
|
|
|
goto attr_msg_full;
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2014-11-20 17:29:12 +08:00
|
|
|
nla_nest_end(msg->skb, attrs);
|
|
|
|
genlmsg_end(msg->skb, hdr);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
prop_msg_full:
|
|
|
|
nla_nest_cancel(msg->skb, prop);
|
|
|
|
attr_msg_full:
|
|
|
|
nla_nest_cancel(msg->skb, attrs);
|
|
|
|
msg_full:
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2014-11-20 17:29:12 +08:00
|
|
|
genlmsg_cancel(msg->skb, hdr);
|
|
|
|
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
int tipc_bclink_reset_stats(struct net *net)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
|
|
|
struct tipc_link *bcl = tn->bcl;
|
|
|
|
|
2006-01-03 02:04:38 +08:00
|
|
|
if (!bcl)
|
|
|
|
return -ENOPROTOOPT;
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
2006-01-03 02:04:38 +08:00
|
|
|
memset(&bcl->stats, 0, sizeof(bcl->stats));
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_unlock(net);
|
2008-07-15 13:44:01 +08:00
|
|
|
return 0;
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
/*
 * tipc_bclink_set_queue_limits - set the broadcast link window
 * @net: namespace whose broadcast link is configured
 * @limit: requested window; values below BCLINK_WIN_MIN are raised to it
 *
 * Returns 0 on success, -ENOPROTOOPT if there is no broadcast link, or
 * -EINVAL if the (possibly raised) window exceeds TIPC_MAX_LINK_WIN.
 */
int tipc_bclink_set_queue_limits(struct net *net, u32 limit)
{
	struct tipc_net *tn = net_generic(net, tipc_net_id);
	struct tipc_link *link = tn->bcl;
	u32 win;

	if (!link)
		return -ENOPROTOOPT;

	/* Silently raise too-small requests instead of rejecting them */
	win = (limit < BCLINK_WIN_MIN) ? BCLINK_WIN_MIN : limit;
	if (win > TIPC_MAX_LINK_WIN)
		return -EINVAL;

	tipc_bclink_lock(net);
	tipc_link_set_queue_limits(link, win);
	tipc_bclink_unlock(net);

	return 0;
}
|
|
|
|
|
2015-05-06 19:58:55 +08:00
|
|
|
int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[])
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
u32 win;
|
|
|
|
struct nlattr *props[TIPC_NLA_PROP_MAX + 1];
|
|
|
|
|
|
|
|
if (!attrs[TIPC_NLA_LINK_PROP])
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (!props[TIPC_NLA_PROP_WIN])
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
|
|
|
|
|
|
|
|
return tipc_bclink_set_queue_limits(net, win);
|
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:33 +08:00
|
|
|
int tipc_bcast_init(struct net *net)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2015-01-09 15:27:06 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_bcbearer *bcbearer;
|
2015-10-22 20:51:33 +08:00
|
|
|
struct tipc_bc_base *bclink;
|
2015-01-09 15:27:07 +08:00
|
|
|
struct tipc_link *bcl;
|
2015-01-09 15:27:06 +08:00
|
|
|
|
2014-05-05 08:56:16 +08:00
|
|
|
bcbearer = kzalloc(sizeof(*bcbearer), GFP_ATOMIC);
|
|
|
|
if (!bcbearer)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
bclink = kzalloc(sizeof(*bclink), GFP_ATOMIC);
|
|
|
|
if (!bclink) {
|
|
|
|
kfree(bcbearer);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
bcl = &bclink->link;
|
2006-01-03 02:04:38 +08:00
|
|
|
bcbearer->bearer.media = &bcbearer->media;
|
2006-01-18 07:38:21 +08:00
|
|
|
bcbearer->media.send_msg = tipc_bcbearer_send;
|
2011-04-07 22:22:31 +08:00
|
|
|
sprintf(bcbearer->media.name, "tipc-broadcast");
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2015-10-22 20:51:34 +08:00
|
|
|
spin_lock_init(&tipc_net(net)->bclock);
|
2015-03-14 04:08:10 +08:00
|
|
|
__skb_queue_head_init(&bcl->transmq);
|
|
|
|
__skb_queue_head_init(&bcl->backlogq);
|
|
|
|
__skb_queue_head_init(&bcl->deferdq);
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
skb_queue_head_init(&bcl->wakeupq);
|
2015-05-14 22:46:15 +08:00
|
|
|
bcl->snd_nxt = 1;
|
2006-06-27 17:53:55 +08:00
|
|
|
spin_lock_init(&bclink->node.lock);
|
2015-02-05 21:36:44 +08:00
|
|
|
__skb_queue_head_init(&bclink->arrvq);
|
|
|
|
skb_queue_head_init(&bclink->inputq);
|
2006-01-03 02:04:38 +08:00
|
|
|
bcl->owner = &bclink->node;
|
2015-01-09 15:27:07 +08:00
|
|
|
bcl->owner->net = net;
|
tipc: simplify link mtu negotiation
When a link is being established, the two endpoints advertise their
respective interface MTU in the transmitted RESET and ACTIVATE messages.
If there is any difference, the lower of the two MTUs will be selected
for use by both endpoints.
However, as a remnant of earlier attempts to introduce TIPC level
routing. there also exists an MTU discovery mechanism. If an intermediate
node has a lower MTU than the two endpoints, they will discover this
through a bisectional approach, and finally adopt this MTU for common use.
Since there is no TIPC level routing, and probably never will be,
this mechanism doesn't make any sense, and only serves to make the
link level protocol unecessarily complex.
In this commit, we eliminate the MTU discovery algorithm,and fall back
to the simple MTU advertising approach. This change is fully backwards
compatible.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-02 21:33:02 +08:00
|
|
|
bcl->mtu = MAX_PKT_DEFAULT_MCAST;
|
2006-01-18 07:38:21 +08:00
|
|
|
tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
|
tipc: decouple the relationship between bearer and link
Currently on both paths of message transmission and reception, the
read lock of tipc_net_lock must be held before bearer is accessed,
while the write lock of tipc_net_lock has to be taken before bearer
is configured. Although it can ensure that bearer is always valid on
the two data paths, link and bearer is closely bound together.
So as the part of effort of removing tipc_net_lock, the locking
policy of bearer protection will be adjusted as below: on the two
data paths, RCU is used, and on the configuration path of bearer,
RTNL lock is applied.
Now RCU just covers the path of message reception. To make it possible
to protect the path of message transmission with RCU, link should not
use its stored bearer pointer to access bearer, but it should use the
bearer identity of its attached bearer as index to get bearer instance
from bearer_list array, which can help us decouple the relationship
between bearer and link. As a result, bearer on the path of message
transmission can be safely protected by RCU when we access bearer_list
array within RCU lock protection.
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Tested-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-21 10:55:46 +08:00
|
|
|
bcl->bearer_id = MAX_BEARERS;
|
2015-01-09 15:27:06 +08:00
|
|
|
rcu_assign_pointer(tn->bearer_list[MAX_BEARERS], &bcbearer->bearer);
|
2015-02-05 21:36:36 +08:00
|
|
|
bcl->pmsg = (struct tipc_msg *)&bcl->proto_msg;
|
2015-10-22 20:51:35 +08:00
|
|
|
|
2009-03-19 10:11:29 +08:00
|
|
|
strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME);
|
2015-01-09 15:27:07 +08:00
|
|
|
tn->bcbearer = bcbearer;
|
2015-10-22 20:51:33 +08:00
|
|
|
tn->bcbase = bclink;
|
2015-01-09 15:27:07 +08:00
|
|
|
tn->bcl = bcl;
|
2014-05-05 08:56:16 +08:00
|
|
|
return 0;
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:35 +08:00
|
|
|
void tipc_bcast_reinit(struct net *net)
|
|
|
|
{
|
|
|
|
struct tipc_bc_base *b = tipc_bc_base(net);
|
|
|
|
|
|
|
|
msg_set_prevnode(b->link.pmsg, tipc_own_addr(net));
|
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:33 +08:00
|
|
|
void tipc_bcast_stop(struct net *net)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2015-01-09 15:27:06 +08:00
|
|
|
struct tipc_net *tn = net_generic(net, tipc_net_id);
|
|
|
|
|
2015-01-09 15:27:07 +08:00
|
|
|
tipc_bclink_lock(net);
|
|
|
|
tipc_link_purge_queues(tn->bcl);
|
|
|
|
tipc_bclink_unlock(net);
|
2015-01-09 15:27:06 +08:00
|
|
|
RCU_INIT_POINTER(tn->bearer_list[BCBEARER], NULL);
|
2014-05-05 08:56:16 +08:00
|
|
|
synchronize_net();
|
2015-01-09 15:27:07 +08:00
|
|
|
kfree(tn->bcbearer);
|
2015-10-22 20:51:33 +08:00
|
|
|
kfree(tn->bcbase);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2010-05-11 22:30:14 +08:00
|
|
|
/**
|
|
|
|
* tipc_nmap_add - add a node to a node map
|
|
|
|
*/
|
2014-04-21 10:55:51 +08:00
|
|
|
static void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node)
|
2010-05-11 22:30:14 +08:00
|
|
|
{
|
|
|
|
int n = tipc_node(node);
|
|
|
|
int w = n / WSIZE;
|
|
|
|
u32 mask = (1 << (n % WSIZE));
|
|
|
|
|
|
|
|
if ((nm_ptr->map[w] & mask) == 0) {
|
|
|
|
nm_ptr->count++;
|
|
|
|
nm_ptr->map[w] |= mask;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* tipc_nmap_remove - remove a node from a node map
|
|
|
|
*/
|
2014-04-21 10:55:51 +08:00
|
|
|
static void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node)
|
2010-05-11 22:30:14 +08:00
|
|
|
{
|
|
|
|
int n = tipc_node(node);
|
|
|
|
int w = n / WSIZE;
|
|
|
|
u32 mask = (1 << (n % WSIZE));
|
|
|
|
|
|
|
|
if ((nm_ptr->map[w] & mask) != 0) {
|
|
|
|
nm_ptr->map[w] &= ~mask;
|
|
|
|
nm_ptr->count--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* tipc_nmap_diff - find differences between node maps
|
|
|
|
* @nm_a: input node map A
|
|
|
|
* @nm_b: input node map B
|
|
|
|
* @nm_diff: output node map A-B (i.e. nodes of A that are not in B)
|
|
|
|
*/
|
2010-10-13 21:20:35 +08:00
|
|
|
static void tipc_nmap_diff(struct tipc_node_map *nm_a,
|
|
|
|
struct tipc_node_map *nm_b,
|
|
|
|
struct tipc_node_map *nm_diff)
|
2010-05-11 22:30:14 +08:00
|
|
|
{
|
|
|
|
int stop = ARRAY_SIZE(nm_a->map);
|
|
|
|
int w;
|
|
|
|
int b;
|
|
|
|
u32 map;
|
|
|
|
|
|
|
|
memset(nm_diff, 0, sizeof(*nm_diff));
|
|
|
|
for (w = 0; w < stop; w++) {
|
|
|
|
map = nm_a->map[w] ^ (nm_a->map[w] & nm_b->map[w]);
|
|
|
|
nm_diff->map[w] = map;
|
|
|
|
if (map != 0) {
|
|
|
|
for (b = 0 ; b < WSIZE; b++) {
|
|
|
|
if (map & (1 << b))
|
|
|
|
nm_diff->count++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|