mirror of
https://github.com/adulau/aha.git
synced 2025-01-04 07:03:38 +00:00
IPoIB/cm: Fix performance regression on Mellanox
commit 518b1646
("IPoIB/cm: Fix SRQ WR leak") introduced a severe
performance regression on Mellanox cards, because keeping a QP in the
error state for extended periods of time moves hardware to the slow
path (until the QP is destroyed). For example, MPI latency goes from
~3 usecs to ~7 usecs.
Fix this by posting a send WR on one of the QPs that are being
flushed, instead of using a separate drain QP that is kept in the
error state.
This fixes bug <https://bugs.openfabrics.org/show_bug.cgi?id=636>,
reported and bisected by Scott Weitzenkamp at Cisco and debugged by
Sasha Mikheev at Voltaire.
Signed-off-by: Michael S. Tsirkin <mst@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
This commit is contained in:
parent
8b7e15772a
commit
ec56dc0b7f
2 changed files with 37 additions and 40 deletions
|
@ -156,7 +156,7 @@ struct ipoib_cm_data {
|
||||||
* - and then invoke a Destroy QP or Reset QP.
|
* - and then invoke a Destroy QP or Reset QP.
|
||||||
*
|
*
|
||||||
* We use the second option and wait for a completion on the
|
* We use the second option and wait for a completion on the
|
||||||
* rx_drain_qp before destroying QPs attached to our SRQ.
|
* same CQ before destroying QPs attached to our SRQ.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
enum ipoib_cm_state {
|
enum ipoib_cm_state {
|
||||||
|
@ -199,7 +199,6 @@ struct ipoib_cm_dev_priv {
|
||||||
struct ib_srq *srq;
|
struct ib_srq *srq;
|
||||||
struct ipoib_cm_rx_buf *srq_ring;
|
struct ipoib_cm_rx_buf *srq_ring;
|
||||||
struct ib_cm_id *id;
|
struct ib_cm_id *id;
|
||||||
struct ib_qp *rx_drain_qp; /* generates WR described in 10.3.1 */
|
|
||||||
struct list_head passive_ids; /* state: LIVE */
|
struct list_head passive_ids; /* state: LIVE */
|
||||||
struct list_head rx_error_list; /* state: ERROR */
|
struct list_head rx_error_list; /* state: ERROR */
|
||||||
struct list_head rx_flush_list; /* state: FLUSH, drain not started */
|
struct list_head rx_flush_list; /* state: FLUSH, drain not started */
|
||||||
|
|
|
@ -69,8 +69,9 @@ static struct ib_qp_attr ipoib_cm_err_attr = {
|
||||||
|
|
||||||
#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
|
#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
|
||||||
|
|
||||||
static struct ib_recv_wr ipoib_cm_rx_drain_wr = {
|
static struct ib_send_wr ipoib_cm_rx_drain_wr = {
|
||||||
.wr_id = IPOIB_CM_RX_DRAIN_WRID
|
.wr_id = IPOIB_CM_RX_DRAIN_WRID,
|
||||||
|
.opcode = IB_WR_SEND,
|
||||||
};
|
};
|
||||||
|
|
||||||
static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
|
static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
|
||||||
|
@ -163,16 +164,22 @@ partial_error:
|
||||||
|
|
||||||
static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
|
static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
|
||||||
{
|
{
|
||||||
struct ib_recv_wr *bad_wr;
|
struct ib_send_wr *bad_wr;
|
||||||
|
struct ipoib_cm_rx *p;
|
||||||
|
|
||||||
/* rx_drain_qp send queue depth is 1, so
|
/* We only reserved 1 extra slot in CQ for drain WRs, so
|
||||||
* make sure we have at most 1 outstanding WR. */
|
* make sure we have at most 1 outstanding WR. */
|
||||||
if (list_empty(&priv->cm.rx_flush_list) ||
|
if (list_empty(&priv->cm.rx_flush_list) ||
|
||||||
!list_empty(&priv->cm.rx_drain_list))
|
!list_empty(&priv->cm.rx_drain_list))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (ib_post_recv(priv->cm.rx_drain_qp, &ipoib_cm_rx_drain_wr, &bad_wr))
|
/*
|
||||||
ipoib_warn(priv, "failed to post rx_drain wr\n");
|
* QPs on flush list are error state. This way, a "flush
|
||||||
|
* error" WC will be immediately generated for each WR we post.
|
||||||
|
*/
|
||||||
|
p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
|
||||||
|
if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
|
||||||
|
ipoib_warn(priv, "failed to post drain wr\n");
|
||||||
|
|
||||||
list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
|
list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
|
||||||
}
|
}
|
||||||
|
@ -199,10 +206,10 @@ static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
|
||||||
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
||||||
struct ib_qp_init_attr attr = {
|
struct ib_qp_init_attr attr = {
|
||||||
.event_handler = ipoib_cm_rx_event_handler,
|
.event_handler = ipoib_cm_rx_event_handler,
|
||||||
.send_cq = priv->cq, /* does not matter, we never send anything */
|
.send_cq = priv->cq, /* For drain WR */
|
||||||
.recv_cq = priv->cq,
|
.recv_cq = priv->cq,
|
||||||
.srq = priv->cm.srq,
|
.srq = priv->cm.srq,
|
||||||
.cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
|
.cap.max_send_wr = 1, /* For drain WR */
|
||||||
.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
|
.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
|
||||||
.sq_sig_type = IB_SIGNAL_ALL_WR,
|
.sq_sig_type = IB_SIGNAL_ALL_WR,
|
||||||
.qp_type = IB_QPT_RC,
|
.qp_type = IB_QPT_RC,
|
||||||
|
@ -242,6 +249,27 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev,
|
||||||
ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
|
ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Current Mellanox HCA firmware won't generate completions
|
||||||
|
* with error for drain WRs unless the QP has been moved to
|
||||||
|
* RTS first. This work-around leaves a window where a QP has
|
||||||
|
* moved to error asynchronously, but this will eventually get
|
||||||
|
* fixed in firmware, so let's not error out if modify QP
|
||||||
|
* fails.
|
||||||
|
*/
|
||||||
|
qp_attr.qp_state = IB_QPS_RTS;
|
||||||
|
ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
|
||||||
|
if (ret) {
|
||||||
|
ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
|
||||||
|
if (ret) {
|
||||||
|
ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -623,38 +651,11 @@ static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
|
||||||
int ipoib_cm_dev_open(struct net_device *dev)
|
int ipoib_cm_dev_open(struct net_device *dev)
|
||||||
{
|
{
|
||||||
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
struct ipoib_dev_priv *priv = netdev_priv(dev);
|
||||||
struct ib_qp_init_attr qp_init_attr = {
|
|
||||||
.send_cq = priv->cq, /* does not matter, we never send anything */
|
|
||||||
.recv_cq = priv->cq,
|
|
||||||
.cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
|
|
||||||
.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
|
|
||||||
.cap.max_recv_wr = 1,
|
|
||||||
.cap.max_recv_sge = 1, /* FIXME: 0 Seems not to work */
|
|
||||||
.sq_sig_type = IB_SIGNAL_ALL_WR,
|
|
||||||
.qp_type = IB_QPT_UC,
|
|
||||||
};
|
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
|
if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
priv->cm.rx_drain_qp = ib_create_qp(priv->pd, &qp_init_attr);
|
|
||||||
if (IS_ERR(priv->cm.rx_drain_qp)) {
|
|
||||||
printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
|
|
||||||
ret = PTR_ERR(priv->cm.rx_drain_qp);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We put the QP in error state directly. This way, a "flush
|
|
||||||
* error" WC will be immediately generated for each WR we post.
|
|
||||||
*/
|
|
||||||
ret = ib_modify_qp(priv->cm.rx_drain_qp, &ipoib_cm_err_attr, IB_QP_STATE);
|
|
||||||
if (ret) {
|
|
||||||
ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
|
|
||||||
goto err_qp;
|
|
||||||
}
|
|
||||||
|
|
||||||
priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
|
priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
|
||||||
if (IS_ERR(priv->cm.id)) {
|
if (IS_ERR(priv->cm.id)) {
|
||||||
printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
|
printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
|
||||||
|
@ -676,8 +677,6 @@ err_listen:
|
||||||
ib_destroy_cm_id(priv->cm.id);
|
ib_destroy_cm_id(priv->cm.id);
|
||||||
err_cm:
|
err_cm:
|
||||||
priv->cm.id = NULL;
|
priv->cm.id = NULL;
|
||||||
err_qp:
|
|
||||||
ib_destroy_qp(priv->cm.rx_drain_qp);
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -740,7 +739,6 @@ void ipoib_cm_dev_stop(struct net_device *dev)
|
||||||
kfree(p);
|
kfree(p);
|
||||||
}
|
}
|
||||||
|
|
||||||
ib_destroy_qp(priv->cm.rx_drain_qp);
|
|
||||||
cancel_delayed_work(&priv->cm.stale_task);
|
cancel_delayed_work(&priv->cm.stale_task);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue