From ecdc428e4c5d821a07baf4f8b1718faf67b9026f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 12 Nov 2009 11:14:13 -0800 Subject: [PATCH 01/45] IB/mlx4: Remove unneeded code There is no such flag DE - the field is reserved and should be zero. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/qp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 219b10397b4..518d561970a 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -897,7 +897,6 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | (to_mlx4_st(ibqp->qp_type) << 16)); - context->flags |= cpu_to_be32(1 << 8); /* DE? */ if (!(attr_mask & IB_QP_PATH_MIG_STATE)) context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); From 417608c20a4c8397bc5307d949ec01ea0a0dd8e5 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 12 Nov 2009 11:19:44 -0800 Subject: [PATCH 02/45] IB/mlx4: Remove limitation on LSO header size Current code has a limitation: an LSO header is not allowed to cross a 64 byte boundary. This patch removes this limitation by setting the WQE RR for large headers, thus allowing LSO headers of any size. The extra buffer reserved for MLX4_IB_QP_LSO QPs has been doubled, from 64 to 128 bytes, assuming this is a reasonable upper limit for header length. Also, this patch will cause IB_DEVICE_UD_TSO to be set only for HCA FW versions that set MLX4_DEV_CAP_FLAG_BLH; e.g. FW version 2.6.000 and higher. 
Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/mlx4/qp.c | 24 ++++++++++++------------ drivers/net/mlx4/fw.c | 1 + include/linux/mlx4/device.h | 1 + 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 3cb3f47a10b..e596537ff35 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -103,7 +103,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; - if (dev->dev->caps.max_gso_sz) + if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH) props->device_cap_flags |= IB_DEVICE_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 518d561970a..847030c89a8 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -54,7 +54,8 @@ enum { /* * Largest possible UD header: send with GRH and immediate data. */ - MLX4_IB_UD_HEADER_SIZE = 72 + MLX4_IB_UD_HEADER_SIZE = 72, + MLX4_IB_LSO_HEADER_SPARE = 128, }; struct mlx4_ib_sqp { @@ -67,7 +68,8 @@ struct mlx4_ib_sqp { }; enum { - MLX4_IB_MIN_SQ_STRIDE = 6 + MLX4_IB_MIN_SQ_STRIDE = 6, + MLX4_IB_CACHE_LINE_SIZE = 64, }; static const __be32 mlx4_ib_opcode[] = { @@ -261,7 +263,7 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags) case IB_QPT_UD: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + - ((flags & MLX4_IB_QP_LSO) ? 64 : 0); + ((flags & MLX4_IB_QP_LSO) ? 
MLX4_IB_LSO_HEADER_SPARE : 0); case IB_QPT_UC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_raddr_seg); @@ -1466,16 +1468,12 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, struct mlx4_ib_qp *qp, unsigned *lso_seg_len, - __be32 *lso_hdr_sz) + __be32 *lso_hdr_sz, __be32 *blh) { unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); - /* - * This is a temporary limitation and will be removed in - * a forthcoming FW release: - */ - if (unlikely(halign > 64)) - return -EINVAL; + if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE)) + *blh = cpu_to_be32(1 << 6); if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && wr->num_sge > qp->sq.max_gs - (halign >> 4))) @@ -1521,6 +1519,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 dummy; __be32 *lso_wqe; __be32 uninitialized_var(lso_hdr_sz); + __be32 blh; int i; spin_lock_irqsave(&qp->sq.lock, flags); @@ -1529,6 +1528,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, for (nreq = 0; wr; ++nreq, wr = wr->next) { lso_wqe = &dummy; + blh = 0; if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { err = -ENOMEM; @@ -1615,7 +1615,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof (struct mlx4_wqe_datagram_seg) / 16; if (wr->opcode == IB_WR_LSO) { - err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz); + err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh); if (unlikely(err)) { *bad_wr = wr; goto out; @@ -1686,7 +1686,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | - (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); + (ind & qp->sq.wqe_cnt ? 
cpu_to_be32(1 << 31) : 0) | blh; stamp = ind + qp->sq_spare_wqes; ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 3c16602172f..7194be3a289 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -90,6 +90,7 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags) [ 9] = "Q_Key violation counter", [10] = "VMM", [12] = "DPDP", + [15] = "Big LSO headers", [16] = "MW support", [17] = "APM support", [18] = "Atomic ops support", diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ce7cc6c7bcb..e92d1bfdb33 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -61,6 +61,7 @@ enum { MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1 << 8, MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1 << 9, MLX4_DEV_CAP_FLAG_DPDP = 1 << 12, + MLX4_DEV_CAP_FLAG_BLH = 1 << 15, MLX4_DEV_CAP_FLAG_MEM_WINDOW = 1 << 16, MLX4_DEV_CAP_FLAG_APM = 1 << 17, MLX4_DEV_CAP_FLAG_ATOMIC = 1 << 18, From c1ccaf2478f84c2665cf57f981db143aa582d646 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 12 Nov 2009 11:32:27 -0800 Subject: [PATCH 03/45] IB/iser: Rewrite SG handling for RDMA logic After dma-mapping an SG list provided by the SCSI midlayer, iser has to make sure the mapped SG is "aligned for RDMA" in the sense that it is possible to produce one mapping in the HCA IOMMU which represents the whole SG. Next, the mapped SG is formatted for registration with the HCA. This patch re-writes the logic that does the above, to make it clearer and simpler. It also fixes a bug in the "aligned for RDMA" checks, where only an "end" check was done and the "start" check was missing. 
Signed-off-by: Alexander Nezhinsky Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_memory.c | 120 ++++++++++------------ 1 file changed, 55 insertions(+), 65 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index b9453d068e9..274c883ef3e 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -209,6 +209,8 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, mem_copy->copy_buf = NULL; } +#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) + /** * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses * and returns the length of resulting physical address array (may be less than @@ -221,62 +223,52 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, * where --few fragments of the same page-- are present in the SG as * consecutive elements. Also, it handles one entry SG. 
*/ + static int iser_sg_to_page_vec(struct iser_data_buf *data, struct iser_page_vec *page_vec, struct ib_device *ibdev) { - struct scatterlist *sgl = (struct scatterlist *)data->buf; - struct scatterlist *sg; - u64 first_addr, last_addr, page; - int end_aligned; - unsigned int cur_page = 0; + struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf; + u64 start_addr, end_addr, page, chunk_start = 0; unsigned long total_sz = 0; - int i; + unsigned int dma_len; + int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; /* compute the offset of first element */ page_vec->offset = (u64) sgl[0].offset & ~MASK_4K; + new_chunk = 1; + cur_page = 0; for_each_sg(sgl, sg, data->dma_nents, i) { - unsigned int dma_len = ib_sg_dma_len(ibdev, sg); - + start_addr = ib_sg_dma_address(ibdev, sg); + if (new_chunk) + chunk_start = start_addr; + dma_len = ib_sg_dma_len(ibdev, sg); + end_addr = start_addr + dma_len; total_sz += dma_len; - first_addr = ib_sg_dma_address(ibdev, sg); - last_addr = first_addr + dma_len; - - end_aligned = !(last_addr & ~MASK_4K); - - /* continue to collect page fragments till aligned or SG ends */ - while (!end_aligned && (i + 1 < data->dma_nents)) { - sg = sg_next(sg); - i++; - dma_len = ib_sg_dma_len(ibdev, sg); - total_sz += dma_len; - last_addr = ib_sg_dma_address(ibdev, sg) + dma_len; - end_aligned = !(last_addr & ~MASK_4K); + /* collect page fragments until aligned or end of SG list */ + if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { + new_chunk = 0; + continue; } + new_chunk = 1; - /* handle the 1st page in the 1st DMA element */ - if (cur_page == 0) { - page = first_addr & MASK_4K; - page_vec->pages[cur_page] = page; - cur_page++; + /* address of the first page in the contiguous chunk; + masking relevant for the very first SG entry, + which might be unaligned */ + page = chunk_start & MASK_4K; + do { + page_vec->pages[cur_page++] = page; page += SIZE_4K; - } else - page = first_addr; - - for (; page < last_addr; page += SIZE_4K) { - 
page_vec->pages[cur_page] = page; - cur_page++; - } - + } while (page < end_addr); } + page_vec->data_size = total_sz; iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); return cur_page; } -#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) /** * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned @@ -284,42 +276,40 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data, * the number of entries which are aligned correctly. Supports the case where * consecutive SG elements are actually fragments of the same physcial page. */ -static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data, - struct ib_device *ibdev) +static int iser_data_buf_aligned_len(struct iser_data_buf *data, + struct ib_device *ibdev) { - struct scatterlist *sgl, *sg; - u64 end_addr, next_addr; - int i, cnt; - unsigned int ret_len = 0; + struct scatterlist *sgl, *sg, *next_sg = NULL; + u64 start_addr, end_addr; + int i, ret_len, start_check = 0; + + if (data->dma_nents == 1) + return 1; sgl = (struct scatterlist *)data->buf; + start_addr = ib_sg_dma_address(ibdev, sgl); - cnt = 0; for_each_sg(sgl, sg, data->dma_nents, i) { - /* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX " - "offset: %ld sz: %ld\n", i, - (unsigned long)sg_phys(sg), - (unsigned long)sg->offset, - (unsigned long)sg->length); */ - end_addr = ib_sg_dma_address(ibdev, sg) + - ib_sg_dma_len(ibdev, sg); - /* iser_dbg("Checking sg iobuf end address " - "0x%08lX\n", end_addr); */ - if (i + 1 < data->dma_nents) { - next_addr = ib_sg_dma_address(ibdev, sg_next(sg)); - /* are i, i+1 fragments of the same page? 
*/ - if (end_addr == next_addr) { - cnt++; - continue; - } else if (!IS_4K_ALIGNED(end_addr)) { - ret_len = cnt + 1; - break; - } - } - cnt++; + if (start_check && !IS_4K_ALIGNED(start_addr)) + break; + + next_sg = sg_next(sg); + if (!next_sg) + break; + + end_addr = start_addr + ib_sg_dma_len(ibdev, sg); + start_addr = ib_sg_dma_address(ibdev, next_sg); + + if (end_addr == start_addr) { + start_check = 0; + continue; + } else + start_check = 1; + + if (!IS_4K_ALIGNED(end_addr)) + break; } - if (i == data->dma_nents) - ret_len = cnt; /* loop ended */ + ret_len = (next_sg) ? i : i+1; iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", ret_len, data->dma_nents, data); return ret_len; From be504b0b9fbe9ba447c93ef0f5789f377102d555 Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Thu, 12 Nov 2009 15:51:16 -0800 Subject: [PATCH 04/45] mlx4_core: Fix parsing of reserved EQ cap Value returned by firmware is the actual value, not a log. Signed-off-by: Liran Liss Signed-off-by: Roland Dreier --- drivers/net/mlx4/fw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 7194be3a289..04f42ae1eda 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -236,7 +236,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET); dev_cap->max_mpts = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET); - dev_cap->reserved_eqs = 1 << (field & 0xf); + dev_cap->reserved_eqs = field & 0xf; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET); dev_cap->max_eqs = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET); From a7ca1f00ed2921b804d7ebda0f6fca8c9078fa42 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 16 Nov 2009 09:30:33 -0800 Subject: [PATCH 05/45] RDMA/ucma: Add option to manually set IB path Export rdma_set_ib_paths to user space to allow applications to manually set the 
IB path used for connections. This allows alternative ways for a user space application or library to obtain path record information, including retrieving path information from cached data, avoiding direct interaction with the IB SA. The IB SA is a single, centralized entity that can limit scaling on large clusters running MPI applications. Future changes to the rdma cm can expand on this framework to support the full range of features allowed by the IB CM, such as separate forward and reverse paths and APM. Signed-off-by: Sean Hefty Reviewed-By: Jason Gunthorpe Signed-off-by: Roland Dreier --- drivers/infiniband/core/sa_query.c | 6 ++++ drivers/infiniband/core/ucma.c | 49 ++++++++++++++++++++++++++++++ include/rdma/ib_sa.h | 6 ++++ include/rdma/ib_user_sa.h | 16 ++++++++++ include/rdma/rdma_user_cm.h | 6 ++-- 5 files changed, 81 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 82543716d59..7e1ffd8ccd5 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -604,6 +604,12 @@ retry: return ret ? 
ret : id; } +void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec) +{ + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec); +} +EXPORT_SYMBOL(ib_sa_unpack_path); + static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index bb96d3c4b0f..f1cbd26a9de 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -43,6 +43,7 @@ #include #include #include +#include MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); @@ -812,6 +813,51 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, return ret; } +static int ucma_set_ib_path(struct ucma_context *ctx, + struct ib_path_rec_data *path_data, size_t optlen) +{ + struct ib_sa_path_rec sa_path; + struct rdma_cm_event event; + int ret; + + if (optlen % sizeof(*path_data)) + return -EINVAL; + + for (; optlen; optlen -= sizeof(*path_data), path_data++) { + if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL)) + break; + } + + if (!optlen) + return -EINVAL; + + ib_sa_unpack_path(path_data->path_rec, &sa_path); + ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); + if (ret) + return ret; + + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + return ucma_event_handler(ctx->cm_id, &event); +} + +static int ucma_set_option_ib(struct ucma_context *ctx, int optname, + void *optval, size_t optlen) +{ + int ret; + + switch (optname) { + case RDMA_OPTION_IB_PATH: + ret = ucma_set_ib_path(ctx, optval, optlen); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + static int ucma_set_option_level(struct ucma_context *ctx, int level, int optname, void *optval, size_t optlen) { @@ -821,6 +867,9 @@ static int ucma_set_option_level(struct ucma_context *ctx, int level, case RDMA_OPTION_ID: ret = ucma_set_option_id(ctx, optname, optval, optlen); 
break; + case RDMA_OPTION_IB: + ret = ucma_set_option_ib(ctx, optname, optval, optlen); + break; default: ret = -ENOSYS; } diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 3841c1aff69..1082afaed15 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -379,4 +379,10 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr); +/** + * ib_sa_unpack_path - Convert a path record from MAD format to struct + * ib_sa_path_rec. + */ +void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec); + #endif /* IB_SA_H */ diff --git a/include/rdma/ib_user_sa.h b/include/rdma/ib_user_sa.h index 659120157e1..cfc7c9ba781 100644 --- a/include/rdma/ib_user_sa.h +++ b/include/rdma/ib_user_sa.h @@ -35,6 +35,22 @@ #include +enum { + IB_PATH_GMP = 1, + IB_PATH_PRIMARY = (1<<1), + IB_PATH_ALTERNATE = (1<<2), + IB_PATH_OUTBOUND = (1<<3), + IB_PATH_INBOUND = (1<<4), + IB_PATH_INBOUND_REVERSE = (1<<5), + IB_PATH_BIDIRECTIONAL = IB_PATH_OUTBOUND | IB_PATH_INBOUND_REVERSE +}; + +struct ib_path_rec_data { + __u32 flags; + __u32 reserved; + __u32 path_rec[16]; +}; + struct ib_user_path_rec { __u8 dgid[16]; __u8 sgid[16]; diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h index c55705460b8..1d165022c02 100644 --- a/include/rdma/rdma_user_cm.h +++ b/include/rdma/rdma_user_cm.h @@ -215,12 +215,14 @@ struct rdma_ucm_event_resp { /* Option levels */ enum { - RDMA_OPTION_ID = 0 + RDMA_OPTION_ID = 0, + RDMA_OPTION_IB = 1 }; /* Option details */ enum { - RDMA_OPTION_ID_TOS = 0 + RDMA_OPTION_ID_TOS = 0, + RDMA_OPTION_IB_PATH = 1 }; struct rdma_ucm_set_option { From 0f9ea5d2ab5cef732d5abbe62b9e9af3007bae81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2009 14:24:34 -0800 Subject: [PATCH 06/45] RDMA/addr: Use appropriate locking with for_each_netdev() for_each_netdev() should be used with RTNL or dev_base_lock held, or else we risk a crash. 
Signed-off-by: Eric Dumazet Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index bd07803e918..373f1118d57 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -131,6 +131,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: + read_lock(&dev_base_lock); for_each_netdev(&init_net, dev) { if (ipv6_chk_addr(&init_net, &((struct sockaddr_in6 *) addr)->sin6_addr, @@ -139,6 +140,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) break; } } + read_unlock(&dev_base_lock); break; #endif } @@ -391,14 +393,17 @@ static int addr_resolve_local(struct sockaddr *src_in, { struct in6_addr *a; + read_lock(&dev_base_lock); for_each_netdev(&init_net, dev) if (ipv6_chk_addr(&init_net, &((struct sockaddr_in6 *) dst_in)->sin6_addr, dev, 1)) break; - if (!dev) + if (!dev) { + read_unlock(&dev_base_lock); return -EADDRNOTAVAIL; + } a = &((struct sockaddr_in6 *) src_in)->sin6_addr; @@ -416,6 +421,7 @@ static int addr_resolve_local(struct sockaddr *src_in, if (!ret) memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); } + read_unlock(&dev_base_lock); break; } #endif From 1c9b281997b5876c0c8ed62506b56db89d262b57 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 19 Nov 2009 12:55:21 -0800 Subject: [PATCH 07/45] RDMA/cma: Correct detection of SA Created MGID RDMA CM treats AF_INET6 addresses that are either 0 or prefixed with FF1x:A01B::/32 as MGIDs, but the detection for the prefix was buggy; fix it up. 
Signed-off-by: Jason Gunthorpe Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 075317884b5..8bb2cf4031a 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2687,7 +2687,7 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if ((addr->sa_family == AF_INET6) && - ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFF10A01B) == + ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); From e2e626972e652d18520f84d69fc06cfa307d11ff Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 19 Nov 2009 12:55:22 -0800 Subject: [PATCH 08/45] RDMA/cma: Fix AF_INET6 support in multicast joining If joining to an AF_INET6 address, we need to map the address to a MGID in the same way as the IP stack. The old code would just fall through to the IPv4 case and generate garbage. Signed-off-by: Jason Gunthorpe Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 8bb2cf4031a..052b4c01745 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2691,6 +2691,11 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. 
*/ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else if ((addr->sa_family == AF_INET6)) { + ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); + if (id_priv->id.ps == RDMA_PS_UDP) + mc_map[7] = 0x01; /* Use RDMA CM signature */ + *mgid = *(union ib_gid *) (mc_map + 4); } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) From 6266ed6e4164466177238b11ecb825a3a108a3e4 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 12:55:22 -0800 Subject: [PATCH 09/45] RDMA/cma: Replace net_device pointer with index Provide the device interface when resolving route information to ensure that the correct outbound device is used. This will also simplify processing of sin6_scope_id for IPv6 support. Based on work from: David Wilder Jason Gunthorpe Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 14 +++++++++++++- drivers/infiniband/core/cma.c | 2 +- include/rdma/ib_addr.h | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 373f1118d57..788a02ef01d 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -107,7 +107,7 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); if (dst_dev_addr) memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); - dev_addr->src_dev = dev; + dev_addr->bound_dev_if = dev->ifindex; return 0; } EXPORT_SYMBOL(rdma_copy_addr); @@ -117,6 +117,15 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) struct net_device *dev; int ret = -EADDRNOTAVAIL; + if (dev_addr->bound_dev_if) { + dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (!dev) + return -ENODEV; + ret = rdma_copy_addr(dev_addr, dev, NULL); + dev_put(dev); + return ret; + } + switch (addr->sa_family) { case AF_INET: dev = 
ip_dev_find(&init_net, @@ -231,6 +240,8 @@ static int addr4_resolve_remote(struct sockaddr_in *src_in, memset(&fl, 0, sizeof fl); fl.nl_u.ip4_u.daddr = dst_ip; fl.nl_u.ip4_u.saddr = src_ip; + fl.oif = addr->bound_dev_if; + ret = ip_route_output_key(&init_net, &rt, &fl); if (ret) goto out; @@ -279,6 +290,7 @@ static int addr6_resolve_remote(struct sockaddr_in6 *src_in, memset(&fl, 0, sizeof fl); fl.nl_u.ip6_u.daddr = dst_in->sin6_addr; fl.nl_u.ip6_u.saddr = src_in->sin6_addr; + fl.oif = addr->bound_dev_if; dst = ip6_route_output(&init_net, NULL, &fl); if (!dst) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 052b4c01745..699ad12b3a2 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2820,7 +2820,7 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id dev_addr = &id_priv->id.route.addr.dev_addr; - if ((dev_addr->src_dev == ndev) && + if ((dev_addr->bound_dev_if == ndev->ifindex) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 483057b2f4b..27f17cc2c91 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -61,7 +61,7 @@ struct rdma_dev_addr { unsigned char dst_dev_addr[MAX_ADDR_LEN]; unsigned char broadcast[MAX_ADDR_LEN]; enum rdma_node_type dev_type; - struct net_device *src_dev; + int bound_dev_if; }; /** From d2e0886245aa9eebc1a4710c861d263b09eac493 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 12:55:22 -0800 Subject: [PATCH 10/45] IB/addr: Verify source and destination address families match If a source address is provided, verify that the address family matches that of the destination address. If the source is not specified, use the same address family as the destination. 
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 788a02ef01d..b59ba7ccef0 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -461,18 +461,27 @@ int rdma_resolve_ip(struct rdma_addr_client *client, if (!req) return -ENOMEM; - if (src_addr) - memcpy(&req->src_addr, src_addr, ip_addr_size(src_addr)); - memcpy(&req->dst_addr, dst_addr, ip_addr_size(dst_addr)); + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) { + ret = -EINVAL; + goto err; + } + + memcpy(src_in, src_addr, ip_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + memcpy(dst_in, dst_addr, ip_addr_size(dst_addr)); req->addr = addr; req->callback = callback; req->context = context; req->client = client; atomic_inc(&client->refcount); - src_in = (struct sockaddr *) &req->src_addr; - dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve_local(src_in, dst_in, addr); if (req->status == -EADDRNOTAVAIL) req->status = addr_resolve_remote(src_in, dst_in, addr); @@ -490,10 +499,12 @@ int rdma_resolve_ip(struct rdma_addr_client *client, default: ret = req->status; atomic_dec(&client->refcount); - kfree(req); - break; + goto err; } return ret; +err: + kfree(req); + return ret; } EXPORT_SYMBOL(rdma_resolve_ip); From c4315d85f9b76834289fd503796c01b8311c4b84 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 12:57:18 -0800 Subject: [PATCH 11/45] IB/addr: Store net_device type instead of translating to RDMA transport The struct rdma_dev_addr stores net_device address information: the source device address, destination hardware address, and broadcast address. 
For consistency, store the net_device type rather than converting it to the rdma_node_type. The type indicates the format of the various hardware addresses, which is what we're concerned with, and not the RDMA node type that the address may map to. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 13 +------------ drivers/infiniband/core/cma.c | 6 +++--- include/rdma/ib_addr.h | 3 ++- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index b59ba7ccef0..de5fe161a1b 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -92,17 +91,7 @@ EXPORT_SYMBOL(rdma_addr_unregister_client); int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, const unsigned char *dst_dev_addr) { - switch (dev->type) { - case ARPHRD_INFINIBAND: - dev_addr->dev_type = RDMA_NODE_IB_CA; - break; - case ARPHRD_ETHER: - dev_addr->dev_type = RDMA_NODE_RNIC; - break; - default: - return -EADDRNOTAVAIL; - } - + dev_addr->dev_type = dev->type; memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); if (dst_dev_addr) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 699ad12b3a2..b305b5c17f8 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -330,11 +330,11 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) union ib_gid gid; int ret = -ENODEV; - switch (rdma_node_get_transport(dev_addr->dev_type)) { - case RDMA_TRANSPORT_IB: + switch (dev_addr->dev_type) { + case ARPHRD_INFINIBAND: ib_addr_get_sgid(dev_addr, &gid); break; - case RDMA_TRANSPORT_IWARP: + case ARPHRD_ETHER: iw_addr_get_sgid(dev_addr, &gid); break; default: diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 27f17cc2c91..3a39c55d2b9 100644 --- 
a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -60,7 +61,7 @@ struct rdma_dev_addr { unsigned char src_dev_addr[MAX_ADDR_LEN]; unsigned char dst_dev_addr[MAX_ADDR_LEN]; unsigned char broadcast[MAX_ADDR_LEN]; - enum rdma_node_type dev_type; + unsigned short dev_type; int bound_dev_if; }; From 6f8372b69c3198e06cecb1df2cb9682d0c55e657 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 13:26:06 -0800 Subject: [PATCH 12/45] RDMA/cm: fix loopback address support The RDMA CM is intended to support the use of a loopback address when establishing a connection; however, the behavior of the CM when loopback addresses are used is confusing and does not always work, depending on whether loopback was specified by the server, the client, or both. The defined behavior of rdma_bind_addr is to associate an RDMA device with an rdma_cm_id, as long as the user specified a non-zero address (i.e., they weren't just trying to reserve a port). Currently, if the loopback address is passed to rdma_bind_addr, no device is associated with the rdma_cm_id. Fix this. If a loopback address is specified by the client as the destination address for a connection, it will fail to establish a connection. This is true even if the server is listening across all addresses or on the loopback address itself. The issue is that the server tries to translate the IP address carried in the REQ message to a local net_device address, which fails. The translation is not needed in this case, since the REQ carries the actual HW address that should be used. Finally, clean up loopback support to be more transport neutral. Replace separate calls to get/set the sgid and dgid from the device address with a single call that behaves correctly depending on the format of the device address. And support both IPv4 and IPv6 address formats. 
Signed-off-by: Sean Hefty [ Fixed RDS build by s/ib_addr_get/rdma_addr_get/ - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 77 +++++++++++++++++++--------------- drivers/infiniband/core/ucma.c | 8 ++-- include/rdma/ib_addr.h | 31 +++++--------- net/rds/ib.c | 4 +- net/rds/iw.c | 4 +- 5 files changed, 61 insertions(+), 63 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index b305b5c17f8..38867a46d39 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -330,17 +330,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) union ib_gid gid; int ret = -ENODEV; - switch (dev_addr->dev_type) { - case ARPHRD_INFINIBAND: - ib_addr_get_sgid(dev_addr, &gid); - break; - case ARPHRD_ETHER: - iw_addr_get_sgid(dev_addr, &gid); - break; - default: - return -ENODEV; - } - + rdma_addr_get_sgid(dev_addr, &gid); list_for_each_entry(cma_dev, &dev_list, list) { ret = ib_find_cached_gid(cma_dev->device, &gid, &id_priv->id.port_num, NULL); @@ -1032,11 +1022,17 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; - ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); - ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, - &id->route.addr.dev_addr); - if (ret) - goto destroy_id; + if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) { + rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); + ib_addr_set_pkey(&rt->addr.dev_addr, rt->path_rec[0].pkey); + } else { + ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr, + &rt->addr.dev_addr); + if (ret) + goto destroy_id; + } + rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = CMA_CONNECT; @@ -1071,10 +1067,12 @@ static struct rdma_id_private 
*cma_new_udp_id(struct rdma_cm_id *listen_id, cma_save_net_info(&id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); - ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, - &id->route.addr.dev_addr); - if (ret) - goto err; + if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { + ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, + &id->route.addr.dev_addr); + if (ret) + goto err; + } id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = CMA_CONNECT; @@ -1565,8 +1563,8 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct sockaddr_in6 *sin6; memset(&path_rec, 0, sizeof path_rec); - ib_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); - ib_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); + rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); + rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; @@ -1781,7 +1779,11 @@ port_found: if (ret) goto out; - ib_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); + id_priv->id.route.addr.dev_addr.dev_type = + (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB) ? 
+ ARPHRD_INFINIBAND : ARPHRD_ETHER; + + rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); @@ -1839,7 +1841,7 @@ out: static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; - struct sockaddr_in *src_in, *dst_in; + struct sockaddr *src, *dst; union ib_gid gid; int ret; @@ -1853,14 +1855,19 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) goto err; } - ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); - ib_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); + rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - if (cma_zero_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)) { - src_in = (struct sockaddr_in *)&id_priv->id.route.addr.src_addr; - dst_in = (struct sockaddr_in *)&id_priv->id.route.addr.dst_addr; - src_in->sin_family = dst_in->sin_family; - src_in->sin_addr.s_addr = dst_in->sin_addr.s_addr; + src = (struct sockaddr *) &id_priv->id.route.addr.src_addr; + if (cma_zero_addr(src)) { + dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr; + if ((src->sa_family = dst->sa_family) == AF_INET) { + ((struct sockaddr_in *) src)->sin_addr.s_addr = + ((struct sockaddr_in *) dst)->sin_addr.s_addr; + } else { + ipv6_addr_copy(&((struct sockaddr_in6 *) src)->sin6_addr, + &((struct sockaddr_in6 *) dst)->sin6_addr); + } } work->id = id_priv; @@ -2089,7 +2096,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) return -EINVAL; - if (!cma_any_addr(addr)) { + if (cma_loopback_addr(addr)) { + ret = cma_bind_loopback(id_priv); + } else if (!cma_zero_addr(addr)) { ret = rdma_translate_ip(addr, &id->route.addr.dev_addr); if (ret) goto err1; @@ -2108,7 +2117,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) return 
0; err2: - if (!cma_any_addr(addr)) { + if (id_priv->cma_dev) { mutex_lock(&lock); cma_detach_from_dev(id_priv); mutex_unlock(&lock); @@ -2721,7 +2730,7 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); if (id_priv->id.ps == RDMA_PS_UDP) rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); - ib_addr_get_sgid(dev_addr, &rec.port_gid); + rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = 1; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index f1cbd26a9de..b2e16c332d5 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -563,10 +563,10 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, switch (route->num_paths) { case 0: dev_addr = &route->addr.dev_addr; - ib_addr_get_dgid(dev_addr, - (union ib_gid *) &resp->ib_route[0].dgid); - ib_addr_get_sgid(dev_addr, - (union ib_gid *) &resp->ib_route[0].sgid); + rdma_addr_get_dgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].dgid); + rdma_addr_get_sgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); break; case 2: diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 3a39c55d2b9..fa0d52b8e62 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -122,40 +122,29 @@ static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr, memcpy(gid, dev_addr->broadcast + 4, sizeof *gid); } -static inline void ib_addr_get_sgid(struct rdma_dev_addr *dev_addr, - union ib_gid *gid) +static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) { - memcpy(gid, dev_addr->src_dev_addr + 4, sizeof *gid); + return dev_addr->dev_type == ARPHRD_INFINIBAND ? 
4 : 0; } -static inline void ib_addr_set_sgid(struct rdma_dev_addr *dev_addr, - union ib_gid *gid) +static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { - memcpy(dev_addr->src_dev_addr + 4, gid, sizeof *gid); + memcpy(gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } -static inline void ib_addr_get_dgid(struct rdma_dev_addr *dev_addr, - union ib_gid *gid) +static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { - memcpy(gid, dev_addr->dst_dev_addr + 4, sizeof *gid); + memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } -static inline void ib_addr_set_dgid(struct rdma_dev_addr *dev_addr, - union ib_gid *gid) +static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { - memcpy(dev_addr->dst_dev_addr + 4, gid, sizeof *gid); + memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } -static inline void iw_addr_get_sgid(struct rdma_dev_addr *dev_addr, - union ib_gid *gid) +static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { - memcpy(gid, dev_addr->src_dev_addr, sizeof *gid); -} - -static inline void iw_addr_get_dgid(struct rdma_dev_addr *dev_addr, - union ib_gid *gid) -{ - memcpy(gid, dev_addr->dst_dev_addr, sizeof *gid); + memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } #endif /* IB_ADDR_H */ diff --git a/net/rds/ib.c b/net/rds/ib.c index 536ebe5d3f6..3b899236104 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -182,8 +182,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, ic = conn->c_transport_data; dev_addr = &ic->i_cm_id->route.addr.dev_addr; - ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); - ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + rdma_addr_get_dgid(dev_addr, (union ib_gid *) 
&iinfo->dst_gid); rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); iinfo->max_send_wr = ic->i_send_ring.w_nr; diff --git a/net/rds/iw.c b/net/rds/iw.c index db224f7c293..b28fa8525b2 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -184,8 +184,8 @@ static int rds_iw_conn_info_visitor(struct rds_connection *conn, ic = conn->c_transport_data; dev_addr = &ic->i_cm_id->route.addr.dev_addr; - ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); - ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); iinfo->max_send_wr = ic->i_send_ring.w_nr; From 923c100ef019bf15fb89b6fa3d3ad0485d25d59b Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 13:26:51 -0800 Subject: [PATCH 13/45] IB/addr: Simplify resolving IPv4 addresses Merge resolve local/remote address resolution into a single data flow to ensure consistent access and use of the local routing tables. 
Based on work from: David Wilder Jason Gunthorpe Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 81 ++++++++++------------------------ 1 file changed, 23 insertions(+), 58 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index de5fe161a1b..38a7184ea74 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -184,17 +184,6 @@ static void addr_send_arp(struct sockaddr *dst_in) memset(&fl, 0, sizeof fl); switch (dst_in->sa_family) { - case AF_INET: - fl.nl_u.ip4_u.daddr = - ((struct sockaddr_in *) dst_in)->sin_addr.s_addr; - - if (ip_route_output_key(&init_net, &rt, &fl)) - return; - - neigh_event_send(rt->u.dst.neighbour, NULL); - ip_rt_put(rt); - break; - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: { @@ -215,9 +204,9 @@ static void addr_send_arp(struct sockaddr *dst_in) } } -static int addr4_resolve_remote(struct sockaddr_in *src_in, - struct sockaddr_in *dst_in, - struct rdma_dev_addr *addr) +static int addr4_resolve(struct sockaddr_in *src_in, + struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr) { __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; @@ -235,6 +224,16 @@ static int addr4_resolve_remote(struct sockaddr_in *src_in, if (ret) goto out; + src_in->sin_family = AF_INET; + src_in->sin_addr.s_addr = rt->rt_src; + + if (rt->idev->dev->flags & IFF_LOOPBACK) { + ret = rdma_translate_ip((struct sockaddr *) dst_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); + goto put; + } + /* If the device does ARP internally, return 'done' */ if (rt->idev->dev->flags & IFF_NOARP) { rdma_copy_addr(addr, rt->idev->dev, NULL); @@ -242,21 +241,14 @@ static int addr4_resolve_remote(struct sockaddr_in *src_in, } neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev); - if (!neigh) { + if (!neigh || !(neigh->nud_state & NUD_VALID)) { + 
neigh_event_send(rt->u.dst.neighbour, NULL); ret = -ENODATA; + if (neigh) + goto release; goto put; } - if (!(neigh->nud_state & NUD_VALID)) { - ret = -ENODATA; - goto release; - } - - if (!src_ip) { - src_in->sin_family = dst_in->sin_family; - src_in->sin_addr.s_addr = rt->rt_src; - } - ret = rdma_copy_addr(addr, neigh->dev, neigh->ha); release: neigh_release(neigh); @@ -305,12 +297,12 @@ static int addr6_resolve_remote(struct sockaddr_in6 *src_in, } #endif -static int addr_resolve_remote(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) +static int addr_resolve(struct sockaddr *src_in, + struct sockaddr *dst_in, + struct rdma_dev_addr *addr) { if (src_in->sa_family == AF_INET) { - return addr4_resolve_remote((struct sockaddr_in *) src_in, + return addr4_resolve((struct sockaddr_in *) src_in, (struct sockaddr_in *) dst_in, addr); } else return addr6_resolve_remote((struct sockaddr_in6 *) src_in, @@ -330,8 +322,7 @@ static void process_req(struct work_struct *work) if (req->status == -ENODATA) { src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve_remote(src_in, dst_in, - req->addr); + req->status = addr_resolve(src_in, dst_in, req->addr); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) @@ -363,32 +354,6 @@ static int addr_resolve_local(struct sockaddr *src_in, int ret; switch (dst_in->sa_family) { - case AF_INET: - { - __be32 src_ip = ((struct sockaddr_in *) src_in)->sin_addr.s_addr; - __be32 dst_ip = ((struct sockaddr_in *) dst_in)->sin_addr.s_addr; - - dev = ip_dev_find(&init_net, dst_ip); - if (!dev) - return -EADDRNOTAVAIL; - - if (ipv4_is_zeronet(src_ip)) { - src_in->sa_family = dst_in->sa_family; - ((struct sockaddr_in *) src_in)->sin_addr.s_addr = dst_ip; - ret = rdma_copy_addr(addr, dev, dev->dev_addr); - } else if (ipv4_is_loopback(src_ip)) { - ret = rdma_translate_ip(dst_in, addr); - if 
(!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } else { - ret = rdma_translate_ip(src_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } - dev_put(dev); - break; - } - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: { @@ -473,7 +438,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->status = addr_resolve_local(src_in, dst_in, addr); if (req->status == -EADDRNOTAVAIL) - req->status = addr_resolve_remote(src_in, dst_in, addr); + req->status = addr_resolve(src_in, dst_in, addr); switch (req->status) { case 0: From d14714df61681cfecf945a58436edf197327e87f Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 19 Nov 2009 16:46:25 -0800 Subject: [PATCH 15/45] IB/addr: Fix IPv6 routing lookup Include link scope as part of address resolution. Combine local and remote address resolution into a single, simpler code path. Fix error checking in the IPv6 routing lookups. Based on work from: David Wilder Jason Gunthorpe Signed-off-by: Sean Hefty [ Fix up cma_check_linklocal() for !IPV6 case. 
- Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 148 ++++++++++----------------------- drivers/infiniband/core/cma.c | 47 +++++++---- 2 files changed, 76 insertions(+), 119 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 38a7184ea74..abbb06996f9 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -176,34 +176,6 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } -static void addr_send_arp(struct sockaddr *dst_in) -{ - struct rtable *rt; - struct flowi fl; - - memset(&fl, 0, sizeof fl); - - switch (dst_in->sa_family) { -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case AF_INET6: - { - struct dst_entry *dst; - - fl.nl_u.ip6_u.daddr = - ((struct sockaddr_in6 *) dst_in)->sin6_addr; - - dst = ip6_route_output(&init_net, NULL, &fl); - if (!dst) - return; - - neigh_event_send(dst->neighbour, NULL); - dst_release(dst); - break; - } -#endif - } -} - static int addr4_resolve(struct sockaddr_in *src_in, struct sockaddr_in *dst_in, struct rdma_dev_addr *addr) @@ -259,39 +231,63 @@ out: } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static int addr6_resolve_remote(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) +static int addr6_resolve(struct sockaddr_in6 *src_in, + struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr) { struct flowi fl; struct neighbour *neigh; struct dst_entry *dst; - int ret = -ENODATA; + int ret; memset(&fl, 0, sizeof fl); - fl.nl_u.ip6_u.daddr = dst_in->sin6_addr; - fl.nl_u.ip6_u.saddr = src_in->sin6_addr; + ipv6_addr_copy(&fl.fl6_dst, &dst_in->sin6_addr); + ipv6_addr_copy(&fl.fl6_src, &src_in->sin6_addr); fl.oif = addr->bound_dev_if; dst = ip6_route_output(&init_net, NULL, &fl); - if (!dst) - return ret; + if ((ret = dst->error)) + goto put; - if (dst->dev->flags & IFF_NOARP) { - ret = rdma_copy_addr(addr, dst->dev, NULL); - } else { - neigh = dst->neighbour; 
- if (neigh && (neigh->nud_state & NUD_VALID)) - ret = rdma_copy_addr(addr, neigh->dev, neigh->ha); + if (ipv6_addr_any(&fl.fl6_src)) { + ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev, + &fl.fl6_dst, 0, &fl.fl6_src); + if (ret) + goto put; + + src_in->sin6_family = AF_INET6; + ipv6_addr_copy(&src_in->sin6_addr, &fl.fl6_src); } + if (dst->dev->flags & IFF_LOOPBACK) { + ret = rdma_translate_ip((struct sockaddr *) dst_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); + goto put; + } + + /* If the device does ARP internally, return 'done' */ + if (dst->dev->flags & IFF_NOARP) { + ret = rdma_copy_addr(addr, dst->dev, NULL); + goto put; + } + + neigh = dst->neighbour; + if (!neigh || !(neigh->nud_state & NUD_VALID)) { + neigh_event_send(dst->neighbour, NULL); + ret = -ENODATA; + goto put; + } + + ret = rdma_copy_addr(addr, dst->dev, neigh->ha); +put: dst_release(dst); return ret; } #else -static int addr6_resolve_remote(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) +static int addr6_resolve(struct sockaddr_in6 *src_in, + struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr) { return -EADDRNOTAVAIL; } @@ -305,7 +301,7 @@ static int addr_resolve(struct sockaddr *src_in, return addr4_resolve((struct sockaddr_in *) src_in, (struct sockaddr_in *) dst_in, addr); } else - return addr6_resolve_remote((struct sockaddr_in6 *) src_in, + return addr6_resolve((struct sockaddr_in6 *) src_in, (struct sockaddr_in6 *) dst_in, addr); } @@ -346,60 +342,6 @@ static void process_req(struct work_struct *work) } } -static int addr_resolve_local(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) -{ - struct net_device *dev; - int ret; - - switch (dst_in->sa_family) { -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case AF_INET6: - { - struct in6_addr *a; - - read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) - if (ipv6_chk_addr(&init_net, - 
&((struct sockaddr_in6 *) dst_in)->sin6_addr, - dev, 1)) - break; - - if (!dev) { - read_unlock(&dev_base_lock); - return -EADDRNOTAVAIL; - } - - a = &((struct sockaddr_in6 *) src_in)->sin6_addr; - - if (ipv6_addr_any(a)) { - src_in->sa_family = dst_in->sa_family; - ((struct sockaddr_in6 *) src_in)->sin6_addr = - ((struct sockaddr_in6 *) dst_in)->sin6_addr; - ret = rdma_copy_addr(addr, dev, dev->dev_addr); - } else if (ipv6_addr_loopback(a)) { - ret = rdma_translate_ip(dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } else { - ret = rdma_translate_ip(src_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } - read_unlock(&dev_base_lock); - break; - } -#endif - - default: - ret = -EADDRNOTAVAIL; - break; - } - - return ret; -} - int rdma_resolve_ip(struct rdma_addr_client *client, struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, @@ -436,10 +378,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->client = client; atomic_inc(&client->refcount); - req->status = addr_resolve_local(src_in, dst_in, addr); - if (req->status == -EADDRNOTAVAIL) - req->status = addr_resolve(src_in, dst_in, addr); - + req->status = addr_resolve(src_in, dst_in, addr); switch (req->status) { case 0: req->timeout = jiffies; @@ -448,7 +387,6 @@ int rdma_resolve_ip(struct rdma_addr_client *client, case -ENODATA: req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; queue_req(req); - addr_send_arp(dst_in); break; default: ret = req->status; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 38867a46d39..fbdd7310600 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1472,15 +1472,6 @@ static void cma_listen_on_all(struct rdma_id_private *id_priv) mutex_unlock(&lock); } -static int cma_bind_any(struct rdma_cm_id *id, sa_family_t af) -{ - struct sockaddr_storage addr_in; - - memset(&addr_in, 0, sizeof addr_in); 
- addr_in.ss_family = af; - return rdma_bind_addr(id, (struct sockaddr *) &addr_in); -} - int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv; @@ -1488,7 +1479,8 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == CMA_IDLE) { - ret = cma_bind_any(id, AF_INET); + ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; + ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); if (ret) return ret; } @@ -1885,10 +1877,14 @@ err: static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr) { - if (src_addr && src_addr->sa_family) - return rdma_bind_addr(id, src_addr); - else - return cma_bind_any(id, dst_addr->sa_family); + if (!src_addr || !src_addr->sa_family) { + src_addr = (struct sockaddr *) &id->route.addr.src_addr; + if ((src_addr->sa_family = dst_addr->sa_family) == AF_INET6) { + ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = + ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; + } + } + return rdma_bind_addr(id, src_addr); } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, @@ -2084,6 +2080,25 @@ static int cma_get_port(struct rdma_id_private *id_priv) return ret; } +static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, + struct sockaddr *addr) +{ +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct sockaddr_in6 *sin6; + + if (addr->sa_family != AF_INET6) + return 0; + + sin6 = (struct sockaddr_in6 *) addr; + if ((ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) && + !sin6->sin6_scope_id) + return -EINVAL; + + dev_addr->bound_dev_if = sin6->sin6_scope_id; +#endif + return 0; +} + int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; @@ -2096,6 +2111,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) return 
-EINVAL; + ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); + if (ret) + goto err1; + if (cma_loopback_addr(addr)) { ret = cma_bind_loopback(id_priv); } else if (!cma_zero_addr(addr)) { From 0cd4d0fd9b0a4e10c091fc6316d1bf92885dcd9c Mon Sep 17 00:00:00 2001 From: "David J. Wilder" Date: Wed, 9 Dec 2009 10:03:00 -0800 Subject: [PATCH 16/45] IPoIB: Clear ipoib_neigh.dgid in ipoib_neigh_alloc() IPoIB can miss a change in destination GID under some conditions. The problem is caused when ipoib_neigh->dgid contains a stale address. The fix is to set ipoib_neigh->dgid to zero in ipoib_neigh_alloc(). This can happen when a system using bonding on its IPoIB interfaces has switched its active interface from interface A to B and back to A. The system that fails over will not correctly process the 2nd address change, as described below. When an address has changed, neighbor->ha is updated with the new address. Each neighbor has an associated ipoib_neigh. ipoib_neigh->dgid also holds a copy of the remote node's hardware address. When an address changes, neighbor->ha is updated by the network layer (ARP code) with the new address. IPoIB detects this change in ipoib_start_xmit() by comparing neighbor->ha with ipoib_neigh->dgid. The bug is that ipoib_neigh->dgid may already contain the new address (A), thus the change from B to A is missed by ipoib. Here is the sequence of events: ipoib_neigh->dgid = A and neighbor->ha = A The address is switched to B (the first switch) neighbor->ha = B The change is seen in ipoib_start_xmit() -- neighbor->ha != ipoib_neigh->dgid so ipoib_neigh is released, and a new one is allocated. The allocator may return the same chunk of memory that was just released, therefore ipoib_neigh->dgid still contains A at this point. 
ipoib_neigh->dgid should be updated in neigh_add_path(), but if the following conditions are true dgid is not updated: 1) __path_find() returns a path 2) path->ah is NULL The remote system now switches from address B to A, neighbor->ha is updated to A. Now we have again : ipoib_neigh->dgid = A and neighbor->ha = A Since the addresses are the same ipoib won't process the change in address. Fix this by zeroing out the dgid field when allocating a new struct ipoib_neigh. Signed-off-by: David Wilder Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 2bf5116deec..df3eb8c9fd9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -884,6 +884,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, neigh->neighbour = neighbour; neigh->dev = dev; + memset(&neigh->dgid.raw, 0, sizeof (union ib_gid)); *to_ipoib_neigh(neighbour) = neigh; skb_queue_head_init(&neigh->queue); ipoib_cm_set(neigh, NULL); From 598cb6f327c99ceaf81c45c32504669b2028712b Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 9 Dec 2009 10:05:28 -0800 Subject: [PATCH 17/45] IB/ipath: Use bitmap_weight() Use bitmap_weight() instead of finding all set bits in bitmap by hand. 
Signed-off-by: Akinobu Mita Cc: Ralph Campbell Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_driver.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 013d1380e77..d2787fe8030 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "ipath_kernel.h" #include "ipath_verbs.h" @@ -1697,7 +1698,7 @@ void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, unsigned len, int avail) { unsigned long flags; - unsigned end, cnt = 0, next; + unsigned end, cnt = 0; /* There are two bits per send buffer (busy and generation) */ start *= 2; @@ -1748,12 +1749,7 @@ void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, if (dd->ipath_pioupd_thresh) { end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); - next = find_first_bit(dd->ipath_pioavailkernel, end); - while (next < end) { - cnt++; - next = find_next_bit(dd->ipath_pioavailkernel, end, - next + 1); - } + cnt = bitmap_weight(dd->ipath_pioavailkernel, end); } spin_unlock_irqrestore(&ipath_pioavail_lock, flags); From 9420269428b3dc80c98e52beac60a3976fbef7d2 Mon Sep 17 00:00:00 2001 From: Alexander Schmidt Date: Wed, 9 Dec 2009 10:11:04 -0800 Subject: [PATCH 18/45] IB/ehca: Rework destroy_eq() The ibmebus_free_irq() function, which might sleep, was called with interrupts disabled. To fix this, make sure that no interrupts are running by killing the interrupt tasklet. Also lock the shca_list_lock to protect against the poll_eqs_timer running concurrently. 
Signed-off-by: Alexander Schmidt Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_classes.h | 1 + drivers/infiniband/hw/ehca/ehca_eq.c | 9 ++++++--- drivers/infiniband/hw/ehca/ehca_main.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index c825142a2fb..0136abd50dd 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -375,6 +375,7 @@ extern rwlock_t ehca_qp_idr_lock; extern rwlock_t ehca_cq_idr_lock; extern struct idr ehca_qp_idr; extern struct idr ehca_cq_idr; +extern spinlock_t shca_list_lock; extern int ehca_static_rate; extern int ehca_port_act_time; diff --git a/drivers/infiniband/hw/ehca/ehca_eq.c b/drivers/infiniband/hw/ehca/ehca_eq.c index 523e733c630..3b87589b8ea 100644 --- a/drivers/infiniband/hw/ehca/ehca_eq.c +++ b/drivers/infiniband/hw/ehca/ehca_eq.c @@ -169,12 +169,15 @@ int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq) unsigned long flags; u64 h_ret; - spin_lock_irqsave(&eq->spinlock, flags); ibmebus_free_irq(eq->ist, (void *)shca); - h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq); + spin_lock_irqsave(&shca_list_lock, flags); + eq->is_initialized = 0; + spin_unlock_irqrestore(&shca_list_lock, flags); - spin_unlock_irqrestore(&eq->spinlock, flags); + tasklet_kill(&eq->interrupt_task); + + h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq); if (h_ret != H_SUCCESS) { ehca_err(&shca->ib_device, "Can't free EQ resources."); diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index fb2d83c5bf0..129a6bebd6e 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -123,7 +123,7 @@ DEFINE_IDR(ehca_qp_idr); DEFINE_IDR(ehca_cq_idr); static LIST_HEAD(shca_list); /* list of all registered ehcas */ -static DEFINE_SPINLOCK(shca_list_lock); +DEFINE_SPINLOCK(shca_list_lock); static struct 
timer_list poll_eqs_timer; From e5dec39474fac3458ad6a649eab8cabfc977ae87 Mon Sep 17 00:00:00 2001 From: Frank Zago Date: Wed, 9 Dec 2009 13:51:36 -0800 Subject: [PATCH 19/45] RDMA/nes: In nes_post_send() always set bad_wr on error On error, set bad_wr in nes_post_send(). Stop processing ib_wr queue when an error is detected. Signed-off-by: Frank Zago Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index a680c42d6e8..25b52d2478a 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3386,8 +3386,10 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, wqe_count = 0; total_payload_length = 0; - if (nesqp->ibqp_state > IB_QPS_RTS) - return -EINVAL; + if (nesqp->ibqp_state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } spin_lock_irqsave(&nesqp->lock, flags); @@ -3498,6 +3500,9 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, break; } + if (err) + break; + if (ib_wr->send_flags & IB_SEND_SIGNALED) { wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; } @@ -3522,6 +3527,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, spin_unlock_irqrestore(&nesqp->lock, flags); +out: if (err) *bad_wr = ib_wr; return err; From 4293fdc115e1e4f83dcb9ec6cbd3a54c563835f0 Mon Sep 17 00:00:00 2001 From: Frank Zago Date: Wed, 9 Dec 2009 13:51:36 -0800 Subject: [PATCH 20/45] RDMA/nes: In nes_post_recv() always set bad_wr on error On error, set bad_wr in nes_post_recv(). Stop processing ib_wr queue when an error is detected. 
Signed-off-by: Frank Zago Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 25b52d2478a..0b17c01bb9f 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3554,8 +3554,10 @@ static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, u32 counter; u32 total_payload_length; - if (nesqp->ibqp_state > IB_QPS_RTS) - return -EINVAL; + if (nesqp->ibqp_state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } spin_lock_irqsave(&nesqp->lock, flags); @@ -3618,6 +3620,7 @@ static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, spin_unlock_irqrestore(&nesqp->lock, flags); +out: if (err) *bad_wr = ib_wr; return err; From 649fe4aeab8c9b90eb31c899791534add0c78e04 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 13:51:37 -0800 Subject: [PATCH 21/45] RDMA/nes: Add support for IB_WR_*INV Add support for IB_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV and IB_WR_LOCAL_INV. 
Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 205 ++++++++++++++------------ 1 file changed, 114 insertions(+), 91 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 0b17c01bb9f..499dd78cb82 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3373,18 +3373,12 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, struct nes_device *nesdev = nesvnic->nesdev; struct nes_qp *nesqp = to_nesqp(ibqp); struct nes_hw_qp_wqe *wqe; - int err; + int err = 0; u32 qsize = nesqp->hwqp.sq_size; u32 head; - u32 wqe_misc; - u32 wqe_count; + u32 wqe_misc = 0; + u32 wqe_count = 0; u32 counter; - u32 total_payload_length; - - err = 0; - wqe_misc = 0; - wqe_count = 0; - total_payload_length = 0; if (nesqp->ibqp_state > IB_QPS_RTS) { err = -EINVAL; @@ -3415,91 +3409,117 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, u64temp = (u64)(ib_wr->wr_id); set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp); - switch (ib_wr->opcode) { - case IB_WR_SEND: - if (ib_wr->send_flags & IB_SEND_SOLICITED) { - wqe_misc = NES_IWARP_SQ_OP_SENDSE; - } else { - wqe_misc = NES_IWARP_SQ_OP_SEND; - } - if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { - err = -EINVAL; - break; - } - if (ib_wr->send_flags & IB_SEND_FENCE) { - wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; - } - if ((ib_wr->send_flags & IB_SEND_INLINE) && - ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && - (ib_wr->sg_list[0].length <= 64)) { - memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], - (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, - ib_wr->sg_list[0].length); - wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; - } else { - fill_wqe_sg_send(wqe, ib_wr, 1); - } + switch (ib_wr->opcode) { + case IB_WR_SEND: 
+ case IB_WR_SEND_WITH_INV: + if (IB_WR_SEND == ib_wr->opcode) { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + wqe_misc = NES_IWARP_SQ_OP_SENDSE; + else + wqe_misc = NES_IWARP_SQ_OP_SEND; + } else { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + wqe_misc = NES_IWARP_SQ_OP_SENDSEINV; + else + wqe_misc = NES_IWARP_SQ_OP_SENDINV; - break; - case IB_WR_RDMA_WRITE: - wqe_misc = NES_IWARP_SQ_OP_RDMAW; - if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { - nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n", - ib_wr->num_sge, - nesdev->nesadapter->max_sge); - err = -EINVAL; - break; - } - if (ib_wr->send_flags & IB_SEND_FENCE) { - wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; - } - - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, - ib_wr->wr.rdma.rkey); - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, - ib_wr->wr.rdma.remote_addr); - - if ((ib_wr->send_flags & IB_SEND_INLINE) && - ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && - (ib_wr->sg_list[0].length <= 64)) { - memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], - (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, - ib_wr->sg_list[0].length); - wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; - } else { - fill_wqe_sg_send(wqe, ib_wr, 1); - } - wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = - wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]; - break; - case IB_WR_RDMA_READ: - /* iWARP only supports 1 sge for RDMA reads */ - if (ib_wr->num_sge > 1) { - nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n", - ib_wr->num_sge); - err = -EINVAL; - break; - } - wqe_misc = NES_IWARP_SQ_OP_RDMAR; - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, - ib_wr->wr.rdma.remote_addr); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, - ib_wr->wr.rdma.rkey); - set_wqe_32bit_value(wqe->wqe_words, 
NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX, - ib_wr->sg_list->length); - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, - ib_wr->sg_list->addr); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX, - ib_wr->sg_list->lkey); - break; - default: - /* error */ - err = -EINVAL; - break; + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, + ib_wr->ex.invalidate_rkey); } + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + err = -EINVAL; + break; + } + + if (ib_wr->send_flags & IB_SEND_FENCE) + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + + break; + case IB_WR_RDMA_WRITE: + wqe_misc = NES_IWARP_SQ_OP_RDMAW; + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n", + ib_wr->num_sge, nesdev->nesadapter->max_sge); + err = -EINVAL; + break; + } + + if (ib_wr->send_flags & IB_SEND_FENCE) + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + ib_wr->wr.rdma.rkey); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + ib_wr->wr.rdma.remote_addr); + + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + 
wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]; + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + /* iWARP only supports 1 sge for RDMA reads */ + if (ib_wr->num_sge > 1) { + nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n", + ib_wr->num_sge); + err = -EINVAL; + break; + } + if (ib_wr->opcode == IB_WR_RDMA_READ) { + wqe_misc = NES_IWARP_SQ_OP_RDMAR; + } else { + wqe_misc = NES_IWARP_SQ_OP_RDMAR_LOCINV; + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, + ib_wr->ex.invalidate_rkey); + } + + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + ib_wr->wr.rdma.remote_addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + ib_wr->wr.rdma.rkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX, + ib_wr->sg_list->length); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, + ib_wr->sg_list->addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX, + ib_wr->sg_list->lkey); + break; + case IB_WR_LOCAL_INV: + wqe_misc = NES_IWARP_SQ_OP_LOCINV; + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX, + ib_wr->ex.invalidate_rkey); + break; + default: + /* error */ + err = -EINVAL; + break; + } + if (err) break; @@ -3729,6 +3749,9 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) nes_debug(NES_DBG_CQ, "Operation = Send.\n"); entry->opcode = IB_WC_SEND; break; + case NES_IWARP_SQ_OP_LOCINV: + entry->opcode = IB_WR_LOCAL_INV; + break; } nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); From 55464d461bdcffc4422aebfb750eacf99e3c0f27 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 9 Dec 2009 14:20:04 -0800 Subject: [PATCH 22/45] IB: Clarify the documentation of ib_post_send() Clarify the behavior of 
ib_post_send() when a list of work requests is passed in and an immediate error is returned. Signed-off-by: Bart Van Assche Signed-off-by: Roland Dreier --- include/rdma/ib_verbs.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c179318edd9..09509edb1c5 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1425,6 +1425,11 @@ int ib_destroy_qp(struct ib_qp *qp); * @send_wr: A list of work requests to post on the send queue. * @bad_send_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. + * + * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate + * error is returned, the QP state shall not be affected, + * ib_post_send() will return an immediate error after queueing any + * earlier work requests in the list. */ static inline int ib_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, From f7111821e51a58ee0f548f256743121ce1b365ae Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 9 Dec 2009 14:21:36 -0800 Subject: [PATCH 23/45] IB: Fix typo in ipoib.txt Delete extra words in "is to takes advantage of". Signed-off-by: Bart Van Assche Signed-off-by: Roland Dreier --- Documentation/infiniband/ipoib.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt index 6d40f00b358..64eeb55d0c0 100644 --- a/Documentation/infiniband/ipoib.txt +++ b/Documentation/infiniband/ipoib.txt @@ -36,11 +36,11 @@ Datagram vs Connected modes fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes. In connected mode, the IB RC (Reliable Connected) transport is used. 
- Connected mode is to takes advantage of the connected nature of the - IB transport and allows an MTU up to the maximal IP packet size of - 64K, which reduces the number of IP packets needed for handling - large UDP datagrams, TCP segments, etc and increases the performance - for large messages. + Connected mode takes advantage of the connected nature of the IB + transport and allows an MTU up to the maximal IP packet size of 64K, + which reduces the number of IP packets needed for handling large UDP + datagrams, TCP segments, etc and increases the performance for large + messages. In connected mode, the interface's UD QP is still used for multicast and communication with peers that don't support connected mode. In From df42245a3c246ec1eeeedbc3e5edbcc17f081c79 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 9 Dec 2009 14:30:44 -0800 Subject: [PATCH 24/45] IB/uverbs: Fix return of PTR_ERR() of wrong pointer in ib_uverbs_get_context() Signed-off-by: Roel Kluin Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 56feab6c251..112d3970222 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -285,7 +285,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext = ibdev->alloc_ucontext(ibdev, &udata); if (IS_ERR(ucontext)) { - ret = PTR_ERR(file->ucontext); + ret = PTR_ERR(ucontext); goto err; } From c597b0240b686427248b6d4fc8adbe22f9a04c11 Mon Sep 17 00:00:00 2001 From: Frank Zago Date: Wed, 9 Dec 2009 14:56:11 -0800 Subject: [PATCH 25/45] RDMA/amso1100: Fix error paths in post_send and post_recv Always set bad_wr when an immediate error is detected. 
Signed-off-by: Frank Zago Signed-off-by: Roland Dreier --- drivers/infiniband/hw/amso1100/c2_qp.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/amso1100/c2_qp.c b/drivers/infiniband/hw/amso1100/c2_qp.c index a6d89440ad2..ad518868df7 100644 --- a/drivers/infiniband/hw/amso1100/c2_qp.c +++ b/drivers/infiniband/hw/amso1100/c2_qp.c @@ -798,8 +798,10 @@ int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, u8 actual_sge_count; u32 msg_size; - if (qp->state > IB_QPS_RTS) - return -EINVAL; + if (qp->state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } while (ib_wr) { @@ -930,6 +932,7 @@ int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, ib_wr = ib_wr->next; } +out: if (err) *bad_wr = ib_wr; return err; @@ -944,8 +947,10 @@ int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, unsigned long lock_flags; int err = 0; - if (qp->state > IB_QPS_RTS) - return -EINVAL; + if (qp->state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } /* * Try and post each work request @@ -998,6 +1003,7 @@ int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, ib_wr = ib_wr->next; } +out: if (err) *bad_wr = ib_wr; return err; From e147de03610fab7781c09aaed078a932e549ed4a Mon Sep 17 00:00:00 2001 From: Frank Zago Date: Wed, 9 Dec 2009 15:07:25 -0800 Subject: [PATCH 26/45] IB/ehca: Fix error paths in post_send and post_recv Always set bad_wr when an immediate error is detected. Do not report success if an error occurred. 
Signed-off-by: Frank Zago Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_reqs.c | 67 ++++++++++++-------------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c index 8fd88cd828f..e3ec7fdd67b 100644 --- a/drivers/infiniband/hw/ehca/ehca_reqs.c +++ b/drivers/infiniband/hw/ehca/ehca_reqs.c @@ -400,7 +400,6 @@ static inline void map_ib_wc_status(u32 cqe_status, static inline int post_one_send(struct ehca_qp *my_qp, struct ib_send_wr *cur_send_wr, - struct ib_send_wr **bad_send_wr, int hidden) { struct ehca_wqe *wqe_p; @@ -412,8 +411,6 @@ static inline int post_one_send(struct ehca_qp *my_qp, wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue); if (unlikely(!wqe_p)) { /* too many posted work requests: queue overflow */ - if (bad_send_wr) - *bad_send_wr = cur_send_wr; ehca_err(my_qp->ib_qp.device, "Too many posted WQEs " "qp_num=%x", my_qp->ib_qp.qp_num); return -ENOMEM; @@ -433,8 +430,6 @@ static inline int post_one_send(struct ehca_qp *my_qp, */ if (unlikely(ret)) { my_qp->ipz_squeue.current_q_offset = start_offset; - if (bad_send_wr) - *bad_send_wr = cur_send_wr; ehca_err(my_qp->ib_qp.device, "Could not write WQE " "qp_num=%x", my_qp->ib_qp.qp_num); return -EINVAL; @@ -448,7 +443,6 @@ int ehca_post_send(struct ib_qp *qp, struct ib_send_wr **bad_send_wr) { struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp); - struct ib_send_wr *cur_send_wr; int wqe_cnt = 0; int ret = 0; unsigned long flags; @@ -457,7 +451,8 @@ int ehca_post_send(struct ib_qp *qp, if (unlikely(my_qp->state < IB_QPS_RTS)) { ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x", my_qp->state, qp->qp_num); - return -EINVAL; + ret = -EINVAL; + goto out; } /* LOCK the QUEUE */ @@ -476,24 +471,21 @@ int ehca_post_send(struct ib_qp *qp, struct ib_send_wr circ_wr; memset(&circ_wr, 0, sizeof(circ_wr)); circ_wr.opcode = IB_WR_RDMA_READ; - post_one_send(my_qp, &circ_wr, NULL, 1); /* ignore 
retcode */ + post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */ wqe_cnt++; ehca_dbg(qp->device, "posted circ wr qp_num=%x", qp->qp_num); my_qp->message_count = my_qp->packet_count = 0; } /* loop processes list of send reqs */ - for (cur_send_wr = send_wr; cur_send_wr != NULL; - cur_send_wr = cur_send_wr->next) { - ret = post_one_send(my_qp, cur_send_wr, bad_send_wr, 0); + while (send_wr) { + ret = post_one_send(my_qp, send_wr, 0); if (unlikely(ret)) { - /* if one or more WQEs were successful, don't fail */ - if (wqe_cnt) - ret = 0; goto post_send_exit0; } wqe_cnt++; - } /* eof for cur_send_wr */ + send_wr = send_wr->next; + } post_send_exit0: iosync(); /* serialize GAL register access */ @@ -503,6 +495,10 @@ post_send_exit0: my_qp, qp->qp_num, wqe_cnt, ret); my_qp->message_count += wqe_cnt; spin_unlock_irqrestore(&my_qp->spinlock_s, flags); + +out: + if (ret) + *bad_send_wr = send_wr; return ret; } @@ -511,7 +507,6 @@ static int internal_post_recv(struct ehca_qp *my_qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr) { - struct ib_recv_wr *cur_recv_wr; struct ehca_wqe *wqe_p; int wqe_cnt = 0; int ret = 0; @@ -522,27 +517,23 @@ static int internal_post_recv(struct ehca_qp *my_qp, if (unlikely(!HAS_RQ(my_qp))) { ehca_err(dev, "QP has no RQ ehca_qp=%p qp_num=%x ext_type=%d", my_qp, my_qp->real_qp_num, my_qp->ext_type); - return -ENODEV; + ret = -ENODEV; + goto out; } /* LOCK the QUEUE */ spin_lock_irqsave(&my_qp->spinlock_r, flags); - /* loop processes list of send reqs */ - for (cur_recv_wr = recv_wr; cur_recv_wr != NULL; - cur_recv_wr = cur_recv_wr->next) { + /* loop processes list of recv reqs */ + while (recv_wr) { u64 start_offset = my_qp->ipz_rqueue.current_q_offset; /* get pointer next to free WQE */ wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue); if (unlikely(!wqe_p)) { /* too many posted work requests: queue overflow */ - if (bad_recv_wr) - *bad_recv_wr = cur_recv_wr; - if (wqe_cnt == 0) { - ret = -ENOMEM; - ehca_err(dev, "Too many posted WQEs 
" - "qp_num=%x", my_qp->real_qp_num); - } + ret = -ENOMEM; + ehca_err(dev, "Too many posted WQEs " + "qp_num=%x", my_qp->real_qp_num); goto post_recv_exit0; } /* @@ -552,7 +543,7 @@ static int internal_post_recv(struct ehca_qp *my_qp, rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size; /* write a RECV WQE into the QUEUE */ - ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, cur_recv_wr, + ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr, rq_map_idx); /* * if something failed, @@ -560,22 +551,20 @@ static int internal_post_recv(struct ehca_qp *my_qp, */ if (unlikely(ret)) { my_qp->ipz_rqueue.current_q_offset = start_offset; - *bad_recv_wr = cur_recv_wr; - if (wqe_cnt == 0) { - ret = -EINVAL; - ehca_err(dev, "Could not write WQE " - "qp_num=%x", my_qp->real_qp_num); - } + ret = -EINVAL; + ehca_err(dev, "Could not write WQE " + "qp_num=%x", my_qp->real_qp_num); goto post_recv_exit0; } qmap_entry = &my_qp->rq_map.map[rq_map_idx]; - qmap_entry->app_wr_id = get_app_wr_id(cur_recv_wr->wr_id); + qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id); qmap_entry->reported = 0; qmap_entry->cqe_req = 1; wqe_cnt++; - } /* eof for cur_recv_wr */ + recv_wr = recv_wr->next; + } /* eof for recv_wr */ post_recv_exit0: iosync(); /* serialize GAL register access */ @@ -584,6 +573,11 @@ post_recv_exit0: ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i", my_qp, my_qp->real_qp_num, wqe_cnt, ret); spin_unlock_irqrestore(&my_qp->spinlock_r, flags); + +out: + if (ret) + *bad_recv_wr = recv_wr; + return ret; } @@ -597,6 +591,7 @@ int ehca_post_recv(struct ib_qp *qp, if (unlikely(my_qp->state == IB_QPS_RESET)) { ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x", my_qp->state, qp->qp_num); + *bad_recv_wr = recv_wr; return -EINVAL; } From e293a26fe97c8598a96562c1c9376d9ae6cb96dd Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:54 -0800 Subject: [PATCH 27/45] RDMA/nes: Correct fast memory registration implementation Replace alloc_fmr, unmap_fmr, 
dealloc_fmr and map_phys_fmr with alloc_fast_reg_mr, alloc_fast_reg_page_list, free_fast_reg_page_list. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 5 +- drivers/infiniband/hw/nes/nes_hw.h | 27 +- drivers/infiniband/hw/nes/nes_user.h | 1 + drivers/infiniband/hw/nes/nes_verbs.c | 559 +++++++++++++------------- 4 files changed, 300 insertions(+), 292 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 3512d6de301..3d9bbff4f7a 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -424,8 +424,9 @@ struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) { nesadapter->base_pd = 1; - nesadapter->device_cap_flags = - IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW; + nesadapter->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_MGT_EXTENSIONS; nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter) [(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]); diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index f28a41ba9fa..8a4c7383bc0 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -546,11 +546,23 @@ enum nes_iwarp_sq_fmr_wqe_word_idx { NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX = 14, }; +enum nes_iwarp_sq_fmr_opcodes { + NES_IWARP_SQ_FMR_WQE_ZERO_BASED = (1<<6), + NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K = (0<<7), + NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M = (1<<7), + NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ = (1<<16), + NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE = (1<<17), + NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ = (1<<18), + NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE = (1<<19), + NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND = (1<<20), +}; + +#define NES_IWARP_SQ_FMR_WQE_MR_LENGTH_HIGH_MASK 0xFF; + enum nes_iwarp_sq_locinv_wqe_word_idx { 
NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX = 6, }; - enum nes_iwarp_rq_wqe_word_idx { NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX = 1, NES_IWARP_RQ_WQE_COMP_CTX_LOW_IDX = 2, @@ -1153,6 +1165,19 @@ struct nes_pbl { /* TODO: need to add list for two level tables */ }; +#define NES_4K_PBL_CHUNK_SIZE 4096 + +struct nes_fast_mr_wqe_pbl { + u64 *kva; + dma_addr_t paddr; +}; + +struct nes_ib_fast_reg_page_list { + struct ib_fast_reg_page_list ibfrpl; + struct nes_fast_mr_wqe_pbl nes_wqe_pbl; + u64 pbl; +}; + struct nes_listener { struct work_struct work; struct workqueue_struct *wq; diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h index cc90c14b49e..ce62f3c0037 100644 --- a/drivers/infiniband/hw/nes/nes_user.h +++ b/drivers/infiniband/hw/nes/nes_user.h @@ -86,6 +86,7 @@ enum iwnes_memreg_type { IWNES_MEMREG_TYPE_CQ = 0x0002, IWNES_MEMREG_TYPE_MW = 0x0003, IWNES_MEMREG_TYPE_FMR = 0x0004, + IWNES_MEMREG_TYPE_FMEM = 0x0005, }; struct nes_mem_reg_req { diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 499dd78cb82..0a2b18bad6e 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -275,342 +275,236 @@ static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw, } -/** - * nes_alloc_fmr +/* + * nes_alloc_fast_mr */ -static struct ib_fmr *nes_alloc_fmr(struct ib_pd *ibpd, - int ibmr_access_flags, - struct ib_fmr_attr *ibfmr_attr) +static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, + u32 stag, u32 page_count) { - unsigned long flags; - struct nes_pd *nespd = to_nespd(ibpd); - struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_fmr *nesfmr; - struct nes_cqp_request *cqp_request; struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; int ret; - u32 stag; - u32 stag_index = 0; - u32 
next_stag_index = 0; - u32 driver_key = 0; + struct nes_adapter *nesadapter = nesdev->nesadapter; u32 opcode = 0; - u8 stag_key = 0; - int i=0; - struct nes_vpbl vpbl; + u16 major_code; + u64 region_length = page_count * PAGE_SIZE; - get_random_bytes(&next_stag_index, sizeof(next_stag_index)); - stag_key = (u8)next_stag_index; - driver_key = 0; - - next_stag_index >>= 8; - next_stag_index %= nesadapter->max_mr; - - ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, - nesadapter->max_mr, &stag_index, &next_stag_index); - if (ret) { - goto failed_resource_alloc; - } - - nesfmr = kzalloc(sizeof(*nesfmr), GFP_KERNEL); - if (!nesfmr) { - ret = -ENOMEM; - goto failed_fmr_alloc; - } - - nesfmr->nesmr.mode = IWNES_MEMREG_TYPE_FMR; - if (ibfmr_attr->max_pages == 1) { - /* use zero length PBL */ - nesfmr->nesmr.pbl_4k = 0; - nesfmr->nesmr.pbls_used = 0; - } else if (ibfmr_attr->max_pages <= 32) { - /* use PBL 256 */ - nesfmr->nesmr.pbl_4k = 0; - nesfmr->nesmr.pbls_used = 1; - } else if (ibfmr_attr->max_pages <= 512) { - /* use 4K PBLs */ - nesfmr->nesmr.pbl_4k = 1; - nesfmr->nesmr.pbls_used = 1; - } else { - /* use two level 4K PBLs */ - /* add support for two level 256B PBLs */ - nesfmr->nesmr.pbl_4k = 1; - nesfmr->nesmr.pbls_used = 1 + (ibfmr_attr->max_pages >> 9) + - ((ibfmr_attr->max_pages & 511) ? 
1 : 0); - } - /* Register the region with the adapter */ - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - - /* track PBL resources */ - if (nesfmr->nesmr.pbls_used != 0) { - if (nesfmr->nesmr.pbl_4k) { - if (nesfmr->nesmr.pbls_used > nesadapter->free_4kpbl) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - ret = -ENOMEM; - goto failed_vpbl_avail; - } else { - nesadapter->free_4kpbl -= nesfmr->nesmr.pbls_used; - } - } else { - if (nesfmr->nesmr.pbls_used > nesadapter->free_256pbl) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - ret = -ENOMEM; - goto failed_vpbl_avail; - } else { - nesadapter->free_256pbl -= nesfmr->nesmr.pbls_used; - } - } - } - - /* one level pbl */ - if (nesfmr->nesmr.pbls_used == 0) { - nesfmr->root_vpbl.pbl_vbase = NULL; - nes_debug(NES_DBG_MR, "zero level pbl \n"); - } else if (nesfmr->nesmr.pbls_used == 1) { - /* can change it to kmalloc & dma_map_single */ - nesfmr->root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, - &nesfmr->root_vpbl.pbl_pbase); - if (!nesfmr->root_vpbl.pbl_vbase) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - ret = -ENOMEM; - goto failed_vpbl_alloc; - } - nesfmr->leaf_pbl_cnt = 0; - nes_debug(NES_DBG_MR, "one level pbl, root_vpbl.pbl_vbase=%p \n", - nesfmr->root_vpbl.pbl_vbase); - } - /* two level pbl */ - else { - nesfmr->root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192, - &nesfmr->root_vpbl.pbl_pbase); - if (!nesfmr->root_vpbl.pbl_vbase) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - ret = -ENOMEM; - goto failed_vpbl_alloc; - } - - nesfmr->leaf_pbl_cnt = nesfmr->nesmr.pbls_used-1; - nesfmr->root_vpbl.leaf_vpbl = kzalloc(sizeof(*nesfmr->root_vpbl.leaf_vpbl)*1024, GFP_ATOMIC); - if (!nesfmr->root_vpbl.leaf_vpbl) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - ret = -ENOMEM; - goto failed_leaf_vpbl_alloc; - } - - nes_debug(NES_DBG_MR, "two level pbl, root_vpbl.pbl_vbase=%p" - " leaf_pbl_cnt=%d root_vpbl.leaf_vpbl=%p\n", - 
nesfmr->root_vpbl.pbl_vbase, nesfmr->leaf_pbl_cnt, nesfmr->root_vpbl.leaf_vpbl); - - for (i=0; ileaf_pbl_cnt; i++) - nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase = NULL; - - for (i=0; ileaf_pbl_cnt; i++) { - vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, - &vpbl.pbl_pbase); - - if (!vpbl.pbl_vbase) { - ret = -ENOMEM; - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - goto failed_leaf_vpbl_pages_alloc; - } - - nesfmr->root_vpbl.pbl_vbase[i].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase); - nesfmr->root_vpbl.pbl_vbase[i].pa_high = cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); - nesfmr->root_vpbl.leaf_vpbl[i] = vpbl; - - nes_debug(NES_DBG_MR, "pbase_low=0x%x, pbase_high=0x%x, vpbl=%p\n", - nesfmr->root_vpbl.pbl_vbase[i].pa_low, - nesfmr->root_vpbl.pbl_vbase[i].pa_high, - &nesfmr->root_vpbl.leaf_vpbl[i]); - } - } - nesfmr->ib_qp = NULL; - nesfmr->access_rights =0; - - stag = stag_index << 8; - stag |= driver_key; - stag += (u32)stag_key; - - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); cqp_request = nes_get_cqp_request(nesdev); if (cqp_request == NULL) { nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); - ret = -ENOMEM; - goto failed_leaf_vpbl_pages_alloc; + return -ENOMEM; } + nes_debug(NES_DBG_MR, "alloc_fast_reg_mr: page_count = %d, " + "region_length = %llu\n", + page_count, region_length); cqp_request->waiting = 1; cqp_wqe = &cqp_request->cqp_wqe; - nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n", - stag, stag_index); - - opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR; - - if (nesfmr->nesmr.pbl_4k == 1) - opcode |= NES_CQP_STAG_PBL_BLK_SIZE; - - if (ibmr_access_flags & IB_ACCESS_REMOTE_WRITE) { - opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | - NES_CQP_STAG_RIGHTS_LOCAL_WRITE | NES_CQP_STAG_REM_ACC_EN; - nesfmr->access_rights |= - NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_RIGHTS_LOCAL_WRITE | - NES_CQP_STAG_REM_ACC_EN; + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if 
(nesadapter->free_4kpbl > 0) { + nesadapter->free_4kpbl--; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } else { + /* No 4kpbl's available: */ + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_debug(NES_DBG_MR, "Out of Pbls\n"); + nes_free_cqp_request(nesdev, cqp_request); + return -ENOMEM; } - if (ibmr_access_flags & IB_ACCESS_REMOTE_READ) { - opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | - NES_CQP_STAG_RIGHTS_LOCAL_READ | NES_CQP_STAG_REM_ACC_EN; - nesfmr->access_rights |= - NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_RIGHTS_LOCAL_READ | - NES_CQP_STAG_REM_ACC_EN; - } + opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_MR | + NES_CQP_STAG_PBL_BLK_SIZE | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_REM_ACC_EN; + /* + * The current OFED API does not support the zero based TO option. + * If added then need to change the NES_CQP_STAG_VA* option. Also, + * the API does not support the ability to have the MR set for local + * access only when created and not allow the SQ op to override. Given + * this, the remote enable must be set here. + */ nes_fill_init_cqp_wqe(cqp_wqe, nesdev); set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff)); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, 1); - cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = - cpu_to_le32((nesfmr->nesmr.pbls_used>1) ? 
- (nesfmr->nesmr.pbls_used-1) : nesfmr->nesmr.pbls_used); + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] = + cpu_to_le32((u32)(region_length >> 8) & 0xff000000); + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |= + cpu_to_le32(nespd->pd_id & 0x00007fff); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, 0); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, 0); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 0); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (page_count * 8)); + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE); + barrier(); atomic_set(&cqp_request->refcount, 2); nes_post_cqp_request(nesdev, cqp_request); /* Wait for CQP */ - ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," - " CQP Major:Minor codes = 0x%04X:0x%04X.\n", - stag, ret, cqp_request->major_code, cqp_request->minor_code); + ret = wait_event_timeout(cqp_request->waitq, + (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); - if ((!ret) || (cqp_request->major_code)) { - nes_put_cqp_request(nesdev, cqp_request); - ret = (!ret) ? 
-ETIME : -EIO; - goto failed_leaf_vpbl_pages_alloc; - } + nes_debug(NES_DBG_MR, "Allocate STag 0x%08X completed, " + "wait_event_timeout ret = %u, CQP Major:Minor codes = " + "0x%04X:0x%04X.\n", stag, ret, cqp_request->major_code, + cqp_request->minor_code); + major_code = cqp_request->major_code; nes_put_cqp_request(nesdev, cqp_request); - nesfmr->nesmr.ibfmr.lkey = stag; - nesfmr->nesmr.ibfmr.rkey = stag; - nesfmr->attr = *ibfmr_attr; - return &nesfmr->nesmr.ibfmr; - - failed_leaf_vpbl_pages_alloc: - /* unroll all allocated pages */ - for (i=0; ileaf_pbl_cnt; i++) { - if (nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase) { - pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase, - nesfmr->root_vpbl.leaf_vpbl[i].pbl_pbase); - } - } - if (nesfmr->root_vpbl.leaf_vpbl) - kfree(nesfmr->root_vpbl.leaf_vpbl); - - failed_leaf_vpbl_alloc: - if (nesfmr->leaf_pbl_cnt == 0) { - if (nesfmr->root_vpbl.pbl_vbase) - pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.pbl_vbase, - nesfmr->root_vpbl.pbl_pbase); - } else - pci_free_consistent(nesdev->pcidev, 8192, nesfmr->root_vpbl.pbl_vbase, - nesfmr->root_vpbl.pbl_pbase); - - failed_vpbl_alloc: - if (nesfmr->nesmr.pbls_used != 0) { + if (!ret || major_code) { spin_lock_irqsave(&nesadapter->pbl_lock, flags); - if (nesfmr->nesmr.pbl_4k) - nesadapter->free_4kpbl += nesfmr->nesmr.pbls_used; - else - nesadapter->free_256pbl += nesfmr->nesmr.pbls_used; + nesadapter->free_4kpbl++; spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); } -failed_vpbl_avail: - kfree(nesfmr); - - failed_fmr_alloc: - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - - failed_resource_alloc: - return ERR_PTR(ret); + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + return 0; } - -/** - * nes_dealloc_fmr +/* + * nes_alloc_fast_reg_mr */ -static int nes_dealloc_fmr(struct ib_fmr *ibfmr) +struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len) { - unsigned long flags; - 
struct nes_mr *nesmr = to_nesmr_from_ibfmr(ibfmr); - struct nes_fmr *nesfmr = to_nesfmr(nesmr); - struct nes_vnic *nesvnic = to_nesvnic(ibfmr->device); + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; - int i = 0; - int rc; - /* free the resources */ - if (nesfmr->leaf_pbl_cnt == 0) { - /* single PBL case */ - if (nesfmr->root_vpbl.pbl_vbase) - pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.pbl_vbase, - nesfmr->root_vpbl.pbl_pbase); + u32 next_stag_index; + u8 stag_key = 0; + u32 driver_key = 0; + int err = 0; + u32 stag_index = 0; + struct nes_mr *nesmr; + u32 stag; + int ret; + struct ib_mr *ibmr; +/* + * Note: Set to always use a fixed length single page entry PBL. This is to allow + * for the fast_reg_mr operation to always know the size of the PBL. + */ + if (max_page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) + return ERR_PTR(-E2BIG); + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, + &next_stag_index); + if (err) + return ERR_PTR(err); + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n", + stag, stag_index); + + ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_page_list_len); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_FMEM; + ibmr = &nesmr->ibmr; } else { - for (i = 0; i < nesfmr->leaf_pbl_cnt; i++) { - pci_free_consistent(nesdev->pcidev, 4096, 
nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase, - nesfmr->root_vpbl.leaf_vpbl[i].pbl_pbase); - } - kfree(nesfmr->root_vpbl.leaf_vpbl); - pci_free_consistent(nesdev->pcidev, 8192, nesfmr->root_vpbl.pbl_vbase, - nesfmr->root_vpbl.pbl_pbase); + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); } - nesmr->ibmw.device = ibfmr->device; - nesmr->ibmw.pd = ibfmr->pd; - nesmr->ibmw.rkey = ibfmr->rkey; - nesmr->ibmw.uobject = NULL; + return ibmr; +} - rc = nes_dealloc_mw(&nesmr->ibmw); +/* + * nes_alloc_fast_reg_page_list + */ +static struct ib_fast_reg_page_list *nes_alloc_fast_reg_page_list( + struct ib_device *ibdev, + int page_list_len) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct ib_fast_reg_page_list *pifrpl; + struct nes_ib_fast_reg_page_list *pnesfrpl; - if ((rc == 0) && (nesfmr->nesmr.pbls_used != 0)) { - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - if (nesfmr->nesmr.pbl_4k) { - nesadapter->free_4kpbl += nesfmr->nesmr.pbls_used; - WARN_ON(nesadapter->free_4kpbl > nesadapter->max_4kpbl); - } else { - nesadapter->free_256pbl += nesfmr->nesmr.pbls_used; - WARN_ON(nesadapter->free_256pbl > nesadapter->max_256pbl); - } - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + if (page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) + return ERR_PTR(-E2BIG); + /* + * Allocate the ib_fast_reg_page_list structure, the + * nes_fast_bpl structure, and the PLB table. 
+ */ + pnesfrpl = kmalloc(sizeof(struct nes_ib_fast_reg_page_list) + + page_list_len * sizeof(u64), GFP_KERNEL); + + if (!pnesfrpl) + return ERR_PTR(-ENOMEM); + + pifrpl = &pnesfrpl->ibfrpl; + pifrpl->page_list = &pnesfrpl->pbl; + pifrpl->max_page_list_len = page_list_len; + /* + * Allocate the WQE PBL + */ + pnesfrpl->nes_wqe_pbl.kva = pci_alloc_consistent(nesdev->pcidev, + page_list_len * sizeof(u64), + &pnesfrpl->nes_wqe_pbl.paddr); + + if (!pnesfrpl->nes_wqe_pbl.kva) { + kfree(pnesfrpl); + return ERR_PTR(-ENOMEM); } + nes_debug(NES_DBG_MR, "nes_alloc_fast_reg_pbl: nes_frpl = %p, " + "ibfrpl = %p, ibfrpl.page_list = %p, pbl.kva = %p, " + "pbl.paddr= %p\n", pnesfrpl, &pnesfrpl->ibfrpl, + pnesfrpl->ibfrpl.page_list, pnesfrpl->nes_wqe_pbl.kva, + (void *)pnesfrpl->nes_wqe_pbl.paddr); - return rc; + return pifrpl; } - -/** - * nes_map_phys_fmr +/* + * nes_free_fast_reg_page_list */ -static int nes_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova) +static void nes_free_fast_reg_page_list(struct ib_fast_reg_page_list *pifrpl) { - return 0; + struct nes_vnic *nesvnic = to_nesvnic(pifrpl->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_ib_fast_reg_page_list *pnesfrpl; + + pnesfrpl = container_of(pifrpl, struct nes_ib_fast_reg_page_list, ibfrpl); + /* + * Free the WQE PBL. 
+ */ + pci_free_consistent(nesdev->pcidev, + pifrpl->max_page_list_len * sizeof(u64), + pnesfrpl->nes_wqe_pbl.kva, + pnesfrpl->nes_wqe_pbl.paddr); + /* + * Free the PBL structure + */ + kfree(pnesfrpl); } - -/** - * nes_unmap_frm - */ -static int nes_unmap_fmr(struct list_head *ibfmr_list) -{ - return 0; -} - - - /** * nes_query_device */ @@ -3514,6 +3408,91 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX, ib_wr->ex.invalidate_rkey); break; + case IB_WR_FAST_REG_MR: + { + int i; + int flags = ib_wr->wr.fast_reg.access_flags; + struct nes_ib_fast_reg_page_list *pnesfrpl = + container_of(ib_wr->wr.fast_reg.page_list, + struct nes_ib_fast_reg_page_list, + ibfrpl); + u64 *src_page_list = pnesfrpl->ibfrpl.page_list; + u64 *dst_page_list = pnesfrpl->nes_wqe_pbl.kva; + + if (ib_wr->wr.fast_reg.page_list_len > + (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) { + nes_debug(NES_DBG_IW_TX, "SQ_FMR: bad page_list_len\n"); + err = -EINVAL; + break; + } + wqe_misc = NES_IWARP_SQ_OP_FAST_REG; + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX, + ib_wr->wr.fast_reg.iova_start); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX, + ib_wr->wr.fast_reg.length); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX, + ib_wr->wr.fast_reg.rkey); + /* Set page size: */ + if (ib_wr->wr.fast_reg.page_shift == 12) { + wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K; + } else if (ib_wr->wr.fast_reg.page_shift == 21) { + wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M; + } else { + nes_debug(NES_DBG_IW_TX, "Invalid page shift," + " ib_wr=%u, max=1\n", ib_wr->num_sge); + err = -EINVAL; + break; + } + /* Set access_flags */ + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ; + if (flags & IB_ACCESS_LOCAL_WRITE) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE; + + if (flags & IB_ACCESS_REMOTE_WRITE) + wqe_misc |= 
NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE; + + if (flags & IB_ACCESS_REMOTE_READ) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ; + + if (flags & IB_ACCESS_MW_BIND) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND; + + /* Fill in PBL info: */ + if (ib_wr->wr.fast_reg.page_list_len > + pnesfrpl->ibfrpl.max_page_list_len) { + nes_debug(NES_DBG_IW_TX, "Invalid page list length," + " ib_wr=%p, value=%u, max=%u\n", + ib_wr, ib_wr->wr.fast_reg.page_list_len, + pnesfrpl->ibfrpl.max_page_list_len); + err = -EINVAL; + break; + } + + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX, + pnesfrpl->nes_wqe_pbl.paddr); + + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX, + ib_wr->wr.fast_reg.page_list_len * 8); + + for (i = 0; i < ib_wr->wr.fast_reg.page_list_len; i++) + dst_page_list[i] = cpu_to_le64(src_page_list[i]); + + nes_debug(NES_DBG_IW_TX, "SQ_FMR: iova_start: %p, " + "length: %d, rkey: %0x, pgl_paddr: %p, " + "page_list_len: %u, wqe_misc: %x\n", + (void *)ib_wr->wr.fast_reg.iova_start, + ib_wr->wr.fast_reg.length, + ib_wr->wr.fast_reg.rkey, + (void *)pnesfrpl->nes_wqe_pbl.paddr, + ib_wr->wr.fast_reg.page_list_len, + wqe_misc); + break; + } default: /* error */ err = -EINVAL; @@ -3752,6 +3731,9 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) case NES_IWARP_SQ_OP_LOCINV: entry->opcode = IB_WR_LOCAL_INV; break; + case NES_IWARP_SQ_OP_FAST_REG: + entry->opcode = IB_WC_FAST_REG_MR; + break; } nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); @@ -3922,10 +3904,9 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) nesibdev->ibdev.dealloc_mw = nes_dealloc_mw; nesibdev->ibdev.bind_mw = nes_bind_mw; - nesibdev->ibdev.alloc_fmr = nes_alloc_fmr; - nesibdev->ibdev.unmap_fmr = nes_unmap_fmr; - nesibdev->ibdev.dealloc_fmr = nes_dealloc_fmr; - nesibdev->ibdev.map_phys_fmr = nes_map_phys_fmr; + nesibdev->ibdev.alloc_fast_reg_mr = 
nes_alloc_fast_reg_mr; + nesibdev->ibdev.alloc_fast_reg_page_list = nes_alloc_fast_reg_page_list; + nesibdev->ibdev.free_fast_reg_page_list = nes_free_fast_reg_page_list; nesibdev->ibdev.attach_mcast = nes_multicast_attach; nesibdev->ibdev.detach_mcast = nes_multicast_detach; From a276510328d0721c252b37044c51e2fb4efe0364 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:56 -0800 Subject: [PATCH 28/45] RDMA/nes: Add additional SFP+ PHY uC status check and PHY reset Add additional PHY uC status check in case PHY firmware is not running properly with heartbeat. Add a hard PHY reset if uC status is 0x0 after initial reset. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 3d9bbff4f7a..b59ca565942 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -1356,6 +1356,8 @@ int nes_init_phy(struct nes_device *nesdev) } if ((phy_type == NES_PHY_TYPE_ARGUS) || (phy_type == NES_PHY_TYPE_SFP_D)) { + u32 first_time = 1; + /* Check firmware heartbeat */ nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); @@ -1363,8 +1365,13 @@ int nes_init_phy(struct nes_device *nesdev) nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - if (temp_phy_data != temp_phy_data2) - return 0; + if (temp_phy_data != temp_phy_data2) { + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if ((temp_phy_data & 0xff) > 0x20) + return 0; + printk(PFX "Reinitializing PHY\n"); + } /* no heartbeat, configure the PHY */ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000); @@ -1400,7 +1407,7 @@ int 
nes_init_phy(struct nes_device *nesdev) temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); do { if (counter++ > 150) { - nes_debug(NES_DBG_PHY, "No PHY heartbeat\n"); + printk(PFX "No PHY heartbeat\n"); break; } mdelay(1); @@ -1414,11 +1421,20 @@ int nes_init_phy(struct nes_device *nesdev) nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); if (counter++ > 300) { - nes_debug(NES_DBG_PHY, "PHY did not track\n"); - break; + if (((temp_phy_data & 0xff) == 0x0) && first_time) { + first_time = 0; + counter = 0; + /* reset AMCC PHY and try again */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040); + continue; + } else { + printk(PFX "PHY did not track\n"); + break; + } } mdelay(10); - } while (((temp_phy_data & 0xff) != 0x50) && ((temp_phy_data & 0xff) != 0x70)); + } while ((temp_phy_data & 0xff) < 0x30); /* setup signal integrity */ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000); From d14152da13dc29aa70cddd8ca214a13e3597eb7f Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:56 -0800 Subject: [PATCH 29/45] RDMA/nes: Implement IB_SIGNAL_ALL_WR as an iWARP extension Add IB_SIGNAL_ALL_WR support as an iWARP extension. If set, make sure all WR for the QP are signalled. Consolidate flags used in nesqp structure. 
Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 6 ++++-- drivers/infiniband/hw/nes/nes_verbs.h | 15 +++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 0a2b18bad6e..8ea75614b87 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1407,6 +1407,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, return ERR_PTR(-EINVAL); } + nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR); + /* update the QP table */ nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", @@ -3502,9 +3504,9 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, if (err) break; - if (ib_wr->send_flags & IB_SEND_SIGNALED) { + if ((ib_wr->send_flags & IB_SEND_SIGNALED) || nesqp->sig_all) wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; - } + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc); ib_wr = ib_wr->next; diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 89822d75f82..ac8b86b8cc1 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -167,17 +167,20 @@ struct nes_qp { enum ib_event_type terminate_eventtype; wait_queue_head_t kick_waitq; u16 in_disconnect; + u16 active_conn:1; + u16 skip_lsmm:1; + u16 user_mode:1; + u16 hte_added:1; + u16 flush_issued:1; + u16 destroyed:1; + u16 sq_kmapped:1; + u16 sig_all:1; + u16 rsvd:8; u16 private_data_len; u16 term_sq_flush_code; u16 term_rq_flush_code; - u8 active_conn; - u8 skip_lsmm; - u8 user_mode; - u8 hte_added; u8 hw_iwarp_state; - u8 flush_issued; u8 hw_tcp_state; u8 term_flags; - u8 destroyed; }; #endif /* NES_VERBS_H */ From 75742c630ed552ad963948c9f3e596e96eed7a9f Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:56 -0800 
Subject: [PATCH 30/45] RDMA/nes: Clean up struct nes_qp Remove unused and not really used variables. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 6 ------ drivers/infiniband/hw/nes/nes_verbs.h | 9 +-------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 8ea75614b87..ea4e22ea2d4 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1398,8 +1398,6 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, nes_debug(NES_DBG_QP, "QP%u structure located @%p.Size = %u.\n", nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp)); spin_lock_init(&nesqp->lock); - init_waitqueue_head(&nesqp->state_waitq); - init_waitqueue_head(&nesqp->kick_waitq); nes_add_ref(&nesqp->ibqp); break; default: @@ -3005,7 +3003,6 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, " already done based on hw state.\n", nesqp->hwqp.qp_id); issue_modify_qp = 0; - nesqp->in_disconnect = 0; } switch (nesqp->hw_iwarp_state) { case NES_AEQE_IWARP_STATE_CLOSING: @@ -3018,7 +3015,6 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, break; default: next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; - nesqp->in_disconnect = 1; nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; break; } @@ -3035,7 +3031,6 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE; issue_modify_qp = 1; - nesqp->in_disconnect = 1; break; case IB_QPS_ERR: case IB_QPS_RESET: @@ -3058,7 +3053,6 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) && (nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) { next_iwarp_state |= NES_CQP_QP_RESET; - nesqp->in_disconnect = 1; } else { nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting 
NES_CQP_QP_RESET since TCP state = %u\n", nesqp->hwqp.qp_id, nesqp->hw_tcp_state); diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index ac8b86b8cc1..795aa4f9259 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -135,19 +135,15 @@ struct nes_qp { struct ib_qp ibqp; void *allocated_buffer; struct iw_cm_id *cm_id; - struct workqueue_struct *wq; struct nes_cq *nesscq; struct nes_cq *nesrcq; struct nes_pd *nespd; void *cm_node; /* handle of the node this QP is associated with */ struct ietf_mpa_frame *ietf_frame; dma_addr_t ietf_frame_pbase; - wait_queue_head_t state_waitq; struct ib_mr *lsmm_mr; - unsigned long socket; struct nes_hw_qp hwqp; struct work_struct work; - struct work_struct ae_work; enum ib_qp_state ibqp_state; u32 iwarp_state; u32 hte_index; @@ -165,17 +161,14 @@ struct nes_qp { struct page *page; struct timer_list terminate_timer; enum ib_event_type terminate_eventtype; - wait_queue_head_t kick_waitq; - u16 in_disconnect; u16 active_conn:1; u16 skip_lsmm:1; u16 user_mode:1; u16 hte_added:1; u16 flush_issued:1; u16 destroyed:1; - u16 sq_kmapped:1; u16 sig_all:1; - u16 rsvd:8; + u16 rsvd:9; u16 private_data_len; u16 term_sq_flush_code; u16 term_rq_flush_code; From 5924aea6e26712cd372aa23ed432d4cefbb050d2 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:56 -0800 Subject: [PATCH 31/45] RDMA/nes: Add max_cqe check to nes_create_cq() Add a check to nes_create_cq() to return -EINVAL if creating a CQ with depth > max_cqe (32766). 
Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index b59ca565942..6f625a962b3 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -482,7 +482,7 @@ struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) { nesadapter->max_irrq_wr = (u32temp >> 16) & 3; nesadapter->max_sge = 4; - nesadapter->max_cqe = 32767; + nesadapter->max_cqe = 32766; if (nes_read_eeprom_values(nesdev, nesadapter)) { printk(KERN_ERR PFX "Unable to read EEPROM data.\n"); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index ea4e22ea2d4..155286b6e74 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -527,7 +527,7 @@ static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *prop props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2; props->max_sge = nesdev->nesadapter->max_sge; props->max_cq = nesibdev->max_cq; - props->max_cqe = nesdev->nesadapter->max_cqe - 1; + props->max_cqe = nesdev->nesadapter->max_cqe; props->max_mr = nesibdev->max_mr; props->max_mw = nesibdev->max_mr; props->max_pd = nesibdev->max_pd; @@ -1543,6 +1543,9 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries, unsigned long flags; int ret; + if (entries > nesadapter->max_cqe) + return ERR_PTR(-EINVAL); + err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs, nesadapter->max_cq, &cq_num, &nesadapter->next_cq); if (err) { From fa6c87d5104512bf73cf62162cec9ef6eba707c7 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 9 Dec 2009 15:21:56 -0800 Subject: [PATCH 32/45] RDMA/nes: Update copyright and branding string Update copyright from Intel-NE, Inc. to Intel Corporation. 
Use proper branding string in Kconfig and simplify description. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/Kconfig | 9 ++++----- drivers/infiniband/hw/nes/nes.c | 2 +- drivers/infiniband/hw/nes/nes.h | 2 +- drivers/infiniband/hw/nes/nes_cm.c | 2 +- drivers/infiniband/hw/nes/nes_cm.h | 2 +- drivers/infiniband/hw/nes/nes_context.h | 2 +- drivers/infiniband/hw/nes/nes_hw.c | 2 +- drivers/infiniband/hw/nes/nes_hw.h | 2 +- drivers/infiniband/hw/nes/nes_nic.c | 2 +- drivers/infiniband/hw/nes/nes_user.h | 2 +- drivers/infiniband/hw/nes/nes_utils.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.h | 2 +- 13 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig index d449eb6ec78..846dc97cf26 100644 --- a/drivers/infiniband/hw/nes/Kconfig +++ b/drivers/infiniband/hw/nes/Kconfig @@ -4,14 +4,13 @@ config INFINIBAND_NES select LIBCRC32C select INET_LRO ---help--- - This is a low-level driver for NetEffect RDMA enabled - Network Interface Cards (RNIC). + This is the RDMA Network Interface Card (RNIC) driver for + NetEffect Ethernet Cluster Server Adapters. config INFINIBAND_NES_DEBUG bool "Verbose debugging output" depends on INFINIBAND_NES default n ---help--- - This option causes the NetEffect RNIC driver to produce debug - messages. Select this if you are developing the driver - or trying to diagnose a problem. + This option enables debug messages from the NetEffect RNIC + driver. Select this if you are diagnosing a problem. diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index cbde0cfe27e..88d31148cb3 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. 
All rights reserved. * * This software is available to you under a choice of one of two diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index bcc6abc4faf..98840564bb2 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 73473db1986..dbe54550533 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 90e8e4d8a5c..5e4808c061d 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h index 0fb8d81d9a6..b4393a16099 100644 --- a/drivers/infiniband/hw/nes/nes_context.h +++ b/drivers/infiniband/hw/nes/nes_context.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. 
* * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 6f625a962b3..9fc0273dd40 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index 8a4c7383bc0..084be0ee689 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -1,5 +1,5 @@ /* -* Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. +* Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index e593af3354b..5a7b554f411 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h index ce62f3c0037..71e133ab209 100644 --- a/drivers/infiniband/hw/nes/nes_user.h +++ b/drivers/infiniband/hw/nes/nes_user.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. 
* Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Cisco Systems. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index 9687c397ce1..729d525c5b7 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 155286b6e74..08686523df0 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 795aa4f9259..cc7a60481e5 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * Copyright (c) 2006 - 2009 Intel Corporation. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two From d85ddd835b33a9a0f2276ce068318da3fd1ad76a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 9 Dec 2009 15:21:57 -0800 Subject: [PATCH 33/45] RDMA/nes: Pass correct size to ioremap_nocache() The size argument to ioremap_nocache should be the size of desired information, not the pointer to it. The semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @expression@ expression *x; @@ x = <+... *sizeof(x) ...+>// Signed-off-by: Julia Lawall Acked-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 88d31148cb3..b9d09bafd6c 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -521,7 +521,8 @@ static int __devinit nes_probe(struct pci_dev *pcidev, const struct pci_device_i spin_lock_init(&nesdev->indexed_regs_lock); /* Remap the PCI registers in adapter BAR0 to kernel VA space */ - mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0), sizeof(mmio_regs)); + mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0), + pci_resource_len(pcidev, BAR_0)); if (mmio_regs == NULL) { printk(KERN_ERR PFX "Unable to remap BAR0\n"); ret = -EIO; From 9b84dbe7f479a5a5fa53d689c2adf214ce7760e5 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:53:36 -0800 Subject: [PATCH 34/45] RDMA/nes: Fix MAX_CM_BUFFER define Change MAX_CM_BUFFER for MPA frames to be conformant to RFC 5044: we need 512 + 20 instead of 512. 
Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 5e4808c061d..911846ae5c7 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -47,6 +47,8 @@ #define IEFT_MPA_KEY_REP "MPA ID Rep Frame" #define IETF_MPA_KEY_SIZE 16 #define IETF_MPA_VERSION 1 +#define IETF_MAX_PRIV_DATA_LEN 512 +#define IETF_MPA_FRAME_SIZE 20 enum ietf_mpa_flags { IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ @@ -169,7 +171,7 @@ struct nes_timer_entry { #define NES_CM_DEF_SEQ2 0x18ed5740 #define NES_CM_DEF_LOCAL_ID2 0xb807 -#define MAX_CM_BUFFER 512 +#define MAX_CM_BUFFER (IETF_MPA_FRAME_SIZE + IETF_MAX_PRIV_DATA_LEN) typedef u32 nes_addr_t; From 8ac7f6e1af5309d4fdf6805fb64ef48c1c820d85 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:53:46 -0800 Subject: [PATCH 35/45] RDMA/nes: Fix query of ORD values The ORD size needs updating as we are supporting more inbound READ resources per connection. 
Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 08686523df0..67a87cb9d77 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -534,16 +534,16 @@ static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *prop props->max_sge_rd = 1; switch (nesdev->nesadapter->max_irrq_wr) { case 0: - props->max_qp_rd_atom = 1; + props->max_qp_rd_atom = 2; break; case 1: - props->max_qp_rd_atom = 4; + props->max_qp_rd_atom = 8; break; case 2: - props->max_qp_rd_atom = 16; + props->max_qp_rd_atom = 32; break; case 3: - props->max_qp_rd_atom = 32; + props->max_qp_rd_atom = 64; break; default: props->max_qp_rd_atom = 0; From 1cf078c9951b531bc222a5195306a3a927c24fc9 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:53:54 -0800 Subject: [PATCH 36/45] RDMA/nes: MPA request/response error checking During Xansation testing, we saw that error handling of MPA frame msg/response is not handled properly. 
Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 31 ++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index dbe54550533..ae0946342a9 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -251,6 +251,33 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, mpa_frame = (struct ietf_mpa_frame *)buffer; cm_node->mpa_frame_size = ntohs(mpa_frame->priv_data_len); + /* make sure mpa private data len is less than 512 bytes */ + if (cm_node->mpa_frame_size > IETF_MAX_PRIV_DATA_LEN) { + nes_debug(NES_DBG_CM, "The received Length of Private" + " Data field exceeds 512 octets\n"); + return -EINVAL; + } + /* + * make sure MPA receiver interoperate with the + * received MPA version and MPA key information + * + */ + if (mpa_frame->rev != mpa_version) { + nes_debug(NES_DBG_CM, "The received mpa version" + " can not be interoperated\n"); + return -EINVAL; + } + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) { + nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); + return -EINVAL; + } + } else { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) { + nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); + return -EINVAL; + } + } if (cm_node->mpa_frame_size + sizeof(struct ietf_mpa_frame) != len) { nes_debug(NES_DBG_CM, "The received ietf buffer was not right" @@ -1974,7 +2001,7 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, if (!cm_node) return NULL; mpa_frame = &cm_node->mpa_frame; - strcpy(mpa_frame->key, IEFT_MPA_KEY_REQ); + memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE); mpa_frame->flags = IETF_MPA_FLAGS_CRC; mpa_frame->rev = IETF_MPA_VERSION; mpa_frame->priv_data_len = htons(private_data_len); @@ -2929,7 +2956,7 @@ int 
nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) if (cm_node->mpa_frame_size > MAX_CM_BUFFER) return -EINVAL; - strcpy(&cm_node->mpa_frame.key[0], IEFT_MPA_KEY_REP); + memcpy(&cm_node->mpa_frame.key[0], IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE); if (loopback) { memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len); loopback->mpa_frame.priv_data_len = pdata_len; From 69524e1aff75e4ed8efcb7d699c97d55c317a950 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:03 -0800 Subject: [PATCH 37/45] RDMA/nes: Resource not freed for REJECTed connections During testing of REJECT connection error handling, we saw that the cm_id resources are not released. When the retransmit timer expires, we need to send a reset message to remote node before issuing the ABORTED event. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index ae0946342a9..08fcd25f788 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -513,6 +513,8 @@ static void nes_retrans_expired(struct nes_cm_node *cm_node) send_reset(cm_node, NULL); break; default: + add_ref_cm_node(cm_node); + send_reset(cm_node, NULL); create_event(cm_node, NES_CM_EVENT_ABORTED); } } From c5a7d4897156667a58fd8479f6227143573fe82d Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:08 -0800 Subject: [PATCH 38/45] RDMA/nes: Fix crash in nes_accept() While running IMP_EXT's window test, we saw a crash in nes_accept(). Here is the sequence of what happened: (1) In MVAPICH2, connect request is received for port #0. FIX: Add a nes_connect() check to make sure local or remote tcp port is not 0. (2) Remote node's (passive) TCP stack sends a reset when it gets a connect request because of port = 0. 
Active side set the connect error to IW_CM_EVENT_STATUS_REJECTED when it received the RST from remote node. FIX: The correct error code is -ECONNRESET. (3) Wrong error code of IW_CM_EVENT_STATUS_REJECTED causes the core to destroy its listener ports. Here there are connections that may have sent an MPA request up and waiting for accept or reject. But the listener and its cm_nodes have been freed already causing the crash noticed. FIX: The cm_node is freed only if its state is not NES_CM_STATE_MPAREQ_RCVD. If cm_node's state is NES_CM_STATE_MPAREQ_RCVD then its new state is set to NES_CM_STATE_LISTENER_DESTROYED and it is not freed. When nes_accept() or nes_reject() is received, its state is checked for NES_CM_STATE_LISTENER_DESTROYED and in this case the cm_node is freed and error is returned. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 77 +++++++++++++++++++----------- drivers/infiniband/hw/nes/nes_cm.h | 1 + 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 08fcd25f788..ec04786b606 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -978,6 +978,7 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, reset_entry); { struct nes_cm_node *loopback = cm_node->loopbackpartner; + enum nes_cm_node_state old_state; if (NES_CM_STATE_FIN_WAIT1 <= cm_node->state) { rem_ref_cm_node(cm_node->cm_core, cm_node); } else { @@ -989,11 +990,12 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, NES_CM_STATE_CLOSED; WARN_ON(1); } else { - cm_node->state = - NES_CM_STATE_CLOSED; - rem_ref_cm_node( - cm_node->cm_core, - cm_node); + old_state = cm_node->state; + cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; + if (old_state != NES_CM_STATE_MPAREQ_RCVD) + rem_ref_cm_node( + cm_node->cm_core, + cm_node); } } else { struct nes_cm_event event; @@ -1009,6 +1011,7 @@ static 
int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, loopback->loc_port; event.cm_info.cm_id = loopback->cm_id; cm_event_connect_error(&event); + cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; loopback->state = NES_CM_STATE_CLOSED; event.cm_node = cm_node; @@ -2131,30 +2134,39 @@ static int mini_cm_reject(struct nes_cm_core *cm_core, cm_node->state = NES_CM_STATE_CLOSED; rem_ref_cm_node(cm_core, cm_node); } else { - ret = send_mpa_reject(cm_node); - if (ret) { - cm_node->state = NES_CM_STATE_CLOSED; - err = send_reset(cm_node, NULL); - if (err) - WARN_ON(1); - } else - cm_id->add_ref(cm_id); + if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { + rem_ref_cm_node(cm_core, cm_node); + } else { + ret = send_mpa_reject(cm_node); + if (ret) { + cm_node->state = NES_CM_STATE_CLOSED; + err = send_reset(cm_node, NULL); + if (err) + WARN_ON(1); + } else + cm_id->add_ref(cm_id); + } } } else { cm_node->cm_id = NULL; - event.cm_node = loopback; - event.cm_info.rem_addr = loopback->rem_addr; - event.cm_info.loc_addr = loopback->loc_addr; - event.cm_info.rem_port = loopback->rem_port; - event.cm_info.loc_port = loopback->loc_port; - event.cm_info.cm_id = loopback->cm_id; - cm_event_mpa_reject(&event); - rem_ref_cm_node(cm_core, cm_node); - loopback->state = NES_CM_STATE_CLOSING; + if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { + rem_ref_cm_node(cm_core, cm_node); + rem_ref_cm_node(cm_core, loopback); + } else { + event.cm_node = loopback; + event.cm_info.rem_addr = loopback->rem_addr; + event.cm_info.loc_addr = loopback->loc_addr; + event.cm_info.rem_port = loopback->rem_port; + event.cm_info.loc_port = loopback->loc_port; + event.cm_info.cm_id = loopback->cm_id; + cm_event_mpa_reject(&event); + rem_ref_cm_node(cm_core, cm_node); + loopback->state = NES_CM_STATE_CLOSING; - cm_id = loopback->cm_id; - rem_ref_cm_node(cm_core, loopback); - cm_id->rem_ref(cm_id); + cm_id = loopback->cm_id; + rem_ref_cm_node(cm_core, loopback); + cm_id->rem_ref(cm_id); + } } 
return ret; @@ -2198,6 +2210,7 @@ static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_nod case NES_CM_STATE_UNKNOWN: case NES_CM_STATE_INITED: case NES_CM_STATE_CLOSED: + case NES_CM_STATE_LISTENER_DESTROYED: ret = rem_ref_cm_node(cm_core, cm_node); break; case NES_CM_STATE_TSA: @@ -2716,8 +2729,6 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct nes_pd *nespd; u64 tagged_offset; - - ibqp = nes_get_qp(cm_id->device, conn_param->qpn); if (!ibqp) return -EINVAL; @@ -2733,6 +2744,13 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) "%s\n", cm_node, nesvnic, nesvnic->netdev, nesvnic->netdev->name); + if (NES_CM_STATE_LISTENER_DESTROYED == cm_node->state) { + if (cm_node->loopbackpartner) + rem_ref_cm_node(cm_node->cm_core, cm_node->loopbackpartner); + rem_ref_cm_node(cm_node->cm_core, cm_node); + return -EINVAL; + } + /* associate the node with the QP */ nesqp->cm_node = (void *)cm_node; cm_node->nesqp = nesqp; @@ -3003,6 +3021,9 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (!nesdev) return -EINVAL; + if (!(cm_id->local_addr.sin_port) || !(cm_id->remote_addr.sin_port)) + return -EINVAL; + nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = " "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id, ntohl(nesvnic->local_ipaddr), @@ -3375,7 +3396,7 @@ static void cm_event_connect_error(struct nes_cm_event *event) nesqp->cm_id = NULL; cm_id->provider_data = NULL; cm_event.event = IW_CM_EVENT_CONNECT_REPLY; - cm_event.status = IW_CM_EVENT_STATUS_REJECTED; + cm_event.status = -ECONNRESET; cm_event.provider_data = cm_id->provider_data; cm_event.local_addr = cm_id->local_addr; cm_event.remote_addr = cm_id->remote_addr; diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 911846ae5c7..d9825fda70a 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -200,6 
+200,7 @@ enum nes_cm_node_state { NES_CM_STATE_TIME_WAIT, NES_CM_STATE_LAST_ACK, NES_CM_STATE_CLOSING, + NES_CM_STATE_LISTENER_DESTROYED, NES_CM_STATE_CLOSED }; From f9f3f1e08b4d66bfda2a0c2d49a26c80489a0725 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:14 -0800 Subject: [PATCH 39/45] RDMA/nes: Abnormal listener exit causes loopback node crash When the listener is destroyed for a loopback connection, the listener node gets a reset event. This causes a crash as the listener is not expecting a reset event. Code review of cm_event_reset() during debugging showed the cm_id ref count is incremented after calling its event handler and not before. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index ec04786b606..20e21f1a18b 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1014,18 +1014,6 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; loopback->state = NES_CM_STATE_CLOSED; - event.cm_node = cm_node; - event.cm_info.rem_addr = - cm_node->rem_addr; - event.cm_info.loc_addr = - cm_node->loc_addr; - event.cm_info.rem_port = - cm_node->rem_port; - event.cm_info.loc_port = - cm_node->loc_port; - event.cm_info.cm_id = cm_node->cm_id; - cm_event_reset(&event); - rem_ref_cm_node(cm_node->cm_core, cm_node); @@ -3440,6 +3428,8 @@ static void cm_event_reset(struct nes_cm_event *event) nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id); nesqp = cm_id->provider_data; + if (!nesqp) + return; nesqp->cm_id = NULL; /* cm_id->provider_data = NULL; */ @@ -3451,8 +3441,8 @@ static void cm_event_reset(struct nes_cm_event *event) cm_event.private_data = NULL; cm_event.private_data_len = 0; - ret = cm_id->event_handler(cm_id, &cm_event); 
cm_id->add_ref(cm_id); + ret = cm_id->event_handler(cm_id, &cm_event); atomic_inc(&cm_closes); cm_event.event = IW_CM_EVENT_CLOSE; cm_event.status = IW_CM_EVENT_STATUS_OK; From 886f98a31586fd560fe83c44ad72e3ebe62f8e2e Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:18 -0800 Subject: [PATCH 40/45] RDMA/nes: Fix Xansation test crash on cm_node ref_count While running a Xansation test, an active side node crashed. The problem started on the passive side, which generated an STtag that was 0. The passive side sent a TERMINATE instead of an MPA REJECT msg. The active side, receives TERMINATE and sends connect_err() and set the cm_node state to CLOSED. The passive side sends FIN + ACK after TERMINATE. Active side ends up in handle_ack_pkt() and send_reset(). send_reset() consumes 1 cm_node's ref_count. Because the cm_node is in CLOSED state, which means that cm_node will be destroyed after completion of the connect_err() indication, CM will crash after send_reset(). Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 20e21f1a18b..a25816812ce 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1610,6 +1610,7 @@ static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, break; case NES_CM_STATE_CLOSED: cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); send_reset(cm_node, skb); break; case NES_CM_STATE_TSA: @@ -1661,9 +1662,15 @@ static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, passive_open_err(cm_node, skb, 1); break; case NES_CM_STATE_LISTENING: + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + send_reset(cm_node, skb); + break; case NES_CM_STATE_CLOSED: 
cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); send_reset(cm_node, skb); break; case NES_CM_STATE_ESTABLISHED: @@ -1732,8 +1739,13 @@ static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, dev_kfree_skb_any(skb); break; case NES_CM_STATE_LISTENING: + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + send_reset(cm_node, skb); + break; case NES_CM_STATE_CLOSED: cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); send_reset(cm_node, skb); break; case NES_CM_STATE_LAST_ACK: @@ -2193,8 +2205,11 @@ static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_nod case NES_CM_STATE_CLOSING: ret = -1; break; - case NES_CM_STATE_MPAREJ_RCVD: case NES_CM_STATE_LISTENING: + cleanup_retrans_entry(cm_node); + send_reset(cm_node, NULL); + break; + case NES_CM_STATE_MPAREJ_RCVD: case NES_CM_STATE_UNKNOWN: case NES_CM_STATE_INITED: case NES_CM_STATE_CLOSED: From fd000e12a564bdeaec5e5a438d341d9132409f26 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:23 -0800 Subject: [PATCH 41/45] RDMA/nes: Check for zero STag STags are generated randomly but the driver does not correctly prevent a zero STag. Using STag zero is privileged and causes a user space application to fail. This change prevents the driver from trying to allocate a zero STag. 
Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 3 ++- drivers/infiniband/hw/nes/nes_verbs.c | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 9fc0273dd40..b1c2cbb88f0 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -437,11 +437,12 @@ struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) { nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]); - /* mark the usual suspect QPs and CQs as in use */ + /* mark the usual suspect QPs, MR and CQs as in use */ for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) { set_bit(u32temp, nesadapter->allocated_qps); set_bit(u32temp, nesadapter->allocated_cqs); } + set_bit(0, nesadapter->allocated_mrs); for (u32temp = 0; u32temp < 20; u32temp++) set_bit(u32temp, nesadapter->allocated_pds); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 67a87cb9d77..53dc39f4300 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2503,9 +2503,6 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, stag = stag_index << 8; stag |= driver_key; stag += (u32)stag_key; - if (stag == 0) { - stag = 1; - } iova_start = virt; /* Make the leaf PBL the root if only one PBL */ From d2fa9b26e181d1e3c3df06a57fa13b04afee0e16 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:28 -0800 Subject: [PATCH 42/45] RDMA/nes: Free kmap() resources We fail when creating many qps as kmap() fails for sq_vbase. Fix this by doing kunmap() as soon as we are done with sq_vbase. We do kunmap() in one of the locations below: (1) nes_destroy_qp() (2) nes_accept() (3) nes_connect_event We keep a flag to avoid multiple calls to kunmap(). 
Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 10 ++++++++++ drivers/infiniband/hw/nes/nes_verbs.c | 10 ++++++++-- drivers/infiniband/hw/nes/nes_verbs.h | 1 + 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index a25816812ce..b139806a966 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -2836,6 +2837,10 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cpu_to_le32(conn_param->private_data_len + sizeof(struct ietf_mpa_frame)); wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = ibmr->lkey; + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | @@ -3304,6 +3309,11 @@ static void cm_event_connected(struct nes_cm_event *event) wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0; wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0; + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + /* use the reserved spot on the WQ for the extra first WQE */ nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 53dc39f4300..64d3136e374 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1015,6 +1015,7 @@ static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl, kunmap(nesqp->page); return -ENOMEM; } + nesqp->sq_kmapped = 1; nesqp->hwqp.q2_vbase = mem; mem += 256; memset(nesqp->hwqp.q2_vbase, 0, 256); @@ -1092,7 +1093,10 @@ static inline void nes_free_qp_mem(struct nes_device *nesdev, pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); 
pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase ); nesqp->pbl_vbase = NULL; - kunmap(nesqp->page); + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } } } @@ -1501,8 +1505,10 @@ static int nes_destroy_qp(struct ib_qp *ibqp) nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index; } } - if (nesqp->pbl_pbase) + if (nesqp->pbl_pbase && nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; kunmap(nesqp->page); + } } else { /* Clean any pending completions from the cq(s) */ if (nesqp->nesscq) diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index cc7a60481e5..2df9993e0ca 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -175,5 +175,6 @@ struct nes_qp { u8 hw_iwarp_state; u8 hw_tcp_state; u8 term_flags; + u8 sq_kmapped; }; #endif /* NES_VERBS_H */ From b1190d3e0d548615ee7c38c10b5fc376a76b7afd Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:32 -0800 Subject: [PATCH 43/45] RDMA/nes: FIN during MPA startup causes timeout A FIN that is received during an MPA start up sequence causes a timeout in iwcm.c. The connection has not been completely closed so the iwcm code is waiting for resources to be cleaned up. This closes the connection so everything cleans up correctly. 
Signed-off-by: Don Wood Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index b139806a966..4acf04a03e1 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1354,13 +1354,20 @@ static void handle_fin_pkt(struct nes_cm_node *cm_node) case NES_CM_STATE_SYN_RCVD: case NES_CM_STATE_SYN_SENT: case NES_CM_STATE_ESTABLISHED: - case NES_CM_STATE_MPAREQ_SENT: case NES_CM_STATE_MPAREJ_RCVD: cm_node->tcp_cntxt.rcv_nxt++; cleanup_retrans_entry(cm_node); cm_node->state = NES_CM_STATE_LAST_ACK; send_fin(cm_node, NULL); break; + case NES_CM_STATE_MPAREQ_SENT: + create_event(cm_node, NES_CM_EVENT_ABORTED); + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + add_ref_cm_node(cm_node); + send_reset(cm_node, NULL); + break; case NES_CM_STATE_FIN_WAIT1: cm_node->tcp_cntxt.rcv_nxt++; cleanup_retrans_entry(cm_node); From 7a576dfd9ed4fd0f32bb838ce4f644af201ac7df Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 9 Dec 2009 15:54:33 -0800 Subject: [PATCH 44/45] RDMA/nes: Fix stale ARP issue When the remote node's ethernet address changes, the connection keeps trying to connect using the old address. The connection will continue failing until the driver is unloaded and loaded again (either reboot or rmmod). Fix this by checking that the NIC has the correct address before starting a connection. 
Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 37 ++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 4acf04a03e1..39468c27703 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1098,12 +1098,13 @@ static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, /** * nes_addr_resolve_neigh */ -static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip) +static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpindex) { struct rtable *rt; struct flowi fl; struct neighbour *neigh; - int rc = -1; + int rc = arpindex; + struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; memset(&fl, 0, sizeof fl); fl.nl_u.ip4_u.daddr = htonl(dst_ip); @@ -1119,6 +1120,21 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip) nes_debug(NES_DBG_CM, "Neighbor MAC address for 0x%08X" " is %pM, Gateway is 0x%08X \n", dst_ip, neigh->ha, ntohl(rt->rt_gateway)); + + if (arpindex >= 0) { + if (!memcmp(nesadapter->arp_table[arpindex].mac_addr, + neigh->ha, ETH_ALEN)){ + /* Mac address same as in nes_arp_table */ + neigh_release(neigh); + ip_rt_put(rt); + return rc; + } + + nes_manage_arp_cache(nesvnic->netdev, + nesadapter->arp_table[arpindex].mac_addr, + dst_ip, NES_ARP_DELETE); + } + nes_manage_arp_cache(nesvnic->netdev, neigh->ha, dst_ip, NES_ARP_ADD); rc = nes_arp_table(nesvnic->nesdev, dst_ip, NULL, @@ -1134,7 +1150,6 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip) return rc; } - /** * make_cm_node - create a new instance of a cm node */ @@ -1144,6 +1159,7 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, { struct nes_cm_node *cm_node; struct timespec ts; + int oldarpindex = 0; int arpindex = 0; struct nes_device *nesdev; struct nes_adapter *nesadapter; @@ -1197,17 
+1213,18 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, nesadapter = nesdev->nesadapter; cm_node->loopbackpartner = NULL; + /* get the mac addr for the remote node */ if (ipv4_is_loopback(htonl(cm_node->rem_addr))) arpindex = nes_arp_table(nesdev, ntohl(nesvnic->local_ipaddr), NULL, NES_ARP_RESOLVE); - else - arpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); + else { + oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); + arpindex = nes_addr_resolve_neigh(nesvnic, cm_info->rem_addr, oldarpindex); + + } if (arpindex < 0) { - arpindex = nes_addr_resolve_neigh(nesvnic, cm_info->rem_addr); - if (arpindex < 0) { - kfree(cm_node); - return NULL; - } + kfree(cm_node); + return NULL; } /* copy the mac addr to node context */ From 48617f862f9e58ca2a609fea6a76733aff55d672 Mon Sep 17 00:00:00 2001 From: Frank Zago Date: Tue, 15 Dec 2009 23:39:10 -0800 Subject: [PATCH 45/45] RDMA/cxgb3: Fix error paths in post_send and post_recv Always set bad_wr when an immediate error is detected. Return ENOMEM for queue full instead of EINVAL to match other drivers. 
Signed-off-by: Frank Zago Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb3/iwch_qp.c | 32 +++++++++++++++++---------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index 1cecf98829a..3eb8cecf81d 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -365,18 +365,19 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, spin_lock_irqsave(&qhp->lock, flag); if (qhp->attr.state > IWCH_QP_STATE_RTS) { spin_unlock_irqrestore(&qhp->lock, flag); - return -EINVAL; + err = -EINVAL; + goto out; } num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, qhp->wq.sq_size_log2); if (num_wrs <= 0) { spin_unlock_irqrestore(&qhp->lock, flag); - return -ENOMEM; + err = -ENOMEM; + goto out; } while (wr) { if (num_wrs == 0) { err = -ENOMEM; - *bad_wr = wr; break; } idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); @@ -428,10 +429,8 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, wr->opcode); err = -EINVAL; } - if (err) { - *bad_wr = wr; + if (err) break; - } wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; sqp->wr_id = wr->wr_id; sqp->opcode = wr2opcode(t3_wr_opcode); @@ -454,6 +453,10 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } spin_unlock_irqrestore(&qhp->lock, flag); ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + +out: + if (err) + *bad_wr = wr; return err; } @@ -471,18 +474,19 @@ int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, spin_lock_irqsave(&qhp->lock, flag); if (qhp->attr.state > IWCH_QP_STATE_RTS) { spin_unlock_irqrestore(&qhp->lock, flag); - return -EINVAL; + err = -EINVAL; + goto out; } num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr, qhp->wq.rq_size_log2) - 1; if (!wr) { spin_unlock_irqrestore(&qhp->lock, flag); - return -EINVAL; + err = -ENOMEM; + goto out; } while (wr) { if (wr->num_sge > T3_MAX_SGE) { err = -EINVAL; - *bad_wr = wr; break; } idx = 
Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); @@ -494,10 +498,10 @@ int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, err = build_zero_stag_recv(qhp, wqe, wr); else err = -ENOMEM; - if (err) { - *bad_wr = wr; + + if (err) break; - } + build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG, Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0, sizeof(struct t3_receive_wr) >> 3, T3_SOPEOP); @@ -511,6 +515,10 @@ int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, } spin_unlock_irqrestore(&qhp->lock, flag); ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + +out: + if (err) + *bad_wr = wr; return err; }