
Commit f2eb345

yishaih authored and Mukesh Kacker committed
mlx4_ib: Use optimal numbers of MTT entries.
Auto recognition of contiguous physical pages. Reduce the number of
MTTs in the MPT based on contiguous pages, taking into account
alignment differences between the virtual and physical addresses.

Signed-off-by: Yishai Hadas <[email protected]>
Signed-off-by: Vladimir Sokolovsky <[email protected]>

(Ported from Mellanox OFED 2.4)

Signed-off-by: Mukesh Kacker <[email protected]>
1 parent 1a3fdcc commit f2eb345
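
The idea behind the change, sketched as standalone C for illustration (the struct chunk type and the sample addresses below are invented for this sketch, not part of the commit): walk the DMA chunk list, merge chunks that are physically contiguous, round each merged block out to whole MTT entities, and count how many entries of a given entity size are needed.

/* Illustrative userspace sketch (not kernel code): merge physically
 * contiguous chunks and count the MTT entries needed for a given
 * entry size.
 */
#include <stdint.h>
#include <stdio.h>

struct chunk { uint64_t addr; uint64_t len; };

static uint64_t count_mtts(const struct chunk *c, int n, uint64_t mtt_size)
{
        uint64_t entries = 0;
        int i = 0;

        while (i < n) {
                uint64_t start = c[i].addr;
                uint64_t end = start + c[i].len;

                /* absorb chunks that continue the current block */
                while (i + 1 < n && c[i + 1].addr == end) {
                        end += c[i + 1].len;
                        i++;
                }
                /* round the block out to whole mtt_size entities */
                start &= ~(mtt_size - 1);
                end = (end + mtt_size - 1) & ~(mtt_size - 1);
                entries += (end - start) / mtt_size;
                i++;
        }
        return entries;
}

int main(void)
{
        /* two 4 KiB pages that happen to be physically contiguous,
         * then a third page elsewhere */
        struct chunk c[] = {
                { 0x100000, 4096 }, { 0x101000, 4096 }, { 0x200000, 4096 },
        };

        /* 3 entries with 4 KiB MTTs, but only 2 with 8 KiB MTTs */
        printf("%llu\n", (unsigned long long)count_mtts(c, 3, 4096));
        printf("%llu\n", (unsigned long long)count_mtts(c, 3, 8192));
        return 0;
}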

File tree

drivers/infiniband/hw/mlx4/mr.c
include/linux/mlx4/device.h

2 files changed: +280 -23 lines changed

drivers/infiniband/hw/mlx4/mr.c

Lines changed: 276 additions & 23 deletions
@@ -86,50 +86,302 @@ struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
 	return ERR_PTR(err);
 }
 
+static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
+					struct mlx4_mtt *mtt,
+					u64 mtt_size,
+					u64 mtt_shift,
+					u64 len,
+					u64 cur_start_addr,
+					u64 *pages,
+					int *start_index,
+					int *npages)
+{
+	int k;
+	int err = 0;
+	u64 mtt_entries;
+	u64 cur_end_addr = cur_start_addr + len;
+	u64 cur_end_addr_aligned = 0;
+
+	len += (cur_start_addr & (mtt_size - 1ULL));
+	cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
+	len += (cur_end_addr_aligned - cur_end_addr);
+	if (len & (mtt_size - 1ULL)) {
+		WARN(1,
+		     "write_block: len %llx is not aligned to mtt_size %llx\n",
+		     len, mtt_size);
+		return -EINVAL;
+	}
+
+	mtt_entries = (len >> mtt_shift);
+
+	/* Align the MTT start address to the mtt_size.
+	 * Required to handle cases when the MR starts in the middle of an
+	 * MTT record. Was not required in old code since the physical
+	 * addresses provided by the dma subsystem were page aligned,
+	 * which was also the MTT size.
+	 */
+	cur_start_addr = round_down(cur_start_addr, mtt_size);
+	/* A new block is started ... */
+	for (k = 0; k < mtt_entries; ++k) {
+		pages[*npages] = cur_start_addr + (mtt_size * k);
+		(*npages)++;
+		/*
+		 * Be friendly to mlx4_write_mtt() and
+		 * pass it chunks of appropriate size.
+		 */
+		if (*npages == PAGE_SIZE / sizeof(u64)) {
+			err = mlx4_write_mtt(dev->dev,
+					     mtt, *start_index,
+					     *npages, pages);
+			if (err)
+				return err;
+
+			(*start_index) += *npages;
+			*npages = 0;
+		}
+	}
+
+	return 0;
+}
+
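The head and tail rounding at the top of mlx4_ib_umem_write_mtt_block() is easiest to check with concrete numbers. A minimal standalone sketch, with invented values: a block that starts 0x1800 bytes into a 64 KiB MTT entity and ends 0x1800 bytes short of the next boundary still consumes two whole entities.

/* Hedged sketch of the head/tail rounding above, sample values invented. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t mtt_size = 1ULL << 16;           /* 64 KiB MTT entity */
        uint64_t start = 0x101800, len = 0x1D000; /* block ends at 0x11E800 */
        uint64_t end = start + len;

        len += start & (mtt_size - 1);                          /* head */
        len += ((end + mtt_size - 1) & ~(mtt_size - 1)) - end;  /* tail */

        printf("entries = %llu\n", (unsigned long long)(len >> 16)); /* 2 */
        return 0;
}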
 int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
 			   struct ib_umem *umem)
 {
 	u64 *pages;
-	int i, k, entry;
-	int n;
-	int len;
+	int entry;
+	u64 len = 0;
 	int err = 0;
+	u64 mtt_size;
+	u64 cur_start_addr = 0;
+	u64 mtt_shift;
+	int start_index = 0;
+	int npages = 0;
 	struct scatterlist *sg;
 
 	pages = (u64 *) __get_free_page(GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
 
-	i = n = 0;
+	mtt_shift = mtt->page_shift;
+	mtt_size = 1ULL << mtt_shift;
 
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> mtt->page_shift;
-		for (k = 0; k < len; ++k) {
-			pages[i++] = sg_dma_address(sg) +
-				umem->page_size * k;
-			/*
-			 * Be friendly to mlx4_write_mtt() and
-			 * pass it chunks of appropriate size.
-			 */
-			if (i == PAGE_SIZE / sizeof (u64)) {
-				err = mlx4_write_mtt(dev->dev, mtt, n,
-						     i, pages);
-				if (err)
-					goto out;
-				n += i;
-				i = 0;
-			}
+		if (cur_start_addr + len == sg_dma_address(sg)) {
+			/* still the same block */
+			len += sg_dma_len(sg);
+			continue;
 		}
-	}
+		/* A new block is started ... */
+		/* If len is misaligned, write an extra mtt entry to cover
+		 * the misaligned area (round up the division).
+		 */
+		err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size,
+						   mtt_shift, len,
+						   cur_start_addr, pages,
+						   &start_index, &npages);
+		if (err)
+			goto out;
+
+		cur_start_addr = sg_dma_address(sg);
+		len = sg_dma_len(sg);
+	}
+
+	/* Handle the last block */
+	if (len > 0) {
+		/* If len is misaligned, write an extra mtt entry to cover
+		 * the misaligned area (round up the division).
+		 */
+		err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size,
+						   mtt_shift, len,
+						   cur_start_addr, pages,
+						   &start_index, &npages);
+		if (err)
+			goto out;
 	}
 
-	if (i)
-		err = mlx4_write_mtt(dev->dev, mtt, n, i, pages);
+	if (npages)
+		err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);
 
 out:
 	free_page((unsigned long) pages);
 	return err;
 }
 
+static inline u64 alignment_of(u64 ptr)
+{
+	return ilog2(ptr & (~(ptr - 1)));
+}
+
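What alignment_of() computes: ptr & ~(ptr - 1) isolates the lowest set bit of ptr, so ilog2() of that value is the count of trailing zero bits, i.e. the largest power-of-two alignment of the address. (For ptr == 0 the result is ill-defined, since ilog2(0) is undefined; the callers below would only hit that if an address or address XOR came out exactly zero.) A standalone illustration with invented sample values:

/* Hedged userspace re-implementation of alignment_of() for illustration. */
#include <stdint.h>
#include <stdio.h>

static uint64_t alignment_of(uint64_t ptr)
{
        uint64_t lowbit = ptr & (~(ptr - 1)); /* lowest set bit */
        uint64_t shift = 0;

        while (lowbit >>= 1) /* ilog2 by shifting; 0 yields 0 here */
                shift++;
        return shift;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)alignment_of(0x3000));  /* 12 */
        printf("%llu\n", (unsigned long long)alignment_of(0x10000)); /* 16 */
        return 0;
}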
+static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
+				       u64 current_block_end,
+				       u64 block_shift)
+{
+	/* Check whether the new block is as well aligned as the previous
+	 * one. A block's address must start with zeros up to the entity
+	 * size.
+	 */
+	if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
+		/* It is not as well aligned as the previous block -
+		 * reduce the mtt size accordingly. Here we take the
+		 * last right bit which is 1.
+		 */
+		block_shift = alignment_of(next_block_start);
+
+	/* Check whether the end of the previous block is as well aligned
+	 * as the start of the new one.
+	 */
+	if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
+		/* It is not as well aligned as the start of the block -
+		 * reduce the mtt size accordingly.
+		 */
+		block_shift = alignment_of(current_block_end);
+
+	return block_shift;
+}
+
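A worked example with invented numbers: suppose block_shift is still at its initial 31 when a gap is found where the previous block ends at 0x108000 and the next begins at 0x1200000. The first check fires and reduces block_shift to alignment_of(0x1200000) = 21; the second check then finds 0x108000 misaligned at 2 MiB granularity and reduces it further to alignment_of(0x108000) = 15, so MTT entities for this MR can be at most 32 KiB.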
+/* Calculate optimal mtt size based on contiguous pages.
+ * The function also returns the number of pages that are not aligned to
+ * the calculated mtt_size, to be added to the total number of pages. For
+ * that we check the first and last chunk lengths: if either is not
+ * aligned to mtt_size, we increment the non_aligned_pages count.
+ * All chunks in the middle are already handled as part of the mtt shift
+ * calculation for both their start and end addresses.
+ */
+static int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
+					      u64 start_va,
+					      int *num_of_mtts)
+{
+	u64 block_shift = MLX4_MAX_MTT_SHIFT;
+	u64 current_block_len = 0;
+	u64 current_block_start = 0;
+	u64 misalignment_bits;
+	u64 first_block_start = 0;
+	u64 last_block_end = 0;
+	u64 total_len = 0;
+	u64 last_block_aligned_end = 0;
+	u64 min_shift = ilog2(umem->page_size);
+	struct scatterlist *sg;
+	int i;
+	u64 next_block_start;
+	u64 current_block_end;
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+		/* Initialization - save the first chunk start as the
+		 * current_block_start - a block means contiguous pages.
+		 */
+		if (current_block_len == 0 && current_block_start == 0) {
+			first_block_start = current_block_start =
+				sg_dma_address(sg);
+			/* Find the bits that are different between the
+			 * physical address and the virtual address for the
+			 * start of the MR.
+			 */
+			/* umem_get aligned the start_va to a page boundary.
+			 * Therefore, we need to align the start va to the
+			 * same boundary.
+			 */
+			/* misalignment_bits is needed to handle the case of
+			 * a single memory region. In this case, the rest of
+			 * the logic will not reduce the block size. If we
+			 * use a block size which is bigger than the
+			 * alignment of the misalignment bits, we might use
+			 * the virtual page number instead of the physical
+			 * page number, resulting in access to the wrong
+			 * data.
+			 */
+			misalignment_bits =
+				(start_va & (~(((u64)(umem->page_size)) - 1ULL)))
+				^ current_block_start;
+			block_shift = min(alignment_of(misalignment_bits),
+					  block_shift);
+		}
+
+		/* Go over the scatter entries and check whether they
+		 * continue the previous scatter entry.
+		 */
+		next_block_start = sg_dma_address(sg);
+		current_block_end = current_block_start + current_block_len;
+		/* If we have a split (non-contiguous) between two blocks */
+		if (current_block_end != next_block_start) {
+			block_shift = mlx4_ib_umem_calc_block_mtt(
+					next_block_start,
+					current_block_end,
+					block_shift);
+
+			/* If we reached the minimum shift for a 4k page,
+			 * we stop the loop.
+			 */
+			if (block_shift <= min_shift)
+				goto end;
+
+			/* Save the length of the block that just ended; it
+			 * feeds the non_aligned_pages calculation at the
+			 * end.
+			 */
+			total_len += current_block_len;
+
+			/* Start a new block */
+			current_block_start = next_block_start;
+			current_block_len = sg_dma_len(sg);
+			continue;
+		}
+		/* The scatter entry is another part of the current block:
+		 * increase the block size. An entry in the scatterlist can
+		 * be larger than 4k (a page) because the dma mapping may
+		 * merge blocks together.
+		 */
+		current_block_len += sg_dma_len(sg);
+	}
+
+	/* Account for the last block in the total len */
+	total_len += current_block_len;
+	/* Add to the first block the misalignment that it suffers from. */
+	total_len += (first_block_start & ((1ULL << block_shift) - 1ULL));
+	last_block_end = current_block_start + current_block_len;
+	last_block_aligned_end = round_up(last_block_end, 1 << block_shift);
+	total_len += (last_block_aligned_end - last_block_end);
+
+	WARN((total_len & ((1ULL << block_shift) - 1ULL)),
+	     "misaligned total length detected (%llu, %llu)!",
+	     total_len, block_shift);
+
+	*num_of_mtts = total_len >> block_shift;
+end:
+	if (block_shift < min_shift) {
+		/* If the shift is less than the minimum, WARN and return
+		 * the minimum shift.
+		 */
+		WARN(1,
+		     "mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n",
+		     block_shift);
+
+		block_shift = min_shift;
+	}
+	return block_shift;
+}
+
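A worked example of the whole calculation, with invented values: two 16 KiB chunks at DMA addresses 0x40000000 and 0x40010000, mapped at a virtual address that agrees with the physical start in its low 31 bits. The misalignment check then leaves block_shift at the MLX4_MAX_MTT_SHIFT ceiling of 31; the gap between the chunks reduces it via mlx4_ib_umem_calc_block_mtt() first to alignment_of(0x40010000) = 16 and then to alignment_of(0x40004000) = 14. Both 16 KiB blocks are already multiples of 2^14, so total_len = 0x8000 and *num_of_mtts = 0x8000 >> 14 = 2: two 16 KiB MTT entries instead of the eight 4 KiB entries the old per-page code would have used.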
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
 				  struct ib_udata *udata)
@@ -154,7 +406,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	}
 
 	n = ib_umem_page_count(mr->umem);
-	shift = ilog2(mr->umem->page_size);
+	shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start,
+						   &n);
 
 	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
 			    convert_access(access_flags), n, shift, &mr->mmr);
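
Note the division of labor at this call site: n is seeded with ib_umem_page_count() before the call, and mlx4_ib_umem_calc_optimal_mtt_size() only overwrites it on the path that computes an optimized count. On the early-exit path (block_shift already driven down to the page shift), *num_of_mtts is left untouched, so the plain page count stays in effect, matching the old behavior of shift = ilog2(mr->umem->page_size).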

include/linux/mlx4/device.h

Lines changed: 4 additions & 0 deletions
@@ -383,6 +383,10 @@ enum {
 	MLX4_MTT_FLAG_PRESENT	= 1
 };
 
+enum {
+	MLX4_MAX_MTT_SHIFT	= 31
+};
+
 enum mlx4_qp_region {
 	MLX4_QP_REGION_FW = 0,
 	MLX4_QP_REGION_RSS_RAW_ETH,
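
MLX4_MAX_MTT_SHIFT = 31 caps a single MTT entity at 2^31 bytes (2 GiB). mlx4_ib_umem_calc_optimal_mtt_size() starts from this ceiling and only lowers the shift as address misalignments are discovered, so a fully contiguous, well-aligned region can be mapped with very few MTT entries.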
