
Commit 9901abf

Guy Levi authored and dledford committed
IB/mlx4: Use optimal numbers of MTT entries
Optimize device performance by assigning multiple contiguous physical pages to a single MTT. As a result, the number of MTTs is reduced, which in turn saves MTT cache misses.

Signed-off-by: Guy Levi <[email protected]>
Signed-off-by: Yishai Hadas <[email protected]>
Signed-off-by: Leon Romanovsky <[email protected]>
Signed-off-by: Doug Ledford <[email protected]>
1 parent 2b62185 commit 9901abf
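
For context on the optimization (a rough illustration, not part of the commit): with one MTT entry per 4 KB page, a physically contiguous region still consumes one entry per page, while a larger per-entry block size covers the same region with far fewer entries, and therefore fewer MTT cache misses on the HCA. The small, hypothetical user-space helper below only demonstrates that arithmetic; mtt_entries() and the sample sizes are made up for illustration.

#include <stdio.h>
#include <stdint.h>

/*
 * Hypothetical illustration: number of MTT entries needed to map "len"
 * bytes when every entry covers a block of (1 << block_shift) bytes.
 * The driver derives block_shift from the alignment of the DMA blocks
 * (see mlx4_ib_umem_calc_optimal_mtt_size() in the diff below).
 */
static uint64_t mtt_entries(uint64_t len, unsigned int block_shift)
{
	uint64_t block = 1ULL << block_shift;

	/* Round up so a partially covered block still gets an entry. */
	return (len + block - 1) >> block_shift;
}

int main(void)
{
	uint64_t len = 2ULL << 20;	/* a 2 MB physically contiguous region */

	/* One entry per 4 KB page: 512 entries. */
	printf("shift 12: %llu entries\n",
	       (unsigned long long)mtt_entries(len, 12));
	/* One entry covering the whole contiguous 2 MB block: 1 entry. */
	printf("shift 21: %llu entries\n",
	       (unsigned long long)mtt_entries(len, 21));
	return 0;
}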

File tree

1 file changed: +261 -24 lines changed
  • drivers/infiniband/hw/mlx4


drivers/infiniband/hw/mlx4/mr.c

Lines changed: 261 additions & 24 deletions
@@ -87,50 +87,287 @@ struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
 	return ERR_PTR(err);
 }
 
+enum {
+	MLX4_MAX_MTT_SHIFT = 31
+};
+
+static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
+					struct mlx4_mtt *mtt,
+					u64 mtt_size, u64 mtt_shift, u64 len,
+					u64 cur_start_addr, u64 *pages,
+					int *start_index, int *npages)
+{
+	u64 cur_end_addr = cur_start_addr + len;
+	u64 cur_end_addr_aligned = 0;
+	u64 mtt_entries;
+	int err = 0;
+	int k;
+
+	len += (cur_start_addr & (mtt_size - 1ULL));
+	cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
+	len += (cur_end_addr_aligned - cur_end_addr);
+	if (len & (mtt_size - 1ULL)) {
+		pr_warn("write_block: len %llx is not aligned to mtt_size %llx\n",
+			len, mtt_size);
+		return -EINVAL;
+	}
+
+	mtt_entries = (len >> mtt_shift);
+
+	/*
+	 * Align the MTT start address to the mtt_size.
+	 * Required to handle cases when the MR starts in the middle of an MTT
+	 * record. Was not required in old code since the physical addresses
+	 * provided by the dma subsystem were page aligned, which was also the
+	 * MTT size.
+	 */
+	cur_start_addr = round_down(cur_start_addr, mtt_size);
+	/* A new block is started ... */
+	for (k = 0; k < mtt_entries; ++k) {
+		pages[*npages] = cur_start_addr + (mtt_size * k);
+		(*npages)++;
+		/*
+		 * Be friendly to mlx4_write_mtt() and pass it chunks of
+		 * appropriate size.
+		 */
+		if (*npages == PAGE_SIZE / sizeof(u64)) {
+			err = mlx4_write_mtt(dev->dev, mtt, *start_index,
+					     *npages, pages);
+			if (err)
+				return err;
+
+			(*start_index) += *npages;
+			*npages = 0;
+		}
+	}
+
+	return 0;
+}
+
+static inline u64 alignment_of(u64 ptr)
+{
+	return ilog2(ptr & (~(ptr - 1)));
+}
+
+static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
+				       u64 current_block_end,
+				       u64 block_shift)
+{
+	/* Check whether the alignment of the new block is aligned as well as
+	 * the previous block.
+	 * Block address must start with zeros till size of entity_size.
+	 */
+	if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
+		/*
+		 * It is not as well aligned as the previous block - reduce the
+		 * mtt size accordingly. Here we take the last right bit which
+		 * is 1.
+		 */
+		block_shift = alignment_of(next_block_start);
+
+	/*
+	 * Check whether the alignment of the end of previous block - is it
+	 * aligned as well as the start of the block
+	 */
+	if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
+		/*
+		 * It is not as well aligned as the start of the block -
+		 * reduce the mtt size accordingly.
+		 */
+		block_shift = alignment_of(current_block_end);
+
+	return block_shift;
+}
+
 int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
 			   struct ib_umem *umem)
 {
 	u64 *pages;
-	int i, k, entry;
-	int n;
-	int len;
+	u64 len = 0;
 	int err = 0;
+	u64 mtt_size;
+	u64 cur_start_addr = 0;
+	u64 mtt_shift;
+	int start_index = 0;
+	int npages = 0;
 	struct scatterlist *sg;
+	int i;
 
 	pages = (u64 *) __get_free_page(GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
 
-	i = n = 0;
+	mtt_shift = mtt->page_shift;
+	mtt_size = 1ULL << mtt_shift;
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> mtt->page_shift;
-		for (k = 0; k < len; ++k) {
-			pages[i++] = sg_dma_address(sg) +
-				(k << umem->page_shift);
-			/*
-			 * Be friendly to mlx4_write_mtt() and
-			 * pass it chunks of appropriate size.
-			 */
-			if (i == PAGE_SIZE / sizeof (u64)) {
-				err = mlx4_write_mtt(dev->dev, mtt, n,
-						     i, pages);
-				if (err)
-					goto out;
-				n += i;
-				i = 0;
-			}
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+		if (cur_start_addr + len == sg_dma_address(sg)) {
+			/* still the same block */
+			len += sg_dma_len(sg);
+			continue;
 		}
+		/*
+		 * A new block is started ...
+		 * If len is malaligned, write an extra mtt entry to cover the
+		 * misaligned area (round up the division)
+		 */
+		err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size,
+						   mtt_shift, len,
+						   cur_start_addr,
+						   pages, &start_index,
+						   &npages);
+		if (err)
+			goto out;
+
+		cur_start_addr = sg_dma_address(sg);
+		len = sg_dma_len(sg);
 	}
 
-	if (i)
-		err = mlx4_write_mtt(dev->dev, mtt, n, i, pages);
+	/* Handle the last block */
+	if (len > 0) {
+		/*
+		 * If len is malaligned, write an extra mtt entry to cover
+		 * the misaligned area (round up the division)
+		 */
+		err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size,
+						   mtt_shift, len,
+						   cur_start_addr, pages,
+						   &start_index, &npages);
+		if (err)
+			goto out;
+	}
+
+	if (npages)
+		err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);
 
 out:
 	free_page((unsigned long) pages);
 	return err;
 }
 
+/*
+ * Calculate optimal mtt size based on contiguous pages.
+ * Function will return also the number of pages that are not aligned to the
+ * calculated mtt_size to be added to total number of pages. For that we should
+ * check the first chunk length & last chunk length and if not aligned to
+ * mtt_size we should increment the non_aligned_pages number. All chunks in the
+ * middle already handled as part of mtt shift calculation for both their start
+ * & end addresses.
+ */
+static int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
+					      u64 start_va,
+					      int *num_of_mtts)
+{
+	u64 block_shift = MLX4_MAX_MTT_SHIFT;
+	u64 min_shift = umem->page_shift;
+	u64 last_block_aligned_end = 0;
+	u64 current_block_start = 0;
+	u64 first_block_start = 0;
+	u64 current_block_len = 0;
+	u64 last_block_end = 0;
+	struct scatterlist *sg;
+	u64 current_block_end;
+	u64 misalignment_bits;
+	u64 next_block_start;
+	u64 total_len = 0;
+	int i;
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+		/*
+		 * Initialization - save the first chunk start as the
+		 * current_block_start - block means contiguous pages.
+		 */
+		if (current_block_len == 0 && current_block_start == 0) {
+			current_block_start = sg_dma_address(sg);
+			first_block_start = current_block_start;
+			/*
+			 * Find the bits that are different between the physical
+			 * address and the virtual address for the start of the
+			 * MR.
+			 * umem_get aligned the start_va to a page boundary.
+			 * Therefore, we need to align the start va to the same
+			 * boundary.
+			 * misalignment_bits is needed to handle the case of a
+			 * single memory region. In this case, the rest of the
+			 * logic will not reduce the block size. If we use a
+			 * block size which is bigger than the alignment of the
+			 * misalignment bits, we might use the virtual page
+			 * number instead of the physical page number, resulting
+			 * in access to the wrong data.
+			 */
+			misalignment_bits =
+			(start_va & (~(((u64)(BIT(umem->page_shift))) - 1ULL)))
+			^ current_block_start;
+			block_shift = min(alignment_of(misalignment_bits),
+					  block_shift);
+		}
+
+		/*
+		 * Go over the scatter entries and check if they continue the
+		 * previous scatter entry.
+		 */
+		next_block_start = sg_dma_address(sg);
+		current_block_end = current_block_start + current_block_len;
+		/* If we have a split (non-contig.) between two blocks */
+		if (current_block_end != next_block_start) {
+			block_shift = mlx4_ib_umem_calc_block_mtt
+					(next_block_start,
+					 current_block_end,
+					 block_shift);
+
+			/*
+			 * If we reached the minimum shift for 4k page we stop
+			 * the loop.
+			 */
+			if (block_shift <= min_shift)
+				goto end;
+
+			/*
+			 * If not saved yet we are in first block - we save the
+			 * length of first block to calculate the
+			 * non_aligned_pages number at the end.
+			 */
+			total_len += current_block_len;
+
+			/* Start a new block */
+			current_block_start = next_block_start;
+			current_block_len = sg_dma_len(sg);
+			continue;
+		}
+		/* The scatter entry is another part of the current block,
+		 * increase the block size.
+		 * An entry in the scatter can be larger than 4k (page) as of
+		 * dma mapping which merge some blocks together.
+		 */
+		current_block_len += sg_dma_len(sg);
+	}
+
+	/* Account for the last block in the total len */
+	total_len += current_block_len;
+	/* Add to the first block the misalignment that it suffers from. */
+	total_len += (first_block_start & ((1ULL << block_shift) - 1ULL));
+	last_block_end = current_block_start + current_block_len;
+	last_block_aligned_end = round_up(last_block_end, 1 << block_shift);
+	total_len += (last_block_aligned_end - last_block_end);
+
+	if (total_len & ((1ULL << block_shift) - 1ULL))
+		pr_warn("misaligned total length detected (%llu, %llu)!",
+			total_len, block_shift);
+
+	*num_of_mtts = total_len >> block_shift;
+end:
+	if (block_shift < min_shift) {
+		/*
+		 * If shift is less than the min we set a warning and return the
+		 * min shift.
+		 */
+		pr_warn("umem_calc_optimal_mtt_size - unexpected shift %lld\n", block_shift);
+
+		block_shift = min_shift;
+	}
+	return block_shift;
+}
+
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
 				  struct ib_udata *udata)
@@ -155,7 +392,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	}
 
 	n = ib_umem_page_count(mr->umem);
-	shift = mr->umem->page_shift;
+	shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n);
 
 	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
 			    convert_access(access_flags), n, shift, &mr->mmr);
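
To make the shift selection in mlx4_ib_umem_calc_optimal_mtt_size() concrete: the function starts from MLX4_MAX_MTT_SHIFT and lowers the block shift to the alignment of every block boundary it meets (and of the VA/PA misalignment bits), so the final MTT block size is the largest power of two that all boundaries are aligned to. The stand-alone sketch below uses made-up DMA addresses and a portable reimplementation of the driver's alignment_of() helper purely to demonstrate that selection; it is not driver code.

#include <stdio.h>
#include <stdint.h>

/* Index of the lowest set bit, mirroring the driver's alignment_of(). */
static unsigned int alignment_of(uint64_t v)
{
	unsigned int shift = 0;

	v &= ~v + 1;	/* isolate the lowest set bit, like ptr & ~(ptr - 1) */
	while (v > 1) {
		v >>= 1;
		shift++;
	}
	return shift;
}

int main(void)
{
	/* Hypothetical boundaries of contiguous DMA runs (starts and ends). */
	uint64_t boundaries[] = {
		0x100200000ULL,	/* aligned to 2 MB */
		0x100600000ULL,	/* aligned to 2 MB */
		0x100640000ULL,	/* aligned only to 256 KB */
	};
	unsigned int shift = 31;	/* MLX4_MAX_MTT_SHIFT as the upper bound */
	unsigned int i;

	for (i = 0; i < sizeof(boundaries) / sizeof(boundaries[0]); i++) {
		unsigned int a = alignment_of(boundaries[i]);

		if (a < shift)
			shift = a;	/* each boundary can only lower the shift */
	}
	/* Prints 256 KB: the largest block size all three boundaries allow. */
	printf("optimal MTT block: %llu KB\n", (1ULL << shift) / 1024);
	return 0;
}

In the driver, the resulting shift is then passed to mlx4_mr_alloc() together with the recomputed number of MTTs, which is what the last hunk of the diff changes.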

0 commit comments
