@@ -86,50 +86,302 @@ struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
 	return ERR_PTR(err);
 }
 
+static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
+					struct mlx4_mtt *mtt,
+					u64 mtt_size,
+					u64 mtt_shift,
+					u64 len,
+					u64 cur_start_addr,
+					u64 *pages,
+					int *start_index,
+					int *npages)
+{
+	int k;
+	int err = 0;
+	u64 mtt_entries;
+	u64 cur_end_addr = cur_start_addr + len;
+	u64 cur_end_addr_aligned = 0;
+
+	len += (cur_start_addr & (mtt_size - 1ULL));
+	cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
+	len += (cur_end_addr_aligned - cur_end_addr);
+	if (len & (mtt_size - 1ULL)) {
+		WARN(1,
+		     "write_block: len %llx is not aligned to mtt_size %llx\n",
+		     len, mtt_size);
+		return -EINVAL;
+	}
+
+	mtt_entries = (len >> mtt_shift);
+
+	/*
+	 * Align the MTT start address to the mtt_size.
+	 * Required to handle cases when the MR starts in the middle of an
+	 * MTT record. Was not required in the old code since the physical
+	 * addresses provided by the dma subsystem were page aligned, which
+	 * was also the MTT size.
+	 */
+	cur_start_addr = round_down(cur_start_addr, mtt_size);
+	/* A new block is started ... */
+	for (k = 0; k < mtt_entries; ++k) {
+		pages[*npages] = cur_start_addr + (mtt_size * k);
+		(*npages)++;
+		/*
+		 * Be friendly to mlx4_write_mtt() and
+		 * pass it chunks of appropriate size.
+		 */
+		if (*npages == PAGE_SIZE / sizeof(u64)) {
+			err = mlx4_write_mtt(dev->dev,
+					     mtt, *start_index,
+					     *npages, pages);
+			if (err)
+				return err;
+
+			(*start_index) += *npages;
+			*npages = 0;
+		}
+	}
+
+	return 0;
+}
+
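/*
 * Editor's illustration, not part of this commit: a minimal, self-contained
 * sketch of the length rounding that mlx4_ib_umem_write_mtt_block() performs,
 * assuming a hypothetical 64KB MTT entry size. rnd_up()/rnd_down() stand in
 * for the kernel's round_up()/round_down().
 */
#include <stdio.h>
#include <stdint.h>

#define rnd_down(x, a)	((x) & ~((uint64_t)(a) - 1))
#define rnd_up(x, a)	rnd_down((x) + (a) - 1, (a))

int main(void)
{
	uint64_t mtt_size = 1ULL << 16;		/* hypothetical 64KB entries */
	uint64_t start = 0x13000;		/* MR starts 12KB into an entry */
	uint64_t len = 0x1d000;			/* 116KB of payload */
	uint64_t end = start + len;

	len += start & (mtt_size - 1);		/* cover the head of the first entry */
	len += rnd_up(end, mtt_size) - end;	/* cover the tail of the last entry */

	/* Prints "2 entries, first entry at 0x10000" */
	printf("%llu entries, first entry at 0x%llx\n",
	       (unsigned long long)(len >> 16),
	       (unsigned long long)rnd_down(start, mtt_size));
	return 0;
}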
 int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
 			   struct ib_umem *umem)
 {
 	u64 *pages;
-	int i, k, entry;
-	int n;
-	int len;
+	int entry;
+	u64 len = 0;
 	int err = 0;
+	u64 mtt_size;
+	u64 cur_start_addr = 0;
+	u64 mtt_shift;
+	int start_index = 0;
+	int npages = 0;
 	struct scatterlist *sg;
 
 	pages = (u64 *) __get_free_page(GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
 
-	i = n = 0;
+	mtt_shift = mtt->page_shift;
+	mtt_size = 1ULL << mtt_shift;
 
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> mtt->page_shift;
-		for (k = 0; k < len; ++k) {
-			pages[i++] = sg_dma_address(sg) +
-				umem->page_size * k;
-			/*
-			 * Be friendly to mlx4_write_mtt() and
-			 * pass it chunks of appropriate size.
-			 */
-			if (i == PAGE_SIZE / sizeof(u64)) {
-				err = mlx4_write_mtt(dev->dev, mtt, n,
-						     i, pages);
-				if (err)
-					goto out;
-				n += i;
-				i = 0;
+		if (cur_start_addr + len ==
+		    sg_dma_address(sg)) {
+			/* still the same block */
+			len += sg_dma_len(sg);
+			continue;
 		}
-	}
+		/* A new block is started ... */
+		/*
+		 * If len is misaligned, write an extra mtt entry to
+		 * cover the misaligned area (round up the division).
+		 */
+		err = mlx4_ib_umem_write_mtt_block(dev,
+						   mtt, mtt_size, mtt_shift,
+						   len, cur_start_addr,
+						   pages,
+						   &start_index,
+						   &npages);
+		if (err)
+			goto out;
+
+		cur_start_addr =
+			sg_dma_address(sg);
+		len = sg_dma_len(sg);
+	}
+
+	/* Handle the last block */
+	if (len > 0) {
+		/*
+		 * If len is misaligned, write an extra mtt entry to
+		 * cover the misaligned area (round up the division).
+		 */
+		err = mlx4_ib_umem_write_mtt_block(dev,
+						   mtt, mtt_size, mtt_shift,
+						   len, cur_start_addr,
+						   pages,
+						   &start_index,
+						   &npages);
+		if (err)
+			goto out;
 	}
 
-	if (i)
-		err = mlx4_write_mtt(dev->dev, mtt, n, i, pages);
+
+	if (npages)
+		err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);
 
 out:
 	free_page((unsigned long) pages);
 	return err;
 }
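/*
 * Editor's illustration, not part of this commit: the rewritten loop in
 * mlx4_ib_umem_write_mtt() merges physically contiguous scatterlist entries
 * into one "block" and only writes MTT entries when a discontinuity shows up,
 * plus once more for the final block. A minimal sketch over hypothetical DMA
 * segments (it skips the harmless empty flush the kernel code makes on the
 * first iteration):
 */
#include <stdio.h>
#include <stdint.h>

struct seg { uint64_t addr; uint64_t len; };

int main(void)
{
	/* Two contiguous 4KB segments, then a gap, then one more segment. */
	struct seg segs[] = {
		{ 0x100000, 0x1000 }, { 0x101000, 0x1000 }, { 0x200000, 0x1000 },
	};
	int nsegs = sizeof(segs) / sizeof(segs[0]);
	uint64_t cur_start = 0, len = 0;
	int i;

	for (i = 0; i < nsegs; i++) {
		if (cur_start + len == segs[i].addr) {
			len += segs[i].len;	/* still the same block */
			continue;
		}
		if (len)			/* a new block starts: flush the old one */
			printf("block 0x%llx len 0x%llx\n",
			       (unsigned long long)cur_start,
			       (unsigned long long)len);
		cur_start = segs[i].addr;
		len = segs[i].len;
	}
	if (len)				/* handle the last block */
		printf("block 0x%llx len 0x%llx\n",
		       (unsigned long long)cur_start,
		       (unsigned long long)len);
	return 0;	/* prints blocks 0x100000/0x2000 and 0x200000/0x1000 */
}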
 
+static inline u64 alignment_of(u64 ptr)
+{
+	return ilog2(ptr & (~(ptr - 1)));
+}
+
+static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
+				       u64 current_block_end,
+				       u64 block_shift)
+{
+	/*
+	 * Check whether the new block is aligned as well as the previous
+	 * block was: its address must start with zeros up to the entity
+	 * size.
+	 */
+	if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
+		/*
+		 * It is not as well aligned as the previous block;
+		 * reduce the mtt size accordingly. Here we take the
+		 * lowest set bit of the address.
+		 */
+		block_shift = alignment_of(next_block_start);
+
+	/*
+	 * Check whether the end of the previous block is aligned as well
+	 * as the start of the block.
+	 */
+	if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
+		/*
+		 * It is not as well aligned as the start of the block;
+		 * reduce the mtt size accordingly.
+		 */
+		block_shift = alignment_of(current_block_end);
+
+	return block_shift;
+}
+
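/*
 * Editor's illustration, not part of this commit: alignment_of() is the index
 * of the lowest set bit, i.e. the largest power-of-two alignment of an
 * address, and mlx4_ib_umem_calc_block_mtt() clamps the candidate shift to
 * the alignment of both sides of a discontinuity. A sketch with hypothetical
 * values; __builtin_ctzll() (GCC/Clang builtin) plays the role of
 * ilog2(ptr & ~(ptr - 1)) and, like the kernel helper, is not meant for 0.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t align_of(uint64_t ptr)
{
	return __builtin_ctzll(ptr);		/* index of the lowest set bit */
}

int main(void)
{
	uint64_t next_block_start = 0x230000;	/* next block starts 64KB aligned */
	uint64_t current_block_end = 0x22a000;	/* previous block ends 8KB aligned */
	uint64_t block_shift = 31;		/* hypothetical starting maximum */

	if (next_block_start & ((1ULL << block_shift) - 1))
		block_shift = align_of(next_block_start);	/* -> 16 */
	if (current_block_end & ((1ULL << block_shift) - 1))
		block_shift = align_of(current_block_end);	/* -> 13 */

	printf("block_shift = %llu\n", (unsigned long long)block_shift); /* 13 */
	return 0;
}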
+/*
+ * Calculate the optimal mtt size based on contiguous pages.
+ * The function also returns the number of pages that are not aligned to
+ * the calculated mtt_size, to be added to the total number of pages. For
+ * that we check the first chunk length and the last chunk length and, if
+ * they are not aligned to mtt_size, increment the non_aligned_pages
+ * number. All chunks in the middle are already handled as part of the
+ * mtt shift calculation for both their start and end addresses.
+ */
+static int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
+					       u64 start_va,
+					       int *num_of_mtts)
+{
+	u64 block_shift = MLX4_MAX_MTT_SHIFT;
+	u64 current_block_len = 0;
+	u64 current_block_start = 0;
+	u64 misalignment_bits;
+	u64 first_block_start = 0;
+	u64 last_block_end = 0;
+	u64 total_len = 0;
+	u64 last_block_aligned_end = 0;
+	u64 min_shift = ilog2(umem->page_size);
+	struct scatterlist *sg;
+	int i;
+	u64 next_block_start;
+	u64 current_block_end;
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+		/*
+		 * Initialization: save the first chunk start as the
+		 * current_block_start; a block means contiguous pages.
+		 */
+		if (current_block_len == 0 && current_block_start == 0) {
+			first_block_start = current_block_start =
+				sg_dma_address(sg);
+			/*
+			 * Find the bits that are different between the
+			 * physical address and the virtual address for
+			 * the start of the MR.
+			 *
+			 * umem_get aligned the start_va to a page
+			 * boundary. Therefore, we need to align the
+			 * start va to the same boundary.
+			 *
+			 * misalignment_bits is needed to handle the
+			 * case of a single memory region. In this case,
+			 * the rest of the logic will not reduce the
+			 * block size. If we use a block size which is
+			 * bigger than the alignment of the misalignment
+			 * bits, we might use the virtual page number
+			 * instead of the physical page number, resulting
+			 * in access to the wrong data.
+			 */
+			misalignment_bits =
+				(start_va & (~(((u64)(umem->page_size)) - 1ULL)))
+				^ current_block_start;
+			block_shift = min(alignment_of(misalignment_bits),
+					  block_shift);
+		}
+
+		/*
+		 * Go over the scatter entries and check whether they
+		 * continue the previous scatter entry.
+		 */
+		next_block_start =
+			sg_dma_address(sg);
+		current_block_end = current_block_start
+			+ current_block_len;
+		/* If we have a split (non-contig.) between two blocks */
+		if (current_block_end != next_block_start) {
+			block_shift = mlx4_ib_umem_calc_block_mtt(
+					next_block_start,
+					current_block_end,
+					block_shift);
+
+			/*
+			 * If we reached the minimum shift for a 4k
+			 * page, we stop the loop.
+			 */
+			if (block_shift <= min_shift)
+				goto end;
+
+			/*
+			 * If not saved yet, we are in the first block;
+			 * save the length of the first block to
+			 * calculate the non_aligned_pages number at
+			 * the end.
+			 */
+			total_len += current_block_len;
+
+			/* Start a new block */
+			current_block_start = next_block_start;
+			current_block_len =
+				sg_dma_len(sg);
+			continue;
+		}
+		/*
+		 * The scatter entry is another part of the current
+		 * block; increase the block size. An entry in the
+		 * scatter list can be larger than 4k (page) because
+		 * the dma mapping may have merged some blocks together.
+		 */
+		current_block_len +=
+			sg_dma_len(sg);
+	}
+
+	/* Account for the last block in the total len */
+	total_len += current_block_len;
+	/* Add to the first block the misalignment that it suffers from. */
+	total_len += (first_block_start & ((1ULL << block_shift) - 1ULL));
+	last_block_end = current_block_start + current_block_len;
+	last_block_aligned_end = round_up(last_block_end, 1 << block_shift);
+	total_len += (last_block_aligned_end - last_block_end);
+
+	WARN((total_len & ((1ULL << block_shift) - 1ULL)),
+	     "misaligned total length detected (%llu, %llu)!",
+	     total_len, block_shift);
+
+	*num_of_mtts = total_len >> block_shift;
+end:
+	if (block_shift < min_shift) {
+		/*
+		 * If the shift is less than the minimum, warn and
+		 * return the minimum shift.
+		 */
+		WARN(1,
+		     "mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n",
+		     block_shift);
+
+		block_shift = min_shift;
+	}
+	return block_shift;
+
+}
+
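/*
 * Editor's illustration, not part of this commit: once the scan settles on a
 * block_shift, mlx4_ib_umem_calc_optimal_mtt_size() pads total_len at the
 * head of the first block and the tail of the last block so that
 * *num_of_mtts covers whole MTT entries. A hypothetical single-block case
 * with a 16-bit shift:
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t block_shift = 16;		/* hypothetical result of the scan */
	uint64_t first_block_start = 0x13000;
	uint64_t total_len = 0x1d000;		/* sum of the block lengths */
	uint64_t last_block_end = first_block_start + total_len;
	uint64_t mask = (1ULL << block_shift) - 1;

	total_len += first_block_start & mask;				/* head padding */
	total_len += ((last_block_end + mask) & ~mask) - last_block_end;	/* tail padding */

	printf("num_of_mtts = %llu\n",
	       (unsigned long long)(total_len >> block_shift));	/* prints 2 */
	return 0;
}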
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
 				  struct ib_udata *udata)
@@ -154,7 +406,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	}
 
 	n = ib_umem_page_count(mr->umem);
-	shift = ilog2(mr->umem->page_size);
+	shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start,
+						   &n);
 
 	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
 			    convert_access(access_flags), n, shift, &mr->mmr);
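/*
 * Editor's note, not part of this commit: at this call site the MTT shift is
 * now derived from the DMA layout of the umem rather than fixed at
 * ilog2(page_size), and n is overwritten with the matching entry count. For a
 * hypothetical physically contiguous 1MB region whose virtual and physical
 * addresses are identically aligned, that can mean a single large MTT entry
 * instead of the 256 4KB entries the old code would have used.
 */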