Commit 780bc79

amluto authored and mstsirkin committed
virtio_ring: Support DMA APIs
virtio_ring currently sends the device (usually a hypervisor) physical addresses of its I/O buffers. This is okay when DMA addresses and physical addresses are the same thing, but this isn't always the case. For example, this never works on Xen guests, and it is likely to fail if a physical "virtio" device ever ends up behind an IOMMU or swiotlb.

The immediate use case for me is to enable virtio on Xen guests. For that to work, we need vring to support DMA address translation as well as a corresponding change to virtio_pci or to another driver.

Signed-off-by: Andy Lutomirski <[email protected]>
Signed-off-by: Michael S. Tsirkin <[email protected]>
1 parent d26c96c commit 780bc79
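The problem described in the commit message can be illustrated with a small sketch (not part of this patch). The helper fill_desc_addr() and its use_dma_api flag are hypothetical names used only for illustration; sg_phys() and dma_map_page() are the existing kernel helpers, and the patch routes the same choice through vring_map_one_sg() in the diff below.

/* Illustrative sketch only, not from this commit: the address a descriptor
 * should carry depends on whether the DMA API is in use. */
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static dma_addr_t fill_desc_addr(struct device *dma_dev,
				 struct scatterlist *sg,
				 bool use_dma_api)
{
	/* Old behaviour: hand the device the guest-physical address. */
	if (!use_dma_api)
		return (dma_addr_t)sg_phys(sg);

	/* New behaviour: let the DMA API translate, so the result is valid
	 * behind an IOMMU or swiotlb, or on a Xen guest. */
	return dma_map_page(dma_dev, sg_page(sg), sg->offset, sg->length,
			    DMA_TO_DEVICE);
}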

File tree: 3 files changed, +183 −36 lines


drivers/virtio/Kconfig

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ config VIRTIO_INPUT
 
 config VIRTIO_MMIO
 	tristate "Platform bus driver for memory mapped virtio devices"
-	depends on HAS_IOMEM
+	depends on HAS_IOMEM && HAS_DMA
 	select VIRTIO
 	---help---
 	 This drivers provides support for memory mapped virtio

drivers/virtio/virtio_ring.c

Lines changed: 165 additions & 35 deletions
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/hrtimer.h>
 #include <linux/kmemleak.h>
+#include <linux/dma-mapping.h>
 
 #ifdef DEBUG
 /* For development, we want to crash whenever the ring is screwed. */
@@ -54,6 +55,11 @@
 #define END_USE(vq)
 #endif
 
+struct vring_desc_state {
+	void *data;			/* Data for callback. */
+	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
+};
+
 struct vring_virtqueue {
 	struct virtqueue vq;
 
@@ -98,8 +104,8 @@ struct vring_virtqueue {
 	ktime_t last_add_time;
 #endif
 
-	/* Tokens for callbacks. */
-	void *data[];
+	/* Per-descriptor state. */
+	struct vring_desc_state desc_state[];
 };
 
 #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
@@ -128,6 +134,79 @@ static bool vring_use_dma_api(struct virtio_device *vdev)
 	return false;
 }
 
+/*
+ * The DMA ops on various arches are rather gnarly right now, and
+ * making all of the arch DMA ops work on the vring device itself
+ * is a mess. For now, we use the parent device for DMA ops.
+ */
+struct device *vring_dma_dev(const struct vring_virtqueue *vq)
+{
+	return vq->vq.vdev->dev.parent;
+}
+
+/* Map one sg entry. */
+static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
+				   struct scatterlist *sg,
+				   enum dma_data_direction direction)
+{
+	if (!vring_use_dma_api(vq->vq.vdev))
+		return (dma_addr_t)sg_phys(sg);
+
+	/*
+	 * We can't use dma_map_sg, because we don't use scatterlists in
+	 * the way it expects (we don't guarantee that the scatterlist
+	 * will exist for the lifetime of the mapping).
+	 */
+	return dma_map_page(vring_dma_dev(vq),
+			    sg_page(sg), sg->offset, sg->length,
+			    direction);
+}
+
+static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
+				   void *cpu_addr, size_t size,
+				   enum dma_data_direction direction)
+{
+	if (!vring_use_dma_api(vq->vq.vdev))
+		return (dma_addr_t)virt_to_phys(cpu_addr);
+
+	return dma_map_single(vring_dma_dev(vq),
+			      cpu_addr, size, direction);
+}
+
+static void vring_unmap_one(const struct vring_virtqueue *vq,
+			    struct vring_desc *desc)
+{
+	u16 flags;
+
+	if (!vring_use_dma_api(vq->vq.vdev))
+		return;
+
+	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+
+	if (flags & VRING_DESC_F_INDIRECT) {
+		dma_unmap_single(vring_dma_dev(vq),
+				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
+				 virtio32_to_cpu(vq->vq.vdev, desc->len),
+				 (flags & VRING_DESC_F_WRITE) ?
+				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	} else {
+		dma_unmap_page(vring_dma_dev(vq),
+			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
+			       virtio32_to_cpu(vq->vq.vdev, desc->len),
+			       (flags & VRING_DESC_F_WRITE) ?
+			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	}
+}
+
+static int vring_mapping_error(const struct vring_virtqueue *vq,
+			       dma_addr_t addr)
+{
+	if (!vring_use_dma_api(vq->vq.vdev))
+		return 0;
+
+	return dma_mapping_error(vring_dma_dev(vq), addr);
+}
+
 static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
 					 unsigned int total_sg, gfp_t gfp)
 {
@@ -161,7 +240,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	struct scatterlist *sg;
 	struct vring_desc *desc;
-	unsigned int i, n, avail, descs_used, uninitialized_var(prev);
+	unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
 	int head;
 	bool indirect;
 
@@ -201,21 +280,15 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 
 	if (desc) {
 		/* Use a single buffer which doesn't continue */
-		vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
-		vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, virt_to_phys(desc));
-		/* avoid kmemleak false positive (hidden by virt_to_phys) */
-		kmemleak_ignore(desc);
-		vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
-
+		indirect = true;
 		/* Set up rest to use this indirect table. */
 		i = 0;
 		descs_used = 1;
-		indirect = true;
 	} else {
+		indirect = false;
 		desc = vq->vring.desc;
 		i = head;
 		descs_used = total_sg;
-		indirect = false;
 	}
 
 	if (vq->vq.num_free < descs_used) {
@@ -230,22 +303,27 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 		return -ENOSPC;
 	}
 
-	/* We're about to use some buffers from the free list. */
-	vq->vq.num_free -= descs_used;
-
 	for (n = 0; n < out_sgs; n++) {
 		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
+			if (vring_mapping_error(vq, addr))
+				goto unmap_release;
+
 			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT);
-			desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg));
+			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
 			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
 			prev = i;
 			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
 		}
 	}
 	for (; n < (out_sgs + in_sgs); n++) {
 		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
+			if (vring_mapping_error(vq, addr))
+				goto unmap_release;
+
 			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
-			desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg));
+			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
 			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
 			prev = i;
 			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
@@ -254,14 +332,33 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	/* Last one doesn't continue. */
 	desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
 
+	if (indirect) {
+		/* Now that the indirect table is filled in, map it. */
+		dma_addr_t addr = vring_map_single(
+			vq, desc, total_sg * sizeof(struct vring_desc),
+			DMA_TO_DEVICE);
+		if (vring_mapping_error(vq, addr))
+			goto unmap_release;
+
+		vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
+		vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
+
+		vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
+	}
+
+	/* We're using some buffers from the free list. */
+	vq->vq.num_free -= descs_used;
+
 	/* Update free pointer */
 	if (indirect)
 		vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next);
 	else
 		vq->free_head = i;
 
-	/* Set token. */
-	vq->data[head] = data;
+	/* Store token and indirect buffer state. */
+	vq->desc_state[head].data = data;
+	if (indirect)
+		vq->desc_state[head].indir_desc = desc;
 
 	/* Put entry in available array (but don't update avail->idx until they
 	 * do sync). */
@@ -284,6 +381,24 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 		virtqueue_kick(_vq);
 
 	return 0;
+
+unmap_release:
+	err_idx = i;
+	i = head;
+
+	for (n = 0; n < total_sg; n++) {
+		if (i == err_idx)
+			break;
+		vring_unmap_one(vq, &desc[i]);
+		i = vq->vring.desc[i].next;
+	}
+
+	vq->vq.num_free += total_sg;
+
+	if (indirect)
+		kfree(desc);
+
+	return -EIO;
 }
 
 /**
@@ -454,27 +569,43 @@ EXPORT_SYMBOL_GPL(virtqueue_kick);
 
 static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
 {
-	unsigned int i;
+	unsigned int i, j;
+	u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
 
 	/* Clear data ptr. */
-	vq->data[head] = NULL;
+	vq->desc_state[head].data = NULL;
 
-	/* Put back on free list: find end */
+	/* Put back on free list: unmap first-level descriptors and find end */
 	i = head;
 
-	/* Free the indirect table */
-	if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))
-		kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, vq->vring.desc[i].addr)));
-
-	while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT)) {
+	while (vq->vring.desc[i].flags & nextflag) {
+		vring_unmap_one(vq, &vq->vring.desc[i]);
 		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
 		vq->vq.num_free++;
 	}
 
+	vring_unmap_one(vq, &vq->vring.desc[i]);
 	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
 	vq->free_head = head;
+
 	/* Plus final descriptor */
 	vq->vq.num_free++;
+
+	/* Free the indirect table, if any, now that it's unmapped. */
+	if (vq->desc_state[head].indir_desc) {
+		struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
+		u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
+
+		BUG_ON(!(vq->vring.desc[head].flags &
+			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
+		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
+
+		for (j = 0; j < len / sizeof(struct vring_desc); j++)
+			vring_unmap_one(vq, &indir_desc[j]);
+
+		kfree(vq->desc_state[head].indir_desc);
+		vq->desc_state[head].indir_desc = NULL;
+	}
 }
 
 static inline bool more_used(const struct vring_virtqueue *vq)
@@ -529,13 +660,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 		BAD_RING(vq, "id %u out of range\n", i);
 		return NULL;
 	}
-	if (unlikely(!vq->data[i])) {
+	if (unlikely(!vq->desc_state[i].data)) {
 		BAD_RING(vq, "id %u is not a head!\n", i);
 		return NULL;
 	}
 
 	/* detach_buf clears data, so grab it now. */
-	ret = vq->data[i];
+	ret = vq->desc_state[i].data;
 	detach_buf(vq, i);
 	vq->last_used_idx++;
 	/* If we expect an interrupt for the next entry, tell host
@@ -709,10 +840,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
 	START_USE(vq);
 
 	for (i = 0; i < vq->vring.num; i++) {
-		if (!vq->data[i])
+		if (!vq->desc_state[i].data)
 			continue;
 		/* detach_buf clears data, so grab it now. */
-		buf = vq->data[i];
+		buf = vq->desc_state[i].data;
 		detach_buf(vq, i);
 		vq->avail_idx_shadow--;
 		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
@@ -766,7 +897,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 		return NULL;
 	}
 
-	vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
+	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
+		     GFP_KERNEL);
 	if (!vq)
 		return NULL;
 
@@ -800,11 +932,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 
 	/* Put everything in free lists. */
 	vq->free_head = 0;
-	for (i = 0; i < num-1; i++) {
+	for (i = 0; i < num-1; i++)
 		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
-		vq->data[i] = NULL;
-	}
-	vq->data[i] = NULL;
+	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
 
 	return &vq->vq;
 }
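
For context, the driver-facing API is unchanged by this patch: drivers keep queueing buffers with virtqueue_add_sgs(), and the ring now performs (or skips) the DMA mapping internally. A rough usage sketch follows; queue_one_request(), req, and req_len are hypothetical placeholders, while sg_init_one(), virtqueue_add_sgs(), and virtqueue_kick() are the existing driver-facing calls.

/* Rough usage sketch: driver code does not change with this patch. */
#include <linux/scatterlist.h>
#include <linux/virtio.h>

static int queue_one_request(struct virtqueue *vq, void *req,
			     unsigned int req_len)
{
	struct scatterlist sg;
	struct scatterlist *sgs[] = { &sg };
	int err;

	sg_init_one(&sg, req, req_len);	/* describe one outgoing buffer */

	/* The ring maps the buffer with the DMA API, or passes the physical
	 * address through when vring_use_dma_api() is false. */
	err = virtqueue_add_sgs(vq, sgs, 1, 0, req, GFP_ATOMIC);
	if (err)
		return err;

	virtqueue_kick(vq);
	return 0;
}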

tools/virtio/linux/dma-mapping.h

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+#ifndef _LINUX_DMA_MAPPING_H
+#define _LINUX_DMA_MAPPING_H
+
+#ifdef CONFIG_HAS_DMA
+# error Virtio userspace code does not support CONFIG_HAS_DMA
+#endif
+
+#define PCI_DMA_BUS_IS_PHYS 1
+
+enum dma_data_direction {
+	DMA_BIDIRECTIONAL = 0,
+	DMA_TO_DEVICE = 1,
+	DMA_FROM_DEVICE = 2,
+	DMA_NONE = 3,
+};
+
+#endif
