Skip to content

Commit e2c9784

Browse files
rustyrussell and Linus Torvalds
authored and committed
lguest: documentation III: Drivers
Documentation: The Drivers Signed-off-by: Rusty Russell <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent b2b47c2 commit e2c9784

File tree

6 files changed

+562
-39
lines changed

6 files changed

+562
-39
lines changed

drivers/block/lguest_blk.c

Lines changed: 157 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
1-
/* A simple block driver for lguest.
1+
/*D:400
2+
* The Guest block driver
23
*
3-
* Copyright 2006 Rusty Russell <[email protected]> IBM Corporation
4+
* This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
5+
* The mechanism is simple: we place the information about the request in the
6+
* device page, then use SEND_DMA (containing the data for a write, or an empty
7+
* "ping" DMA for a read).
8+
:*/
9+
/* Copyright 2006 Rusty Russell <[email protected]> IBM Corporation
410
*
511
* This program is free software; you can redistribute it and/or modify
612
* it under the terms of the GNU General Public License as published by
@@ -25,27 +31,50 @@
2531

2632
static char next_block_index = 'a';
2733

34+
/*D:420 Here is the structure which holds all the information we need about
35+
* each Guest block device.
36+
*
37+
* I'm sure at this stage, you're wondering "hey, where was the adventure I was
38+
* promised?" and thinking "Rusty sucks, I shall say nasty things about him on
39+
* my blog". I think Real adventures have boring bits, too, and you're in the
40+
* middle of one. But it gets better. Just not quite yet. */
2841
struct blockdev
2942
{
43+
/* The block queue infrastructure wants a spinlock: it is held while it
44+
* calls our block request function. We grab it in our interrupt
45+
* handler so the responses don't mess with new requests. */
3046
spinlock_t lock;
3147

32-
/* The disk structure for the kernel. */
48+
/* The disk structure registered with the kernel. */
3349
struct gendisk *disk;
3450

35-
/* The major number for this disk. */
51+
/* The major device number for this disk, and the interrupt. We only
52+
* really keep them here for completeness; we'd need them if we
53+
* supported device unplugging. */
3654
int major;
3755
int irq;
3856

57+
/* The physical address of this device's memory page */
3958
unsigned long phys_addr;
40-
/* The mapped block page. */
59+
/* The mapped memory page for convenient access. */
4160
struct lguest_block_page *lb_page;
4261

43-
/* We only have a single request outstanding at a time. */
62+
/* We only have a single request outstanding at a time: this is it. */
4463
struct lguest_dma dma;
4564
struct request *req;
4665
};
4766

48-
/* Jens gave me this nice helper to end all chunks of a request. */
67+
/*D:495 We originally used end_request() throughout the driver, but it turns
68+
* out that end_request() is deprecated, and doesn't actually end the request
69+
* (which seems like a good reason to deprecate it!). It simply ends the first
70+
* bio. So if we had 3 bios in a "struct request" we would do all 3,
71+
* end_request(), do 2, end_request(), do 1 and end_request(): twice as much
72+
* work as we needed to do.
73+
*
74+
* This reinforced to me that I do not understand the block layer.
75+
*
76+
* Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
77+
* request. This improved disk speed by 130%. */
4978
static void end_entire_request(struct request *req, int uptodate)
5079
{
5180
if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
@@ -55,30 +84,62 @@ static void end_entire_request(struct request *req, int uptodate)
5584
end_that_request_last(req, uptodate);
5685
}
5786

87+
/* I'm told there are only two stories in the world worth telling: love and
88+
* hate. So there used to be a love scene here like this:
89+
*
90+
* Launcher: We could make beautiful I/O together, you and I.
91+
* Guest: My, that's a big disk!
92+
*
93+
* Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
94+
95+
/*D:490 This is the interrupt handler, called when a block read or write has
96+
* been completed for us. */
5897
static irqreturn_t lgb_irq(int irq, void *_bd)
5998
{
99+
/* We handed our "struct blockdev" as the argument to request_irq(), so
100+
* it is passed through to us here. This tells us which device we're
101+
* dealing with in case we have more than one. */
60102
struct blockdev *bd = _bd;
61103
unsigned long flags;
62104

105+
/* We weren't doing anything? Strange, but could happen if we shared
106+
* interrupts (we don't!). */
63107
if (!bd->req) {
64108
pr_debug("No work!\n");
65109
return IRQ_NONE;
66110
}
67111

112+
/* Not done yet? That's equally strange. */
68113
if (!bd->lb_page->result) {
69114
pr_debug("No result!\n");
70115
return IRQ_NONE;
71116
}
72117

118+
/* We have to grab the lock before ending the request. */
73119
spin_lock_irqsave(&bd->lock, flags);
120+
/* "result" is 1 for success, 2 for failure: end_entire_request() wants
121+
* to know whether this succeeded or not. */
74122
end_entire_request(bd->req, bd->lb_page->result == 1);
123+
/* Clear out request, it's done. */
75124
bd->req = NULL;
125+
/* Reset incoming DMA for next time. */
76126
bd->dma.used_len = 0;
127+
/* Ready for more reads or writes */
77128
blk_start_queue(bd->disk->queue);
78129
spin_unlock_irqrestore(&bd->lock, flags);
130+
131+
/* The interrupt was for us, we dealt with it. */
79132
return IRQ_HANDLED;
80133
}
81134

135+
/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
136+
* each of which contains "struct bio_vec"s, each of which contains a page, an
137+
* offset and a length.
138+
*
139+
* Fortunately there are iterators to help us walk through the "struct
140+
* request". Even more fortunately, there were plenty of places to steal the
141+
* code from. We pack the "struct request" into our "struct lguest_dma" and
142+
* return the total length. */
82143
static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
83144
{
84145
unsigned int i = 0, idx, len = 0;
@@ -87,35 +148,53 @@ static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
87148
rq_for_each_bio(bio, req) {
88149
struct bio_vec *bvec;
89150
bio_for_each_segment(bvec, bio, idx) {
151+
/* We told the block layer not to give us too many. */
90152
BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
153+
/* If we had a zero-length segment, it would look like
154+
* the end of the data referred to by the "struct
155+
* lguest_dma", so make sure that doesn't happen. */
91156
BUG_ON(!bvec->bv_len);
157+
/* Convert page & offset to a physical address */
92158
dma->addr[i] = page_to_phys(bvec->bv_page)
93159
+ bvec->bv_offset;
94160
dma->len[i] = bvec->bv_len;
95161
len += bvec->bv_len;
96162
i++;
97163
}
98164
}
165+
/* If the array isn't full, we mark the end with a 0 length */
99166
if (i < LGUEST_MAX_DMA_SECTIONS)
100167
dma->len[i] = 0;
101168
return len;
102169
}
103170

171+
/* This creates an empty DMA, useful for prodding the Host without sending data
172+
* (i.e. when we want to do a read) */
104173
static void empty_dma(struct lguest_dma *dma)
105174
{
106175
dma->len[0] = 0;
107176
}
108177

178+
/*D:470 Setting up a request is fairly easy: */
109179
static void setup_req(struct blockdev *bd,
110180
int type, struct request *req, struct lguest_dma *dma)
111181
{
182+
/* The type is 1 (write) or 0 (read). */
112183
bd->lb_page->type = type;
184+
/* The sector on disk where the read or write starts. */
113185
bd->lb_page->sector = req->sector;
186+
/* The result is initialized to 0 (unfinished). */
114187
bd->lb_page->result = 0;
188+
/* The current request (so we can end it in the interrupt handler). */
115189
bd->req = req;
190+
/* The number of bytes: returned as a side-effect of req_to_dma(),
191+
* which packs the block layer's "struct request" into our "struct
192+
* lguest_dma" */
116193
bd->lb_page->bytes = req_to_dma(req, dma);
117194
}
118195

196+
/*D:450 Write is pretty straightforward: we pack the request into a "struct
197+
* lguest_dma", then use SEND_DMA to send the request. */
119198
static void do_write(struct blockdev *bd, struct request *req)
120199
{
121200
struct lguest_dma send;
@@ -126,6 +205,9 @@ static void do_write(struct blockdev *bd, struct request *req)
126205
lguest_send_dma(bd->phys_addr, &send);
127206
}
128207

208+
/* Read is similar to write, except we pack the request into our receive
209+
* "struct lguest_dma" and send through an empty DMA just to tell the Host that
210+
* there's a request pending. */
129211
static void do_read(struct blockdev *bd, struct request *req)
130212
{
131213
struct lguest_dma ping;
@@ -137,21 +219,30 @@ static void do_read(struct blockdev *bd, struct request *req)
137219
lguest_send_dma(bd->phys_addr, &ping);
138220
}
139221

222+
/*D:440 This is where requests come in: we get handed the request queue and are
223+
* expected to pull a "struct request" off it until we've finished them or
224+
* we're waiting for a reply: */
140225
static void do_lgb_request(struct request_queue *q)
141226
{
142227
struct blockdev *bd;
143228
struct request *req;
144229

145230
again:
231+
/* This sometimes returns NULL even on the very first time around. I
232+
* wonder if it's something to do with letting elves handle the request
233+
* queue... */
146234
req = elv_next_request(q);
147235
if (!req)
148236
return;
149237

238+
/* We attached the struct blockdev to the disk: get it back */
150239
bd = req->rq_disk->private_data;
151-
/* Sometimes we get repeated requests after blk_stop_queue. */
240+
/* Sometimes we get repeated requests after blk_stop_queue(), but we
241+
* can only handle one at a time. */
152242
if (bd->req)
153243
return;
154244

245+
/* We only do reads and writes: no tricky business! */
155246
if (!blk_fs_request(req)) {
156247
pr_debug("Got non-command 0x%08x\n", req->cmd_type);
157248
req->errors++;
@@ -164,20 +255,31 @@ static void do_lgb_request(struct request_queue *q)
164255
else
165256
do_read(bd, req);
166257

167-
/* Wait for interrupt to tell us it's done. */
258+
/* We've put out the request, so stop any more coming in until we get
259+
* an interrupt, which takes us to lgb_irq() to re-enable the queue. */
168260
blk_stop_queue(q);
169261
}
170262

263+
/*D:430 This is the "struct block_device_operations" we attach to the disk at
264+
* the end of lguestblk_probe(). It doesn't seem to want much. */
171265
static struct block_device_operations lguestblk_fops = {
172266
.owner = THIS_MODULE,
173267
};
174268

269+
/*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure
270+
* quite why. I do know that the IDE code sent two or three of the maintainers
271+
* insane, perhaps this is the fringe of the same disease?
272+
*
273+
* As in the console code, the probe function gets handed the generic
274+
* lguest_device from lguest_bus.c: */
175275
static int lguestblk_probe(struct lguest_device *lgdev)
176276
{
177277
struct blockdev *bd;
178278
int err;
179279
int irqflags = IRQF_SHARED;
180280

281+
/* First we allocate our own "struct blockdev" and initialize the easy
282+
* fields. */
181283
bd = kmalloc(sizeof(*bd), GFP_KERNEL);
182284
if (!bd)
183285
return -ENOMEM;
@@ -187,59 +289,100 @@ static int lguestblk_probe(struct lguest_device *lgdev)
187289
bd->req = NULL;
188290
bd->dma.used_len = 0;
189291
bd->dma.len[0] = 0;
292+
/* The descriptor in the lguest_devices array provided by the Host
293+
* gives the Guest the physical page number of the device's page. */
190294
bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
191295

296+
/* We use lguest_map() to get a pointer to the device page */
192297
bd->lb_page = lguest_map(bd->phys_addr, 1);
193298
if (!bd->lb_page) {
194299
err = -ENOMEM;
195300
goto out_free_bd;
196301
}
197302

303+
/* We need a major device number: 0 means "assign one dynamically". */
198304
bd->major = register_blkdev(0, "lguestblk");
199305
if (bd->major < 0) {
200306
err = bd->major;
201307
goto out_unmap;
202308
}
203309

310+
/* This allocates a "struct gendisk" where we pack all the information
311+
* about the disk which the rest of Linux sees. We ask for one minor
312+
* number; I do wonder if we should be asking for more. */
204313
bd->disk = alloc_disk(1);
205314
if (!bd->disk) {
206315
err = -ENOMEM;
207316
goto out_unregister_blkdev;
208317
}
209318

319+
/* Every disk needs a queue for requests to come in: we set up the
320+
* queue with a callback function (the core of our driver) and the lock
321+
* to use. */
210322
bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
211323
if (!bd->disk->queue) {
212324
err = -ENOMEM;
213325
goto out_put_disk;
214326
}
215327

216-
/* We can only handle a certain number of sg entries */
328+
/* We can only handle a certain number of pointers in our SEND_DMA
329+
* call, so we set that with blk_queue_max_hw_segments(). This is not
330+
* to be confused with blk_queue_max_phys_segments() of course! I
331+
* know, who could possibly confuse the two?
332+
*
333+
* Well, it's simple to tell them apart: this one seems to work and the
334+
* other one didn't. */
217335
blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
218-
/* Buffers must not cross page boundaries */
336+
337+
/* Due to technical limitations of our Host (and simple coding) we
338+
* can't have a single buffer which crosses a page boundary. Tell it
339+
* here. This means that our maximum request size is 16
340+
* (LGUEST_MAX_DMA_SECTIONS) pages. */
219341
blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
220342

343+
/* We name our disk: this becomes the device name when udev does its
344+
* magic thing and creates the device node, such as /dev/lgba.
345+
* next_block_index is a global which starts at 'a'. Unfortunately
346+
* this simple increment logic means that the 27th disk will be called
347+
* "/dev/lgb{". In that case, I recommend having at least 29 disks, so
348+
* your /dev directory will be balanced. */
221349
sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
350+
351+
/* We look to the device descriptor again to see if this device's
352+
* interrupts are expected to be random. If they are, we tell the irq
353+
* subsystem. At the moment this bit is always set. */
222354
if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
223355
irqflags |= IRQF_SAMPLE_RANDOM;
356+
357+
/* Now we have the name and irqflags, we can request the interrupt; we
358+
* give it the "struct blockdev" we have set up to pass to lgb_irq()
359+
* when there is an interrupt. */
224360
err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
225361
if (err)
226362
goto out_cleanup_queue;
227363

364+
/* We bind our one-entry DMA pool to the key for this block device so
365+
* the Host can reply to our requests. The key is equal to the
366+
* physical address of the device's page, which is conveniently
367+
* unique. */
228368
err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
229369
if (err)
230370
goto out_free_irq;
231371

372+
/* We finish our disk initialization and add the disk to the system. */
232373
bd->disk->major = bd->major;
233374
bd->disk->first_minor = 0;
234375
bd->disk->private_data = bd;
235376
bd->disk->fops = &lguestblk_fops;
236-
/* This is initialized to the disk size by the other end. */
377+
/* This is initialized to the disk size by the Launcher. */
237378
set_capacity(bd->disk, bd->lb_page->num_sectors);
238379
add_disk(bd->disk);
239380

240381
printk(KERN_INFO "%s: device %i at major %d\n",
241382
bd->disk->disk_name, lgdev->index, bd->major);
242383

384+
/* We don't need to keep the "struct blockdev" around, but if we ever
385+
* implemented device removal, we'd need this. */
243386
lgdev->private = bd;
244387
return 0;
245388

@@ -258,6 +401,8 @@ static int lguestblk_probe(struct lguest_device *lgdev)
258401
return err;
259402
}
260403

404+
/*D:410 The boilerplate code for registering the lguest block driver is just
405+
* like the console: */
261406
static struct lguest_driver lguestblk_drv = {
262407
.name = "lguestblk",
263408
.owner = THIS_MODULE,

0 commit comments

Comments
 (0)