1
- /* A simple block driver for lguest.
1
+ /*D:400
2
+ * The Guest block driver
2
3
*
3
- * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
4
+ * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
5
+ * The mechanism is simple: we place the information about the request in the
6
+ * device page, then use SEND_DMA (containing the data for a write, or an empty
7
+ * "ping" DMA for a read).
8
+ :*/
9
+ /* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
4
10
*
5
11
* This program is free software; you can redistribute it and/or modify
6
12
* it under the terms of the GNU General Public License as published by
25
31
26
32
/* Suffix letter used for the next disk name ("lgb%c"): it starts at 'a' and
 * is post-incremented each time lguestblk_probe() names a disk. */
static char next_block_index = 'a' ;
27
33
34
+ /*D:420 Here is the structure which holds all the information we need about
35
+ * each Guest block device.
36
+ *
37
+ * I'm sure at this stage, you're wondering "hey, where was the adventure I was
38
+ * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
39
+ * my blog". I think Real adventures have boring bits, too, and you're in the
40
+ * middle of one. But it gets better. Just not quite yet. */
28
41
struct blockdev
29
42
{
43
+ /* The block queue infrastructure wants a spinlock: it is held while it
44
+ * calls our block request function. We grab it in our interrupt
45
+ * handler so the responses don't mess with new requests. */
30
46
spinlock_t lock ;
31
47
32
- /* The disk structure for the kernel. */
48
+ /* The disk structure registered with kernel. */
33
49
struct gendisk * disk ;
34
50
35
- /* The major number for this disk. */
51
+ /* The major device number for this disk, and the interrupt. We only
52
+ * really keep them here for completeness; we'd need them if we
53
+ * supported device unplugging. */
36
54
int major ;
37
55
int irq ;
38
56
57
+ /* The physical address of this device's memory page */
39
58
unsigned long phys_addr ;
40
- /* The mapped block page. */
59
+ /* The mapped memory page for convenient acces . */
41
60
struct lguest_block_page * lb_page ;
42
61
43
- /* We only have a single request outstanding at a time. */
62
+ /* We only have a single request outstanding at a time: this is it . */
44
63
struct lguest_dma dma ;
45
64
struct request * req ;
46
65
};
47
66
48
- /* Jens gave me this nice helper to end all chunks of a request. */
67
+ /*D:495 We originally used end_request() throughout the driver, but it turns
68
+ * out that end_request() is deprecated, and doesn't actually end the request
69
+ * (which seems like a good reason to deprecate it!). It simply ends the first
70
+ * bio. So if we had 3 bios in a "struct request" we would do all 3,
71
+ * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
72
+ * work as we needed to do.
73
+ *
74
+ * This reinforced to me that I do not understand the block layer.
75
+ *
76
+ * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
77
+ * request. This improved disk speed by 130%. */
49
78
static void end_entire_request (struct request * req , int uptodate )
50
79
{
51
80
if (end_that_request_first (req , uptodate , req -> hard_nr_sectors ))
@@ -55,30 +84,62 @@ static void end_entire_request(struct request *req, int uptodate)
55
84
end_that_request_last (req , uptodate );
56
85
}
57
86
87
+ /* I'm told there are only two stories in the world worth telling: love and
88
+ * hate. So there used to be a love scene here like this:
89
+ *
90
+ * Launcher: We could make beautiful I/O together, you and I.
91
+ * Guest: My, that's a big disk!
92
+ *
93
+ * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
94
+
95
+ /*D:490 This is the interrupt handler, called when a block read or write has
96
+ * been completed for us. */
58
97
static irqreturn_t lgb_irq (int irq , void * _bd )
59
98
{
99
+ /* We handed our "struct blockdev" as the argument to request_irq(), so
100
+ * it is passed through to us here. This tells us which device we're
101
+ * dealing with in case we have more than one. */
60
102
struct blockdev * bd = _bd ;
61
103
unsigned long flags ;
62
104
105
+ /* We weren't doing anything? Strange, but could happen if we shared
106
+ * interrupts (we don't!). */
63
107
if (!bd -> req ) {
64
108
pr_debug ("No work!\n" );
65
109
return IRQ_NONE ;
66
110
}
67
111
112
+ /* Not done yet? That's equally strange. */
68
113
if (!bd -> lb_page -> result ) {
69
114
pr_debug ("No result!\n" );
70
115
return IRQ_NONE ;
71
116
}
72
117
118
+ /* We have to grab the lock before ending the request. */
73
119
spin_lock_irqsave (& bd -> lock , flags );
120
+ /* "result" is 1 for success, 2 for failure: end_entire_request() wants
121
+ * to know whether this succeeded or not. */
74
122
end_entire_request (bd -> req , bd -> lb_page -> result == 1 );
123
+ /* Clear out request, it's done. */
75
124
bd -> req = NULL ;
125
+ /* Reset incoming DMA for next time. */
76
126
bd -> dma .used_len = 0 ;
127
+ /* Ready for more reads or writes */
77
128
blk_start_queue (bd -> disk -> queue );
78
129
spin_unlock_irqrestore (& bd -> lock , flags );
130
+
131
+ /* The interrupt was for us, we dealt with it. */
79
132
return IRQ_HANDLED ;
80
133
}
81
134
135
+ /*D:480 The block layer's "struct request" contains a number of "struct bio"s,
136
+ * each of which contains "struct bio_vec"s, each of which contains a page, an
137
+ * offset and a length.
138
+ *
139
+ * Fortunately there are iterators to help us walk through the "struct
140
+ * request". Even more fortunately, there were plenty of places to steal the
141
+ * code from. We pack the "struct request" into our "struct lguest_dma" and
142
+ * return the total length. */
82
143
static unsigned int req_to_dma (struct request * req , struct lguest_dma * dma )
83
144
{
84
145
unsigned int i = 0 , idx , len = 0 ;
@@ -87,35 +148,53 @@ static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
87
148
rq_for_each_bio (bio , req ) {
88
149
struct bio_vec * bvec ;
89
150
bio_for_each_segment (bvec , bio , idx ) {
151
+ /* We told the block layer not to give us too many. */
90
152
BUG_ON (i == LGUEST_MAX_DMA_SECTIONS );
153
+ /* If we had a zero-length segment, it would look like
154
+ * the end of the data referred to by the "struct
155
+ * lguest_dma", so make sure that doesn't happen. */
91
156
BUG_ON (!bvec -> bv_len );
157
+ /* Convert page & offset to a physical address */
92
158
dma -> addr [i ] = page_to_phys (bvec -> bv_page )
93
159
+ bvec -> bv_offset ;
94
160
dma -> len [i ] = bvec -> bv_len ;
95
161
len += bvec -> bv_len ;
96
162
i ++ ;
97
163
}
98
164
}
165
+ /* If the array isn't full, we mark the end with a 0 length */
99
166
if (i < LGUEST_MAX_DMA_SECTIONS )
100
167
dma -> len [i ] = 0 ;
101
168
return len ;
102
169
}
103
170
171
+ /* This creates an empty DMA, useful for prodding the Host without sending data
172
+ * (i.e. when we want to do a read) */
104
173
static void empty_dma (struct lguest_dma * dma )
105
174
{
106
175
dma -> len [0 ] = 0 ;
107
176
}
108
177
178
+ /*D:470 Setting up a request is fairly easy: */
109
179
static void setup_req (struct blockdev * bd ,
110
180
int type , struct request * req , struct lguest_dma * dma )
111
181
{
182
+ /* The type is 1 (write) or 0 (read). */
112
183
bd -> lb_page -> type = type ;
184
+ /* The sector on disk where the read or write starts. */
113
185
bd -> lb_page -> sector = req -> sector ;
186
+ /* The result is initialized to 0 (unfinished). */
114
187
bd -> lb_page -> result = 0 ;
188
+ /* The current request (so we can end it in the interrupt handler). */
115
189
bd -> req = req ;
190
+ /* The number of bytes: returned as a side-effect of req_to_dma(),
191
+ * which packs the block layer's "struct request" into our "struct
192
+ * lguest_dma" */
116
193
bd -> lb_page -> bytes = req_to_dma (req , dma );
117
194
}
118
195
196
+ /*D:450 Write is pretty straightforward: we pack the request into a "struct
197
+ * lguest_dma", then use SEND_DMA to send the request. */
119
198
static void do_write (struct blockdev * bd , struct request * req )
120
199
{
121
200
struct lguest_dma send ;
@@ -126,6 +205,9 @@ static void do_write(struct blockdev *bd, struct request *req)
126
205
lguest_send_dma (bd -> phys_addr , & send );
127
206
}
128
207
208
+ /* Read is similar to write, except we pack the request into our receive
209
+ * "struct lguest_dma" and send through an empty DMA just to tell the Host that
210
+ * there's a request pending. */
129
211
static void do_read (struct blockdev * bd , struct request * req )
130
212
{
131
213
struct lguest_dma ping ;
@@ -137,21 +219,30 @@ static void do_read(struct blockdev *bd, struct request *req)
137
219
lguest_send_dma (bd -> phys_addr , & ping );
138
220
}
139
221
222
+ /*D:440 This is where requests come in: we get handed the request queue and are
223
+ * expected to pull a "struct request" off it until we've finished them or
224
+ * we're waiting for a reply: */
140
225
static void do_lgb_request (struct request_queue * q )
141
226
{
142
227
struct blockdev * bd ;
143
228
struct request * req ;
144
229
145
230
again :
231
+ /* This sometimes returns NULL even on the very first time around. I
232
+ * wonder if it's something to do with letting elves handle the request
233
+ * queue... */
146
234
req = elv_next_request (q );
147
235
if (!req )
148
236
return ;
149
237
238
+ /* We attached the struct blockdev to the disk: get it back */
150
239
bd = req -> rq_disk -> private_data ;
151
- /* Sometimes we get repeated requests after blk_stop_queue. */
240
+ /* Sometimes we get repeated requests after blk_stop_queue(), but we
241
+ * can only handle one at a time. */
152
242
if (bd -> req )
153
243
return ;
154
244
245
+ /* We only do reads and writes: no tricky business! */
155
246
if (!blk_fs_request (req )) {
156
247
pr_debug ("Got non-command 0x%08x\n" , req -> cmd_type );
157
248
req -> errors ++ ;
@@ -164,20 +255,31 @@ static void do_lgb_request(struct request_queue *q)
164
255
else
165
256
do_read (bd , req );
166
257
167
- /* Wait for interrupt to tell us it's done. */
258
+ /* We've put out the request, so stop any more coming in until we get
259
+ * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
168
260
blk_stop_queue (q );
169
261
}
170
262
263
+ /*D:430 This is the "struct block_device_operations" we attach to the disk at
264
+ * the end of lguestblk_probe(). It doesn't seem to want much. */
171
265
static struct block_device_operations lguestblk_fops = {
172
266
.owner = THIS_MODULE ,
173
267
};
174
268
269
+ /*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure
270
+ * quite why. I do know that the IDE code sent two or three of the maintainers
271
+ * insane, perhaps this is the fringe of the same disease?
272
+ *
273
+ * As in the console code, the probe function gets handed the generic
274
+ * lguest_device from lguest_bus.c: */
175
275
static int lguestblk_probe (struct lguest_device * lgdev )
176
276
{
177
277
struct blockdev * bd ;
178
278
int err ;
179
279
int irqflags = IRQF_SHARED ;
180
280
281
+ /* First we allocate our own "struct blockdev" and initialize the easy
282
+ * fields. */
181
283
bd = kmalloc (sizeof (* bd ), GFP_KERNEL );
182
284
if (!bd )
183
285
return - ENOMEM ;
@@ -187,59 +289,100 @@ static int lguestblk_probe(struct lguest_device *lgdev)
187
289
bd -> req = NULL ;
188
290
bd -> dma .used_len = 0 ;
189
291
bd -> dma .len [0 ] = 0 ;
292
+ /* The descriptor in the lguest_devices array provided by the Host
293
+ * gives the Guest the physical page number of the device's page. */
190
294
bd -> phys_addr = (lguest_devices [lgdev -> index ].pfn << PAGE_SHIFT );
191
295
296
+ /* We use lguest_map() to get a pointer to the device page */
192
297
bd -> lb_page = lguest_map (bd -> phys_addr , 1 );
193
298
if (!bd -> lb_page ) {
194
299
err = - ENOMEM ;
195
300
goto out_free_bd ;
196
301
}
197
302
303
+ /* We need a major device number: 0 means "assign one dynamically". */
198
304
bd -> major = register_blkdev (0 , "lguestblk" );
199
305
if (bd -> major < 0 ) {
200
306
err = bd -> major ;
201
307
goto out_unmap ;
202
308
}
203
309
310
+ /* This allocates a "struct gendisk" where we pack all the information
311
+ * about the disk which the rest of Linux sees. We ask for one minor
312
+ * number; I do wonder if we should be asking for more. */
204
313
bd -> disk = alloc_disk (1 );
205
314
if (!bd -> disk ) {
206
315
err = - ENOMEM ;
207
316
goto out_unregister_blkdev ;
208
317
}
209
318
319
+ /* Every disk needs a queue for requests to come in: we set up the
320
+ * queue with a callback function (the core of our driver) and the lock
321
+ * to use. */
210
322
bd -> disk -> queue = blk_init_queue (do_lgb_request , & bd -> lock );
211
323
if (!bd -> disk -> queue ) {
212
324
err = - ENOMEM ;
213
325
goto out_put_disk ;
214
326
}
215
327
216
- /* We can only handle a certain number of sg entries */
328
+ /* We can only handle a certain number of pointers in our SEND_DMA
329
+ * call, so we set that with blk_queue_max_hw_segments(). This is not
330
+ * to be confused with blk_queue_max_phys_segments() of course! I
331
+ * know, who could possibly confuse the two?
332
+ *
333
+ * Well, it's simple to tell them apart: this one seems to work and the
334
+ * other one didn't. */
217
335
blk_queue_max_hw_segments (bd -> disk -> queue , LGUEST_MAX_DMA_SECTIONS );
218
- /* Buffers must not cross page boundaries */
336
+
337
+ /* Due to technical limitations of our Host (and simple coding) we
338
+ * can't have a single buffer which crosses a page boundary. Tell it
339
+ * here. This means that our maximum request size is 16
340
+ * (LGUEST_MAX_DMA_SECTIONS) pages. */
219
341
blk_queue_segment_boundary (bd -> disk -> queue , PAGE_SIZE - 1 );
220
342
343
+ /* We name our disk: this becomes the device name when udev does its
344
+ * magic thing and creates the device node, such as /dev/lgba.
345
+ * next_block_index is a global which starts at 'a'. Unfortunately
346
+ * this simple increment logic means that the 27th disk will be called
347
+ * "/dev/lgb{". In that case, I recommend having at least 29 disks, so
348
+ * your /dev directory will be balanced. */
221
349
sprintf (bd -> disk -> disk_name , "lgb%c" , next_block_index ++ );
350
+
351
+ /* We look to the device descriptor again to see if this device's
352
+ * interrupts are expected to be random. If they are, we tell the irq
353
+ * subsystem. At the moment this bit is always set. */
222
354
if (lguest_devices [lgdev -> index ].features & LGUEST_DEVICE_F_RANDOMNESS )
223
355
irqflags |= IRQF_SAMPLE_RANDOM ;
356
+
357
+ /* Now we have the name and irqflags, we can request the interrupt; we
358
+ * give it the "struct blockdev" we have set up to pass to lgb_irq()
359
+ * when there is an interrupt. */
224
360
err = request_irq (bd -> irq , lgb_irq , irqflags , bd -> disk -> disk_name , bd );
225
361
if (err )
226
362
goto out_cleanup_queue ;
227
363
364
+ /* We bind our one-entry DMA pool to the key for this block device so
365
+ * the Host can reply to our requests. The key is equal to the
366
+ * physical address of the device's page, which is conveniently
367
+ * unique. */
228
368
err = lguest_bind_dma (bd -> phys_addr , & bd -> dma , 1 , bd -> irq );
229
369
if (err )
230
370
goto out_free_irq ;
231
371
372
+ /* We finish our disk initialization and add the disk to the system. */
232
373
bd -> disk -> major = bd -> major ;
233
374
bd -> disk -> first_minor = 0 ;
234
375
bd -> disk -> private_data = bd ;
235
376
bd -> disk -> fops = & lguestblk_fops ;
236
- /* This is initialized to the disk size by the other end . */
377
+ /* This is initialized to the disk size by the Launcher . */
237
378
set_capacity (bd -> disk , bd -> lb_page -> num_sectors );
238
379
add_disk (bd -> disk );
239
380
240
381
printk (KERN_INFO "%s: device %i at major %d\n" ,
241
382
bd -> disk -> disk_name , lgdev -> index , bd -> major );
242
383
384
+ /* We don't need to keep the "struct blockdev" around, but if we ever
385
+ * implemented device removal, we'd need this. */
243
386
lgdev -> private = bd ;
244
387
return 0 ;
245
388
@@ -258,6 +401,8 @@ static int lguestblk_probe(struct lguest_device *lgdev)
258
401
return err ;
259
402
}
260
403
404
+ /*D:410 The boilerplate code for registering the lguest block driver is just
405
+ * like the console: */
261
406
static struct lguest_driver lguestblk_drv = {
262
407
.name = "lguestblk" ,
263
408
.owner = THIS_MODULE ,
0 commit comments