52 | 52 | #endif
53 | 53 | #include <linux/string.h>
54 | 54 | #include <linux/skbuff.h>
| 55 | +#include <linux/splice.h>
55 | 56 | #include <linux/cache.h>
56 | 57 | #include <linux/rtnetlink.h>
57 | 58 | #include <linux/init.h>

71 | 72 | static struct kmem_cache *skbuff_head_cache __read_mostly;
72 | 73 | static struct kmem_cache *skbuff_fclone_cache __read_mostly;
73 | 74 |
| 75 | +static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
| 76 | +                                  struct pipe_buffer *buf)
| 77 | +{
| 78 | +        struct sk_buff *skb = (struct sk_buff *) buf->private;
| 79 | +
| 80 | +        kfree_skb(skb);
| 81 | +}
| 82 | +
| 83 | +static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
| 84 | +                              struct pipe_buffer *buf)
| 85 | +{
| 86 | +        struct sk_buff *skb = (struct sk_buff *) buf->private;
| 87 | +
| 88 | +        skb_get(skb);
| 89 | +}
| 90 | +
| 91 | +static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
| 92 | +                               struct pipe_buffer *buf)
| 93 | +{
| 94 | +        return 1;
| 95 | +}
| 96 | +
| 97 | +
| 98 | +/* Pipe buffer operations for a socket. */
| 99 | +static struct pipe_buf_operations sock_pipe_buf_ops = {
| 100 | +        .can_merge = 0,
| 101 | +        .map = generic_pipe_buf_map,
| 102 | +        .unmap = generic_pipe_buf_unmap,
| 103 | +        .confirm = generic_pipe_buf_confirm,
| 104 | +        .release = sock_pipe_buf_release,
| 105 | +        .steal = sock_pipe_buf_steal,
| 106 | +        .get = sock_pipe_buf_get,
| 107 | +};
| 108 | +
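These callbacks tie each pipe buffer's lifetime to the skb that owns its page: spd_fill_page() further down stashes the skb pointer in the buffer's private field, and the get/release hooks then translate pipe-side reference counting into skb_get()/kfree_skb(). A simplified sketch of how splice_to_pipe() turns one filled spd slot into a pipe_buffer bound to these ops; the helper function is invented for illustration, and the pipe_buffer fields follow linux/pipe_fs_i.h of this era:

static void wire_one_buf(struct pipe_buffer *buf,
                         const struct splice_pipe_desc *spd, int i)
{
        buf->page = spd->pages[i];
        buf->offset = spd->partial[i].offset;
        buf->len = spd->partial[i].len;
        buf->private = spd->partial[i].private;  /* the owning skb */
        buf->ops = spd->ops;                     /* &sock_pipe_buf_ops */
}

Note that sock_pipe_buf_steal() returning 1 refuses page stealing: the pages belong to the skb, so the pipe side may read them but never take ownership.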
74 | 109 | /*
75 | 110 |  * Keep out-of-line to prevent kernel bloat.
76 | 111 |  * __builtin_return_address is not used because it is not always

@@ -1122,6 +1157,217 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)

1122 | 1157 |         return -EFAULT;
1123 | 1158 | }
1124 | 1159 |
| 1160 | +/*
| 1161 | + * Callback from splice_to_pipe(), if we need to release some pages
| 1162 | + * at the end of the spd in case we errored out in filling the pipe.
| 1163 | + */
| 1164 | +static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
| 1165 | +{
| 1166 | +        struct sk_buff *skb = (struct sk_buff *) spd->partial[i].private;
| 1167 | +
| 1168 | +        kfree_skb(skb);
| 1169 | +}
| 1170 | +
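sock_spd_release() is the error-path counterpart of the skb_get() taken for every page in spd_fill_page() below: if splice_to_pipe() runs out of pipe room before consuming all spd slots, it invokes spd_release() on each leftover slot so that no skb reference leaks. Roughly like this sketch of the splice-side cleanup (not the actual splice code):

static void spd_release_leftovers(struct splice_pipe_desc *spd, int consumed)
{
        /*
         * every slot that never became a pipe_buffer drops its skb
         * reference through the callback, one kfree_skb() each
         */
        while (consumed < spd->nr_pages)
                spd->spd_release(spd, consumed++);
}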
| 1171 | +/*
| 1172 | + * Fill page/offset/length into spd, if it can hold more pages.
| 1173 | + */
| 1174 | +static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
| 1175 | +                                unsigned int len, unsigned int offset,
| 1176 | +                                struct sk_buff *skb)
| 1177 | +{
| 1178 | +        if (unlikely(spd->nr_pages == PIPE_BUFFERS))
| 1179 | +                return 1;
| 1180 | +
| 1181 | +        spd->pages[spd->nr_pages] = page;
| 1182 | +        spd->partial[spd->nr_pages].len = len;
| 1183 | +        spd->partial[spd->nr_pages].offset = offset;
| 1184 | +        spd->partial[spd->nr_pages].private = (unsigned long) skb_get(skb);
| 1185 | +        spd->nr_pages++;
| 1186 | +        return 0;
| 1187 | +}
| 1188 | +
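For orientation, the descriptor being filled looked roughly like this at the time of the patch (reconstructed from memory of include/linux/splice.h and linux/pipe_fs_i.h; treat the field order as approximate, and PIPE_BUFFERS was 16):

struct partial_page {
        unsigned int offset;            /* offset into the page */
        unsigned int len;               /* valid bytes at offset */
        unsigned long private;          /* here: the skb owning the page */
};

struct splice_pipe_desc {
        struct page **pages;            /* page map */
        struct partial_page *partial;   /* pages[] may not be contiguous */
        int nr_pages;                   /* number of pages in map */
        unsigned int flags;             /* splice flags */
        const struct pipe_buf_operations *ops;  /* ops for the pipe buffers */
        void (*spd_release)(struct splice_pipe_desc *, unsigned int);
};

Note the skb_get() on every successful fill: each of the up to PIPE_BUFFERS buffers holds its own skb reference, matched one-for-one by the kfree_skb() in sock_pipe_buf_release() on the normal path or in sock_spd_release() on the error path.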
| 1189 | +/*
| 1190 | + * Map linear and fragment data from the skb to spd. Returns 0 if it
| 1191 | + * mapped at least one page, 1 if nothing could be mapped.
| 1192 | + */
| 1193 | +static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
| 1194 | +                             unsigned int *total_len,
| 1195 | +                             struct splice_pipe_desc *spd)
| 1196 | +{
| 1197 | +        unsigned int nr_pages = spd->nr_pages;
| 1198 | +        unsigned int poff, plen, len, toff, tlen;
| 1199 | +        int headlen, seg;
| 1200 | +
| 1201 | +        toff = *offset;
| 1202 | +        tlen = *total_len;
| 1203 | +        if (!tlen)
| 1204 | +                goto err;
| 1205 | +
| 1206 | +        /*
| 1207 | +         * if the offset is at or beyond the end of the linear part, go
| 1208 | +         * directly to the fragments.
| 1209 | +         */
| 1210 | +        headlen = skb_headlen(skb);
| 1211 | +        if (toff >= headlen) {
| 1212 | +                toff -= headlen;
| 1213 | +                goto map_frag;
| 1214 | +        }
| 1215 | +
| 1216 | +        /*
| 1217 | +         * first map the linear region into the pages/partial map, skipping
| 1218 | +         * any potential initial offset.
| 1219 | +         */
| 1220 | +        len = 0;
| 1221 | +        while (len < headlen) {
| 1222 | +                void *p = skb->data + len;
| 1223 | +
| 1224 | +                poff = (unsigned long) p & (PAGE_SIZE - 1);
| 1225 | +                plen = min_t(unsigned int, headlen - len, PAGE_SIZE - poff);
| 1226 | +                len += plen;
| 1227 | +
| 1228 | +                if (toff) {
| 1229 | +                        if (plen <= toff) {
| 1230 | +                                toff -= plen;
| 1231 | +                                continue;
| 1232 | +                        }
| 1233 | +                        plen -= toff;
| 1234 | +                        poff += toff;
| 1235 | +                        toff = 0;
| 1236 | +                }
| 1237 | +
| 1238 | +                plen = min(plen, tlen);
| 1239 | +                if (!plen)
| 1240 | +                        break;
| 1241 | +
| 1242 | +                /*
| 1243 | +                 * just jump directly to update and return, no point
| 1244 | +                 * in going over fragments when the output is full.
| 1245 | +                 */
| 1246 | +                if (spd_fill_page(spd, virt_to_page(p), plen, poff, skb))
| 1247 | +                        goto done;
| 1248 | +
| 1249 | +                tlen -= plen;
| 1250 | +        }
| 1251 | +
| 1252 | +        /*
| 1253 | +         * then map the fragments
| 1254 | +         */
| 1255 | +map_frag:
| 1256 | +        for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
| 1257 | +                const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
| 1258 | +
| 1259 | +                plen = f->size;
| 1260 | +                poff = f->page_offset;
| 1261 | +
| 1262 | +                if (toff) {
| 1263 | +                        if (plen <= toff) {
| 1264 | +                                toff -= plen;
| 1265 | +                                continue;
| 1266 | +                        }
| 1267 | +                        plen -= toff;
| 1268 | +                        poff += toff;
| 1269 | +                        toff = 0;
| 1270 | +                }
| 1271 | +
| 1272 | +                plen = min(plen, tlen);
| 1273 | +                if (!plen)
| 1274 | +                        break;
| 1275 | +
| 1276 | +                if (spd_fill_page(spd, f->page, plen, poff, skb))
| 1277 | +                        break;
| 1278 | +
| 1279 | +                tlen -= plen;
| 1280 | +        }
| 1281 | +
| 1282 | +done:
| 1283 | +        if (spd->nr_pages - nr_pages) {
| 1284 | +                *offset = 0;
| 1285 | +                *total_len = tlen;
| 1286 | +                return 0;
| 1287 | +        }
| 1288 | +err:
| 1289 | +        return 1;
| 1290 | +}
| 1291 | +
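The subtle case is the linear region: skb->data points into kmalloc'ed memory that may straddle page boundaries, so the while loop above carves it into per-page (page, offset, length) triples via virt_to_page(). A self-contained userspace sketch of the same poff/plen arithmetic (PAGE_SIZE and the example address are stand-ins):

#include <stdio.h>

#define PAGE_SIZE 4096UL

/*
 * Slice [data, data + headlen) into per-page (page, offset, length)
 * chunks, mirroring the poff/plen computation in __skb_splice_bits().
 */
static void slice_linear(unsigned long data, unsigned int headlen)
{
        unsigned int len = 0;

        while (len < headlen) {
                unsigned long p = data + len;
                unsigned int poff = p & (PAGE_SIZE - 1);
                unsigned int rest = PAGE_SIZE - poff;
                unsigned int plen = headlen - len < rest ? headlen - len : rest;

                printf("page %#lx offset %4u len %4u\n",
                       p & ~(PAGE_SIZE - 1), poff, plen);
                len += plen;
        }
}

int main(void)
{
        /*
         * a head starting 256 bytes before a page boundary, 1000 bytes
         * long: yields (offset 3840, len 256) then (offset 0, len 744)
         */
        slice_linear(0x10000000UL + PAGE_SIZE - 256, 1000);
        return 0;
}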
| 1292 | +/*
| 1293 | + * Map data from the skb to a pipe. Handles the linear part, the
| 1294 | + * fragments, and the frag list. It does NOT handle frag lists within
| 1295 | + * the frag list, if such a thing exists. We'd probably need to recurse to
| 1296 | + * handle that cleanly.
| 1297 | + */
| 1298 | +int skb_splice_bits(struct sk_buff *__skb, unsigned int offset,
| 1299 | +                    struct pipe_inode_info *pipe, unsigned int tlen,
| 1300 | +                    unsigned int flags)
| 1301 | +{
| 1302 | +        struct partial_page partial[PIPE_BUFFERS];
| 1303 | +        struct page *pages[PIPE_BUFFERS];
| 1304 | +        struct splice_pipe_desc spd = {
| 1305 | +                .pages = pages,
| 1306 | +                .partial = partial,
| 1307 | +                .flags = flags,
| 1308 | +                .ops = &sock_pipe_buf_ops,
| 1309 | +                .spd_release = sock_spd_release,
| 1310 | +        };
| 1311 | +        struct sk_buff *skb;
| 1312 | +
| 1313 | +        /*
| 1314 | +         * I'd love to avoid the clone here, but tcp_read_sock()
| 1315 | +         * ignores reference counts and unconditionally kills the sk_buff
| 1316 | +         * on return from the actor.
| 1317 | +         */
| 1318 | +        skb = skb_clone(__skb, GFP_KERNEL);
| 1319 | +        if (unlikely(!skb))
| 1320 | +                return -ENOMEM;
| 1321 | +
| 1322 | +        /*
| 1323 | +         * __skb_splice_bits() only fails if the output has no room left,
| 1324 | +         * so no point in going over the frag_list for the error case.
| 1325 | +         */
| 1326 | +        if (__skb_splice_bits(skb, &offset, &tlen, &spd))
| 1327 | +                goto done;
| 1328 | +        else if (!tlen)
| 1329 | +                goto done;
| 1330 | +
| 1331 | +        /*
| 1332 | +         * now see if we have a frag_list to map
| 1333 | +         */
| 1334 | +        if (skb_shinfo(skb)->frag_list) {
| 1335 | +                struct sk_buff *list = skb_shinfo(skb)->frag_list;
| 1336 | +
| 1337 | +                for (; list && tlen; list = list->next) {
| 1338 | +                        if (__skb_splice_bits(list, &offset, &tlen, &spd))
| 1339 | +                                break;
| 1340 | +                }
| 1341 | +        }
| 1342 | +
| 1343 | +done:
| 1344 | +        /*
| 1345 | +         * drop our reference to the clone, the pipe consumption will
| 1346 | +         * drop the rest.
| 1347 | +         */
| 1348 | +        kfree_skb(skb);
| 1349 | +
| 1350 | +        if (spd.nr_pages) {
| 1351 | +                int ret;
| 1352 | +
| 1353 | +                /*
| 1354 | +                 * Drop the socket lock, otherwise we have reverse
| 1355 | +                 * locking dependencies between sk_lock and i_mutex
| 1356 | +                 * here as compared to sendfile(). We enter here
| 1357 | +                 * with the socket lock held, and splice_to_pipe() will
| 1358 | +                 * grab the pipe inode lock. For sendfile() emulation,
| 1359 | +                 * we call into ->sendpage() with the i_mutex lock held
| 1360 | +                 * and networking will grab the socket lock.
| 1361 | +                 */
| 1362 | +                release_sock(__skb->sk);
| 1363 | +                ret = splice_to_pipe(pipe, &spd);
| 1364 | +                lock_sock(__skb->sk);
| 1365 | +                return ret;
| 1366 | +        }
| 1367 | +
| 1368 | +        return 0;
| 1369 | +}
| 1370 | +
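The intended caller is the splice receive path: a read_descriptor actor passed to tcp_read_sock() forwards each skb here, which is also why the clone above is needed. A sketch of such an actor follows; the state struct and names are modeled from memory on the companion change to net/ipv4/tcp.c, so treat them as illustrative rather than that patch's exact code:

struct tcp_splice_state {
        struct pipe_inode_info *pipe;   /* destination pipe */
        size_t len;                     /* bytes still wanted */
        unsigned int flags;             /* splice flags */
};

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
                                unsigned int offset, size_t len)
{
        struct tcp_splice_state *tss = rd_desc->arg.data;

        /* returns how many bytes were spliced into the pipe */
        return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags);
}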
1125 | 1371 | /**
1126 | 1372 |  * skb_store_bits - store bits from kernel buffer to skb
1127 | 1373 |  * @skb: destination buffer