| Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 1 | /* | 
|  | 2 | *  linux/fs/nfs/blocklayout/blocklayoutdev.c | 
|  | 3 | * | 
|  | 4 | *  Device operations for the pnfs nfs4 block layout driver. | 
|  | 5 | * | 
|  | 6 | *  Copyright (c) 2006 The Regents of the University of Michigan. | 
|  | 7 | *  All rights reserved. | 
|  | 8 | * | 
|  | 9 | *  Andy Adamson <andros@citi.umich.edu> | 
|  | 10 | *  Fred Isaman <iisaman@umich.edu> | 
|  | 11 | * | 
|  | 12 | * permission is granted to use, copy, create derivative works and | 
|  | 13 | * redistribute this software and such derivative works for any purpose, | 
|  | 14 | * so long as the name of the university of michigan is not used in | 
|  | 15 | * any advertising or publicity pertaining to the use or distribution | 
|  | 16 | * of this software without specific, written prior authorization.  if | 
|  | 17 | * the above copyright notice or any other identification of the | 
|  | 18 | * university of michigan is included in any copy of any portion of | 
|  | 19 | * this software, then the disclaimer below must also be included. | 
|  | 20 | * | 
|  | 21 | * this software is provided as is, without representation from the | 
|  | 22 | * university of michigan as to its fitness for any purpose, and without | 
|  | 23 | * warranty by the university of michigan of any kind, either express | 
|  | 24 | * or implied, including without limitation the implied warranties of | 
|  | 25 | * merchantability and fitness for a particular purpose.  the regents | 
|  | 26 | * of the university of michigan shall not be liable for any damages, | 
|  | 27 | * including special, indirect, incidental, or consequential damages, | 
|  | 28 | * with respect to any claim arising out or in connection with the use | 
|  | 29 | * of the software, even if it has been or is hereafter advised of the | 
|  | 30 | * possibility of such damages. | 
|  | 31 | */ | 
|  | 32 | #include <linux/module.h> | 
|  | 33 | #include <linux/buffer_head.h> /* __bread */ | 
|  | 34 |  | 
|  | 35 | #include <linux/genhd.h> | 
|  | 36 | #include <linux/blkdev.h> | 
|  | 37 | #include <linux/hash.h> | 
|  | 38 |  | 
|  | 39 | #include "blocklayout.h" | 
|  | 40 |  | 
|  | 41 | #define NFSDBG_FACILITY         NFSDBG_PNFS_LD | 
|  | 42 |  | 
| Fred Isaman | e9437cc | 2011-07-30 20:52:47 -0400 | [diff] [blame] | 43 | static int decode_sector_number(__be32 **rp, sector_t *sp) | 
|  | 44 | { | 
|  | 45 | uint64_t s; | 
|  | 46 |  | 
|  | 47 | *rp = xdr_decode_hyper(*rp, &s); | 
|  | 48 | if (s & 0x1ff) { | 
|  | 49 | printk(KERN_WARNING "%s: sector not aligned\n", __func__); | 
|  | 50 | return -1; | 
|  | 51 | } | 
|  | 52 | *sp = s >> SECTOR_SHIFT; | 
|  | 53 | return 0; | 
|  | 54 | } | 
|  | 55 |  | 
| Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 56 | /* Open a block_device by device number. */ | 
|  | 57 | struct block_device *nfs4_blkdev_get(dev_t dev) | 
|  | 58 | { | 
|  | 59 | struct block_device *bd; | 
|  | 60 |  | 
|  | 61 | dprintk("%s enter\n", __func__); | 
|  | 62 | bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); | 
|  | 63 | if (IS_ERR(bd)) | 
|  | 64 | goto fail; | 
|  | 65 | return bd; | 
|  | 66 | fail: | 
|  | 67 | dprintk("%s failed to open device : %ld\n", | 
|  | 68 | __func__, PTR_ERR(bd)); | 
|  | 69 | return NULL; | 
|  | 70 | } | 
|  | 71 |  | 
|  | 72 | /* | 
|  | 73 | * Release the block device | 
|  | 74 | */ | 
|  | 75 | int nfs4_blkdev_put(struct block_device *bdev) | 
|  | 76 | { | 
|  | 77 | dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), | 
|  | 78 | MINOR(bdev->bd_dev)); | 
|  | 79 | return blkdev_put(bdev, FMODE_READ); | 
|  | 80 | } | 
|  | 81 |  | 
| Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 82 | static struct bl_dev_msg bl_mount_reply; | 
|  | 83 |  | 
|  | 84 | ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, | 
|  | 85 | size_t mlen) | 
|  | 86 | { | 
|  | 87 | if (mlen != sizeof (struct bl_dev_msg)) | 
|  | 88 | return -EINVAL; | 
|  | 89 |  | 
|  | 90 | if (copy_from_user(&bl_mount_reply, src, mlen) != 0) | 
|  | 91 | return -EFAULT; | 
|  | 92 |  | 
|  | 93 | wake_up(&bl_wq); | 
|  | 94 |  | 
|  | 95 | return mlen; | 
|  | 96 | } | 
|  | 97 |  | 
|  | 98 | void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) | 
|  | 99 | { | 
|  | 100 | if (msg->errno >= 0) | 
|  | 101 | return; | 
|  | 102 | wake_up(&bl_wq); | 
|  | 103 | } | 
|  | 104 |  | 
|  | 105 | /* | 
|  | 106 | * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. | 
|  | 107 | */ | 
|  | 108 | struct pnfs_block_dev * | 
|  | 109 | nfs4_blk_decode_device(struct nfs_server *server, | 
| Fred Isaman | 2f9fd18 | 2011-07-30 20:52:46 -0400 | [diff] [blame] | 110 | struct pnfs_device *dev) | 
| Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 111 | { | 
| Jim Rees | 516f2e2 | 2011-09-22 21:50:08 -0400 | [diff] [blame] | 112 | struct pnfs_block_dev *rv; | 
| Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 113 | struct block_device *bd = NULL; | 
|  | 114 | struct rpc_pipe_msg msg; | 
|  | 115 | struct bl_msg_hdr bl_msg = { | 
|  | 116 | .type = BL_DEVICE_MOUNT, | 
|  | 117 | .totallen = dev->mincount, | 
|  | 118 | }; | 
|  | 119 | uint8_t *dataptr; | 
|  | 120 | DECLARE_WAITQUEUE(wq, current); | 
|  | 121 | struct bl_dev_msg *reply = &bl_mount_reply; | 
| Jim Rees | 516f2e2 | 2011-09-22 21:50:08 -0400 | [diff] [blame] | 122 | int offset, len, i, rc; | 
| Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 123 |  | 
|  | 124 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); | 
|  | 125 | dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, | 
|  | 126 | dev->mincount); | 
|  | 127 |  | 
|  | 128 | memset(&msg, 0, sizeof(msg)); | 
|  | 129 | msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); | 
|  | 130 | if (!msg.data) { | 
|  | 131 | rv = ERR_PTR(-ENOMEM); | 
|  | 132 | goto out; | 
|  | 133 | } | 
|  | 134 |  | 
|  | 135 | memcpy(msg.data, &bl_msg, sizeof(bl_msg)); | 
|  | 136 | dataptr = (uint8_t *) msg.data; | 
| Fred Isaman | 2f9fd18 | 2011-07-30 20:52:46 -0400 | [diff] [blame] | 137 | len = dev->mincount; | 
|  | 138 | offset = sizeof(bl_msg); | 
|  | 139 | for (i = 0; len > 0; i++) { | 
|  | 140 | memcpy(&dataptr[offset], page_address(dev->pages[i]), | 
|  | 141 | len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); | 
|  | 142 | len -= PAGE_CACHE_SIZE; | 
|  | 143 | offset += PAGE_CACHE_SIZE; | 
|  | 144 | } | 
| Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 145 | msg.len = sizeof(bl_msg) + dev->mincount; | 
|  | 146 |  | 
|  | 147 | dprintk("%s CALLING USERSPACE DAEMON\n", __func__); | 
|  | 148 | add_wait_queue(&bl_wq, &wq); | 
| Jim Rees | 516f2e2 | 2011-09-22 21:50:08 -0400 | [diff] [blame] | 149 | rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); | 
|  | 150 | if (rc < 0) { | 
| Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 151 | remove_wait_queue(&bl_wq, &wq); | 
| Jim Rees | 516f2e2 | 2011-09-22 21:50:08 -0400 | [diff] [blame] | 152 | rv = ERR_PTR(rc); | 
| Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 153 | goto out; | 
|  | 154 | } | 
|  | 155 |  | 
|  | 156 | set_current_state(TASK_UNINTERRUPTIBLE); | 
|  | 157 | schedule(); | 
|  | 158 | __set_current_state(TASK_RUNNING); | 
|  | 159 | remove_wait_queue(&bl_wq, &wq); | 
|  | 160 |  | 
|  | 161 | if (reply->status != BL_DEVICE_REQUEST_PROC) { | 
|  | 162 | dprintk("%s failed to open device: %d\n", | 
|  | 163 | __func__, reply->status); | 
|  | 164 | rv = ERR_PTR(-EINVAL); | 
|  | 165 | goto out; | 
|  | 166 | } | 
|  | 167 |  | 
|  | 168 | bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); | 
|  | 169 | if (IS_ERR(bd)) { | 
| Jim Rees | 516f2e2 | 2011-09-22 21:50:08 -0400 | [diff] [blame] | 170 | rc = PTR_ERR(bd); | 
|  | 171 | dprintk("%s failed to open device : %d\n", __func__, rc); | 
|  | 172 | rv = ERR_PTR(rc); | 
| Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 173 | goto out; | 
|  | 174 | } | 
|  | 175 |  | 
|  | 176 | rv = kzalloc(sizeof(*rv), GFP_NOFS); | 
|  | 177 | if (!rv) { | 
|  | 178 | rv = ERR_PTR(-ENOMEM); | 
|  | 179 | goto out; | 
|  | 180 | } | 
|  | 181 |  | 
|  | 182 | rv->bm_mdev = bd; | 
|  | 183 | memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); | 
|  | 184 | dprintk("%s Created device %s with bd_block_size %u\n", | 
|  | 185 | __func__, | 
|  | 186 | bd->bd_disk->disk_name, | 
|  | 187 | bd->bd_block_size); | 
|  | 188 |  | 
|  | 189 | out: | 
|  | 190 | kfree(msg.data); | 
|  | 191 | return rv; | 
|  | 192 | } | 
| Fred Isaman | a60d2eb | 2011-07-30 20:52:44 -0400 | [diff] [blame] | 193 |  | 
| Fred Isaman | e9437cc | 2011-07-30 20:52:47 -0400 | [diff] [blame] | 194 | /* Map deviceid returned by the server to constructed block_device */ | 
|  | 195 | static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, | 
|  | 196 | struct nfs4_deviceid *id) | 
|  | 197 | { | 
|  | 198 | struct block_device *rv = NULL; | 
|  | 199 | struct block_mount_id *mid; | 
|  | 200 | struct pnfs_block_dev *dev; | 
|  | 201 |  | 
|  | 202 | dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); | 
|  | 203 | mid = BLK_ID(lo); | 
|  | 204 | spin_lock(&mid->bm_lock); | 
|  | 205 | list_for_each_entry(dev, &mid->bm_devlist, bm_node) { | 
|  | 206 | if (memcmp(id->data, dev->bm_mdevid.data, | 
|  | 207 | NFS4_DEVICEID4_SIZE) == 0) { | 
|  | 208 | rv = dev->bm_mdev; | 
|  | 209 | goto out; | 
|  | 210 | } | 
|  | 211 | } | 
|  | 212 | out: | 
|  | 213 | spin_unlock(&mid->bm_lock); | 
|  | 214 | dprintk("%s returning %p\n", __func__, rv); | 
|  | 215 | return rv; | 
|  | 216 | } | 
|  | 217 |  | 
|  | 218 | /* Tracks info needed to ensure extents in layout obey constraints of spec */ | 
|  | 219 | struct layout_verification { | 
|  | 220 | u32 mode;	/* R or RW */ | 
|  | 221 | u64 start;	/* Expected start of next non-COW extent */ | 
|  | 222 | u64 inval;	/* Start of INVAL coverage */ | 
|  | 223 | u64 cowread;	/* End of COW read coverage */ | 
|  | 224 | }; | 
|  | 225 |  | 
|  | 226 | /* Verify the extent meets the layout requirements of the pnfs-block draft, | 
|  | 227 | * section 2.3.1. | 
|  | 228 | */ | 
|  | 229 | static int verify_extent(struct pnfs_block_extent *be, | 
|  | 230 | struct layout_verification *lv) | 
|  | 231 | { | 
|  | 232 | if (lv->mode == IOMODE_READ) { | 
|  | 233 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA || | 
|  | 234 | be->be_state == PNFS_BLOCK_INVALID_DATA) | 
|  | 235 | return -EIO; | 
|  | 236 | if (be->be_f_offset != lv->start) | 
|  | 237 | return -EIO; | 
|  | 238 | lv->start += be->be_length; | 
|  | 239 | return 0; | 
|  | 240 | } | 
|  | 241 | /* lv->mode == IOMODE_RW */ | 
|  | 242 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { | 
|  | 243 | if (be->be_f_offset != lv->start) | 
|  | 244 | return -EIO; | 
|  | 245 | if (lv->cowread > lv->start) | 
|  | 246 | return -EIO; | 
|  | 247 | lv->start += be->be_length; | 
|  | 248 | lv->inval = lv->start; | 
|  | 249 | return 0; | 
|  | 250 | } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | 
|  | 251 | if (be->be_f_offset != lv->start) | 
|  | 252 | return -EIO; | 
|  | 253 | lv->start += be->be_length; | 
|  | 254 | return 0; | 
|  | 255 | } else if (be->be_state == PNFS_BLOCK_READ_DATA) { | 
|  | 256 | if (be->be_f_offset > lv->start) | 
|  | 257 | return -EIO; | 
|  | 258 | if (be->be_f_offset < lv->inval) | 
|  | 259 | return -EIO; | 
|  | 260 | if (be->be_f_offset < lv->cowread) | 
|  | 261 | return -EIO; | 
|  | 262 | /* It looks like you might want to min this with lv->start, | 
|  | 263 | * but you really don't. | 
|  | 264 | */ | 
|  | 265 | lv->inval = lv->inval + be->be_length; | 
|  | 266 | lv->cowread = be->be_f_offset + be->be_length; | 
|  | 267 | return 0; | 
|  | 268 | } else | 
|  | 269 | return -EIO; | 
|  | 270 | } | 
|  | 271 |  | 
|  | 272 | /* XDR decode pnfs_block_layout4 structure */ | 
| Fred Isaman | a60d2eb | 2011-07-30 20:52:44 -0400 | [diff] [blame] | 273 | int | 
|  | 274 | nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, | 
|  | 275 | struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) | 
|  | 276 | { | 
| Fred Isaman | e9437cc | 2011-07-30 20:52:47 -0400 | [diff] [blame] | 277 | struct pnfs_block_layout *bl = BLK_LO2EXT(lo); | 
|  | 278 | int i, status = -EIO; | 
|  | 279 | uint32_t count; | 
|  | 280 | struct pnfs_block_extent *be = NULL, *save; | 
|  | 281 | struct xdr_stream stream; | 
|  | 282 | struct xdr_buf buf; | 
|  | 283 | struct page *scratch; | 
|  | 284 | __be32 *p; | 
|  | 285 | struct layout_verification lv = { | 
|  | 286 | .mode = lgr->range.iomode, | 
|  | 287 | .start = lgr->range.offset >> SECTOR_SHIFT, | 
|  | 288 | .inval = lgr->range.offset >> SECTOR_SHIFT, | 
|  | 289 | .cowread = lgr->range.offset >> SECTOR_SHIFT, | 
|  | 290 | }; | 
|  | 291 | LIST_HEAD(extents); | 
|  | 292 |  | 
|  | 293 | dprintk("---> %s\n", __func__); | 
|  | 294 |  | 
|  | 295 | scratch = alloc_page(gfp_flags); | 
|  | 296 | if (!scratch) | 
|  | 297 | return -ENOMEM; | 
|  | 298 |  | 
|  | 299 | xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); | 
|  | 300 | xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); | 
|  | 301 |  | 
|  | 302 | p = xdr_inline_decode(&stream, 4); | 
|  | 303 | if (unlikely(!p)) | 
|  | 304 | goto out_err; | 
|  | 305 |  | 
|  | 306 | count = be32_to_cpup(p++); | 
|  | 307 |  | 
|  | 308 | dprintk("%s enter, number of extents %i\n", __func__, count); | 
|  | 309 | p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); | 
|  | 310 | if (unlikely(!p)) | 
|  | 311 | goto out_err; | 
|  | 312 |  | 
|  | 313 | /* Decode individual extents, putting them in temporary | 
|  | 314 | * staging area until whole layout is decoded to make error | 
|  | 315 | * recovery easier. | 
|  | 316 | */ | 
|  | 317 | for (i = 0; i < count; i++) { | 
|  | 318 | be = bl_alloc_extent(); | 
|  | 319 | if (!be) { | 
|  | 320 | status = -ENOMEM; | 
|  | 321 | goto out_err; | 
|  | 322 | } | 
|  | 323 | memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); | 
|  | 324 | p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); | 
|  | 325 | be->be_mdev = translate_devid(lo, &be->be_devid); | 
|  | 326 | if (!be->be_mdev) | 
|  | 327 | goto out_err; | 
|  | 328 |  | 
|  | 329 | /* The next three values are read in as bytes, | 
|  | 330 | * but stored as 512-byte sector lengths | 
|  | 331 | */ | 
|  | 332 | if (decode_sector_number(&p, &be->be_f_offset) < 0) | 
|  | 333 | goto out_err; | 
|  | 334 | if (decode_sector_number(&p, &be->be_length) < 0) | 
|  | 335 | goto out_err; | 
|  | 336 | if (decode_sector_number(&p, &be->be_v_offset) < 0) | 
|  | 337 | goto out_err; | 
|  | 338 | be->be_state = be32_to_cpup(p++); | 
|  | 339 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) | 
|  | 340 | be->be_inval = &bl->bl_inval; | 
|  | 341 | if (verify_extent(be, &lv)) { | 
|  | 342 | dprintk("%s verify failed\n", __func__); | 
|  | 343 | goto out_err; | 
|  | 344 | } | 
|  | 345 | list_add_tail(&be->be_node, &extents); | 
|  | 346 | } | 
|  | 347 | if (lgr->range.offset + lgr->range.length != | 
|  | 348 | lv.start << SECTOR_SHIFT) { | 
|  | 349 | dprintk("%s Final length mismatch\n", __func__); | 
|  | 350 | be = NULL; | 
|  | 351 | goto out_err; | 
|  | 352 | } | 
|  | 353 | if (lv.start < lv.cowread) { | 
|  | 354 | dprintk("%s Final uncovered COW extent\n", __func__); | 
|  | 355 | be = NULL; | 
|  | 356 | goto out_err; | 
|  | 357 | } | 
|  | 358 | /* Extents decoded properly, now try to merge them in to | 
|  | 359 | * existing layout extents. | 
|  | 360 | */ | 
|  | 361 | spin_lock(&bl->bl_ext_lock); | 
|  | 362 | list_for_each_entry_safe(be, save, &extents, be_node) { | 
|  | 363 | list_del(&be->be_node); | 
|  | 364 | status = bl_add_merge_extent(bl, be); | 
|  | 365 | if (status) { | 
|  | 366 | spin_unlock(&bl->bl_ext_lock); | 
|  | 367 | /* This is a fairly catastrophic error, as the | 
|  | 368 | * entire layout extent lists are now corrupted. | 
|  | 369 | * We should have some way to distinguish this. | 
|  | 370 | */ | 
|  | 371 | be = NULL; | 
|  | 372 | goto out_err; | 
|  | 373 | } | 
|  | 374 | } | 
|  | 375 | spin_unlock(&bl->bl_ext_lock); | 
|  | 376 | status = 0; | 
|  | 377 | out: | 
|  | 378 | __free_page(scratch); | 
|  | 379 | dprintk("%s returns %i\n", __func__, status); | 
|  | 380 | return status; | 
|  | 381 |  | 
|  | 382 | out_err: | 
|  | 383 | bl_put_extent(be); | 
|  | 384 | while (!list_empty(&extents)) { | 
|  | 385 | be = list_first_entry(&extents, struct pnfs_block_extent, | 
|  | 386 | be_node); | 
|  | 387 | list_del(&be->be_node); | 
|  | 388 | bl_put_extent(be); | 
|  | 389 | } | 
|  | 390 | goto out; | 
| Fred Isaman | a60d2eb | 2011-07-30 20:52:44 -0400 | [diff] [blame] | 391 | } |