| Joe Thornber | 991d9fa | 2011-10-31 20:21:18 +0000 | [diff] [blame] | 1 | /* | 
|  | 2 | * Copyright (C) 2011 Red Hat, Inc. | 
|  | 3 | * | 
|  | 4 | * This file is released under the GPL. | 
|  | 5 | */ | 
|  | 6 |  | 
|  | 7 | #include "dm-thin-metadata.h" | 
|  | 8 | #include "persistent-data/dm-btree.h" | 
|  | 9 | #include "persistent-data/dm-space-map.h" | 
|  | 10 | #include "persistent-data/dm-space-map-disk.h" | 
|  | 11 | #include "persistent-data/dm-transaction-manager.h" | 
|  | 12 |  | 
|  | 13 | #include <linux/list.h> | 
|  | 14 | #include <linux/device-mapper.h> | 
|  | 15 | #include <linux/workqueue.h> | 
|  | 16 |  | 
|  | 17 | /*-------------------------------------------------------------------------- | 
|  | 18 | * As far as the metadata goes, there is: | 
|  | 19 | * | 
|  | 20 | * - A superblock in block zero, taking up fewer than 512 bytes for | 
|  | 21 | *   atomic writes. | 
|  | 22 | * | 
|  | 23 | * - A space map managing the metadata blocks. | 
|  | 24 | * | 
|  | 25 | * - A space map managing the data blocks. | 
|  | 26 | * | 
|  | 27 | * - A btree mapping our internal thin dev ids onto struct disk_device_details. | 
|  | 28 | * | 
|  | 29 | * - A hierarchical btree, with 2 levels which effectively maps (thin | 
|  | 30 | *   dev id, virtual block) -> block_time.  Block time is a 64-bit | 
|  | 31 | *   field holding the time in the low 24 bits, and block in the top 40 | 
|  | 32 | *   bits. | 
|  | 33 | * | 
|  | 34 | * BTrees consist solely of btree_nodes, that fill a block.  Some are | 
|  | 35 | * internal nodes, as such their values are a __le64 pointing to other | 
|  | 36 | * nodes.  Leaf nodes can store data of any reasonable size (ie. much | 
|  | 37 | * smaller than the block size).  The nodes consist of the header, | 
|  | 38 | * followed by an array of keys, followed by an array of values.  We have | 
|  | 39 | * to binary search on the keys so they're all held together to help the | 
|  | 40 | * cpu cache. | 
|  | 41 | * | 
|  | 42 | * Space maps have 2 btrees: | 
|  | 43 | * | 
|  | 44 | * - One maps a uint64_t onto a struct index_entry.  Which points to a | 
|  | 45 | *   bitmap block, and has some details about how many free entries there | 
|  | 46 | *   are etc. | 
|  | 47 | * | 
|  | 48 | * - The bitmap blocks have a header (for the checksum).  Then the rest | 
|  | 49 | *   of the block is pairs of bits.  With the meaning being: | 
|  | 50 | * | 
|  | 51 | *   0 - ref count is 0 | 
|  | 52 | *   1 - ref count is 1 | 
|  | 53 | *   2 - ref count is 2 | 
|  | 54 | *   3 - ref count is higher than 2 | 
|  | 55 | * | 
|  | 56 | * - If the count is higher than 2 then the ref count is entered in a | 
|  | 57 | *   second btree that directly maps the block_address to a uint32_t ref | 
|  | 58 | *   count. | 
|  | 59 | * | 
|  | 60 | * The space map metadata variant doesn't have a bitmaps btree.  Instead | 
|  | 61 | * it has one single blocks worth of index_entries.  This avoids | 
|  | 62 | * recursive issues with the bitmap btree needing to allocate space in | 
|  | 63 | * order to insert.  With a small data block size such as 64k the | 
|  | 64 | * metadata support data devices that are hundreds of terabytes. | 
|  | 65 | * | 
|  | 66 | * The space maps allocate space linearly from front to back.  Space that | 
|  | 67 | * is freed in a transaction is never recycled within that transaction. | 
|  | 68 | * To try and avoid fragmenting _free_ space the allocator always goes | 
|  | 69 | * back and fills in gaps. | 
|  | 70 | * | 
|  | 71 | * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks | 
|  | 72 | * from the block manager. | 
|  | 73 | *--------------------------------------------------------------------------*/ | 
|  | 74 |  | 
#define DM_MSG_PREFIX   "thin metadata"

/* Value stored in, and checked against, the superblock's magic field. */
#define THIN_SUPERBLOCK_MAGIC 27022010
/* Metadata block number at which the superblock is read and written. */
#define THIN_SUPERBLOCK_LOCATION 0
/* Value written to the superblock's version field when it is created. */
#define THIN_VERSION 1
/* Cache-size argument handed to dm_block_manager_create(). */
#define THIN_METADATA_CACHE_SIZE 64
/* Shift applied to the device size in sectors to get metadata_nr_blocks. */
#define SECTOR_TO_BLOCK_SHIFT 3

/*
 * Size in bytes of each space map root area embedded in the superblock.
 * This should be plenty.
 */
#define SPACE_MAP_ROOT_SIZE 128
|  | 85 |  | 
/*
 * Little endian on-disk superblock and device details.
 *
 * The superblock is packed and must not grow beyond 512 bytes; this is
 * enforced by a BUILD_BUG_ON() in __commit_transaction().
 */
struct thin_disk_superblock {
	__le32 csum;	/* Checksum of superblock except for this field. */
	__le32 flags;
	__le64 blocknr;	/* This block number, dm_block_t. */

	__u8 uuid[16];
	__le64 magic;	/* THIN_SUPERBLOCK_MAGIC; verified by sb_check(). */
	__le32 version;	/* THIN_VERSION when created by this code. */
	__le32 time;

	__le64 trans_id;

	/*
	 * Root held by userspace transactions.
	 */
	__le64 held_root;

	/*
	 * Opaque space map roots, filled in by dm_sm_copy_root() on commit.
	 */
	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];

	/*
	 * 2-level btree mapping (dev_id, dev block) -> (data block, time)
	 */
	__le64 data_mapping_root;

	/*
	 * Device detail root mapping dev_id -> device_details
	 */
	__le64 device_details_root;

	__le32 data_block_size;		/* In 512-byte sectors. */

	__le32 metadata_block_size;	/* In 512-byte sectors. */
	__le64 metadata_nr_blocks;

	/*
	 * Feature flags.  __begin_transaction() refuses metadata carrying
	 * unsupported incompat flags, and unsupported compat_ro flags
	 * unless the metadata disk is read-only.
	 */
	__le32 compat_flags;
	__le32 compat_ro_flags;
	__le32 incompat_flags;
} __packed;
|  | 128 |  | 
/*
 * On-disk (little endian) value type of the device details btree, keyed
 * by thin device id.  Mirrors the bookkeeping fields of
 * struct dm_thin_device.
 */
struct disk_device_details {
	__le64 mapped_blocks;
	__le64 transaction_id;		/* When created. */
	__le32 creation_time;
	__le32 snapshotted_time;
} __packed;
|  | 135 |  | 
/*
 * In-core state for one pool's metadata device.  Allocated by
 * dm_pool_metadata_open() and freed by dm_pool_metadata_close().
 */
struct dm_pool_metadata {
	struct hlist_node hash;

	struct block_device *bdev;	/* Metadata device. */
	struct dm_block_manager *bm;
	struct dm_space_map *metadata_sm;
	struct dm_space_map *data_sm;
	struct dm_transaction_manager *tm;
	struct dm_transaction_manager *nb_tm;	/* Non-blocking clone of tm. */

	/*
	 * Two-level btree.
	 * First level holds thin_dev_t.
	 * Second level holds mappings.
	 */
	struct dm_btree_info info;

	/*
	 * Non-blocking version of the above.
	 */
	struct dm_btree_info nb_info;

	/*
	 * Just the top level for deleting whole devices.
	 */
	struct dm_btree_info tl_info;

	/*
	 * Just the bottom level for creating new devices.
	 */
	struct dm_btree_info bl_info;

	/*
	 * Describes the device details btree.
	 */
	struct dm_btree_info details_info;

	struct rw_semaphore root_lock;
	/*
	 * The fields below are loaded from the superblock by
	 * __begin_transaction(); time, root, details_root, trans_id and
	 * flags are written back by __commit_transaction().
	 */
	uint32_t time;
	int need_commit;	/* Non-zero when there are changes to commit. */
	dm_block_t root;	/* Root of the data mapping btree. */
	dm_block_t details_root;	/* Root of the device details btree. */
	struct list_head thin_devices;	/* List of struct dm_thin_device. */
	uint64_t trans_id;
	unsigned long flags;
	sector_t data_block_size;	/* In 512-byte sectors. */
};
|  | 183 |  | 
/*
 * In-core representation of one thin device.  Instances live on
 * pmd->thin_devices and shadow an entry in the device details btree.
 */
struct dm_thin_device {
	struct list_head list;		/* Link in pmd->thin_devices. */
	struct dm_pool_metadata *pmd;
	dm_thin_id id;

	int open_count;			/* Number of outstanding opens. */
	int changed;			/* Details need writing at commit. */
	uint64_t mapped_blocks;
	uint64_t transaction_id;
	uint32_t creation_time;
	uint32_t snapshotted_time;
};
|  | 196 |  | 
|  | 197 | /*---------------------------------------------------------------- | 
|  | 198 | * superblock validator | 
|  | 199 | *--------------------------------------------------------------*/ | 
|  | 200 |  | 
/* Third argument passed to dm_bm_checksum() for superblock checksums. */
#define SUPERBLOCK_CSUM_XOR 160774
|  | 202 |  | 
|  | 203 | static void sb_prepare_for_write(struct dm_block_validator *v, | 
|  | 204 | struct dm_block *b, | 
|  | 205 | size_t block_size) | 
|  | 206 | { | 
|  | 207 | struct thin_disk_superblock *disk_super = dm_block_data(b); | 
|  | 208 |  | 
|  | 209 | disk_super->blocknr = cpu_to_le64(dm_block_location(b)); | 
|  | 210 | disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, | 
|  | 211 | block_size - sizeof(__le32), | 
|  | 212 | SUPERBLOCK_CSUM_XOR)); | 
|  | 213 | } | 
|  | 214 |  | 
|  | 215 | static int sb_check(struct dm_block_validator *v, | 
|  | 216 | struct dm_block *b, | 
|  | 217 | size_t block_size) | 
|  | 218 | { | 
|  | 219 | struct thin_disk_superblock *disk_super = dm_block_data(b); | 
|  | 220 | __le32 csum_le; | 
|  | 221 |  | 
|  | 222 | if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { | 
|  | 223 | DMERR("sb_check failed: blocknr %llu: " | 
|  | 224 | "wanted %llu", le64_to_cpu(disk_super->blocknr), | 
|  | 225 | (unsigned long long)dm_block_location(b)); | 
|  | 226 | return -ENOTBLK; | 
|  | 227 | } | 
|  | 228 |  | 
|  | 229 | if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) { | 
|  | 230 | DMERR("sb_check failed: magic %llu: " | 
|  | 231 | "wanted %llu", le64_to_cpu(disk_super->magic), | 
|  | 232 | (unsigned long long)THIN_SUPERBLOCK_MAGIC); | 
|  | 233 | return -EILSEQ; | 
|  | 234 | } | 
|  | 235 |  | 
|  | 236 | csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, | 
|  | 237 | block_size - sizeof(__le32), | 
|  | 238 | SUPERBLOCK_CSUM_XOR)); | 
|  | 239 | if (csum_le != disk_super->csum) { | 
|  | 240 | DMERR("sb_check failed: csum %u: wanted %u", | 
|  | 241 | le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); | 
|  | 242 | return -EILSEQ; | 
|  | 243 | } | 
|  | 244 |  | 
|  | 245 | return 0; | 
|  | 246 | } | 
|  | 247 |  | 
|  | 248 | static struct dm_block_validator sb_validator = { | 
|  | 249 | .name = "superblock", | 
|  | 250 | .prepare_for_write = sb_prepare_for_write, | 
|  | 251 | .check = sb_check | 
|  | 252 | }; | 
|  | 253 |  | 
|  | 254 | /*---------------------------------------------------------------- | 
|  | 255 | * Methods for the btree value types | 
|  | 256 | *--------------------------------------------------------------*/ | 
|  | 257 |  | 
|  | 258 | static uint64_t pack_block_time(dm_block_t b, uint32_t t) | 
|  | 259 | { | 
|  | 260 | return (b << 24) | t; | 
|  | 261 | } | 
|  | 262 |  | 
|  | 263 | static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) | 
|  | 264 | { | 
|  | 265 | *b = v >> 24; | 
|  | 266 | *t = v & ((1 << 24) - 1); | 
|  | 267 | } | 
|  | 268 |  | 
|  | 269 | static void data_block_inc(void *context, void *value_le) | 
|  | 270 | { | 
|  | 271 | struct dm_space_map *sm = context; | 
|  | 272 | __le64 v_le; | 
|  | 273 | uint64_t b; | 
|  | 274 | uint32_t t; | 
|  | 275 |  | 
|  | 276 | memcpy(&v_le, value_le, sizeof(v_le)); | 
|  | 277 | unpack_block_time(le64_to_cpu(v_le), &b, &t); | 
|  | 278 | dm_sm_inc_block(sm, b); | 
|  | 279 | } | 
|  | 280 |  | 
|  | 281 | static void data_block_dec(void *context, void *value_le) | 
|  | 282 | { | 
|  | 283 | struct dm_space_map *sm = context; | 
|  | 284 | __le64 v_le; | 
|  | 285 | uint64_t b; | 
|  | 286 | uint32_t t; | 
|  | 287 |  | 
|  | 288 | memcpy(&v_le, value_le, sizeof(v_le)); | 
|  | 289 | unpack_block_time(le64_to_cpu(v_le), &b, &t); | 
|  | 290 | dm_sm_dec_block(sm, b); | 
|  | 291 | } | 
|  | 292 |  | 
|  | 293 | static int data_block_equal(void *context, void *value1_le, void *value2_le) | 
|  | 294 | { | 
|  | 295 | __le64 v1_le, v2_le; | 
|  | 296 | uint64_t b1, b2; | 
|  | 297 | uint32_t t; | 
|  | 298 |  | 
|  | 299 | memcpy(&v1_le, value1_le, sizeof(v1_le)); | 
|  | 300 | memcpy(&v2_le, value2_le, sizeof(v2_le)); | 
|  | 301 | unpack_block_time(le64_to_cpu(v1_le), &b1, &t); | 
|  | 302 | unpack_block_time(le64_to_cpu(v2_le), &b2, &t); | 
|  | 303 |  | 
|  | 304 | return b1 == b2; | 
|  | 305 | } | 
|  | 306 |  | 
|  | 307 | static void subtree_inc(void *context, void *value) | 
|  | 308 | { | 
|  | 309 | struct dm_btree_info *info = context; | 
|  | 310 | __le64 root_le; | 
|  | 311 | uint64_t root; | 
|  | 312 |  | 
|  | 313 | memcpy(&root_le, value, sizeof(root_le)); | 
|  | 314 | root = le64_to_cpu(root_le); | 
|  | 315 | dm_tm_inc(info->tm, root); | 
|  | 316 | } | 
|  | 317 |  | 
|  | 318 | static void subtree_dec(void *context, void *value) | 
|  | 319 | { | 
|  | 320 | struct dm_btree_info *info = context; | 
|  | 321 | __le64 root_le; | 
|  | 322 | uint64_t root; | 
|  | 323 |  | 
|  | 324 | memcpy(&root_le, value, sizeof(root_le)); | 
|  | 325 | root = le64_to_cpu(root_le); | 
|  | 326 | if (dm_btree_del(info, root)) | 
|  | 327 | DMERR("btree delete failed\n"); | 
|  | 328 | } | 
|  | 329 |  | 
|  | 330 | static int subtree_equal(void *context, void *value1_le, void *value2_le) | 
|  | 331 | { | 
|  | 332 | __le64 v1_le, v2_le; | 
|  | 333 | memcpy(&v1_le, value1_le, sizeof(v1_le)); | 
|  | 334 | memcpy(&v2_le, value2_le, sizeof(v2_le)); | 
|  | 335 |  | 
|  | 336 | return v1_le == v2_le; | 
|  | 337 | } | 
|  | 338 |  | 
|  | 339 | /*----------------------------------------------------------------*/ | 
|  | 340 |  | 
|  | 341 | static int superblock_all_zeroes(struct dm_block_manager *bm, int *result) | 
|  | 342 | { | 
|  | 343 | int r; | 
|  | 344 | unsigned i; | 
|  | 345 | struct dm_block *b; | 
|  | 346 | __le64 *data_le, zero = cpu_to_le64(0); | 
|  | 347 | unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64); | 
|  | 348 |  | 
|  | 349 | /* | 
|  | 350 | * We can't use a validator here - it may be all zeroes. | 
|  | 351 | */ | 
|  | 352 | r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b); | 
|  | 353 | if (r) | 
|  | 354 | return r; | 
|  | 355 |  | 
|  | 356 | data_le = dm_block_data(b); | 
|  | 357 | *result = 1; | 
|  | 358 | for (i = 0; i < block_size; i++) { | 
|  | 359 | if (data_le[i] != zero) { | 
|  | 360 | *result = 0; | 
|  | 361 | break; | 
|  | 362 | } | 
|  | 363 | } | 
|  | 364 |  | 
|  | 365 | return dm_bm_unlock(b); | 
|  | 366 | } | 
|  | 367 |  | 
|  | 368 | static int init_pmd(struct dm_pool_metadata *pmd, | 
|  | 369 | struct dm_block_manager *bm, | 
|  | 370 | dm_block_t nr_blocks, int create) | 
|  | 371 | { | 
|  | 372 | int r; | 
|  | 373 | struct dm_space_map *sm, *data_sm; | 
|  | 374 | struct dm_transaction_manager *tm; | 
|  | 375 | struct dm_block *sblock; | 
|  | 376 |  | 
|  | 377 | if (create) { | 
|  | 378 | r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION, | 
|  | 379 | &sb_validator, &tm, &sm, &sblock); | 
|  | 380 | if (r < 0) { | 
|  | 381 | DMERR("tm_create_with_sm failed"); | 
|  | 382 | return r; | 
|  | 383 | } | 
|  | 384 |  | 
|  | 385 | data_sm = dm_sm_disk_create(tm, nr_blocks); | 
|  | 386 | if (IS_ERR(data_sm)) { | 
|  | 387 | DMERR("sm_disk_create failed"); | 
|  | 388 | r = PTR_ERR(data_sm); | 
|  | 389 | goto bad; | 
|  | 390 | } | 
|  | 391 | } else { | 
|  | 392 | struct thin_disk_superblock *disk_super = NULL; | 
|  | 393 | size_t space_map_root_offset = | 
|  | 394 | offsetof(struct thin_disk_superblock, metadata_space_map_root); | 
|  | 395 |  | 
|  | 396 | r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION, | 
|  | 397 | &sb_validator, space_map_root_offset, | 
|  | 398 | SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock); | 
|  | 399 | if (r < 0) { | 
|  | 400 | DMERR("tm_open_with_sm failed"); | 
|  | 401 | return r; | 
|  | 402 | } | 
|  | 403 |  | 
|  | 404 | disk_super = dm_block_data(sblock); | 
|  | 405 | data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root, | 
|  | 406 | sizeof(disk_super->data_space_map_root)); | 
|  | 407 | if (IS_ERR(data_sm)) { | 
|  | 408 | DMERR("sm_disk_open failed"); | 
|  | 409 | r = PTR_ERR(data_sm); | 
|  | 410 | goto bad; | 
|  | 411 | } | 
|  | 412 | } | 
|  | 413 |  | 
|  | 414 |  | 
|  | 415 | r = dm_tm_unlock(tm, sblock); | 
|  | 416 | if (r < 0) { | 
|  | 417 | DMERR("couldn't unlock superblock"); | 
|  | 418 | goto bad_data_sm; | 
|  | 419 | } | 
|  | 420 |  | 
|  | 421 | pmd->bm = bm; | 
|  | 422 | pmd->metadata_sm = sm; | 
|  | 423 | pmd->data_sm = data_sm; | 
|  | 424 | pmd->tm = tm; | 
|  | 425 | pmd->nb_tm = dm_tm_create_non_blocking_clone(tm); | 
|  | 426 | if (!pmd->nb_tm) { | 
|  | 427 | DMERR("could not create clone tm"); | 
|  | 428 | r = -ENOMEM; | 
|  | 429 | goto bad_data_sm; | 
|  | 430 | } | 
|  | 431 |  | 
|  | 432 | pmd->info.tm = tm; | 
|  | 433 | pmd->info.levels = 2; | 
|  | 434 | pmd->info.value_type.context = pmd->data_sm; | 
|  | 435 | pmd->info.value_type.size = sizeof(__le64); | 
|  | 436 | pmd->info.value_type.inc = data_block_inc; | 
|  | 437 | pmd->info.value_type.dec = data_block_dec; | 
|  | 438 | pmd->info.value_type.equal = data_block_equal; | 
|  | 439 |  | 
|  | 440 | memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); | 
|  | 441 | pmd->nb_info.tm = pmd->nb_tm; | 
|  | 442 |  | 
|  | 443 | pmd->tl_info.tm = tm; | 
|  | 444 | pmd->tl_info.levels = 1; | 
|  | 445 | pmd->tl_info.value_type.context = &pmd->info; | 
|  | 446 | pmd->tl_info.value_type.size = sizeof(__le64); | 
|  | 447 | pmd->tl_info.value_type.inc = subtree_inc; | 
|  | 448 | pmd->tl_info.value_type.dec = subtree_dec; | 
|  | 449 | pmd->tl_info.value_type.equal = subtree_equal; | 
|  | 450 |  | 
|  | 451 | pmd->bl_info.tm = tm; | 
|  | 452 | pmd->bl_info.levels = 1; | 
|  | 453 | pmd->bl_info.value_type.context = pmd->data_sm; | 
|  | 454 | pmd->bl_info.value_type.size = sizeof(__le64); | 
|  | 455 | pmd->bl_info.value_type.inc = data_block_inc; | 
|  | 456 | pmd->bl_info.value_type.dec = data_block_dec; | 
|  | 457 | pmd->bl_info.value_type.equal = data_block_equal; | 
|  | 458 |  | 
|  | 459 | pmd->details_info.tm = tm; | 
|  | 460 | pmd->details_info.levels = 1; | 
|  | 461 | pmd->details_info.value_type.context = NULL; | 
|  | 462 | pmd->details_info.value_type.size = sizeof(struct disk_device_details); | 
|  | 463 | pmd->details_info.value_type.inc = NULL; | 
|  | 464 | pmd->details_info.value_type.dec = NULL; | 
|  | 465 | pmd->details_info.value_type.equal = NULL; | 
|  | 466 |  | 
|  | 467 | pmd->root = 0; | 
|  | 468 |  | 
|  | 469 | init_rwsem(&pmd->root_lock); | 
|  | 470 | pmd->time = 0; | 
|  | 471 | pmd->need_commit = 0; | 
|  | 472 | pmd->details_root = 0; | 
|  | 473 | pmd->trans_id = 0; | 
|  | 474 | pmd->flags = 0; | 
|  | 475 | INIT_LIST_HEAD(&pmd->thin_devices); | 
|  | 476 |  | 
|  | 477 | return 0; | 
|  | 478 |  | 
|  | 479 | bad_data_sm: | 
|  | 480 | dm_sm_destroy(data_sm); | 
|  | 481 | bad: | 
|  | 482 | dm_tm_destroy(tm); | 
|  | 483 | dm_sm_destroy(sm); | 
|  | 484 |  | 
|  | 485 | return r; | 
|  | 486 | } | 
|  | 487 |  | 
|  | 488 | static int __begin_transaction(struct dm_pool_metadata *pmd) | 
|  | 489 | { | 
|  | 490 | int r; | 
|  | 491 | u32 features; | 
|  | 492 | struct thin_disk_superblock *disk_super; | 
|  | 493 | struct dm_block *sblock; | 
|  | 494 |  | 
|  | 495 | /* | 
|  | 496 | * __maybe_commit_transaction() resets these | 
|  | 497 | */ | 
|  | 498 | WARN_ON(pmd->need_commit); | 
|  | 499 |  | 
|  | 500 | /* | 
|  | 501 | * We re-read the superblock every time.  Shouldn't need to do this | 
|  | 502 | * really. | 
|  | 503 | */ | 
|  | 504 | r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, | 
|  | 505 | &sb_validator, &sblock); | 
|  | 506 | if (r) | 
|  | 507 | return r; | 
|  | 508 |  | 
|  | 509 | disk_super = dm_block_data(sblock); | 
|  | 510 | pmd->time = le32_to_cpu(disk_super->time); | 
|  | 511 | pmd->root = le64_to_cpu(disk_super->data_mapping_root); | 
|  | 512 | pmd->details_root = le64_to_cpu(disk_super->device_details_root); | 
|  | 513 | pmd->trans_id = le64_to_cpu(disk_super->trans_id); | 
|  | 514 | pmd->flags = le32_to_cpu(disk_super->flags); | 
|  | 515 | pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); | 
|  | 516 |  | 
|  | 517 | features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP; | 
|  | 518 | if (features) { | 
|  | 519 | DMERR("could not access metadata due to " | 
|  | 520 | "unsupported optional features (%lx).", | 
|  | 521 | (unsigned long)features); | 
|  | 522 | r = -EINVAL; | 
|  | 523 | goto out; | 
|  | 524 | } | 
|  | 525 |  | 
|  | 526 | /* | 
|  | 527 | * Check for read-only metadata to skip the following RDWR checks. | 
|  | 528 | */ | 
|  | 529 | if (get_disk_ro(pmd->bdev->bd_disk)) | 
|  | 530 | goto out; | 
|  | 531 |  | 
|  | 532 | features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; | 
|  | 533 | if (features) { | 
|  | 534 | DMERR("could not access metadata RDWR due to " | 
|  | 535 | "unsupported optional features (%lx).", | 
|  | 536 | (unsigned long)features); | 
|  | 537 | r = -EINVAL; | 
|  | 538 | } | 
|  | 539 |  | 
|  | 540 | out: | 
|  | 541 | dm_bm_unlock(sblock); | 
|  | 542 | return r; | 
|  | 543 | } | 
|  | 544 |  | 
|  | 545 | static int __write_changed_details(struct dm_pool_metadata *pmd) | 
|  | 546 | { | 
|  | 547 | int r; | 
|  | 548 | struct dm_thin_device *td, *tmp; | 
|  | 549 | struct disk_device_details details; | 
|  | 550 | uint64_t key; | 
|  | 551 |  | 
|  | 552 | list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { | 
|  | 553 | if (!td->changed) | 
|  | 554 | continue; | 
|  | 555 |  | 
|  | 556 | key = td->id; | 
|  | 557 |  | 
|  | 558 | details.mapped_blocks = cpu_to_le64(td->mapped_blocks); | 
|  | 559 | details.transaction_id = cpu_to_le64(td->transaction_id); | 
|  | 560 | details.creation_time = cpu_to_le32(td->creation_time); | 
|  | 561 | details.snapshotted_time = cpu_to_le32(td->snapshotted_time); | 
|  | 562 | __dm_bless_for_disk(&details); | 
|  | 563 |  | 
|  | 564 | r = dm_btree_insert(&pmd->details_info, pmd->details_root, | 
|  | 565 | &key, &details, &pmd->details_root); | 
|  | 566 | if (r) | 
|  | 567 | return r; | 
|  | 568 |  | 
|  | 569 | if (td->open_count) | 
|  | 570 | td->changed = 0; | 
|  | 571 | else { | 
|  | 572 | list_del(&td->list); | 
|  | 573 | kfree(td); | 
|  | 574 | } | 
|  | 575 |  | 
|  | 576 | pmd->need_commit = 1; | 
|  | 577 | } | 
|  | 578 |  | 
|  | 579 | return 0; | 
|  | 580 | } | 
|  | 581 |  | 
|  | 582 | static int __commit_transaction(struct dm_pool_metadata *pmd) | 
|  | 583 | { | 
|  | 584 | /* | 
|  | 585 | * FIXME: Associated pool should be made read-only on failure. | 
|  | 586 | */ | 
|  | 587 | int r; | 
|  | 588 | size_t metadata_len, data_len; | 
|  | 589 | struct thin_disk_superblock *disk_super; | 
|  | 590 | struct dm_block *sblock; | 
|  | 591 |  | 
|  | 592 | /* | 
|  | 593 | * We need to know if the thin_disk_superblock exceeds a 512-byte sector. | 
|  | 594 | */ | 
|  | 595 | BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); | 
|  | 596 |  | 
|  | 597 | r = __write_changed_details(pmd); | 
|  | 598 | if (r < 0) | 
|  | 599 | goto out; | 
|  | 600 |  | 
|  | 601 | if (!pmd->need_commit) | 
|  | 602 | goto out; | 
|  | 603 |  | 
|  | 604 | r = dm_sm_commit(pmd->data_sm); | 
|  | 605 | if (r < 0) | 
|  | 606 | goto out; | 
|  | 607 |  | 
|  | 608 | r = dm_tm_pre_commit(pmd->tm); | 
|  | 609 | if (r < 0) | 
|  | 610 | goto out; | 
|  | 611 |  | 
|  | 612 | r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); | 
|  | 613 | if (r < 0) | 
|  | 614 | goto out; | 
|  | 615 |  | 
|  | 616 | r = dm_sm_root_size(pmd->metadata_sm, &data_len); | 
|  | 617 | if (r < 0) | 
|  | 618 | goto out; | 
|  | 619 |  | 
|  | 620 | r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, | 
|  | 621 | &sb_validator, &sblock); | 
|  | 622 | if (r) | 
|  | 623 | goto out; | 
|  | 624 |  | 
|  | 625 | disk_super = dm_block_data(sblock); | 
|  | 626 | disk_super->time = cpu_to_le32(pmd->time); | 
|  | 627 | disk_super->data_mapping_root = cpu_to_le64(pmd->root); | 
|  | 628 | disk_super->device_details_root = cpu_to_le64(pmd->details_root); | 
|  | 629 | disk_super->trans_id = cpu_to_le64(pmd->trans_id); | 
|  | 630 | disk_super->flags = cpu_to_le32(pmd->flags); | 
|  | 631 |  | 
|  | 632 | r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, | 
|  | 633 | metadata_len); | 
|  | 634 | if (r < 0) | 
|  | 635 | goto out_locked; | 
|  | 636 |  | 
|  | 637 | r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, | 
|  | 638 | data_len); | 
|  | 639 | if (r < 0) | 
|  | 640 | goto out_locked; | 
|  | 641 |  | 
|  | 642 | r = dm_tm_commit(pmd->tm, sblock); | 
|  | 643 | if (!r) | 
|  | 644 | pmd->need_commit = 0; | 
|  | 645 |  | 
|  | 646 | out: | 
|  | 647 | return r; | 
|  | 648 |  | 
|  | 649 | out_locked: | 
|  | 650 | dm_bm_unlock(sblock); | 
|  | 651 | return r; | 
|  | 652 | } | 
|  | 653 |  | 
|  | 654 | struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, | 
|  | 655 | sector_t data_block_size) | 
|  | 656 | { | 
|  | 657 | int r; | 
|  | 658 | struct thin_disk_superblock *disk_super; | 
|  | 659 | struct dm_pool_metadata *pmd; | 
|  | 660 | sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; | 
|  | 661 | struct dm_block_manager *bm; | 
|  | 662 | int create; | 
|  | 663 | struct dm_block *sblock; | 
|  | 664 |  | 
|  | 665 | pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); | 
|  | 666 | if (!pmd) { | 
|  | 667 | DMERR("could not allocate metadata struct"); | 
|  | 668 | return ERR_PTR(-ENOMEM); | 
|  | 669 | } | 
|  | 670 |  | 
|  | 671 | /* | 
|  | 672 | * Max hex locks: | 
|  | 673 | *  3 for btree insert + | 
|  | 674 | *  2 for btree lookup used within space map | 
|  | 675 | */ | 
|  | 676 | bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE, | 
|  | 677 | THIN_METADATA_CACHE_SIZE, 5); | 
|  | 678 | if (!bm) { | 
|  | 679 | DMERR("could not create block manager"); | 
|  | 680 | kfree(pmd); | 
|  | 681 | return ERR_PTR(-ENOMEM); | 
|  | 682 | } | 
|  | 683 |  | 
|  | 684 | r = superblock_all_zeroes(bm, &create); | 
|  | 685 | if (r) { | 
|  | 686 | dm_block_manager_destroy(bm); | 
|  | 687 | kfree(pmd); | 
|  | 688 | return ERR_PTR(r); | 
|  | 689 | } | 
|  | 690 |  | 
|  | 691 |  | 
|  | 692 | r = init_pmd(pmd, bm, 0, create); | 
|  | 693 | if (r) { | 
|  | 694 | dm_block_manager_destroy(bm); | 
|  | 695 | kfree(pmd); | 
|  | 696 | return ERR_PTR(r); | 
|  | 697 | } | 
|  | 698 | pmd->bdev = bdev; | 
|  | 699 |  | 
|  | 700 | if (!create) { | 
|  | 701 | r = __begin_transaction(pmd); | 
|  | 702 | if (r < 0) | 
|  | 703 | goto bad; | 
|  | 704 | return pmd; | 
|  | 705 | } | 
|  | 706 |  | 
|  | 707 | /* | 
|  | 708 | * Create. | 
|  | 709 | */ | 
|  | 710 | r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, | 
|  | 711 | &sb_validator, &sblock); | 
|  | 712 | if (r) | 
|  | 713 | goto bad; | 
|  | 714 |  | 
|  | 715 | disk_super = dm_block_data(sblock); | 
|  | 716 | disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); | 
|  | 717 | disk_super->version = cpu_to_le32(THIN_VERSION); | 
|  | 718 | disk_super->time = 0; | 
|  | 719 | disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); | 
|  | 720 | disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); | 
|  | 721 | disk_super->data_block_size = cpu_to_le32(data_block_size); | 
|  | 722 |  | 
|  | 723 | r = dm_bm_unlock(sblock); | 
|  | 724 | if (r < 0) | 
|  | 725 | goto bad; | 
|  | 726 |  | 
|  | 727 | r = dm_btree_empty(&pmd->info, &pmd->root); | 
|  | 728 | if (r < 0) | 
|  | 729 | goto bad; | 
|  | 730 |  | 
|  | 731 | r = dm_btree_empty(&pmd->details_info, &pmd->details_root); | 
|  | 732 | if (r < 0) { | 
|  | 733 | DMERR("couldn't create devices root"); | 
|  | 734 | goto bad; | 
|  | 735 | } | 
|  | 736 |  | 
|  | 737 | pmd->flags = 0; | 
|  | 738 | pmd->need_commit = 1; | 
|  | 739 | r = dm_pool_commit_metadata(pmd); | 
|  | 740 | if (r < 0) { | 
|  | 741 | DMERR("%s: dm_pool_commit_metadata() failed, error = %d", | 
|  | 742 | __func__, r); | 
|  | 743 | goto bad; | 
|  | 744 | } | 
|  | 745 |  | 
|  | 746 | return pmd; | 
|  | 747 |  | 
|  | 748 | bad: | 
|  | 749 | if (dm_pool_metadata_close(pmd) < 0) | 
|  | 750 | DMWARN("%s: dm_pool_metadata_close() failed.", __func__); | 
|  | 751 | return ERR_PTR(r); | 
|  | 752 | } | 
|  | 753 |  | 
|  | 754 | int dm_pool_metadata_close(struct dm_pool_metadata *pmd) | 
|  | 755 | { | 
|  | 756 | int r; | 
|  | 757 | unsigned open_devices = 0; | 
|  | 758 | struct dm_thin_device *td, *tmp; | 
|  | 759 |  | 
|  | 760 | down_read(&pmd->root_lock); | 
|  | 761 | list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { | 
|  | 762 | if (td->open_count) | 
|  | 763 | open_devices++; | 
|  | 764 | else { | 
|  | 765 | list_del(&td->list); | 
|  | 766 | kfree(td); | 
|  | 767 | } | 
|  | 768 | } | 
|  | 769 | up_read(&pmd->root_lock); | 
|  | 770 |  | 
|  | 771 | if (open_devices) { | 
|  | 772 | DMERR("attempt to close pmd when %u device(s) are still open", | 
|  | 773 | open_devices); | 
|  | 774 | return -EBUSY; | 
|  | 775 | } | 
|  | 776 |  | 
|  | 777 | r = __commit_transaction(pmd); | 
|  | 778 | if (r < 0) | 
|  | 779 | DMWARN("%s: __commit_transaction() failed, error = %d", | 
|  | 780 | __func__, r); | 
|  | 781 |  | 
|  | 782 | dm_tm_destroy(pmd->tm); | 
|  | 783 | dm_tm_destroy(pmd->nb_tm); | 
|  | 784 | dm_block_manager_destroy(pmd->bm); | 
|  | 785 | dm_sm_destroy(pmd->metadata_sm); | 
|  | 786 | dm_sm_destroy(pmd->data_sm); | 
|  | 787 | kfree(pmd); | 
|  | 788 |  | 
|  | 789 | return 0; | 
|  | 790 | } | 
|  | 791 |  | 
|  | 792 | static int __open_device(struct dm_pool_metadata *pmd, | 
|  | 793 | dm_thin_id dev, int create, | 
|  | 794 | struct dm_thin_device **td) | 
|  | 795 | { | 
|  | 796 | int r, changed = 0; | 
|  | 797 | struct dm_thin_device *td2; | 
|  | 798 | uint64_t key = dev; | 
|  | 799 | struct disk_device_details details_le; | 
|  | 800 |  | 
|  | 801 | /* | 
|  | 802 | * Check the device isn't already open. | 
|  | 803 | */ | 
|  | 804 | list_for_each_entry(td2, &pmd->thin_devices, list) | 
|  | 805 | if (td2->id == dev) { | 
|  | 806 | td2->open_count++; | 
|  | 807 | *td = td2; | 
|  | 808 | return 0; | 
|  | 809 | } | 
|  | 810 |  | 
|  | 811 | /* | 
|  | 812 | * Check the device exists. | 
|  | 813 | */ | 
|  | 814 | r = dm_btree_lookup(&pmd->details_info, pmd->details_root, | 
|  | 815 | &key, &details_le); | 
|  | 816 | if (r) { | 
|  | 817 | if (r != -ENODATA || !create) | 
|  | 818 | return r; | 
|  | 819 |  | 
|  | 820 | changed = 1; | 
|  | 821 | details_le.mapped_blocks = 0; | 
|  | 822 | details_le.transaction_id = cpu_to_le64(pmd->trans_id); | 
|  | 823 | details_le.creation_time = cpu_to_le32(pmd->time); | 
|  | 824 | details_le.snapshotted_time = cpu_to_le32(pmd->time); | 
|  | 825 | } | 
|  | 826 |  | 
|  | 827 | *td = kmalloc(sizeof(**td), GFP_NOIO); | 
|  | 828 | if (!*td) | 
|  | 829 | return -ENOMEM; | 
|  | 830 |  | 
|  | 831 | (*td)->pmd = pmd; | 
|  | 832 | (*td)->id = dev; | 
|  | 833 | (*td)->open_count = 1; | 
|  | 834 | (*td)->changed = changed; | 
|  | 835 | (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); | 
|  | 836 | (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); | 
|  | 837 | (*td)->creation_time = le32_to_cpu(details_le.creation_time); | 
|  | 838 | (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time); | 
|  | 839 |  | 
|  | 840 | list_add(&(*td)->list, &pmd->thin_devices); | 
|  | 841 |  | 
|  | 842 | return 0; | 
|  | 843 | } | 
|  | 844 |  | 
|  | 845 | static void __close_device(struct dm_thin_device *td) | 
|  | 846 | { | 
|  | 847 | --td->open_count; | 
|  | 848 | } | 
|  | 849 |  | 
|  | 850 | static int __create_thin(struct dm_pool_metadata *pmd, | 
|  | 851 | dm_thin_id dev) | 
|  | 852 | { | 
|  | 853 | int r; | 
|  | 854 | dm_block_t dev_root; | 
|  | 855 | uint64_t key = dev; | 
|  | 856 | struct disk_device_details details_le; | 
|  | 857 | struct dm_thin_device *td; | 
|  | 858 | __le64 value; | 
|  | 859 |  | 
|  | 860 | r = dm_btree_lookup(&pmd->details_info, pmd->details_root, | 
|  | 861 | &key, &details_le); | 
|  | 862 | if (!r) | 
|  | 863 | return -EEXIST; | 
|  | 864 |  | 
|  | 865 | /* | 
|  | 866 | * Create an empty btree for the mappings. | 
|  | 867 | */ | 
|  | 868 | r = dm_btree_empty(&pmd->bl_info, &dev_root); | 
|  | 869 | if (r) | 
|  | 870 | return r; | 
|  | 871 |  | 
|  | 872 | /* | 
|  | 873 | * Insert it into the main mapping tree. | 
|  | 874 | */ | 
|  | 875 | value = cpu_to_le64(dev_root); | 
|  | 876 | __dm_bless_for_disk(&value); | 
|  | 877 | r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); | 
|  | 878 | if (r) { | 
|  | 879 | dm_btree_del(&pmd->bl_info, dev_root); | 
|  | 880 | return r; | 
|  | 881 | } | 
|  | 882 |  | 
|  | 883 | r = __open_device(pmd, dev, 1, &td); | 
|  | 884 | if (r) { | 
|  | 885 | __close_device(td); | 
|  | 886 | dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); | 
|  | 887 | dm_btree_del(&pmd->bl_info, dev_root); | 
|  | 888 | return r; | 
|  | 889 | } | 
|  | 890 | td->changed = 1; | 
|  | 891 | __close_device(td); | 
|  | 892 |  | 
|  | 893 | return r; | 
|  | 894 | } | 
|  | 895 |  | 
|  | 896 | int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) | 
|  | 897 | { | 
|  | 898 | int r; | 
|  | 899 |  | 
|  | 900 | down_write(&pmd->root_lock); | 
|  | 901 | r = __create_thin(pmd, dev); | 
|  | 902 | up_write(&pmd->root_lock); | 
|  | 903 |  | 
|  | 904 | return r; | 
|  | 905 | } | 
|  | 906 |  | 
|  | 907 | static int __set_snapshot_details(struct dm_pool_metadata *pmd, | 
|  | 908 | struct dm_thin_device *snap, | 
|  | 909 | dm_thin_id origin, uint32_t time) | 
|  | 910 | { | 
|  | 911 | int r; | 
|  | 912 | struct dm_thin_device *td; | 
|  | 913 |  | 
|  | 914 | r = __open_device(pmd, origin, 0, &td); | 
|  | 915 | if (r) | 
|  | 916 | return r; | 
|  | 917 |  | 
|  | 918 | td->changed = 1; | 
|  | 919 | td->snapshotted_time = time; | 
|  | 920 |  | 
|  | 921 | snap->mapped_blocks = td->mapped_blocks; | 
|  | 922 | snap->snapshotted_time = time; | 
|  | 923 | __close_device(td); | 
|  | 924 |  | 
|  | 925 | return 0; | 
|  | 926 | } | 
|  | 927 |  | 
|  | 928 | static int __create_snap(struct dm_pool_metadata *pmd, | 
|  | 929 | dm_thin_id dev, dm_thin_id origin) | 
|  | 930 | { | 
|  | 931 | int r; | 
|  | 932 | dm_block_t origin_root; | 
|  | 933 | uint64_t key = origin, dev_key = dev; | 
|  | 934 | struct dm_thin_device *td; | 
|  | 935 | struct disk_device_details details_le; | 
|  | 936 | __le64 value; | 
|  | 937 |  | 
|  | 938 | /* check this device is unused */ | 
|  | 939 | r = dm_btree_lookup(&pmd->details_info, pmd->details_root, | 
|  | 940 | &dev_key, &details_le); | 
|  | 941 | if (!r) | 
|  | 942 | return -EEXIST; | 
|  | 943 |  | 
|  | 944 | /* find the mapping tree for the origin */ | 
|  | 945 | r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value); | 
|  | 946 | if (r) | 
|  | 947 | return r; | 
|  | 948 | origin_root = le64_to_cpu(value); | 
|  | 949 |  | 
|  | 950 | /* clone the origin, an inc will do */ | 
|  | 951 | dm_tm_inc(pmd->tm, origin_root); | 
|  | 952 |  | 
|  | 953 | /* insert into the main mapping tree */ | 
|  | 954 | value = cpu_to_le64(origin_root); | 
|  | 955 | __dm_bless_for_disk(&value); | 
|  | 956 | key = dev; | 
|  | 957 | r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); | 
|  | 958 | if (r) { | 
|  | 959 | dm_tm_dec(pmd->tm, origin_root); | 
|  | 960 | return r; | 
|  | 961 | } | 
|  | 962 |  | 
|  | 963 | pmd->time++; | 
|  | 964 |  | 
|  | 965 | r = __open_device(pmd, dev, 1, &td); | 
|  | 966 | if (r) | 
|  | 967 | goto bad; | 
|  | 968 |  | 
|  | 969 | r = __set_snapshot_details(pmd, td, origin, pmd->time); | 
|  | 970 | if (r) | 
|  | 971 | goto bad; | 
|  | 972 |  | 
|  | 973 | __close_device(td); | 
|  | 974 | return 0; | 
|  | 975 |  | 
|  | 976 | bad: | 
|  | 977 | __close_device(td); | 
|  | 978 | dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); | 
|  | 979 | dm_btree_remove(&pmd->details_info, pmd->details_root, | 
|  | 980 | &key, &pmd->details_root); | 
|  | 981 | return r; | 
|  | 982 | } | 
|  | 983 |  | 
|  | 984 | int dm_pool_create_snap(struct dm_pool_metadata *pmd, | 
|  | 985 | dm_thin_id dev, | 
|  | 986 | dm_thin_id origin) | 
|  | 987 | { | 
|  | 988 | int r; | 
|  | 989 |  | 
|  | 990 | down_write(&pmd->root_lock); | 
|  | 991 | r = __create_snap(pmd, dev, origin); | 
|  | 992 | up_write(&pmd->root_lock); | 
|  | 993 |  | 
|  | 994 | return r; | 
|  | 995 | } | 
|  | 996 |  | 
|  | 997 | static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev) | 
|  | 998 | { | 
|  | 999 | int r; | 
|  | 1000 | uint64_t key = dev; | 
|  | 1001 | struct dm_thin_device *td; | 
|  | 1002 |  | 
|  | 1003 | /* TODO: failure should mark the transaction invalid */ | 
|  | 1004 | r = __open_device(pmd, dev, 0, &td); | 
|  | 1005 | if (r) | 
|  | 1006 | return r; | 
|  | 1007 |  | 
|  | 1008 | if (td->open_count > 1) { | 
|  | 1009 | __close_device(td); | 
|  | 1010 | return -EBUSY; | 
|  | 1011 | } | 
|  | 1012 |  | 
|  | 1013 | list_del(&td->list); | 
|  | 1014 | kfree(td); | 
|  | 1015 | r = dm_btree_remove(&pmd->details_info, pmd->details_root, | 
|  | 1016 | &key, &pmd->details_root); | 
|  | 1017 | if (r) | 
|  | 1018 | return r; | 
|  | 1019 |  | 
|  | 1020 | r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); | 
|  | 1021 | if (r) | 
|  | 1022 | return r; | 
|  | 1023 |  | 
|  | 1024 | pmd->need_commit = 1; | 
|  | 1025 |  | 
|  | 1026 | return 0; | 
|  | 1027 | } | 
|  | 1028 |  | 
|  | 1029 | int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, | 
|  | 1030 | dm_thin_id dev) | 
|  | 1031 | { | 
|  | 1032 | int r; | 
|  | 1033 |  | 
|  | 1034 | down_write(&pmd->root_lock); | 
|  | 1035 | r = __delete_device(pmd, dev); | 
|  | 1036 | up_write(&pmd->root_lock); | 
|  | 1037 |  | 
|  | 1038 | return r; | 
|  | 1039 | } | 
|  | 1040 |  | 
|  | 1041 | int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, | 
|  | 1042 | uint64_t current_id, | 
|  | 1043 | uint64_t new_id) | 
|  | 1044 | { | 
|  | 1045 | down_write(&pmd->root_lock); | 
|  | 1046 | if (pmd->trans_id != current_id) { | 
|  | 1047 | up_write(&pmd->root_lock); | 
|  | 1048 | DMERR("mismatched transaction id"); | 
|  | 1049 | return -EINVAL; | 
|  | 1050 | } | 
|  | 1051 |  | 
|  | 1052 | pmd->trans_id = new_id; | 
|  | 1053 | pmd->need_commit = 1; | 
|  | 1054 | up_write(&pmd->root_lock); | 
|  | 1055 |  | 
|  | 1056 | return 0; | 
|  | 1057 | } | 
|  | 1058 |  | 
|  | 1059 | int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, | 
|  | 1060 | uint64_t *result) | 
|  | 1061 | { | 
|  | 1062 | down_read(&pmd->root_lock); | 
|  | 1063 | *result = pmd->trans_id; | 
|  | 1064 | up_read(&pmd->root_lock); | 
|  | 1065 |  | 
|  | 1066 | return 0; | 
|  | 1067 | } | 
|  | 1068 |  | 
|  | 1069 | static int __get_held_metadata_root(struct dm_pool_metadata *pmd, | 
|  | 1070 | dm_block_t *result) | 
|  | 1071 | { | 
|  | 1072 | int r; | 
|  | 1073 | struct thin_disk_superblock *disk_super; | 
|  | 1074 | struct dm_block *sblock; | 
|  | 1075 |  | 
|  | 1076 | r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, | 
|  | 1077 | &sb_validator, &sblock); | 
|  | 1078 | if (r) | 
|  | 1079 | return r; | 
|  | 1080 |  | 
|  | 1081 | disk_super = dm_block_data(sblock); | 
|  | 1082 | *result = le64_to_cpu(disk_super->held_root); | 
|  | 1083 |  | 
|  | 1084 | return dm_bm_unlock(sblock); | 
|  | 1085 | } | 
|  | 1086 |  | 
|  | 1087 | int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, | 
|  | 1088 | dm_block_t *result) | 
|  | 1089 | { | 
|  | 1090 | int r; | 
|  | 1091 |  | 
|  | 1092 | down_read(&pmd->root_lock); | 
|  | 1093 | r = __get_held_metadata_root(pmd, result); | 
|  | 1094 | up_read(&pmd->root_lock); | 
|  | 1095 |  | 
|  | 1096 | return r; | 
|  | 1097 | } | 
|  | 1098 |  | 
|  | 1099 | int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, | 
|  | 1100 | struct dm_thin_device **td) | 
|  | 1101 | { | 
|  | 1102 | int r; | 
|  | 1103 |  | 
|  | 1104 | down_write(&pmd->root_lock); | 
|  | 1105 | r = __open_device(pmd, dev, 0, td); | 
|  | 1106 | up_write(&pmd->root_lock); | 
|  | 1107 |  | 
|  | 1108 | return r; | 
|  | 1109 | } | 
|  | 1110 |  | 
|  | 1111 | int dm_pool_close_thin_device(struct dm_thin_device *td) | 
|  | 1112 | { | 
|  | 1113 | down_write(&td->pmd->root_lock); | 
|  | 1114 | __close_device(td); | 
|  | 1115 | up_write(&td->pmd->root_lock); | 
|  | 1116 |  | 
|  | 1117 | return 0; | 
|  | 1118 | } | 
|  | 1119 |  | 
|  | 1120 | dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) | 
|  | 1121 | { | 
|  | 1122 | return td->id; | 
|  | 1123 | } | 
|  | 1124 |  | 
|  | 1125 | static int __snapshotted_since(struct dm_thin_device *td, uint32_t time) | 
|  | 1126 | { | 
|  | 1127 | return td->snapshotted_time > time; | 
|  | 1128 | } | 
|  | 1129 |  | 
|  | 1130 | int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, | 
|  | 1131 | int can_block, struct dm_thin_lookup_result *result) | 
|  | 1132 | { | 
|  | 1133 | int r; | 
|  | 1134 | uint64_t block_time = 0; | 
|  | 1135 | __le64 value; | 
|  | 1136 | struct dm_pool_metadata *pmd = td->pmd; | 
|  | 1137 | dm_block_t keys[2] = { td->id, block }; | 
|  | 1138 |  | 
|  | 1139 | if (can_block) { | 
|  | 1140 | down_read(&pmd->root_lock); | 
|  | 1141 | r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value); | 
|  | 1142 | if (!r) | 
|  | 1143 | block_time = le64_to_cpu(value); | 
|  | 1144 | up_read(&pmd->root_lock); | 
|  | 1145 |  | 
|  | 1146 | } else if (down_read_trylock(&pmd->root_lock)) { | 
|  | 1147 | r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value); | 
|  | 1148 | if (!r) | 
|  | 1149 | block_time = le64_to_cpu(value); | 
|  | 1150 | up_read(&pmd->root_lock); | 
|  | 1151 |  | 
|  | 1152 | } else | 
|  | 1153 | return -EWOULDBLOCK; | 
|  | 1154 |  | 
|  | 1155 | if (!r) { | 
|  | 1156 | dm_block_t exception_block; | 
|  | 1157 | uint32_t exception_time; | 
|  | 1158 | unpack_block_time(block_time, &exception_block, | 
|  | 1159 | &exception_time); | 
|  | 1160 | result->block = exception_block; | 
|  | 1161 | result->shared = __snapshotted_since(td, exception_time); | 
|  | 1162 | } | 
|  | 1163 |  | 
|  | 1164 | return r; | 
|  | 1165 | } | 
|  | 1166 |  | 
|  | 1167 | static int __insert(struct dm_thin_device *td, dm_block_t block, | 
|  | 1168 | dm_block_t data_block) | 
|  | 1169 | { | 
|  | 1170 | int r, inserted; | 
|  | 1171 | __le64 value; | 
|  | 1172 | struct dm_pool_metadata *pmd = td->pmd; | 
|  | 1173 | dm_block_t keys[2] = { td->id, block }; | 
|  | 1174 |  | 
|  | 1175 | pmd->need_commit = 1; | 
|  | 1176 | value = cpu_to_le64(pack_block_time(data_block, pmd->time)); | 
|  | 1177 | __dm_bless_for_disk(&value); | 
|  | 1178 |  | 
|  | 1179 | r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value, | 
|  | 1180 | &pmd->root, &inserted); | 
|  | 1181 | if (r) | 
|  | 1182 | return r; | 
|  | 1183 |  | 
|  | 1184 | if (inserted) { | 
|  | 1185 | td->mapped_blocks++; | 
|  | 1186 | td->changed = 1; | 
|  | 1187 | } | 
|  | 1188 |  | 
|  | 1189 | return 0; | 
|  | 1190 | } | 
|  | 1191 |  | 
|  | 1192 | int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, | 
|  | 1193 | dm_block_t data_block) | 
|  | 1194 | { | 
|  | 1195 | int r; | 
|  | 1196 |  | 
|  | 1197 | down_write(&td->pmd->root_lock); | 
|  | 1198 | r = __insert(td, block, data_block); | 
|  | 1199 | up_write(&td->pmd->root_lock); | 
|  | 1200 |  | 
|  | 1201 | return r; | 
|  | 1202 | } | 
|  | 1203 |  | 
|  | 1204 | static int __remove(struct dm_thin_device *td, dm_block_t block) | 
|  | 1205 | { | 
|  | 1206 | int r; | 
|  | 1207 | struct dm_pool_metadata *pmd = td->pmd; | 
|  | 1208 | dm_block_t keys[2] = { td->id, block }; | 
|  | 1209 |  | 
|  | 1210 | r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root); | 
|  | 1211 | if (r) | 
|  | 1212 | return r; | 
|  | 1213 |  | 
|  | 1214 | pmd->need_commit = 1; | 
|  | 1215 |  | 
|  | 1216 | return 0; | 
|  | 1217 | } | 
|  | 1218 |  | 
|  | 1219 | int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) | 
|  | 1220 | { | 
|  | 1221 | int r; | 
|  | 1222 |  | 
|  | 1223 | down_write(&td->pmd->root_lock); | 
|  | 1224 | r = __remove(td, block); | 
|  | 1225 | up_write(&td->pmd->root_lock); | 
|  | 1226 |  | 
|  | 1227 | return r; | 
|  | 1228 | } | 
|  | 1229 |  | 
|  | 1230 | int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) | 
|  | 1231 | { | 
|  | 1232 | int r; | 
|  | 1233 |  | 
|  | 1234 | down_write(&pmd->root_lock); | 
|  | 1235 |  | 
|  | 1236 | r = dm_sm_new_block(pmd->data_sm, result); | 
|  | 1237 | pmd->need_commit = 1; | 
|  | 1238 |  | 
|  | 1239 | up_write(&pmd->root_lock); | 
|  | 1240 |  | 
|  | 1241 | return r; | 
|  | 1242 | } | 
|  | 1243 |  | 
|  | 1244 | int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) | 
|  | 1245 | { | 
|  | 1246 | int r; | 
|  | 1247 |  | 
|  | 1248 | down_write(&pmd->root_lock); | 
|  | 1249 |  | 
|  | 1250 | r = __commit_transaction(pmd); | 
|  | 1251 | if (r <= 0) | 
|  | 1252 | goto out; | 
|  | 1253 |  | 
|  | 1254 | /* | 
|  | 1255 | * Open the next transaction. | 
|  | 1256 | */ | 
|  | 1257 | r = __begin_transaction(pmd); | 
|  | 1258 | out: | 
|  | 1259 | up_write(&pmd->root_lock); | 
|  | 1260 | return r; | 
|  | 1261 | } | 
|  | 1262 |  | 
|  | 1263 | int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) | 
|  | 1264 | { | 
|  | 1265 | int r; | 
|  | 1266 |  | 
|  | 1267 | down_read(&pmd->root_lock); | 
|  | 1268 | r = dm_sm_get_nr_free(pmd->data_sm, result); | 
|  | 1269 | up_read(&pmd->root_lock); | 
|  | 1270 |  | 
|  | 1271 | return r; | 
|  | 1272 | } | 
|  | 1273 |  | 
|  | 1274 | int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, | 
|  | 1275 | dm_block_t *result) | 
|  | 1276 | { | 
|  | 1277 | int r; | 
|  | 1278 |  | 
|  | 1279 | down_read(&pmd->root_lock); | 
|  | 1280 | r = dm_sm_get_nr_free(pmd->metadata_sm, result); | 
|  | 1281 | up_read(&pmd->root_lock); | 
|  | 1282 |  | 
|  | 1283 | return r; | 
|  | 1284 | } | 
|  | 1285 |  | 
|  | 1286 | int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, | 
|  | 1287 | dm_block_t *result) | 
|  | 1288 | { | 
|  | 1289 | int r; | 
|  | 1290 |  | 
|  | 1291 | down_read(&pmd->root_lock); | 
|  | 1292 | r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); | 
|  | 1293 | up_read(&pmd->root_lock); | 
|  | 1294 |  | 
|  | 1295 | return r; | 
|  | 1296 | } | 
|  | 1297 |  | 
|  | 1298 | int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result) | 
|  | 1299 | { | 
|  | 1300 | down_read(&pmd->root_lock); | 
|  | 1301 | *result = pmd->data_block_size; | 
|  | 1302 | up_read(&pmd->root_lock); | 
|  | 1303 |  | 
|  | 1304 | return 0; | 
|  | 1305 | } | 
|  | 1306 |  | 
|  | 1307 | int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) | 
|  | 1308 | { | 
|  | 1309 | int r; | 
|  | 1310 |  | 
|  | 1311 | down_read(&pmd->root_lock); | 
|  | 1312 | r = dm_sm_get_nr_blocks(pmd->data_sm, result); | 
|  | 1313 | up_read(&pmd->root_lock); | 
|  | 1314 |  | 
|  | 1315 | return r; | 
|  | 1316 | } | 
|  | 1317 |  | 
|  | 1318 | int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) | 
|  | 1319 | { | 
|  | 1320 | struct dm_pool_metadata *pmd = td->pmd; | 
|  | 1321 |  | 
|  | 1322 | down_read(&pmd->root_lock); | 
|  | 1323 | *result = td->mapped_blocks; | 
|  | 1324 | up_read(&pmd->root_lock); | 
|  | 1325 |  | 
|  | 1326 | return 0; | 
|  | 1327 | } | 
|  | 1328 |  | 
|  | 1329 | static int __highest_block(struct dm_thin_device *td, dm_block_t *result) | 
|  | 1330 | { | 
|  | 1331 | int r; | 
|  | 1332 | __le64 value_le; | 
|  | 1333 | dm_block_t thin_root; | 
|  | 1334 | struct dm_pool_metadata *pmd = td->pmd; | 
|  | 1335 |  | 
|  | 1336 | r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le); | 
|  | 1337 | if (r) | 
|  | 1338 | return r; | 
|  | 1339 |  | 
|  | 1340 | thin_root = le64_to_cpu(value_le); | 
|  | 1341 |  | 
|  | 1342 | return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result); | 
|  | 1343 | } | 
|  | 1344 |  | 
|  | 1345 | int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, | 
|  | 1346 | dm_block_t *result) | 
|  | 1347 | { | 
|  | 1348 | int r; | 
|  | 1349 | struct dm_pool_metadata *pmd = td->pmd; | 
|  | 1350 |  | 
|  | 1351 | down_read(&pmd->root_lock); | 
|  | 1352 | r = __highest_block(td, result); | 
|  | 1353 | up_read(&pmd->root_lock); | 
|  | 1354 |  | 
|  | 1355 | return r; | 
|  | 1356 | } | 
|  | 1357 |  | 
|  | 1358 | static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) | 
|  | 1359 | { | 
|  | 1360 | int r; | 
|  | 1361 | dm_block_t old_count; | 
|  | 1362 |  | 
|  | 1363 | r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count); | 
|  | 1364 | if (r) | 
|  | 1365 | return r; | 
|  | 1366 |  | 
|  | 1367 | if (new_count == old_count) | 
|  | 1368 | return 0; | 
|  | 1369 |  | 
|  | 1370 | if (new_count < old_count) { | 
|  | 1371 | DMERR("cannot reduce size of data device"); | 
|  | 1372 | return -EINVAL; | 
|  | 1373 | } | 
|  | 1374 |  | 
|  | 1375 | r = dm_sm_extend(pmd->data_sm, new_count - old_count); | 
|  | 1376 | if (!r) | 
|  | 1377 | pmd->need_commit = 1; | 
|  | 1378 |  | 
|  | 1379 | return r; | 
|  | 1380 | } | 
|  | 1381 |  | 
|  | 1382 | int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) | 
|  | 1383 | { | 
|  | 1384 | int r; | 
|  | 1385 |  | 
|  | 1386 | down_write(&pmd->root_lock); | 
|  | 1387 | r = __resize_data_dev(pmd, new_count); | 
|  | 1388 | up_write(&pmd->root_lock); | 
|  | 1389 |  | 
|  | 1390 | return r; | 
|  | 1391 | } |