blob: a42b28e7f3fa27918e157d27038bb16e00c0c5ba [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderf0f8cef2012-01-29 13:57:44 -060044#define RBD_DRV_NAME "rbd"
45#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070046
47#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
48
Alex Elder21079782012-01-24 10:08:36 -060049#define RBD_MAX_MD_NAME_LEN (RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
Yehuda Sadeh602adf42010-08-12 16:11:25 -070050#define RBD_MAX_POOL_NAME_LEN 64
51#define RBD_MAX_SNAP_NAME_LEN 32
52#define RBD_MAX_OPT_LEN 1024
53
54#define RBD_SNAP_HEAD_NAME "-"
55
Alex Elder81a89792012-02-02 08:13:30 -060056/*
57 * An RBD device name will be "rbd#", where the "rbd" comes from
58 * RBD_DRV_NAME above, and # is a unique integer identifier.
59 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
60 * enough to hold all possible device names.
61 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070062#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060063#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070065#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
66
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;			/* image size in bytes */
	char block_name[32];		/* prefix for data object names
					   ("<block_name>.<seg>") */
	__u8 obj_order;			/* log2 of object/segment size */
	__u8 crypt_type;
	__u8 comp_type;
	struct rw_semaphore snap_rwsem;	/* protects the snapshot state below */
	struct ceph_snap_context *snapc;
	size_t snap_names_len;		/* total bytes in snap_names */
	u64 snap_seq;
	u32 total_snaps;

	char *snap_names;		/* NUL-separated snapshot names */
	u64 *snap_sizes;		/* per-snapshot image sizes */

	u64 obj_version;
};
87
/* options parsed from the rbd add option string */
struct rbd_options {
	int notify_timeout;	/* notify_timeout=%d option; defaults to
				   RBD_NOTIFY_TIMEOUT_DEFAULT */
};
91
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;
	struct rbd_options *rbd_opts;
	struct kref kref;		/* released via rbd_client_release() */
	struct list_head node;		/* entry in rbd_client_list */
};
101
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;		/* nonzero once this request has completed */
	int rc;			/* completion result code */
	u64 bytes;		/* bytes transferred */
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;		/* number of status slots */
	int num_done;		/* count of requests completed in order */
	struct kref kref;	/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* legacy flexible array */
};
120
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;
	int coll_index;			/* slot in coll->status[] */
	struct rbd_req_coll *coll;	/* owning collection, may be NULL */
};
132
/* in-memory snapshot record, exposed through sysfs via 'dev' */
struct rbd_snap {
	struct device dev;
	const char *name;
	size_t size;
	struct list_head node;	/* entry in rbd_device->snaps */
	u64 id;
};
140
/*
 * a single device
 */
struct rbd_device {
	int id;			/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_client *rbd_client;	/* possibly shared ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
	int obj_len;
	char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
	char pool_name[RBD_MAX_POOL_NAME_LEN];
	int poolid;

	/* watch/notify state for header updates */
	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	char snap_name[RBD_MAX_SNAP_NAME_LEN];
	u32 cur_snap;	/* index+1 of current snapshot within snap context
			   0 - for the head */
	int read_only;

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
180
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700181static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600182
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700183static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600184static DEFINE_SPINLOCK(rbd_dev_list_lock);
185
Alex Elder432b8582012-01-29 13:57:44 -0600186static LIST_HEAD(rbd_client_list); /* clients */
187static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700188
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800189static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
190static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800191static ssize_t rbd_snap_add(struct device *dev,
192 struct device_attribute *attr,
193 const char *buf,
194 size_t count);
195static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
Justin P. Mattock69932482011-07-26 23:06:29 -0700196 struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800197
Alex Elderf0f8cef2012-01-29 13:57:44 -0600198static ssize_t rbd_add(struct bus_type *bus, const char *buf,
199 size_t count);
200static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
201 size_t count);
202
/* /sys/bus/rbd/{add,remove}: write-only (root) attributes that map and
 * unmap rbd devices */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name = "rbd",
	.bus_attrs = rbd_bus_attrs,
};
213
/* empty release: rbd_root_dev is statically allocated, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* parent sysfs device for all rbd devices */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
222
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800223
/* map an embedded sysfs struct device back to its rbd_device */
static struct rbd_device *dev_to_rbd(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
228
/* take a reference on the rbd device's embedded sysfs device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
233
/* drop a reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700238
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700239static int __rbd_update_snaps(struct rbd_device *rbd_dev);
240
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700241static int rbd_open(struct block_device *bdev, fmode_t mode)
242{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600243 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700244
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800245 rbd_get_dev(rbd_dev);
246
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700247 set_device_ro(bdev, rbd_dev->read_only);
248
249 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
250 return -EROFS;
251
252 return 0;
253}
254
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800255static int rbd_release(struct gendisk *disk, fmode_t mode)
256{
257 struct rbd_device *rbd_dev = disk->private_data;
258
259 rbd_put_dev(rbd_dev);
260
261 return 0;
262}
263
/* block device operations: open/release maintain the sysfs device
 * reference count */
static const struct block_device_operations rbd_bd_ops = {
	.owner = THIS_MODULE,
	.open = rbd_open,
	.release = rbd_release,
};
269
/*
 * Initialize an rbd client instance.
 * We own *opt: on failure the options are destroyed here, unless
 * ownership has already passed to the ceph client.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *opt,
					    struct rbd_options *rbd_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("rbd_client_create\n");
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	/* nested annotation: the caller may already hold ctl_mutex */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		/* NOTE(review): ret stays -ENOMEM here even if
		 * ceph_create_client() failed with a different code */
		goto out_mutex;
	opt = NULL; /* Now rbdc->client is responsible for opt */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	rbdc->rbd_opts = rbd_opts;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);

	dout("rbd_client_create created %p\n", rbdc);
	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (opt)
		ceph_destroy_options(opt);
	return ERR_PTR(ret);
}
320
/*
 * Find a ceph client with specific addr and configuration.
 * Returns NULL when there is no match, or when the options request an
 * unshared client.  Caller is expected to hold rbd_client_list_lock
 * (see rbd_get_client()); the list is walked without locking here.
 */
static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
{
	struct rbd_client *client_node;

	if (opt->flags & CEPH_OPT_NOSHARE)
		return NULL;

	list_for_each_entry(client_node, &rbd_client_list, node)
		if (ceph_compare_options(opt, client_node->client) == 0)
			return client_node;
	return NULL;
}
336
/*
 * mount options
 *
 * Token ids below Opt_last_int take an integer argument; ids between
 * Opt_last_int and Opt_last_string take a string argument (none yet).
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

/* token table consumed by parse_rbd_opts_token() via match_token() */
static match_table_t rbdopt_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
354
/*
 * Parse one rbd option token.  Invoked as a callback from
 * ceph_parse_options(); @private is the struct rbd_options being
 * filled in.  Returns 0 on success or a negative errno on bad input.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbdopt = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbdopt_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* decode the argument (by token class) before dispatching */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_notify_timeout:
		rbdopt->notify_timeout = intval;
		break;
	default:
		/* every token in rbdopt_tokens must be handled above */
		BUG_ON(token);
	}
	return 0;
}
389
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Returns a referenced rbd_client or an ERR_PTR.  Ownership: a newly
 * created client takes over both the parsed ceph options and rbd_opts
 * (freed later in rbd_client_release()); when an existing client is
 * shared, or on error, both are destroyed here.
 */
static struct rbd_client *rbd_get_client(const char *mon_addr,
					 size_t mon_addr_len,
					 char *options)
{
	struct rbd_client *rbdc;
	struct ceph_options *opt;
	struct rbd_options *rbd_opts;

	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		return ERR_PTR(-ENOMEM);

	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

	opt = ceph_parse_options(options, mon_addr,
				 mon_addr + mon_addr_len,
				 parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(opt)) {
		kfree(rbd_opts);
		return ERR_CAST(opt);
	}

	spin_lock(&rbd_client_list_lock);
	rbdc = __rbd_client_find(opt);
	if (rbdc) {
		/* using an existing client */
		kref_get(&rbdc->kref);
		spin_unlock(&rbd_client_list_lock);

		/* the shared client already owns its own option copies */
		ceph_destroy_options(opt);
		kfree(rbd_opts);

		return rbdc;
	}
	spin_unlock(&rbd_client_list_lock);

	/* no match: create a new client; it takes over opt (always)
	 * and rbd_opts (on success) */
	rbdc = rbd_client_create(opt, rbd_opts);

	if (IS_ERR(rbdc))
		kfree(rbd_opts);

	return rbdc;
}
437
/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	/* unlink from rbd_client_list -- hence the locking requirement */
	list_del(&rbdc->node);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);
	kfree(rbdc);
}
454
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	/* the list lock covers rbd_client_release()'s list_del() */
	spin_lock(&rbd_client_list_lock);
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	spin_unlock(&rbd_client_list_lock);
	rbd_dev->rbd_client = NULL;
}
466
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700467/*
468 * Destroy requests collection
469 */
470static void rbd_coll_release(struct kref *kref)
471{
472 struct rbd_req_coll *coll =
473 container_of(kref, struct rbd_req_coll, kref);
474
475 dout("rbd_coll_release %p\n", coll);
476 kfree(coll);
477}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700478
479/*
480 * Create a new header structure, translate header format from the on-disk
481 * header.
482 */
483static int rbd_header_from_disk(struct rbd_image_header *header,
484 struct rbd_image_header_ondisk *ondisk,
485 int allocated_snaps,
486 gfp_t gfp_flags)
487{
488 int i;
Alex Elder00f1f362012-02-07 12:03:36 -0600489 u32 snap_count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700490
Alex Elder21079782012-01-24 10:08:36 -0600491 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
Josh Durgin81e759f2011-11-15 14:49:53 -0800492 return -ENXIO;
Josh Durgin81e759f2011-11-15 14:49:53 -0800493
Alex Elder00f1f362012-02-07 12:03:36 -0600494 snap_count = le32_to_cpu(ondisk->snap_count);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700495 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
Alex Elder21079782012-01-24 10:08:36 -0600496 snap_count * sizeof (*ondisk),
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700497 gfp_flags);
498 if (!header->snapc)
499 return -ENOMEM;
Alex Elder00f1f362012-02-07 12:03:36 -0600500
501 init_rwsem(&header->snap_rwsem);
502 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700503 if (snap_count) {
504 header->snap_names = kmalloc(header->snap_names_len,
505 GFP_KERNEL);
506 if (!header->snap_names)
507 goto err_snapc;
508 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
509 GFP_KERNEL);
510 if (!header->snap_sizes)
511 goto err_names;
512 } else {
513 header->snap_names = NULL;
514 header->snap_sizes = NULL;
515 }
516 memcpy(header->block_name, ondisk->block_name,
517 sizeof(ondisk->block_name));
518
519 header->image_size = le64_to_cpu(ondisk->image_size);
520 header->obj_order = ondisk->options.order;
521 header->crypt_type = ondisk->options.crypt_type;
522 header->comp_type = ondisk->options.comp_type;
523
524 atomic_set(&header->snapc->nref, 1);
525 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
526 header->snapc->num_snaps = snap_count;
527 header->total_snaps = snap_count;
528
Alex Elder21079782012-01-24 10:08:36 -0600529 if (snap_count && allocated_snaps == snap_count) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700530 for (i = 0; i < snap_count; i++) {
531 header->snapc->snaps[i] =
532 le64_to_cpu(ondisk->snaps[i].id);
533 header->snap_sizes[i] =
534 le64_to_cpu(ondisk->snaps[i].image_size);
535 }
536
537 /* copy snapshot names */
538 memcpy(header->snap_names, &ondisk->snaps[i],
539 header->snap_names_len);
540 }
541
542 return 0;
543
544err_names:
545 kfree(header->snap_names);
546err_snapc:
547 kfree(header->snapc);
Alex Elder00f1f362012-02-07 12:03:36 -0600548 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700549}
550
/* translate cur_snap-style numbering (index+1 counted from the end;
 * see struct rbd_device) into an index into snapc->snaps[] */
static int snap_index(struct rbd_image_header *header, int snap_num)
{
	return header->total_snaps - snap_num;
}
555
/*
 * Return the snapshot id of the device's currently mapped snapshot,
 * or 0 when the head (no snapshot) is mapped.
 */
static u64 cur_snap_id(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header = &rbd_dev->header;

	if (!rbd_dev->cur_snap)
		return 0;

	return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
}
565
566static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
567 u64 *seq, u64 *size)
568{
569 int i;
570 char *p = header->snap_names;
571
Alex Elder00f1f362012-02-07 12:03:36 -0600572 for (i = 0; i < header->total_snaps; i++) {
573 if (!strcmp(snap_name, p)) {
574
575 /* Found it. Pass back its id and/or size */
576
577 if (seq)
578 *seq = header->snapc->snaps[i];
579 if (size)
580 *size = header->snap_sizes[i];
581 return i;
582 }
583 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700584 }
Alex Elder00f1f362012-02-07 12:03:36 -0600585 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700586}
587
/*
 * Point the device at the snapshot named in dev->snap_name (or at the
 * head when it equals RBD_SNAP_HEAD_NAME), updating snapc->seq,
 * cur_snap and read_only under the header's write lock.  Optionally
 * reports the mapped size through *size.
 * Returns 0 on success, -ENOENT when the snapshot name is unknown.
 */
static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
{
	struct rbd_image_header *header = &dev->header;
	struct ceph_snap_context *snapc = header->snapc;
	int ret = -ENOENT;

	BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));

	down_write(&header->snap_rwsem);

	if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* mapping the head: writable; seq is the latest snap seq
		 * (or 0 when there are no snapshots) */
		if (header->total_snaps)
			snapc->seq = header->snap_seq;
		else
			snapc->seq = 0;
		dev->cur_snap = 0;
		dev->read_only = 0;
		if (size)
			*size = header->image_size;
	} else {
		ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
		if (ret < 0)
			goto done;

		/* snapshots are mapped read-only; cur_snap is index+1
		 * counted from the end (see snap_index()) */
		dev->cur_snap = header->total_snaps - ret;
		dev->read_only = 1;
	}

	ret = 0;
done:
	up_write(&header->snap_rwsem);
	return ret;
}
622
623static void rbd_header_free(struct rbd_image_header *header)
624{
625 kfree(header->snapc);
626 kfree(header->snap_names);
627 kfree(header->snap_sizes);
628}
629
/*
 * get the actual striped segment name, offset and length
 *
 * Maps an image-relative (ofs, len) onto a single segment object:
 * optionally formats the object name into seg_name, optionally stores
 * the in-segment offset into *segofs, and returns how many bytes of
 * the request fall within this segment.
 */
static u64 rbd_get_segment(struct rbd_image_header *header,
			   const char *block_name,
			   u64 ofs, u64 len,
			   char *seg_name, u64 *segofs)
{
	u64 seg = ofs >> header->obj_order;

	if (seg_name)
		snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
			 "%s.%012llx", block_name, seg);

	/* NOTE(review): 1 << obj_order is int arithmetic; assumes
	 * obj_order < 31 -- confirm against RBD_MAX_OBJ_ORDER */
	ofs = ofs & ((1 << header->obj_order) - 1);
	len = min_t(u64, len, (1 << header->obj_order) - ofs);

	if (segofs)
		*segofs = ofs;

	return len;
}
652
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700653static int rbd_get_num_segments(struct rbd_image_header *header,
654 u64 ofs, u64 len)
655{
656 u64 start_seg = ofs >> header->obj_order;
657 u64 end_seg = (ofs + len - 1) >> header->obj_order;
658 return end_seg - start_seg + 1;
659}
660
/*
 * returns the size of an object in the image
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
	/* NOTE(review): int shift -- assumes obj_order < 31 */
	return 1 << header->obj_order;
}
668
669/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700670 * bio helpers
671 */
672
673static void bio_chain_put(struct bio *chain)
674{
675 struct bio *tmp;
676
677 while (chain) {
678 tmp = chain;
679 chain = chain->bi_next;
680 bio_put(tmp);
681 }
682}
683
/*
 * zeros a bio chain, starting at specific offset
 *
 * Every byte of the chain's data at or beyond start_ofs (an offset
 * into the chain as a whole) is cleared; bytes before it are left
 * untouched.  Segments are mapped with bvec_kmap_irq(), so this is
 * safe for highmem pages.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero only the tail of this segment that
				 * lies at/after start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
710
711/*
712 * bio_chain_clone - clone a chain of bios up to a certain length.
713 * might return a bio_pair that will need to be released.
714 */
715static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
716 struct bio_pair **bp,
717 int len, gfp_t gfpmask)
718{
719 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
720 int total = 0;
721
722 if (*bp) {
723 bio_pair_release(*bp);
724 *bp = NULL;
725 }
726
727 while (old_chain && (total < len)) {
728 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
729 if (!tmp)
730 goto err_out;
731
732 if (total + old_chain->bi_size > len) {
733 struct bio_pair *bp;
734
735 /*
736 * this split can only happen with a single paged bio,
737 * split_bio will BUG_ON if this is not the case
738 */
739 dout("bio_chain_clone split! total=%d remaining=%d"
740 "bi_size=%d\n",
741 (int)total, (int)len-total,
742 (int)old_chain->bi_size);
743
744 /* split the bio. We'll release it either in the next
745 call, or it will have to be released outside */
746 bp = bio_split(old_chain, (len - total) / 512ULL);
747 if (!bp)
748 goto err_out;
749
750 __bio_clone(tmp, &bp->bio1);
751
752 *next = &bp->bio2;
753 } else {
754 __bio_clone(tmp, old_chain);
755 *next = old_chain->bi_next;
756 }
757
758 tmp->bi_bdev = NULL;
759 gfpmask &= ~__GFP_WAIT;
760 tmp->bi_next = NULL;
761
762 if (!new_chain) {
763 new_chain = tail = tmp;
764 } else {
765 tail->bi_next = tmp;
766 tail = tmp;
767 }
768 old_chain = old_chain->bi_next;
769
770 total += tmp->bi_size;
771 }
772
773 BUG_ON(total < len);
774
775 if (tail)
776 tail->bi_next = NULL;
777
778 *old = old_chain;
779
780 return new_chain;
781
782err_out:
783 dout("bio_chain_clone with err\n");
784 bio_chain_put(new_chain);
785 return NULL;
786}
787
788/*
789 * helpers for osd request op vectors.
790 */
791static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
792 int num_ops,
793 int opcode,
794 u32 payload_len)
795{
796 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
797 GFP_NOIO);
798 if (!*ops)
799 return -ENOMEM;
800 (*ops)[0].op = opcode;
801 /*
802 * op extent offset and length will be set later on
803 * in calc_raw_layout()
804 */
805 (*ops)[0].payload_len = payload_len;
806 return 0;
807}
808
/* free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
813
/*
 * Complete one request within a collection: record its status, then
 * end -- in submission order -- the contiguous run of requests that
 * has now finished, dropping one collection reference per completed
 * request.  status[] updates are serialized under the queue lock.
 * With no collection the request is simply ended directly.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
	     coll, index, ret, len);

	if (!rq)
		return;

	if (!coll) {
		/* not part of a collection: complete directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* advance max over the contiguous run of finished requests */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
851
/* complete @req using the collection/slot recorded in the request */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
857
/*
 * Send ceph osd request
 *
 * Builds and submits an OSD request for (obj, ofs, len).  When rbd_cb
 * is NULL the call is synchronous: it waits for completion, reports
 * the reassert version through *ver, and drops the request itself;
 * otherwise rbd_cb runs on completion and owns the cleanup.  On
 * failure, the request's collection slot (if any) is completed with
 * the error.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *obj, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  int num_reply,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct rbd_image_header *header = &dev->header;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);

	/* keep the snap context stable while the request is built */
	down_read(&header->snap_rwsem);

	osdc = &dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		up_read(&header->snap_rwsem);
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	strncpy(req->r_oid, obj, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* single-object layout: stripe unit == object size, one stripe */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_preferred = cpu_to_le32(-1);
	layout->fl_pg_pool = cpu_to_le32(dev->poolid);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
			     req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);
	up_read(&header->snap_rwsem);

	if (linger_req) {
		/* keep the request registered so watch/notify survives
		 * osd map changes */
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and clean up ourselves */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%lld\n",
		     le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
974
/*
 * Ceph osd op callback
 *
 * Completion path for async requests submitted through rbd_do_request()
 * (see rbd_do_op()): parses the little-endian reply header, fixes up
 * short/absent reads, records the per-segment status, then drops the
 * request and its private rbd_request tracking struct.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op record follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);

	if (rc == -ENOENT && read_op) {
		/* reading a hole (object doesn't exist): treat as zeros */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero-fill the tail up to the requested length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	/* mark this segment done; completes the block request when all are */
	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1013
/*
 * Minimal completion callback: just drop the request reference.  Used
 * for requests whose result is not examined (e.g. the notify ack sent
 * from rbd_req_sync_notify_ack()).
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1018
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector to carry @len bytes at @ofs, builds a single
 * op of @opcode when the caller did not supply @orig_ops, copies @buf
 * into the pages for writes, then issues the request with no callback —
 * rbd_do_request() waits for completion in that case.  For reads the
 * result is copied back into @buf.
 *
 * NOTE(review): the @num_reply parameter is never used; a literal 2 is
 * passed to rbd_do_request() below — confirm whether callers' values
 * were ever meant to be honored.
 */
static int rbd_req_sync_op(struct rbd_device *dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int opcode,
			   int flags,
			   struct ceph_osd_req_op *orig_ops,
			   int num_reply,
			   const char *obj,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;
	struct ceph_osd_req_op *ops = orig_ops;
	u32 payload_len;

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	if (!orig_ops) {
		/* build a single read/write op ourselves */
		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
		if (ret < 0)
			goto done;

		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
			if (ret < 0)
				goto done_ops;
		}
	}

	/* NULL callback => rbd_do_request() waits synchronously */
	ret = rbd_do_request(NULL, dev, snapc, snapid,
			  obj, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  2,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done_ops;

	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done_ops:
	/* only free ops we built here, never the caller's */
	if (!orig_ops)
		rbd_destroy_ops(ops);
done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1081
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image extent [@ofs, @ofs+@len) onto its backing object
 * (segment), builds a single read/write op, and submits it with
 * rbd_req_cb as completion callback.  @coll/@coll_index identify the
 * slot in the request collection that tracks this segment's status.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev ,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags, int num_reply,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	/* GFP_NOIO: we may be called from the block I/O path */
	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	/* translate image offset/len into object name + in-object extent */
	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.block_name,
				  ofs, len,
				  seg_name, &seg_ofs);

	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
	if (ret < 0)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     num_reply,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1137
1138/*
1139 * Request async osd write
1140 */
1141static int rbd_req_write(struct request *rq,
1142 struct rbd_device *rbd_dev,
1143 struct ceph_snap_context *snapc,
1144 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001145 struct bio *bio,
1146 struct rbd_req_coll *coll,
1147 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001148{
1149 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1150 CEPH_OSD_OP_WRITE,
1151 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1152 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001153 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001154}
1155
1156/*
1157 * Request async osd read
1158 */
1159static int rbd_req_read(struct request *rq,
1160 struct rbd_device *rbd_dev,
1161 u64 snapid,
1162 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001163 struct bio *bio,
1164 struct rbd_req_coll *coll,
1165 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001166{
1167 return rbd_do_op(rq, rbd_dev, NULL,
1168 (snapid ? snapid : CEPH_NOSNAP),
1169 CEPH_OSD_OP_READ,
1170 CEPH_OSD_FLAG_READ,
1171 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001172 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001173}
1174
1175/*
1176 * Request sync osd read
1177 */
1178static int rbd_req_sync_read(struct rbd_device *dev,
1179 struct ceph_snap_context *snapc,
1180 u64 snapid,
1181 const char *obj,
1182 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001183 char *buf,
1184 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001185{
1186 return rbd_req_sync_op(dev, NULL,
1187 (snapid ? snapid : CEPH_NOSNAP),
1188 CEPH_OSD_OP_READ,
1189 CEPH_OSD_FLAG_READ,
1190 NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001191 1, obj, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001192}
1193
/*
 * Request sync osd notify-ack: acknowledge a watch notification so the
 * OSD stops re-sending it.
 *
 * NOTE(review): the @ver parameter is unused — the ack carries
 * dev->header.obj_version instead; confirm which was intended.
 * NOTE(review): watch.cookie is assigned @notify_id without byte
 * swapping, unlike the cpu_to_le64() used elsewhere — presumably
 * notify_id is already little-endian from the wire; verify.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *dev,
				   u64 ver,
				   u64 notify_id,
				   const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct page **pages = NULL;
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	/* fire and forget: rbd_simple_req_cb just drops the request */
	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
			  obj, 0, 0, NULL,
			  pages, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  1,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1226
1227static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1228{
1229 struct rbd_device *dev = (struct rbd_device *)data;
Sage Weil13143d22011-05-12 16:08:30 -07001230 int rc;
1231
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001232 if (!dev)
1233 return;
1234
1235 dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1236 notify_id, (int)opcode);
1237 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Sage Weil13143d22011-05-12 16:08:30 -07001238 rc = __rbd_update_snaps(dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001239 mutex_unlock(&ctl_mutex);
Sage Weil13143d22011-05-12 16:08:30 -07001240 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001241 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
1242 " update snaps: %d\n", dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001243
1244 rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
1245}
1246
/*
 * Request sync osd watch
 *
 * Registers a watch on the header object @obj so that rbd_watch_cb()
 * is invoked when the header changes.  The event and the lingering
 * request are stored in dev->watch_event / dev->watch_request and are
 * torn down by rbd_req_sync_unwatch().
 */
static int rbd_req_sync_watch(struct rbd_device *dev,
			      const char *obj,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)dev, &dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 = establish the watch */

	/* lingering request: the OSD keeps the watch alive */
	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL,
			      &dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1291
/*
 * Request sync osd unwatch
 *
 * Tears down the watch established by rbd_req_sync_watch(): sends a
 * watch op with flag=0 (remove) using the same cookie, then cancels
 * the local event.  Must only be called while dev->watch_event is set.
 */
static int rbd_req_sync_unwatch(struct rbd_device *dev,
				const char *obj)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 = remove the watch */

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);

	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
	return ret;
}
1320
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001321struct rbd_notify_info {
1322 struct rbd_device *dev;
1323};
1324
1325static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1326{
1327 struct rbd_device *dev = (struct rbd_device *)data;
1328 if (!dev)
1329 return;
1330
1331 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1332 notify_id, (int)opcode);
1333}
1334
/*
 * Request sync osd notify
 *
 * Sends a NOTIFY on @obj (waking any watchers, including ourselves)
 * and waits up to CEPH_OSD_TIMEOUT_DEFAULT for the notify to complete.
 *
 * NOTE(review): watch.ver is set to raw 1 and watch.cookie to the raw
 * event cookie, without the cpu_to_le64() used in rbd_req_sync_watch()
 * — verify the intended wire endianness of these fields.
 */
static int rbd_req_sync_notify(struct rbd_device *dev,
			       const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	int payload_len = sizeof(u32) + sizeof(u32);	/* ver + timeout */
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (ret < 0)
		return ret;

	info.dev = dev;

	/* one-shot event (third arg 1), completed when the notify finishes */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1385
/*
 * Synchronously invoke an OSD class method (@cls.@method) on object
 * @obj, passing @len bytes of @data as input.  (The old comment said
 * "Request sync osd read", which was wrong — this is a CALL op.)
 */
static int rbd_req_sync_exec(struct rbd_device *dev,
			     const char *obj,
			     const char *cls,
			     const char *method,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int cls_len = strlen(cls);
	int method_len = strlen(method);
	/* payload carries class name + method name + input data */
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
				    cls_len + method_len + len);
	if (ret < 0)
		return ret;

	ops[0].cls.class_name = cls;
	ops[0].cls.class_len = (__u8)cls_len;
	ops[0].cls.method_name = method;
	ops[0].cls.method_len = (__u8)method_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1425
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001426static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1427{
1428 struct rbd_req_coll *coll =
1429 kzalloc(sizeof(struct rbd_req_coll) +
1430 sizeof(struct rbd_req_status) * num_reqs,
1431 GFP_ATOMIC);
1432
1433 if (!coll)
1434 return NULL;
1435 coll->total = num_reqs;
1436 kref_init(&coll->kref);
1437 return coll;
1438}
1439
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001440/*
1441 * block device queue callback
1442 */
1443static void rbd_rq_fn(struct request_queue *q)
1444{
1445 struct rbd_device *rbd_dev = q->queuedata;
1446 struct request *rq;
1447 struct bio_pair *bp = NULL;
1448
Alex Elder00f1f362012-02-07 12:03:36 -06001449 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001450 struct bio *bio;
1451 struct bio *rq_bio, *next_bio = NULL;
1452 bool do_write;
1453 int size, op_size = 0;
1454 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001455 int num_segs, cur_seg = 0;
1456 struct rbd_req_coll *coll;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001457
1458 /* peek at request from block layer */
1459 if (!rq)
1460 break;
1461
1462 dout("fetched request\n");
1463
1464 /* filter out block requests we don't understand */
1465 if ((rq->cmd_type != REQ_TYPE_FS)) {
1466 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001467 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001468 }
1469
1470 /* deduce our operation (read, write) */
1471 do_write = (rq_data_dir(rq) == WRITE);
1472
1473 size = blk_rq_bytes(rq);
1474 ofs = blk_rq_pos(rq) * 512ULL;
1475 rq_bio = rq->bio;
1476 if (do_write && rbd_dev->read_only) {
1477 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001478 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001479 }
1480
1481 spin_unlock_irq(q->queue_lock);
1482
1483 dout("%s 0x%x bytes at 0x%llx\n",
1484 do_write ? "write" : "read",
1485 size, blk_rq_pos(rq) * 512ULL);
1486
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001487 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1488 coll = rbd_alloc_coll(num_segs);
1489 if (!coll) {
1490 spin_lock_irq(q->queue_lock);
1491 __blk_end_request_all(rq, -ENOMEM);
Alex Elder00f1f362012-02-07 12:03:36 -06001492 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001493 }
1494
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001495 do {
1496 /* a bio clone to be passed down to OSD req */
1497 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1498 op_size = rbd_get_segment(&rbd_dev->header,
1499 rbd_dev->header.block_name,
1500 ofs, size,
1501 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001502 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001503 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1504 op_size, GFP_ATOMIC);
1505 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001506 rbd_coll_end_req_index(rq, coll, cur_seg,
1507 -ENOMEM, op_size);
1508 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001509 }
1510
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001511
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001512 /* init OSD command: write or read */
1513 if (do_write)
1514 rbd_req_write(rq, rbd_dev,
1515 rbd_dev->header.snapc,
1516 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001517 op_size, bio,
1518 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001519 else
1520 rbd_req_read(rq, rbd_dev,
1521 cur_snap_id(rbd_dev),
1522 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001523 op_size, bio,
1524 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001525
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001526next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001527 size -= op_size;
1528 ofs += op_size;
1529
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001530 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001531 rq_bio = next_bio;
1532 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001533 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001534
1535 if (bp)
1536 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001537 spin_lock_irq(q->queue_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001538 }
1539}
1540
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	/* object size expressed in 512-byte sectors */
	unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
	sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	unsigned int bio_sectors = bmd->bi_size >> 9;
	int max;

	/* bytes remaining in the current object past the bio's end */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << 9;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* an empty bio may still take one vec even if it crosses a chunk */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1563
1564static void rbd_free_disk(struct rbd_device *rbd_dev)
1565{
1566 struct gendisk *disk = rbd_dev->disk;
1567
1568 if (!disk)
1569 return;
1570
1571 rbd_header_free(&rbd_dev->header);
1572
1573 if (disk->flags & GENHD_FL_UP)
1574 del_gendisk(disk);
1575 if (disk->queue)
1576 blk_cleanup_queue(disk->queue);
1577 put_disk(disk);
1578}
1579
/*
 * Read the on-disk image header from the header object into @header.
 * (The old comment "reload the ondisk the header" was garbled.)
 *
 * Loops because the snapshot count can change between reads: each pass
 * sizes the buffer for the count seen on the previous pass and retries
 * until the count is stable.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	int snap_count = 0;
	u64 ver;
	size_t len;

	/*
	 * First reads the fixed-size header to determine the number
	 * of snapshots, then re-reads it, along with all snapshot
	 * records as well as their stored names.
	 */
	len = sizeof (*dh);
	while (1) {
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       NULL, CEPH_NOSNAP,
				       rbd_dev->obj_md_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		/* parse; fails with -ENXIO on an unknown header format */
		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
		if (rc < 0) {
			if (rc == -ENXIO)
				pr_warning("unrecognized header format"
					   " for image %s", rbd_dev->obj);
			goto out_dh;
		}

		if (snap_count == header->total_snaps)
			break;

		/* snap count changed under us: resize and re-read */
		snap_count = header->total_snaps;
		len = sizeof (*dh) +
			snap_count * sizeof(struct rbd_image_snap_ondisk) +
			header->snap_names_len;

		rbd_header_free(header);
		kfree(dh);
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1636
1637/*
1638 * create a snapshot
1639 */
1640static int rbd_header_add_snap(struct rbd_device *dev,
1641 const char *snap_name,
1642 gfp_t gfp_flags)
1643{
1644 int name_len = strlen(snap_name);
1645 u64 new_snapid;
1646 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001647 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001648 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001649 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001650
1651 /* we should create a snapshot only if we're pointing at the head */
1652 if (dev->cur_snap)
1653 return -EINVAL;
1654
Alex Elder1dbb4392012-01-24 10:08:37 -06001655 monc = &dev->rbd_client->client->monc;
1656 ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001657 dout("created snapid=%lld\n", new_snapid);
1658 if (ret < 0)
1659 return ret;
1660
1661 data = kmalloc(name_len + 16, gfp_flags);
1662 if (!data)
1663 return -ENOMEM;
1664
Sage Weil916d4d62011-05-12 16:10:50 -07001665 p = data;
1666 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001667
Sage Weil916d4d62011-05-12 16:10:50 -07001668 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1669 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001670
1671 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001672 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001673
Sage Weil916d4d62011-05-12 16:10:50 -07001674 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001675
1676 if (ret < 0)
1677 return ret;
1678
1679 dev->header.snapc->seq = new_snapid;
1680
1681 return 0;
1682bad:
1683 return -ERANGE;
1684}
1685
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001686static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1687{
1688 struct rbd_snap *snap;
1689
1690 while (!list_empty(&rbd_dev->snaps)) {
1691 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1692 __rbd_remove_snap_dev(rbd_dev, snap);
1693 }
1694}
1695
/*
 * Refresh the in-memory image header: re-read the on-disk header and
 * swap in the new snapshot context/names/sizes under the snap rwsem.
 * (The old comment claimed only the first part of the header is read;
 * rbd_read_header() actually reads everything including snapshots.)
 */
static int __rbd_update_snaps(struct rbd_device *rbd_dev)
{
	int ret;
	struct rbd_image_header h;
	u64 snap_seq;
	int follow_seq = 0;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	/* resized? */
	set_capacity(rbd_dev->disk, h.image_size / 512ULL);

	down_write(&rbd_dev->header.snap_rwsem);

	snap_seq = rbd_dev->header.snapc->seq;
	if (rbd_dev->header.total_snaps &&
	    rbd_dev->header.snapc->snaps[0] == snap_seq)
		/* pointing at the head, will need to follow that
		   if head moves */
		follow_seq = 1;

	/* free the old snapshot metadata before installing the new */
	kfree(rbd_dev->header.snapc);
	kfree(rbd_dev->header.snap_names);
	kfree(rbd_dev->header.snap_sizes);

	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	if (follow_seq)
		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
	else
		rbd_dev->header.snapc->seq = snap_seq;

	/* rebuild the sysfs snapshot devices from the new context */
	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header.snap_rwsem);

	return ret;
}
1742
/*
 * Fetch the image header from the OSDs, set up the gendisk and request
 * queue for this rbd device, and announce the disk.  Returns 0 or a
 * negative errno.
 *
 * NOTE(review): on the error paths below, the header populated by
 * rbd_read_header() is not freed here — presumably the caller tears it
 * down via rbd_free_disk(); verify against the add/remove paths.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* select the mapped snapshot (or head) and learn its size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* set io sizes to object size */
	blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
	blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / 512ULL);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1810
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001811/*
1812 sysfs
1813*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001814
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001815static ssize_t rbd_size_show(struct device *dev,
1816 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001817{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001818 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1819
1820 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001821}
1822
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001823static ssize_t rbd_major_show(struct device *dev,
1824 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001825{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001826 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1827
1828 return sprintf(buf, "%d\n", rbd_dev->major);
1829}
1830
1831static ssize_t rbd_client_id_show(struct device *dev,
1832 struct device_attribute *attr, char *buf)
1833{
1834 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1835
Alex Elder1dbb4392012-01-24 10:08:37 -06001836 return sprintf(buf, "client%lld\n",
1837 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001838}
1839
1840static ssize_t rbd_pool_show(struct device *dev,
1841 struct device_attribute *attr, char *buf)
1842{
1843 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1844
1845 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1846}
1847
1848static ssize_t rbd_name_show(struct device *dev,
1849 struct device_attribute *attr, char *buf)
1850{
1851 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1852
1853 return sprintf(buf, "%s\n", rbd_dev->obj);
1854}
1855
1856static ssize_t rbd_snap_show(struct device *dev,
1857 struct device_attribute *attr,
1858 char *buf)
1859{
1860 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1861
1862 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1863}
1864
1865static ssize_t rbd_image_refresh(struct device *dev,
1866 struct device_attribute *attr,
1867 const char *buf,
1868 size_t size)
1869{
1870 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1871 int rc;
1872 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001873
1874 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1875
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001876 rc = __rbd_update_snaps(rbd_dev);
1877 if (rc < 0)
1878 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001879
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001880 mutex_unlock(&ctl_mutex);
1881 return ret;
1882}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001883
/* Per-device sysfs attributes (exposed under /sys/bus/rbd/devices/<id>/). */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated group list wired into rbd_device_type below. */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
1913
/*
 * Empty release: the rbd_device's memory is not owned by the struct
 * device embedded in it — presumably freed by the add/remove code;
 * this stub only silences the "no release" core warning.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

/* Device type for mapped rbd images, attaching the attribute groups. */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1923
1924
1925/*
1926 sysfs - snapshots
1927*/
1928
1929static ssize_t rbd_snap_size_show(struct device *dev,
1930 struct device_attribute *attr,
1931 char *buf)
1932{
1933 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1934
1935 return sprintf(buf, "%lld\n", (long long)snap->size);
1936}
1937
1938static ssize_t rbd_snap_id_show(struct device *dev,
1939 struct device_attribute *attr,
1940 char *buf)
1941{
1942 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1943
1944 return sprintf(buf, "%lld\n", (long long)snap->id);
1945}
1946
/* Per-snapshot sysfs attributes (under the parent rbd device). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
1959
1960static void rbd_snap_dev_release(struct device *dev)
1961{
1962 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1963 kfree(snap->name);
1964 kfree(snap);
1965}
1966
1967static const struct attribute_group *rbd_snap_attr_groups[] = {
1968 &rbd_snap_attr_group,
1969 NULL
1970};
1971
1972static struct device_type rbd_snap_device_type = {
1973 .groups = rbd_snap_attr_groups,
1974 .release = rbd_snap_dev_release,
1975};
1976
/*
 * Unlink a snapshot from its device's snap list and unregister its
 * sysfs device; the final reference drop runs rbd_snap_dev_release(),
 * which frees the rbd_snap itself.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
1983
1984static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
1985 struct rbd_snap *snap,
1986 struct device *parent)
1987{
1988 struct device *dev = &snap->dev;
1989 int ret;
1990
1991 dev->type = &rbd_snap_device_type;
1992 dev->parent = parent;
1993 dev->release = rbd_snap_dev_release;
1994 dev_set_name(dev, "snap_%s", snap->name);
1995 ret = device_register(dev);
1996
1997 return ret;
1998}
1999
2000static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2001 int i, const char *name,
2002 struct rbd_snap **snapp)
2003{
2004 int ret;
2005 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2006 if (!snap)
2007 return -ENOMEM;
2008 snap->name = kstrdup(name, GFP_KERNEL);
2009 snap->size = rbd_dev->header.snap_sizes[i];
2010 snap->id = rbd_dev->header.snapc->snaps[i];
2011 if (device_is_registered(&rbd_dev->dev)) {
2012 ret = rbd_register_snap_dev(rbd_dev, snap,
2013 &rbd_dev->dev);
2014 if (ret < 0)
2015 goto err;
2016 }
2017 *snapp = snap;
2018 return 0;
2019err:
2020 kfree(snap->name);
2021 kfree(snap);
2022 return ret;
2023}
2024
/*
 * Search backward for the start of the previous name in a
 * NUL-delimited string list beginning at @start.  @name points at the
 * start of the current entry; returns a pointer to the previous
 * entry, or NULL when @name is too close to @start for a previous
 * entry (with its terminating NUL) to exist.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *p;

	/* need at least one character plus its NUL before @name */
	if (name < start + 2)
		return NULL;

	/* back over the NUL that terminates the previous name */
	for (p = name - 2; *p; p--)
		if (p == start)
			return start;

	/* p now sits on the NUL ending the entry before the previous one */
	return p + 1;
}
2041
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	int i = rbd_dev->header.total_snaps;	/* header snaps not yet matched */
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	first_name = rbd_dev->header.snap_names;
	/* names are consumed backward from the end of the name blob */
	name = first_name + rbd_dev->header.snap_names_len;

	/* walk the existing snap list from oldest to newest */
	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		/* cur_id is valid only when i != 0; the !i test below guards it */
		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/* old_snap->id was skipped, thus was removed */
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		/* old_snap->id > cur_id: the header has newer snaps to insert */
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				/* ran out of names before ids: corrupt header */
				WARN_ON(1);
				return -EINVAL;
			}
			/*
			 * NOTE(review): this reads snaps[i] while additions
			 * below use index i - 1; when i == total_snaps on
			 * entry, snaps[i] looks one past the last valid
			 * entry — verify against the header layout.
			 */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2115
/*
 * Register the rbd device, and a child device for each of its known
 * snapshots, on the rbd bus (visible under /sys/bus/rbd/devices/<id>/).
 * Serialized by ctl_mutex.  Returns 0 or a negative errno.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;	/* frees rbd_dev on final put */
	dev_set_name(dev, "%d", rbd_dev->id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	/*
	 * Register each snapshot as a child device.  On failure,
	 * already-registered snap devices are left in place; the
	 * caller unwinds the whole device via rbd_bus_del_dev().
	 */
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(rbd_dev, snap,
					     &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2144
/*
 * Unregister the rbd device from sysfs; the final reference drop
 * invokes rbd_dev_release(), which tears down and frees rbd_dev.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2149
/*
 * Establish the watch on the image's header object so we are notified
 * of header changes.  -ERANGE from the watch request means our cached
 * header version is stale: refresh the snapshot state (under
 * ctl_mutex) and retry until the watch sticks or a different error
 * occurs.  Returns the final watch result.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
					 rbd_dev->header.obj_version);
		if (ret == -ERANGE) {
			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
			rc = __rbd_update_snaps(rbd_dev);
			mutex_unlock(&ctl_mutex);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2168
/* Highest device id handed out so far; see rbd_id_put() for reuse rules */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002183
Alex Elder1ddbe942012-01-29 13:57:44 -06002184/*
Alex Elder499afd52012-02-02 08:13:29 -06002185 * Remove an rbd_dev from the global list, and record that its
2186 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002187 */
Alex Elder499afd52012-02-02 08:13:29 -06002188static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002189{
Alex Elderd184f6b2012-01-29 13:57:44 -06002190 struct list_head *tmp;
2191 int rbd_id = rbd_dev->id;
2192 int max_id;
2193
2194 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002195
2196 spin_lock(&rbd_dev_list_lock);
2197 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002198
2199 /*
2200 * If the id being "put" is not the current maximum, there
2201 * is nothing special we need to do.
2202 */
2203 if (rbd_id != atomic64_read(&rbd_id_max)) {
2204 spin_unlock(&rbd_dev_list_lock);
2205 return;
2206 }
2207
2208 /*
2209 * We need to update the current maximum id. Search the
2210 * list to find out what it is. We're more likely to find
2211 * the maximum at the end, so search the list backward.
2212 */
2213 max_id = 0;
2214 list_for_each_prev(tmp, &rbd_dev_list) {
2215 struct rbd_device *rbd_dev;
2216
2217 rbd_dev = list_entry(tmp, struct rbd_device, node);
2218 if (rbd_id > max_id)
2219 max_id = rbd_id;
2220 }
Alex Elder499afd52012-02-02 08:13:29 -06002221 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002222
Alex Elder1ddbe942012-01-29 13:57:44 -06002223 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002224 * The max id could have been updated by rbd_id_get(), in
2225 * which case it now accurately reflects the new maximum.
2226 * Be careful not to overwrite the maximum value in that
2227 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002228 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002229 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002230}
2231
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	size_t token_len;

	*buf += strspn(*buf, spaces);		/* skip to start of token */
	token_len = strcspn(*buf, spaces);	/* measure the token */

	return token_len;
}
2249
/*
 * Finds the next token in *buf; if the caller's buffer (of size
 * token_size) is big enough, copies the token into it, always
 * terminated with '\0'.
 *
 * Returns the length of the token found (excluding the '\0'): 0 when
 * no token exists, and >= token_size when the token would not fit
 * (in which case nothing is copied).
 *
 * *buf is advanced past the token even when it does not fit.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2278
2279/*
Alex Eldera725f65e2012-02-02 08:13:30 -06002280 * This fills in the pool_name, obj, obj_len, snap_name, obj_len,
2281 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2282 * on the list of monitor addresses and other options provided via
2283 * /sys/bus/rbd/add.
2284 */
2285static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2286 const char *buf,
Alex Elder7ef32142012-02-02 08:13:30 -06002287 const char **mon_addrs,
Alex Elder5214ecc2012-02-02 08:13:30 -06002288 size_t *mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002289 char *options,
2290 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002291{
Alex Eldere28fff262012-02-02 08:13:30 -06002292 size_t len;
2293
2294 /* The first four tokens are required */
2295
Alex Elder7ef32142012-02-02 08:13:30 -06002296 len = next_token(&buf);
2297 if (!len)
Alex Eldera725f65e2012-02-02 08:13:30 -06002298 return -EINVAL;
Alex Elder5214ecc2012-02-02 08:13:30 -06002299 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002300 *mon_addrs = buf;
2301
2302 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002303
Alex Eldere28fff262012-02-02 08:13:30 -06002304 len = copy_token(&buf, options, options_size);
2305 if (!len || len >= options_size)
2306 return -EINVAL;
Alex Eldera725f65e2012-02-02 08:13:30 -06002307
Alex Eldere28fff262012-02-02 08:13:30 -06002308 len = copy_token(&buf, rbd_dev->pool_name, sizeof (rbd_dev->pool_name));
2309 if (!len || len >= sizeof (rbd_dev->pool_name))
2310 return -EINVAL;
2311
2312 len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj));
2313 if (!len || len >= sizeof (rbd_dev->obj))
2314 return -EINVAL;
2315
2316 /* We have the object length in hand, save it. */
2317
2318 rbd_dev->obj_len = len;
2319
Alex Elder81a89792012-02-02 08:13:30 -06002320 BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN
2321 < RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX));
2322 sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX);
Alex Eldera725f65e2012-02-02 08:13:30 -06002323
Alex Eldere28fff262012-02-02 08:13:30 -06002324 /*
2325 * The snapshot name is optional, but it's an error if it's
2326 * too long. If no snapshot is supplied, fill in the default.
2327 */
2328 len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name));
2329 if (!len)
2330 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2331 sizeof (RBD_SNAP_HEAD_NAME));
2332 else if (len >= sizeof (rbd_dev->snap_name))
2333 return -EINVAL;
2334
Alex Eldera725f65e2012-02-02 08:13:30 -06002335 return 0;
2336}
2337
/*
 * Handle a write to /sys/bus/rbd/add: parse the monitor addresses,
 * options, pool, image and optional snapshot name; connect to the
 * cluster; and register the new block device and its sysfs entries.
 * Returns @count on success or a negative errno.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	char *options = NULL;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* hold a module ref for the life of the device (see release path) */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_nomem;
	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_nomem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);

	init_rwsem(&rbd_dev->header.snap_rwsem);

	/* generate unique id: find highest unique id, add one */
	rbd_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);

	/* parse add command */
	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
				options, count);
	if (rc)
		goto err_put_id;

	/* mon_addrs_size includes a trailing NUL slot, hence the - 1 */
	rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
						options);
	if (IS_ERR(rbd_dev->rbd_client)) {
		rc = PTR_ERR(rbd_dev->rbd_client);
		goto err_put_id;
	}

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->poolid = rc;

	/* register our block device (0 => dynamically-assigned major) */
	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->major = rc;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	/* set up and announce blkdev mapping */
	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	return count;

	/*
	 * Once the device is on the bus, unwinding goes through
	 * rbd_bus_del_dev(): its release callback (rbd_dev_release)
	 * unregisters the blkdev, drops the client, frees rbd_dev and
	 * puts the module ref — so this path must not repeat those.
	 */
err_out_bus:
	rbd_id_put(rbd_dev);

	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
	rbd_put_client(rbd_dev);
err_put_id:
	rbd_id_put(rbd_dev);
err_nomem:
	kfree(options);		/* kfree(NULL) is a no-op */
	kfree(rbd_dev);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
2439
2440static struct rbd_device *__rbd_get_dev(unsigned long id)
2441{
2442 struct list_head *tmp;
2443 struct rbd_device *rbd_dev;
2444
Alex Eldere124a822012-01-29 13:57:44 -06002445 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002446 list_for_each(tmp, &rbd_dev_list) {
2447 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a822012-01-29 13:57:44 -06002448 if (rbd_dev->id == id) {
2449 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002450 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002451 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002452 }
Alex Eldere124a822012-01-29 13:57:44 -06002453 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002454 return NULL;
2455}
2456
/*
 * Release callback for the rbd device (runs when the embedded struct
 * device's last reference is put, e.g. via rbd_bus_del_dev()): tears
 * down the header watch, drops the ceph client, frees the disk and
 * the rbd_dev itself, and releases the module reference taken in
 * rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev =
			container_of(dev, struct rbd_device, dev);

	/* cancel the lingering watch request, if one was registered */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2481
/*
 * Handle a write to /sys/bus/rbd/remove: tear down the mapped device
 * whose decimal id was written.  Returns @count, or a negative errno
 * on bad input or unknown id.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	/* ctl_mutex keeps the device alive after __rbd_get_dev() drops
	 * the list lock */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* unlink from the global list / release the id first ... */
	rbd_id_put(rbd_dev);

	/* ... then unregister; the final put frees rbd_dev */
	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2517
/*
 * sysfs "create_snap" store handler: create a snapshot with the
 * written name, refresh the device's snapshot list, and notify other
 * watchers of the header change.  Returns @count or a negative errno.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/*
	 * NOTE(review): snprintf() with size == count keeps at most
	 * count - 1 characters, so the last byte of buf is dropped —
	 * presumably relying on sysfs input ending in '\n'; verify,
	 * since the buffer was sized count + 1.
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	ret = __rbd_update_snaps(rbd_dev);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2558
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002559/*
2560 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002561 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002562 */
2563static int rbd_sysfs_init(void)
2564{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002565 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002566
Alex Elderfed4c142012-02-07 12:03:36 -06002567 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002568 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002569 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002570
Alex Elderfed4c142012-02-07 12:03:36 -06002571 ret = bus_register(&rbd_bus_type);
2572 if (ret < 0)
2573 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002574
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002575 return ret;
2576}
2577
/* Tear down sysfs state in the reverse order of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2583
2584int __init rbd_init(void)
2585{
2586 int rc;
2587
2588 rc = rbd_sysfs_init();
2589 if (rc)
2590 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002591 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002592 return 0;
2593}
2594
/* Module exit point: remove the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2599
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");