4 * Copyright (C) 2005-2009
5 * Neil Brown <neilb@suse.de>
6 * Released under the GPL, version 2
10 #include <linux/namei.h>
11 #include <linux/crc32.h>
12 #include <linux/statfs.h>
13 #include <linux/mount.h>
14 #include <linux/exportfs.h>
15 #include <linux/slab.h>
/* Forward declarations; both operation tables are defined near the end
 * of this file. */
17 static struct super_operations lafs_sops;
18 static const struct export_operations lafs_export_ops;
20 /*---------------------------------------------------------------------
21 * Write out state and super blocks
22 * The super blocks only need to be written when the geometry of the
23 * array changes such as when a device is added, removed, or resized.
24 * So we don't bother with that just yet.
25 * The state block needs to be written - twice on each device - whenever
26 * a checkpoint is completed. All copies are identical and the writes
27 * proceed in parallel. There are 4 stateblock locations on each device.
28 * 2 are typically less recent than the other two. We over-write the
30 * FIXME on a RAID4 we should pad the write to be a full stripe.
32 * Locking issues: This is called from the checkpoint thread and so
33 * it does not race with anything else exclusive to that thread.
34 * The nonlog information needs to be reviewed once that functionality
/*
 * lafs_write_state - write the current filesystem state block to all devices.
 * Fills the in-memory state block from 'fs', checksums it, and writes it
 * to the two currently-active state-block slots (selected by seq parity)
 * on every device.
 * NOTE(review): several lines are not visible in this view (allocation of
 * 'st', declarations of 'i'/'d', the return); comments describe only the
 * visible statements.
 */
38 int lafs_write_state(struct fs *fs)
40 struct lafs_state *st;
45 st->seq = cpu_to_le32(fs->seq);
46 st->nonlog_segment = cpu_to_le32(fs->nonlog_segment);
47 st->nonlog_dev = cpu_to_le16(fs->nonlog_dev);
48 st->nonlog_offset = cpu_to_le16(fs->nonlog_offset);
49 st->nextyouth = cpu_to_le16(fs->youth_next);
50 st->checkpointcluster = cpu_to_le64(fs->checkpointcluster);
/* Record the root inode address of every snapshot. */
51 for (i = 0; i < fs->maxsnapshot; i++)
52 st->root_inodes[i] = cpu_to_le64(fs->ss[i].root_addr);
55 st->checksum = crc32_le(0, (unsigned char *)st, fs->statesize);
/* seq parity selects which 2 of the 4 state-block slots get overwritten,
 * so the previous state always survives on the other pair. */
57 for (d = 0; d < fs->devices ; d++)
58 for (i = (fs->seq & 1); i < 4 ; i += 2)
59 lafs_super_write(fs, d, fs->devs[d].stateaddr[i] >> 9,
60 (char *)st, fs->statesize);
62 /* FIXME what about a write error ??? */
/*
 * valid_devblock - sanity-check a candidate device block read from disk.
 * 'addr' is the sector it was read from; 'size' is the device size.
 * Checks magic tag, version, CRC, self-address, and geometry ranges.
 * NOTE(review): the return statements and some declarations (crc, crc2,
 * byteaddr, segsize, i) are not visible in this view.
 */
67 valid_devblock(struct lafs_dev *db, sector_t addr, sector_t size)
69 /* check that this devblock is valid, given that
70 * it was found at sector 'addr'
77 if (strncmp(db->idtag, "LaFS-DeviceBlock", 16) != 0)
/* NOTE(review): both tags are presumably 16 bytes space-padded on disk;
 * the padding appears collapsed in this view. */
79 if (strncmp(db->version, "AlphaDevel ", 16) != 0)
81 /* uuid can be anything */
84 crc2 = crc32_le(0, (unsigned char *)db, LAFS_DEVBLK_SIZE);
87 dprintk("%lx != %lx\n", (unsigned long)crc,
/* The devblock must record the address it was actually read from. */
92 byteaddr = (u64)addr << 9; /* convert to byte */
93 if (le64_to_cpu(db->devaddr[0]) != byteaddr &&
94 le64_to_cpu(db->devaddr[1]) != byteaddr)
/* Geometry fields must be in plausible ranges. */
97 if (db->statebits < 10 || db->statebits > 16)
99 if (db->blockbits < 9 || db->blockbits > 20)
101 if (le16_to_cpu(db->width) < 1 || le16_to_cpu(db->width) >= 512)
103 if (le32_to_cpu(db->stride) < 1)
105 /* devaddr[0] must be early, [1] must be late */
106 if (le64_to_cpu(db->devaddr[0]) >=
107 le64_to_cpu(db->segment_offset))
110 if (le64_to_cpu(db->devaddr[1]) <
111 le64_to_cpu(db->segment_offset) +
112 ((((sector_t)le32_to_cpu(db->segment_count)
113 * le32_to_cpu(db->segment_size)))
117 /* 2 is an absolute minimum segment size, a few hundred is more
118 * likely. We'll put a lower limit of 8, and an upper of 800000
120 if (le32_to_cpu(db->segment_size) < 8 ||
121 le32_to_cpu(db->segment_size) > 800000)
124 if (le32_to_cpu(db->segment_offset) >
125 (le32_to_cpu(db->segment_size)<<db->blockbits) * 10)
128 /* The 4 state blocks live before the first or after the last segment.
129 * The distance from start of first to end of last is either:
130 * - segment_count * segment_size if width*stride <= segment_size
131 * - (width-1) * stride + segment_size / width * segment_count
132 * if width * stride > segment_size
134 segsize = le32_to_cpu(db->segment_size);
135 segsize *= le32_to_cpu(db->segment_count);
136 if (le16_to_cpu(db->width) * le32_to_cpu(db->stride)
137 > le32_to_cpu(db->segment_size)) {
138 int stride = le32_to_cpu(db->stride);
139 int width = le16_to_cpu(db->width);
141 sector_div(segsize, width);
142 segsize += (width - 1) * stride;
144 segsize <<= db->blockbits;
/* Each state block must avoid the segment area and fit in the device. */
145 for (i = 0; i < 4; i++) {
146 sector_t addr = le64_to_cpu(db->stateaddr[i]);
147 int offset = le32_to_cpu(db->segment_offset);
148 if (addr + (1<<db->statebits) > offset &&
149 addr < offset + segsize)
151 if (addr + (1<<db->statebits) > (size << db->blockbits))
155 /* Check all segments fit within device */
156 if (le32_to_cpu(db->segment_offset) + segsize > (size << db->blockbits))
159 if (le32_to_cpu(db->level) > 10)
162 /* I guess it looks sane enough... */
/*
 * compare_dev - order two valid devblocks by uuid and sequence number.
 * NOTE(review): the return statements are not visible in this view;
 * the comment below documents the intended return values.
 */
167 compare_dev(struct lafs_dev *orig, struct lafs_dev *new)
169 /* Both these are known to be valid.
171 * 0 if they are for same filesystem, but 'new' is older
172 * 1 if they are for same filesystem, and 'new' is newer
173 * -1 if they are for different filesystems
175 if (memcmp(orig->uuid, new->uuid, 16))
/* u32_after handles 32-bit sequence-number wrap-around. */
177 if (u32_after(le32_to_cpu(new->seq),
178 le32_to_cpu(orig->seq)))
/*
 * valid_stateblock - validate a candidate state block against its devblock.
 * Checks magic tag, version, CRC over the devblock-declared state size,
 * matching uuid, and that the snapshot table fits in the state block.
 * NOTE(review): return statements and the 'crc' handling around the CRC
 * test are not visible in this view.
 */
184 valid_stateblock(struct lafs_state *st, struct lafs_dev *dv)
186 /* Given the 'dv' devblock, make sure 'st' is a valid
187 * and consistent stateblock
190 if (strncmp(st->idtag, "LaFS-State-Block", 16) != 0)
192 if (strncmp(st->version, "AlphaDevel ", 16) != 0)
196 if (crc32_le(0, (unsigned char *)st, 1<<dv->statebits) != crc)
200 if (memcmp(st->uuid, dv->uuid, 16))
/* 8 bytes per snapshot root-inode entry must fit in the state block. */
203 if (sizeof(*st) + le32_to_cpu(st->maxsnapshot) * 8
204 > (1<<dv->statebits))
/*
 * compare_state - return non-zero if 'new' is a more recent state block
 * than 'orig' (both already validated, same uuid).
 */
211 compare_state(struct lafs_state *orig, struct lafs_state *new)
213 /* return 1 if 'new' is actually newer than 'orig'.
214 * We already know they are both valid and have the same
215 * uuid... I don't think there is anything else to be checked
217 return u32_after(le32_to_cpu(new->seq), le32_to_cpu(orig->seq));
222 * As we can have multiple devices, things are slightly non-obvious.
223 * The 'devname' can be either a device name, starting '/', or
224 * a filesystem name (not starting '/').
225 * The 'data' is a standard comma-separated list of options.
226 * For 'mount' these are:
228 * - devices in addition to 'dev_name'
230 * - A new device, with a superblock already present, to be added.
232 * - don't complain if not all devices are given
233 * ?? quota stuff, cleaning parameters,
235 * For 'remount', options are
236 * dev= - add another device
237 * new= - the device is being added.
/* NOTE(review): these appear to be fields of the mount-time 'struct
 * options' / per-device 'struct devent' whose headers are not visible in
 * this view — confirm against the full source. The bdev/devblock/
 * stateblock trio holds what lafs_load_super() reads for each device;
 * devchoice/statechoice record which on-disk copies were selected. */
244 int statebits, blockbits;
249 struct block_device *bdev;
250 struct lafs_dev *devblock;
251 struct lafs_state *stateblock;
252 int devchoice, statechoice;
/*
 * count_devs - count the devices named in the mount request:
 * the 'name' argument plus every "dev=" / "new=" option in 'data'.
 * NOTE(review): the counter increments and return are not visible here.
 */
257 count_devs(const char *name, char *data)
262 while (data && *data) {
263 if (strncmp(data, "dev=", 4) == 0)
265 if (strncmp(data, "new=", 4) == 0)
/* advance to the next comma-separated option */
267 data = strchr(data, ',');
/*
 * parse_opts - parse the mount options into 'op'.
 * 'name' (the dev_name) becomes the first devlist entry; each "dev=" /
 * "new=" option adds another. Unknown options are reported (the printk
 * continuation is visible at the end).
 * NOTE(review): allocation-failure handling and the return are not
 * visible in this view; strsep() modifies 'data' in place.
 */
275 parse_opts(struct options *op, const char *name, char *data)
280 memset(op, 0, sizeof(*op));
281 op->devcnt = count_devs(name, data);
282 op->devlist = kzalloc(op->devcnt*sizeof(op->devlist[0]), GFP_KERNEL);
/* dev_name itself is entry 0, flagged as a name rather than an option */
289 op->devlist[dv].is_name = 1;
290 op->devlist[dv++].dev = name;
293 while ((p = strsep(&data, ",")) != NULL) {
296 if (strncmp(p, "dev=", 4) == 0)
297 op->devlist[dv++].dev = p+4;
298 else if (strncmp(p, "new=", 4) == 0) {
299 op->devlist[dv].is_new = 1;
300 op->devlist[dv++].dev = p+4;
303 "LaFS: Unrecognised mount option \"%s\"\n", p);
/*
 * lafs_load_super - locate and load the devblock and stateblock for one
 * device, storing the newest valid copies in the current devlist entry.
 * NOTE(review): error paths, loop-exit handling and several declarations
 * (n, pg, i, ok, devsize, dv) are not visible in this view.
 */
314 lafs_load_super(struct block_device *bdev, void *opv, int silent)
316 /* Find the devblock and the stateblock for this device
318 * Only do basic internal consistency checks. Inter-device
319 * checks happen later
321 struct options *op = opv;
324 sector_t sect, dev_addr = 0, state_addr = 0;
328 int have_dev = 0, have_state = 0;
331 dv = &op->devlist[op->curr_dev];
332 BUG_ON(dv->devblock);
333 BUG_ON(dv->stateblock);
/* Read size: at least the devblock size, rounded to the logical block */
335 n = queue_logical_block_size(bdev->bd_disk->queue);
336 if (n < LAFS_DEVBLK_SIZE)
337 n = LAFS_DEVBLK_SIZE;
338 BUG_ON(n > PAGE_SIZE);
339 dv->devblock = kmalloc(n, GFP_KERNEL);
342 pg = alloc_page(GFP_KERNEL);
346 devsize = i_size_read(bdev->bd_inode);
348 /* Now find a devblock, check the first two possible locations,
349 * and the last two. If two devblocks are found with different
350 * uuids, we are confused!
353 for (i = 0; i < 4; i++) {
354 /* try to read block at 'sect' */
355 int ok = lafs_sync_page_io(bdev, sect, 0, n, pg, READ);
357 if (ok && valid_devblock(page_address(pg), sect, devsize)) {
360 memcpy(dv->devblock, page_address(pg), n);
362 } else switch (compare_dev(dv->devblock,
364 case 0: /* older, do nothing */
366 case 1: /* newer, overwrite */
367 memcpy(dv->devblock, page_address(pg), n);
370 default: /* inconsistent --- HELP */
371 printk(KERN_ERR "LaFS: inconsistent device-blocks found.\n");
/* switch to probing the copies at the end of the device */
380 sect = devsize & ~(sector_t)(n-1);
385 /* FIXME - we've lost the read error, if it was significant */
389 printk(KERN_ERR "LaFS - no valid devblock found.\n");
393 /* OK, we have a valid devblock, that's nice.
394 * Now we should be able to find some stateblocks.
395 * The locations are in the devblock
/* NOTE(review): le32_to_cpu() of a host-computed shift looks wrong —
 * statebits is a single byte; verify against the full source. */
397 n = le32_to_cpu(1<<dv->devblock->statebits);
399 n < queue_logical_block_size(bdev->bd_disk->queue) ||
401 printk(KERN_ERR "LaFS: statesize of %u not acceptable.\n", n);
405 dv->stateblock = kmalloc(n, GFP_KERNEL);
/* Read all 4 state-block copies and keep the newest valid one */
409 for (i = 0; i < 4; i++) {
411 sect = le64_to_cpu(dv->devblock->stateaddr[i])>>9;
412 ok = lafs_sync_page_io(bdev, sect, 0, n, pg, READ);
413 if (ok && valid_stateblock(page_address(pg), dv->devblock)) {
416 memcpy(dv->stateblock, page_address(pg), n);
418 } else if (compare_state(dv->stateblock,
420 memcpy(dv->stateblock, page_address(pg), n);
/* remember which on-disk copies were chosen */
428 dv->devchoice = dev_addr;
429 dv->statechoice = state_addr;
433 printk(KERN_ERR "LaFS: no valid stateblock found.\n");
436 page_cache_release(pg);
/*
 * check_devs - cross-device consistency checks after all devices loaded:
 * same uuid everywhere, devblock seq numbers within one of each other,
 * full device count present, and no address-range overlap.
 * NOTE(review): the returns, 'newstate' initialisation and several error
 * paths are not visible in this view; presumably returns the index of
 * the device holding the newest stateblock.
 */
441 check_devs(struct options *op)
443 /* Check we have enough, that they are for the same
444 * uuid, and that they don't overlap
445 * Also check that 'seq' number of devblocks
448 int seqlo = le32_to_cpu(op->devlist[0].devblock->seq);
449 int seqhi = le32_to_cpu(op->devlist[0].devblock->seq);
454 for (i = 1; i < op->devcnt; i++) {
455 if (memcmp(op->devlist[0].stateblock->uuid,
456 op->devlist[i].stateblock->uuid,
/* devblock seqs may legitimately differ by exactly one (mid-update) */
460 if (le32_to_cpu(op->devlist[i].devblock->seq) == seqlo)
462 else if (le32_to_cpu(op->devlist[i].devblock->seq) == seqlo+1) {
465 } else if (le32_to_cpu(op->devlist[i].devblock->seq) == seqhi-1)
/* track which device carries the newest stateblock */
470 if (u32_after(le32_to_cpu(op->devlist[i].stateblock->seq),
471 le32_to_cpu(op->devlist[newstate].
/* the stateblock records how many devices the filesystem needs */
475 if (le32_to_cpu(op->devlist[newstate].stateblock->devices)
479 op->statebits = op->devlist[0].devblock->statebits;
480 op->blockbits = op->devlist[0].devblock->blockbits;
482 /* Now check devices don't overlap in start/size.
483 * We do a simple quadratic search
485 for (i = 0; i < op->devcnt; i++)
486 for (j = 0; j < op->devcnt; j++)
488 if (le64_to_cpu(op->devlist[i].devblock->start) <
489 le64_to_cpu(op->devlist[j].devblock->start) &&
491 le64_to_cpu(op->devlist[i].devblock->start)+
492 le64_to_cpu(op->devlist[i].devblock->size) >
493 le64_to_cpu(op->devlist[j].devblock->start))
498 /* we identify lafs superblocks by the filesystem uuid. This means
499 * that block-level snapshots cannot be mounted. You should use
500 * fs-level snapshots instead.
/* sget() comparison callback: two superblocks match when their
 * filesystems carry the same state-block uuid. */
502 static int sb_test(struct super_block *sb, void *data)
504 struct sb_key *ptn = data;
505 struct sb_key *sk = sb->s_fs_info;
506 return memcmp(ptn->fs->state->uuid,
507 sk->fs->state->uuid, 16) == 0;
/* sget() initialisation callback for a newly-allocated superblock.
 * NOTE(review): 'ptn' is assigned but its use (presumably
 * sb->s_fs_info = data) is not visible in this view — confirm against
 * the full source. */
510 static int sb_set(struct super_block *sb, void *data)
512 struct sb_key *ptn = data;
514 return set_anon_super(sb, NULL);
/*
 * lafs_load - build the 'struct fs' from the loaded dev/state blocks:
 * copy state fields, allocate tracking structures, initialise locks,
 * lists and waitqueues, sget() the prime superblock, and fill in the
 * per-device geometry.
 * NOTE(review): many error-path and declaration lines are not visible in
 * this view (i, j, k, err declarations; several gotos and returns);
 * comments describe only the visible statements.
 */
519 lafs_load(struct fs *fs, struct options *op, int newest)
521 /* We seem to have a full set of devices for the filesystem.
522 * Time to create our fs_info structure and fill it out.
523 * This only includes information from the dev and state blocks.
524 * Finding the root-inode comes a bit later.
526 struct lafs_state *st;
/* Steal the newest stateblock from the option list; fs now owns it. */
531 st = fs->state = op->devlist[newest].stateblock;
532 op->devlist[newest].stateblock = NULL;
537 fs->seq = le32_to_cpu(st->seq);
538 fs->levels = le32_to_cpu(st->levels);
539 fs->devices = op->devcnt;
540 fs->devs_loaded = fs->devices; /* FIXME use this or lose this */
541 fs->statesize = 1 << op->statebits;
542 fs->blocksize = 1 << op->blockbits;
543 fs->blocksize_bits = op->blockbits;
545 fs->nonlog_segment = le32_to_cpu(st->nonlog_segment);
546 fs->nonlog_dev = le16_to_cpu(st->nonlog_dev);
547 fs->nonlog_offset = le16_to_cpu(st->nonlog_offset);
548 fs->youth_next = le16_to_cpu(st->nextyouth);
549 fs->checkpoint_youth = fs->youth_next;
550 if (fs->youth_next < 8)
552 fs->scan.first_free_pass = 1;
553 fs->scan.free_dev = -1;
555 fs->maxsnapshot = le32_to_cpu(st->maxsnapshot);
557 fs->scan.free_usages = kmalloc(PAGE_SIZE, GFP_KERNEL);
558 err = lafs_segtrack_init(fs->segtrack);
560 fs->ss = kzalloc(sizeof(struct snapshot)*fs->maxsnapshot, GFP_KERNEL);
561 if (!fs->ss || !fs->scan.free_usages || err) {
567 fs->checkpointcluster = le64_to_cpu(st->checkpointcluster);
/* record each snapshot's root inode address from the stateblock */
568 for (i = 0; i < fs->maxsnapshot; i++) {
569 fs->ss[i].root_addr =
570 le64_to_cpu(st->root_inodes[i]);
571 dprintk("root inode %d are %llu\n",
572 i, fs->ss[i].root_addr);
574 INIT_LIST_HEAD(&fs->pending_orphans);
575 INIT_LIST_HEAD(&fs->inode_index);
576 INIT_LIST_HEAD(&fs->phase_leafs[0]);
577 INIT_LIST_HEAD(&fs->phase_leafs[1]);
578 INIT_LIST_HEAD(&fs->clean_leafs);
579 INIT_LIST_HEAD(&fs->account_leafs);
580 atomic_set(&fs->sb_writes_pending, 0);
581 init_waitqueue_head(&fs->sb_writes_wait);
582 init_waitqueue_head(&fs->async_complete);
583 init_waitqueue_head(&fs->trunc_wait);
584 mutex_init(&fs->cleaner.lock);
585 spin_lock_init(&fs->stable_lock);
586 spin_lock_init(&fs->alloc_lock);
587 spin_lock_init(&fs->lock);
588 init_waitqueue_head(&fs->phase_wait);
590 INIT_WORK(&fs->done_work, lafs_done_work);
592 /* FIXME add congestion and unplug functions to this bdi */
593 err = bdi_init(&fs->bdi);
598 fs->phase_locked = 0;
/* per write-cluster state: locks, pending-block lists, waitqueues */
599 for (i = 0; i < WC_NUM; i++) {
601 mutex_init(&fs->wc[i].lock);
602 for (j = 0; j < 4 ; j++) {
603 atomic_set(&fs->wc[i].pending_cnt[j], 0);
604 INIT_LIST_HEAD(&fs->wc[i].pending_blocks[j]);
606 init_waitqueue_head(&fs->wc[i].pending_wait);
607 fs->wc[i].seg.dev = -1;
610 fs->max_newsegs = 32; /* FIXME this should be configurable */
613 fs->devs = kzalloc(sizeof(struct fs_dev)*fs->devices, GFP_KERNEL);
617 k = kzalloc(sizeof(*k), GFP_KERNEL);
/* find-or-create the prime superblock, keyed by filesystem uuid */
619 fs->prime_sb = sget(&lafs_fs_type, sb_test, sb_set, k);
620 if (IS_ERR(fs->prime_sb)) {
622 err = PTR_ERR(fs->prime_sb);
625 if (fs->prime_sb->s_root) {
626 /* filesystem with this uuid already exists */
627 deactivate_locked_super(fs->prime_sb);
633 err = bdi_register_dev(&fs->bdi, fs->prime_sb->s_dev);
635 deactivate_locked_super(fs->prime_sb);
640 fs->prime_sb->s_bdi = &fs->bdi;
642 fs->prime_sb->s_blocksize = 1 << op->blockbits;
643 fs->prime_sb->s_blocksize_bits = op->blockbits;
644 fs->prime_sb->s_op = &lafs_sops;
645 fs->prime_sb->s_export_op = &lafs_export_ops;
646 fs->prime_sb->s_root = NULL;
648 /* We allow 29 bits for nanosecs, so they must be even. */
649 fs->prime_sb->s_time_gran = 2;
/* transfer per-device geometry out of the devblocks */
651 for (i = 0; i < fs->devices; i++) {
652 struct fs_dev *dv = &fs->devs[i];
653 struct devent *de = &op->devlist[i];
658 dv->devblk = de->devblock;
661 dv->recent_dev = de->devchoice;
662 dv->recent_state = de->statechoice;
664 dv->start = le64_to_cpu(dv->devblk->start);
665 dv->size = le64_to_cpu(dv->devblk->size);
666 dprintk("Dev %d seems to range %llu + %llu\n",
667 i, (unsigned long long)dv->start,
668 (unsigned long long)dv->size);
670 dv->width = le16_to_cpu(dv->devblk->width);
671 dv->stride = le32_to_cpu(dv->devblk->stride);
672 dv->segment_size = le32_to_cpu(dv->devblk->segment_size);
673 dv->segment_offset = le32_to_cpu(dv->devblk->segment_offset);
674 dv->segment_count = le32_to_cpu(dv->devblk->segment_count);
675 dv->usage_inum = le32_to_cpu(dv->devblk->usage_inum);
676 dv->level = le16_to_cpu(dv->devblk->level);
678 if (dv->segment_size > fs->max_segment)
679 fs->max_segment = dv->segment_size;
/* derive striping layout: see valid_devblock() geometry comment */
681 if (dv->width * dv->stride <= dv->segment_size) {
682 dv->tables_per_seg = dv->segment_size /
683 dv->width / dv->stride;
684 dv->rows_per_table = dv->stride;
685 dv->segment_stride = dv->segment_size;
687 dv->tables_per_seg = 1;
688 dv->rows_per_table = dv->segment_size / dv->width;
689 dv->segment_stride = dv->rows_per_table;
691 /* table size is the number of blocks in the segment usage
694 dv->tablesize = (dv->segment_count + (1<<(fs->blocksize_bits-1)) + 1)
695 >> (fs->blocksize_bits-1);
697 for (j = 0; j < 2; j++)
698 dv->devaddr[j] = le64_to_cpu(dv->devblk->devaddr[j]);
699 for (j = 0; j < 4; j++)
700 dv->stateaddr[j] = le64_to_cpu(dv->devblk->stateaddr[j]);
/* error-path cleanup (label not visible in this view) */
705 bdi_destroy(&fs->bdi);
706 kfree(fs->scan.free_usages);
707 lafs_segtrack_free(fs->segtrack);
/*
 * show_orphans - debug dump of the pending-orphan list and cleaner state.
 * Always returns 1 so it can be dropped into a wait_event() condition.
 */
714 static int show_orphans(struct fs *fs)
716 struct datablock *db;
717 printk("Orphans:\n");
718 list_for_each_entry(db, &fs->pending_orphans,
720 struct inode *ino = iget_my_inode(db);
721 printk("orphan=%s\n", strblk(&db->b));
723 lafs_print_tree(&LAFSI(ino)->iblock->b, 0);
726 printk("cleaner active: %d %d\n", fs->cleaner.active,
728 return 1; /* meaningless, but makes it easy to add to wait_event below */
/*
 * lafs_kill_sb - tear down the prime superblock and release 'struct fs':
 * stop the cleaner and orphan processing, kill the anon super, close all
 * block devices, sanity-check the leaf lists, and free tracking state.
 * NOTE(review): several declarations (i, b) and free/iput lines are not
 * visible in this view.
 */
731 static void lafs_kill_sb(struct super_block *sb)
733 struct fs *fs = fs_from_sb(sb);
734 /* Release the 'struct fs' */
737 /* FIXME should I refcount this when there are multiple
738 * filesets? How does that work?
741 /* Delay final destruction of the root inode */
742 /* FIXME all the sbs... */
743 set_bit(I_Deleting, &LAFSI(fs->ss[0].root)->iflags);
745 /* FIXME I'm not sure we should be waiting for the
746 * cleaner. Maybe we should just release all tc->cleaning
749 set_bit(CleanerDisabled, &fs->fsstate);
/* wait until orphan processing, scanning and cleaning have all stopped */
751 wait_event(fs->async_complete,
753 !test_bit(OrphansRunning, &fs->fsstate) &&
754 list_empty(&fs->pending_orphans) &&
755 fs->scan.done == 1 &&
756 fs->cleaner.active == 0);
/* drop the accesstime inode referenced from the root before teardown */
758 if (LAFSI(fs->ss[0].root)->md.fs.accesstime) {
759 struct inode *i = LAFSI(fs->ss[0].root)->md.fs.accesstime;
760 LAFSI(fs->ss[0].root)->md.fs.accesstime = NULL;
764 kill_anon_super(fs->prime_sb);
766 bdi_destroy(&fs->bdi);
768 for (i = 0; i < fs->devices; i++) {
769 struct fs_dev *dv = &fs->devs[i];
771 close_bdev_exclusive(dv->bdev, FMODE_READ|FMODE_WRITE);
774 /* Final checkpoint will have cleared out the leafs lists,
775 * so they should all be empty.
777 /* Lets see what is on the 'leaf' list? */
778 for (i = 0; i < 2; i++) {
780 dprintk("For phase %d\n", i);
782 list_for_each_entry(b, &fs->phase_leafs[i], lru) {
783 /* FIXME this only OK for readonly mounts.
785 getref(b, MKREF(release));
787 if (test_bit(B_Pinned, &b->flags)) {
788 /* didn't fix the pincnt !! */
789 printk("This was pinned: %s\n", strblk(b));
790 lafs_print_tree(b, 1);
793 putref(b, MKREF(release));
797 BUG_ON(!list_empty(&fs->clean_leafs));
799 flush_scheduled_work();
800 lafs_stop_thread(fs);
802 for (i = 0; i < 4; i++)
803 if (fs->cleaner.seg[i].chead)
804 put_page(fs->cleaner.seg[i].chead);
809 lafs_segtrack_free(fs->segtrack);
810 kfree(fs->scan.free_usages);
811 kfree(fs->prime_sb->s_fs_info);
/*
 * lafs_put_super - run the final checkpoint and release per-superblock
 * resources. For the prime sb this closes segments and drops the segsum
 * inodes; for any sb it breaks the circular reference to its snapshot
 * root inodes.
 * NOTE(review): declarations of 'd' and 'ss' and some intermediate lines
 * are not visible in this view.
 */
816 lafs_put_super(struct super_block *sb)
818 struct fs *fs = fs_from_sb(sb);
820 struct lafs_inode *li;
822 lafs_checkpoint_lock(fs);
823 lafs_checkpoint_start(fs);
824 if (sb == fs->prime_sb)
825 /* Don't incorporate any more segusage/quota updates. */
826 set_bit(FinalCheckpoint, &fs->fsstate);
827 lafs_checkpoint_unlock_wait(fs);
828 lafs_cluster_wait_all(fs);
830 if (sb == fs->prime_sb) {
832 /* This is the main sb, not a snapshot or
834 * Now that all inodes have been invalidated we can do
835 * the final checkpoint.
837 lafs_close_all_segments(fs);
838 lafs_empty_segment_table(fs);
839 lafs_seg_put_all(fs);
/* release the per-device segment-summary inodes */
843 for (d=0; d < fs->devices; d++)
844 if (fs->devs[d].segsum) {
845 iput(fs->devs[d].segsum);
846 fs->devs[d].segsum = NULL;
850 /* need to break a circular reference... */
851 for (ss = 0; ss < fs->maxsnapshot; ss++)
852 if (fs->ss[ss].root &&
853 fs->ss[ss].root->i_sb == sb) {
854 dprintk("Putting ss %d\n", ss);
855 li = LAFSI(fs->ss[ss].root);
856 if (test_bit(B_Realloc, &li->dblock->b.flags))
858 iput(fs->ss[ss].root);
859 fs->ss[ss].root = NULL;
/*
 * lafs_get_devs - open every device in the option list exclusively and
 * load its dev/state blocks via lafs_load_super().
 * NOTE(review): error handling for open_bdev_exclusive()/load failures
 * and the return are not visible in this view.
 */
865 lafs_get_devs(struct fs *fs, struct options *op, int flags)
870 for (i = 0; i < op->devcnt; i++) {
871 struct block_device *bdev;
874 bdev = open_bdev_exclusive(op->devlist[i].dev,
875 FMODE_READ|FMODE_WRITE, fs);
879 err = lafs_load_super(bdev, op, flags & MS_SILENT ? 1 : 0);
/* remember the bdev so lafs_get_sb()'s cleanup can close it */
882 op->devlist[i].bdev = bdev;
/*
 * lafs_get_sb - mount entry point (file_system_type.get_sb).
 * Parses options, opens all devices, cross-checks them, builds the
 * 'struct fs', mounts and starts the filesystem thread, then releases
 * the temporary option list.
 * NOTE(review): declarations (op, cdata, newest, i) and error gotos are
 * not visible in this view.
 */
891 lafs_get_sb(struct file_system_type *fs_type,
892 int flags, const char *dev_name, void *data,
893 struct vfsmount *mnt)
895 /* as we may have multiple devices, some in 'data', we cannot just
896 * use get_sb_bdev, we need to roll-our-own.
897 * We call get_sb_bdev on *each* bdev, and make sure the returned
898 * superblocks are either all new, or all for the same filesystem.
899 * If the latter, we return the primary.
900 * If the former, we init the filesystem copying static data
902 * First we 'open_bdev_exclusive' each device, exclusive to lafs
903 * Then we 'sget' a superblock that knows any/all the devices.
904 * This may be pre-existing, or may be new
905 * If new, it will be created knowing all devices.
906 * If pre-existing, and don't have correct device list, error
911 struct fs *fs = kzalloc(sizeof(*fs), GFP_KERNEL);
919 err = parse_opts(&op, dev_name, cdata);
923 /* We now have a list of device names. We call open_bdev_exclusive
924 * on each to collect some superblocks.
926 err = lafs_get_devs(fs, &op, flags);
930 /* Each device has a valid dev and state block. Hopefully they
931 * are all for the same filesystem. If they don't have the
932 * same uuid, we will bale-out here. We also check that we have
933 * enough, and that they don't overlap.
934 * While we are looking at state blocks, pick the newest.
936 newest = check_devs(&op);
942 /* So they seem to be the same - better create our
943 * 'fs' structure and fill it in
945 err = lafs_load(fs, &op, newest);
949 /* Well, all the devices check out. Now we need to find the
951 err = lafs_mount(fs);
953 err = lafs_start_thread(fs);
955 deactivate_locked_super(fs->prime_sb);
957 fs->prime_sb->s_flags |= MS_ACTIVE;
958 simple_set_mnt(mnt, fs->prime_sb);
960 /* And there you have it. Filesystem all mounted, root dir found,
961 * metadata files initialised, all pigs fed, and ready to fly!!!
965 /* Now we clean up 'options'. Anything that is wanted has
966 * been moved into 'fs', so we just discard anything we find
970 for (i = 0; i < op.devcnt; i++) {
971 kfree(op.devlist[i].devblock);
972 kfree(op.devlist[i].stateblock);
973 if (op.devlist[i].bdev)
974 close_bdev_exclusive(op.devlist[i].bdev,
975 FMODE_READ|FMODE_WRITE);
/* sget() comparison callback for sub-filesets: match on the same
 * parent fs AND the same root inode. */
982 static int test_subset(struct super_block *sb, void *data)
984 struct sb_key *ptn = data;
985 struct sb_key *k = sb->s_fs_info;
987 return ptn->fs == k->fs && ptn->root == k->root;
/* sget() initialisation callback: stash the sb_key and make the sb
 * anonymous (sub-filesets have no backing bdev of their own). */
990 static int set_subset(struct super_block *sb, void *data)
992 sb->s_fs_info = data;
993 set_anon_super(sb, NULL);
997 static struct file_system_type lafs_subset_fs_type;
/*
 * lafs_get_subset_sb - find or create the superblock for a sub-fileset
 * rooted at 'ino'. On first creation, looks up (or creates) the root
 * directory (inum 2), inode map (inum 1) and accesstime file (inum 3),
 * and takes a reference on the prime superblock.
 * NOTE(review): several error branches, the kfree of 'k', and the final
 * return are not visible in this view.
 */
998 struct super_block *lafs_get_subset_sb(struct inode *ino)
1000 /* ino must be a TypeInodeFile inode in the prime filesystem. */
1001 struct fs *fs = fs_from_inode(ino);
1002 struct super_block *sb;
1003 struct sb_key *k = kmalloc(sizeof(*k), GFP_KERNEL);
1006 return ERR_PTR(-ENOMEM);
1010 sb = sget(&lafs_subset_fs_type, test_subset, set_subset, k);
1013 } else if (sb->s_root) {
1014 /* already allocated */
1017 struct inode *rootdir, *imapfile;
/* new superblock: copy geometry/ops from the prime sb */
1021 sb->s_blocksize = fs->blocksize;
1022 sb->s_blocksize_bits = fs->blocksize_bits;
1023 sb->s_bdi = fs->prime_sb->s_bdi;
1024 sb->s_op = &lafs_sops;
1025 sb->s_export_op = &lafs_export_ops;
1026 sb->s_time_gran = 2;
/* inum 2 is the sub-fileset's root directory; create it if absent */
1027 rootdir = lafs_iget(sb, 2, SYNC);
1028 if (IS_ERR(rootdir) && PTR_ERR(rootdir) == -ENOENT) {
1029 rootdir = lafs_new_inode(fs, sb, NULL,
1030 TypeDir, 2, 0755, NULL);
1031 /* FIXME could the inode get written before we set
1032 * the link count ??*/
1033 rootdir->i_nlink = 2;
1035 if (IS_ERR(rootdir))
1036 err = PTR_ERR(rootdir);
1038 sb->s_root = d_alloc_root(rootdir);
/* inum 1 is the inode map; create it if absent */
1039 imapfile = lafs_iget(sb, 1, SYNC);
1040 if (IS_ERR(imapfile) && PTR_ERR(imapfile) == -ENOENT)
1041 imapfile = lafs_new_inode(fs, sb, NULL,
1042 TypeInodeMap, 1, 0, NULL);
1044 if (IS_ERR(imapfile))
1045 err = PTR_ERR(imapfile);
/* inum 3, if present, is the accesstime file; -ENOENT is acceptable */
1051 struct inode *atime = lafs_iget(sb, 3, SYNC);
1052 if (!IS_ERR(atime)) {
1053 if (LAFSI(atime)->type != TypeAccessTime) {
1057 LAFSI(ino)->md.fs.accesstime = atime;
1058 } else if (PTR_ERR(atime) != -ENOENT)
1063 sb->s_op = fs->prime_sb->s_op;
1064 sb->s_flags |= MS_ACTIVE;
/* the subset sb keeps its parent filesystem pinned */
1065 atomic_inc(&fs->prime_sb->s_active);
/* error-path teardown (label not visible in this view) */
1068 deactivate_locked_super(sb);
/*
 * lafs_get_subset - mount entry point for the "lafs_subset" fs type.
 * Resolves dev_name to an inode in an existing LaFS mount; an empty
 * mode-0 directory is converted in place to a TypeInodeFile (committed
 * with a checkpoint), then a subset superblock is built for it.
 * NOTE(review): declarations (fs, ino, md, err) and several error
 * branches/returns are not visible in this view.
 */
1076 lafs_get_subset(struct file_system_type *fs_type,
1077 int flags, const char *dev_name, void *data,
1078 struct vfsmount *mnt)
1080 /* mount, possibly creating, a sub-fileset.
1081 * dev_name must be an absolute path that leads
1082 * to an object in a lafs file-system (or snapshot).
1083 * The object must be either an InodeFile or
1084 * an empty directory in the main file-system
1085 * with mode 0 (though that rule might change).
1086 * In the latter case we change the object to an
1088 * FIXME must require readonly for snapshots, and readwrite
1092 struct nameidata nd;
1094 struct super_block *sb;
1098 err = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
1101 sb = nd.path.dentry->d_sb;
/* the path must resolve inside a LaFS (or LaFS snapshot) mount */
1103 if (sb->s_type != &lafs_fs_type &&
1104 sb->s_type != &lafs_snap_fs_type)
1106 ino = nd.path.dentry->d_inode;
1107 if (LAFSI(ino)->type != TypeInodeFile &&
1108 LAFSI(ino)->type != TypeDir)
1110 fs = fs_from_sb(sb);
1111 mutex_lock(&ino->i_mutex);
1112 if (LAFSI(ino)->type == TypeDir) {
1113 struct datablock *inodb;
1114 /* maybe convert this to TypeInodeFile */
/* conversion only allowed on the main filesystem, not snapshots */
1115 if (sb->s_type != &lafs_fs_type)
1118 /* FIXME maybe I should run orphans */
/* only a mode-0 directory may be converted */
1120 if ((ino->i_mode & 07777) != 0)
1122 inodb = lafs_inode_dblock(ino, SYNC, MKREF(make_subset))
1123 err = PTR_ERR(inodb);
1126 lafs_iolock_block(&inodb->b);
1127 set_bit(B_PinPending, &inodb->b.flags);
1128 lafs_iounlock_block(&inodb->b);
1129 lafs_checkpoint_lock(fs);
1130 err = lafs_pin_dblock(inodb, ReleaseSpace);
1133 /* OK, we are good to go making this filesystem */
1134 LAFSI(ino)->type = TypeInodeFile;
1135 LAFSI(ino)->metadata_size = (sizeof(struct la_inode) +
1136 sizeof(struct fs_metadata));
1137 ino->i_op = &lafs_subset_ino_operations;
1138 ino->i_fop = &lafs_subset_file_operations;
1139 /* FIXME we lose md->parent here - what to do?? */
1140 md = &LAFSI(ino)->md.fs;
1142 ino->i_mtime = current_fs_time(sb);
/* fresh fileset metadata: zero usage, default quota */
1143 md->cblocks_used = 0;
1144 md->pblocks_used = 0;
1145 md->ablocks_used = 0;
1146 md->blocks_allowed = 10000; /* FIXME */
1147 md->blocks_unalloc = 0;
1148 /* FIXME should I be using inode_init here */
1149 md->creation_age = fs->wc[0].cluster_seq;
1150 md->inodes_used = 0;
1151 md->quota_inums[0] = 0;
1152 md->quota_inums[1] = 0;
1153 md->quota_inums[2] = 0;
1154 md->quota_inodes[0] = NULL;
1155 md->quota_inodes[1] = NULL;
1156 md->quota_inodes[2] = NULL;
1157 md->accesstime = NULL;
1159 lafs_dirty_dblock(inodb);
1160 lafs_dirty_inode(ino);
1161 /* We use a checkpoint to commit this change,
1162 * it is too unusual to bother logging
1164 lafs_checkpoint_start(fs);
1165 lafs_checkpoint_unlock_wait(fs);
1167 lafs_checkpoint_unlock(fs);
1169 putdref(inodb, MKREF(make_subset));
1174 /* We have a TypeInodeFile so we can make a superblock */
1175 sb = lafs_get_subset_sb(ino);
1181 simple_set_mnt(mnt, sb);
1183 mutex_unlock(&ino->i_mutex);
/*
 * lafs_kill_subset - unmount a sub-fileset: release its accesstime inode,
 * kill the anonymous super, and drop the reference this subset held on
 * the parent (prime) superblock.
 */
1190 static void lafs_kill_subset(struct super_block *sb)
1192 struct sb_key *k = sb->s_fs_info;
1193 if (LAFSI(k->root)->md.fs.accesstime) {
1194 iput(LAFSI(k->root)->md.fs.accesstime);
1195 LAFSI(k->root)->md.fs.accesstime = NULL;
1197 kill_anon_super(sb);
/* balances the s_active taken in lafs_get_subset_sb() */
1199 deactivate_super(k->fs->prime_sb);
/* Operation tables installed on a directory converted to a sub-fileset
 * root (see lafs_get_subset). Initialiser bodies are not visible in
 * this view. */
1203 const struct file_operations lafs_subset_file_operations = {
1206 const struct inode_operations lafs_subset_ino_operations = {
/* The primary "lafs" filesystem type (device-backed). */
1210 struct file_system_type lafs_fs_type = {
1211 .owner = THIS_MODULE,
1213 .get_sb = lafs_get_sb,
1214 .kill_sb = lafs_kill_sb,
1215 .fs_flags = FS_REQUIRES_DEV,
/* The "lafs_subset" type: sub-filesets mounted by path into an existing
 * LaFS mount; anonymous (no FS_REQUIRES_DEV). */
1218 static struct file_system_type lafs_subset_fs_type = {
1219 .owner = THIS_MODULE,
1220 .name = "lafs_subset",
1221 .get_sb = lafs_get_subset,
1222 .kill_sb = lafs_kill_subset,
/*
 * lafs_init - module init: initialise the inode hash then register the
 * three filesystem types. The trailing unregister calls are the error
 * path (unwinding lines between them are not visible in this view).
 */
1225 static int __init lafs_init(void)
1229 BUILD_BUG_ON(B_NUM_FLAGS > 32);
1231 err = lafs_ihash_init();
1232 err = err ?: register_filesystem(&lafs_fs_type);
1233 err = err ?: register_filesystem(&lafs_snap_fs_type);
1234 err = err ?: register_filesystem(&lafs_subset_fs_type);
/* error path: unregister everything (unregister of an unregistered
 * fs type is tolerated here) */
1240 unregister_filesystem(&lafs_fs_type);
1241 unregister_filesystem(&lafs_snap_fs_type);
1242 unregister_filesystem(&lafs_subset_fs_type);
/* lafs_exit - module unload: unregister all three filesystem types. */
1247 static void __exit lafs_exit(void)
1249 unregister_filesystem(&lafs_fs_type);
1250 unregister_filesystem(&lafs_snap_fs_type);
1251 unregister_filesystem(&lafs_subset_fs_type);
/*
 * lafs_nfs_get_inode - NFS export helper: look up an inode by number and
 * verify the generation from the file handle (stale handle otherwise).
 * NOTE(review): the iput() on generation mismatch and final return are
 * not visible in this view.
 */
1255 static struct inode *lafs_nfs_get_inode(struct super_block *sb,
1256 u64 ino, u32 generation)
1258 struct inode *inode;
1260 inode = lafs_iget(sb, ino, SYNC);
1262 return ERR_CAST(inode);
/* generation 0 in the handle means "don't check" */
1263 if (generation && inode->i_generation != generation) {
1265 return ERR_PTR(-ESTALE);
/* Decode an NFS file handle to a dentry using the generic helper. */
1271 static struct dentry *lafs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1272 int fh_len, int fh_type)
1274 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1275 lafs_nfs_get_inode);
/* Decode the parent directory from an NFS file handle. */
1278 static struct dentry *lafs_fh_to_parent(struct super_block *sb, struct fid *fid,
1279 int fh_len, int fh_type)
1281 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1282 lafs_nfs_get_inode);
/* Obtain the parent dentry of 'child' from the parent inum stored in
 * its LaFS metadata. */
1285 static struct dentry *lafs_get_parent(struct dentry *child)
1287 ino_t inum = LAFSI(child->d_inode)->md.file.parent;
1288 struct inode *inode = lafs_iget(child->d_inode->i_sb, inum, SYNC);
1290 return ERR_CAST(inode);
1291 return d_obtain_alias(inode);
/* NFS export operations (see forward declaration near top of file). */
1294 static const struct export_operations lafs_export_ops = {
1295 .fh_to_dentry = lafs_fh_to_dentry,
1296 .fh_to_parent = lafs_fh_to_parent,
1297 .get_parent = lafs_get_parent,
/*
 * lafs_alloc_inode - super_operations.alloc_inode: allocate and
 * initialise a lafs_inode, returning the embedded VFS inode.
 * NOTE(review): the NULL check after kmalloc and some field
 * initialisation lines are not visible in this view.
 */
1300 static struct inode *lafs_alloc_inode(struct super_block *sb)
1302 struct lafs_inode *li;
1303 li = kmalloc(sizeof(*li), GFP_NOFS);
1306 inode_init_once(&li->vfs_inode);
1307 li->vfs_inode.i_data.backing_dev_info = sb->s_bdi;
1310 li->update_cluster = 0;
1311 li->md.fs.name = NULL;
1313 init_rwsem(&li->ind_sem);
1314 INIT_LIST_HEAD(&li->free_index);
1316 return &li->vfs_inode;
/* RCU callback freeing a lafs_inode; fileset inodes also own their
 * name string. NOTE(review): the final kfree of 'lai' is not visible
 * in this view. */
1319 static void kfree_inode(struct rcu_head *head)
1321 struct lafs_inode *lai = container_of(head, struct lafs_inode,
1323 if (lai->type == TypeInodeFile)
1324 kfree(lai->md.fs.name);
/*
 * lafs_destroy_inode - super_operations.destroy_inode: detach the inode
 * from its datablock, release index blocks, and free the lafs_inode via
 * RCU (kfree_inode).
 */
1328 void lafs_destroy_inode(struct inode *inode)
1330 struct datablock *db;
1332 BUG_ON(!list_empty(&inode->i_sb_list));
1333 // Cannot test i_list as dispose_list just does list_del
1334 db = lafs_inode_get_dblock(inode, MKREF(destroy));
1337 set_bit(I_Destroyed, &LAFSI(inode)->iflags);
1338 putdref(db, MKREF(destroy));
/* an iblock still holding a refcount here would be a leak */
1340 spin_lock(&inode->i_data.private_lock);
1341 if (LAFSI(inode)->iblock)
1342 LAFS_BUG(atomic_read(&LAFSI(inode)->iblock->b.refcnt),
1343 &LAFSI(inode)->iblock->b);
1344 /* FIXME could there be Async blocks keeps a refcount?
1345 * we should free them
1347 spin_unlock(&inode->i_data.private_lock);
1348 lafs_release_index(&LAFSI(inode)->free_index);
1349 call_rcu(&LAFSI(inode)->md.rcu,
/*
 * lafs_sync_fs - super_operations.sync_fs: force a checkpoint; if 'wait'
 * is set, block until it completes.
 */
1354 static int lafs_sync_fs(struct super_block *sb, int wait)
1357 /* We only reach here if s_dirt was set, so it
1358 * is reasonable to force a checkpoint.
1360 lafs_checkpoint_start(fs_from_sb(sb));
1362 lafs_checkpoint_wait(fs_from_sb(sb));
/*
 * lafs_statfs - super_operations.statfs: report size/usage for the
 * fileset that 'de' belongs to, deriving f_fsid from the filesystem
 * uuid and the fileset's root inode number.
 * NOTE(review): declarations (i, fsid, fsuuid) and the f_files/f_ffree
 * section are not visible in this view.
 */
1366 static int lafs_statfs(struct dentry *de, struct kstatfs *buf)
1371 struct fs *fs = fs_from_inode(de->d_inode);
1372 struct lafs_inode *fsroot = LAFSI(ino_from_sb(de->d_inode->i_sb));
1373 struct lafs_inode *laroot = LAFSI(fs->ss[0].root);
/* fold the 16-byte uuid into the two fsid words */
1376 fsuuid = (u32 *)fs->state->uuid;
1377 for (i = 0; i < 16 / 4 ; i++) {
1378 fsid ^= le32_to_cpu(fsuuid[i]);
1379 buf->f_fsid.val[i/2] = fsid;
1381 buf->f_fsid.val[1] ^= fsroot->vfs_inode.i_ino;
1382 buf->f_type = 0x4C614654; /* "LaFS" */
1383 buf->f_bsize = fs->blocksize;
1384 buf->f_blocks = fsroot->md.fs.blocks_allowed;
1385 if (buf->f_blocks == 0) {
1386 /* should subtract usage of all other filesystems...*/
1387 for (i = 0; i < fs->devs_loaded; i++)
1388 buf->f_blocks += fs->devs[i].size;
1393 buf->f_namelen = 255;
1396 spin_lock(&laroot->vfs_inode.i_lock);
1397 /* "bavail" is "blocks we could succeed in adding to the filesystem".
1398 * "bfree" is effectively total blocks - used blocks
1400 buf->f_bavail = fs->free_blocks + fs->clean_reserved - fs->allocated_blocks;
1401 spin_unlock(&laroot->vfs_inode.i_lock);
1402 spin_lock(&fsroot->vfs_inode.i_lock);
1403 buf->f_bfree = buf->f_blocks - (fsroot->md.fs.cblocks_used +
1404 fsroot->md.fs.pblocks_used +
1405 fsroot->md.fs.ablocks_used);
1406 dprintk("df: tot=%ld free=%ld avail=%ld(%ld-%ld-%ld) cb=%ld pb=%ld ab=%ld\n",
1407 (long)buf->f_blocks, (long)buf->f_bfree, (long)buf->f_bavail,
1408 (long)fs->free_blocks, (long)fs->clean_reserved,
1409 (long)fs->allocated_blocks,
1410 (long)fsroot->md.fs.cblocks_used, (long)fsroot->md.fs.pblocks_used,
1411 (long)fsroot->md.fs.ablocks_used);
1412 spin_unlock(&fsroot->vfs_inode.i_lock);
1416 /* FIXME we hold inode_lock while calling drop_inode, so
1417 * extra locking isn't really welcome....???
/*
 * lafs_drop_inode - super_operations.drop_inode: perform the generic
 * drop, then wake the filesystem thread if the inode's datablock has
 * an async operation waiting on it.
 */
1419 static void lafs_drop_inode(struct inode *inode)
1421 struct fs *fs = fs_from_inode(inode);
1422 struct datablock *db;
1424 /* This lock that we now hold on the inode could prevent
1425 * the cleaner from getting the inode. So after
1426 * we complete the drop we might need to wake the cleaner.
1429 db = lafs_inode_get_dblock(inode, MKREF(drop));
1431 generic_drop_inode(inode);
1432 if (db && test_bit(B_Async, &db->b.flags))
1433 lafs_wake_thread(fs);
1435 putdref(db, MKREF(drop));
/* Super operations shared by the prime and subset superblocks. */
1438 static struct super_operations lafs_sops = {
1439 .alloc_inode = lafs_alloc_inode,
1440 .destroy_inode = lafs_destroy_inode, /* Inverse of 'alloc_inode' */
1441 /* Don't use read_inode */
1442 .dirty_inode = lafs_dirty_inode,
1443 /* .write_inode not needed */
1445 .drop_inode = lafs_drop_inode,
1446 /* drop_inode ?? */ /* default will call delete or forget
1447 * where 'forget' flushes and clears
1450 .clear_inode = lafs_clear_inode, /* forget internal state of this inode */
1451 .delete_inode = lafs_delete_inode, /* remove this inode from filesystem */
1452 .put_super = lafs_put_super,
1453 .sync_fs = lafs_sync_fs,
1454 /* write_super_lockfs ?? */
1456 .statfs = lafs_statfs,
/* Module metadata, entry points, and the runtime-tunable trace level. */
1460 MODULE_AUTHOR("Neil Brown");
1461 MODULE_DESCRIPTION("LaFS - Log Structured File System");
1462 MODULE_LICENSE("GPL");
1463 module_init(lafs_init);
1464 module_exit(lafs_exit);
1466 module_param(lafs_trace, int, 0644);
/*
 * do_dump - 'dump' module-parameter setter: trigger a debug dump of the
 * named subsystem (orphans, tree, cleanable or usage).
 * NOTE(review): the calls for "tree" and "usage" and the return are not
 * visible in this view.
 */
1470 static int do_dump(const char *val, struct kernel_param *kp)
1472 extern void lafs_dump_orphans(void);
1473 extern void lafs_dump_tree(void);
1474 extern void lafs_dump_cleanable(void);
1475 extern void lafs_dump_usage(void);
1477 printk("Want dump of %s\n", val);
1478 if (strncmp(val, "orphan", 6) == 0)
1479 lafs_dump_orphans();
1480 if (strncmp(val, "tree", 4) == 0)
1482 if (strncmp(val, "cleanable", 9) == 0)
1483 lafs_dump_cleanable();
1484 if (strncmp(val, "usage", 5) == 0)
/* 'dump' parameter getter: list the accepted dump keywords. */
1489 static int get_dump(char *buffer, struct kernel_param *kp)
1491 strcpy(buffer, "orphans,tree,cleanable,usage");
1492 return strlen(buffer);
/* writable 'dump' parameter wired to the handlers above */
1496 module_param_call(dump, do_dump, get_dump, &arg, 0775);