4 * Copyright (C) 2005-2010
5 * Neil Brown <neilb@suse.de>
6 * Released under the GPL, version 2
10 * This file handles mounting of a filesystem once the superblocks
12 * It loads the root inode (the root of the filesystem, not of the
13 * directory tree) and then handles roll-forward to pick up any changes
14 * that are not in the filesystem yet, either due to a crash, or because
15 * they cannot be consistently stored easily (final segusage/quota info).
17 * Roll-forward reads write-cluster header and handle things as appropriate.
18 * Data blocks are only processed if they belong to:
22 * A data block in a regular file implies an extension of the file size
23 * to the end of the block, if it was previously at or before the start
24 * of the block. Datablocks that were just moved for cleaning are
27 * Index blocks are always ignored - they need to be recalculated.
29 * 'miniblocks' or 'updates' are always processed - they represent an
30 * atomic update that might affect multiple files - those files for which
31 * data blocks are ignored.
32 * Updates are understood:
33 * - for inodes. The update simply over-writes part of the inode metadata,
34 * which could affect the link count or size. Such inodes become
35 * orphans in case truncation or deletion is needed. This can create
36 * an inode which might affect the inode usage map.
37 * - for directories. The update identifies a name and an inode number.
38 * This can imply a change to the inode's link count and again could
39 * make it an orphan. In some cases updates are paired, possibly across
40 * different directories. This is needed for 'rename'.
42 * Each write-cluster has three levels of validation.
43 * Firstly, if the header is internally consistent, with correct tag,
44 * uuid, and sequence, then we know a write was attempted, and anything that
45 * must be written before that was successfully written.
46 * Secondly, if the header has a correct checksum, then it is all correct,
47 * and the miniblocks are valid.
48 * Thirdly, if the next or next-but-one header (depending on verify_type) is
49 * internally consistent, then we know that the data blocks in this cluster
50 * were all written successfully.
54 #include <linux/slab.h>
/* Locally validate a cluster_head read from 'addr' (no checksum check).
 * NOTE(review): this view of the file is elided - the function's braces
 * and return statements are on lines that are not visible here, so the
 * comments below cover only the visible checks.
 */
roll_valid(struct fs *fs, struct cluster_head *ch, unsigned long long addr)
/* return 1 if the cluster_head looks locally valid.
 * Don't check checksum as we may not have the whole head
 */
/* magic identification tag must be present */
if (memcmp(ch->idtag, "LaFSHead", 8) != 0)
/* head must belong to this filesystem instance (uuid match) */
if (memcmp(fs->state->uuid, ch->uuid, 16) != 0)
/* head must record the address it was actually read from */
if (le64_to_cpu(ch->this_addr) != addr)
/* verify_type chooses whether the next or next-but-one head
 * validates this cluster's data (see file header comment) */
switch (le16_to_cpu(ch->verify_type)) {
/* a cluster can never be larger than the maximum segment */
if (le16_to_cpu(ch->Clength) > fs->max_segment)
82 * roll_locate scopes out the full extent of the required roll-forward.
83 * It starts at the start of the last checkpoint (recorded in the stateblock)
84 * and checks that the end of the checkpoint exists, and continues following
85 * the chain as far as valid cluster heads can be found.
86 * roll_locate returns 0 if proper endpoints were found,
87 * or -EIO if CheckpointStart and CheckpointEnd weren't found properly
88 * "next" will contain the address of the next cluster to be written to,
89 * "last" the cluster before that, and "seq" the seq number for next cluster
90 * "maxp" will be used to report the maximum size of a cluster head.
93 roll_locate(struct fs *fs, u64 start,
94 u64 *nextp, u64 *lastp, u64 *seqp,
95 int *maxp, struct page *p)
97 struct cluster_head *ch;
98 u64 this, prev, prev2, last, next;
101 int prevtype, prev2type;
103 ch = (struct cluster_head *)page_address(p);
105 this = start; prev = start;
107 /* First we walk through the checkpoint section, which should
111 if (lafs_load_page(fs, p, this, 1) != 0) {
112 printk(KERN_ERR "LaFS: Could not read cluster %llu\n",
113 (unsigned long long) this);
116 if (!roll_valid(fs, ch, this)) {
117 printk(KERN_ERR "LaFS: Bad cluster at %llu\n",
118 (unsigned long long) this);
122 seq = le64_to_cpu(ch->seq);
123 if (!(ch->flags & CH_CheckpointStart)) {
124 printk(KERN_ERR "LaFS: Cluster at %llu not CheckpointStart!!\n",
125 (unsigned long long)this);
128 } else if (seq != le64_to_cpu(ch->seq)) {
129 printk(KERN_ERR "LaFS: Cluster sequence bad at %llu: %llu->%llu\n",
130 (unsigned long long)this,
131 (unsigned long long)seq,
132 (unsigned long long)le64_to_cpu(ch->seq));
136 if (this != start && le64_to_cpu(ch->prev_addr) != prev) {
137 printk(KERN_ERR "LaFS: Cluster Linkage error at %llu: %llu != %llu\n",
138 (unsigned long long)this,
139 (unsigned long long)le64_to_cpu(ch->prev_addr),
140 (unsigned long long)prev);
143 if (!ch->flags & CH_Checkpoint) {
144 printk(KERN_ERR "LaFS: Cluster %llu not a Checkpoint cluster\n",
145 (unsigned long long)this);
148 dprintk("Found seq %llu at %llu\n",
149 (unsigned long long)seq, (unsigned long long)this);
150 if (le16_to_cpu(ch->Hlength) > max)
151 max = le16_to_cpu(ch->Hlength);
153 this = le64_to_cpu(ch->next_addr);
155 } while (!(ch->flags & CH_CheckpointEnd));
157 /* 'seq' is sequence number of 'this' */
158 dprintk("CheckpointEnd found at %llu, seq %llu\n", prev, seq-1);
160 /* now we need to step forward a bit more carefully, as any
161 * cluster we find now could easily be bad.
163 * this - address of cluster we are now considering
164 * prev - address of previous cluster
165 * prevtype - verify type of previous cluster
166 * prev2 - address of cluster before prev
167 * prev2type - verify type of that cluster.
168 * start - "next_addr" entry from last known-good cluster
176 prevtype = prev2type = VerifyNull;
179 if (lafs_load_page(fs, p, this, 1) != 0)
181 if (!roll_valid(fs, ch, this))
183 if (le64_to_cpu(ch->prev_addr) != prev)
185 if (le64_to_cpu(ch->seq) != seq)
188 /* this head looks valid, so we can possibly verify previous
191 if (le16_to_cpu(ch->Hlength) > max)
192 max = le16_to_cpu(ch->Hlength);
194 if (prev2type == VerifyNext2) {
198 if (prevtype == VerifyNext) {
203 /* shift prev info back */
205 prev2type = prevtype;
207 prevtype = le16_to_cpu(ch->verify_type);
208 this = le64_to_cpu(ch->next_addr);
209 if (prevtype == VerifyNull) {
216 dprintk("LaFS: Next address to write is %llu\n", next);
221 else if (next == prev)
223 else if (next == prev2)
/* Apply one 'miniblock' (atomic metadata update) during roll-forward.
 * NOTE(review): this view of the file is elided - braces, switch
 * labels, returns and several statements are on lines that are not
 * visible here, so commentary is limited to what is shown.
 */
static int __must_check
roll_mini(struct fs *fs, int fsnum, int inum, int trunc,
u32 bnum, int offset, int len, char *data)
struct inode *fsinode;
struct lafs_inode *li;
struct datablock *db = NULL;
dprintk("Roll Mini %d/%d/%lu/%d,%d\n",
fsnum, inum, (unsigned long) bnum,
/* The handling of miniblock updates is quite different for
 * inode-files: meta-data updates, including size, are allowed.
 * index update and data update are not (data update must
 * go through the file). Implied creation requires
 * regular-files: We don't create miniblocks for regular files,
 * but we might write an inode with embedded data and want
 * that data to be safe. When those inodes are found, a
 * miniblock is synthesised from the data so we need to
 * symlink,dev,pipe: as with reg-files
 * directory: add/remove entries. Each miniblock has an address and
 * identifies a name, an inode number, and one of:
 * LINK - create a link with this name to the inode
 * UNLINK - remove the link
 * REN_SOURCE - record this info against the 'address' which must
 * be unique in this checkpoint across all directories
 * REN_TARGET - The source with matching 'address' is being
 * renamed to here. So unlink the source and either create the
 * target (if inode is zero) or replace the target. This
 * miniblock could be in a different directory to the matching
 */
inode = lafs_iget_fs(fs, fsnum, inum, SYNC);
return PTR_ERR(inode);
default: /* Any unknown type is an error */
/* NOTE(review): "impossibly file type" below looks like a typo for
 * "impossible file type", but message text is runtime behaviour and
 * is deliberately left unchanged here. */
printk(KERN_WARNING "LAFS impossibly file type for roll-forward: %d\n",
printk(KERN_WARNING "LAFS: Ignoring impossible sub-subset\n");
/* NOTE(review): this lookup passes (inum, bnum) where the lookup at
 * the top of the function passes (fsnum, inum) - confirm the intended
 * argument order against the full source. */
inode = lafs_iget_fs(fs, inum, bnum, SYNC);
err = PTR_ERR(inode);
if (err != -ENOENT || offset != 0) {
lafs_iput_fs(fsinode);
db = lafs_get_block(fsinode, bnum, NULL, GFP_KERNEL,
lafs_inode_inuse(fs, fsinode, bnum);
lafs_iput_fs(fsinode);
db = ERR_PTR(-ENOMEM);
lafs_iput_fs(fsinode);
db = lafs_inode_dblock(inode, SYNC, MKREF(roll));
/* Make sure block is in-sync with inode */
lafs_inode_fillblock(inode);
/* Should normally iolock the block, but we don't
* need that during roll-forward */
set_bit(B_PinPending, &db->b.flags);
lafs_pin_dblock(db, CleanSpace);
/* copy the miniblock payload over the block contents */
buf = map_dblock(db);
memcpy(buf+offset, data, len);
unmap_dblock(db, buf);
err = lafs_import_inode(inode, db);
/* NOTE(review): (inum, bnum) argument order again - verify. */
inode = lafs_iget_fs(fs, inum, bnum, SYNC);
lafs_dirty_dblock(db);
/* 'bnum' is the handle for matching 'rename' parts.
 * 'offset' is the DIROP type
 * 'len' is 4 plus length of name.
 * data contains 4-byte inode number, then name
 */
inum = le32_to_cpu(*(u32*)data);
err = lafs_dir_roll_mini(inode, bnum, offset, inum, name, len-4);
if (bnum != 0 || offset != 0) {
/* We currently only expect update at the very start
 * So reject anything else.
 */
err = pagecache_write_begin(NULL, inode->i_mapping,
char *b = kmap_atomic(page, KM_USER0);
memcpy(b, data, len);
kunmap_atomic(b, KM_USER0);
pagecache_write_end(NULL, inode->i_mapping,
0, len, len, page, fsdata);
/* We borrow the orphan list to keep a reference on
 * this inode until all processing is finished
 * to make sure inodes that are about to get linked
 * don't get deleted early
 */
if (inode->i_nlink == 0) {
db = lafs_inode_get_dblock(inode, MKREF(roll));
list_empty(&db->orphans)) {
list_add(&db->orphans, &fs->pending_orphans);
lafs_igrab_fs(inode);
getdref(db, MKREF(roll_orphan));
putdref(db, MKREF(roll));
/* Incorporate one data block found during roll-forward into the
 * filesystem (re-index it at 'baddr' and maybe extend the file size).
 * NOTE(review): this view of the file is elided - braces, switch
 * labels, returns and several statements are on missing lines.
 */
static int __must_check
roll_block(struct fs *fs, int fsnum, int inum, int trunc,
u32 bnum, u64 baddr, int bytes, u64 tstamp, struct page *p)
struct datablock *blk = NULL;
struct lafs_inode *li;
/* We found this block during roll-forward and need to
 * include it in the filesystem.
 * If 'bytes' is 0, then this is a 'hole' and we should
 */
if (bytes == DescHole)
dprintk("Roll Block %d/%d/%lu/%llu\n",
fsnum, inum, (unsigned long) bnum,
(unsigned long long)baddr);
/* find/load the inode */
inode = lafs_iget_fs(fs, fsnum, inum, SYNC);
return PTR_ERR(inode);
dprintk("Got the inode, type %d %p size %llu\n", li->type,
inode, inode->i_size);
struct la_inode *lai;
default: /* most filetypes are simply ignored */
/* The only part of an inode that might be interesting
 * is embedded data: All metadata changes get logged
 * Further the data can only be interesting for non-directories,
 * as directory updates are also logged as miniblocks.
 * So if this is a depth==0 non-directory inode,
 * treat the data as a miniblock update.
 */
if (bytes != fs->blocksize)
err = lafs_load_page(fs, p, baddr, 1);
dprintk("inode load page err %d\n", err);
lai = (struct la_inode *)page_address(p);
mdsize = le16_to_cpu(lai->metadata_size);
if (lai->filetype >= TypeBase &&
lai->filetype != TypeDir &&
mdsize > 1 && mdsize < fs->blocksize) {
u64 sz = le64_to_cpu(lai->metadata[0].file.size);
if (sz <= fs->blocksize - mdsize)
/* NOTE(review): argument list here is partially elided;
 * verify it matches roll_mini(fs, fsnum, inum, ...) */
err = roll_mini(fs, inum, bnum, -1, 0, 0,
page_address(p) + mdsize);
/* These only get merged while in a checkpoint. */
if (fs->qphase == fs->phase)
/* merge into the file and possibly extend inode.size
 * Only extend the size if it was before this block.
 * i.e. if size was to the middle of this block, we don't
 */
dprintk("FILE type\n");
blk = lafs_get_block(inode, bnum, NULL, GFP_KERNEL,
err = lafs_find_block(blk, ADOPT);
if (blk->b.physaddr == baddr)
/* already correctly indexed */
/* extend i_size only if it was at or before the block start */
if (li->type >= TypeBase && bytes != DescHole &&
inode->i_size <= ((loff_t)bnum << inode->i_blkbits)) {
inode->i_size = ((loff_t)bnum << inode->i_blkbits) + bytes;
set_bit(I_Dirty, &LAFSI(inode)->iflags);
/* stamp times from the write-cluster's group header */
decode_time(&inode->i_mtime, tstamp);
decode_time(&inode->i_ctime, tstamp);
set_bit(I_Dirty, &LAFSI(inode)->iflags);
/* FIXME: we pretend this is a dirty, pinned block
 * so the lower-level code doesn't get confused.
 * Is this really the best approach?
 * Do I need to release some space here?
 */
set_bit(B_PinPending, &blk->b.flags); /* Don't need iolock as no io yet */
lafs_pin_dblock(blk, CleanSpace); /* cannot fail during ! ->rolled */
lafs_iolock_block(&blk->b);
/* The '1' in lafs_summary_update assumes SegRef is set, so
 */
LAFS_BUG(!test_bit(B_SegRef, &blk->b.flags), &blk->b);
lafs_summary_update(fs, blk->b.inode, blk->b.physaddr, baddr,
blk->b.physaddr = baddr;
lafs_dirty_iblock(blk->b.parent, 0);
set_bit(B_Writeback, &blk->b.flags);
lafs_iounlock_block(&blk->b);
while (lafs_add_block_address(fs, &blk->b) == 0)
/* Just like in lafs_phase_flip, there is no special
 * action required here.
 */
dprintk("Allocated block %lu to %llu\n",
(unsigned long)bnum, baddr);
lafs_writeback_done(&blk->b);
clear_bit(B_PinPending, &blk->b.flags);
/* If we had previously read this block for some reason,
 * the contents are now invalid. If they are dirty,
 * we have a real problem as those changes cannot be saved.
 */
LAFS_BUG(test_bit(B_Dirty, &blk->b.flags), &blk->b);
clear_bit(B_Valid, &blk->b.flags);
putdref(blk, MKREF(roll));
/* same orphan-list borrowing as in roll_mini: hold a reference on
 * nlink==0 inodes until all roll-forward processing is finished */
if (inode->i_nlink == 0) {
struct datablock *db = lafs_inode_get_dblock(inode, MKREF(roll));
list_empty(&db->orphans)) {
list_add(&db->orphans, &fs->pending_orphans);
lafs_igrab_fs(inode);
getdref(db, MKREF(roll_orphan));
putdref(db, MKREF(roll));
dprintk("leaving with error %d\n", err);
/* Process one write-cluster: re-read it fully, verify the checksum,
 * then walk its group_heads and descriptors, dispatching data blocks
 * to roll_block() and miniblocks to roll_mini().
 * '*addrp' is advanced to the next cluster's address.
 * NOTE(review): this view of the file is elided - braces, some
 * declarations (addr, seg, i, err, ...) and loop closings are on
 * lines that are not visible here.
 */
static int __must_check
roll_one(struct fs *fs, u64 *addrp, struct page *p, struct page *pg,
struct cluster_head *ch = (struct cluster_head *)page_address(p);
struct group_head *gh;
struct descriptor *desc;
int blocksize = fs->blocksize;
/* we "know" buf is big enough */
err = lafs_load_pages(fs, p, addr, max/blocksize);
/* just minimal checks, as we have looked at this already */
if (!roll_valid(fs, ch, addr))
if (lafs_calc_cluster_csum(ch) != ch->checksum)
/* advance caller's cursor to the next cluster */
*addrp = le64_to_cpu(ch->next_addr);
if (le16_to_cpu(ch->Hlength) > max)
/* set up segment cursor and skip over the header blocks */
lafs_seg_setpos(fs, &seg, addr);
lafs_seg_setsize(fs, &seg, le16_to_cpu(ch->Clength));
header_blocks = (le16_to_cpu(ch->Hlength) + blocksize - 1) / blocksize;
for (i = 0; i < header_blocks; i++) {
baddr = lafs_seg_next(fs, &seg);
BUG_ON(baddr != addr + i);
/* a non-checkpoint cluster ends the checkpoint phase */
if (!(ch->flags & CH_Checkpoint))
fs->qphase = fs->phase;
/* outer walk: one group_head per (fsnum, inum, trunc) group */
while (((char *)gh - (char *)ch) < le16_to_cpu(ch->Hlength)) {
int inum = le32_to_cpu(gh->inum);
int fsnum = le32_to_cpu(gh->fsnum);
int trunc = le16_to_cpu(gh->truncatenum_and_flag) & 0x7fff;
int flg = le16_to_cpu(gh->truncatenum_and_flag) & 0x8000;
u64 tstamp = le64_to_cpu(gh->timestamp);
/* inner walk: descriptors within this group */
while (((char *)desc - (char *)gh) <
le16_to_cpu(gh->group_size_words)*4) {
if (le16_to_cpu(desc->block_bytes) <= DescMiniOffset ||
le16_to_cpu(desc->block_bytes) == DescIndex) {
u32 bnum = le32_to_cpu(desc->block_num);
int cnt = le16_to_cpu(desc->block_cnt);
int bytes = le16_to_cpu(desc->block_bytes);
if (le16_to_cpu(desc->block_bytes) == DescIndex
return -EIO; /* FIXME is this
 */
/* FIXME range check count */
while (!err && cnt--) {
if (bytes != DescHole)
baddr = lafs_seg_next(fs, &seg);
if (bytes != DescHole &&
/* We have fallen off the end of
 * the write-cluster - something
 * is wrong with the header
 */
printk(KERN_WARNING "LAFS: cluster size is wrong\n");
/* index blocks are never replayed; 'flg' marks
 * blocks moved only for cleaning */
if (!flg && bytes != DescIndex)
err = roll_block(fs, fsnum, inum, trunc,
cnt == 0 || bytes == DescHole
struct miniblock *mb = (struct miniblock *)desc;
u32 bnum = le32_to_cpu(mb->block_num);
int offset = le16_to_cpu(mb->block_offset);
int len = le16_to_cpu(mb->length)
err = roll_mini(fs, fsnum, inum, trunc,
bnum, offset, len, (char *)(mb+1));
mb = (struct miniblock *)(((char*)mb)
desc = (struct descriptor *)mb;
gh = (struct group_head *)desc;
/* CheckpointEnd also flips us out of the checkpoint phase */
if (ch->flags & CH_CheckpointEnd)
fs->qphase = fs->phase;
677 static int roll_forward(struct fs *fs)
679 u64 first, next = 0, last = 0, seq = 0;
683 int blocksize = fs->blocksize;
688 struct list_head pending;
692 fs->checkpointing = CH_Checkpoint;
693 clear_bit(DelayYouth, &fs->fsstate);
695 first = fs->checkpointcluster;
696 p = alloc_pages(GFP_KERNEL, order);
700 err = roll_locate(fs, first, &next, &last, &seq, &max, p);
702 max = ((max + blocksize - 1) / blocksize) * blocksize;
704 if (!err && max > PAGE_SIZE) {
705 __free_pages(p, order);
706 order = get_order(max * blocksize);
707 p = alloc_pages(order, GFP_KERNEL);
712 __free_pages(p, order);
716 pg = alloc_page(GFP_KERNEL);
718 __free_pages(p, order);
722 err = lafs_cluster_init(fs, 0, next, last, seq);
724 __free_pages(p, order); put_page(pg);
727 lafs_cluster_init(fs, 1, 0, 0, 0);
729 virttoseg(fs, first, &dev, &seg, &offset);
731 while (first != next) {
735 virttoseg(fs, first, &dev2, &seg2, &offset);
736 err = roll_one(fs, &first, p, pg, max);
740 if (fs->qphase == fs->phase &&
742 fs->checkpointing = 0;
743 clear_bit(DelayYouth, &fs->fsstate);
744 lafs_seg_apply_all(fs);
747 if (dev2 != dev || seg2 != seg) {
748 /* New segment - need to make sure youth is correct */
751 /* if fs->checkpointing, seg_apply_all will do the youth
754 if (fs->checkpointing == 0)
755 lafs_update_youth(fs, dev, seg);
758 __free_pages(p, order);
761 lafs_add_active(fs, next);
763 /* pending_renames will normally be empty, but it is not
764 * impossible that we crashed and an awkward time. So just
765 * clean up whatever is there
767 while (fs->pending_renames != NULL) {
768 struct rename_roll *rr = fs->pending_renames;
769 fs->pending_renames = rr->next;
776 /* Now we release all the nlink==0 inodes that we found */
777 INIT_LIST_HEAD(&pending);
778 list_splice_init(&fs->pending_orphans, &pending);
779 while (!list_empty(&pending)) {
780 struct datablock *db = list_first_entry(&pending,
783 list_del_init(&db->orphans);
784 if (db->my_inode->i_nlink == 0)
785 lafs_make_orphan(fs, db);
786 lafs_iput_fs(db->my_inode);
787 putdref(db, MKREF(roll_orphan));
794 lafs_mount(struct fs *fs)
796 struct datablock *b = NULL;
797 struct inode *rootino;
798 struct inode *rootdir;
799 struct inode *aino, *oino;
803 struct sb_key *k = fs->prime_sb->s_fs_info;
807 fs->ss[0].root = rootino = iget_locked(fs->prime_sb, 0);
809 LAFSI(rootino)->filesys = rootino;
814 b = lafs_get_block(rootino, 0, NULL, GFP_KERNEL, MKREF(mount));
817 set_bit(B_Root, &b->b.flags);
818 b->b.physaddr = fs->ss[0].root_addr;
819 set_bit(B_PhysValid, &b->b.flags);
820 err = lafs_load_block(&b->b, NULL);
823 err = lafs_wait_block(&b->b);
827 err = lafs_import_inode(rootino, b);
830 putdref(b, MKREF(mount));
833 unlock_new_inode(rootino);
835 rootdir = lafs_iget(rootino, 2, SYNC);
836 err = PTR_ERR(rootdir);
839 de = d_alloc_root(rootdir);
845 fs->prime_sb->s_root = de;
847 oino = lafs_iget(rootino, 8, SYNC);
851 if (LAFSI(oino)->type != TypeOrphanList) {
857 for (d = 0; d < fs->devices ; d++) {
858 struct inode *sino = lafs_iget(rootino,
859 fs->devs[d].usage_inum,
864 if (LAFSI(sino)->type != TypeSegmentMap) {
869 fs->devs[d].segsum = sino;
871 orphan_count = lafs_count_orphans(fs->orphans);
872 LAFSI(fs->orphans)->md.orphan.nextfree = orphan_count;
874 lafs_checkpoint_lock(fs);
875 err = roll_forward(fs);
876 lafs_checkpoint_unlock(fs);
878 lafs_add_orphans(fs, fs->orphans, orphan_count);
880 for (d = 0; d < 4; d++) {
881 struct page *p = alloc_page(GFP_KERNEL);
884 fs->cleaner.seg[d].chead = p;
885 INIT_LIST_HEAD(&fs->cleaner.seg[d].cleaning);
888 aino = lafs_iget(rootino, 3, SYNC);
890 if (LAFSI(aino)->type != TypeAccessTime) {
894 LAFSI(fs->ss[0].root)->md.fs.accesstime = aino;
895 } else if (PTR_ERR(aino) != -ENOENT)
899 putdref(b, MKREF(mount));