4 * Copyright (C) 2005-2009
5 * Neil Brown <neilb@suse.de>
6 * Released under the GPL, version 2
12 #include <linux/slab.h>
/* roll_valid(): sanity-check a cluster_head read from disk at 'addr'.
 * Returns 1 if the head looks locally valid: magic tag, filesystem
 * uuid, recorded self-address, verify_type and cluster length all
 * plausible.  The checksum is deliberately NOT verified because the
 * complete head may not have been loaded yet.
 * NOTE(review): many source lines are elided in this extract; the
 * failure-return statements between these checks are not visible.
 */
15 roll_valid(struct fs *fs, struct cluster_head *ch, unsigned long long addr)
17 /* return 1 if the cluster_head looks locally valid.
18 * Don't check checksum as we may not have the whole head
/* Magic: every LaFS cluster head begins with the literal "LaFSHead". */
20 if (memcmp(ch->idtag, "LaFSHead", 8) != 0)
/* The head must belong to THIS filesystem instance (uuid match). */
22 if (memcmp(fs->state->uuid, ch->uuid, 16) != 0)
/* The head records its own address; a mismatch means a stale or
 * relocated block, not the cluster we asked for. */
24 if (le64_to_cpu(ch->this_addr) != addr)
/* Only known verification schemes are acceptable (cases elided here). */
26 switch (le16_to_cpu(ch->verify_type)) {
/* A cluster longer than a whole segment cannot be valid. */
36 if (le16_to_cpu(ch->Clength) > fs->max_segment)
42 * roll_locate returns 0 if proper endpoints were found,
43 * or -EINVAL?? if CheckpointStart and CheckpointEnd weren't found properly
44 * "next" will contain the address of the next cluster to be written to,
45 * "last" the cluster before that, and "seq" the seq number for next cluster
/* roll_locate(): find the proper end of the log.
 * Starting from 'start' (the recorded checkpoint cluster), follow the
 * chain of cluster heads, validating tag/uuid/self-address via
 * roll_valid() plus sequence-number and prev_addr linkage.
 * On success (return 0): *next is the address of the next cluster to
 * be written, *lastp the cluster before that, *seqp the sequence
 * number for *next, and *maxp the largest cluster-head length seen
 * (callers use it to size buffers).  'p' is a scratch page each head
 * is read into.
 * NOTE(review): many source lines are elided in this extract; error
 * returns and the loop openings/closings are not all visible.
 */
48 roll_locate(struct fs *fs, u64 start,
49 u64 *next, u64 *lastp, u64 *seqp,
50 int *maxp, struct page *p)
52 struct cluster_head *ch;
53 u64 this, prev, prev2, last;
56 int prevtype, prev2type;
58 ch = (struct cluster_head *)page_address(p);
/* Phase 1: walk the known-good checkpoint clusters until the cluster
 * flagged CheckpointEnd is found. */
60 this = start; prev = start;
63 if (lafs_load_page(fs, p, this, 1) != 0) {
64 printk(KERN_ERR "LaFS: Could not read cluster %llu\n",
65 (unsigned long long) this);
68 if (!roll_valid(fs, ch, this)) {
69 printk(KERN_ERR "LaFS: Bad cluster at %llu\n",
70 (unsigned long long) this);
/* The first cluster must open the checkpoint; subsequent ones must
 * keep the sequence numbers and prev_addr linkage intact. */
74 seq = le64_to_cpu(ch->seq);
75 if (!(ch->flags & CH_CheckpointStart)) {
76 printk(KERN_ERR "LaFS: Cluster at %llu not CheckpointStart!!\n",
77 (unsigned long long)this);
80 } else if (seq != le64_to_cpu(ch->seq)) {
81 printk(KERN_ERR "LaFS: Cluster sequence bad at %llu: %llu->%llu\n",
82 (unsigned long long)this,
83 (unsigned long long)seq,
84 (unsigned long long)le64_to_cpu(ch->seq));
88 if (this != start && le64_to_cpu(ch->prev_addr) != prev) {
89 printk(KERN_ERR "LaFS: Cluster Linkage error at %llu: %llu != %llu\n",
90 (unsigned long long)this,
91 (unsigned long long)le64_to_cpu(ch->prev_addr),
92 (unsigned long long)prev);
/* FIX: unary '!' binds tighter than binary '&', so the original
 * "!ch->flags & CH_Checkpoint" computed (!ch->flags) & CH_Checkpoint
 * and this check almost never fired.  Parenthesised to match the
 * CheckpointStart/CheckpointEnd tests in this same function. */
95 if (!(ch->flags & CH_Checkpoint)) {
96 printk(KERN_ERR "LaFS: Cluster %llu not a Checkpoint cluster\n",
97 (unsigned long long)this);
100 dprintk("Found seq %llu at %llu\n",
101 (unsigned long long)seq, (unsigned long long)this);
/* Track the largest head length so callers can size buffers. */
102 if (le16_to_cpu(ch->Hlength) > max)
103 max = le16_to_cpu(ch->Hlength);
105 this = le64_to_cpu(ch->next_addr);
107 } while (!(ch->flags & CH_CheckpointEnd));
109 /* 'seq' is sequence number of 'this' */
/* FIX: cast u64 arguments for %llu, as every other printk here does
 * (u64 is not unsigned long long on all architectures). */
110 dprintk("CheckpointEnd found at %llu, seq %llu\n", (unsigned long long)prev, (unsigned long long)(seq-1));
112 /* now we need to step forward a bit more carefully. as any
113 * cluster we find now could easily be bad.
115 * this - address of cluster we are now considering
116 * prev - address of previous cluster
117 * prevtype - verify type of previous cluster
118 * prev2 - address of cluster before prev
119 * prev2type - verify type of that cluster.
120 * start - "next_addr" entry from last known-good cluster
128 prevtype = prev2type = VerifyNull;
/* Phase 2: probe past the checkpoint; stop at the first cluster that
 * fails any validity, linkage or sequence test. */
131 if (lafs_load_page(fs, p, this, 1) != 0)
133 if (!roll_valid(fs, ch, this))
135 if (le64_to_cpu(ch->prev_addr) != prev)
137 if (le64_to_cpu(ch->seq) != seq)
139 /* FIXME check checksum, and possibly VerifySum */
140 /* this head looks valid, so we can possibly verify previous
143 if (le16_to_cpu(ch->Hlength) > max)
144 max = le16_to_cpu(ch->Hlength);
/* A later valid head retro-validates earlier ones, depending on the
 * verify_type each recorded. */
146 if (prev2type == VerifyNext2) {
147 start = prev; last = prev2;
149 if (prevtype == VerifyNext) {
154 /* shift prev info back */
155 prev2 = prev; prev2type = prevtype;
156 prev = this ; prevtype = le16_to_cpu(ch->verify_type);
157 this = le64_to_cpu(ch->next_addr);
158 if (prevtype == VerifyNull) {
/* FIX: cast u64 'start' for %llu (consistency / portability). */
165 dprintk("LaFS: Next address to write is %llu\n", (unsigned long long)start);
170 else if (start == prev)
172 else if (start == prev2)
/* roll_mini(): replay one "miniblock" journal entry during
 * roll-forward.  A miniblock carries 'len' bytes of data to be written
 * at byte 'offset' within block 'bnum' of inode 'inum' in filesystem
 * 'fsnum'.  Returns 0 on success or a negative errno.
 * NOTE(review): many source lines are elided in this extract; error
 * paths and several branches are not visible.
 */
180 static int __must_check
181 roll_mini(struct fs *fs, int fsnum, int inum, int trunc, int flg,
182 u32 bnum, int offset, int len, char *data)
185 struct lafs_inode *li;
186 struct datablock *db;
190 dprintk("Roll Mini %d/%d/%d/%lu/%d,%d\n",
191 fsnum, inum, flg, (unsigned long) bnum,
194 /* The handling of miniblock updates is quite different for
197 * inode-files: meta-data updates, including size, are allowed.
198 * index update and data update are not (data update must
199 * go through the file). Implied creation requires
201 * regular-files: just over-write data, possibly extending size
202 * symlink,dev,pipe: as with reg-files
203 * directory: add/remove entries.
207 return 0; /* old stuff isn't interesting, or even possible */
/* Look up the target inode; lafs_iget_fs returns an ERR_PTR on
 * failure, hence the PTR_ERR below. */
209 inode = lafs_iget_fs(fs, fsnum, inum, SYNC);
211 return PTR_ERR(inode);
218 BUG_ON(fsnum); /* FIXME should be more careful */
/* In this path the miniblock's bnum is used as an inode number —
 * presumably an update within an inode file; confirm against caller. */
220 inode = lafs_iget_fs(fs, inum, bnum, SYNC);
222 err = PTR_ERR(inode);
223 if (err == -EIO && offset == 0) {
224 /* creating new inode */
226 return PTR_ERR(inode);
/* Copy the logged bytes directly into the inode's data block, then
 * re-import so the in-memory inode matches the new block contents. */
228 db = lafs_inode_dblock(inode, SYNC, MKREF(roll));
229 buf = map_dblock(db);
230 /* FIXME do I sync the inode back to the datablock first? */
231 memcpy(buf+offset, data, len);
232 unmap_dblock(db, buf);
233 err = lafs_import_inode(inode, db);
234 /* We borrow the orphan list to keep a reference on
235 * this inode until all processing is finished,
236 * so that inodes which are about to get linked
237 * don't get deleted early (released in roll_forward).
239 if (list_empty(&db->orphans)) {
240 list_add(&db->orphans, &fs->pending_orphans);
241 lafs_igrab_fs(inode);
242 getdref(db, MKREF(roll_orphan));
244 putdref(db, MKREF(roll));
/* roll_block(): replay one whole-block journal entry during
 * roll-forward: make block 'bnum' of inode 'inum' (filesystem 'fsnum')
 * refer to the on-disk address 'baddr' where the logged copy already
 * lives, updating summaries and possibly i_size.  'type' is the
 * descriptor type (DescIndex / DescHole / data); 'p' is a scratch
 * page.  Returns 0 on success or a negative errno.
 * NOTE(review): many source lines are elided in this extract; the
 * filetype switch and several error paths are not visible.
 */
251 static int __must_check
252 roll_block(struct fs *fs, int fsnum, int inum, int trunc, int flg,
253 u32 bnum, u64 baddr, int type, struct page *p)
256 struct datablock *blk = NULL;
257 struct lafs_inode *li;
261 return 0; /* "old" blocks aren't interesting */
262 if (type == DescIndex)
263 return 0; /* index blocks aren't interesting either */
264 if (type == DescHole)
265 return 0; /* FIXME should I punch a hole here? */
267 dprintk("Roll Block %d/%d/%d/%lu/%llu\n",
268 fsnum, inum, flg, (unsigned long) bnum,
269 (unsigned long long)baddr);
271 /* find/load the inode */
272 inode = lafs_iget_fs(fs, fsnum, inum, SYNC);
274 return PTR_ERR(inode);
276 /* FIXME do I need to 'lock' the inode in any way? */
281 dprintk("Got the inode, type %d %p size %llu\n", li->type,
282 inode, inode->i_size);
285 struct la_inode *lai;
288 default: /* most filetypes are simply ignored */
292 /* The only part of an inode that might be interesting
293 * is embedded data: All metadata changes get logged
295 * Further the data can only be interesting for non-directories,
296 * as directory updates are also logged as miniblocks.
297 * So if this is a depth==0 non-directory inode,
298 * treat the data as a miniblock update.
300 err = lafs_load_page(fs, p, baddr, 1);
301 dprintk("inode load page err %d\n", err);
304 lai = (struct la_inode *)page_address(p);
305 mdsize = le16_to_cpu(lai->metadata_size);
306 if (lai->filetype >= TypeBase &&
307 lai->filetype != TypeDir &&
309 mdsize > 1 && mdsize < fs->blocksize) {
310 u64 sz = le64_to_cpu(lai->metadata[0].file.size);
311 if (sz <= fs->blocksize - mdsize)
/* Replay the embedded file data as a miniblock update. */
312 err = roll_mini(fs, inum, bnum, -1, flg, 0, 0,
314 page_address(p) + mdsize);
320 /* These only get merged while in a checkpoint. */
321 if (fs->qphase == fs->phase)
326 /* merge into the file and possibly extend inode.size
327 * Only extend the size if it was before this block.
328 * i.e. if size was to the middle of this block, we don't
331 dprintk("FILE type\n");
333 blk = lafs_get_block(inode, bnum, NULL, GFP_KERNEL,
338 err = lafs_find_block(blk, ADOPT);
341 if (blk->b.physaddr == baddr)
342 /* already correctly indexed */
345 /* FIXME do I need to dirty the inode to flush
346 * this change into the datablock?
348 if (li->type >= TypeBase &&
349 inode->i_size <= ((loff_t)bnum << inode->i_blkbits))
350 inode->i_size = ((loff_t)bnum << inode->i_blkbits) + type;
352 /* FIXME: we pretend this is a dirty, pinned block
353 * so the lower-level code doesn't get confused.
354 * Is this really the best approach?
355 * Do I need to release some space here?
357 set_bit(B_PinPending, &blk->b.flags); /* Don't need iolock as no io yet */
358 lafs_pin_dblock(blk, CleanSpace); /* cannot fail during ! ->rolled */
/* Re-point the block at the logged address and keep the segment
 * usage summary in step with the old/new addresses. */
360 lafs_iolock_block(&blk->b);
361 lafs_summary_update(fs, blk->b.inode, blk->b.physaddr, baddr,
363 blk->b.physaddr = baddr;
364 lafs_dirty_iblock(blk->b.parent, 0);
365 /* FIXME maybe set Writeback and unlock */
366 if (lafs_add_block_address(fs, &blk->b) == 0)
367 /* FIXME if the table becomes full, we have a problem... */
368 LAFS_BUG(1, &blk->b);
369 dprintk("Allocated block %lu to %llu\n",
/* FIX: cast u64 'baddr' for %llu, matching the cast used in the
 * dprintk near the top of this function. */
370 (unsigned long)bnum, (unsigned long long)baddr);
371 /* FIXME maybe clear Writeback instead */
372 lafs_iounlock_block(&blk->b);
374 clear_bit(B_PinPending, &blk->b.flags);
375 /* If we had previously read this block for some reason,
376 * the contents are now invalid. If they are dirty,
377 * we have a real problem as those changes cannot be saved.
379 LAFS_BUG(test_bit(B_Dirty, &blk->b.flags), &blk->b);
380 clear_bit(B_Valid, &blk->b.flags);
385 putdref(blk, MKREF(roll));
387 dprintk("leaving with error %d\n", err);
/* roll_one(): replay a single cluster during roll-forward.
 * Loads the cluster at *addrp into page 'p', re-validates the head,
 * then walks its list of group_heads and, within each, the descriptor
 * list, replaying whole-block entries via roll_block() and miniblock
 * entries via roll_mini().  On return *addrp has been advanced to the
 * head's next_addr.  'max' bounds the head size in bytes.
 * Returns 0 on success or a negative errno (e.g. -EIO).
 * NOTE(review): many source lines are elided in this extract.
 */
391 static int __must_check
392 roll_one(struct fs *fs, u64 *addrp, struct page *p, struct page *pg,
396 struct cluster_head *ch = (struct cluster_head *)page_address(p);
397 struct group_head *gh;
398 struct descriptor *desc;
402 int blocksize = fs->blocksize;
404 /* we "know" buf is big enough */
405 err = lafs_load_page(fs, p, addr, max/blocksize);
409 /* just minimal checks, as we have looked at this already */
410 if (!roll_valid(fs, ch, addr))
411 /* unlike roll_locate, here the full checksum is verified */
412 if (lafs_calc_cluster_csum(ch) != ch->checksum)
/* Advance the caller's cursor to the next cluster in the chain. */
414 *addrp = le64_to_cpu(ch->next_addr);
416 if (le16_to_cpu(ch->Hlength) > max)
/* Data blocks start right after the (block-aligned) head. */
419 baddr += (le16_to_cpu(ch->Hlength) + blocksize - 1) / blocksize;
421 if (!(ch->flags & CH_Checkpoint))
422 fs->qphase = fs->phase;
/* Outer loop: one group_head per (filesystem, inode) group. */
426 while (((char *)gh - (char *)ch) < le16_to_cpu(ch->Hlength)) {
428 int inum = le32_to_cpu(gh->inum);
429 int fsnum = le32_to_cpu(gh->fsnum);
/* truncatenum_and_flag packs a 15-bit truncate number + 1 flag bit. */
430 int trunc = le16_to_cpu(gh->truncatenum_and_flag) & 0x7fff;
431 int flg = le16_to_cpu(gh->truncatenum_and_flag) & 0x8000;
/* Inner loop: descriptors within this group (size in 32-bit words). */
434 while (((char *)desc - (char *)gh) <
435 le16_to_cpu(gh->group_size_words)*4) {
436 if (le16_to_cpu(desc->block_bytes) <= DescMiniOffset ||
437 le16_to_cpu(desc->block_bytes) == DescIndex) {
438 u32 bnum = le32_to_cpu(desc->block_num);
439 int cnt = le16_to_cpu(desc->block_cnt);
441 if (le16_to_cpu(desc->block_bytes) == DescIndex
443 return -EIO; /* FIXME is this
446 /* FIXME range check count */
/* Replay 'cnt' consecutive whole blocks. */
447 while (!err && cnt--) {
448 err = roll_block(fs, fsnum, inum, trunc,
451 ? le16_to_cpu(desc->block_bytes)
456 /* FIXME allow for striping */
/* Otherwise this descriptor is a miniblock: inline data follows
 * the header at (mb+1). */
459 struct miniblock *mb = (struct miniblock *)desc;
460 u32 bnum = le32_to_cpu(mb->block_num);
461 int offset = le16_to_cpu(mb->block_offset);
462 int len = le16_to_cpu(mb->length)
464 err = roll_mini(fs, fsnum, inum, trunc, flg,
465 bnum, offset, len, (char *)(mb+1));
468 mb = (struct miniblock *)(((char*)mb)
470 desc = (struct descriptor *)mb;
476 gh = (struct group_head *)desc;
/* A CheckpointEnd cluster closes the checkpoint window. */
481 if (ch->flags & CH_CheckpointEnd)
482 fs->qphase = fs->phase;
/* roll_forward(): top-level log recovery, run at mount time (called by
 * lafs_mount with the checkpoint lock held).  Locates the end of the
 * log from the recorded checkpoint cluster (roll_locate), replays each
 * cluster from there up to the write point (roll_one), then releases
 * the inodes that roll_mini() parked on fs->pending_orphans.
 * Returns 0 on success or a negative errno.
 * NOTE(review): many source lines are elided in this extract; error
 * handling between the visible statements is not shown.
 */
486 static int roll_forward(struct fs *fs)
488 u64 first, next = 0, last = 0, seq = 0;
492 int blocksize = fs->blocksize;
/* Treat everything replayed as happening inside a checkpoint until a
 * CheckpointEnd cluster is seen. */
499 fs->checkpointing = CH_Checkpoint;
500 clear_bit(DelayYouth, &fs->fsstate);
502 first = fs->checkpointcluster;
503 p = alloc_page(GFP_KERNEL);
507 err = roll_locate(fs, first, &next, &last, &seq, &max, p);
/* Round the largest head size up to a whole number of blocks. */
509 max = ((max + blocksize - 1) / blocksize) * blocksize;
511 if (!err && max > PAGE_SIZE)
518 pg = alloc_page(GFP_KERNEL);
/* Prime cluster 0 for writing at the located end of the log. */
524 err = lafs_cluster_init(fs, 0, next, last, seq);
526 put_page(p); put_page(pg);
529 lafs_cluster_init(fs, 1, 0, 0, 0);
531 virttoseg(fs, first, &dev, &seg, &offset);
/* Replay clusters one at a time; roll_one advances 'first'. */
533 while (first != next) {
537 virttoseg(fs, first, &dev2, &seg2, &offset);
538 err = roll_one(fs, &first, p, pg, max);
/* Once the replay leaves the checkpoint window, apply the deferred
 * segment updates. */
542 if (fs->qphase == fs->phase &&
544 fs->checkpointing = 0;
545 clear_bit(DelayYouth, &fs->fsstate);
546 lafs_seg_apply_all(fs);
549 if (dev2 != dev || seg2 != seg) {
550 /* New segment - need to make sure youth is correct */
553 /* if fs->checkpointing, seg_apply_all will do the youth
556 if (fs->checkpointing == 0)
557 lafs_update_youth(fs, dev, seg);
563 lafs_add_active(fs, next);
565 /* Now we release all the nlink==0 inodes that we found */
566 while (!list_empty(&fs->pending_orphans)) {
567 struct datablock *db = list_entry(fs->pending_orphans.next,
570 list_del_init(&db->orphans);
571 lafs_iput_fs(db->my_inode);
572 putdref(db, MKREF(roll_orphan));
/* lafs_mount(): build the in-memory filesystem state at mount time.
 * Loads the root index inode (block 0 of the prime superblock), sets
 * up the root directory dentry (inode 2), loads the orphan file
 * (inode 8) and each device's segment-usage file, then runs
 * roll_forward() under the checkpoint lock and registers the orphans
 * counted beforehand.
 * NOTE(review): this extract elides many lines and the function
 * appears to continue past the visible region; most error handling is
 * not visible (see the FIXME below).
 */
579 lafs_mount(struct fs *fs)
583 struct inode *rootdir;
587 struct sb_key *k = fs->prime_sb->s_fs_info;
/* Inode 0 is the root index inode; filled in by hand below. */
591 fs->ss[0].root = root = iget_locked(fs->prime_sb, 0);
595 b = lafs_get_block(root, 0, NULL, GFP_KERNEL, MKREF(mount));
598 set_bit(B_Root, &b->b.flags);
599 b->b.physaddr = fs->ss[0].root_addr;
600 set_bit(B_PhysValid, &b->b.flags);
601 err = lafs_load_block(&b->b, NULL);
604 err = lafs_wait_block(&b->b);
608 err = lafs_import_inode(root, b);
611 putdref(b, MKREF(mount));
614 unlock_new_inode(root);
615 /* FIXME lots of error checking */
/* Inode 2 is the root directory; attach it to the superblock. */
617 rootdir = lafs_iget(fs->prime_sb, 2, SYNC);
618 err = PTR_ERR(rootdir);
621 de = d_alloc_root(rootdir);
625 fs->prime_sb->s_root = de;
/* Inode 8 is the orphan file; one segusage file per device. */
627 fs->orphans = lafs_iget(fs->prime_sb, 8, SYNC);
628 for (d = 0; d < fs->devices ; d++) {
629 fs->devs[d].segsum = lafs_iget(fs->prime_sb,
630 fs->devs[d].usage_inum,
632 /* FIXME check this is a segusage file */
/* Count orphans before roll-forward so new ones can be appended. */
634 orphan_count = lafs_count_orphans(fs->orphans);
635 LAFSI(fs->orphans)->md.orphan.nextfree = orphan_count;
/* Log recovery runs with the checkpoint lock held. */
637 lafs_checkpoint_lock(fs);
638 err = roll_forward(fs);
639 lafs_checkpoint_unlock(fs);
641 lafs_add_orphans(fs, fs->orphans, orphan_count);
/* Per-cleaner-slot setup: 4 slots, each with a head page and list. */
643 for (d = 0; d < 4; d++) {
644 fs->cleaner.seg[d].chead = alloc_page(GFP_KERNEL);
645 INIT_LIST_HEAD(&fs->cleaner.seg[d].cleaning);
650 putdref(b, MKREF(mount));