/*
 * routines to create a checkpoint for LaFS
 *
 * Copyright (C) 2006-2009
 * NeilBrown <neilb@suse.de>
 * Released under the GPL, version 2
 */

/* A checkpoint is a series of write clusters
 * from which roll-forward can start when the filesystem
 * is next mounted.
 * By creating a new checkpoint, all write clusters before
 * the checkpoint become free to participate in cleaning.
 * Thus the three motivations for creating a checkpoint are:
 * - to limit the amount of device space that roll-forward will
 *   have to scan on a restart
 * - to make more space available in a very full filesystem
 * - to allow creation of a snapshot.
 * So when the filesystem is very full, a checkpoint should be
 * triggered every time a new segment is started.
 * Otherwise a checkpoint should be triggered periodically based
 * on the number of segments written - probably once for every
 * few dozen segments - experimentation and possibly some tuning
 * will be needed.
 *
 * Checkpoint creation is not needed for any data-integrity
 * requirements.  It is not even needed at unmount, but is
 * done at that point anyway to minimise startup time.
 *
 * Taking a checkpoint involves:
 * - stopping all cleaning and defrag activities
 * - synchronising with various threads that might want to get
 *   a number of changes into the same phase
 * - synchronising with writecluster creation on the main writecluster
 *   queue
 * - flipping the 'phase' bit
 * - arranging that the next writecluster will start the checkpoint
 * - flushing out all blocks in the old phase, starting at the leaves
 *   and ultimately writing the root
 * - writing out any changed blocks in the segment usage files and quota
 *   files
 * - marking the end of the checkpoint.
 *
 * The synchronisation and bit flipping can be done by any thread that
 * decides a checkpoint is needed.  The flush and finish-up are handled
 * by a thread that is created for the purpose.
 */

#include <linux/kthread.h>
#include "lafs.h"	/* assumed local header: declares struct fs,
			 * struct block, the B_* flag bits, and the
			 * 'dfs' and 'freelist' objects used by the
			 * debug code below */
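
/* strflags() - render a block's flag bits as a comma-separated string
 * for debug output.  Uses a static buffer, so the result does not
 * survive a second call; debug paths only.
 */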
static char *strflags(struct block *b)
{
	static char ans[400];	/* big enough for every flag name at once */

	ans[0] = '\0';
	if (test_bit(B_Index, &b->flags))
		sprintf(ans, "Index(%d),", iblk(b)->depth);
	if (test_bit(B_Pinned, &b->flags)) {
		strcat(ans, "Pinned,");
		if (test_bit(B_Phase1, &b->flags))
			strcat(ans, "Phase1,");
		else
			strcat(ans, "Phase0,");
	}
	if (test_bit(B_PinPending, &b->flags))
		strcat(ans, "PinPending,");
	if (test_bit(B_InoIdx, &b->flags))
		strcat(ans, "InoIdx,");
	if (test_bit(B_Valid, &b->flags))
		strcat(ans, "Valid,");
	if (test_bit(B_Dirty, &b->flags))
		strcat(ans, "Dirty,");
	if (test_bit(B_Writeback, &b->flags))
		strcat(ans, "Writeback,");
	if (test_bit(B_Linked, &b->flags))
		strcat(ans, "Linked,");
	if (test_bit(B_Realloc, &b->flags))
		strcat(ans, "Realloc,");
	if (test_bit(B_Async, &b->flags))
		strcat(ans, "Async,");
	if (test_bit(B_Root, &b->flags))
		strcat(ans, "Root,");
	if (test_bit(B_SegRef, &b->flags))
		strcat(ans, "SegRef,");
	if (test_bit(B_Credit, &b->flags))
		strcat(ans, "Credit,");
	if (test_bit(B_ICredit, &b->flags))
		strcat(ans, "ICredit,");
	if (test_bit(B_NCredit, &b->flags))
		strcat(ans, "NCredit,");
	if (test_bit(B_NICredit, &b->flags))
		strcat(ans, "NICredit,");
	if (test_bit(B_UnincCredit, &b->flags))
		strcat(ans, "UnincCredit,");
	if (test_bit(B_IOLock, &b->flags))
		strcat(ans, "IOLock,");
	if (test_bit(B_IOLockLock, &b->flags))
		strcat(ans, "IOLockLock,");
	if (test_bit(B_HaveLock, &b->flags))
		strcat(ans, "HaveLock,");
	if (test_bit(B_HaveWriteback, &b->flags))
		strcat(ans, "HaveWriteback,");
	if (test_bit(B_WriteError, &b->flags))
		strcat(ans, "WriteError,");
	if (test_bit(B_Claimed, &b->flags))
		strcat(ans, "Claimed,");
	if (test_bit(B_OnFree, &b->flags))
		strcat(ans, "OnFree,");
	if (test_bit(B_PhysValid, &b->flags))
		strcat(ans, "PhysValid,");
	if (test_bit(B_PrimaryRef, &b->flags))
		strcat(ans, "PrimaryRef,");
	if (test_bit(B_EmptyIndex, &b->flags))
		strcat(ans, "EmptyIndex,");
	if (test_bit(B_Uninc, &b->flags))
		strcat(ans, "Uninc,");
	if (test_bit(B_Orphan, &b->flags))
		sprintf(ans+strlen(ans), "Orphan(%d),", dblk(b)->orphan_slot);
	if (test_bit(B_Prealloc, &b->flags))
		strcat(ans, "Prealloc,");
	if (ans[0])
		ans[strlen(ans)-1] = 0;	/* strip the trailing comma */
	else
		strcpy(ans, "-nil-");
	return ans;
}
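
/* strblk() - format a one-line description of a block: address, inode
 * number, file/physical addresses, refcount, and flags.  Shares the
 * static-buffer caveat of strflags().
 */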
char *strblk(struct block *b)
{
	static char ans[400];
	unsigned long ino = 0;
	int i;

	if (!b)
		return "(NULL block)";
	if (b->inode)
		ino = b->inode->i_ino;
	if (test_bit(B_PhysValid, &b->flags))
		sprintf(ans, "[%p]%lu/%lu(%llu)r%d%c:%s",
			b, ino, b->fileaddr,
			b->physaddr, atomic_read(&b->refcnt),
			list_empty(&b->lru) ? 'E' : 'F',
			strflags(b));
	else
		sprintf(ans, "[%p]%lu/%lu(NoPhysAddr)r%d%c:%s",
			b, ino, b->fileaddr,
			atomic_read(&b->refcnt),
			list_empty(&b->lru) ? 'E' : 'F',
			strflags(b));
	if (test_bit(B_Pinned, &b->flags) &&
	    test_bit(B_Index, &b->flags))
		sprintf(ans+strlen(ans), "{%d,%d}",
			atomic_read(&iblk(b)->pincnt[0]),
			atomic_read(&iblk(b)->pincnt[1]));
	if (test_bit(B_Index, &b->flags)) {
		sprintf(ans+strlen(ans), "[%d%s%s]",
			iblk(b)->uninc_table.pending_cnt,
			iblk(b)->uninc ? "*" : "",
			iblk(b)->uninc_next ? "+" : "");
	}
	if (test_bit(B_IOLock, &b->flags)) {
		sprintf(ans+strlen(ans), "<%s:%d>",
			strrchr(b->iolock_file, '/'),
			b->iolock_line);	/* assumed field recording
						 * where the lock was taken */
	}
	if (!b->parent)		/* assumed guard; "NP" read as "no parent" */
		sprintf(ans+strlen(ans), " NP");

	for (i = 0; i < 16; i++)
		if (b->holders[i].cnt)
			sprintf(ans+strlen(ans),
				" %s(%d)", b->holders[i].name,
				b->holders[i].cnt);
	return ans;
}
int lafs_print_tree(struct block *b, int depth)
{
	struct indexblock *ib = iblk(b);
	struct block *b2;
	struct inode *ino = NULL;
	int credits = 0;
	int i, j, k;

	if (depth > 20) {	/* assumed recursion limit */
		printk("... aborting at %d\n", depth);
		return 0;
	}

	printk("%*s", depth, "");
	printk("%s", strblk(b));

	/* note every internal list that this block appears on */
	for (i = 0; i < 2; i++) {
		j = 0;
		list_for_each_entry(b2, &dfs->phase_leafs[i], lru) {
			if (b2 == b)
				printk(" Leaf%d(%d) ", i, j);
			j++;
		}
	}
	j = 0;
	list_for_each_entry(b2, &dfs->clean_leafs, lru) {
		if (b2 == b)
			printk(" Clean(%d) ", j);
		j++;
	}
	j = 0;
	list_for_each_entry(b2, &dfs->account_leafs, lru) {
		if (b2 == b)
			printk(" Account(%d) ", j);
		j++;
	}
	list_for_each_entry(b2, &freelist.lru, lru)
		if (b2 == b)
			printk(" Free ");
	for (i = 0; i < 2; i++) {
		j = 0;
		list_for_each_entry(b2, &dfs->wc[i].clhead, lru) {
			if (b2 == b)
				printk(" wc[%d][%d] ", i, j);
			j++;
		}
		for (k = 0; k < 4; k++) {
			j = 0;
			list_for_each_entry(b2, &dfs->wc[i].pending_blocks[k],
					    lru) {
				if (b2 == b)
					printk(" pending[%d][%d][%d] ",
					       i, k, j);
				j++;
			}
		}
	}
	printk("\n");
	if (test_bit(B_Index, &b->flags) && ib->uninc_table.pending_cnt) {
		lafs_print_uninc(&ib->uninc_table);
		credits += ib->uninc_table.credits;
	}

	if (test_bit(B_Credit, &b->flags))
		credits++;
	if (test_bit(B_ICredit, &b->flags))
		credits++;
	if (test_bit(B_NCredit, &b->flags))
		credits++;
	if (test_bit(B_NICredit, &b->flags))
		credits++;
	if (test_bit(B_UnincCredit, &b->flags))
		credits++;
	if (test_bit(B_Dirty, &b->flags))
		credits++;
	if (test_bit(B_Realloc, &b->flags))
		credits++;
	if (test_bit(B_Index, &b->flags)) {
		list_for_each_entry(b, &ib->children, siblings) {
			LAFS_BUG(b->parent != ib, b);
			credits += lafs_print_tree(b, depth+2);
		}
	} else if ((ino = rcu_my_inode(dblk(b))) != NULL &&
		   LAFSI(ino)->iblock &&
		   LAFSI(ino)->iblock->b.parent) {
		LAFS_BUG(LAFSI(ino)->iblock->b.parent != b->parent,
			 &LAFSI(ino)->iblock->b);
		credits += lafs_print_tree(&LAFSI(ino)->iblock->b,
					   depth+2);
	}
	return credits;
}
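
/* lafs_dump_tree() - dump the index tree from both the data and index
 * roots and report the credit total against the filesystem's own
 * counters.
 */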
void lafs_dump_tree(void)
{
	int credits;

	printk("=================== Index Dump ===================\n");
	printk("inum dpth (refcnt) addr [phys] {[lk cnt]} flags\n");
	credits = lafs_print_tree(&LAFSI(dfs->ss[0].root)->dblock->b, 0);
	credits += lafs_print_tree(&LAFSI(dfs->ss[0].root)->iblock->b, 0);
	printk("====================%d/%d-%d credits ==============================\n",
	       credits, (int)dfs->allocated_blocks, dfs->cluster_updates);
}
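
/* prepare_checkpoint() - atomically flip the filesystem phase so that
 * new updates land in the new phase while the old phase is flushed.
 * Returns the youth value that finish_checkpoint() will record.
 */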
static int prepare_checkpoint(struct fs *fs)
{
	int oldphase;
	int youth;

	/* the phase cannot change while someone holds the phase lock */
	spin_lock(&fs->lock);
	wait_event_lock(fs->phase_wait,
			fs->phase_locked == 0,
			fs->lock);

	oldphase = fs->phase;
	fs->phase = !oldphase;
	fs->checkpointing = CH_CheckpointStart | CH_Checkpoint;
	clear_bit(CheckpointNeeded, &fs->fsstate);
	spin_unlock(&fs->lock);

	youth = fs->youth_next - 1;
	return youth;
}

/* Get a block that should be written out.
 * If phase is '-1', then only return cleanable blocks; otherwise
 * also return blocks for that phase.
 * The returned block is locked for IO.
 */
struct block *lafs_get_flushable(struct fs *fs, int phase)
{
	struct block *b;

retry:
	b = NULL;
	spin_lock(&fs->lock);
	if (!list_empty(&fs->clean_leafs))
		b = list_entry(fs->clean_leafs.next, struct block, lru);
	else if (phase >= 0 && !list_empty(&fs->phase_leafs[phase]))
		b = list_entry(fs->phase_leafs[phase].next, struct block, lru);
	if (!b) {
		spin_unlock(&fs->lock);
		return NULL;
	}
	getref_locked_needsync(b, MKREF(flushable));
	list_del_init(&b->lru);
	spin_unlock(&fs->lock);

	/* Need an iolock, but if the block is in writeback,
	 * or unpinned or otherwise not a leaf, we don't want it.
	 */
	lafs_iolock_block(b);
	if (!lafs_is_leaf(b, phase)) {
		lafs_iounlock_block(b);
		putref(b, MKREF(flushable));
		goto retry;	/* pick another block */
	}
	return b;
}
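
/* do_checkpoint() - the flush pass of a checkpoint: every block still
 * in the old phase is written, incorporated, or phase-flipped until
 * the root blocks have left that phase.  Runs in the thread created
 * for the purpose (see the header comment).
 */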
static void do_checkpoint(void *data)
{
	struct fs *fs = data;
	int oldphase = !fs->phase;
	struct block *b;

	dprintk("Start Checkpoint\n");

	while ((b = lafs_get_flushable(fs, oldphase)) != NULL) {

		dprintk("Checkpoint Block %s\n", strblk(b));

		if (test_and_clear_bit(B_Async, &b->flags)) {
			/* this shouldn't normally happen, but
			 * it is possible, so check anyway
			 */
			putref(b, MKREF(async));
			lafs_wake_thread(fs);
		}
		if (!!test_bit(B_Phase1, &b->flags) != oldphase)
			/* lafs_pin_dblock flipped the phase for us */;
		else if (test_bit(B_PinPending, &b->flags) &&
			 !test_bit(B_Index, &b->flags) &&
			 (LAFSI(b->inode)->type == TypeSegmentMap ||
			  LAFSI(b->inode)->type == TypeQuota)) {
			/* Need to delay handling of this block until
			 * the phase change has finished, i.e. until just
			 * before we finish the checkpoint.
			 * Note that we don't check if these blocks are
			 * dirty - they might not be yet - the whole point
			 * of the delay is that changes are still arriving.
			 */
			lafs_flip_dblock(dblk(b));
			/* access to account_leafs is single-threaded
			 * by the cleaner thread so no locking needed
			 */
			getref(b, MKREF(accounting));
			list_add(&b->lru, &fs->account_leafs);
		} else if (test_bit(B_Index, &b->flags) &&
			   (iblk(b)->uninc_table.pending_cnt ||
			    iblk(b)->uninc)) {
			lafs_incorporate(fs, iblk(b));
		} else if (test_bit(B_Dirty, &b->flags) ||
			   test_bit(B_Realloc, &b->flags)) {
			lafs_cluster_allocate(b, 0);
		} else if (test_bit(B_Index, &b->flags)) {
			if (atomic_read(&iblk(b)->pincnt[oldphase]) == 0)
				lafs_phase_flip(fs, iblk(b));
		} else {
			/* A data block that is still PinPending at
			 * checkpoint time must be a directory block.
			 * We want to keep PinPending until the operation
			 * completes, but need to unpin.
			 * So we drop/refile/retake.
			 * PinPending is mainly needed to keep
			 * writepage at bay, and writepage takes the
			 * iolock, so dropping PinPending while holding
			 * the iolock is safe.
			 */
			LAFS_BUG(!test_bit(B_PinPending, &b->flags), b);
			clear_bit(B_PinPending, &b->flags);
			lafs_refile(b, 0);	/* assumed: refile moves the
						 * block to the new phase */
			set_bit(B_PinPending, &b->flags);
		}

		lafs_iounlock_block(b);
		putref(b, MKREF(flushable));
	}
	/* By now both root blocks should have escaped the old phase */
	if ((test_bit(B_Pinned, &LAFSI(fs->ss[0].root)->iblock->b.flags) &&
	     !!test_bit(B_Phase1, &LAFSI(fs->ss[0].root)->iblock->b.flags)
	     == oldphase) ||
	    (test_bit(B_Pinned, &LAFSI(fs->ss[0].root)->dblock->b.flags) &&
	     !!test_bit(B_Phase1, &LAFSI(fs->ss[0].root)->dblock->b.flags)
	     == oldphase)) {
		printk("Cannot escape PHASE=%d\n", oldphase);
		lafs_print_tree(&LAFSI(fs->ss[0].root)->dblock->b, 0);
		lafs_print_tree(&LAFSI(fs->ss[0].root)->iblock->b, 0);

		/* try pushing the clusters out, then show what changed */
		lafs_cluster_flush(fs, 0);
		lafs_cluster_wait_all(fs);
		lafs_clusters_done(fs);

		printk("After cluster_flush...\n");
		lafs_print_tree(&LAFSI(fs->ss[0].root)->dblock->b, 0);
		lafs_print_tree(&LAFSI(fs->ss[0].root)->iblock->b, 0);
	}

	lafs_clusters_done(fs);
}
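
/* flush_accounting() - write out the segment-usage and quota blocks
 * whose handling was delayed during the phase flush, then move the
 * quota phase forward.
 */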
static void flush_accounting(struct fs *fs)
{
	while (!list_empty(&fs->account_leafs)) {
		struct block *b = list_first_entry(&fs->account_leafs,
						   struct block, lru);
		list_del_init(&b->lru);
		lafs_iolock_block(b);
		lafs_cluster_allocate(b, 0);
		putref(b, MKREF(accounting));
	}
	fs->qphase = fs->phase;
}
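
/* finish_checkpoint() - write the accounting blocks, mark the end of
 * the checkpoint in the cluster stream, record the new stable state,
 * and re-enable cleaning.
 */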
static void finish_checkpoint(struct fs *fs, int youth)
{
	/* Don't want the youth values changing while we write them out */
	set_bit(DelayYouth, &fs->fsstate);

	flush_accounting(fs);

	/* if we are creating a snapshot, special handling is needed */
	if (LAFSI(fs->ss[0].root)->md.fs.usagetable > 1) {
		/* duplicate the usage table */
		if (lafs_seg_dup(fs, LAFSI(fs->ss[0].root)->md.fs.usagetable)) {
			/* FIXME need to abort the snapshot somehow */
		}
	}

	fs->checkpointing |= CH_CheckpointEnd;
	dprintk("FinalFlush %d\n", fs->seq);
	lafs_cluster_flush(fs, 0);

	clear_bit(DelayYouth, &fs->fsstate);

	if (!test_bit(FinalCheckpoint, &fs->fsstate))
		lafs_seg_apply_all(fs);

	if (test_bit(EmergencyPending, &fs->fsstate)) {
		set_bit(EmergencyClean, &fs->fsstate);
		clear_bit(EmergencyPending, &fs->fsstate);
	}

	lafs_write_state(fs);
	dprintk("State written, all done %d\n", fs->seq);

	if (!test_bit(CleanerDisabled, &fs->fsstate)) {
		if (youth > 65536 - 2 * fs->max_newsegs) {
			/* time to decay youth */
			spin_lock(&fs->lock);
			youth = decay_youth(youth);
			fs->youth_next = decay_youth(fs->youth_next);
			fs->scan.do_decay = 1;
			spin_unlock(&fs->lock);
		}
		fs->cleaner.cleaning = 0;
	}
	fs->checkpoint_youth = youth;

	lafs_wake_thread(fs);
}
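
/* lafs_do_checkpoint() - called from the filesystem thread: decide
 * whether a checkpoint is due, run it if so, and return how long
 * (in jiffies) the thread may sleep before checking again.
 */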
unsigned long lafs_do_checkpoint(struct fs *fs)
{
	int y;

	if (!test_bit(CheckpointNeeded, &fs->fsstate) &&
	    !test_bit(FinalCheckpoint, &fs->fsstate)) {
		/* not compulsory, but maybe it is just time */
		if (fs->cleaner.active == 0 &&
		    fs->newsegments > fs->max_newsegs)
			set_bit(CheckpointNeeded, &fs->fsstate);
		else if (test_bit(CheckpointOpen, &fs->fsstate)) {
			unsigned long j = jiffies;
			if (time_after_eq(j, fs->next_checkpoint))
				set_bit(CheckpointNeeded, &fs->fsstate);
			else
				return fs->next_checkpoint - j;
		} else
			return MAX_SCHEDULE_TIMEOUT;
	}

	if (!test_bit(CheckpointNeeded, &fs->fsstate))
		/* Must have done the final checkpoint already */
		return MAX_SCHEDULE_TIMEOUT;

	if (fs->cleaner.active || !fs->scan.done)
		return MAX_SCHEDULE_TIMEOUT;

	/* Make sure all cleaner blocks are flushed: anything lingering
	 * in the cleaner cluster would stop the checkpoint from making
	 * progress.
	 */
	lafs_cluster_flush(fs, 1);

	/* OK, time for some work. */
	dprintk("############################ start checkpoint\n");
	y = prepare_checkpoint(fs);
	do_checkpoint(fs);
	finish_checkpoint(fs, y);
	dprintk("############################ finish checkpoint\n");

	return MAX_SCHEDULE_TIMEOUT;
}
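
/* lafs_checkpoint_start() - request a checkpoint.  Returns the current
 * cluster sequence number, so a caller can later tell whether the
 * checkpoint has covered everything written before this call.
 */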
unsigned long long lafs_checkpoint_start(struct fs *fs)
{
	unsigned long long cp = fs->wc[0].cluster_seq;

	WARN_ON(test_bit(FinalCheckpoint, &fs->fsstate));
	set_bit(CheckpointNeeded, &fs->fsstate);
	/* Whenever we do a checkpoint, get anyone waiting on
	 * space to check again */
	clear_bit(CleanerBlocks, &fs->fsstate);
	fs->prime_sb->s_dirt = 0;
	lafs_wake_thread(fs);
	return cp;
}
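
/* lafs_checkpoint_lock()/_unlock() - hold the current phase stable so
 * that a group of related changes all land on the same side of a
 * checkpoint.  A sketch of the intended calling pattern (the caller
 * code is illustrative, not from this file):
 *
 *	lafs_checkpoint_lock(fs);
 *	... apply a group of updates that must share a phase ...
 *	lafs_checkpoint_start(fs);
 *	lafs_checkpoint_unlock_wait(fs);
 */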
void lafs_checkpoint_lock(struct fs *fs)
{
	spin_lock(&fs->lock);
	fs->phase_locked++;
	spin_unlock(&fs->lock);
}

void lafs_checkpoint_unlock(struct fs *fs)
{
	int l;

	spin_lock(&fs->lock);
	l = --fs->phase_locked;
	spin_unlock(&fs->lock);
	if (l == 0)
		wake_up(&fs->phase_wait);
}
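
/* lafs_checkpoint_unlock_wait() - drop the phase lock, then wait for
 * the checkpoint (which the caller must already have requested) to
 * complete.
 */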
void lafs_checkpoint_unlock_wait(struct fs *fs)
{
	/* FIXME it is a bug if we haven't done something to make a
	 * checkpoint happen, as otherwise we could wait forever.
	 * As we still hold the lock, prepare_checkpoint cannot
	 * have progressed, so CheckpointNeeded must be set.
	 */
	BUG_ON(!test_bit(CheckpointNeeded, &fs->fsstate) &&
	       !test_bit(CleanerBlocks, &fs->fsstate));
	lafs_checkpoint_unlock(fs);

	wait_event(fs->phase_wait,
		   !test_bit(CleanerBlocks, &fs->fsstate) &&
		   !test_bit(CheckpointNeeded, &fs->fsstate) &&
		   fs->checkpointing == 0);
}

void lafs_checkpoint_wait(struct fs *fs)
{
	/* Wait until there is no checkpoint requested or happening.
	 * This is used to implement filesystem-wide sync.
	 */
	wait_event(fs->phase_wait,
		   !test_bit(CheckpointNeeded, &fs->fsstate) &&
		   fs->checkpointing == 0);
}