
/*
 * routines to create a checkpoint for LaFS
 * fs/lafs/checkpoint.c
 * Copyright (C) 2006-2009
 * NeilBrown <neilb@suse.de>
 * Released under the GPL, version 2
 */

/* A checkpoint is a series of write clusters
 * from which roll-forward can start when the filesystem
 * is restarted.
 * By creating a new checkpoint, all write clusters before
 * the checkpoint become free to participate in cleaning.
 * Thus the motivations for creating a checkpoint are:
 * - to limit the amount of device space that roll-forward will
 *   have to scan on a restart
 * - to make more space available in a very full filesystem
 * - to allow creation of a snapshot.
 * So when the filesystem is very full a checkpoint should be
 * triggered every time a new segment is started.
 * Otherwise a checkpoint should be triggered periodically based
 * on the number of segments written - probably once for every
 * few dozen segments - experimentation and possibly tuning needed.
 *
 * Checkpoint creation is not needed for any data-integrity
 * requirements.  It is not even needed at unmount, but is
 * done at that point anyway to minimise startup time.
 *
 * Taking a checkpoint involves:
 * - stopping all cleaning and defrag activity
 * - synchronising with various threads that might want to get
 *   a number of changes into the same phase
 * - synchronising with writecluster creation on the main writecluster
 *   sequence
 * - flipping the 'phase' bit
 * - arranging that the next writecluster will start the checkpoint
 * - flushing out all blocks in the old phase, starting at the leaves
 *   and ultimately writing the root
 * - writing out any changed blocks in the segment-usage files and
 *   quota files
 * - marking the end of the checkpoint.
 *
 * The synchronisation and bit flipping can be requested from anywhere
 * that decides a checkpoint is needed.  The flush and finish-up are
 * handled by a thread that is created for the purpose.
 */
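
/*
 * Illustrative sketch only (kept in a comment so it is not compiled
 * into LaFS): the trigger policy described above might be expressed
 * along these lines, where 'very_full' is a hypothetical predicate and
 * fs->newsegments / fs->max_newsegs are the real counters consulted by
 * lafs_do_checkpoint() below:
 *
 *	if (very_full || fs->newsegments > fs->max_newsegs)
 *		set_bit(CheckpointNeeded, &fs->fsstate);
 */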

#include "lafs.h"
#include <linux/kthread.h>

#ifdef DUMP
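/*
 * Debugging helpers, compiled in only when DUMP is defined.
 * strflags() renders the flag bits of a block as a comma-separated
 * string in a static buffer, so it is not re-entrant and the result
 * is only valid until the next call.
 */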
static char *strflags(struct block *b)
{
	static char ans[200];
	ans[0] = 0;
	if (test_bit(B_Index, &b->flags))
		sprintf(ans, "Index(%d),", iblk(b)->depth);
	if (test_bit(B_Pinned, &b->flags)) {
		strcat(ans, "Pinned,");
		if (test_bit(B_Phase1, &b->flags))
			strcat(ans, "Phase1,");
		else
			strcat(ans, "Phase0,");
	}
	if (test_bit(B_PinPending, &b->flags))
		strcat(ans, "PinPending,");
	if (test_bit(B_InoIdx, &b->flags))
		strcat(ans, "InoIdx,");
	if (test_bit(B_Valid, &b->flags))
		strcat(ans, "Valid,");
	if (test_bit(B_Dirty, &b->flags))
		strcat(ans, "Dirty,");
	if (test_bit(B_Writeback, &b->flags))
		strcat(ans, "Writeback,");
	if (test_bit(B_Linked, &b->flags))
		strcat(ans, "Linked,");
	if (test_bit(B_Realloc, &b->flags))
		strcat(ans, "Realloc,");
	if (test_bit(B_Async, &b->flags))
		strcat(ans, "Async,");
	if (test_bit(B_Root, &b->flags))
		strcat(ans, "Root,");
	if (test_bit(B_SegRef, &b->flags))
		strcat(ans, "SegRef,");
	if (test_bit(B_Credit, &b->flags))
		strcat(ans, "C,");
	if (test_bit(B_ICredit, &b->flags))
		strcat(ans, "CI,");
	if (test_bit(B_NCredit, &b->flags))
		strcat(ans, "CN,");
	if (test_bit(B_NICredit, &b->flags))
		strcat(ans, "CNI,");
	if (test_bit(B_UnincCredit, &b->flags))
		strcat(ans, "UnincCredit,");
	if (test_bit(B_IOLock, &b->flags))
		strcat(ans, "IOLock,");
	if (test_bit(B_IOLockLock, &b->flags))
		strcat(ans, "IOLockLock,");
	if (test_bit(B_HaveLock, &b->flags))
		strcat(ans, "HaveLock,");
	if (test_bit(B_HaveWriteback, &b->flags))
		strcat(ans, "HaveWriteback,");
	if (test_bit(B_WriteError, &b->flags))
		strcat(ans, "WriteError,");
	if (test_bit(B_Claimed, &b->flags))
		strcat(ans, "Claimed,");
	if (test_bit(B_OnFree, &b->flags))
		strcat(ans, "OnFree,");
	if (test_bit(B_PhysValid, &b->flags))
		strcat(ans, "PhysValid,");
	if (test_bit(B_PrimaryRef, &b->flags))
		strcat(ans, "PrimaryRef,");
	if (test_bit(B_EmptyIndex, &b->flags))
		strcat(ans, "EmptyIndex,");

	if (test_bit(B_Uninc, &b->flags))
		strcat(ans, "Uninc,");
	if (test_bit(B_Orphan, &b->flags))
		sprintf(ans+strlen(ans), "Orphan(%d),", dblk(b)->orphan_slot);
	if (test_bit(B_Prealloc, &b->flags))
		strcat(ans, "Prealloc,");
	if (ans[0])
		ans[strlen(ans)-1] = 0;
	else
		strcpy(ans, "-nil-");
	return ans;
}

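/*
 * Render a one-line description of a block: address, inode number,
 * file address, physical address (if valid), refcount and flags,
 * plus pin counts and incorporation state for index blocks.  Like
 * strflags(), this uses a static buffer and is not re-entrant.
 */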
char *strblk(struct block *b)
{
	static char ans[400];
	unsigned long ino = 0;

	if (!b)
		return "(NULL block)";
	if (b->inode)
		ino = b->inode->i_ino;
	if (test_bit(B_PhysValid, &b->flags))
		sprintf(ans, "[%p]%lu/%lu(%llu)r%d%c:%s",
			b, ino, b->fileaddr,
			b->physaddr, atomic_read(&b->refcnt),
			list_empty(&b->lru) ? 'E' : 'F',
			strflags(b));
	else
		sprintf(ans, "[%p]%lu/%lu(NoPhysAddr)r%d%c:%s",
			b, ino, b->fileaddr,
			atomic_read(&b->refcnt),
			list_empty(&b->lru) ? 'E' : 'F',
			strflags(b));
	if (test_bit(B_Pinned, &b->flags) &&
	    test_bit(B_Index, &b->flags))
		sprintf(ans+strlen(ans), "{%d,%d}",
			atomic_read(&iblk(b)->pincnt[0]),
			atomic_read(&iblk(b)->pincnt[1]));
	if (test_bit(B_Index, &b->flags)) {
		sprintf(ans+strlen(ans), "[%d%s%s]",
			iblk(b)->uninc_table.pending_cnt,
			iblk(b)->uninc ? "*" : "",
			iblk(b)->uninc_next ? "+" : "");
	}
#ifdef DEBUG_IOLOCK
	if (test_bit(B_IOLock, &b->flags)) {
		sprintf(ans+strlen(ans), "<%s:%d>",
			strrchr(b->iolock_file, '/'),
			b->iolock_line);
	}
#endif
	if (!b->parent)
		sprintf(ans+strlen(ans), " NP");
#if DEBUG_REF
	{
		int i;
		for (i = 0; i < 16; i++)
			if (b->holders[i].cnt)
				sprintf(ans+strlen(ans),
					" %s(%d)", b->holders[i].name,
					b->holders[i].cnt);
	}
#endif
	return ans;
}

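/*
 * Recursively print the tree of pinned blocks below 'b', reporting
 * which internal list (leaf, clean, accounting, write-cluster) each
 * block is on.  Returns the number of allocation credits held in the
 * subtree so the caller can cross-check them against fs-wide counts.
 */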
int lafs_print_tree(struct block *b, int depth)
{
	struct indexblock *ib = iblk(b);
	int i;
	int j;
	struct block *b2;
	int credits = 0;
	struct inode *ino = NULL;

	if (depth > 20) {
		printk("... aborting at %d\n", depth);
		BUG();
		return 0;
	}

	printk("%*s", depth, "");
	printk("%s", strblk(b));
	if (!b) {
		printk("\n");
		return 0;
	}

	for (i = 0; i < 2; i++) {
		j = 0;
		list_for_each_entry(b2, &dfs->phase_leafs[i], lru) {
			if (b2 == b) {
				printk(" Leaf%d(%d) ", i, j);
				break;
			}
			j++;
		}
	}
	j = 0;
	list_for_each_entry(b2, &dfs->clean_leafs, lru) {
		if (b2 == b) {
			printk(" Clean(%d) ", j);
			break;
		}
		j++;
	}
	j = 0;
	list_for_each_entry(b2, &dfs->account_leafs, lru) {
		if (b2 == b) {
			printk(" Account(%d) ", j);
			break;
		}
		j++;
	}
	list_for_each_entry(b2, &freelist.lru, lru)
		if (b2 == b) {
			printk(" on free ");
			break;
		}
	for (i = 0; i < 2; i++) {
		int j, k;
		j = 0;
		list_for_each_entry(b2, &dfs->wc[i].clhead, lru) {
			if (b2 == b)
				printk(" wc[%d][%d] ", i, j);
			j++;
		}
		for (k = 0; k < 4; k++) {
			j = 0;
			list_for_each_entry(b2, &dfs->wc[i].pending_blocks[k], lru) {
				if (b2 == b)
					printk(" pending[%d][%d][%d] ", i, k, j);
				j++;
			}
		}
	}

	printk("\n");
	if (test_bit(B_Index, &b->flags) && ib->uninc_table.pending_cnt) {
		lafs_print_uninc(&ib->uninc_table);
		credits += ib->uninc_table.credits;
	}

	if (test_bit(B_Credit, &b->flags))
		credits++;
	if (test_bit(B_ICredit, &b->flags))
		credits++;
	if (test_bit(B_NCredit, &b->flags))
		credits++;
	if (test_bit(B_NICredit, &b->flags))
		credits++;
	if (test_bit(B_UnincCredit, &b->flags))
		credits++;
	if (test_bit(B_Dirty, &b->flags))
		credits++;
	if (test_bit(B_Realloc, &b->flags))
		credits++;

	if (test_bit(B_Index, &b->flags)) {
		list_for_each_entry(b, &ib->children, siblings) {
			LAFS_BUG(b->parent != ib, b);
			credits += lafs_print_tree(b, depth+2);
		}
	} else if ((ino = rcu_my_inode(dblk(b))) != NULL &&
		   LAFSI(ino)->iblock &&
		   LAFSI(ino)->iblock->b.parent
		) {
		LAFS_BUG(LAFSI(ino)->iblock->b.parent != b->parent,
			 b);
		credits += lafs_print_tree(&LAFSI(ino)->iblock->b,
					   depth+1);
	}
	rcu_iput(ino);
	return credits;
}

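/*
 * Dump both the data-block and index-block trees of the root inode,
 * and report the credits found in them alongside the filesystem-wide
 * allocation counters for comparison.
 */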
void lafs_dump_tree(void)
{
	int credits;
	printk("=================== Index Dump ===================\n");
	printk("inum dpth (refcnt) addr [phys] {[lk cnt]} flags\n");
	credits = lafs_print_tree(&LAFSI(dfs->ss[0].root)->dblock->b, 0);
	credits += lafs_print_tree(&LAFSI(dfs->ss[0].root)->iblock->b, 0);
	printk("====================%d/%d-%d credits ==============================\n",
	       credits, (int)dfs->allocated_blocks, dfs->cluster_updates);
}
#endif

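/*
 * Begin a checkpoint: wait for any phase-locked operations to drain,
 * then flip the phase bit and mark the checkpoint as started.  The
 * youth number that will be recorded with this checkpoint is returned
 * for finish_checkpoint().
 */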
static int prepare_checkpoint(struct fs *fs)
{
	int oldphase;
	int youth;

	spin_lock(&fs->lock);
	wait_event_lock(fs->phase_wait,
			fs->phase_locked == 0,
			fs->lock);

	oldphase = fs->phase;
	fs->phase = !oldphase;
	fs->checkpointing = CH_CheckpointStart | CH_Checkpoint;
	clear_bit(CheckpointNeeded, &fs->fsstate);
	spin_unlock(&fs->lock);

	youth = fs->youth_next - 1;
	return youth;
}

/* Get a block that should be written out.
 * If phase is '-1', then only return cleanable blocks, else
 * also return blocks for that phase.
 * The returned block is locked for IO.
 */
struct block *lafs_get_flushable(struct fs *fs, int phase)
{
	struct block *b = NULL;

retry:
	spin_lock(&fs->lock);
	if (!list_empty(&fs->clean_leafs))
		b = list_entry(fs->clean_leafs.next, struct block, lru);
	else if (phase >= 0 && !list_empty(&fs->phase_leafs[phase]))
		b = list_entry(fs->phase_leafs[phase].next, struct block, lru);
	else {
		spin_unlock(&fs->lock);
		return NULL;
	}

	getref_locked_needsync(b, MKREF(flushable));
	list_del_init(&b->lru);
	spin_unlock(&fs->lock);

	sync_ref(b);

	/* Need an iolock, but if the block is in writeback,
	 * or unpinned or otherwise not a leaf, we don't want it.
	 */
	lafs_iolock_block(b);

	if (!lafs_is_leaf(b, phase)) {
		lafs_iounlock_block(b);
		putref(b, MKREF(flushable));
		goto retry;
	}

	return b;
}

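/*
 * The main flush loop of a checkpoint: repeatedly take a flushable
 * leaf and either write it, incorporate it, flip its phase, or defer
 * it (segment-usage and quota blocks are held over until just before
 * the checkpoint ends).  Loop until the root has left the old phase,
 * flushing write clusters as needed to make progress.
 */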
static void do_checkpoint(void *data)
{
	struct fs *fs = data;
	int oldphase = !fs->phase;
	struct block *b;
	int loops = 0;
#ifdef DUMP
	dfs = fs;
#endif

	dprintk("Start Checkpoint\n");
	if (lafs_trace)
		lafs_dump_tree();
again:
	while ((b = lafs_get_flushable(fs, oldphase)) != NULL) {
		int unlock = 1;
		dprintk("Checkpoint Block %s\n", strblk(b));

		if (test_and_clear_bit(B_Async, &b->flags)) {
			/* this shouldn't normally happen, but
			 * it is possible, so check anyway
			 */
			putref(b, MKREF(async));
			lafs_wake_thread(fs);
		}

		if (!!test_bit(B_Phase1, &b->flags) != oldphase)
			/* lafs_pin_dblock flipped phase for us */;
		else if (test_bit(B_PinPending, &b->flags) &&
		    !test_bit(B_Index, &b->flags) &&
		    (LAFSI(b->inode)->type == TypeSegmentMap ||
		     LAFSI(b->inode)->type == TypeQuota)) {
			/* Handling of this block must be delayed until
			 * just before we finish the checkpoint, after
			 * the phase change has completed.
			 * Note that we don't check whether these blocks
			 * are dirty - they might not be yet - the whole
			 * point of the delay is that changes are still
			 * arriving.
			 */
			lafs_flip_dblock(dblk(b));
			/* access to account_leafs is single-threaded
			 * by the cleaner thread so no locking needed
			 */
			getref(b, MKREF(accounting));
			list_add(&b->lru, &fs->account_leafs);
		} else if (test_bit(B_Index, &b->flags) &&
		    (iblk(b)->uninc_table.pending_cnt ||
		     iblk(b)->uninc)) {
			lafs_incorporate(fs, iblk(b));
		} else if ((test_bit(B_Dirty, &b->flags) ||
			    test_bit(B_Realloc, &b->flags))) {
			lafs_cluster_allocate(b, 0);
			unlock = 0;
		} else if (test_bit(B_Index, &b->flags)) {
			if (atomic_read(&iblk(b)->pincnt[oldphase])
			    == 0) {
				lafs_phase_flip(fs, iblk(b));
				unlock = 0;
			}
		} else {
			/* Data block is still PinPending at
			 * checkpoint so must be a directory block.
			 * We want to keep PinPending until the operation
			 * completes, but need to unpin.
			 * So we drop/refile/retake.
			 * PinPending is mainly needed to keep
			 * writepage at bay and it will take iolock,
			 * so dropping PinPending while holding
			 * iolock is safe.
			 */
			LAFS_BUG(!test_bit(B_PinPending, &b->flags), b);
			clear_bit(B_PinPending, &b->flags);
			lafs_refile(b, 0);
			set_bit(B_PinPending, &b->flags);
		}

		if (unlock)
			lafs_iounlock_block(b);

		putref(b, MKREF(flushable));
	}
	if ((test_bit(B_Pinned, &LAFSI(fs->ss[0].root)->iblock->b.flags) &&
	     !!test_bit(B_Phase1, &LAFSI(fs->ss[0].root)->iblock->b.flags)
	     != fs->phase)
	    ||
	    (test_bit(B_Pinned, &LAFSI(fs->ss[0].root)->dblock->b.flags) &&
	     !!test_bit(B_Phase1, &LAFSI(fs->ss[0].root)->dblock->b.flags)
	     != fs->phase)) {
		if (loops == 20) {
			printk("Cannot escape PHASE=%d\n", oldphase);
			lafs_print_tree(&LAFSI(fs->ss[0].root)->dblock->b, 0);
			lafs_print_tree(&LAFSI(fs->ss[0].root)->iblock->b, 0);
			lafs_trace = 1;
		}

		lafs_cluster_flush(fs, 0);
		lafs_cluster_wait_all(fs);
		lafs_clusters_done(fs);
		if (loops == 20) {
			printk("After cluster_flush...\n");
			lafs_print_tree(&LAFSI(fs->ss[0].root)->dblock->b, 0);
			lafs_print_tree(&LAFSI(fs->ss[0].root)->iblock->b, 0);
			BUG();
		}
		loops++;
		goto again;
	}
	lafs_clusters_done(fs);
}

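/*
 * Write out the segment-usage and quota blocks that do_checkpoint()
 * deferred onto account_leafs, then let the quota system move to the
 * new phase.
 */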
static void flush_accounting(struct fs *fs)
{
	while (!list_empty(&fs->account_leafs)) {
		struct block *b = list_first_entry(&fs->account_leafs,
						   struct block,
						   lru);
		list_del_init(&b->lru);
		lafs_iolock_block(b);
		lafs_cluster_allocate(b, 0);
		putref(b, MKREF(accounting));
	}
	fs->qphase = fs->phase;
}

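/*
 * Complete a checkpoint: flush the deferred accounting blocks, handle
 * any snapshot-related duplication of the segment-usage table, write
 * the cluster that marks the checkpoint end, and record the new state.
 * Segment youth values are also decayed here when they approach the
 * 16-bit limit.
 */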
static void finish_checkpoint(struct fs *fs, int youth)
{
	/* Don't want to change the youth blocks while writing them out */
	set_bit(DelayYouth, &fs->fsstate);

	flush_accounting(fs);

	/* if we are creating a snapshot, special handling is needed */
	if (LAFSI(fs->ss[0].root)->md.fs.usagetable > 1) {
		/* duplicate the usage table */
		if (lafs_seg_dup(fs, LAFSI(fs->ss[0].root)->md.fs.usagetable)) {
			/* FIXME need to abort the snapshot somehow */
		}
	}

	fs->checkpointing |= CH_CheckpointEnd;
	dprintk("FinalFlush %d\n", fs->seq);
	lafs_cluster_flush(fs, 0);

	clear_bit(DelayYouth, &fs->fsstate);

	if (!test_bit(FinalCheckpoint, &fs->fsstate))
		lafs_seg_apply_all(fs);
	lafs_clean_free(fs);
	if (test_bit(EmergencyPending, &fs->fsstate)) {
		set_bit(EmergencyClean, &fs->fsstate);
		clear_bit(EmergencyPending, &fs->fsstate);
	}

	lafs_write_state(fs);
	dprintk("State written, all done %d\n", fs->seq);

	if (!test_bit(CleanerDisabled, &fs->fsstate)) {
		fs->scan.done = 0;
		if (youth > 65536 - 2 * fs->max_newsegs) {
			/* time to decay youth */
			spin_lock(&fs->lock);
			youth = decay_youth(youth);
			fs->youth_next = decay_youth(fs->youth_next);
			fs->scan.do_decay = 1;
			spin_unlock(&fs->lock);
		}
	}
	fs->cleaner.cleaning = 0;

	fs->checkpoint_youth = youth;
	fs->newsegments = 0;
	lafs_wake_thread(fs);
}

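/*
 * Called from the main LaFS thread: decide whether a checkpoint is
 * due (explicitly requested, too many new segments, or a timeout on
 * an open checkpoint), and if so run the prepare/do/finish sequence.
 * Returns the time (in jiffies) until we should be called again.
 */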
unsigned long lafs_do_checkpoint(struct fs *fs)
{
	int y;
	if (!test_bit(CheckpointNeeded, &fs->fsstate) &&
	    !test_bit(FinalCheckpoint, &fs->fsstate)) {
		/* not compulsory, but maybe it is just time */
		if (fs->cleaner.active == 0 &&
		    fs->newsegments > fs->max_newsegs)
			set_bit(CheckpointNeeded, &fs->fsstate);
		else if (test_bit(CheckpointOpen, &fs->fsstate)) {
			unsigned long j = jiffies;
			if (time_after_eq(j, fs->next_checkpoint))
				set_bit(CheckpointNeeded, &fs->fsstate);
			else
				return fs->next_checkpoint - j;
		} else
			return MAX_SCHEDULE_TIMEOUT;
	}

	if (!test_bit(CheckpointNeeded, &fs->fsstate))
		/* Must have done the final checkpoint */
		return MAX_SCHEDULE_TIMEOUT;

	if (fs->cleaner.active || !fs->scan.done)
		return MAX_SCHEDULE_TIMEOUT;

	/* Make sure all cleaner blocks are flushed: if anything lingers
	 * in the cleaner cluster, it will stop the checkpoint from
	 * making progress.
	 */
	lafs_cluster_flush(fs, 1);

	/* OK, time for some work. */
	dprintk("############################ start checkpoint\n");
	y = prepare_checkpoint(fs);

	do_checkpoint(fs);

	finish_checkpoint(fs, y);
	dprintk("############################ finish checkpoint\n");

	return MAX_SCHEDULE_TIMEOUT;
}

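/*
 * Request that a checkpoint be performed soon, and return the write
 * cluster sequence number current at the time of the request, which
 * the caller can presumably use to tell when the checkpoint has
 * reached stable storage.
 */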
unsigned long long lafs_checkpoint_start(struct fs *fs)
{
	unsigned long long cp = fs->wc[0].cluster_seq;
	WARN_ON(test_bit(FinalCheckpoint, &fs->fsstate));
	set_bit(CheckpointNeeded, &fs->fsstate);
	/* Whenever we do a checkpoint, get anyone waiting on
	 * space to check again */
	clear_bit(CleanerBlocks, &fs->fsstate);
	fs->prime_sb->s_dirt = 0;
	lafs_wake_thread(fs);
	return cp;
}

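/*
 * lafs_checkpoint_lock/unlock bracket a set of updates that must all
 * land in the same phase: while the count is elevated,
 * prepare_checkpoint() cannot flip the phase bit.  A typical caller
 * (sketch only; 'update_blocks' is a hypothetical stand-in for the
 * caller's changes) would do:
 *
 *	lafs_checkpoint_lock(fs);
 *	update_blocks(fs);
 *	lafs_checkpoint_start(fs);
 *	lafs_checkpoint_unlock(fs);
 */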
void lafs_checkpoint_lock(struct fs *fs)
{
	spin_lock(&fs->lock);
	fs->phase_locked++;
	spin_unlock(&fs->lock);
}

void lafs_checkpoint_unlock(struct fs *fs)
{
	int l;
	spin_lock(&fs->lock);
	l = --fs->phase_locked;
	spin_unlock(&fs->lock);
	if (l == 0)
		wake_up(&fs->phase_wait);
}

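/*
 * Drop the checkpoint lock and then wait for the checkpoint (which
 * the caller must already have arranged) to complete.
 */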
void lafs_checkpoint_unlock_wait(struct fs *fs)
{
	/* FIXME it is a bug if we haven't done something to make a
	 * checkpoint happen; otherwise we could wait forever.
	 * As we still hold the lock, prepare_checkpoint cannot have
	 * progressed, so CheckpointNeeded must still be set.
	 */
	BUG_ON(!test_bit(CheckpointNeeded, &fs->fsstate) &&
	       !test_bit(CleanerBlocks, &fs->fsstate));
	lafs_checkpoint_unlock(fs);

	wait_event(fs->phase_wait,
		   !test_bit(CleanerBlocks, &fs->fsstate) &&
		   !test_bit(CheckpointNeeded, &fs->fsstate) &&
		   fs->checkpointing == 0);
}

void lafs_checkpoint_wait(struct fs *fs)
{
	/* Wait until there is no checkpoint requested or happening.
	 * This is used to implement filesystem-wide sync.
	 */
	wait_event(fs->phase_wait,
		   !test_bit(CheckpointNeeded, &fs->fsstate) &&
		   fs->checkpointing == 0);
}