/*
 * Copyright (C) 2005-2010
 * Neil Brown <neilb@suse.de>
 * Released under the GPL, version 2
 */
/*
 * Given a block that is in a segment that is being cleaned, mark
 * and pin it so that it gets cleaned.
 * This is written to cope with failure when allocating space, but in
 * the current design, that should never happen.
 */
static int mark_cleaning(struct block *b)
{
	int err;
	int credits;

	if (test_bit(B_Realloc, &b->flags))
		/* As cleaning is single threaded,
		 * we cannot race with another
		 * pin_dblock_clean here - the
		 * previous attempt must have succeeded.
		 */
		return 0;
	err = lafs_reserve_block(b, CleanSpace);
	if (err)
		return err;
	LAFS_BUG(!test_bit(B_Valid, &b->flags), b);
	credits = lafs_space_alloc(fs_from_inode(b->inode), 2, CleanSpace);
	if (!credits)
		return -EAGAIN;
	if (!test_and_set_bit(B_Realloc, &b->flags))
		credits--;
	if (test_bit(B_Dirty, &b->flags)) {
		/* If dirty, then now that it is pinned,
		 * it will get written, so we don't need
		 * to reallocate it.
		 */
		if (test_and_clear_bit(B_Realloc, &b->flags))
			credits++;
	}
	if (!test_and_set_bit(B_UnincCredit, &b->flags))
		credits--;
	lafs_space_return(fs_from_inode(b->inode), credits);
	lafs_refile(&b->parent->b, 0);
	return 0;
}
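
/*
 * Illustrative sketch of how this is used - it mirrors the real call
 * site in cleaner_process() below, using only names from this file:
 * the cleaner finds the first of a candidate block and its ancestors
 * that is still in the victim segment, and pins that one for
 * relocation.
 *
 *	cb = first_in_seg(&b->b, fs, tc->dev, tc->seg, MKREF(clean2));
 *	if (cb != NULL)
 *		err = mark_cleaning(cb);
 */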
/*
 * Find the first block among this one and its ancestors which
 * is in the nominated segment - if any is.
 * As the cluster header does not differentiate between
 * index blocks of different depths, we need to check them all.
 */
static struct block *first_in_seg(struct block *b, struct fs *fs,
				  int dev, u32 seg, REFARG)
{
	struct address_space *as = &b->inode->i_data;
	struct block *p;

	if (in_seg(fs, dev, seg, b->physaddr))
		return getref(b, REF);

	spin_lock(&as->private_lock);
	for (p = &(b->parent)->b;
	     p && !in_seg(fs, dev, seg, p->physaddr);
	     p = &(p->parent)->b) {
		if (test_bit(B_InoIdx, &p->flags)) {
			struct datablock *db = LAFSI(p->inode)->dblock;

			spin_unlock(&as->private_lock);
			as = &db->b.inode->i_data;
			spin_lock(&as->private_lock);
		}
	}
	if (p && test_bit(B_InoIdx, &p->flags)) {
		struct datablock *db = LAFSI(p->inode)->dblock;
		spin_unlock(&as->private_lock);
		as = &db->b.inode->i_data;
		spin_lock(&as->private_lock);
	}
	if (p)
		getref_locked(p, REF);
	spin_unlock(&as->private_lock);
	return p;
}
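
/*
 * in_seg() is not part of this excerpt.  As a reading aid, a minimal
 * sketch of the test it presumably performs - an assumption, not the
 * real helper: an address is in the segment if it falls inside the
 * segment's range of addresses on its device.
 */
static inline int in_seg_sketch(struct fs *fs, int dev, u32 seg, u64 addr)
{
	u64 start = segtovirt(fs, dev, seg);	/* first address in segment */

	return addr >= start &&
	       addr < start + fs->devs[dev].segment_size;
}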
/* To 'flush' the cleaner, anything on fs->clean_leafs needs
 * to be either allocated to a cleaning cluster or incorporated then added.
 * If the list becomes empty we flush out the cleaning cluster and return.
 * The next time the cleaner runs it will come back here and do
 * some more flushing.
 */
static void cleaner_flush(struct fs *fs)
{
	struct block *b;
	int err;

	dprintk("Start cleaner_flush\n");
	while ((b = lafs_get_flushable(fs, -1)) != NULL) {
		dprintk("cleaning %s\n", strblk(b));

		if (test_bit(B_PinPending, &b->flags)) {
			/* Cannot safely clean this now. Just mark
			 * it Dirty (it probably will be soon anyway)
			 * so it gets written to the new-data segment,
			 * which will effectively clean it.
			 */
			if (!test_and_set_bit(B_Dirty, &b->flags))
				if (!test_and_clear_bit(B_Realloc, &b->flags))
					if (!test_and_clear_bit(B_Credit, &b->flags))
						LAFS_BUG(1, b);
		}
		if (test_bit(B_Dirty, &b->flags)) {
			/* Ignore this, checkpoint will take it */
			if (test_and_clear_bit(B_Realloc, &b->flags))
				if (test_and_set_bit(B_Credit, &b->flags))
					lafs_space_return(fs, 1);
		} else if (test_bit(B_Index, &b->flags) &&
			   iblk(b)->uninc_table.pending_cnt) {
			lafs_incorporate(fs, iblk(b));
		} else
			err = lafs_cluster_allocate(b, 1);
		lafs_iounlock_block(b);
		putref(b, MKREF(flushable));
	}
	lafs_cluster_flush(fs, 1);
}
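
/*
 * The '1' passed to lafs_cluster_allocate() and lafs_cluster_flush()
 * above is taken to name the cleaner's own write cluster, as distinct
 * from the cluster used for new data - an assumption from usage here,
 * not a definition given in this file.
 */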
/*
 * Load the next cluster header for this segment and perform
 * some validity checks. If there is a failure, simply
 * update the 'tc' state so that it looks like there are
 * no more clusters to find.
 */
static void cleaner_load(struct fs *fs, struct toclean *tc)
{
	int err = lafs_load_page_async(fs, tc->chead,
				       tc->haddr);

	BUG_ON(err && err != -EAGAIN && err != -EIO);
	if (err == -EAGAIN)
		return;		/* read still in flight - come back later */
	if (err == -EIO)
		//printk("CLEANER got IO error !!\n");
		// FIXME adjust youth so as not to touch this again
		goto bad;
	tc->ch = page_address(tc->chead);
	if (memcmp(tc->ch->idtag, "LaFSHead", 8) != 0)
		goto bad;
	if (memcmp(tc->ch->uuid, fs->state->uuid, 16) != 0)
		goto bad;
	if (!tc->seq)	/* first header in this segment */
		tc->seq = le64_to_cpu(tc->ch->seq);
	if (tc->seq != le64_to_cpu(tc->ch->seq)) {
		printk("Bad seq number\n");
		goto bad;
	}
	if (lafs_calc_cluster_csum(tc->ch) != tc->ch->checksum)
		//printk("Cluster header checksum is wrong!!\n");
		goto bad;
	dprintk("try_clean: got header %d\n", (int)tc->haddr);
	return;
bad:
	tc->ch = NULL;	/* look like there are no more clusters to find */
	tc->have_addr = 0;
}
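
/*
 * Validation summary, inferred from the checks above: a segment can
 * still contain headers from an earlier life of the log, so a header
 * is only trusted if the magic tag, the filesystem uuid, the expected
 * sequence number and the checksum all match.  On any mismatch the
 * cleaner simply behaves as if the segment had no more clusters.
 */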
/* Parse a cluster header and identify blocks that might need cleaning.
 * Each time through we start parsing from the start.
 * As we find blocks, or reject inodes, we update the header so that
 * the next time through we don't try those again.
 * Once we have started IO on 16 different inodes, we take a break
 * and let some of the IO complete.
 * As we find blocks, we put them on a list to be processed later.
 */
static void cleaner_parse(struct fs *fs, struct toclean *tc)
{
	u32 bnum, inum, fsnum;
	u16 bcnt, trunc, itrunc;
	u64 next;
	struct datablock *b;
	struct inode *ino = NULL;

	/* Always start at the beginning - we invalidate entries
	 * that we have finished with.
	 */
	tc->gh = tc->ch->groups;
	tc->desc = tc->gh->u.desc;

	/* Load the index block for each described data or index block.
	 * For data blocks, get the address and possibly load the
	 * block itself.
	 * As blocks are loaded, they will be checked for membership
	 * in a current cleaning-segment and flagged for reallocation.
	 */
	while (1) {
		if ((((char *)tc->gh) - (char *)tc->ch)
		    >= le16_to_cpu(tc->ch->Hlength)) {
			/* Finished with that cluster, try another. */
			/* Need to come back later */
			next = le64_to_cpu(tc->ch->next_addr);
			if (next > tc->haddr &&
			    in_seg(fs, tc->dev, tc->seg, next)) {
				tc->haddr = next;
				tc->ch = NULL;	/* reload at the new address */
			} else
				tc->have_addr = 0;	/* segment finished */
			lafs_wake_thread(fs);
			break;
		}
		if (((((char *)tc->desc) - (char *)tc->gh) + 3) / 4
		    >= le16_to_cpu(tc->gh->group_size_words)) {
			/* Finished with that group, try another */
			/* FIXME what if the group has padding at the end?
			 * This might be fixed, but we need to be certain
			 * of all the possibilities. */
			BUG_ON(le16_to_cpu(tc->gh->group_size_words) == 0);
			tc->gh = (struct group_head *)(((char *)tc->gh) +
				 le16_to_cpu(tc->gh->group_size_words) * 4);
			tc->desc = tc->gh->u.desc;
			continue;
		}
		if (le16_to_cpu(tc->desc->block_bytes) > DescMiniOffset &&
		    le16_to_cpu(tc->desc->block_bytes) != DescIndex) {
			/* This is a miniblock, skip it. */
			int len = le16_to_cpu(tc->desc->block_bytes)
				- DescMiniOffset;
			tc->desc = (struct descriptor *)
				((char *)(tc->desc + 1) + round_up(len, 4));
			continue;
		}
		/* Ok, desc seems to be a valid descriptor in this group */
		/* Try to load the index info for block_num in inode in filesys.
		 */
		// FIXME track phys block number for comparison
		// FIXME try to optimise out based on youth and snapshot age
		bnum = le32_to_cpu(tc->desc->block_num);
		bcnt = le16_to_cpu(tc->desc->block_cnt);
		if (bcnt == 0) {
			/* This descriptor is finished with. */
			tc->desc++;
			continue;
		}
		inum = le32_to_cpu(tc->gh->inum);
		fsnum = le32_to_cpu(tc->gh->fsnum);
		trunc = le16_to_cpu(tc->gh->truncatenum_and_flag) & 0x7fff;

		if (inum == 0xFFFFFFFF &&
		    fsnum == 0xFFFFFFFF) {
			/* This group has been invalidated - skip it. */
			tc->gh = (struct group_head *)(((char *)tc->gh) +
				 le16_to_cpu(tc->gh->group_size_words) * 4);
			tc->desc = tc->gh->u.desc;
			continue;
		}
		dprintk("Cleaner looking at %d/%d %d+%d (%d)\n",
			(int)fsnum, (int)inum, (int)bnum, (int)bcnt,
			(int)le16_to_cpu(tc->desc->block_bytes));

		if (fsnum == 0 && inum == 0 && bnum == 0) {
			/* Nothing here needs cleaning. */
			tc->desc++;
			continue;
		}
		if (ino &&
		    (ino->i_ino != inum ||
		     LAFSI(ino)->filesys->i_ino != fsnum)) {
			lafs_iput_fs(ino);
			ino = NULL;
		}
		if (ino == NULL) {
			struct inode *fsino =
				lafs_iget_fs(fs, 0, fsnum, ASYNC);
			if (IS_ERR(fsino))
				ino = fsino;
			else if (LAFSI(fsino)->md.fs.creation_age > tc->seq) {
				/* Skip this inode as the filesystem
				 * was created after this cluster was
				 * written, so nothing here can be live.
				 */
				lafs_iput_fs(fsino);
				tc->gh->inum = 0xFFFFFFFF;
				tc->gh->fsnum = 0xFFFFFFFF;
				continue;
			} else {
				lafs_iput_fs(fsino);
				ino = lafs_iget_fs(fs, fsnum, inum, ASYNC);
			}
		}
		if (!IS_ERR(ino)) {
			dprintk("got the inode\n");
			/* Minor optimisation for files that have shrunk */
			/* Actually this is critical for handling truncation
			 * properly. We don't want to even 'get' a block beyond
			 * EOF, certainly not after the truncate_inode_pages.
			 */
			if (LAFSI(ino)->type == 0 ||
			    (LAFSI(ino)->type >= TypeBase &&
			     ((loff_t)bnum << ino->i_blkbits) >= i_size_read(ino))) {
				/* skip the whole descriptor */
				tc->desc++;
				continue;
			}
			itrunc = ((ino->i_generation << 8) |
				  (LAFSI(ino)->trunc_gen & 0xff)) & 0x7fff;
			if (itrunc != trunc) {
				/* The file has been truncated or replaced
				 * since this cluster was written - skip it.
				 */
				tc->desc++;
				continue;
			}
			b = lafs_get_block(ino, bnum, NULL, GFP_NOFS,
					   MKREF(cleaning));
			if (!b)
				break;
			if (!test_and_set_bit(B_Cleaning, &b->b.flags)) {
				getdref(b, MKREF(cleaning));
				if (LAFSI(ino)->type == TypeInodeFile ||
				    LAFSI(ino)->type == TypeDir) {
					/* Could become an orphan just now, so
					 * we need to protect b->cleaning.
					 */
					spin_lock(&fs->lock);
					list_move_tail(&b->cleaning,
						       &tc->cleaning);
					spin_unlock(&fs->lock);
				} else {
					/* No locking needed */
					if (list_empty(&b->cleaning))
						list_add_tail(&b->cleaning,
							      &tc->cleaning);
				}
			}
			/* We can race with truncate here, so we need to check
			 * i_size again now that b->cleaning is non-empty.
			 * The thread doing the truncate will have to lock
			 * the page holding this block, which should be enough
			 * of a barrier so that if it sets i_size after now,
			 * it will see that b->cleaning is non-empty.
			 * We need to be sure that if it sets i_size before now,
			 * we get to see the new i_size.
			 * So I think a memory barrier is a good idea...
			 */
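			/* The barrier argued for above.  Assumed to be a
			 * full smp_mb(), pairing with the page lock taken
			 * on the truncate side. */
			smp_mb();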
			if (LAFSI(ino)->type == 0 ||
			    (LAFSI(ino)->type >= TypeBase &&
			     ((loff_t)bnum << ino->i_blkbits)
			     >= i_size_read(ino))) {
				list_del_init(&b->cleaning);
				if (test_and_clear_bit(B_Cleaning, &b->b.flags))
					putdref(b, MKREF(cleaning));
			}
			putdref(b, MKREF(cleaning));
		} else {
			int err = PTR_ERR(ino);
			ino = NULL;
			dprintk("iget gives error %d\n", err);
			if (err == -EAGAIN) {
				/* The inode is still being read in; take a
				 * break and let the IO complete.
				 */
				break;
			}
			/* The inode was not found - make sure we never
			 * try this group again.
			 */
			tc->gh->inum = 0xFFFFFFFF;
			tc->gh->fsnum = 0xFFFFFFFF;
			continue;
		}
		/* We modify the descriptor in-place to track where
		 * we are up to. This is a private copy. The real
		 * descriptor doesn't change.
		 */
		tc->desc->block_num = cpu_to_le32(bnum + 1);
		tc->desc->block_cnt = cpu_to_le16(bcnt - 1);
	}
	if (ino && !IS_ERR(ino))
		lafs_iput_fs(ino);
}
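
/*
 * For reference, the on-disk layout cleaner_parse() walks, as inferred
 * from the code above (the field lists are not exhaustive):
 *
 *	struct cluster_head		idtag, uuid, seq, Hlength, next_addr
 *	  struct group_head groups[]	inum, fsnum, truncatenum_and_flag,
 *					group_size_words
 *	    struct descriptor u.desc[]	block_num, block_cnt, block_bytes
 *
 * Each group head covers the descriptors for one inode of one
 * filesystem; each descriptor covers a run of block_cnt blocks
 * starting at block_num.
 */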
/* Process all blocks that have been found to possibly need to be
 * moved from their current address (i.e. cleaned).
 * We initiate an async index lookup, then if the block really
 * is in the target segment we initiate an async read. Once
 * that is complete we mark the block for cleaning and
 * pin it.
 */
static int cleaner_process(struct fs *fs, struct toclean *tc)
{
	struct datablock *b, *tmp;
	int rv = 1;

	dprintk("start processing list\n");
	list_for_each_entry_safe(b, tmp, &tc->cleaning, cleaning) {
		struct block *cb = NULL;
		int err = lafs_find_block_async(b);
		dprintk("find_async gives %d %s\n", err, strblk(&b->b));
		if (err == -EAGAIN) {
			rv = 0;		/* must come back later */
			continue;
		}
		if (err) {
			/* Eeek, what do I do?? */
			continue;
		}
		cb = first_in_seg(&b->b, fs, tc->dev, tc->seg, MKREF(clean2));
		if (cb == NULL) {
			/* Moved, don't want this. */
			dprintk("Not in seg\n");
			goto done;
		}
		err = lafs_load_block(cb, NULL);
		if (err == 0)
			err = lafs_wait_block_async(cb);
		if (err == -EAGAIN) {
			putref(cb, MKREF(clean2));
			rv = 0;
			continue;
		}
		err = mark_cleaning(cb);
		dprintk("Want to clean %s (%d)\n", strblk(cb), err);
		if (err)
			rv = err;	/* need a flush to make progress */
		/* as cb is B_Pinned, it holds an effective
		 * ref on the inode, so it is safe to drop our
		 * ref on the datablock.
		 */
done:
		clear_bit(B_Cleaning, &b->b.flags);
		list_del_init(&b->cleaning);
		if (test_bit(B_Orphan, &b->b.flags)) {
			spin_lock(&fs->lock);
			if (test_bit(B_Orphan, &b->b.flags) &&
			    list_empty(&b->orphans)) {
				list_add(&b->orphans, &fs->pending_orphans);
				lafs_wake_thread(fs);
			}
			spin_unlock(&fs->lock);
		}
		putdref(b, MKREF(cleaning));
		if (cb)
			putref(cb, MKREF(clean2));
	}
	return rv;
}
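
/*
 * Note the async convention used above: lafs_find_block_async() and
 * lafs_wait_block_async() return -EAGAIN while IO is still in flight.
 * The entry stays on tc->cleaning, and the whole list is simply
 * re-walked on a later pass of the cleaner thread.
 */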
/*
 * Try to advance the process of cleaning the given segment.
 * This may require loading a cluster head, parsing or re-parsing
 * that head, or loading some blocks.
 */
static int try_clean(struct fs *fs, struct toclean *tc)
{
	/* Return 1 if everything has been found, -ve if we need to flush. */
	int rv;

	mutex_lock(&fs->cleaner.lock);
	dprintk("try_clean: state = %d\n", tc->ac.state);
	if (tc->ch == NULL && tc->have_addr)
		cleaner_load(fs, tc);
	if (tc->ch)
		cleaner_parse(fs, tc);
	if (!list_empty(&tc->cleaning))
		rv = cleaner_process(fs, tc);
	else
		rv = (tc->ch == NULL && !tc->have_addr &&
		      list_empty(&tc->cleaning));
	mutex_unlock(&fs->cleaner.lock);
	return rv;
}
/*
 * When we truncate a file, any block that is in the
 * process of being cleaned must have that cleaning
 * cancelled. That is done by lafs_erase_dblock calling
 * lafs_unclean().
 */
void lafs_unclean(struct datablock *db)
{
	if (!list_empty_careful(&db->cleaning)) {
		struct fs *fs = fs_from_inode(db->b.inode);
		mutex_lock(&fs->cleaner.lock);
		if (test_and_clear_bit(B_Cleaning, &db->b.flags)) {
			/* This must be on the cleaner list, so
			 * it is safe to delete without a spinlock.
			 */
			list_del_init(&db->cleaning);
			putdref(db, MKREF(cleaning));
			lafs_iput_fs(db->b.inode);
			if (test_and_clear_bit(B_Async, &db->b.flags)) {
				putdref(db, MKREF(async));
				lafs_wake_thread(fs);
			}
			if (test_bit(B_Orphan, &db->b.flags)) {
				spin_lock(&fs->lock);
				if (test_bit(B_Orphan, &db->b.flags) &&
				    list_empty(&db->orphans)) {
					list_add(&db->orphans,
						 &fs->pending_orphans);
					lafs_wake_thread(fs);
				}
				spin_unlock(&fs->lock);
			}
		}
		mutex_unlock(&fs->cleaner.lock);
	}
}
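
/*
 * A sketch of the call site described above - lafs_erase_dblock() is
 * not in this excerpt, so this is an assumption about its shape, not
 * the real code:
 *
 *	void lafs_erase_dblock(struct datablock *db)
 *	{
 *		lafs_unclean(db);
 *		...invalidate the block and free its space...
 *	}
 */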
unsigned long lafs_do_clean(struct fs *fs)
{
	/* If the cleaner is inactive, we need to decide whether to
	 * activate. This depends on the amount of free space that
	 * is tied up in dirty segments.
	 * If we decide to activate, we collect a few segments and
	 * start work on them.
	 *
	 * If the cleaner is active, we simply try to progress any activity.
	 * Any activity may trigger an async read, in which case we
	 * leave it and move on.
	 *
	 * (Most of this happens in try_clean())
	 * - If we have chosen a segment/cluster but haven't loaded the
	 *   cluster head, load the cluster head.
	 * - If we have loaded a cluster head but haven't processed all
	 *   of it, process some more into a list of block addresses.
	 * - If we have loaded a cluster head and have processed all
	 *   of it, select the next cluster in the segment, or forget
	 *   this segment.
	 * - If we have un-processed block addresses in some snapshots,
	 *   try to find the current addresses of those blocks.
	 * - If we found blocks in the current segment, read them in.
	 * - If we have them read in, mark them for reallocation.
	 * - If all done, process realloc_leafs and allocate to the
	 *   clean cluster.
	 */
	if (!fs->cleaner.active &&
	    !test_bit(CheckpointNeeded, &fs->fsstate) &&
	    !test_bit(CleanerDisabled, &fs->fsstate)) {
		/* Choose to clean when the fraction of all space that is clean
		 * is below the fraction of free space that is not clean,
		 * i.e. if T is total space, C is clean space, F is free space,
		 * then clean when C/T < (F-C)/F.
		 * So as the amount of clean space decreases, we become less
		 * tolerant of unavailable free space.
		 * Avoiding division, this is
		 *	C * F < T * (F - C)
		 * As we always reserve 3 clean segments for accounting overhead
		 * and 1 to ensure we can handle deletions, we exclude those
		 * clean segments from the calculations,
		 * i.e. subtract 4 segments from T, C and F.
		 *
		 * T we know from the size of the devices.
		 * C we know by counting the clean segments.
		 * F we count each time we scan the segments (total_free).
		 * We use the larger count of the last pass and this pass.
		 *
		 * We need to avoid cleaning too much in one checkpoint as
		 * the free counts will start to get misleading.
		 * Maybe every time we choose to clean a segment, we should
		 * add the size of the segment to some counter and add that
		 * to C in the above calculations.
		 *
		 * For now, clean up to 4 segments at a time.
		 */
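		/* A worked example with illustrative numbers only: with
		 * T = 1000, C = 100 and F = 300 (segments, after the
		 * 4-segment adjustment), C * F = 30000 and
		 * T * (F - C) = 200000, so C * F < T * (F - C) and we
		 * clean.  If clean space grows to C = 250, C * F = 75000
		 * exceeds T * (F - C) = 50000, and we stop cleaning.
		 */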
		u64 T = 0;
		int max_segs;
		int i;
		int force_checkpoint_after_clean = 0;

		for (i = 0; i < fs->devices; i++)
			T += fs->devs[i].size;
		T -= TOTAL_RESERVED * fs->max_segment;

		max_segs = lafs_alloc_cleaner_segs(fs, CLEANER_SEGS);

		/* If we can only clean to the main segment, we might
		 * tie up the very space we are trying to recover, so:
		 * - only do one segment at a time
		 * - only if there are no clean (but not yet free) segments
		 * - if CleanerBlocks, then clean.
		 */
		if (max_segs == 0) {
			if (fs->segtrack->clean.cnt == 0
			    && test_bit(CleanerBlocks, &fs->fsstate)) {
				max_segs = 1;
				force_checkpoint_after_clean = 1;
			}
		}
		for (i = 0; i < max_segs; i++) {
			struct toclean *tc = &fs->cleaner.seg[i];
			u64 C = fs->free_blocks + fs->clean_reserved
				+ fs->cleaner.cleaning;
			u64 F = max(fs->total_free, fs->total_free_prev);

			if (TOTAL_RESERVED * fs->max_segment < C)
				/* adjust to unusable space FIXME adjust F too? */
				C -= TOTAL_RESERVED * fs->max_segment;
			else
				C = 0;
			if (TOTAL_RESERVED * fs->max_segment < F)
				/* adjust to unusable space */
				F -= TOTAL_RESERVED * fs->max_segment;
			else
				F = 0;

			dprintk("C=%llu F=%llu T=%llu\n", C, F, T);
			if ((F < C || C * F >= T * (F - C)) &&
			    !test_bit(EmergencyClean, &fs->fsstate) &&
			    !test_bit(EmergencyPending, &fs->fsstate) &&
			    !test_bit(CleanerBlocks, &fs->fsstate)) {
				dprintk("CLEANER: enough cleaning with %d segments\n",
					i);
				break;
			}
			if (tc->chead == NULL)
				tc->chead = alloc_page(GFP_KERNEL);

			/* OK, we are good to keep cleaning */
			tc->have_addr = lafs_get_cleanable(fs, &tc->dev,
							   &tc->seg);
			if (!tc->have_addr) {
				dprintk("CLEANER: Nothing found to clean at %d :-(\n",
					i);
				if (i == 0 &&
				    (test_bit(EmergencyPending, &fs->fsstate) ||
				     test_bit(CleanerBlocks, &fs->fsstate)))
					lafs_checkpoint_start(fs);
				break;
			}
			printk("CLEANER: clean %d/%d\n", tc->dev, tc->seg);
			fs->cleaner.cleaning += fs->devs[tc->dev].segment_size /* - 1*/;
			tc->haddr = segtovirt(fs, tc->dev, tc->seg);
			tc->ch = NULL;
			tc->seq = 0;
			INIT_LIST_HEAD(&tc->cleaning);
			fs->cleaner.active = 1;
			if (force_checkpoint_after_clean)
				lafs_checkpoint_start(fs);
		}
		if (i == CLEANER_SEGS)
			dprintk("CLEANER: found %d segments to clean\n", i);
	}
	if (fs->cleaner.active) {
		int i, doflush = 0, alldone = 1;

		for (i = 0; i < CLEANER_SEGS ; i++) {
			struct toclean *tc = &fs->cleaner.seg[i];
			if (tc->have_addr || !list_empty(&tc->cleaning)) {
				/* Might be something to do here */
				int done = try_clean(fs, tc);
				if (done < 0)
					doflush = 1;
				if (done == 0 && doflush == 1)
					cleaner_flush(fs);
				if (done != 1)
					alldone = 0;
			}
		}
		if (alldone) {
			fs->cleaner.active = 0;
			lafs_wake_thread(fs);
		}
	}
	if (test_bit(CleanerBlocks, &fs->fsstate)) {
		int any_clean;
		int clean = lafs_clean_count(fs, &any_clean);

		dprintk("clean=%d max_seg=%d need=%d act=%d any=%d\n",
			(int)clean, (int)fs->max_segment,
			(int)fs->cleaner.need, fs->cleaner.active, any_clean);
		/* If there is enough clean space for everything to move
		 * forward, or the cleaner has done all it can, then
		 * push out a checkpoint so threads waiting on the cleaner
		 * can proceed.
		 */
		if (clean * fs->max_segment
		    >= fs->allocated_blocks + fs->cleaner.need
		    || !any_clean)
			lafs_checkpoint_start(fs);
	}
	return MAX_SCHEDULE_TIMEOUT;
}