3 * segment tracking routines for LaFS
5 * Copyright (C) 2006-2009
6 * NeilBrown <neilb@suse.de>
7 * Released under the GPL, version 2
11 * - active blocks in segment usage files
12 * - next-phase usage updates
13 * - youth tracking and decay
14 * - list of segments that could be cleaned
15 * - list of free segments that could be reused
16 * - tracking space allocation and available free space
18 * Segment usage blocks:
19 * Every Pinned, Dirty or Realloc block with physaddr != 0 owns a reference
20 * on the block in the segment usage file that counts that
21 * block. When the physaddr is changed, the new block will always
22 * be available so the reference is simply moved.
24 * Normally when ->physaddr changes, the counts in the relevant
25 * segusage blocks are updated directly. However when the change is
26 * for a write in the next phase (during a checkpoint) that is not
27 * possible as the segusage blocks need to be written with the old value.
28 * So we track these updates in a simple data structure that
29 * records a delta against each ss+dev+seg. If such a data structure
30 * cannot be allocated (-ENOMEM) the associated write is blocked until
31 * the checkpoint completes. FIXME - implement that.
33 * Every Pinned or Dirty block holds a number of space credits,
34 * following a pattern described elsewhere. The space allocation
35 * unit identifies segments to be used in advance and holds a reference
39 * Youth numbers are handed out sequentially to new write clusters.
40 * When the available space approaches being full, all youth numbers are
41 * decayed (as differentiation between old youth values is uninteresting).
42 * This decay is done in a staged fashion, one block at a time so that
43 * a checkpoint can happen at any time and the current block in the
44 * decay sweep is recorded.
47 * Holding a record of all free segments is impractical in general. Instead
48 * we record a limited number of free segments (1 page worth - 512 entries,
49 * possibly with ranges) and when needed we scan the segment usage
50 * files for free segments and add them to the list. This scan needs to
51 * be rate limited - not sure how best to do that.
54 * Similarly to free segments, we need to keep track of cleanable segments.
55 * Almost all non-free segments are cleanable, and we assign a weight to
56 * each based on youth and usage. We want to keep the most cleanable
57 * segments based on this weight.
58 * We keep an unordered list of cleanable segments. When it becomes
59 * full we sort it by weight and discard the worst half.
62 * As there could be hundreds of snapshots (hopefully not thousands)
63 * we cannot load the segusage block for each snapshot and then parse them
64 * in parallel. Instead we allocate space to store a max usage and
65 * merge each block one at a time into that max. We then combine the
66 * max with the youth to get a 64bit weight... I wonder if that is good.
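 *
 * (Sketch, not from the original: per segment i in a usage block the
 *  merge is simply
 *      max_usage[i] = max(max_usage[i], snapshot_usage[i]);
 *  (see merge_usage() below), and the weight is then roughly
 *  youth * max_usage / segment_size, as computed in add_cleanable().)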
70 /* Every block that could change its ->physaddr holds a reference
71 * on the segment summary block through this structure
75 #include <linux/hash.h>
76 #include <linux/random.h>
77 #include <linux/slab.h>
84 struct hlist_node hash;
89 struct datablock *ssblk;
90 /* youthblk is only set when ssnum is 0 */
91 struct datablock *youthblk;
94 static int shash(u32 segnum, int devnum, int ssnum)
96 unsigned long h = hash_long(segnum, BITS_PER_LONG);
97 return hash_long(h ^ (devnum | (ssnum << 16)), SHASHBITS);
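
/* Example (sketch): the hash bucket owning a given segment summary is
 * fs->stable[shash(segnum, devnum, ssnum)], as used by segsum_find() below.
 */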
100 static inline void ss_get(struct segsum *ss)
102 BUG_ON(atomic_read(&ss->refcnt) == 0);
103 atomic_inc(&ss->refcnt);
106 static void ss_put(struct segsum *ss, struct fs *fs)
108 if (atomic_dec_and_lock(&ss->refcnt, &fs->stable_lock)) {
109 if (atomic_read(&ss->delayed) != 0)
110 spin_unlock(&fs->stable_lock);
112 if (!hlist_unhashed(&ss->hash))
113 hlist_del(&ss->hash);
114 if (!fs->stable_changed)
115 fs->stable_changed = 1;
116 spin_unlock(&fs->stable_lock);
117 putdref(ss->ssblk, MKREF(ss));
118 putdref(ss->youthblk, MKREF(ssyouth));
124 static struct segsum *segsum_find(struct fs *fs, u32 segnum,
125 int devnum, int ssnum)
127 struct hlist_head *head = &fs->stable[shash(segnum, devnum, ssnum)];
128 struct segsum *ss, *new = NULL;
129 struct hlist_node *n;
135 spin_lock(&fs->stable_lock);
136 hlist_for_each_entry(ss, n, head, hash)
137 if (ss->segnum == segnum &&
138 ss->ssnum == ssnum &&
139 ss->devnum == devnum) {
140 atomic_inc(&ss->refcnt);
141 spin_unlock(&fs->stable_lock);
144 if (!fs->rolled && fs->qphase != fs->phase)
147 !test_bit(B_Valid, &ss->ssblk->b.flags) ) {
148 /* need to read this now that roll forward
151 err = lafs_read_block(ss->ssblk);
158 !test_bit(B_Valid, &ss->youthblk->b.flags)) {
159 /* need to read this now that roll forward
162 err = lafs_read_block(ss->youthblk);
171 hlist_add_head(&new->hash, head);
172 spin_unlock(&fs->stable_lock);
175 spin_unlock(&fs->stable_lock);
177 /* seems we must allocate something */
179 new = kmalloc(sizeof(*new), GFP_KERNEL | __GFP_NOFAIL);
180 new->segnum = segnum;
181 new->devnum = devnum;
183 atomic_set(&new->refcnt, 1);
184 atomic_set(&new->delayed, 0);
185 INIT_HLIST_NODE(&new->hash);
186 dv = fs->devs + devnum;
187 addr = LAFSI(fs->ss[ssnum].root)->md.fs.usagetable * dv->tablesize;
188 addr += segnum >> (fs->blocksize_bits - USAGE_SHIFT);
189 new->ssblk = lafs_get_block(dv->segsum, addr, NULL,
193 new->youthblk = lafs_get_block(dv->segsum,
194 segnum >> (fs->blocksize_bits
200 new->youthblk = NULL;
201 BUG_ON(!new->ssblk || IS_ERR(new->ssblk) || IS_ERR(new->youthblk));
202 if (!fs->rolled && fs->qphase != fs->phase) {
203 /* Too early to safely read segusage blocks,
204 * leave it until later
208 err = lafs_read_block(new->ssblk);
209 if (new->youthblk && err == 0)
210 err = lafs_read_block(new->youthblk);
218 static struct segsum *segsum_byaddr(struct fs *fs, u64 addr, int ssnum)
223 virttoseg(fs, addr, &dev, &seg, &offset);
225 return segsum_find(fs, seg, dev, ssnum);
228 void lafs_seg_put_all(struct fs *fs)
230 /* remove all the ssblk and youthblk references from
232 * This is called when the filesystem is being unmounted
235 for (i = 0; i < SHASHSIZE; i++)
236 if (!hlist_empty(&fs->stable[i])) {
238 struct datablock *b, *y;
239 struct hlist_node *n, *pos;
241 spin_lock(&fs->stable_lock);
243 hlist_for_each_entry_safe(ss, pos, n, &fs->stable[i], hash) {
250 spin_unlock(&fs->stable_lock);
251 if (b && test_and_clear_bit(B_Async, &b->b.flags))
252 putdref(b, MKREF(async));
253 if (y && test_and_clear_bit(B_Async, &y->b.flags))
254 putdref(y, MKREF(async));
255 putdref(b, MKREF(ss));
256 putdref(y, MKREF(ssyouth));
257 spin_lock(&fs->stable_lock);
258 if (fs->stable_changed)
261 spin_unlock(&fs->stable_lock);
265 /* We need to set SegRef on this block, and all parents,
266 * and the segsum block and all parents, and so on.
267 * We find the first ancestor that does not have SegRef,
268 * find the matching segsum block. If it already
269 * has SegRef, we add the reference and retry from the start.
270 * If it doesn't have SegRef, we prealloc and then tail recurse.
271 * Note: we never set SegRef on B_Root blocks as their usage
274 int lafs_seg_ref_block(struct block *b, int ssnum)
276 struct fs *fs = fs_from_inode(b->inode);
278 LAFS_BUG(test_bit(B_InoIdx, &b->flags), b);
279 getref(b, MKREF(segref));
281 while (!test_bit(B_SegRef, &b->flags)) {
283 struct block *b2, *to_put = NULL;
288 BUG_ON(test_bit(B_Root, &b->flags));
290 /* The extra reference of 'b2' and the use for 'to_put'
291 * deserves some explanation.
292 * We will normally hold an implicit reference to b2 due to
293 * our reference on 'b' and the fact that b2 is an ancestor
294 * of 'b'. However at this point we have no locks on the file
295 * at all so it is possible that a btree manipulation could
296 * split b2 resulting in b2 not being the parent of b any more
297 * so our reference guarantee is lost. Even in that case,
298 * the ancestor of b would probably have a B_PrimaryRef on
299 * b2. However it is hard to prove that will stay around long
300 * enough so we take an extra ref just in case.
301 * We only need the ref when we drop private_lock so only take
302 * it then. We cannot drop an old ref at that point, so
303 * store the old ref in 'to_put' and drop it when releasing
308 /* Need to check parents */
310 spin_lock(&ino->i_data.private_lock);
312 p && !test_bit(B_SegRef, &p->flags);
313 p = &(p->parent)->b) {
314 if (test_bit(B_InoIdx, &p->flags)) {
315 struct datablock *db = LAFSI(p->inode)->dblock;
317 getref(b2, MKREF(segref2));
318 spin_unlock(&ino->i_data.private_lock);
321 putref(to_put, MKREF(segref2));
322 spin_lock(&ino->i_data.private_lock);
325 if (test_bit(B_SegRef, &p->flags))
327 if (!test_bit(B_Root, &p->flags))
330 getref(b2, MKREF(segref2));
331 spin_unlock(&ino->i_data.private_lock);
333 putref(to_put, MKREF(segref2));
334 /* b2 is the first ancestor (closest to root) without SegRef */
336 if (b2->physaddr == 0) {
337 /* There is no segsum to reference */
338 set_bit(B_SegRef, &b2->flags);
339 putref(b2, MKREF(segref2));
343 ss = segsum_byaddr(fs, b2->physaddr, ssnum);
345 putref(b, MKREF(segref));
346 putref(b2, MKREF(segref2));
350 if (&ss->ssblk->b == b) {
351 /* Don't need to check for Prealloc or
352 * SegRef, just take a reference now
354 if (test_and_set_bit(B_SegRef, &b2->flags))
356 putref(b2, MKREF(segref2));
360 /* Don't need iolock here as segusage is very special */
361 set_bit(B_PinPending, &ss->ssblk->b.flags);
362 err = lafs_pin_dblock(ss->ssblk, AccountSpace);
365 putref(b, MKREF(segref));
366 putref(b2, MKREF(segref2));
369 if (!test_bit(B_SegRef, &ss->ssblk->b.flags)) {
370 putref(b, MKREF(segref));
371 b = getref(&ss->ssblk->b, MKREF(segref));
373 } else if (test_and_set_bit(B_SegRef, &b2->flags))
375 putref(b2, MKREF(segref2));
377 putref(b, MKREF(segref));
381 void lafs_seg_deref(struct fs *fs, u64 physaddr, int ssnum)
384 struct segsum *ss = segsum_byaddr(fs, physaddr, ssnum);
386 BUG_ON(atomic_read(&ss->refcnt) < 2);
392 static void seg_inc(struct fs *fs, struct segsum *ss, int diff, int in_phase)
395 atomic_add(diff, &ss->delayed);
398 b = map_dblock(ss->ssblk);
399 spin_lock(&fs->stable_lock);
400 p = &b[ss->segnum & ((fs->blocksize-1)>>USAGE_SHIFT)];
401 //BUG_ON(diff < 0 && le32_to_cpu(*p) < -diff);
402 if (diff < 0 && le32_to_cpu(*p) < -diff) {
403 printk("diff=%d p=%d segnum=%d\n", diff, le32_to_cpu(*p),
407 *p = cpu_to_le32(le32_to_cpu(*p) + diff);
408 spin_unlock(&fs->stable_lock);
409 unmap_dblock(ss->ssblk, b);
410 lafs_dirty_dblock(ss->ssblk);
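
/* (When the update is for the next phase, the delta is parked in
 * ss->delayed above rather than applied here; lafs_seg_apply_all()
 * folds it into the usage block at the end of the checkpoint.)
 */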
414 void lafs_seg_move(struct fs *fs, u64 oldaddr, u64 newaddr,
415 int ssnum, int phase, int moveref)
419 dprintk("SEGMOVE %llu %llu\n",
420 (unsigned long long) oldaddr,
421 (unsigned long long) newaddr);
424 ss = segsum_byaddr(fs, newaddr, ssnum);
425 seg_inc(fs, ss, 1, fs->qphase == phase);
432 ss = segsum_byaddr(fs, oldaddr, ssnum);
433 seg_inc(fs, ss, -1, fs->qphase == phase);
440 static void set_youth(struct fs *fs, struct segsum *ss)
446 /* HACK youth_next should always be at least 0x8000 so that
447 * cleanable score differentiates well for new segments.
448 * Old code would sometimes set youth_next very low, so
451 if (fs->youth_next < 0x8000)
452 fs->youth_next = 0x8000;
454 if (fs->scan.do_decay &&
455 (fs->scan.free_dev < ss->devnum
456 || (fs->scan.free_dev == ss->devnum
457 && fs->scan.free_block < (ss->segnum
458 / (fs->blocksize / 2)))
460 /* Haven't decayed this block yet - revert decay */
462 ybuf = map_dblock(ss->youthblk);
463 youthp = ybuf + (ss->segnum & ((1 << (fs->blocksize_bits
464 - YOUTH_SHIFT)) - 1));
465 if (le16_to_cpu(*youthp) < 8) {
466 *youthp = cpu_to_le16(y);
470 unmap_dblock(ss->youthblk, youthp);
472 lafs_dirty_dblock(ss->youthblk);
475 static void seg_apply(struct fs *fs, struct segsum *ss)
478 int diff = atomic_read(&ss->delayed);
481 atomic_set(&ss->delayed, 0);
482 dprintk("Seg apply %d %d\n", (int)ss->segnum, diff);
483 err = lafs_read_block(ss->ssblk);
484 BUG_ON(err); // FIXME do something useful here
486 err = lafs_read_block(ss->youthblk);
488 seg_inc(fs, ss, diff, 1);
490 if (diff > 0 && ss->youthblk)
494 /* lafs_seg_apply_all
495 * During a checkpoint, blocks written to the next phase cannot immediately
496 * update the segment usage tables so those updates need to be delayed.
497 * This function applies all the delayed updates to the segment usage
498 * files at the end of a checkpoint.
500 void lafs_seg_apply_all(struct fs *fs)
504 for (i = 0 ; i < SHASHSIZE ; i++) {
505 struct hlist_head *head = &fs->stable[i];
507 struct hlist_node *n, *pos;
508 spin_lock(&fs->stable_lock);
510 fs->stable_changed = 0;
511 hlist_for_each_entry_safe(ss, pos, n, head, hash) {
512 if (atomic_read(&ss->delayed) == 0)
514 atomic_inc(&ss->refcnt);
515 spin_unlock(&fs->stable_lock);
518 spin_lock(&fs->stable_lock);
519 if (fs->stable_changed)
522 spin_unlock(&fs->stable_lock);
524 /* Now any clean segments found earlier are free. */
528 * When creating a snapshot, we need to duplicate table[1] in
529 * each segment usage file into table[n].
530 * It would be nice to just copy block pointers but this might
531 * confuse cleaning etc. so duplicate the data for now.
533 int lafs_seg_dup(struct fs *fs, int newtable)
536 * get blocks for each table and memcpy
540 for (d=0; d < fs->devices ; d++) {
541 struct fs_dev *dv = &fs->devs[d];
543 for (b=0 ; b < dv->tablesize ; b++) {
544 struct datablock *obl, *nbl;
548 addr = dv->tablesize + b;
549 obl = lafs_get_block(dv->segsum, addr, 0,
550 GFP_KERNEL | __GFP_NOFAIL,
552 err = lafs_read_block(obl);
554 putdref(obl, MKREF(ss));
558 addr = dv->tablesize * newtable + b;
559 nbl = lafs_get_block(dv->segsum, addr, 0,
560 GFP_KERNEL | __GFP_NOFAIL,
563 from = map_dblock(obl);
564 to = map_dblock_2(nbl);
565 memcpy(to, from, fs->blocksize);
566 unmap_dblock_2(nbl, to);
567 unmap_dblock(obl, from);
568 lafs_dirty_dblock(nbl);
569 putdref(obl, MKREF(ss));
570 putdref(nbl, MKREF(ss));
577 /*************************************************************
578 * Space management: allocate, use, free
581 static int count_credits(struct block *b)
584 struct indexblock *ib = iblk(b);
585 struct inode *ino = NULL;
587 if (test_bit(B_Credit, &b->flags))
589 if (test_bit(B_ICredit, &b->flags))
591 if (test_bit(B_NCredit, &b->flags))
593 if (test_bit(B_NICredit, &b->flags))
595 if (test_bit(B_UnincCredit, &b->flags))
597 if (test_bit(B_Dirty, &b->flags))
599 if (test_bit(B_Realloc, &b->flags))
602 if (test_bit(B_Index, &b->flags)) {
603 list_for_each_entry(b, &ib->children, siblings) {
604 LAFS_BUG(b->parent != ib, b);
605 credits += count_credits(b);
607 credits += ib->uninc_table.credits;
608 } else if ((ino = rcu_my_inode(dblk(b))) != NULL &&
609 LAFSI(ino)->iblock &&
610 LAFSI(ino)->iblock->b.parent
612 LAFS_BUG(LAFSI(ino)->iblock->b.parent != b->parent, b);
613 credits += count_credits(&LAFSI(ino)->iblock->b);
620 static void check_credits(struct fs *fs)
623 * count credits in the tree and make sure they match allocated_blocks
626 if (fs->ss[0].root == NULL ||
627 LAFSI(fs->ss[0].root)->iblock == NULL ||
628 LAFSI(fs->ss[0].root)->dblock == NULL)
630 credits = count_credits(&LAFSI(fs->ss[0].root)->iblock->b);
631 credits += count_credits(&LAFSI(fs->ss[0].root)->dblock->b);
632 if (credits + fs->cluster_updates + temp_credits != fs->allocated_blocks) {
633 printk("credits=%d updates=%d temp=%d allocated=%d\n", credits,
634 fs->cluster_updates, temp_credits,
635 (int)fs->allocated_blocks);
641 void lafs_space_return(struct fs *fs, int credits)
643 spin_lock(&fs->alloc_lock);
645 if (credits > fs->allocated_blocks)
646 printk("cred=%d ab=%d\n", credits, (int)fs->allocated_blocks);
647 BUG_ON(credits > fs->allocated_blocks);
648 fs->allocated_blocks -= credits;
649 spin_unlock(&fs->alloc_lock);
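
/* (Sketch of the pairing: credits taken via lafs_space_alloc() are
 * handed back here once the writes they covered are complete or
 * abandoned, keeping fs->allocated_blocks an upper bound on
 * outstanding writes.)
 */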
653 int lafs_alloc_cleaner_segs(struct fs *fs, int max)
655 /* cleaner is about to start.
656 * See how many segments of space can safely
657 * be reserved for its use.
658 * We don't allocate more to the cleaner than we
659 * leave spare for new allocations.
661 int watermark = fs->max_segment * 4;
663 spin_lock(&fs->alloc_lock);
664 while (fs->clean_reserved < max * fs->max_segment &&
665 fs->free_blocks > 0 &&
666 (u64)fs->free_blocks > (fs->clean_reserved
667 + fs->allocated_blocks
669 fs->clean_reserved += fs->max_segment;
670 fs->free_blocks -= fs->max_segment;
673 spin_unlock(&fs->alloc_lock);
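
/* Worked example (illustrative numbers): with max_segment = 1024 and
 * max = 2, the loop above can move up to 2048 blocks into
 * clean_reserved, but only while free_blocks stays above the existing
 * reservations plus the four-segment watermark.
 */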
677 int lafs_space_alloc(struct fs *fs, int credits, int why)
679 /* 'why's for space reservation.
680 * NewSpace means we want to write a block which didn't exist
681 * before. This is allowed to fail or block if the cleaner
682 * appears able to make progress.
683 * ReleaseSpace means we want to write a block which does currently
684 * exist, so doing this will eventually free up space. This must
685 * never fail, but can block.
686 * CleanSpace means we want to write a block to relocate it to
687 * a 'cleaning' segment. This may never fail.
688 * AccountSpace means we absolutely need this block now, and it is
689 * a BUG if there is no space available.
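 * For example (illustrative, not from the original): rewriting an
 * existing data block would use ReleaseSpace and may sleep until the
 * cleaner frees space, while a brand-new block would use NewSpace and
 * may simply be refused (zero credits) when space is tight.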
694 spin_lock(&fs->alloc_lock);
697 watermark += RELEASE_RESERVED * fs->max_segment;
700 watermark += ACCOUNT_RESERVED * fs->max_segment;
704 /* Definitely no watermark here. */
708 if (fs->rolled && watermark) {
709 /* We cannot account properly before roll-forward has
710 * completed. FIXME once it has completed we need to
711 * check and invalidate the FS if there was a problem.
713 if (fs->free_blocks < 0 ||
714 (u64)fs->free_blocks < (fs->allocated_blocks
715 + credits + watermark))
716 credits = 0; /* Sorry, no room */
718 if (fs->rolled && watermark == 0) {
719 /* When including the clean_reserved space, there should
720 * be room for these controlled allocations
722 BUG_ON(fs->free_blocks < 0);
723 if (fs->free_blocks + fs->clean_reserved <
724 fs->allocated_blocks + credits)
729 if (why == NewSpace && !test_bit(EmergencyClean, &fs->fsstate))
730 set_bit(EmergencyPending, &fs->fsstate);
731 if (why >= ReleaseSpace || !test_bit(EmergencyClean, &fs->fsstate))
732 if (!test_bit(CleanerBlocks, &fs->fsstate) ||
733 fs->cleaner.need > watermark + fs->max_segment) {
734 fs->cleaner.need = watermark + fs->max_segment;
735 set_bit(CleanerBlocks, &fs->fsstate);
736 lafs_wake_thread(fs);
738 } else if (why == NewSpace)
739 if (test_bit(EmergencyClean, &fs->fsstate) ||
740 test_bit(EmergencyPending, &fs->fsstate)) {
741 clear_bit(EmergencyPending, &fs->fsstate);
742 clear_bit(EmergencyClean, &fs->fsstate);
745 fs->allocated_blocks += credits;
746 BUG_ON(fs->free_blocks + fs->clean_reserved < fs->allocated_blocks);
747 spin_unlock(&fs->alloc_lock);
751 /********************************************************************
753 * Segment usage / youth tracking
756 /* Information about free and cleanable segments is stored in a table
757 * comprised of a few pages. Indexes into this table are 16-bit: 4 bits
758 * for the page number, 12 bits for the index within the page.
759 * Different pages have different sized entries to allow for different
760 * depths in the skiplist that sorts entries by dev/segment.
761 * Entries are at least 16 bytes, so 12 bits can index up to 64K.
762 * Each entry is also on one of four lists:
763 * - unused: this entry is not used
764 * - free: this entry is for a free segment
765 * - cleanable: this entry is for a cleanable segment.
766 * - clean: segment has been cleaned but is not yet free (awaiting checkpoint).
767 * These are singly linked lists (via 'next'). We record head and tail.
768 * The end of each list has a next pointer of 0xFFFF, not 0.
769 * We usually remove entries from the head, but when the table is
770 * full, we might walk part way down a list and discard all the rest.
771 * We discard them by setting the score to SCORE_DEAD. We then walk the
772 * skip list moving anything with a score of SCORE_DEAD to the unused list.
774 * We occasionally sort the cleanable list using a merge sort.
776 * There is a progression from 'cleanable' to 'clean' to 'free', though
777 * segments can enter at any point.
778 * When we choose a segment to clean, we remove the entry from the cleanable
779 * list. We do this by setting the score to SCORE_CLEANING and unlinking it.
780 * After cleaning has completed, a scan should find that it is clean and so
781 * add it to the 'clean' list.
782 * When we record a 'clean' segment as 'free' (after a checkpoint) we
783 * move it from the clean list to the free list, from where it will be
784 * removed when needed.
785 * We don't add new free segments when the total of free+clean segments
786 * is more than half the size of the table.
787 * Similarly when the cleanable table reaches half the available size
788 * we remove the least-interesting half.
789 * When we choose a free segment to start filling, we remove it from the
790 * free list but not from the table. The score is set to SCORE_ACTIVE to
791 * record that it is unlinked but busy. If it is found to be cleanable,
792 * it will be ignored. When we finish filling a segment, we find the entry
793 * again and remove it properly so it can become cleanable later.
796 #define SCORE_MAX 0xFFFFFFFFFFFFFFFCULL /* Maximum normal score */
797 #define SCORE_ACTIVE 0xFFFFFFFFFFFFFFFDULL /* This segment is being written to */
798 #define SCORE_CLEANING 0xFFFFFFFFFFFFFFFEULL /* This segment in being cleaned */
799 #define SCORE_DEAD 0xFFFFFFFFFFFFFFFFULL /* This segment is to be removed */
807 u16 skip[0]; /* or larger... */
810 static inline struct segstat *segfollow(struct segtracker *st, u16 link)
815 a = st->page[link >> 12];
816 a += (link & 0xFFF) *
817 (st->size[link >> 12] * sizeof(u16) + sizeof(struct segstat));
818 return (struct segstat *) a;
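
/* Illustrative inverse of the decoding above (an assumption, not in the
 * original): packing a 4-bit page number and a 12-bit in-page index
 * into a table link.
 */
static inline u16 seg_make_link(int page, int index)
{
	return (page << 12) | (index & 0xFFF);
}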
821 int lafs_segtrack_init(struct segtracker *st)
827 st->page[0] = kmalloc(PAGE_SIZE, GFP_KERNEL);
828 st->page[1] = kmalloc(PAGE_SIZE, GFP_KERNEL);
829 st->page[2] = kmalloc(PAGE_SIZE, GFP_KERNEL);
830 st->page[3] = kmalloc(PAGE_SIZE, GFP_KERNEL);
832 if (st->page[0] == NULL ||
833 st->page[1] == NULL ||
834 st->page[2] == NULL ||
835 st->page[3] == NULL) {
836 lafs_segtrack_free(st);
843 BUG_ON(SEG_NUM_HEIGHTS <= 9);
845 st->unused.first = st->unused.last = 0xffff;
846 st->cleanable.first = st->cleanable.last = 0xffff;
847 st->free.first = st->free.last = 0xffff;
848 st->clean.first = st->clean.last = 0xffff;
849 st->unused.cnt = st->free.cnt = st->cleanable.cnt = st->clean.cnt = 0;
851 for (h = 0; h < SEG_NUM_HEIGHTS; h++)
852 st->head[h] = 0xFFFF;
854 /* how many entries in each page */
855 for (i = 0 ; i < 4; i++)
856 n[i] = PAGE_SIZE / (sizeof(struct segstat) +
857 st->size[i] * sizeof(u16));
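
/* (With PAGE_SIZE == 4096 this gives each page
 * 4096 / (sizeof(struct segstat) + size[i] * 2) entries, so pages with
 * taller skip arrays hold fewer, larger entries.)
 */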
863 get_random_bytes(&rand, 1);
866 while (p < 3 && n[p] == 0)
875 ss = segfollow(st, sn);
876 ss->next = st->unused.first;
877 st->unused.first = sn;
878 if (st->unused.cnt == 0)
879 st->unused.last = sn;
884 st->total = st->unused.cnt;
888 void lafs_segtrack_free(struct segtracker *st)
896 int lafs_check_seg_cnt(struct segtracker *st)
898 /* check that the unused, free, cleanable and clean lists have correct counts.
903 for (prev = sn = st->unused.first ;
905 sn = segfollow(st, (prev = sn))->next)
907 if (cnt != st->unused.cnt) {
908 printk("%d != %d\n", cnt, st->unused.cnt); WARN_ON(1);
911 if (st->unused.last != prev) {
912 printk("L%d != %d\n", prev, st->unused.last); WARN_ON(1);
917 for (prev = sn = st->free.first ;
919 sn = segfollow(st, (prev = sn))->next)
921 if (cnt != st->free.cnt) {
922 printk("%d != %d\n", cnt, st->free.cnt); WARN_ON(1);
925 if (st->free.last != prev) {
926 printk("L%d != %d\n", prev, st->free.last); WARN_ON(1);
931 for (prev = sn = st->clean.first ;
933 sn = segfollow(st, (prev = sn))->next)
935 if (cnt != st->clean.cnt) {
936 printk("%d != %d\n", cnt, st->clean.cnt); WARN_ON(1);
939 if (st->clean.last != prev) {
940 printk("L%d != %d\n", prev, st->clean.last); WARN_ON(1);
945 for (prev = sn = st->cleanable.first ;
947 sn = segfollow(st, (prev = sn))->next)
949 if (cnt != st->cleanable.cnt) {
950 printk("%d != %d\n", cnt, st->cleanable.cnt); WARN_ON(1);
953 if (st->cleanable.last != prev) {
954 printk("L%d != %d\n", prev, st->cleanable.last); WARN_ON(1);
961 static void segsort(struct segtracker *st, struct slist *l)
963 /* sort the 'l' list of st by score - lowest first */
964 /* use a merge sort */
983 /* Find the least of h[0] and h[1] that is not less than
984 * prev, and add it to hp[curr]. If both are smaller than
985 * prev, flip 'curr' and add the smallest.
987 while (h[0] != 0xFFFF || h[1] != 0xFFFF) {
988 if (h[next] == 0xFFFF ||
989 (h[1-next] != 0xFFFF &&
990 !((prev <= segfollow(st, h[1-next])->score)
991 ^ (segfollow(st, h[1-next])->score
992 <= segfollow(st, h[next])->score)
993 ^ (segfollow(st, h[next])->score <= prev))
997 if (segfollow(st, h[next])->score < prev)
999 prev = segfollow(st, h[next])->score;
1000 *hp[curr] = h[next];
1001 hp[curr] = &segfollow(st, h[next])->next;
1003 h[next] = segfollow(st, h[next])->next;
1007 } while (head[0] != 0xFFFF && head[1] != 0xFFFF);
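
/* (The xor of three comparisons above is a circular-order test: it
 * picks whichever list head is "next" after prev when scores are
 * viewed cyclically, so each ascending run is emitted in order before
 * 'curr' flips to start the next output run.)
 */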
1008 if (head[0] == 0xFFFF)
1015 static int statcmp(struct segstat *ss, u16 dev, u32 seg)
1021 if (ss->segment < seg)
1023 if (ss->segment > seg)
1028 static struct segstat *segfind(struct segtracker *st, u16 dev,
1029 u32 segnum, u16 *where[SEG_NUM_HEIGHTS])
1031 /* Find the segment entry for dev/segnum.
1032 * Return link info in where to allow insertion/deletion.
1035 u16 *head = st->head;
1036 for (level = SEG_NUM_HEIGHTS-1 ; level >= 0; level--) {
1037 while (head[level] != 0xffff &&
1038 statcmp(segfollow(st, head[level]), dev, segnum) < 0) {
1039 head = segfollow(st, head[level])->skip;
1041 where[level] = &head[level];
1043 if (head[0] == 0xFFFF ||
1044 statcmp(segfollow(st, head[0]), dev, segnum) != 0)
1047 return segfollow(st, head[0]);
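
/* Typical use: when segfind() returns NULL the entry is absent and
 * 'where' holds the per-level insertion points which seginsert(),
 * via seg_add_new(), links the new entry through.
 */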
1050 static int segchoose_height(struct segtracker *st, u16 ssn)
1052 unsigned long rand[DIV_ROUND_UP(SEG_NUM_HEIGHTS * 2 / 8 + 1,
1053 sizeof(unsigned long))];
1054 int max = st->size[ssn>>12];
1056 get_random_bytes(rand, sizeof(rand));
1062 test_bit(h*2, rand) &&
1063 test_bit(h*2+1, rand))
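
/* (A level is kept only while two fresh random bits are both set, so
 * heights follow a geometric distribution with ratio 1/4, capped by
 * the entry's size class 'max'.)
 */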
1068 static void seginsert(struct segtracker *st, u16 ssn,
1069 u16 *where[SEG_NUM_HEIGHTS])
1071 /* We looked for 'ss' but didn't find it. 'where' is the
1072 * result of looking. Now insert 'ss'
1074 struct segstat *ss = segfollow(st, ssn);
1075 int h = segchoose_height(st, ssn);
1077 ss->skip[h] = *where[h];
1083 static void segdelete(struct segtracker *st, struct segstat *ss)
1085 /* This segstat has just been removed from a list (free or cleanable).
1086 * Remove it from the skiplist as well
1088 u16 *where[SEG_NUM_HEIGHTS];
1089 struct segstat *ss2 = segfind(st, ss->dev, ss->segment, where);
1093 BUG_ON(ss2 == NULL);
1096 for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos ; h++)
1097 *where[h] = ss2->skip[h];
1098 ss2->next = st->unused.first;
1099 st->unused.first = pos;
1100 if (st->unused.cnt == 0)
1101 st->unused.last = pos;
1103 lafs_check_seg_cnt(st);
1106 static void segdelete_all(struct segtracker *st, struct fs *fs)
1108 /* walk the entire skip list. Any entry with a score of SCORE_DEAD
1111 u16 *where[SEG_NUM_HEIGHTS];
1114 for (h = SEG_NUM_HEIGHTS-1; h >= 0; h--)
1115 where[h] = &st->head[h];
1117 while (*where[0] != 0xFFFF) {
1118 u16 pos = *where[0];
1119 struct segstat *ss = segfollow(st, pos);
1120 if (ss->score == SCORE_DEAD) {
1121 /* delete this entry */
1123 /* FIXME Locking is really bad here */
1125 ssm = segsum_find(fs, ss->segment, ss->dev, 0);
1126 putdref(ssm->ssblk, MKREF(intable));
1127 putdref(ssm->youthblk, MKREF(intable));
1130 for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos; h++)
1131 *where[h] = ss->skip[h];
1132 ss->next = st->unused.first;
1133 st->unused.first = pos;
1134 if (st->unused.cnt == 0)
1135 st->unused.last = pos;
1137 lafs_check_seg_cnt(st);
1139 /* advance 'where' to here */
1140 for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos; h++)
1141 where[h] = &ss->skip[h];
1146 static u16 seg_pop(struct segtracker *st, struct slist *which)
1149 u16 rv = which->first;
1152 ss = segfollow(st, rv);
1153 which->first = ss->next;
1155 if (ss->next == 0xFFFF)
1156 which->last = 0xFFFF;
1160 static struct segstat *seg_add_new(struct segtracker *st, struct slist *which, int atend,
1161 int dev, u32 seg, long long score, int usage,
1162 u16 *where[SEG_NUM_HEIGHTS])
1167 ssn = seg_pop(st, &st->unused);
1168 ss = segfollow(st, ssn);
1174 int l = which->last;
1181 BUG_ON(segfollow(st, l)->next != 0xFFFF);
1182 segfollow(st, l)->next = ssn;
1185 ss->next = which->first;
1187 if (which->last == 0xFFFF)
1196 seginsert(st, ssn, where);
1197 lafs_check_seg_cnt(st);
1201 void lafs_free_get(struct fs *fs, unsigned int *dev, u32 *seg,
1204 /* Select and return a free segment.
1205 * The youth value must have been zero, and we set it to the
1206 * current youth number.
1209 struct segsum *ssum = NULL;
1212 BUG_ON(nonlogged); // FIXME should handle this case, not ignore it
1215 spin_lock(&fs->lock);
1217 wait_event_lock(fs->phase_wait,
1218 !fs->scan.first_free_pass ||
1219 fs->segtrack->free.first != 0xFFFF,
1222 ss = segfollow(fs->segtrack, fs->segtrack->free.first);
1228 if (!test_bit(DelayYouth, &fs->fsstate) &&
1230 ssum->devnum == ss->dev &&
1231 ssum->segnum == ss->segment)) {
1232 spin_unlock(&fs->lock);
1236 ssum = segsum_find(fs, ss->segment, ss->dev, 0);
1237 /* As we hold a ref on youth block for anything in the
1238 * table, and that block was loaded at the time, it must
1241 BUG_ON(!ssum || !ssum->youthblk);
1242 BUG_ON(!test_bit(B_Valid, &ssum->youthblk->b.flags));
1243 set_bit(B_PinPending, &ssum->youthblk->b.flags);
1244 lafs_pin_dblock(ssum->youthblk, AccountSpace);
1247 seg_pop(fs->segtrack, &fs->segtrack->free);
1249 ss->score = SCORE_ACTIVE;
1250 /* still in table, but unlinked */
1252 if (ssum && test_bit(DelayYouth, &fs->fsstate)) {
1258 set_youth(fs, ssum);
1260 spin_unlock(&fs->lock);
1262 /* now need to reserve/dirty/reference the youth and
1263 * segsum block for each snapshot that could possibly
1265 * NOTE: this needs fixing to support snapshots. Later.
1267 for (ssnum = 0; ssnum < 1 ; ssnum++) {
1268 struct segsum *ssum2;
1269 ssum2 = segsum_find(fs, ss->segment, ss->dev, ssnum);
1271 /* ?? what do I need to release etc */
1272 /* Maybe this cannot fail because we own references
1273 * to the two blocks !! */
1275 lafs_checkpoint_lock(fs);
1276 set_bit(B_PinPending, &ssum2->ssblk->b.flags);
1277 (void)lafs_pin_dblock(ssum2->ssblk, AccountSpace);
1278 lafs_checkpoint_unlock(fs);
1283 dprintk("NEXT segment found %d/%d youth %d\n",
1284 *dev, *seg, fs->youth_next - 1);
1285 /* Note that we return an implicit reference to the ssum */
1288 void lafs_update_youth(struct fs *fs, int dev, u32 seg)
1290 struct segsum *ssum = NULL;
1291 ssum = segsum_find(fs, seg, dev, 0);
1292 set_bit(B_PinPending, &ssum->youthblk->b.flags);
1293 lafs_pin_dblock(ssum->youthblk, AccountSpace);
1294 spin_lock(&fs->lock);
1295 set_youth(fs, ssum);
1296 spin_unlock(&fs->lock);
1300 void lafs_seg_forget(struct fs *fs, int dev, u32 seg)
1302 /* this segment was being filled and is now full.
1303 * We need to drop it from the table, and drop
1304 * references to the blocks
1309 spin_lock(&fs->lock);
1312 segdelete(fs->segtrack, &tmp);
1313 spin_unlock(&fs->lock);
1315 ss = segsum_find(fs, seg, dev, 0);
1317 BUG_ON(atomic_read(&ss->refcnt) < 2);
1318 /* Removed from table so ... */
1319 putdref(ss->ssblk, MKREF(intable));
1320 putdref(ss->youthblk, MKREF(intable));
1325 int lafs_clean_count(struct fs *fs, int *any_clean)
1328 spin_lock(&fs->lock);
1329 *any_clean = fs->segtrack->clean.cnt != 0;
1330 rv = fs->segtrack->free.cnt + fs->segtrack->clean.cnt;
1331 spin_unlock(&fs->lock);
1335 static int add_free(struct fs *fs, unsigned int dev, u32 seg, u16 *youthp)
1337 /* This dev/seg is known to be free. Add it to the list */
1339 u16 *where[SEG_NUM_HEIGHTS];
1340 dprintk("add_free %d %d\n", (int)dev, (int)seg);
1341 spin_lock(&fs->lock);
1343 /* not really free any more */
1344 spin_unlock(&fs->lock);
1348 ss = segfind(fs->segtrack, dev, seg, where);
1350 /* already in the table. If it happens to be
1351 * on the cleanable list, we have to wait for
1352 * the cleaner to find it and add it to our list.
1354 spin_unlock(&fs->lock);
1357 if (fs->segtrack->free.cnt + fs->segtrack->clean.cnt >=
1358 fs->segtrack->total / 2) {
1359 /* Have enough free/clean entries already */
1360 spin_unlock(&fs->lock);
1364 seg_add_new(fs->segtrack, &fs->segtrack->free, 0,
1365 dev, seg, 0, 0, where);
1367 spin_unlock(&fs->lock);
1371 static int add_clean(struct fs *fs, unsigned int dev, u32 seg, bool if_present)
1373 /* This dev/seg is now clean. Make sure it is on the list.
1374 * Chances are that this is already present in the table as
1375 * a recently cleaned segment. If 'if_present', then insist
1377 * Return TRUE if segment was added to the table (as this
1378 * implies a reference count).
1381 u16 *where[SEG_NUM_HEIGHTS];
1383 dprintk("ADD CLEAN %d/%d %d #####################################\n",
1384 dev, seg, fs->segtrack->clean.cnt);
1385 spin_lock(&fs->lock);
1386 ss = segfind(fs->segtrack, dev, seg, where);
1388 /* already in the table. If loose, attach it to
1389 * the clean list. If cleanable, set score to 0 to make
1390 * sure it is found soon
1392 if (ss->score == SCORE_CLEANING) {
1393 ss->next = fs->segtrack->clean.first;
1394 fs->segtrack->clean.first = where[0][0];
1395 if (fs->segtrack->clean.last == 0xFFFF)
1396 fs->segtrack->clean.last =
1397 fs->segtrack->clean.first;
1398 fs->segtrack->clean.cnt++;
1400 if (ss->score != SCORE_ACTIVE) {
1401 /* Must be on the clean list now */
1405 spin_unlock(&fs->lock);
1409 spin_unlock(&fs->lock);
1413 if (fs->segtrack->free.cnt + fs->segtrack->clean.cnt >=
1414 fs->segtrack->total / 2) {
1415 /* Have enough free/clean entries already */
1416 spin_unlock(&fs->lock);
1420 ss = seg_add_new(fs->segtrack, &fs->segtrack->clean, 0,
1421 dev, seg, 0, 0, where);
1423 spin_unlock(&fs->lock);
1430 void lafs_add_active(struct fs *fs, u64 addr)
1432 /* add this segment to the table as 'active'.
1433 * This is only called once at mount time for the
1434 * active segments. Other segments become active
1438 u16 *where[SEG_NUM_HEIGHTS];
1439 struct segsum *ssum = segsum_byaddr(fs, addr, 0);
1440 (void)getdref(ssum->ssblk, MKREF(intable));
1441 (void)getdref(ssum->youthblk, MKREF(intable));
1442 spin_lock(&fs->lock);
1443 ss = segfind(fs->segtrack, ssum->devnum, ssum->segnum, where);
1445 ss = seg_add_new(fs->segtrack, NULL, 0,
1446 ssum->devnum, ssum->segnum,
1447 SCORE_ACTIVE, 0, where);
1450 spin_unlock(&fs->lock);
1453 void lafs_clean_free(struct fs *fs)
1455 /* We are finishing off a checkpoint. Move all from 'clean'
1456 * list to 'free' list, and set the youth for each to 0.
1457 * Note that we can walk the 'clean' list without locking as
1458 * items are only ever removed by this thread.
1462 if (fs->segtrack->clean.cnt == 0)
1464 for (ssn = fs->segtrack->clean.first ; ssn != 0xffff; ssn = ss->next) {
1465 struct datablock *db;
1467 ss = segfollow(fs->segtrack, ssn);
1468 spin_lock(&fs->lock);
1469 fs->free_blocks += fs->devs[ss->dev].segment_size;
1470 spin_unlock(&fs->lock);
1471 db = lafs_get_block(fs->devs[ss->dev].segsum,
1472 ss->segment >> (fs->blocksize_bits
1474 NULL, GFP_KERNEL | __GFP_NOFAIL,
1476 err = lafs_read_block(db);
1478 err = lafs_reserve_block(&db->b, AccountSpace);
1480 u16 *b = map_dblock(db);
1481 spin_lock(&fs->stable_lock);
1482 b[ss->segment & ((fs->blocksize-1)>>YOUTH_SHIFT)] = 0;
1483 spin_unlock(&fs->stable_lock);
1484 unmap_dblock(db, b);
1485 lafs_dirty_dblock(db);
1487 /* FIXME make filesystem read-only */
1489 putdref(db, MKREF(cleanfree));
1491 /* Now move them to the 'free' list */
1492 spin_lock(&fs->lock);
1493 fs->segtrack->free.cnt += fs->segtrack->clean.cnt;
1494 if (fs->segtrack->free.last == 0xFFFF)
1495 fs->segtrack->free.first = fs->segtrack->clean.first;
1497 segfollow(fs->segtrack, fs->segtrack->free.last)->next
1498 = fs->segtrack->clean.first;
1499 fs->segtrack->free.last = fs->segtrack->clean.last;
1500 fs->segtrack->clean.first = 0xFFFF;
1501 fs->segtrack->clean.last = 0xFFFF;
1502 fs->segtrack->clean.cnt = 0;
1503 spin_unlock(&fs->lock);
1506 void lafs_dump_cleanable(void)
1509 struct segtracker *st;
1513 printk("No cleanable table\n");
1517 printk("============= Cleanable table (%d) =================\n",
1519 printk("pos: dev/seg usage score\n");
1522 for (ssn = st->cleanable.first; ssn != 0xffff; ssn = ss->next) {
1523 ss = segfollow(st, ssn);
1525 printk("%3d: %3d/%-4d %5d %lld\n",
1532 printk("...sorted....\n");
1533 segsort(st, &st->cleanable);
1535 for (ssn = st->cleanable.first; ssn != 0xffff; ssn = ss->next) {
1536 ss = segfollow(st, ssn);
1538 printk("%3d: %3d/%-4d %5d %lld\n",
1545 printk("--------------- Free table (%d) ---------------\n",
1548 for (ssn = st->free.first; ssn != 0xffff; ssn = ss->next) {
1549 ss = segfollow(st, ssn);
1551 printk("%3d: %3d/%-4d %5d %lld\n",
1558 printk("--------------- Clean table (%d) ---------------\n",
1561 for (ssn = st->clean.first; ssn != 0xffff; ssn = ss->next) {
1562 ss = segfollow(st, ssn);
1564 printk("%3d: %3d/%-4d %5d %lld\n",
1571 printk("--------\n");
1572 printk("free_blocks=%lld allocated=%llu max_seg=%llu clean_reserved=%llu\n",
1573 dfs->free_blocks, dfs->allocated_blocks, dfs->max_segment,
1574 dfs->clean_reserved);
1577 int lafs_get_cleanable(struct fs *fs, u16 *dev, u32 *seg)
1579 /* Return the most cleanable segment on the list.
1580 * The segstat is removed from the 'cleanable' list
1581 * and left in the table. This prevents it being re-added.
1582 * When it is later found to be clean, it will be added
1583 * to the 'clean' list.
1585 struct segtracker *st = fs->segtrack;
1589 lafs_check_seg_cnt(st);
1592 spin_lock(&fs->lock);
1593 if (st->cleanable.first == 0xFFFF) {
1594 spin_unlock(&fs->lock);
1598 if (st->sorted_size < st->cleanable.cnt / 2) {
1599 segsort(st, &st->cleanable);
1600 st->sorted_size = st->cleanable.cnt;
1601 st->max_score = segfollow(st, st->cleanable.last)->score;
1604 lafs_check_seg_cnt(fs->segtrack);
1606 ss = segfollow(st, seg_pop(st, &st->cleanable));
1608 if (lafs_check_seg_cnt(fs->segtrack)) {
1610 printk("first=%x last=%x cnt=%d\n", st->cleanable.first,
1611 st->cleanable.last, st->cleanable.cnt);
1612 for (sj = st->cleanable.first ;
1614 sj = segfollow(st, sj)->next)
1615 printk(" %x\n", sj);
1625 dprintk("SEG: cleanable %d/%d score=%llu usage=%d\n",
1626 ss->dev, ss->segment, (unsigned long long)ss->score, ss->usage);
1628 ss->score = SCORE_CLEANING;
1629 if (ss->usage == 0) {
1631 spin_unlock(&fs->lock);
1632 rv = add_clean(fs, *dev, *seg, true);
1633 BUG_ON(rv); /* passing 'true' makes this impossible */
1636 segdelete(fs->segtrack, ss);
1637 spin_unlock(&fs->lock);
1639 ssm = segsum_find(fs, ss->segment, ss->dev, 0);
1640 putdref(ssm->ssblk, MKREF(intable));
1641 putdref(ssm->youthblk, MKREF(intable));
1647 static int add_cleanable(struct fs *fs, unsigned int dev, u32 seg,
1648 u16 youth, u32 usage)
1653 u16 *where[SEG_NUM_HEIGHTS];
1655 lafs_check_seg_cnt(fs->segtrack);
1656 if (fs->scan.trace || lafs_trace || 1)
1657 printk("CLEANABLE: %u/%lu y=%d u=%d\n",
1658 dev, (unsigned long)seg, (int)youth, (int)usage);
1661 if (youth >= fs->checkpoint_youth)
1664 segsize = fs->devs[dev].segment_size;
1665 fs->total_free += segsize - usage /* - 1 */;
1668 int rv = add_clean(fs, dev, seg, false);
1669 lafs_check_seg_cnt(fs->segtrack);
1673 if (test_bit(EmergencyClean, &fs->fsstate))
1676 /* 0x100000000 is to ensure this score is always
1677 * more than the above score */
1678 score = (u64)youth * usage;
1679 do_div(score, segsize);
1680 score += 0x100000000;
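
/* Worked example (sketch): youth = 0x8000 with usage at half of
 * segsize gives score = 0x4000 + 0x100000000 = 0x100004000, so
 * older (lower youth) and emptier (lower usage) segments score
 * lower and sort to the front of the cleanable list.
 */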
1683 spin_lock(&fs->lock);
1684 if (score > SCORE_MAX)
1687 ss = segfind(fs->segtrack, dev, seg, where);
1689 /* already in the table. Just update the usage.
1690 * It must be on the right list.
1692 if (ss->score == SCORE_ACTIVE)
1693 ; /* still in use */
1694 else if (ss->usage == 0 && ss->score > 0)
1695 ; /* on free list, leave it alone */
1700 lafs_check_seg_cnt(fs->segtrack);
1701 spin_unlock(&fs->lock);
1705 if (fs->segtrack->cleanable.cnt >= fs->segtrack->total/2) {
1706 /* Table of cleanable segments is full. Sort and discard
1712 segsort(fs->segtrack, &fs->segtrack->cleanable);
1713 ssnum = fs->segtrack->cleanable.first;
1714 keep = (fs->segtrack->cleanable.cnt+1) / 2;
1715 for (i = 0; i < keep; i++) {
1717 ss = segfollow(fs->segtrack, ssnum);
1720 fs->segtrack->max_score = ss->score;
1722 fs->segtrack->cleanable.last = prev;
1723 fs->segtrack->cleanable.cnt = keep;
1724 ss = segfollow(fs->segtrack, ssnum);
1726 ss->score = SCORE_DEAD;
1727 ss = segfollow(fs->segtrack, ss->next);
1729 segdelete_all(fs->segtrack, fs);
1730 fs->segtrack->sorted_size = fs->segtrack->total/2;
1732 if (fs->segtrack->cleanable.cnt <= fs->segtrack->total/4)
1733 fs->segtrack->max_score = 0;
1734 if (fs->segtrack->max_score &&
1735 fs->segtrack->max_score < score) {
1736 /* score too high to bother with */
1737 lafs_check_seg_cnt(fs->segtrack);
1738 spin_unlock(&fs->lock);
1742 seg_add_new(fs->segtrack, &fs->segtrack->cleanable, 1,
1743 dev, seg, score, usage, where);
1744 spin_unlock(&fs->lock);
1748 static void merge_usage(struct fs *fs, u32 *d)
1750 u32 *u = fs->scan.free_usages;
1751 int segperblk = fs->blocksize >> USAGE_SHIFT;
1754 for (i = 0; i < segperblk; i++)
1755 if (le32_to_cpu(d[i]) > le32_to_cpu(u[i]))
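
/* e.g. (illustrative values) if free_usages holds {4, 0, 9} and the
 * snapshot block holds {2, 7, 3}, the running max becomes {4, 7, 9}.
 */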
1759 unsigned long lafs_scan_seg(struct fs *fs)
1761 /* FIXME this comment is very out-dated */
1762 /* Process one block of youth or segment-usage data. We
1763 * collect free segments (youth==0) into a table that is kept
1764 * sorted to ensure against duplicates. It is treated like a
1765 * ring buffer with a head and a tail to distinguish free
1766 * space from used space. head/tail point to the free space
1767 * (which is never empty). As we scan all segments
1768 * sequentially any free segment we find is placed at the head
1769 * of the free list and the entry just after the tail might
1770 * get discarded if we run out of room, or if that entry
1771 * matches the entry we just inserted. New free segments are
1772 * allocated from just after the tail. i.e. we increment the
1773 * tail and return the entry recorded there. If tail+1 ==
1774 * head, there are no segments in the buffer.
1776 * We also collect potentially cleanable segments. These are any
1777 * segments which are not empty and not non-logged (i.e. youth >= 8).
1778 * We record them in a table and whenever the table gets full, we
1779 * sort the table by score, and discard the less-cleanable half.
1780 * We only add segments to the table when it is less than half full,
1781 * or when the new segment has a higher score than the segment at
1782 * the half-way mark.
1784 * To get the score for a segment, we need its youth and its usage.
1785 * The usage is the maximum usage from all snapshots.
1786 * We allocate a block to hold the current max usage.
1788 * load the youth table, find free segments, decay youth if needed.
1789 * load the first usage table and copy to temporary block
1790 * repeat: load other usage tables and merge
1791 * calculate score for each segment and add those with a good score
1793 * On the first pass through, we count all the free segments into
1794 * fs->free_blocks.
1796 * On the first pass, we make a full pass without deliberately
1797 * waiting. In subsequent passes ... only once per checkpoint???
1801 return MAX_SCHEDULE_TIMEOUT;
1803 dprintk("scan: dev=%d block=%d stage=%d\n",
1804 fs->scan.free_dev, fs->scan.free_block, fs->scan.free_stage);
1805 if (fs->scan.free_stage == 0) {
1806 /* Need to find the youth block for the next dev/offset.
1807 * Possibly we are finished with this dev and must go
1808 * on to the next. Possibly we are finished altogether.
1810 int dev = fs->scan.free_dev;
1811 int block = fs->scan.free_block + 1;
1812 int youthblock = block >> (USAGE_SHIFT - YOUTH_SHIFT);
1816 block > (fs->devs[dev].segment_count
1817 >> (fs->blocksize_bits - 1))) {
1820 if (dev >= fs->devices) {
1821 fs->scan.free_dev = -1;
1823 fs->scan.do_decay = 0;
1825 fs->total_free_prev = fs->total_free;
1827 fs->scan.first_free_pass = 0;
1831 if (fs->scan.youth_db)
1832 if (fs->scan.youth_db->b.fileaddr != youthblock ||
1834 fs->scan.youth_db->b.inode != fs->devs[dev].segsum) {
1835 putdref(fs->scan.youth_db, MKREF(youth_scan));
1836 fs->scan.youth_db = NULL;
1840 lafs_wake_thread(fs);
1841 return MAX_SCHEDULE_TIMEOUT;
1844 if (fs->scan.youth_db == NULL)
1846 lafs_get_block(fs->devs[dev].segsum,
1848 NULL, GFP_KERNEL, MKREF(youth_scan));
1849 if (!fs->scan.youth_db) {
1850 printk("EEEEEKKKKK get_block failed\n");
1851 return MAX_SCHEDULE_TIMEOUT;
1853 switch (err = lafs_read_block_async(fs->scan.youth_db)) {
1856 printk("EEEEEKKKKK read of youth block failed\n");
1859 return MAX_SCHEDULE_TIMEOUT;
1864 if (fs->scan.do_decay) {
1865 /* youth_db must be writable */
1866 struct datablock *db = fs->scan.youth_db;
1867 lafs_checkpoint_lock(fs);
1868 set_bit(B_PinPending, &db->b.flags);
1869 lafs_pin_dblock(db, AccountSpace);
1871 spin_lock(&fs->lock);
1872 fs->scan.free_block = block;
1873 fs->scan.free_dev = dev;
1874 if (!err && fs->scan.do_decay &&
1875 youthblock << (USAGE_SHIFT - YOUTH_SHIFT) == block) {
1876 u16 *yp = map_dblock(fs->scan.youth_db);
1878 int segperblk = fs->blocksize >> YOUTH_SHIFT;
1880 for (i = 0 ; i < segperblk ; i++) {
1881 int y = le16_to_cpu(yp[i]);
1885 unmap_dblock(fs->scan.youth_db, yp);
1886 lafs_dirty_dblock(fs->scan.youth_db);
1888 spin_unlock(&fs->lock);
1889 if (fs->scan.do_decay)
1890 lafs_checkpoint_unlock(fs);
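
/* (The decay sweep is restartable: scan.free_dev/free_block,
 * updated under fs->lock above, record its progress, and
 * set_youth() consults them so youth values handed out for
 * not-yet-swept blocks stay consistent; see the "revert decay"
 * branch in set_youth().)
 */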
1893 fs->scan.free_stage = 1;
1895 lafs_check_seg_cnt(fs->segtrack);
1896 if (fs->scan.free_stage == 1) {
1897 /* Find the main usage block and copy the data into
1900 struct datablock *db;
1906 int segperblk = fs->blocksize >> USAGE_SHIFT;
1907 int segments = segperblk;
1911 db = lafs_get_block(fs->devs[fs->scan.free_dev].segsum,
1912 fs->scan.free_block +
1913 fs->devs[fs->scan.free_dev].tablesize,
1914 NULL, GFP_KERNEL, MKREF(usage0));
1916 printk("EEEEKKK get_block for first usage failed\n");
1918 fs->scan.free_stage = 0;
1921 switch (lafs_read_block_async(db)) {
1924 printk("EEEEKKK read of usage block failed\n");
1925 putdref(db, MKREF(usage0));
1928 putdref(db, MKREF(usage0));
1929 return MAX_SCHEDULE_TIMEOUT;
1934 memcpy(fs->scan.free_usages, d, fs->blocksize);
1935 unmap_dblock(db, d);
1937 /* Now add any free segments */
1938 segcount = fs->devs[fs->scan.free_dev].segment_count;
1939 blks = segcount / segments;
1940 if (fs->scan.free_block == blks)
1941 segments = segcount % segperblk;
1942 firstseg = fs->scan.free_block * segperblk;
1944 yp0 = yp = map_dblock(fs->scan.youth_db);
1945 yp += (fs->scan.free_block -
1946 (fs->scan.youth_db->b.fileaddr << (USAGE_SHIFT - YOUTH_SHIFT)))
1948 for (i = 0; i < segments ; i++)
1949 if (yp[i] == cpu_to_le16(0)) {
1950 if (fs->scan.first_free_pass) {
1951 spin_lock(&fs->lock);
1953 fs->devs[fs->scan.free_dev]
1955 spin_unlock(&fs->lock);
1957 if (add_free(fs, fs->scan.free_dev, firstseg + i,
1959 /* Everything in the table owns a reference
1960 * to youth and segusage[0]
1962 (void)getdref(fs->scan.youth_db, MKREF(intable));
1963 (void)getdref(db, MKREF(intable));
1966 fs->devs[fs->scan.free_dev]
1967 .segment_size /*- 1*/;
1969 unmap_dblock(fs->scan.youth_db, yp0);
1971 fs->scan.usage0_db = db;
1972 fs->scan.free_stage = 2;
1974 lafs_check_seg_cnt(fs->segtrack);
1975 while (fs->scan.free_stage > 1 &&
1976 fs->scan.free_stage < fs->maxsnapshot + 1) {
1977 struct datablock *db;
1980 if (fs->ss[fs->scan.free_stage-1].root == NULL) {
1981 fs->scan.free_stage++;
1984 /* Load the usage block for this snapshot and merge */
1986 db = lafs_get_block(fs->devs[fs->scan.free_dev].segsum,
1987 fs->scan.free_block +
1988 fs->devs[fs->scan.free_dev].tablesize *
1989 LAFSI(fs->ss[fs->scan.free_stage-1].root)
1991 NULL, GFP_KERNEL, MKREF(usage_ss));
1993 printk("EEEEKKK get_block for subsequent usage failed\n");
1995 fs->scan.free_stage = 0;
1996 putdref(fs->scan.usage0_db, MKREF(usage0));
1997 fs->scan.usage0_db = NULL;
2000 switch (lafs_read_block_async(db)) {
2003 printk("EEEEKKK read of subsequent usage block failed\n");
2004 putdref(db, MKREF(usage_ss));
2007 putdref(db, MKREF(usage_ss));
2008 return MAX_SCHEDULE_TIMEOUT;
2014 unmap_dblock(db, d);
2015 fs->scan.free_stage++;
2016 putdref(db, MKREF(usage_ss));
2018 lafs_check_seg_cnt(fs->segtrack);
2020 if (fs->scan.free_stage == fs->maxsnapshot + 1) {
2021 /* All usage data has been merged, so we can record all these
2022 * cleanable segments now
2024 u16 *yp = map_dblock(fs->scan.youth_db);
2026 u32 *up = fs->scan.free_usages;
2028 int segperblk = fs->blocksize >> USAGE_SHIFT;
2029 int segments = segperblk;
2030 int segcount = fs->devs[fs->scan.free_dev].segment_count;
2031 int blks = segcount / segments;
2032 if (fs->scan.free_block == blks)
2033 segments = segcount % segperblk;
2035 yp += (fs->scan.free_block -
2036 (fs->scan.youth_db->b.fileaddr << (USAGE_SHIFT - YOUTH_SHIFT)))
2038 for (i = 0; i < segments; i++)
2039 if (add_cleanable(fs, fs->scan.free_dev,
2040 i + fs->scan.free_block * segperblk,
2042 le32_to_cpu(up[i]))) {
2043 /* Every segment in table owns these
2046 (void)getdref(fs->scan.youth_db, MKREF(intable));
2047 (void)getdref(fs->scan.usage0_db, MKREF(intable));
2050 unmap_dblock(fs->scan.youth_db, yp0);
2051 putdref(fs->scan.usage0_db, MKREF(usage0));
2052 fs->scan.usage0_db = NULL;
2053 fs->scan.free_stage = 0;
2055 lafs_check_seg_cnt(fs->segtrack);
2058 else if (fs->scan.free_stage == 0)
2061 return MAX_SCHEDULE_TIMEOUT;
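
/* Stage summary (sketch of the above): free_stage 0 locates the youth
 * block for the next dev/offset (decaying it when a decay sweep is
 * active); stage 1 copies the main usage block and harvests free
 * segments; stages 2..maxsnapshot merge each snapshot's usage into the
 * running max; stage maxsnapshot + 1 scores every segment in the block
 * via add_cleanable() and resets to stage 0.
 */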
2064 void lafs_empty_segment_table(struct fs *fs)
2066 /* remove everything from the table. */
2070 ssnum = fs->segtrack->cleanable.first;
2071 fs->segtrack->cleanable.first = 0xffff;
2072 fs->segtrack->cleanable.last = 0xffff;
2073 fs->segtrack->cleanable.cnt = 0;
2074 while (ssnum != 0xffff) {
2075 ss = segfollow(fs->segtrack, ssnum);
2076 ss->score = SCORE_DEAD;
2080 ssnum = fs->segtrack->clean.first;
2081 fs->segtrack->clean.first = 0xffff;
2082 fs->segtrack->clean.last = 0xffff;
2083 fs->segtrack->clean.cnt = 0;
2084 while (ssnum != 0xffff) {
2085 ss = segfollow(fs->segtrack, ssnum);
2086 ss->score = SCORE_DEAD;
2090 ssnum = fs->segtrack->free.first;
2091 fs->segtrack->free.first = 0xffff;
2092 fs->segtrack->free.last = 0xffff;
2093 fs->segtrack->free.cnt = 0;
2094 while (ssnum != 0xffff) {
2095 ss = segfollow(fs->segtrack, ssnum);
2096 ss->score = SCORE_DEAD;
2100 segdelete_all(fs->segtrack, fs);
2103 void lafs_dump_usage(void)
2107 dfs->scan.trace = 1;
2109 lafs_wake_thread(dfs);