/*
 * segment tracking routines for LaFS
 *
 * Copyright (C) 2006-2009
 * NeilBrown <neilb@suse.de>
 * Released under the GPL, version 2
 *
 * This file tracks:
 * - active blocks in segment usage files
 * - next-phase usage updates
 * - youth tracking and decay
 * - list of segments that could be cleaned
 * - list of free segments that could be reused
 * - tracking space allocation and available free space
 *
 * Segment usage blocks:
 * Every Pinned, Dirty or Realloc block with physaddr != 0 owns a reference
 * on the block in the segment usage file that counts that
 * block.  When the physaddr is changed, the new block will always
 * be available, so the reference is simply moved.
 *
 * Normally when ->physaddr changes, the counts in the relevant
 * segusage blocks are updated directly.  However when the change is
 * for a write in the next phase (during a checkpoint) that is not
 * possible, as the segusage blocks need to be written with the old value.
 * So we track these updates in a simple data structure that
 * records a delta against each ss+dev+seg.  If such a data structure
 * cannot be allocated (-ENOMEM) the associated write is blocked until
 * the checkpoint completes.  FIXME - implement that.
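 *
 * A sketch of that delayed-update path as implemented below (the
 * delta lives in 'struct segsum' as the atomic 'delayed' counter):
 *
 *	seg_inc(fs, ss, diff, 0);	// out of phase: ss->delayed += diff
 *	...				// checkpoint completes
 *	lafs_seg_apply_all(fs);		// folds each delta into its block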
 *
 * Every Pinned or Dirty block holds a number of space credits,
 * following a pattern described elsewhere.  The space allocation
 * unit identifies segments to be used in advance and holds a reference
 * on them.
 *
 * Youth:
 * Youth numbers are handed out sequentially to new write clusters.
 * When the available space approaches being full, all youth numbers are
 * decayed (as differentiation between old youth values is uninteresting).
 * This decay is done in a staged fashion, one block at a time, so that
 * a checkpoint can happen at any time and the current block in the
 * decay sweep is recorded.
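 *
 * Sketch of the moving parts used below: a new cluster's segment gets
 * 'youth = fs->youth_next++', and the decay sweep's position is the
 * (fs->scan.free_dev, fs->scan.free_block) cursor, with
 * fs->scan.do_decay set while a sweep is in progress.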
 *
 * Free segments:
 * Holding a record of all free segments is impractical in general.
 * Instead we record a limited number of free segments (1 page worth -
 * 512 entries, possibly with ranges) and when needed we scan the
 * segment usage files for free segments and add them to the list.
 * This scan needs to be rate-limited - not sure how best to do that.
 *
 * Cleanable segments:
 * Similar to free segments, we need to keep track of cleanable segments.
 * Almost all non-free segments are cleanable, and we assign a weight to
 * each based on youth and usage.  We want to keep the most cleanable
 * segments based on this weight.
 * We keep an unordered list of cleanable segments.  When it becomes
 * full we sort it by weight and discard the worst half.
 *
 * Snapshots:
 * As there could be hundreds of snapshots (hopefully not thousands)
 * we cannot load the segusage block for each snapshot and then parse them
 * in parallel.  Instead we allocate space to store a max usage and
 * merge each block, one at a time, into that max.  We then combine the
 * max with the youth to get a 32bit weight... I wonder if that is good.
 */
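/*
 * Score sketch (see merge_usage() and add_cleanable() below): usage is
 * the element-wise max over all snapshot tables, and the weight is
 *	score = youth * usage / segment_size + 0x10000;
 * with the 0x10000 offset dropped during an emergency clean so that
 * emergency scores always sort first.  Low score == most cleanable.
 */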
/* Every block that could change its ->physaddr holds a reference
 * on the segment summary block through this structure.
 */

#include "lafs.h"
#include <linux/hash.h>
#include <linux/random.h>
#include <linux/slab.h>
struct segsum {
	u32	segnum;
	int	devnum;
	int	ssnum;
	struct hlist_node hash;
	atomic_t refcnt;
	atomic_t delayed;	/* usage updates deferred to the next phase */
	struct datablock *ssblk;
	/* youthblk is only set when ssnum is 0 and
	 * usage is zero, i.e. when we are cleaning.
	 */
	struct datablock *youthblk;
};
static int shash(u32 segnum, int devnum, int ssnum)
{
	unsigned long h = hash_long(segnum, BITS_PER_LONG);
	return hash_long(h ^ (devnum | (ssnum << 16)), SHASHBITS);
}
static inline void ss_get(struct segsum *ss)
{
	BUG_ON(atomic_read(&ss->refcnt) == 0);
	atomic_inc(&ss->refcnt);
}
static void ss_put(struct segsum *ss, struct fs *fs)
{
	if (atomic_dec_and_lock(&ss->refcnt, &fs->stable_lock)) {
		if (atomic_read(&ss->delayed) != 0)
			/* not really the last reference: a delayed
			 * update still has to be applied */
			spin_unlock(&fs->stable_lock);
		else {
			if (!hlist_unhashed(&ss->hash))
				hlist_del(&ss->hash);
			spin_unlock(&fs->stable_lock);
			putdref(ss->ssblk, MKREF(ss));
			putdref(ss->youthblk, MKREF(ssyouth));
			kfree(ss);
		}
	}
}
static struct segsum *segsum_find(struct fs *fs, u32 segnum,
				  int devnum, int ssnum)
{
	struct hlist_head *head = &fs->stable[shash(segnum, devnum, ssnum)];
	struct segsum *ss, *new = NULL;
	struct hlist_node *n;
	struct fs_dev *dv;
	u64 addr;
	int err;

retry:
	spin_lock(&fs->stable_lock);
	hlist_for_each_entry(ss, n, head, hash)
		if (ss->segnum == segnum &&
		    ss->ssnum == ssnum &&
		    ss->devnum == devnum) {
			atomic_inc(&ss->refcnt);
			spin_unlock(&fs->stable_lock);
			if (new)
				ss_put(new, fs);
			if (ss->ssblk &&
			    !test_bit(B_Valid, &ss->ssblk->b.flags) &&
			    (fs->rolled || fs->qphase == fs->phase)) {
				/* need to read this now that roll forward
				 * has progressed far enough
				 */
				err = lafs_read_block(ss->ssblk);
				if (err) {
					ss_put(ss, fs);
					return NULL;
				}
			}
			return ss;
		}
	if (new) {
		hlist_add_head(&new->hash, head);
		spin_unlock(&fs->stable_lock);
		return new;
	}
	spin_unlock(&fs->stable_lock);

	/* seems we must allocate something */
	new = kmalloc(sizeof(*new), GFP_KERNEL | __GFP_NOFAIL);
	new->segnum = segnum;
	new->devnum = devnum;
	new->ssnum = ssnum;
	atomic_set(&new->refcnt, 1);
	atomic_set(&new->delayed, 0);
	INIT_HLIST_NODE(&new->hash);
	dv = fs->devs + devnum;
	addr = LAFSI(fs->ss[ssnum].root)->md.fs.usagetable * dv->tablesize;
	addr += segnum >> (fs->blocksize_bits-1);
	new->ssblk = lafs_get_block(dv->segsum, addr, NULL,
				    GFP_KERNEL | __GFP_NOFAIL, MKREF(ss));
	if (ssnum == 0)
		new->youthblk = lafs_get_block(dv->segsum,
					       segnum >> (fs->blocksize_bits-1),
					       NULL, GFP_KERNEL | __GFP_NOFAIL,
					       MKREF(ssyouth));
	else
		new->youthblk = NULL;
	BUG_ON(!new->ssblk || IS_ERR(new->ssblk) || IS_ERR(new->youthblk));
	if (!fs->rolled && fs->qphase != fs->phase) {
		/* Too early to safely read segusage blocks,
		 * leave it until later
		 */
		goto retry;
	}
	err = lafs_read_block(new->ssblk);
	if (err) {
		ss_put(new, fs);
		return NULL;
	}
	goto retry;
}
static struct segsum *segsum_byaddr(struct fs *fs, u64 addr, int ssnum)
{
	unsigned int dev;
	u32 seg;
	u32 offset;

	virttoseg(fs, addr, &dev, &seg, &offset);
	return segsum_find(fs, seg, dev, ssnum);
}
void lafs_seg_put_all(struct fs *fs)
{
	/* remove all the ssblk and youthblk references from
	 * the segsum hash table.
	 * This is called when the filesystem is being unmounted.
	 */
	int i;

	for (i = 0; i < SHASHSIZE; i++)
		if (!hlist_empty(&fs->stable[i])) {
			struct segsum *ss;
			struct datablock *b, *y;
			struct hlist_node *n;
			/* FIXME this is quadratic */
		restart:
			spin_lock(&fs->stable_lock);
			hlist_for_each_entry(ss, n, &fs->stable[i], hash) {
				b = ss->ssblk;
				y = ss->youthblk;
				ss->ssblk = NULL;
				ss->youthblk = NULL;
				if (!b && !y)
					continue;
				spin_unlock(&fs->stable_lock);
				if (b && test_and_clear_bit(B_Async, &b->b.flags))
					putdref(b, MKREF(async));
				if (y && test_and_clear_bit(B_Async, &y->b.flags))
					putdref(y, MKREF(async));
				putdref(b, MKREF(ss));
				putdref(y, MKREF(ssyouth));
				goto restart;
			}
			spin_unlock(&fs->stable_lock);
		}
}
/* We need to set SegRef on this block, and all parents,
 * and the segsum block and all parents, and so on.
 * We find the first ancestor that does not have SegRef, then
 * find the matching segsum block.  If it already
 * has SegRef, we add the reference and retry from the start.
 * If it doesn't have SegRef, we prealloc and then tail recurse.
 * Note: we never set SegRef on B_Root blocks as their usage
 * is accounted separately.
 */
int lafs_seg_ref_block(struct block *b, int ssnum)
{
	struct fs *fs = fs_from_inode(b->inode);

	LAFS_BUG(test_bit(B_InoIdx, &b->flags), b);
	getref(b, MKREF(segref));

	while (!test_bit(B_SegRef, &b->flags)) {
		struct inode *ino = b->inode;
		struct block *b2 = b;
		struct block *p;
		struct segsum *ss;
		int err;

		BUG_ON(test_bit(B_Root, &b->flags));

		/* Need to check parents */
		spin_lock(&ino->i_data.private_lock);
		for (p = b;
		     p && !test_bit(B_SegRef, &p->flags);
		     p = &(p->parent)->b) {
			if (test_bit(B_InoIdx, &p->flags)) {
				struct datablock *db = LAFSI(p->inode)->dblock;

				spin_unlock(&ino->i_data.private_lock);
				ino = db->b.inode;
				spin_lock(&ino->i_data.private_lock);
			}
			if (test_bit(B_SegRef, &p->flags))
				break;
			if (!test_bit(B_Root, &p->flags))
				b2 = p;
		}
		spin_unlock(&ino->i_data.private_lock);
		/* b2 is the first ancestor (closest to root) without SegRef */
		/* FIXME we have an implicit reference on b2
		 * through b->parent...
		 * But if a split happens, b2 might not be a
		 * true parent any more, so we no longer have a
		 * reference.  If that can actually happen,
		 * we need to take a reference before dropping the lock.
		 * If it cannot, then we don't need the lock.
		 */
		if (b2->physaddr == 0) {
			/* There is no segsum to reference */
			set_bit(B_SegRef, &b2->flags);
			continue;
		}
		ss = segsum_byaddr(fs, b2->physaddr, ssnum);
		if (!ss) {
			putref(b, MKREF(segref));
			return -ENOMEM;
		}
		if (&ss->ssblk->b == b) {
			/* Don't need to check for Prealloc or
			 * SegRef, just take a reference now
			 */
			if (test_and_set_bit(B_SegRef, &b2->flags))
				ss_put(ss, fs);
			continue;
		}
		/* Don't need iolock here as segusage is very special */
		set_bit(B_PinPending, &ss->ssblk->b.flags);
		err = lafs_pin_dblock(ss->ssblk, AccountSpace);
		if (err) {
			ss_put(ss, fs);
			putref(b, MKREF(segref));
			return err;
		}
		if (!test_bit(B_SegRef, &ss->ssblk->b.flags)) {
			/* tail recurse: reference the segsum block first */
			putref(b, MKREF(segref));
			b = getref(&ss->ssblk->b, MKREF(segref));
		} else if (test_and_set_bit(B_SegRef, &b2->flags))
			ss_put(ss, fs);
	}
	putref(b, MKREF(segref));
	return 0;
}
void lafs_seg_deref(struct fs *fs, u64 physaddr, int ssnum)
{
	struct segsum *ss = segsum_byaddr(fs, physaddr, ssnum);

	BUG_ON(atomic_read(&ss->refcnt) < 2);
	/* drop the lookup reference and the reference being released */
	ss_put(ss, fs);
	ss_put(ss, fs);
}
static void seg_inc(struct fs *fs, struct segsum *ss, int diff, int in_phase)
{
	u16 *b, *p;

	if (!in_phase) {
		/* delay the update until the checkpoint completes */
		atomic_add(diff, &ss->delayed);
		return;
	}
	b = map_dblock(ss->ssblk);
	spin_lock(&fs->stable_lock);
	p = &b[ss->segnum & ((fs->blocksize-1)>>1)];
	//BUG_ON(diff < 0 && le16_to_cpu(*p) < -diff);
	if (diff < 0 && le16_to_cpu(*p) < -diff) {
		printk("diff=%d p=%d segnum=%d\n", diff, le16_to_cpu(*p),
		       (int)ss->segnum);
		BUG();
	}
	*p = cpu_to_le16(le16_to_cpu(*p) + diff);
	spin_unlock(&fs->stable_lock);
	unmap_dblock(ss->ssblk, b);
	lafs_dirty_dblock(ss->ssblk);
}
void lafs_seg_move(struct fs *fs, u64 oldaddr, u64 newaddr,
		   int ssnum, int phase, int moveref)
{
	struct segsum *ss;

	dprintk("SEGMOVE %llu %llu\n",
		(unsigned long long) oldaddr,
		(unsigned long long) newaddr);
	if (newaddr) {
		ss = segsum_byaddr(fs, newaddr, ssnum);
		seg_inc(fs, ss, 1, fs->qphase == phase);
		if (!moveref)
			ss_put(ss, fs);
	}
	if (oldaddr) {
		ss = segsum_byaddr(fs, oldaddr, ssnum);
		seg_inc(fs, ss, -1, fs->qphase == phase);
		ss_put(ss, fs);
		if (moveref)
			ss_put(ss, fs);
	}
}
static void seg_apply(struct fs *fs, struct segsum *ss)
{
	int err;
	int diff = atomic_read(&ss->delayed);

	if (diff == 0)
		return;
	atomic_set(&ss->delayed, 0);
	dprintk("Seg apply %d %d\n", (int)ss->segnum, diff);
	err = lafs_read_block(ss->ssblk);
	BUG_ON(err); // FIXME do something useful here
	seg_inc(fs, ss, diff, 1);
}
/* lafs_seg_apply_all
 * During a checkpoint, blocks written to the next phase cannot immediately
 * update the segment usage tables, so those updates need to be delayed.
 * This function applies all the delayed updates to the segment usage
 * files at the end of a checkpoint.
 */
void lafs_seg_apply_all(struct fs *fs)
{
	int i;

	for (i = 0 ; i < SHASHSIZE ; i++) {
		struct hlist_head *head = &fs->stable[i];
		struct segsum *ss;
		struct hlist_node *n, *pos;

		spin_lock(&fs->stable_lock);
		hlist_for_each_entry_safe(ss, pos, n, head, hash) {
			atomic_inc(&ss->refcnt);
			spin_unlock(&fs->stable_lock);
			seg_apply(fs, ss);
			ss_put(ss, fs);
			spin_lock(&fs->stable_lock);
			// FIXME this still isn't safe - 'n' could
			// disappear while unlocked.
		}
		spin_unlock(&fs->stable_lock);
	}
	/* Now any clean segments found earlier are free. */
	lafs_clean_free(fs);
}
/*
 * When creating a snapshot, we need to duplicate table[1] in
 * each segment usage file into table[n].
 * It would be nice to just copy block pointers, but this might
 * confuse cleaning etc., so duplicate the data for now.
 */
int lafs_seg_dup(struct fs *fs, int newtable)
{
	/* for each device,
	 * get blocks for each table and memcpy
	 */
	int d;
	int err = 0;

	for (d = 0; d < fs->devices ; d++) {
		struct fs_dev *dv = &fs->devs[d];
		int b;

		for (b = 0 ; b < dv->tablesize ; b++) {
			struct datablock *obl, *nbl;
			u16 *from, *to;
			u64 addr;

			addr = dv->tablesize + b;
			obl = lafs_get_block(dv->segsum, addr, 0,
					     GFP_KERNEL | __GFP_NOFAIL,
					     MKREF(ss));
			err = lafs_read_block(obl);
			if (err) {
				putdref(obl, MKREF(ss));
				return err;
			}
			addr = dv->tablesize * newtable + b;
			nbl = lafs_get_block(dv->segsum, addr, 0,
					     GFP_KERNEL | __GFP_NOFAIL,
					     MKREF(ss));
			from = map_dblock(obl);
			to = map_dblock_2(nbl);
			memcpy(to, from, fs->blocksize);
			unmap_dblock_2(nbl, to);
			unmap_dblock(obl, from);
			lafs_dirty_dblock(nbl);
			putdref(obl, MKREF(ss));
			putdref(nbl, MKREF(ss));
		}
	}
	return 0;
}
/*************************************************************
 * Space management: allocate, use, free
 *************************************************************/
static int count_credits(struct block *b)
{
	int credits = 0;
	struct indexblock *ib = iblk(b);
	struct inode *ino = NULL;

	if (test_bit(B_Credit, &b->flags))
		credits++;
	if (test_bit(B_ICredit, &b->flags))
		credits++;
	if (test_bit(B_NCredit, &b->flags))
		credits++;
	if (test_bit(B_NICredit, &b->flags))
		credits++;
	if (test_bit(B_UnincCredit, &b->flags))
		credits++;
	if (test_bit(B_Dirty, &b->flags))
		credits++;
	if (test_bit(B_Realloc, &b->flags))
		credits++;

	if (test_bit(B_Index, &b->flags)) {
		list_for_each_entry(b, &ib->children, siblings) {
			LAFS_BUG(b->parent != ib, b);
			credits += count_credits(b);
		}
		credits += ib->uninc_table.credits;
	} else if ((ino = rcu_my_inode(dblk(b))) != NULL &&
		   LAFSI(ino)->iblock &&
		   LAFSI(ino)->iblock->b.parent) {
		LAFS_BUG(LAFSI(ino)->iblock->b.parent != b->parent, b);
		credits += count_credits(&LAFSI(ino)->iblock->b);
	}
	rcu_iput(ino);
	return credits;
}
static void check_credits(struct fs *fs)
{
	/* DEBUG: walk the block tree,
	 * count credits in the tree, and make sure they match allocated_blocks
	 */
	int credits;

	if (fs->ss[0].root == NULL ||
	    LAFSI(fs->ss[0].root)->iblock == NULL ||
	    LAFSI(fs->ss[0].root)->dblock == NULL)
		return;
	credits = count_credits(&LAFSI(fs->ss[0].root)->iblock->b);
	credits += count_credits(&LAFSI(fs->ss[0].root)->dblock->b);
	if (credits + fs->cluster_updates + temp_credits != fs->allocated_blocks) {
		printk("credits=%d updates=%d temp=%d allocated=%d\n", credits,
		       fs->cluster_updates, temp_credits,
		       (int)fs->allocated_blocks);
		BUG();
	}
}
void lafs_space_return(struct fs *fs, int credits)
{
	spin_lock(&fs->alloc_lock);
	check_credits(fs);
	if (credits > fs->allocated_blocks)
		printk("cred=%d ab=%d\n", credits, (int)fs->allocated_blocks);
	BUG_ON(credits > fs->allocated_blocks);
	fs->allocated_blocks -= credits;
	spin_unlock(&fs->alloc_lock);
}
int lafs_alloc_cleaner_segs(struct fs *fs, int max)
{
	/* cleaner is about to start.
	 * See how many segments of space can safely
	 * be reserved for its use.
	 * We don't allocate more to the cleaner than we
	 * leave spare for new allocations.
	 */
	int rv = 0;
	int watermark = fs->max_segment * 4;

	spin_lock(&fs->alloc_lock);
	while (fs->clean_reserved < max * fs->max_segment &&
	       fs->free_blocks > (fs->clean_reserved
				  + fs->allocated_blocks
				  + watermark)) {
		fs->clean_reserved += fs->max_segment;
		fs->free_blocks -= fs->max_segment;
		rv++;
	}
	spin_unlock(&fs->alloc_lock);
	return rv;
}
int lafs_space_alloc(struct fs *fs, int credits, int why)
{
	/* 'why's for space reservation.
	 * NewSpace means we want to write a block which didn't exist
	 * before.  This is allowed to fail or block if the cleaner
	 * appears able to make progress.
	 * ReleaseSpace means we want to write a block which does currently
	 * exist, so doing this will eventually free up space.  This must
	 * never fail, but can block.
	 * CleanSpace means we want to write a block to relocate it to
	 * a 'cleaning' segment.  This may never fail.
	 * AccountSpace means we absolutely need this block now, and it is
	 * a BUG if there is no space available.
	 */
	int watermark = 0;

	spin_lock(&fs->alloc_lock);
	switch (why) {
	case NewSpace:
		watermark += RELEASE_RESERVED * fs->max_segment;
		/* fall through */
	case ReleaseSpace:
		watermark += ACCOUNT_RESERVED * fs->max_segment;
		break;
	case CleanSpace:
	case AccountSpace:
		/* Definitely no watermark here. */
		break;
	}

	if (fs->rolled && watermark) {
		/* We cannot account properly before roll-forward has
		 * completed.  FIXME once it has completed we need to
		 * check and invalidate the FS if there was a problem.
		 */
		if (fs->free_blocks < fs->allocated_blocks
		    + credits + watermark)
			credits = 0; /* Sorry, no room */
	}
	if (fs->rolled && watermark == 0) {
		/* When including the clean_reserved space, there should
		 * be room for these controlled allocations.
		 */
		if (fs->free_blocks + fs->clean_reserved <
		    fs->allocated_blocks + credits)
			BUG();
	}

	if (credits == 0) {
		/* about to fail or block - make sure the cleaner runs */
		if (why == NewSpace && !test_bit(EmergencyClean, &fs->fsstate))
			set_bit(EmergencyPending, &fs->fsstate);
		if (why >= ReleaseSpace || !test_bit(EmergencyClean, &fs->fsstate))
			if (!test_bit(CleanerBlocks, &fs->fsstate) ||
			    fs->cleaner.need > watermark + fs->max_segment) {
				fs->cleaner.need = watermark + fs->max_segment;
				set_bit(CleanerBlocks, &fs->fsstate);
				lafs_wake_thread(fs);
			}
	} else if (why == NewSpace)
		if (test_bit(EmergencyClean, &fs->fsstate) ||
		    test_bit(EmergencyPending, &fs->fsstate)) {
			clear_bit(EmergencyPending, &fs->fsstate);
			clear_bit(EmergencyClean, &fs->fsstate);
		}
	fs->allocated_blocks += credits;
	BUG_ON(fs->free_blocks + fs->clean_reserved < fs->allocated_blocks);
	spin_unlock(&fs->alloc_lock);
	return credits;
}
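/*
 * Typical (assumed) caller pattern: reserve credits up front, then
 * hand back whatever was not consumed:
 *
 *	int got = lafs_space_alloc(fs, wanted, ReleaseSpace);
 *	... dirty at most 'got' blocks ...
 *	lafs_space_return(fs, got - used);
 */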
/********************************************************************
 *
 * Segment usage / youth tracking
 *
 ********************************************************************/
/* Information about free and cleanable segments is stored in a table
 * comprised of a few pages.  Indexes into this table are 16 bits:
 * 4 bits for the page number, 12 bits for the index within the page.
 * Different pages have different sized entries to allow for different
 * depths in the skiplist that sorts entries by dev/segment.
 * Entries are at least 16 bytes, so a 12-bit index can address up to 64K.
 * Each entry is also on one of four lists:
 * - unused: this entry is not used
 * - free: this entry is for a free segment
 * - cleanable: this entry is for a cleanable segment.
 * - clean: segment has been cleaned but is not yet free (awaiting checkpoint).
 * These are singly linked lists (via 'next').  We record head and tail.
 * The end of each list is a pointer of 0xFFFF, not 0.
 * We usually remove entries from the head, but when the table is
 * full, we might walk part way down a list and discard all the rest.
 * We discard them by setting the score to 0xFFFFFFFF.  We then walk the
 * skip list, moving anything with a score of 0xFFFFFFFF to the unused list.
 *
 * We occasionally sort the cleanable list using a merge sort.
 *
 * There is a progression from 'cleanable' to 'clean' to 'free', though
 * segments can enter at any point.
 * When we choose a segment to clean, we remove the entry from the cleanable
 * list.  We do this by setting the score to 0xFFFFFFFE and unlinking it.
 * After cleaning has completed, a scan should find that it is clean and so
 * add it to the 'clean' list.
 * When we record a 'clean' segment as 'free' (after a checkpoint) we
 * move it from the clean list to the free list, from where it will be
 * removed when needed.
 * We don't add new free segments when the total of free+clean segments
 * is more than half the size of the table.
 * Similarly when the cleanable table reaches half the available size,
 * we remove the least-interesting half.
 * When we choose a free segment to start filling, we remove it from the
 * free list but not from the table.  The score is set to 0xFFFFFFFD to
 * record that it is unlinked but busy.  If it is found to be cleanable,
 * it will be ignored.  When we finish filling a segment, we find the entry
 * again and remove it properly so it can become cleanable later.
 */

#define SCORE_MAX	0xFFFFFFFC /* Maximum normal score */
#define SCORE_ACTIVE	0xFFFFFFFD /* This segment is being written to */
#define SCORE_CLEANING	0xFFFFFFFE /* This segment is being cleaned */
#define SCORE_DEAD	0xFFFFFFFF /* This segment is to be removed */
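/*
 * Index encoding sketch: for a 16-bit index 'sn',
 *	page = sn >> 12;	// 4 bits
 *	slot = sn & 0xFFF;	// 12 bits
 * and the byte offset of a slot depends on that page's skip-list
 * height, st->size[page] (see segfollow() below).
 */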
struct segstat {
	u16	dev;
	u16	usage;
	u32	segment;
	u32	score;
	u16	next;
	u16	skip[0]; /* or larger... */
};
static inline struct segstat *segfollow(struct segtracker *st, u16 link)
{
	char *a;

	if (link == 0xFFFF)
		return NULL;
	a = st->page[link >> 12];
	a += (link & 0xFFF) *
		(st->size[link >> 12] * sizeof(u16) + sizeof(struct segstat));
	return (struct segstat *) a;
}
int lafs_segtrack_init(struct segtracker *st)
{
	int i, h, p;
	int n[4];

	st->page[0] = kmalloc(PAGE_SIZE, GFP_KERNEL);
	st->page[1] = kmalloc(PAGE_SIZE, GFP_KERNEL);
	st->page[2] = kmalloc(PAGE_SIZE, GFP_KERNEL);
	st->page[3] = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (st->page[0] == NULL ||
	    st->page[1] == NULL ||
	    st->page[2] == NULL ||
	    st->page[3] == NULL) {
		lafs_segtrack_free(st);
		return -ENOMEM;
	}

	BUG_ON(SEG_NUM_HEIGHTS <= 9);

	st->unused.first = st->unused.last = 0xffff;
	st->cleanable.first = st->cleanable.last = 0xffff;
	st->free.first = st->free.last = 0xffff;
	st->clean.first = st->clean.last = 0xffff;
	st->unused.cnt = st->free.cnt = st->cleanable.cnt = 0;
	st->clean.cnt = 0;

	for (h = 0; h < SEG_NUM_HEIGHTS; h++)
		st->head[h] = 0xFFFF;

	/* how many entries in each page */
	for (i = 0 ; i < 4; i++)
		n[i] = PAGE_SIZE / (sizeof(struct segstat) +
				    st->size[i] * sizeof(u16));

	/* Hand every entry to the 'unused' list, choosing pages at
	 * random so that entry heights end up well mixed on the list.
	 */
	while (n[0] || n[1] || n[2] || n[3]) {
		struct segstat *ss;
		u16 sn;
		u8 rand;

		get_random_bytes(&rand, 1);
		p = rand & 3;
		while (p < 3 && n[p] == 0)
			p++;
		if (n[p] == 0)
			for (p = 0; n[p] == 0; p++)
				;
		n[p]--;
		sn = (p << 12) | n[p];
		ss = segfollow(st, sn);
		ss->next = st->unused.first;
		st->unused.first = sn;
		if (st->unused.cnt == 0)
			st->unused.last = sn;
		st->unused.cnt++;
	}
	st->total = st->unused.cnt;
	return 0;
}
void lafs_segtrack_free(struct segtracker *st)
{
	int i;

	for (i = 0; i < 4; i++) {
		kfree(st->page[i]);
		st->page[i] = NULL;
	}
}
int lafs_check_seg_cnt(struct segtracker *st)
{
	/* check that unused, free, cleanable, clean all have the
	 * correct count and tail pointer.
	 */
	int cnt;
	u16 sn, prev;

	cnt = 0;
	for (prev = sn = st->unused.first ;
	     sn != 0xFFFF ;
	     sn = segfollow(st, (prev = sn))->next)
		cnt++;
	if (cnt != st->unused.cnt) {
		printk("%d != %d\n", cnt, st->unused.cnt); WARN_ON(1);
		return 1;
	}
	if (st->unused.last != prev) {
		printk("L%d != %d\n", prev, st->unused.last); WARN_ON(1);
		return 1;
	}

	cnt = 0;
	for (prev = sn = st->free.first ;
	     sn != 0xFFFF ;
	     sn = segfollow(st, (prev = sn))->next)
		cnt++;
	if (cnt != st->free.cnt) {
		printk("%d != %d\n", cnt, st->free.cnt); WARN_ON(1);
		return 1;
	}
	if (st->free.last != prev) {
		printk("L%d != %d\n", prev, st->free.last); WARN_ON(1);
		return 1;
	}

	cnt = 0;
	for (prev = sn = st->clean.first ;
	     sn != 0xFFFF ;
	     sn = segfollow(st, (prev = sn))->next)
		cnt++;
	if (cnt != st->clean.cnt) {
		printk("%d != %d\n", cnt, st->clean.cnt); WARN_ON(1);
		return 1;
	}
	if (st->clean.last != prev) {
		printk("L%d != %d\n", prev, st->clean.last); WARN_ON(1);
		return 1;
	}

	cnt = 0;
	for (prev = sn = st->cleanable.first ;
	     sn != 0xFFFF ;
	     sn = segfollow(st, (prev = sn))->next)
		cnt++;
	if (cnt != st->cleanable.cnt) {
		printk("%d != %d\n", cnt, st->cleanable.cnt); WARN_ON(1);
		return 1;
	}
	if (st->cleanable.last != prev) {
		printk("L%d != %d\n", prev, st->cleanable.last); WARN_ON(1);
		return 1;
	}
	return 0;
}
static void segsort(struct segtracker *st, struct slist *l)
{
	/* sort the 'l' list of st by score - lowest first.
	 * Use a merge sort: distribute runs of non-decreasing score
	 * alternately across two lists, then repeatedly merge pairs
	 * of runs until only one list remains.
	 */
	u16 head[2], h[2];
	u16 *hp[2];
	u32 prev;
	int curr, next;

	head[0] = l->first;
	head[1] = 0xFFFF;

	do {
		h[0] = head[0];
		h[1] = head[1];
		hp[0] = &head[0];
		hp[1] = &head[1];
		prev = 0;
		curr = 0;
		next = 0;

		/* Find the least of h[0] and h[1] that is not less than
		 * prev, and add it to hp[curr].  If both are larger than
		 * prev, flip 'curr' and add the smallest.
		 */
		while (h[0] != 0xFFFF || h[1] != 0xFFFF) {
			if (h[next] == 0xFFFF ||
			    (h[1-next] != 0xFFFF &&
			     !((prev <= segfollow(st, h[1-next])->score)
			       ^ (segfollow(st, h[1-next])->score
				  <= segfollow(st, h[next])->score)
			       ^ (segfollow(st, h[next])->score <= prev))
				    ))
				next = 1 - next;
			if (segfollow(st, h[next])->score < prev)
				curr = 1 - curr;
			prev = segfollow(st, h[next])->score;
			*hp[curr] = h[next];
			hp[curr] = &segfollow(st, h[next])->next;
			h[next] = segfollow(st, h[next])->next;
		}
		*hp[0] = 0xFFFF;
		*hp[1] = 0xFFFF;
	} while (head[0] != 0xFFFF && head[1] != 0xFFFF);

	if (head[0] == 0xFFFF)
		head[0] = head[1];
	l->first = head[0];
	l->last = 0xFFFF;
	for (h[0] = l->first; h[0] != 0xFFFF;
	     h[0] = segfollow(st, h[0])->next)
		l->last = h[0];
}
static int statcmp(struct segstat *ss, u16 dev, u32 seg)
{
	if (ss->dev < dev)
		return -1;
	if (ss->dev > dev)
		return 1;
	if (ss->segment < seg)
		return -1;
	if (ss->segment > seg)
		return 1;
	return 0;
}
static struct segstat *segfind(struct segtracker *st, u16 dev,
			       u32 segnum, u16 *where[SEG_NUM_HEIGHTS])
{
	/* Find the segment entry for dev/segnum.
	 * Return link info in 'where' to allow insertion/deletion.
	 */
	int level;
	u16 *head = st->head;

	for (level = SEG_NUM_HEIGHTS-1 ; level >= 0; level--) {
		while (head[level] != 0xffff &&
		       statcmp(segfollow(st, head[level]), dev, segnum) < 0) {
			head = segfollow(st, head[level])->skip;
		}
		where[level] = &head[level];
	}
	if (head[0] == 0xFFFF ||
	    statcmp(segfollow(st, head[0]), dev, segnum) != 0)
		return NULL;
	return segfollow(st, head[0]);
}
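/*
 * The 'where' contract, as used throughout: after segfind(), *where[h]
 * is the link at each height that either points at the found entry or
 * at the spot where a new entry belongs:
 *
 *	u16 *where[SEG_NUM_HEIGHTS];
 *	if (segfind(st, dev, seg, where) == NULL)
 *		seginsert(st, ssn, where);	// link in a new entry
 */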
static int segchoose_height(struct segtracker *st, u16 ssn)
{
	unsigned long rand[DIV_ROUND_UP(SEG_NUM_HEIGHTS * 2 / 8 + 1,
					sizeof(unsigned long))];
	int max = st->size[ssn>>12];
	int h = 0;

	get_random_bytes(rand, sizeof(rand));
	while (h < max - 1 &&
	       test_bit(h*2, rand) &&
	       test_bit(h*2+1, rand))
		h++;
	return h;
}
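/* Each extra level above requires two set bits from the random pool,
 * so height h is chosen with probability 4^-h (capped by the entry's
 * page size) - the usual geometric skip-list height distribution.
 */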
static void seginsert(struct segtracker *st, u16 ssn,
		      u16 *where[SEG_NUM_HEIGHTS])
{
	/* We looked for 'ss' but didn't find it.  'where' is the
	 * result of looking.  Now insert 'ss'.
	 */
	struct segstat *ss = segfollow(st, ssn);
	int h = segchoose_height(st, ssn);

	while (h >= 0) {
		ss->skip[h] = *where[h];
		*where[h] = ssn;
		h--;
	}
}
static void segdelete(struct segtracker *st, struct segstat *ss)
{
	/* This segstat has just been removed from a list (free or cleanable).
	 * Remove it from the skiplist as well and put it on the
	 * 'unused' list.
	 */
	u16 *where[SEG_NUM_HEIGHTS];
	struct segstat *ss2 = segfind(st, ss->dev, ss->segment, where);
	u16 pos;
	int h;

	BUG_ON(ss2 == NULL);
	pos = *where[0];
	for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos ; h++)
		*where[h] = ss2->skip[h];
	ss2->next = st->unused.first;
	st->unused.first = pos;
	st->unused.cnt++;
	lafs_check_seg_cnt(st);
	// FIXME what about 'last' ??
}
static void segdelete_all(struct segtracker *st, struct fs *fs)
{
	/* walk the entire skip list.  Any entry with a score of SCORE_DEAD
	 * is unlinked and moved to the 'unused' list.
	 */
	u16 *where[SEG_NUM_HEIGHTS];
	int h;

	for (h = SEG_NUM_HEIGHTS-1; h >= 0; h--)
		where[h] = &st->head[h];

	while (*where[0] != 0xFFFF) {
		u16 pos = *where[0];
		struct segstat *ss = segfollow(st, pos);

		if (ss->score == SCORE_DEAD) {
			/* delete this entry */
			struct segsum *ssm;

			/* FIXME Locking is really bad here */
			ssm = segsum_find(fs, ss->segment, ss->dev, 0);
			putdref(ssm->ssblk, MKREF(intable));
			putdref(ssm->youthblk, MKREF(intable));
			ss_put(ssm, fs);

			for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos; h++)
				*where[h] = ss->skip[h];
			ss->next = st->unused.first;
			st->unused.first = pos;
			st->unused.cnt++;
			lafs_check_seg_cnt(st);
		} else
			/* advance 'where' to here */
			for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos; h++)
				where[h] = &ss->skip[h];
	}
}
static u16 seg_pop(struct segtracker *st, struct slist *which)
{
	struct segstat *ss;
	u16 rv = which->first;

	if (rv == 0xFFFF)
		return rv;
	ss = segfollow(st, rv);
	which->first = ss->next;
	which->cnt--;
	if (ss->next == 0xFFFF)
		which->last = 0xFFFF;
	return rv;
}
static struct segstat *seg_add_new(struct segtracker *st, struct slist *which, int atend,
				   int dev, u32 seg, int score, int usage,
				   u16 *where[SEG_NUM_HEIGHTS])
{
	struct segstat *ss;
	u16 ssn;

	ssn = seg_pop(st, &st->unused);
	ss = segfollow(st, ssn);
	if (!ss)
		return NULL;
	ss->dev = dev;
	ss->segment = seg;
	ss->score = score;
	ss->usage = usage;
	if (which) {
		if (atend) {
			int l = which->last;

			ss->next = 0xFFFF;
			if (l == 0xFFFF)
				which->first = ssn;
			else {
				BUG_ON(segfollow(st, l)->next != 0xFFFF);
				segfollow(st, l)->next = ssn;
			}
			which->last = ssn;
		} else {
			ss->next = which->first;
			which->first = ssn;
			if (which->last == 0xFFFF)
				which->last = ssn;
		}
		which->cnt++;
	}
	seginsert(st, ssn, where);
	lafs_check_seg_cnt(st);
	return ss;
}
void lafs_free_get(struct fs *fs, unsigned int *dev, u32 *seg,
		   int nonlogged)
{
	/* Select and return a free segment.
	 * The youth value must have been zero, and we set it to the
	 * current youth number.
	 */
	struct segstat *ss;
	struct segsum *ssum = NULL;
	int ssnum;

	BUG_ON(nonlogged); // FIXME should handle this case, not ignore it

retry:
	spin_lock(&fs->lock);
	wait_event_lock(fs->phase_wait,
			!fs->scan.first_free_pass ||
			fs->segtrack->free.first != 0xFFFF,
			fs->lock);
	ss = segfollow(fs->segtrack, fs->segtrack->free.first);
	*dev = ss->dev;
	*seg = ss->segment;
	/* On the final checkpoint we skip the youth update as
	 * rollforward will do it for us, and it is too late
	 * to write the new youth block anyway.
	 * FIXME there should be a more general way to delay
	 * updates to the Youth block.
	 */
	if (!test_bit(DelayYouth, &fs->fsstate) &&
	    !(ssum &&
	      ssum->devnum == ss->dev &&
	      ssum->segnum == ss->segment)) {
		/* need a pinned reference on the youth block -
		 * drop the lock while we get one */
		spin_unlock(&fs->lock);
		if (ssum)
			ss_put(ssum, fs);
		ssum = segsum_find(fs, ss->segment, ss->dev, 0);
		/* As we hold a ref on the youth block for anything in the
		 * table, and that block was loaded at the time, it must
		 * still be valid.
		 */
		BUG_ON(!ssum || !ssum->youthblk);
		BUG_ON(!test_bit(B_Valid, &ssum->youthblk->b.flags));
		set_bit(B_PinPending, &ssum->youthblk->b.flags);
		lafs_pin_dblock(ssum->youthblk, AccountSpace);
		goto retry;
	}

	seg_pop(fs->segtrack, &fs->segtrack->free);
	ss->score = SCORE_ACTIVE;
	/* still in table, but unlinked */

	if (ssum && !test_bit(DelayYouth, &fs->fsstate)) {
		u16 *youthp;
		u16 y;

		/* HACK youth_next should always be at least 0x8000 so that
		 * the cleanable score differentiates well for new segments.
		 * Old code would sometimes set youth_next very low, so
		 * force it up here.
		 */
		if (fs->youth_next < 0x8000)
			fs->youth_next = 0x8000;

		y = fs->youth_next++;
		if (fs->scan.do_decay &&
		    (fs->scan.free_dev < ss->dev
		     || (fs->scan.free_dev == ss->dev
			 && fs->scan.free_block < (ss->segment
						   / (fs->blocksize / 2)))
			    )) {
			/* Haven't decayed this block yet - revert the decay
			 * so the sweep, when it arrives, leaves the value
			 * we intended.
			 */
			y = (y > 0x7FFF) ? 0xFFFF : y * 2;
		}
		youthp = map_dblock(ssum->youthblk);
		youthp[(*seg) & ((1 << (fs->blocksize_bits
					- 1)) - 1)] = cpu_to_le16(y);
		unmap_dblock(ssum->youthblk, youthp);
		lafs_dirty_dblock(ssum->youthblk);
	}
	spin_unlock(&fs->lock);

	/* now need to reserve/dirty/reference the youth and
	 * segsum block for each snapshot that could possibly
	 * be affected.
	 * NOTE: this needs fixing to support snapshots.  Later.
	 */
	for (ssnum = 0; ssnum < 1 ; ssnum++) {
		ssum = segsum_find(fs, ss->segment, ss->dev, ssnum);
		/* ?? what do I need to release etc */
		/* Maybe this cannot fail because we own references
		 * to the two blocks !! */
		lafs_checkpoint_lock(fs);
		set_bit(B_PinPending, &ssum->ssblk->b.flags);
		(void)lafs_pin_dblock(ssum->ssblk, AccountSpace);
		lafs_checkpoint_unlock(fs);
	}

	dprintk("NEXT segment found %d/%d youth %d\n",
		*dev, *seg, fs->youth_next - 1);
	/* Note that we return an implicit reference to the ssum */
}
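/*
 * Sketch of the expected caller flow: take a free segment, fill it,
 * then return it to tracking so it can become cleanable later:
 *
 *	unsigned int dev; u32 seg;
 *	lafs_free_get(fs, &dev, &seg, 0);	// segment goes SCORE_ACTIVE
 *	... write clusters into dev/seg ...
 *	lafs_seg_forget(fs, dev, seg);		// drop it from the table
 */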
void lafs_seg_forget(struct fs *fs, int dev, u32 seg)
{
	/* this segment was being filled and is now full.
	 * We need to drop it from the table, and drop the
	 * references to the blocks.
	 */
	struct segstat tmp;
	struct segsum *ss;

	tmp.dev = dev;
	tmp.segment = seg;
	spin_lock(&fs->lock);
	segdelete(fs->segtrack, &tmp);
	spin_unlock(&fs->lock);

	ss = segsum_find(fs, seg, dev, 0);
	BUG_ON(atomic_read(&ss->refcnt) < 2);
	/* Removed from table so drop the table's references */
	putdref(ss->ssblk, MKREF(intable));
	putdref(ss->youthblk, MKREF(intable));
	ss_put(ss, fs);
	ss_put(ss, fs);
}
int lafs_clean_count(struct fs *fs, int *any_clean)
{
	int rv;

	spin_lock(&fs->lock);
	*any_clean = fs->segtrack->clean.cnt != 0;
	rv = fs->segtrack->free.cnt + fs->segtrack->clean.cnt;
	spin_unlock(&fs->lock);
	return rv;
}
static int add_free(struct fs *fs, unsigned int dev, u32 seg, u16 *youthp)
{
	/* This dev/seg is known to be free.  Add it to the list. */
	struct segstat *ss;
	u16 *where[SEG_NUM_HEIGHTS];

	dprintk("add_free %d %d\n", (int)dev, (int)seg);
	spin_lock(&fs->lock);
	if (*youthp) {
		/* not really free any more */
		spin_unlock(&fs->lock);
		return 0;
	}
	ss = segfind(fs->segtrack, dev, seg, where);
	if (ss) {
		/* already in the table.  If it happens to be
		 * on the cleanable list, we have to wait for
		 * the cleaner to find it and add it to our list.
		 */
		spin_unlock(&fs->lock);
		return 0;
	}
	if (fs->segtrack->free.cnt + fs->segtrack->clean.cnt >=
	    fs->segtrack->total / 2) {
		/* Have enough free/clean entries already */
		spin_unlock(&fs->lock);
		return 0;
	}
	seg_add_new(fs->segtrack, &fs->segtrack->free, 0,
		    dev, seg, 0, 0, where);
	spin_unlock(&fs->lock);
	return 1;
}
static int add_clean(struct fs *fs, unsigned int dev, u32 seg)
{
	/* This dev/seg is now clean.  Make sure it is on the list.
	 * Chances are that this is already present in the table as
	 * a recently cleaned segment.
	 * Return TRUE if the segment was added to the table (as this
	 * implies a reference count).
	 */
	struct segstat *ss;
	u16 *where[SEG_NUM_HEIGHTS];

	dprintk("ADD CLEAN %d/%d %d #####################################\n",
		dev, seg, fs->segtrack->clean.cnt);
	spin_lock(&fs->lock);
	ss = segfind(fs->segtrack, dev, seg, where);
	if (ss) {
		/* already in the table.  If loose, attach it to
		 * the clean list.  If cleanable, set the score to 0 to make
		 * sure it is found soon.
		 */
		if (ss->score == SCORE_CLEANING) {
			ss->next = fs->segtrack->clean.first;
			fs->segtrack->clean.first = where[0][0];
			if (fs->segtrack->clean.last == 0xFFFF)
				fs->segtrack->clean.last =
					fs->segtrack->clean.first;
			fs->segtrack->clean.cnt++;
		} else if (ss->score != SCORE_ACTIVE) {
			/* Must be on the cleanable list - make sure
			 * it is found soon */
			ss->score = 0;
		}
		spin_unlock(&fs->lock);
		return 0;
	}
	if (fs->segtrack->free.cnt + fs->segtrack->clean.cnt >=
	    fs->segtrack->total / 2) {
		/* Have enough free/clean entries already */
		spin_unlock(&fs->lock);
		return 0;
	}
	ss = seg_add_new(fs->segtrack, &fs->segtrack->clean, 0,
			 dev, seg, 0, 0, where);
	spin_unlock(&fs->lock);
	return ss != NULL;
}
void lafs_add_active(struct fs *fs, u64 addr)
{
	/* add this segment to the table as 'active'.
	 * This is only called once at mount time for the
	 * active segments.  Other segments become active
	 * when they are chosen by lafs_free_get.
	 */
	struct segstat *ss;
	u16 *where[SEG_NUM_HEIGHTS];
	struct segsum *ssum = segsum_byaddr(fs, addr, 0);

	(void)getdref(ssum->ssblk, MKREF(intable));
	(void)getdref(ssum->youthblk, MKREF(intable));
	spin_lock(&fs->lock);
	ss = segfind(fs->segtrack, ssum->devnum, ssum->segnum, where);
	if (!ss)
		ss = seg_add_new(fs->segtrack, NULL, 0,
				 ssum->devnum, ssum->segnum,
				 SCORE_ACTIVE, 0, where);
	else
		ss->score = SCORE_ACTIVE;
	spin_unlock(&fs->lock);
}
void lafs_clean_free(struct fs *fs)
{
	/* We are finishing off a checkpoint.  Move everything from the
	 * 'clean' list to the 'free' list, and set the youth for each to 0.
	 * Note that we can walk the 'clean' list without locking as
	 * items are only ever removed by this thread.
	 */
	struct segstat *ss;
	u16 ssn;
	int err;

	if (fs->segtrack->clean.cnt == 0)
		return;

	for (ssn = fs->segtrack->clean.first ; ssn != 0xffff; ssn = ss->next) {
		struct datablock *db;

		ss = segfollow(fs->segtrack, ssn);
		spin_lock(&fs->lock);
		fs->free_blocks += fs->devs[ss->dev].segment_size;
		spin_unlock(&fs->lock);
		db = lafs_get_block(fs->devs[ss->dev].segsum,
				    ss->segment >> (fs->blocksize_bits-1),
				    NULL, GFP_KERNEL | __GFP_NOFAIL,
				    MKREF(cleanfree));
		err = lafs_read_block(db);
		if (!err)
			err = lafs_reserve_block(&db->b, AccountSpace);
		if (!err) {
			u16 *b = map_dblock(db);
			spin_lock(&fs->stable_lock);
			b[ss->segment & ((fs->blocksize-1)>>1)] = 0;
			spin_unlock(&fs->stable_lock);
			unmap_dblock(db, b);
			lafs_dirty_dblock(db);
		}
		/* else: FIXME make the filesystem read-only */
		putdref(db, MKREF(cleanfree));
	}

	/* Now move them to the 'free' list */
	spin_lock(&fs->lock);
	fs->segtrack->free.cnt += fs->segtrack->clean.cnt;
	if (fs->segtrack->free.last == 0xFFFF)
		fs->segtrack->free.first = fs->segtrack->clean.first;
	else
		segfollow(fs->segtrack, fs->segtrack->free.last)->next
			= fs->segtrack->clean.first;
	fs->segtrack->free.last = fs->segtrack->clean.last;
	fs->segtrack->clean.first = 0xFFFF;
	fs->segtrack->clean.last = 0xFFFF;
	fs->segtrack->clean.cnt = 0;
	spin_unlock(&fs->lock);
}
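/*
 * Lifecycle summary (from the comments above): cleanable ->
 * SCORE_CLEANING (lafs_get_cleanable) -> clean (add_clean, once a
 * scan sees usage == 0) -> free (lafs_clean_free, after a checkpoint)
 * -> SCORE_ACTIVE (lafs_free_get) -> dropped (lafs_seg_forget) ->
 * rediscovered as cleanable by the scan.
 */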
void lafs_dump_cleanable(void)
{
	struct segtracker *st;
	struct segstat *ss;
	u16 ssn;
	int i;

	if (!dfs) {
		printk("No cleanable table\n");
		return;
	}
	st = dfs->segtrack;
	printk("============= Cleanable table (%d) =================\n",
	       st->cleanable.cnt);
	printk("pos: dev/seg usage score\n");
	i = 0;
	for (ssn = st->cleanable.first; ssn != 0xffff; ssn = ss->next) {
		ss = segfollow(st, ssn);
		printk("%3d: %3d/%-4d %5d %d\n",
		       i, ss->dev, ss->segment, ss->usage, ss->score);
		i++;
	}
	printk("...sorted....\n");
	segsort(st, &st->cleanable);
	i = 0;
	for (ssn = st->cleanable.first; ssn != 0xffff; ssn = ss->next) {
		ss = segfollow(st, ssn);
		printk("%3d: %3d/%-4d %5d %d\n",
		       i, ss->dev, ss->segment, ss->usage, ss->score);
		i++;
	}

	printk("--------------- Free table (%d) ---------------\n",
	       st->free.cnt);
	i = 0;
	for (ssn = st->free.first; ssn != 0xffff; ssn = ss->next) {
		ss = segfollow(st, ssn);
		printk("%3d: %3d/%-4d %5d %d\n",
		       i, ss->dev, ss->segment, ss->usage, ss->score);
		i++;
	}

	printk("--------------- Clean table (%d) ---------------\n",
	       st->clean.cnt);
	i = 0;
	for (ssn = st->clean.first; ssn != 0xffff; ssn = ss->next) {
		ss = segfollow(st, ssn);
		printk("%3d: %3d/%-4d %5d %d\n",
		       i, ss->dev, ss->segment, ss->usage, ss->score);
		i++;
	}

	printk("--------\n");
	printk("free_blocks=%llu allocated=%llu max_seg=%llu clean_reserved=%llu\n",
	       (unsigned long long)dfs->free_blocks,
	       (unsigned long long)dfs->allocated_blocks,
	       (unsigned long long)dfs->max_segment,
	       (unsigned long long)dfs->clean_reserved);
}
int lafs_get_cleanable(struct fs *fs, u16 *dev, u32 *seg)
{
	/* Return the most cleanable segment on the list.
	 * The segstat is removed from the 'cleanable' list
	 * and left in the table.  This prevents it being re-added.
	 * When it is later found to be clean, it will be added
	 * to the 'clean' list.
	 */
	struct segtracker *st = fs->segtrack;
	struct segstat *ss;
	struct segsum *ssm;
	int rv;

	lafs_check_seg_cnt(st);

retry:
	spin_lock(&fs->lock);
	if (st->cleanable.first == 0xFFFF) {
		spin_unlock(&fs->lock);
		return 0;
	}
	if (st->sorted_size < st->cleanable.cnt / 2) {
		segsort(st, &st->cleanable);
		st->sorted_size = st->cleanable.cnt;
		st->max_score = segfollow(st, st->cleanable.last)->score;
	}
	lafs_check_seg_cnt(fs->segtrack);
	ss = segfollow(st, seg_pop(st, &st->cleanable));
	if (lafs_check_seg_cnt(fs->segtrack)) {
		u16 sj;
		printk("first=%x last=%x cnt=%d\n", st->cleanable.first,
		       st->cleanable.last, st->cleanable.cnt);
		for (sj = st->cleanable.first ;
		     sj != 0xffff ;
		     sj = segfollow(st, sj)->next)
			printk(" %x\n", sj);
		BUG();
	}
	*dev = ss->dev;
	*seg = ss->segment;
	dprintk("SEG: cleanable %d/%d score=%d usage=%d\n",
		ss->dev, ss->segment, ss->score, ss->usage);
	ss->score = SCORE_CLEANING;
	if (ss->usage == 0) {
		/* no cleaning needed - it is already clean */
		spin_unlock(&fs->lock);
		rv = add_clean(fs, ss->dev, ss->segment);
		/* FIXME we dropped the lock, so maybe this could bug?? */
		/* I should make sure I hold the block references */
		if (rv) {
			/* a fresh entry now holds the references, so
			 * drop this one from the table */
			spin_lock(&fs->lock);
			segdelete(fs->segtrack, ss);
			spin_unlock(&fs->lock);
			ssm = segsum_find(fs, ss->segment, ss->dev, 0);
			putdref(ssm->ssblk, MKREF(intable));
			putdref(ssm->youthblk, MKREF(intable));
			ss_put(ssm, fs);
		}
		goto retry;
	}
	spin_unlock(&fs->lock);
	return 1;
}
static int add_cleanable(struct fs *fs, unsigned int dev, u32 seg,
			 u16 youth, u16 usage)
{
	struct segstat *ss;
	u32 score;
	int segsize;
	u16 *where[SEG_NUM_HEIGHTS];

	lafs_check_seg_cnt(fs->segtrack);
	if (fs->scan.trace || lafs_trace)
		printk("CLEANABLE: %u/%lu y=%d u=%d\n",
		       dev, (unsigned long)seg, (int)youth, (int)usage);

	if (youth >= fs->checkpoint_youth)
		return 0;

	segsize = fs->devs[dev].segment_size;
	fs->total_free += segsize - usage /* - 1 */;

	if (usage == 0) {
		int rv = add_clean(fs, dev, seg);
		lafs_check_seg_cnt(fs->segtrack);
		return rv;
	}

	if (test_bit(EmergencyClean, &fs->fsstate))
		score = usage;
	else
		/* 0x10000 is to ensure this score is always
		 * more than the above score */
		score = youth * usage / segsize + 0x10000;

	spin_lock(&fs->lock);
	if (score > SCORE_MAX)
		score = SCORE_MAX;

	ss = segfind(fs->segtrack, dev, seg, where);
	if (ss) {
		/* already in the table.  Just update the usage.
		 * It must be on the right list.
		 */
		if (ss->score == SCORE_ACTIVE)
			; /* still in use */
		else if (ss->usage == 0 && ss->score > 0)
			; /* on free list, leave it alone */
		else {
			ss->usage = usage;
			ss->score = score;
		}
		lafs_check_seg_cnt(fs->segtrack);
		spin_unlock(&fs->lock);
		return 0;
	}

	if (fs->segtrack->cleanable.cnt >= fs->segtrack->total/2) {
		/* Table of cleanable segments is full.  Sort and discard
		 * the worst half.
		 */
		u16 ssnum, prev = 0xFFFF;
		int i, keep;

		segsort(fs->segtrack, &fs->segtrack->cleanable);
		ssnum = fs->segtrack->cleanable.first;
		keep = (fs->segtrack->cleanable.cnt+1) / 2;
		for (i = 0; i < keep; i++) {
			prev = ssnum;
			ss = segfollow(fs->segtrack, ssnum);
			ssnum = ss->next;
			fs->segtrack->max_score = ss->score;
		}
		segfollow(fs->segtrack, prev)->next = 0xFFFF;
		fs->segtrack->cleanable.last = prev;
		fs->segtrack->cleanable.cnt = keep;
		ss = segfollow(fs->segtrack, ssnum);
		while (ss) {
			ss->score = SCORE_DEAD;
			ss = segfollow(fs->segtrack, ss->next);
		}
		segdelete_all(fs->segtrack, fs);
		fs->segtrack->sorted_size = fs->segtrack->total/2;
	}
	if (fs->segtrack->cleanable.cnt <= fs->segtrack->total/4)
		fs->segtrack->max_score = 0;
	if (fs->segtrack->max_score &&
	    fs->segtrack->max_score < score) {
		/* score too high to bother with */
		lafs_check_seg_cnt(fs->segtrack);
		spin_unlock(&fs->lock);
		return 0;
	}
	seg_add_new(fs->segtrack, &fs->segtrack->cleanable, 1,
		    dev, seg, score, usage, where);
	spin_unlock(&fs->lock);
	return 1;
}
static void merge_usage(struct fs *fs, u16 *d)
{
	/* merge the usage block 'd' into the per-segment maximum
	 * recorded in fs->scan.free_usages */
	u16 *u = fs->scan.free_usages;
	int segperblk = fs->blocksize / 2;
	int i;

	for (i = 0; i < segperblk; i++)
		if (le16_to_cpu(d[i]) > le16_to_cpu(u[i]))
			u[i] = d[i];
}
unsigned long lafs_scan_seg(struct fs *fs)
{
	/* Process one block of youth or segment-usage data.  We
	 * collect free segments (youth == 0) into a table that is kept
	 * sorted to guard against duplicates.  It is treated like a
	 * ring buffer with a head and a tail to distinguish free
	 * space from used space.  head/tail point to the free space
	 * (which is never empty).  As we scan all segments
	 * sequentially, any free segment we find is placed at the head
	 * of the free list, and the entry just after the tail might
	 * get discarded if we run out of room, or if that entry
	 * matches the entry we just inserted.  New free segments are
	 * allocated from just after the tail, i.e. we increment the
	 * tail and return the entry recorded there.  If tail+1 ==
	 * head, there are no segments in the buffer.
	 *
	 * We also collect potentially cleanable segments.  These are any
	 * segments which are not empty and not non-logged (i.e. youth >= 8).
	 * We record them in a table and whenever the table gets full, we
	 * sort the table by score and discard the less-cleanable half.
	 * We only add segments to the table when it is less than half full,
	 * or when the new segment has a better (lower) score than the
	 * segment at the half-way mark.
	 *
	 * To get the score for a segment, we need its youth and its usage.
	 * The usage is the maximum usage across all snapshots.
	 * We allocate a block to hold the current max usage:
	 * load the youth table, find free segments, decay youth if needed;
	 * load the first usage table and copy it to the temporary block;
	 * repeat: load the other usage tables and merge them in;
	 * calculate a score for each segment and add those with a good score.
	 *
	 * On the first pass through, we count all the free segments into
	 * fs->free_blocks.
	 *
	 * In the first pass, we make a full pass without deliberately
	 * waiting.  In subsequent passes ... only once per checkpoint???
	 */
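	/* The stages below, in brief:
	 *   stage 0: advance to the next youth block, decaying it if
	 *            a decay sweep is in progress;
	 *   stage 1: load the main usage block, harvest free segments;
	 *   stages 2 .. maxsnapshot: merge each snapshot's usage block;
	 *   stage maxsnapshot+1: score each segment and record the
	 *            cleanable ones, then return to stage 0.
	 */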
	/* only scan while the cleaner wants blocks, or during the
	 * initial pass */
	if (!test_bit(CleanerBlocks, &fs->fsstate) &&
	    !fs->scan.first_free_pass)
		return MAX_SCHEDULE_TIMEOUT;

	dprintk("scan: dev=%d block=%d stage=%d\n",
		fs->scan.free_dev, fs->scan.free_block, fs->scan.free_stage);
	if (fs->scan.free_stage == 0) {
		/* Need to find the youth block for the next dev/offset.
		 * Possibly we are finished with this dev and must go
		 * to the next.  Possibly we are finished altogether.
		 */
		int dev = fs->scan.free_dev;
		int block = fs->scan.free_block + 1;
		int err = 0;

		if (dev < 0 ||
		    block > (fs->devs[dev].segment_count
			     >> (fs->blocksize_bits - 1))) {
			/* move on to the next device */
			dev++;
			block = 0;
			if (dev >= fs->devices) {
				/* the scan is complete */
				fs->scan.free_dev = -1;
				fs->scan.free_block = 0;
				fs->scan.do_decay = 0;
				fs->total_free_prev = fs->total_free;
				fs->total_free = 0;
				fs->scan.first_free_pass = 0;
				wake_up(&fs->phase_wait);
				return MAX_SCHEDULE_TIMEOUT;
			}
		}
		if (fs->scan.youth_db)
			if (fs->scan.youth_db->b.fileaddr != block ||
			    fs->scan.youth_db->b.inode != fs->devs[dev].segsum) {
				putdref(fs->scan.youth_db, MKREF(youth_scan));
				fs->scan.youth_db = NULL;
			}
		if (fs->scan.youth_db == NULL)
			fs->scan.youth_db =
				lafs_get_block(fs->devs[dev].segsum, block,
					       NULL, GFP_KERNEL, MKREF(youth_scan));
		if (!fs->scan.youth_db) {
			printk("EEEEEKKKKK get_block failed\n");
			return MAX_SCHEDULE_TIMEOUT;
		}
		switch (err = lafs_read_block_async(fs->scan.youth_db)) {
		case -EIO:
			printk("EEEEEKKKKK read of youth block failed\n");
			putdref(fs->scan.youth_db, MKREF(youth_scan));
			fs->scan.youth_db = NULL;
			lafs_wake_thread(fs);
			return MAX_SCHEDULE_TIMEOUT;
		case -EAGAIN:
			/* we will be woken when the read completes */
			return MAX_SCHEDULE_TIMEOUT;
		case 0:
			break;
		}
		if (fs->scan.do_decay) {
			/* youth_db must be writable */
			struct datablock *db = fs->scan.youth_db;
			lafs_checkpoint_lock(fs);
			set_bit(B_PinPending, &db->b.flags);
			lafs_pin_dblock(db, AccountSpace);
		}
		spin_lock(&fs->lock);
		fs->scan.free_block = block;
		fs->scan.free_dev = dev;
		if (!err && fs->scan.do_decay) {
			u16 *yp = map_dblock(fs->scan.youth_db);
			int segperblk = fs->blocksize / 2;
			int i;

			for (i = 0 ; i < segperblk ; i++) {
				int y = le16_to_cpu(yp[i]);
				/* decay: roughly halve each youth value,
				 * keeping it at or above the non-logged
				 * threshold; free (0) and non-logged (< 8)
				 * entries are left alone */
				if (y >= 8)
					yp[i] = cpu_to_le16(y / 2 < 8
							    ? 8 : y / 2);
			}
			unmap_dblock(fs->scan.youth_db, yp);
			lafs_dirty_dblock(fs->scan.youth_db);
		}
		spin_unlock(&fs->lock);
		if (fs->scan.do_decay)
			lafs_checkpoint_unlock(fs);
		if (!err)
			fs->scan.free_stage = 1;
	}
	lafs_check_seg_cnt(fs->segtrack);
	if (fs->scan.free_stage == 1) {
		/* Find the main usage block and copy the data into
		 * the accumulation buffer.
		 */
		struct datablock *db;
		u16 *d, *yp;
		u32 firstseg;
		u32 segcount;
		int blks;
		int i;
		int segperblk = fs->blocksize / 2;
		int segments = segperblk;

		db = lafs_get_block(fs->devs[fs->scan.free_dev].segsum,
				    fs->scan.free_block +
				    fs->devs[fs->scan.free_dev].tablesize,
				    NULL, GFP_KERNEL, MKREF(usage0));
		if (!db) {
			printk("EEEEKKK get_block for first usage failed\n");
			fs->scan.free_stage = 0;
			return MAX_SCHEDULE_TIMEOUT;
		}
		switch (lafs_read_block_async(db)) {
		case -EIO:
			printk("EEEEKKK read of usage block failed\n");
			putdref(db, MKREF(usage0));
			fs->scan.free_stage = 0;
			return MAX_SCHEDULE_TIMEOUT;
		case -EAGAIN:
			putdref(db, MKREF(usage0));
			return MAX_SCHEDULE_TIMEOUT;
		case 0:
			break;
		}

		d = map_dblock(db);
		memcpy(fs->scan.free_usages, d, fs->blocksize);
		unmap_dblock(db, d);

		/* Now add any free segments */
		segcount = fs->devs[fs->scan.free_dev].segment_count;
		blks = segcount / segments;
		if (fs->scan.free_block == blks)
			segments = segcount % segperblk;
		firstseg = fs->scan.free_block * segperblk;

		yp = map_dblock(fs->scan.youth_db);
		for (i = 0; i < segments ; i++)
			if (yp[i] == cpu_to_le16(0)) {
				if (fs->scan.first_free_pass) {
					spin_lock(&fs->lock);
					fs->free_blocks +=
						fs->devs[fs->scan.free_dev]
						.segment_size;
					spin_unlock(&fs->lock);
				}
				if (add_free(fs, fs->scan.free_dev, firstseg + i,
					     &yp[i])) {
					/* Everything in the table owns a reference
					 * to youth and segusage[0]
					 */
					(void)getdref(fs->scan.youth_db, MKREF(intable));
					(void)getdref(db, MKREF(intable));
				}
				fs->total_free +=
					fs->devs[fs->scan.free_dev]
					.segment_size /*- 1*/;
			}
		unmap_dblock(fs->scan.youth_db, yp);

		fs->scan.usage0_db = db;
		fs->scan.free_stage = 2;
	}
	lafs_check_seg_cnt(fs->segtrack);
	while (fs->scan.free_stage > 1 &&
	       fs->scan.free_stage < fs->maxsnapshot + 1) {
		struct datablock *db;
		u16 *d;

		if (fs->ss[fs->scan.free_stage-1].root == NULL) {
			/* no such snapshot */
			fs->scan.free_stage++;
			continue;
		}
		/* Load the usage block for this snapshot and merge */
		db = lafs_get_block(fs->devs[fs->scan.free_dev].segsum,
				    fs->scan.free_block +
				    fs->devs[fs->scan.free_dev].tablesize *
				    LAFSI(fs->ss[fs->scan.free_stage-1].root)
				    ->md.fs.usagetable,
				    NULL, GFP_KERNEL, MKREF(usage_ss));
		if (!db) {
			printk("EEEEKKK get_block for subsequent usage failed\n");
			fs->scan.free_stage = 0;
			putdref(fs->scan.usage0_db, MKREF(usage0));
			fs->scan.usage0_db = NULL;
			return MAX_SCHEDULE_TIMEOUT;
		}
		switch (lafs_read_block_async(db)) {
		case -EIO:
			printk("EEEEKKK read of subsequent usage block failed\n");
			putdref(db, MKREF(usage_ss));
			fs->scan.free_stage = 0;
			putdref(fs->scan.usage0_db, MKREF(usage0));
			fs->scan.usage0_db = NULL;
			return MAX_SCHEDULE_TIMEOUT;
		case -EAGAIN:
			putdref(db, MKREF(usage_ss));
			return MAX_SCHEDULE_TIMEOUT;
		case 0:
			break;
		}

		d = map_dblock(db);
		merge_usage(fs, d);
		unmap_dblock(db, d);
		fs->scan.free_stage++;
		putdref(db, MKREF(usage_ss));
	}
	lafs_check_seg_cnt(fs->segtrack);
	if (fs->scan.free_stage == fs->maxsnapshot + 1) {
		/* All usage data has been merged, we can record all these
		 * cleanable segments now.
		 */
		u16 *yp = map_dblock(fs->scan.youth_db);
		u16 *up = fs->scan.free_usages;
		int i;
		int segperblk = fs->blocksize / 2;
		int segments = segperblk;
		int segcount = fs->devs[fs->scan.free_dev].segment_count;
		int blks = segcount / segments;

		if (fs->scan.free_block == blks)
			segments = segcount % segperblk;

		for (i = 0; i < segments; i++)
			if (add_cleanable(fs, fs->scan.free_dev,
					  i + fs->scan.free_block * segperblk,
					  le16_to_cpu(yp[i]),
					  le16_to_cpu(up[i]))) {
				/* Every segment in the table owns these
				 * block references
				 */
				(void)getdref(fs->scan.youth_db, MKREF(intable));
				(void)getdref(fs->scan.usage0_db, MKREF(intable));
			}
		unmap_dblock(fs->scan.youth_db, yp);
		putdref(fs->scan.usage0_db, MKREF(usage0));
		fs->scan.usage0_db = NULL;
		fs->scan.free_stage = 0;
	}
	lafs_check_seg_cnt(fs->segtrack);

	if (fs->scan.free_dev < 0)
		return MAX_SCHEDULE_TIMEOUT;
	else if (fs->scan.free_stage == 0)
		/* ready to process the next block immediately */
		return 0;
	return MAX_SCHEDULE_TIMEOUT;
}
void lafs_empty_segment_table(struct fs *fs)
{
	/* remove everything from the table. */
	struct segstat *ss;
	u16 ssnum;

	ssnum = fs->segtrack->cleanable.first;
	fs->segtrack->cleanable.first = 0xffff;
	fs->segtrack->cleanable.last = 0xffff;
	fs->segtrack->cleanable.cnt = 0;
	while (ssnum != 0xffff) {
		ss = segfollow(fs->segtrack, ssnum);
		ss->score = SCORE_DEAD;
		ssnum = ss->next;
	}

	ssnum = fs->segtrack->clean.first;
	fs->segtrack->clean.first = 0xffff;
	fs->segtrack->clean.last = 0xffff;
	fs->segtrack->clean.cnt = 0;
	while (ssnum != 0xffff) {
		ss = segfollow(fs->segtrack, ssnum);
		ss->score = SCORE_DEAD;
		ssnum = ss->next;
	}

	ssnum = fs->segtrack->free.first;
	fs->segtrack->free.first = 0xffff;
	fs->segtrack->free.last = 0xffff;
	fs->segtrack->free.cnt = 0;
	while (ssnum != 0xffff) {
		ss = segfollow(fs->segtrack, ssnum);
		ss->score = SCORE_DEAD;
		ssnum = ss->next;
	}

	segdelete_all(fs->segtrack, fs);
}
void lafs_dump_usage(void)
{
	if (!dfs)
		return;
	dfs->scan.trace = 1;
	lafs_wake_thread(dfs);
}