1
2 /*
3  * segment tracking routines for LaFS
4  * fs/lafs/segments.c
5  * Copyright (C) 2006-2009
6  * NeilBrown <neilb@suse.de>
7  * Released under the GPL, version 2
8  */
9
10 /* In here we manage:
11  * - active blocks in segment usage files
12  * - next-phase usage updates
13  * - youth tracking and decay
14  * - list of segments that could be cleaned
15  * - list of free segments that could be reused
16  * - tracking space allocation and available free space
17  *
18  * Segment usage blocks:
19  *  Every Pinned, Dirty or Realloc block with physaddr != 0 owns a reference
20  *  on the block in the segment usage file that counts that
21  *  block.  When the physaddr is changed, the new block will always
22  *  be available so the reference is simply moved.
23  *
24  *  Normally when ->physaddr changes, the counts in the relevant
25  *  segusage blocks are updated directly.  However when the change is
26  *  for a write in the next phase (during a checkpoint) that is not
27  *  possible as the segusage blocks need to be written with the old value.
28  *  So we track these updates in a simple data structure that
29  *  records a delta against each ss+dev+seg.  If such a data structure
30  *  cannot be allocated (-ENOMEM) the associated write is blocked until
31  *  the checkpoint completes. FIXME - implement that.
32  *
33  *  Every Pinned or Dirty block holds a number of space credits,
34  *  following a pattern described elsewhere.  The space allocation
35  *  unit identifies segments to be used in advance and holds a reference
36  *  on every block.
37  *
38  * Youth
39  *  Youth numbers are handed out sequentially to new write clusters.
40  *  When the available youth-number space approaches being full, all youth numbers are
41  *  decayed (as differentiation between old youth values is uninteresting).
42  *  This decay is done in a staged fashion, one block at a time so that
43  *  a checkpoint can happen at any time and the current block in the
44  *  decay sweep is recorded.
45  *
46  * Free Segments
47  *  Holding a record of all free segments is impractical in general.  Instead
48  *  we record a limited number of free segments (1 page worth - 512 entries,
49  *  possibly with ranges) and when needed we scan the segment usage
50  *  files for free segments and add them to the list.  This scan needs to
51  *  be rate limited - not sure how best to do that.
52  *
53  * Cleanable Segments
54  *  Similar to free segments we need to keep track of cleanable segments.
55  *  Almost all non-free segments are cleanable, and we assign a weight to
56  *  each based on youth and usage.  We want to keep the most cleanable
57  *  segments based on this weight.
58  *  We keep an unordered list of cleanable segments.  When it becomes
59  *  full we sort it by weight and discard the worst half.
60  *
61  * Scanning
62  *  As there could be hundreds of snapshots (hopefully not thousands)
63  *  we cannot load the segusage block for each snapshot and then parse them
64  *  in parallel.  Instead we allocate space to store a max usage and
65  *  merge each block one at a time into that max.  We then combine the
66  *  max with the youth to get a 32bit weight... I wonder if that is good.
67  *
68  */
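/* The "simple data structure" mentioned above is 'struct segsum' below:
 * its 'delayed' counter accumulates per-segment deltas during a
 * checkpoint until lafs_seg_apply_all() folds them into the segusage
 * blocks.
 */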
69
70 /* Every block that could change its ->physaddr holds a reference
71  * on the segment summary block through this structure
72  */
73
74 #include        "lafs.h"
75 #include        <linux/hash.h>
76 #include        <linux/random.h>
77 #include        <linux/slab.h>
78
79 struct segsum {
80         u32     segnum;
81         int     devnum;
82         int     ssnum;
83
84         struct hlist_node hash;
85
86         atomic_t        refcnt;
87         atomic_t        delayed;
88
89         struct datablock *ssblk;
90         /* youthblk is only set when ssnum is 0 and either
91          * usage is zero or we are cleaning.
92          */
93         struct datablock *youthblk;
94 };
95
96 static int shash(u32 segnum, int devnum, int ssnum)
97 {
98         unsigned long h = hash_long(segnum, BITS_PER_LONG);
99         return hash_long(h ^ (devnum | (ssnum << 16)), SHASHBITS);
100 }
101
102 static inline void ss_get(struct segsum *ss)
103 {
104         BUG_ON(atomic_read(&ss->refcnt) == 0);
105         atomic_inc(&ss->refcnt);
106 }
107
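/* Drop a reference on a segsum.  If this is the last reference but
 * delayed updates are still pending, the (zero-refcount) entry is left
 * in the hash table so the delta is not lost; lafs_seg_apply_all() or a
 * later segsum_find() will pick it up again.
 */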
108 static void ss_put(struct segsum *ss, struct fs *fs)
109 {
110         if (atomic_dec_and_lock(&ss->refcnt, &fs->stable_lock)) {
111                 if (atomic_read(&ss->delayed) != 0)
112                         spin_unlock(&fs->stable_lock);
113                 else {
114                         if (!hlist_unhashed(&ss->hash))
115                                 hlist_del(&ss->hash);
116                         spin_unlock(&fs->stable_lock);
117                         putdref(ss->ssblk, MKREF(ss));
118                         putdref(ss->youthblk, MKREF(ssyouth));
119                         kfree(ss);
120                 }
121         }
122 }
123
124 static struct segsum *segsum_find(struct fs *fs, u32 segnum,
125                                   int devnum, int ssnum)
126 {
127         struct hlist_head *head = &fs->stable[shash(segnum, devnum, ssnum)];
128         struct segsum *ss, *new = NULL;
129         struct hlist_node *n;
130         struct fs_dev *dv;
131         u32 addr;
132         int err;
133
134 retry:
135         spin_lock(&fs->stable_lock);
136         hlist_for_each_entry(ss, n, head, hash)
137                 if (ss->segnum == segnum &&
138                     ss->ssnum == ssnum &&
139                     ss->devnum == devnum) {
140                         atomic_inc(&ss->refcnt);
141                         spin_unlock(&fs->stable_lock);
142                         if (new)
143                                 ss_put(new, fs);
144                         if (ss->ssblk &&
145                             !test_bit(B_Valid, &ss->ssblk->b.flags) &&
146                             (fs->rolled || fs->qphase == fs->phase)) {
147                                 /* need to read this now that roll forward
148                                  * has progressed.
149                                  */
150                                 err = lafs_read_block(ss->ssblk);
151                                 if (err) {
152                                         ss_put(ss, fs);
153                                         return ERR_PTR(err);
154                                 }
155                         }
156                         return ss;
157                 }
158         if (new) {
159                 hlist_add_head(&new->hash, head);
160                 spin_unlock(&fs->stable_lock);
161                 return new;
162         }
163         spin_unlock(&fs->stable_lock);
164
165         /* seems we must allocate something */
166
167         new = kmalloc(sizeof(*new), GFP_KERNEL | __GFP_NOFAIL);
168         new->segnum = segnum;
169         new->devnum = devnum;
170         new->ssnum = ssnum;
171         atomic_set(&new->refcnt, 1);
172         atomic_set(&new->delayed, 0);
173         INIT_HLIST_NODE(&new->hash);
174         dv = fs->devs + devnum;
175         addr = LAFSI(fs->ss[ssnum].root)->md.fs.usagetable * dv->tablesize;
176         addr += segnum >> (fs->blocksize_bits-1);
177         new->ssblk = lafs_get_block(dv->segsum, addr, NULL,
178                                     GFP_KERNEL,
179                                     MKREF(ss));
180         if (ssnum == 0)
181                 new->youthblk = lafs_get_block(dv->segsum,
182                                                segnum >> (fs->blocksize_bits-1),
183                                                NULL,
184                                                GFP_KERNEL,
185                                                MKREF(ssyouth));
186         else
187                 new->youthblk = NULL;
188         BUG_ON(!new->ssblk || IS_ERR(new->ssblk) || IS_ERR(new->youthblk));
189         if (!fs->rolled && fs->qphase != fs->phase) {
190                 /* Too early to safely read segusage blocks,
191                  * leave it until later
192                  */
193                 goto retry;
194         }
195         err = lafs_read_block(new->ssblk);
196         if (err) {
197                 ss_put(new, fs);
198                 return ERR_PTR(err);
199         }
200         goto retry;
201 }
202
203 static struct segsum *segsum_byaddr(struct fs *fs, u64 addr, int ssnum)
204 {
205         int dev;
206         u32 offset;
207         u32 seg;
208         virttoseg(fs, addr, &dev, &seg, &offset);
209
210         return segsum_find(fs, seg, dev, ssnum);
211 }
212
213 void lafs_seg_put_all(struct fs *fs)
214 {
215         /* remove all the ssblk and youthblk references from
216          * all segsums.
217          * This is called when the filesystem is being unmounted.
218          */
219         int i;
220         for (i = 0; i < SHASHSIZE; i++)
221                 if (!hlist_empty(&fs->stable[i])) {
222                         struct segsum *ss;
223                         struct datablock *b, *y;
224                         struct hlist_node *n;
225                         /* FIXME this is quadratic */
226                 retry:
227                         spin_lock(&fs->stable_lock);
228                         hlist_for_each_entry(ss, n, &fs->stable[i], hash) {
229                                 b = ss->ssblk;
230                                 y = ss->youthblk;
231                                 if (!b && !y)
232                                         continue;
233                                 ss->ssblk = NULL;
234                                 ss->youthblk = NULL;
235                                 spin_unlock(&fs->stable_lock);
236                                 if (b && test_and_clear_bit(B_Async, &b->b.flags))
237                                         putdref(b, MKREF(async));
238                                 if (y && test_and_clear_bit(B_Async, &y->b.flags))
239                                         putdref(y, MKREF(async));
240                                 putdref(b, MKREF(ss));
241                                 putdref(y, MKREF(ssyouth));
242                                 goto retry;
243                         }
244                         spin_unlock(&fs->stable_lock);
245                 }
246 }
247
248 /* We need to set SegRef on this block, and all parents,
249  * and the segsum block and all parents, and so on.
250  * We find the first ancestor (closest to the root) that does not have SegRef,
251  * and find the matching segsum block.  If that block already
252  * has SegRef, we add the reference and retry from the start.
253  * If it doesn't have SegRef, we prealloc and then tail recurse.
254  * Note: we never set SegRef on B_Root blocks as their usage
255  * isn't recorded.
256  */
257 int lafs_seg_ref_block(struct block *b, int ssnum)
258 {
259         struct fs *fs = fs_from_inode(b->inode);
260
261         LAFS_BUG(test_bit(B_InoIdx, &b->flags), b);
262         getref(b, MKREF(segref));
263
264         while (!test_bit(B_SegRef, &b->flags)) {
265                 struct block *p;
266                 struct block *b2;
267                 struct segsum *ss;
268                 struct inode *ino;
269                 int err;
270
271                 BUG_ON(test_bit(B_Root, &b->flags));
272
273                 b2 = b;
274                 /* Need to check parents */
275                 ino = b->inode;
276                 spin_lock(&ino->i_data.private_lock);
277                 for (p = b;
278                      p && !test_bit(B_SegRef, &p->flags);
279                      p = &(p->parent)->b) {
280                         if (test_bit(B_InoIdx, &p->flags)) {
281                                 struct datablock *db = LAFSI(p->inode)->dblock;
282                                 p = &db->b;
283                                 spin_unlock(&ino->i_data.private_lock);
284                                 ino = p->inode;
285                                 spin_lock(&ino->i_data.private_lock);
286                         }
287                         if (test_bit(B_SegRef, &p->flags))
288                                 break;
289                         if (!test_bit(B_Root, &p->flags))
290                                 b2 = p;
291                 }
292                 spin_unlock(&ino->i_data.private_lock);
293                 /* b2 is the first ancestor (closest to root) without SegRef */
294                 /* FIXME we have an implicit reference on b2
295                  * through b->parent...
296                  * But if a split happens, b2 might not be a
297                  * true parent any more, so we no longer have a
298                  * reference.  If that can actually happen,
299                  * we need to take a reference before dropping
300                  * the lock.
301                  * If it cannot, then we don't need the lock.
302                  */
303
304                 if (b2->physaddr == 0) {
305                         /* There is no segsum to reference */
306                         set_bit(B_SegRef, &b2->flags);
307                         continue;
308                 }
309
310                 ss = segsum_byaddr(fs, b2->physaddr, ssnum);
311                 if (IS_ERR(ss)) {
312                         putref(b, MKREF(segref));
313                         return PTR_ERR(ss);
314                 }
315
316                 if (&ss->ssblk->b == b) {
317                         /* Don't need to check for Prealloc or
318                          * SegRef, just take a reference now
319                          */
320                         if (test_and_set_bit(B_SegRef, &b2->flags))
321                                 ss_put(ss, fs);
322                         continue;
323                 }
324
325                 /* Don't need iolock here as segusage is very special */
326                 set_bit(B_PinPending, &ss->ssblk->b.flags);
327                 err = lafs_pin_dblock(ss->ssblk, AccountSpace);
328                 if (err) {
329                         ss_put(ss, fs);
330                         putref(b, MKREF(segref));
331                         return err;
332                 }
333                 if (!test_bit(B_SegRef, &ss->ssblk->b.flags)) {
334                         putref(b, MKREF(segref));
335                         b = getref(&ss->ssblk->b, MKREF(segref));
336                         ss_put(ss, fs);
337                 } else if (test_and_set_bit(B_SegRef, &b2->flags))
338                         ss_put(ss, fs);
339         }
340         putref(b, MKREF(segref));
341         return 0;
342 }
343
344 void lafs_seg_deref(struct fs *fs, u64 physaddr, int ssnum)
345 {
346         if (physaddr) {
347                 struct segsum *ss = segsum_byaddr(fs, physaddr, ssnum);
348                 BUG_ON(IS_ERR(ss));
349                 BUG_ON(atomic_read(&ss->refcnt) < 2);
350                 ss_put(ss, fs);
351                 ss_put(ss, fs);
352         }
353 }
354
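/* Adjust the recorded usage of a segment by 'diff' blocks.  For the
 * current phase the segusage block is updated and dirtied directly;
 * for a next-phase update during a checkpoint the delta is accumulated
 * in ->delayed and applied later via lafs_seg_apply_all().
 */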
355 static void seg_inc(struct fs *fs, struct segsum *ss, int diff, int in_phase)
356 {
357         if (!in_phase)
358                 atomic_add(diff, &ss->delayed);
359         else {
360                 u16 *b, *p;
361                 b = map_dblock(ss->ssblk);
362                 spin_lock(&fs->stable_lock);
363                 p = &b[ss->segnum & ((fs->blocksize-1)>>1)];
364                 //BUG_ON(diff < 0 && le16_to_cpu(*p) < -diff);
365                 if (diff < 0 && le16_to_cpu(*p) < -diff) {
366                         printk("diff=%d p=%d segnum=%d\n", diff, le16_to_cpu(*p),
367                                ss->segnum);
368                         BUG();
369                 }
370                 *p = cpu_to_le16(le16_to_cpu(*p) + diff);
371                 spin_unlock(&fs->stable_lock);
372                 unmap_dblock(ss->ssblk, b);
373                 lafs_dirty_dblock(ss->ssblk);
374         }
375 }
376
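/* A block's ->physaddr is moving from 'oldaddr' to 'newaddr' (either may
 * be zero).  Update the usage counts of the two segments and, if
 * 'moveref' is set, transfer the segsum reference that the block holds
 * from the old segment to the new one.
 */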
377 void lafs_seg_move(struct fs *fs, u64 oldaddr, u64 newaddr,
378                    int ssnum, int phase, int moveref)
379 {
380         struct segsum *ss;
381
382         dprintk("SEGMOVE %llu %llu\n",
383                 (unsigned long long) oldaddr,
384                 (unsigned long long) newaddr);
385
386         if (newaddr) {
387                 ss = segsum_byaddr(fs, newaddr, ssnum);
388                 seg_inc(fs, ss, 1, fs->qphase == phase);
389
390                 if (!moveref)
391                         ss_put(ss, fs);
392         }
393
394         if (oldaddr) {
395                 ss = segsum_byaddr(fs, oldaddr, ssnum);
396                 seg_inc(fs, ss, -1, fs->qphase == phase);
397                 ss_put(ss, fs);
398                 if (moveref)
399                         ss_put(ss, fs);
400         }
401 }
402
403 static void seg_apply(struct fs *fs, struct segsum *ss)
404 {
405         int err;
406         int diff = atomic_read(&ss->delayed);
407         if (diff == 0)
408                 return;
409         atomic_set(&ss->delayed, 0);
410         dprintk("Seg apply %d %d\n", (int)ss->segnum, diff);
411         err = lafs_read_block(ss->ssblk);
412         BUG_ON(err); // FIXME do something useful here
413         seg_inc(fs, ss, diff, 1);
414 }
415
416 /* lafs_seg_apply_all
417  * During a checkpoint, blocks written to the next phase cannot immediately
418  * update the segment usage tables so those updates need to be delayed.
419  * This function applies all the delayed updates to the segment usage
420  * files at the end of a checkpoint.
421  */
422 void lafs_seg_apply_all(struct fs *fs)
423 {
424         int i;
425
426         for (i = 0 ; i < SHASHSIZE ; i++) {
427                 struct hlist_head *head = &fs->stable[i];
428                 struct segsum *ss;
429                 struct hlist_node *n, *pos;
430                 spin_lock(&fs->stable_lock);
431                 hlist_for_each_entry_safe(ss, pos, n, head, hash) {
432                         atomic_inc(&ss->refcnt);
433                         spin_unlock(&fs->stable_lock);
434                         seg_apply(fs, ss);
435                         ss_put(ss, fs);
436                         spin_lock(&fs->stable_lock);
437                         // FIXME this still isn't safe - 'n' could
438                         // disappear while unlocked.
439                 }
440                 spin_unlock(&fs->stable_lock);
441         }
442         /* Now any clean segments found earlier are free. */
443 }
444
445 /*
446  * When creating a snapshot, we need to duplicate table[1] in
447  * each segment usage file into table[n].
448  * It would be nice to just copy block pointers but this might
449  * confuse cleaning etc. so duplicate the data for now.
450  */
451 int lafs_seg_dup(struct fs *fs, int newtable)
452 {
453         /* for each device,
454          * get blocks for each table and memcpy
455          */
456         int d;
457         int err = 0;
458         for (d=0; d < fs->devices ; d++) {
459                 struct fs_dev *dv = &fs->devs[d];
460                 int b;
461                 for (b=0 ; b < dv->tablesize ; b++) {
462                         struct datablock *obl, *nbl;
463                         u32 addr;
464                         void *from, *to;
465
466                         addr = dv->tablesize + b;
467                         obl = lafs_get_block(dv->segsum, addr, 0,
468                                              GFP_KERNEL | __GFP_NOFAIL,
469                                              MKREF(ss));
470                         err = lafs_read_block(obl);
471                         if (err) {
472                                 putdref(obl, MKREF(ss));
473                                 goto out;
474                         }
475
476                         addr = dv->tablesize * newtable + b;
477                         nbl = lafs_get_block(dv->segsum, addr, 0,
478                                              GFP_KERNEL | __GFP_NOFAIL,
479                                              MKREF(ss));
480
481                         from = map_dblock(obl);
482                         to = map_dblock_2(nbl);
483                         memcpy(to, from, fs->blocksize);
484                         unmap_dblock_2(nbl, to);
485                         unmap_dblock(obl, from);
486                         lafs_dirty_dblock(nbl);
487                         putdref(obl, MKREF(ss));
488                         putdref(nbl, MKREF(ss));
489                 }
490         }
491 out:
492         return err;
493 }
494
495 /*************************************************************
496  * Space management:  allocate, use, free
497  */
498
499 static int count_credits(struct block *b)
500 {
501         int credits = 0;
502         struct indexblock *ib = iblk(b);
503         struct inode *ino = NULL;
504
505         if (test_bit(B_Credit, &b->flags))
506                 credits++;
507         if (test_bit(B_ICredit, &b->flags))
508                 credits++;
509         if (test_bit(B_NCredit, &b->flags))
510                 credits++;
511         if (test_bit(B_NICredit, &b->flags))
512                 credits++;
513         if (test_bit(B_UnincCredit, &b->flags))
514                 credits++;
515         if (test_bit(B_Dirty, &b->flags))
516                 credits++;
517         if (test_bit(B_Realloc, &b->flags))
518                 credits++;
519
520         if (test_bit(B_Index, &b->flags)) {
521                 list_for_each_entry(b, &ib->children, siblings) {
522                         LAFS_BUG(b->parent != ib, b);
523                         credits += count_credits(b);
524                 }
525                 credits += ib->uninc_table.credits;
526         } else if ((ino = rcu_my_inode(dblk(b))) != NULL &&
527                    LAFSI(ino)->iblock &&
528                    LAFSI(ino)->iblock->b.parent
529                 ) {
530                 LAFS_BUG(LAFSI(ino)->iblock->b.parent != b->parent, b);
531                 credits += count_credits(&LAFSI(ino)->iblock->b);
532         }
533         rcu_iput(ino);
534         return credits;
535 }
536
537 int temp_credits;
538 static void check_credits(struct fs *fs)
539 {
540         /* DEBUGGING AID
541          * count credits in the tree and make sure they match fs->allocated_blocks
542          */
543         int credits;
544         if (fs->ss[0].root == NULL ||
545             LAFSI(fs->ss[0].root)->iblock == NULL ||
546             LAFSI(fs->ss[0].root)->dblock == NULL)
547                 return;
548         credits = count_credits(&LAFSI(fs->ss[0].root)->iblock->b);
549         credits += count_credits(&LAFSI(fs->ss[0].root)->dblock->b);
550         if (credits + fs->cluster_updates + temp_credits != fs->allocated_blocks) {
551                 printk("credits=%d updates=%d temp=%d allocated=%d\n", credits,
552                        fs->cluster_updates, temp_credits,
553                        (int)fs->allocated_blocks);
554                 lafs_dump_tree();
555                 WARN_ON(1);
556         }
557 }
558
559 void lafs_space_return(struct fs *fs, int credits)
560 {
561         spin_lock(&fs->alloc_lock);
562         BUG_ON(credits < 0);
563         if (credits > fs->allocated_blocks)
564                 printk("cred=%d ab=%d\n", credits, (int)fs->allocated_blocks);
565         BUG_ON(credits > fs->allocated_blocks);
566         fs->allocated_blocks -= credits;
567         spin_unlock(&fs->alloc_lock);
568         check_credits(fs);
569 }
570
571 int lafs_alloc_cleaner_segs(struct fs *fs, int max)
572 {
573         /* cleaner is about to start.
574          * See how many segments of space can safely
575          * be reserved for its use.
576          * We don't allocate more to the cleaner than we
577          * leave spare for new allocations.
578          */
579         int watermark = fs->max_segment * 4;
580         int rv = 0;
581         spin_lock(&fs->alloc_lock);
582         while (fs->clean_reserved < max * fs->max_segment &&
583                fs->free_blocks > (fs->clean_reserved
584                                   + fs->allocated_blocks
585                                   + watermark)) {
586                 fs->clean_reserved += fs->max_segment;
587                 fs->free_blocks -= fs->max_segment;
588                 rv++;
589         }
590         spin_unlock(&fs->alloc_lock);
591         return rv;
592 }
593
594 int lafs_space_alloc(struct fs *fs, int credits, int why)
595 {
596         /* 'why's for space reservation.
597          * NewSpace means we want to write a block which didn't exist
598          *   before.  This is allowed to fail or block if the cleaner
599          *   appears able to make progress.
600          * ReleaseSpace means we want to write a block which does currently
601          *   exist, so doing this will eventually free up space.  This must
602          *   never fail, but can block.
603          * CleanSpace means we want to write a block to relocate it to
604          *   a 'cleaning' segment.   This may never fail.
605          * AccountSpace means we absolutely need this block now, and it is
606          *   a BUG if there is no space available.
607          */
608         u64 watermark = 0;
609
610         check_credits(fs);
611         spin_lock(&fs->alloc_lock);
612         switch(why) {
613         case NewSpace:
614                 watermark += RELEASE_RESERVED * fs->max_segment;
615                 /* FALL THROUGH */
616         case ReleaseSpace:
617                 watermark += ACCOUNT_RESERVED * fs->max_segment;
618                 /* FALL THROUGH */
619         case CleanSpace:
620         case AccountSpace:
621                 /* Definitely no water mark here. */
622                 break;
623         }
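        /* Thanks to the fall-through above, NewSpace keeps both the
         * RELEASE_RESERVED and ACCOUNT_RESERVED headroom free, ReleaseSpace
         * keeps only the ACCOUNT_RESERVED headroom, and CleanSpace and
         * AccountSpace may use everything.
         */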
624
625         if (fs->rolled && watermark) {
626                 /* We cannot account properly before roll-forward has
627                  * completed. FIXME once it has completed we need to
628                  * check and invalidate the FS if there was a problem.
629                  */
630                 if (fs->free_blocks < fs->allocated_blocks
631                     + credits + watermark)
632                         credits = 0; /* Sorry, no room */
633         }
634         if (fs->rolled && watermark == 0) {
635                 /* When including the clean_reserved space, there should
636                  * be room for these controlled allocations
637                  */
638                 if (fs->free_blocks + fs->clean_reserved <
639                     fs->allocated_blocks + credits)
640                         BUG();
641         }
642
643         if (credits == 0) {
644                 if (why == NewSpace && !test_bit(EmergencyClean, &fs->fsstate))
645                         set_bit(EmergencyPending, &fs->fsstate);
646                 if (why >= ReleaseSpace || !test_bit(EmergencyClean, &fs->fsstate))
647                         if (!test_bit(CleanerBlocks, &fs->fsstate) ||
648                             fs->cleaner.need > watermark + fs->max_segment) {
649                                 fs->cleaner.need = watermark + fs->max_segment;
650                                 set_bit(CleanerBlocks, &fs->fsstate);
651                                 lafs_wake_thread(fs);
652                         }
653         } else if (why == NewSpace)
654                 if (test_bit(EmergencyClean, &fs->fsstate) ||
655                     test_bit(EmergencyPending, &fs->fsstate)) {
656                         clear_bit(EmergencyPending, &fs->fsstate);
657                         clear_bit(EmergencyClean, &fs->fsstate);
658                 }
659
660         fs->allocated_blocks += credits;
661         BUG_ON(fs->free_blocks + fs->clean_reserved < fs->allocated_blocks);
662         spin_unlock(&fs->alloc_lock);
663         return credits;
664 }
665
666 /********************************************************************
667  *
668  * Segment usage / youth tracking
669  */
670
671 /* Information about free and cleanable segments is stored in a table
672  * comprising a few pages.  Indexes into this table are 16 bits: 4 bits
673  * for the page number, 12 bits for the index within the page.
674  * Different pages have different sized entries to allow for different
675  * depths in the skiplist that sorts entries by dev/segment.
676  * Entries are at least 16 bytes, so 12 bits can index up to 64KB within a page.
677  * Each entry is also on one of four lists:
678  *  - unused:  this entry is not used
679  *  - free: this entry is for a free segment
680  *  - cleanable:  this entry is for a cleanable segment.
681  *  - clean: segment has been cleaned but is not yet free (awaiting checkpoint).
682  * These are singly linked lists (via 'next').  We record head and tail.
683  * The end of each list is marked by a 'next' value of 0xFFFF, not 0.
684  * We usually remove entries from the head, but when the table is
685  * full, we might walk part way down a list and discard all the rest.
686  * We discard them by setting score to 0xFFFFFFFF.  We then walk the
687  * skip list moving anything with a score of 0xFFFFFFFF to the unused list.
688  *
689  * We occasionally sort the cleanable list using a merge sort.
690  *
691  * There is a progression from 'cleanable' to 'clean' to 'free', though
692  * segments can enter at any point.
693  * When we choose a segment to clean, we remove the entry from the cleanable
694  * list. We do this by setting the score to 0xFFFFFFFE and unlinking it.
695  * After cleaning has completed a scan should find that it is clean and so
696  * add it to the 'clean' list.
697  * When we record a 'clean' segment as 'free' (after a checkpoint) we
698  * move it from the clean list to the free list, from where it will be
699  * removed when needed.
700  * We don't add new free segments when the total of free+clean segments
701  * is more than half the size of the table.
702  * Similarly when the cleanable table reaches half the available size
703  * we remove the least-interesting half.
704  * When we choose a free segment to start filling, we remove it from the
705  * free list but not from the table.  The score is set to 0xFFFFFFFD to
706  * record that it is unlinked but busy.  If it is found to be cleanable,
707  * it will be ignored.  When we finish filling a segment, we find the entry
708  * again and remove it properly so it can become cleanable later.
709  */
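/* A table index such as 0x3007 therefore refers to page 3, entry 7;
 * the byte offset of an entry within its page is the entry number times
 * (sizeof(struct segstat) + size[page] * sizeof(u16)) - see segfollow().
 */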
710
711 #define SCORE_MAX       0xFFFFFFFC      /* Maximum normal score */
712 #define SCORE_ACTIVE    0xFFFFFFFD      /* This segment is being written to */
713 #define SCORE_CLEANING  0xFFFFFFFE      /* This segment is being cleaned */
714 #define SCORE_DEAD      0xFFFFFFFF      /* This segment is to be removed */
715
716 struct segstat {
717         u16     next;
718         u16     dev;
719         u32     segment;
720         u32     score;
721         u16     usage;
722         u16     skip[0]; /* or larger... */
723 };
724
725 static inline struct segstat *segfollow(struct segtracker *st, u16 link)
726 {
727         void *a;
728         if (link == 0xFFFF)
729                 return NULL;
730         a = st->page[link >> 12];
731         a += (link & 0xFFF) *
732                 (st->size[link >> 12] * sizeof(u16) + sizeof(struct segstat));
733         return (struct segstat *) a;
734 }
735
736 int lafs_segtrack_init(struct segtracker *st)
737 {
738         int found;
739         int h, i;
740         int n[4];
741
742         st->page[0] = kmalloc(PAGE_SIZE, GFP_KERNEL);
743         st->page[1] = kmalloc(PAGE_SIZE, GFP_KERNEL);
744         st->page[2] = kmalloc(PAGE_SIZE, GFP_KERNEL);
745         st->page[3] = kmalloc(PAGE_SIZE, GFP_KERNEL);
746
747         if (st->page[0] == NULL ||
748             st->page[1] == NULL ||
749             st->page[2] == NULL ||
750             st->page[3] == NULL) {
751                 lafs_segtrack_free(st);
752                 return -ENOMEM;
753         }
754         st->size[0] = 1;
755         st->size[1] = 1;
756         st->size[2] = 5;
757         st->size[3] = 9;
758         BUG_ON(SEG_NUM_HEIGHTS <= 9);
759
760         st->unused.first = st->unused.last = 0xffff;
761         st->cleanable.first = st->cleanable.last = 0xffff;
762         st->free.first = st->free.last = 0xffff;
763         st->clean.first = st->clean.last = 0xffff;
764         st->unused.cnt = st->free.cnt = st->cleanable.cnt = 0;
765
766         for (h = 0; h < SEG_NUM_HEIGHTS; h++)
767                 st->head[h] = 0xFFFF;
768
769         /* how many entries in each page */
770         for (i = 0 ; i < 4; i++)
771                 n[i] = PAGE_SIZE /  (sizeof(struct segstat) +
772                                      st->size[i] * sizeof(u16));
773
774         do {
775                 char rand;
776                 int p;
777                 found = 0;
778                 get_random_bytes(&rand, 1);
779                 p = rand & 3;
780
781                 while (p < 3 && n[p] == 0)
782                         p++;
783                 for ( ; p >= 0; p--)
784                         if (n[p]) {
785                                 int sn;
786                                 struct segstat *ss;
787                                 n[p]--;
788                                 sn = (p<<12) + n[p];
789
790                                 ss = segfollow(st, sn);
791                                 ss->next = st->unused.first;
792                                 st->unused.first = sn;
793                                 if (st->unused.cnt == 0)
794                                         st->unused.last = sn;
795                                 st->unused.cnt++;
796                                 found = 1;
797                         }
798         } while (found);
799         st->total = st->unused.cnt;
800         return 0;
801 }
802
803 void lafs_segtrack_free(struct segtracker *st)
804 {
805         kfree(st->page[0]);
806         kfree(st->page[1]);
807         kfree(st->page[2]);
808         kfree(st->page[3]);
809 }
810
811 int lafs_check_seg_cnt(struct segtracker *st)
812 {
813         /* check that the unused, free, cleanable and clean lists have the correct counts.
814          */
815         int sn, cnt, prev;
816
817         cnt = 0;
818         for (prev = sn = st->unused.first ;
819              sn != 0xFFFF ;
820              sn = segfollow(st, (prev = sn))->next)
821                 cnt++;
822         if (cnt != st->unused.cnt) {
823                 printk("%d != %d\n", cnt, st->unused.cnt); WARN_ON(1);
824                 return 1;
825         }
826         if (st->unused.last != prev) {
827                 printk("L%d != %d\n", prev, st->unused.last); WARN_ON(1);
828                 return 1;
829         }
830
831         cnt = 0;
832         for (prev = sn = st->free.first ;
833              sn != 0xFFFF;
834              sn = segfollow(st, (prev = sn))->next)
835                 cnt++;
836         if (cnt != st->free.cnt) {
837                 printk("%d != %d\n", cnt, st->free.cnt); WARN_ON(1);
838                 return 1;
839         }
840         if (st->free.last != prev) {
841                 printk("L%d != %d\n", prev, st->free.last); WARN_ON(1);
842                 return 1;
843         }
844
845         cnt = 0;
846         for (prev = sn = st->clean.first ;
847              sn != 0xFFFF;
848              sn = segfollow(st, (prev = sn))->next)
849                 cnt++;
850         if (cnt != st->clean.cnt) {
851                 printk("%d != %d\n", cnt, st->clean.cnt); WARN_ON(1);
852                 return 1;
853         }
854         if (st->clean.last != prev) {
855                 printk("L%d != %d\n", prev, st->clean.last); WARN_ON(1);
856                 return 1;
857         }
858
859         cnt = 0;
860         for (prev = sn = st->cleanable.first ;
861              sn != 0xFFFF;
862              sn = segfollow(st, (prev = sn))->next)
863                 cnt++;
864         if (cnt != st->cleanable.cnt) {
865                 printk("%d != %d\n", cnt, st->cleanable.cnt); WARN_ON(1);
866                 return 1;
867         }
868         if (st->cleanable.last != prev) {
869                 printk("L%d != %d\n", prev, st->cleanable.last); WARN_ON(1);
870                 return 1;
871         }
872
873         return 0;
874 }
875
876 static void segsort(struct segtracker *st, struct slist *l)
877 {
878         /* sort the 'l' list of st by score - lowest first */
879         /* use a merge sort */
880
881         u16 last = 0xFFFF;
882         u16 head[2];
883         head[0] = l->first;
884         head[1] = 0xFFFF;
885
886         do {
887                 u16 *hp[2], h[2];
888                 int curr = 0;
889                 u32 prev = 0;
890                 int next = 0;
891
892                 hp[0] = &head[0];
893                 hp[1] = &head[1];
894
895                 h[0] = head[0];
896                 h[1] = head[1];
897
898                 /* Find the least of h[0] and h[1] that is not less than
899                  * prev, and add it to hp[curr].  If both are less than
900                  * prev (both runs have ended), flip 'curr' and add the smallest.
901                  */
902                 while (h[0] != 0xFFFF || h[1] != 0xFFFF) {
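                        /* The three-way xor test below is zero exactly when,
                         * taking scores in cyclic order starting from 'prev',
                         * h[1-next] sorts no later than h[next]; in that case
                         * the other list holds the element to emit next, so
                         * switch to it.
                         */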
903                         if (h[next] == 0xFFFF ||
904                             (h[1-next] != 0xFFFF &&
905                              !((prev <= segfollow(st, h[1-next])->score)
906                                ^ (segfollow(st, h[1-next])->score
907                                   <= segfollow(st, h[next])->score)
908                                ^ (segfollow(st, h[next])->score <= prev))
909                                     ))
910                                 next = 1 - next;
911
912                         if (segfollow(st, h[next])->score < prev)
913                                 curr = 1 - curr;
914                         prev = segfollow(st, h[next])->score;
915                         *hp[curr] = h[next];
916                         hp[curr] = &segfollow(st, h[next])->next;
917                         last = h[next];
918                         h[next] = segfollow(st, h[next])->next;
919                 }
920                 *hp[0] = 0xFFFF;
921                 *hp[1] = 0xFFFF;
922         } while (head[0] != 0xFFFF && head[1] != 0xFFFF);
923         if (head[0] == 0xFFFF)
924                 l->first = head[1];
925         else
926                 l->first = head[0];
927         l->last = last;
928 }
929
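/* Order table entries by (dev, segment) for skip-list searching. */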
930 static int statcmp(struct segstat *ss, u16 dev, u32 seg)
931 {
932         if (ss->dev < dev)
933                 return -1;
934         if (ss->dev > dev)
935                 return 1;
936         if (ss->segment < seg)
937                 return -1;
938         if (ss->segment > seg)
939                 return 1;
940         return 0;
941 }
942
943 static struct segstat *segfind(struct segtracker *st, u16 dev,
944                                u32 segnum, u16 *where[SEG_NUM_HEIGHTS])
945 {
946         /* Find the segment entry for dev/segnum.
947          * Return link info in where to allow insertion/deletion.
948          */
949         int level;
950         u16 *head = st->head;
951         for (level = SEG_NUM_HEIGHTS-1 ; level >= 0; level--) {
952                 while (head[level] != 0xffff &&
953                        statcmp(segfollow(st, head[level]), dev, segnum) < 0) {
954                         head = segfollow(st, head[level])->skip;
955                 }
956                 where[level] = &head[level];
957         }
958         if (head[0] == 0xFFFF ||
959             statcmp(segfollow(st, head[0]), dev, segnum) != 0)
960                 return NULL;
961
962         return segfollow(st, head[0]);
963 }
964
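/* Choose a skip-list height for a new entry.  Entries in pages that have
 * more than one skip slot always get at least height 1; each further
 * level requires two random bits to be set (probability 1/4), bounded by
 * the number of skip slots in the entry's page.
 */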
965 static int segchoose_height(struct segtracker *st, u16 ssn)
966 {
967         unsigned long rand[DIV_ROUND_UP(SEG_NUM_HEIGHTS * 2 / 8 + 1,
968                                         sizeof(unsigned long))];
969         int max = st->size[ssn>>12];
970         int h = 0;
971         get_random_bytes(rand, sizeof(rand));
972
973         if (max > 1)
974                 h = 1;
975
976         while (h < max &&
977                test_bit(h*2, rand) &&
978                test_bit(h*2+1, rand))
979                 h++;
980         return h;
981 }
982
983 static void seginsert(struct segtracker *st, u16 ssn,
984                       u16 *where[SEG_NUM_HEIGHTS])
985 {
986         /* We looked for 'ss' but didn't find it.  'where' is the
987          * result of looking.  Now insert 'ss'
988          */
989         struct segstat *ss = segfollow(st, ssn);
990         int h = segchoose_height(st, ssn);
991         while (h >= 0) {
992                 ss->skip[h] = *where[h];
993                 *where[h] = ssn;
994                 h--;
995         }
996 }
997
998 static void segdelete(struct segtracker *st, struct segstat *ss)
999 {
1000         /* This segstat has just been removed from a list (free or cleanable).
1001          * Remove it from the skiplist as well
1002          */
1003         u16 *where[SEG_NUM_HEIGHTS];
1004         struct segstat *ss2 = segfind(st, ss->dev, ss->segment, where);
1005         int h;
1006         int pos;
1007
1008         BUG_ON(ss2 == NULL);
1009
1010         pos = *where[0];
1011         for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos ; h++)
1012                 *where[h] = ss2->skip[h];
1013         ss2->next = st->unused.first;
1014         st->unused.first = pos;
1015         st->unused.cnt++;
1016         lafs_check_seg_cnt(st);
1017         // FIXME what about 'last' ??
1018 }
1019
1020 static void segdelete_all(struct segtracker *st, struct fs *fs)
1021 {
1022         /* walk entire skip list.  Any entry with score of SCORE_DEAD
1023          * is deleted
1024          */
1025         u16 *where[SEG_NUM_HEIGHTS];
1026         int h;
1027
1028         for (h = SEG_NUM_HEIGHTS-1; h >= 0; h--)
1029                 where[h] = &st->head[h];
1030
1031         while (*where[0] != 0xFFFF) {
1032                 u16 pos = *where[0];
1033                 struct segstat *ss = segfollow(st, pos);
1034                 if (ss->score == SCORE_DEAD) {
1035                         /* delete this entry */
1036
1037                         /* FIXME Locking is really bad here */
1038                         struct segsum *ssm;
1039                         ssm = segsum_find(fs, ss->segment, ss->dev, 0);
1040                         putdref(ssm->ssblk, MKREF(intable));
1041                         putdref(ssm->youthblk, MKREF(intable));
1042                         ss_put(ssm, fs);
1043
1044                         for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos; h++)
1045                                 *where[h] = ss->skip[h];
1046                         ss->next = st->unused.first;
1047                         st->unused.first = pos;
1048                         st->unused.cnt++;
1049                         lafs_check_seg_cnt(st);
1050                 } else {
1051                         /* advance 'where' to here */
1052                         for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos; h++)
1053                                 where[h] = &ss->skip[h];
1054                 }
1055         }
1056 }
1057
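/* Detach the first entry from the given list and return its table index,
 * or 0xFFFF if the list is empty.  The skip list is not touched.
 */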
1058 static u16 seg_pop(struct segtracker *st, struct slist *which)
1059 {
1060         struct segstat *ss;
1061         u16 rv = which->first;
1062         if (rv == 0xFFFF)
1063                 return rv;
1064         ss = segfollow(st, rv);
1065         which->first = ss->next;
1066         which->cnt--;
1067         if (ss->next == 0xFFFF)
1068                 which->last = 0xFFFF;
1069         return rv;
1070 }
1071
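/* Take an entry off the 'unused' list, fill it in, link it onto 'which'
 * (if given) at the head or, when 'atend', at the tail, and insert it
 * into the skip list at the position previously found in 'where'.
 * Returns NULL if no unused entries remain.
 */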
1072 static struct segstat *seg_add_new(struct segtracker *st, struct slist *which, int atend,
1073                                    int dev, u32 seg, int score, int usage,
1074                                    u16 *where[SEG_NUM_HEIGHTS])
1075 {
1076         int ssn;
1077         struct segstat *ss;
1078
1079         ssn = seg_pop(st, &st->unused);
1080         ss = segfollow(st, ssn);
1081         if (!ss)
1082                 return ss;
1083
1084         if (which) {
1085                 if (atend) {
1086                         int l = which->last;
1087                         ss->next = 0xFFFF;
1088                         if (l == 0xFFFF) {
1089                                 which->first = ssn;
1090                                 which->last = ssn;
1091                         } else {
1092                                 which->last = ssn;
1093                                 BUG_ON(segfollow(st, l)->next != 0xFFFF);
1094                                 segfollow(st, l)->next = ssn;
1095                         }
1096                 } else {
1097                         ss->next = which->first;
1098                         which->first = ssn;
1099                         if (which->last == 0xFFFF)
1100                                 which->last = ssn;
1101                 }
1102                 which->cnt++;
1103         }
1104         ss->dev = dev;
1105         ss->segment = seg;
1106         ss->score = score;
1107         ss->usage = usage;
1108         seginsert(st, ssn, where);
1109         lafs_check_seg_cnt(st);
1110         return ss;
1111 }
1112
1113 void lafs_free_get(struct fs *fs, unsigned int *dev, u32 *seg,
1114                    int nonlogged)
1115 {
1116         /* Select and return a free segment.
1117          * The youth value must have been zero, and we set it to the
1118          * current youth number.
1119          */
1120         u16 *youthp;
1121         struct segstat *ss;
1122         struct segsum *ssum = NULL;
1123         int ssnum;
1124
1125         BUG_ON(nonlogged); // FIXME should handle this case, not ignore it
1126
1127 again:
1128         spin_lock(&fs->lock);
1129
1130         wait_event_lock(fs->phase_wait,
1131                         !fs->scan.first_free_pass ||
1132                         fs->segtrack->free.first != 0xFFFF,
1133                         fs->lock);
1134
1135         ss = segfollow(fs->segtrack, fs->segtrack->free.first);
1136         BUG_ON(!ss);
1137
1138         *dev = ss->dev;
1139         *seg = ss->segment;
1140
1141         /* On the final checkpoint we skip the youth update as
1142          * rollforward will do it for us, and it is too late
1143          * to write the new youth block anyway.
1144          * FIXME there should be a more general way to delay
1145          * updates to the Youth block.
1146          */
1147         if (!test_bit(DelayYouth, &fs->fsstate) &&
1148             !(ssum &&
1149               ssum->devnum == ss->dev &&
1150               ssum->segnum == ss->segment)) {
1151                 spin_unlock(&fs->lock);
1152                 if (ssum)
1153                         ss_put(ssum, fs);
1154
1155                 ssum = segsum_find(fs, ss->segment, ss->dev, 0);
1156                 /* As we hold a ref on the youth block for anything in the
1157                  * table, and that block was loaded at the time, it must
1158                  * still be valid.
1159                  */
1160                 BUG_ON(!ssum || !ssum->youthblk);
1161                 BUG_ON(!test_bit(B_Valid, &ssum->youthblk->b.flags));
1162                 set_bit(B_PinPending, &ssum->youthblk->b.flags);
1163                 lafs_pin_dblock(ssum->youthblk, AccountSpace);
1164                 goto again;
1165         }
1166         seg_pop(fs->segtrack, &fs->segtrack->free);
1167
1168         ss->score = SCORE_ACTIVE;
1169         /* still in table, but unlinked */
1170
1171         if (ssum && test_bit(DelayYouth, &fs->fsstate)) {
1172                 ss_put(ssum, fs);
1173                 ssum = NULL;
1174         }
1175
1176         if (ssum) {
1177                 u16 y;
1178                 /* HACK youth_next should always be at least 0x8000 so that
1179                  * cleanable score differentiates well for new segments.
1180                  * Old code would sometimes set youth_next very low, so
1181                  * override that.
1182                  */
1183
1184                 if (fs->youth_next < 0x8000)
1185                         fs->youth_next = 0x8000;
1186                 y = fs->youth_next;
1187                 if (fs->scan.do_decay &&
1188                     (fs->scan.free_dev < ss->dev
1189                      || (fs->scan.free_dev == ss->dev 
1190                          && fs->scan.free_block < (ss->segment
1191                                                    / (fs->blocksize / 2)))
1192                             ))
1193                         /* Haven't decayed this block yet - revert decay */
1194                         y = decay_undo(y);
1195                 youthp = map_dblock(ssum->youthblk);
1196                 youthp[(*seg) & ((1 << (fs->blocksize_bits
1197                                         - 1)) - 1)]
1198                         = cpu_to_le16(y);
1199                 fs->youth_next++;
1200                 unmap_dblock(ssum->youthblk, youthp);
1201                 lafs_dirty_dblock(ssum->youthblk);
1202         }
1203
1204         spin_unlock(&fs->lock);
1205
1206         /* now need to reserve/dirty/reference the youth and
1207          * segsum block for each snapshot that could possibly
1208          * get written here.
1209          * NOTE: this needs fixing to support snapshots. Later.
1210          */
1211         for (ssnum = 0; ssnum < 1 ; ssnum++) {
1212                 ssum = segsum_find(fs, ss->segment, ss->dev, ssnum);
1213                 if (IS_ERR(ssum))
1214                         /* ?? what do I need to release etc */
1215                         /* Maybe this cannot fail because we own references
1216                          * to the two blocks !! */
1217                         LAFS_BUG(1, NULL);
1218                 lafs_checkpoint_lock(fs);
1219                 set_bit(B_PinPending, &ssum->ssblk->b.flags);
1220                 (void)lafs_pin_dblock(ssum->ssblk, AccountSpace);
1221                 lafs_checkpoint_unlock(fs);
1222         }
1223         if (ssum)
1224                 ss_put(ssum, fs);
1225
1226         dprintk("NEXT segment found %d/%d youth %d\n",
1227                 *dev, *seg, fs->youth_next - 1);
1228         /* Note that we return an implicit reference to the ssum */
1229 }
1230
1231 void lafs_seg_forget(struct fs *fs, int dev, u32 seg)
1232 {
1233         /* this segment was being filled and is now full.
1234          * We need to drop it from the table, and drop
1235          * references to the blocks
1236          */
1237         struct segstat tmp;
1238         struct segsum *ss;
1239
1240         spin_lock(&fs->lock);
1241         tmp.dev = dev;
1242         tmp.segment = seg;
1243         segdelete(fs->segtrack, &tmp);
1244         spin_unlock(&fs->lock);
1245
1246         ss = segsum_find(fs, seg, dev, 0);
1247         BUG_ON(IS_ERR(ss));
1248         BUG_ON(atomic_read(&ss->refcnt) < 2);
1249         /* Removed from table so ... */
1250         putdref(ss->ssblk, MKREF(intable));
1251         putdref(ss->youthblk, MKREF(intable));
1252         ss_put(ss, fs);
1253         ss_put(ss, fs);
1254 }
1255
1256 int lafs_clean_count(struct fs *fs, int *any_clean)
1257 {
1258         int rv;
1259         spin_lock(&fs->lock);
1260         *any_clean = fs->segtrack->clean.cnt != 0;
1261         rv = fs->segtrack->free.cnt + fs->segtrack->clean.cnt;
1262         spin_unlock(&fs->lock);
1263         return rv;
1264 }
1265
1266 static int add_free(struct fs *fs, unsigned int dev, u32 seg, u16 *youthp)
1267 {
1268         /* This dev/seg is known to be free. add it to the list */
1269         struct segstat *ss;
1270         u16 *where[SEG_NUM_HEIGHTS];
1271         dprintk("add_free %d %d\n", (int)dev, (int)seg);
1272         spin_lock(&fs->lock);
1273         if (*youthp) {
1274                 /* not really free any more */
1275                 spin_unlock(&fs->lock);
1276                 return 0;
1277         }
1278
1279         ss = segfind(fs->segtrack, dev, seg, where);
1280         if (ss) {
1281                 /* already in the table.  If it happens to be
1282                  * on the cleanable list, we have to wait for
1283                  * the cleaner to find it and add it to our list.
1284                  */
1285                 spin_unlock(&fs->lock);
1286                 return 0;
1287         }
1288         if (fs->segtrack->free.cnt + fs->segtrack->clean.cnt >=
1289             fs->segtrack->total / 2) {
1290                 /* Have enough free/clean entries already */
1291                 spin_unlock(&fs->lock);
1292                 return 0;
1293         }
1294
1295         seg_add_new(fs->segtrack, &fs->segtrack->free, 0,
1296                     dev, seg, 0, 0, where);
1297
1298         spin_unlock(&fs->lock);
1299         return 1;
1300 }
1301
1302 static int add_clean(struct fs *fs, unsigned int dev, u32 seg)
1303 {
1304         /* This dev/seg is now clean.  Make sure it is on the list.
1305          * Chances are that this is already present in the table as
1306          * a recently cleaned segment.
1307          * Return TRUE if segment was added to the table (as this
1308          * implies a reference count).
1309          */
1310         struct segstat *ss;
1311         u16 *where[SEG_NUM_HEIGHTS];
1312
1313         dprintk("ADD CLEAN %d/%d %d #####################################\n",
1314                 dev, seg, fs->segtrack->clean.cnt);
1315         spin_lock(&fs->lock);
1316         ss = segfind(fs->segtrack, dev, seg, where);
1317         if (ss) {
1318                 /* already in the table.  If loose, attach it to
1319                  * the clean list.  If cleanable, set score to 0 to make
1320                  * sure it is found soon
1321                  */
1322                 if (ss->score == SCORE_CLEANING) {
1323                         ss->next = fs->segtrack->clean.first;
1324                         fs->segtrack->clean.first = where[0][0];
1325                         if (fs->segtrack->clean.last == 0xFFFF)
1326                                 fs->segtrack->clean.last =
1327                                         fs->segtrack->clean.first;
1328                         fs->segtrack->clean.cnt++;
1329                 }
1330                 if (ss->score != SCORE_ACTIVE) {
1331                         /* Must be on the clean list now */
1332                         ss->score = 0;
1333                         ss->usage = 0;
1334                 }
1335                 spin_unlock(&fs->lock);
1336                 return 0;
1337         }
1338
1339         if (fs->segtrack->free.cnt + fs->segtrack->clean.cnt >=
1340             fs->segtrack->total / 2) {
1341                 /* Have enough free/clean entries already */
1342                 spin_unlock(&fs->lock);
1343                 return 0;
1344         }
1345
1346         ss = seg_add_new(fs->segtrack, &fs->segtrack->clean, 0,
1347                          dev, seg, 0, 0, where);
1348
1349         spin_unlock(&fs->lock);
1350         if (ss)
1351                 return 1;
1352         else
1353                 return 0;
1354 }
1355
1356 void lafs_add_active(struct fs *fs, u64 addr)
1357 {
1358         /* add this segment to the table as 'active'.
1359          * This is only called once at mount time for the
1360          * active segments.  Other segments become active
1361          * via free_get
1362          */
1363         struct segstat *ss;
1364         u16 *where[SEG_NUM_HEIGHTS];
1365         struct segsum *ssum = segsum_byaddr(fs, addr, 0);
1366         (void)getdref(ssum->ssblk, MKREF(intable));
1367         (void)getdref(ssum->youthblk, MKREF(intable));
1368         spin_lock(&fs->lock);
1369         ss = segfind(fs->segtrack, ssum->devnum, ssum->segnum, where);
1370         BUG_ON(ss);
1371         ss = seg_add_new(fs->segtrack, NULL, 0,
1372                          ssum->devnum, ssum->segnum,
1373                          SCORE_ACTIVE, 0, where);
1374         BUG_ON(!ss);
1375
1376         spin_unlock(&fs->lock);
1377 }
1378
1379 void lafs_clean_free(struct fs *fs)
1380 {
1381         /* We are finishing off a checkpoint.  Move all from 'clean'
1382          * list to 'free' list, and set the youth for each to 0.
1383          * Note that we can walk the 'clean' list without locking as
1384          * items are only ever removed by this thread.
1385          */
1386         int ssn;
1387         struct segstat *ss;
1388         if (fs->segtrack->clean.cnt == 0)
1389                 return;
1390         for (ssn = fs->segtrack->clean.first ; ssn != 0xffff; ssn = ss->next) {
1391                 struct datablock *db;
1392                 int err;
1393                 ss = segfollow(fs->segtrack, ssn);
1394                 spin_lock(&fs->lock);
1395                 fs->free_blocks += fs->devs[ss->dev].segment_size;
1396                 spin_unlock(&fs->lock);
1397                 db = lafs_get_block(fs->devs[ss->dev].segsum,
1398                                     ss->segment >> (fs->blocksize_bits-1),
1399                                     NULL, GFP_KERNEL | __GFP_NOFAIL,
1400                                     MKREF(cleanfree));
1401                 err = lafs_read_block(db);
1402                 if (err == 0)
1403                         err = lafs_reserve_block(&db->b, AccountSpace);
1404                 if (err == 0) {
1405                         u16 *b = map_dblock(db);
1406                         spin_lock(&fs->stable_lock);
1407                         b[ss->segment & ((fs->blocksize-1)>>1)] = 0;
1408                         spin_unlock(&fs->stable_lock);
1409                         unmap_dblock(db, b);
1410                         lafs_dirty_dblock(db);
1411                 } else
1412                         /* FIXME make filesystem read-only */
1413                         BUG();
1414                 putdref(db, MKREF(cleanfree));
1415         }
1416         /* Now move them to the 'free' list */
1417         spin_lock(&fs->lock);
1418         fs->segtrack->free.cnt += fs->segtrack->clean.cnt;
1419         if (fs->segtrack->free.last == 0xFFFF)
1420                 fs->segtrack->free.first = fs->segtrack->clean.first;
1421         else
1422                 segfollow(fs->segtrack, fs->segtrack->free.last)->next
1423                         = fs->segtrack->clean.first;
1424         fs->segtrack->free.last = fs->segtrack->clean.last;
1425         fs->segtrack->clean.first = 0xFFFF;
1426         fs->segtrack->clean.last = 0xFFFF;
1427         fs->segtrack->clean.cnt = 0;
1428         spin_unlock(&fs->lock);
1429 }
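
/* lafs_clean_free() above locates the little-endian 16-bit youth entry for
 * a segment by splitting the segment number into a block number and an
 * offset: with one __le16 per segment a block holds blocksize/2 entries, so
 * the block is seg >> (blocksize_bits - 1) and the offset within the block
 * is seg & ((blocksize - 1) >> 1).  A stand-alone sketch of that arithmetic
 * (hypothetical helper, not part of lafs):
 */
#if 0	/* illustrative sketch, not built */
static void toy_locate_u16_entry(unsigned int seg,
				 unsigned int blocksize,
				 unsigned int blocksize_bits,
				 unsigned int *block, unsigned int *offset)
{
	*block  = seg >> (blocksize_bits - 1);	/* seg / (blocksize / 2) */
	*offset = seg & ((blocksize - 1) >> 1);	/* seg % (blocksize / 2) */
}

/*
 * Example: with 1024-byte blocks (512 entries per block), segment 1300 is
 * entry 276 of block 2, since 2 * 512 + 276 == 1300.
 */
#endif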
1430
1431 void lafs_dump_cleanable(void)
1432 {
1433         int i;
1434         struct segtracker *st;
1435         u16 ssn;
1436         struct segstat *ss;
1437         if (!dfs) {
1438                 printk("No cleanable table\n");
1439                 return;
1440         }
1441         st = dfs->segtrack;
1442         printk("============= Cleanable table (%d) =================\n",
1443                st->cleanable.cnt);
1444         printk("pos: dev/seg  usage score\n");
1445
1446         i = 0;
1447         for (ssn = st->cleanable.first; ssn != 0xffff; ssn = ss->next) {
1448                 ss = segfollow(st, ssn);
1449
1450                 printk("%3d: %3d/%-4d %5d %d\n",
1451                        i, ss->dev,
1452                        ss->segment,
1453                        ss->usage,
1454                        ss->score);
1455                 i++;
1456         }
1457         printk("...sorted....\n");
1458         segsort(st, &st->cleanable);
1459         i = 0;
1460         for (ssn = st->cleanable.first; ssn != 0xffff; ssn = ss->next) {
1461                 ss = segfollow(st, ssn);
1462
1463                 printk("%3d: %3d/%-4d %5d %d\n",
1464                        i, ss->dev,
1465                        ss->segment,
1466                        ss->usage,
1467                        ss->score);
1468                 i++;
1469         }
1470         printk("--------------- Free table (%d) ---------------\n",
1471                st->free.cnt);
1472         i = 0;
1473         for (ssn = st->free.first; ssn != 0xffff; ssn = ss->next) {
1474                 ss = segfollow(st, ssn);
1475
1476                 printk("%3d: %3d/%-4d %5d %d\n",
1477                        ssn, ss->dev,
1478                        ss->segment,
1479                        ss->usage,
1480                        ss->score);
1481                 i++;
1482         }
1483         printk("--------------- Clean table (%d) ---------------\n",
1484                st->clean.cnt);
1485         i = 0;
1486         for (ssn = st->clean.first; ssn != 0xffff; ssn = ss->next) {
1487                 ss = segfollow(st, ssn);
1488
1489                 printk("%3d: %3d/%-4d %5d %d\n",
1490                        ssn, ss->dev,
1491                        ss->segment,
1492                        ss->usage,
1493                        ss->score);
1494                 i++;
1495         }
1496         printk("--------\n");
1497         printk("free_blocks=%llu allocated=%llu max_seg=%llu clean_reserved=%llu\n",
1498                dfs->free_blocks, dfs->allocated_blocks, dfs->max_segment,
1499                dfs->clean_reserved);
1500 }
1501
1502 int lafs_get_cleanable(struct fs *fs, u16 *dev, u32 *seg)
1503 {
1504         /* Return the most cleanable segment on the list.
1505          * The segstat is removed from the 'cleanable' list
1506          * and left in the table.  This prevents it from being re-added.
1507          * When it is later found to be clean, it will be added
1508          * to the 'clean' list.
1509          */
1510         struct segtracker *st = fs->segtrack;
1511         struct segsum *ssm;
1512         struct segstat *ss;
1513
1514         lafs_check_seg_cnt(st);
1515
1516 retry:
1517         spin_lock(&fs->lock);
1518         if (st->cleanable.first == 0xFFFF) {
1519                 spin_unlock(&fs->lock);
1520                 return 0;
1521         }
1522
1523         if (st->sorted_size < st->cleanable.cnt / 2) {
1524                 segsort(st, &st->cleanable);
1525                 st->sorted_size = st->cleanable.cnt;
1526                 st->max_score = segfollow(st, st->cleanable.last)->score;
1527         }
1528
1529         lafs_check_seg_cnt(fs->segtrack);
1530
1531         ss = segfollow(st, seg_pop(st, &st->cleanable));
1532
1533         if (lafs_check_seg_cnt(fs->segtrack)) {
1534                 int sj;
1535                 printk("first=%x last=%x cnt=%d\n", st->cleanable.first,
1536                        st->cleanable.last, st->cleanable.cnt);
1537                 for (sj = st->cleanable.first ;
1538                      sj != 0xFFFF;
1539                      sj = segfollow(st, sj)->next)
1540                         printk("  %x\n", sj);
1541
1542                 WARN_ON(1);
1543         }
1544
1545         st->sorted_size--;
1546
1547         *dev = ss->dev;
1548         *seg = ss->segment;
1549
1550         dprintk("SEG: cleanable %d/%d score=%d usage=%d\n",
1551                 ss->dev, ss->segment, ss->score, ss->usage);
1552
1553         ss->score = SCORE_CLEANING;
1554         if (ss->usage == 0) {
1555                 int rv;
1556                 spin_unlock(&fs->lock);
1557                 rv = add_clean(fs, ss->dev, ss->segment);
1558                 /* FIXME we dropped the lock, so maybe this could bug?? */
1559                 /* I should make sure I hold the block references */
1560                 BUG_ON(rv);
1561                 goto retry;
1562         }
1563         segdelete(fs->segtrack, ss);
1564         spin_unlock(&fs->lock);
1565
1566         ssm = segsum_find(fs, ss->segment, ss->dev, 0);
1567         putdref(ssm->ssblk, MKREF(intable));
1568         putdref(ssm->youthblk, MKREF(intable));
1569         ss_put(ssm, fs);
1570
1571         return 1;
1572 }
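
/* lafs_get_cleanable() above keeps the cleanable list only loosely sorted:
 * sorted_size counts how many entries are still known to be in score order
 * (lowest score, i.e. most cleanable, at the head) and newly added entries
 * are not.  Only when the sorted portion drops below half of the list is it
 * re-sorted before popping the head.  A rough stand-alone sketch of that
 * policy (toy_* names are hypothetical helpers, not lafs functions; the
 * max_score refresh done above is omitted):
 */
#if 0	/* illustrative sketch, not built */
struct toy_tracker {
	struct toy_list list;		/* the toy list from the sketch after add_clean() */
	int sorted_size;		/* entries still known to be in score order */
};

static void toy_sort_by_score(struct toy_tracker *t);		/* hypothetical */
static unsigned short toy_pop_front(struct toy_tracker *t);	/* hypothetical */

static unsigned short toy_pop_most_cleanable(struct toy_tracker *t)
{
	if (t->sorted_size < t->list.cnt / 2) {
		toy_sort_by_score(t);		/* ascending score, best first */
		t->sorted_size = t->list.cnt;
	}
	t->sorted_size--;
	return toy_pop_front(t);		/* head has the lowest score */
}
#endif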
1573
1574 static int add_cleanable(struct fs *fs, unsigned int dev, u32 seg,
1575                    u16 youth, u16 usage)
1576 {
1577         u32 score;
1578         struct segstat *ss;
1579         u32 segsize;
1580         u16 *where[SEG_NUM_HEIGHTS];
1581
1582         lafs_check_seg_cnt(fs->segtrack);
1583         if (fs->scan.trace || lafs_trace)
1584                 printk("CLEANABLE: %u/%lu y=%d u=%d\n",
1585                        dev, (unsigned long)seg, (int)youth, (int)usage);
1586         if (youth < 8)
1587                 return 0;
1588         if (youth >= fs->checkpoint_youth)
1589                 return 0;
1590
1591         segsize = fs->devs[dev].segment_size;
1592         fs->total_free += segsize - usage /* - 1 */;
1593
1594         if (usage == 0) {
1595                 int rv = add_clean(fs, dev, seg);
1596                 lafs_check_seg_cnt(fs->segtrack);
1597                 return rv;
1598         }
1599
1600         if (test_bit(EmergencyClean, &fs->fsstate))
1601                 score = usage;
1602         else
1603                 /* Adding 0x10000 ensures this score is always
1604                  * greater than any usage-only score above */
1605                 score = youth * usage / segsize + 0x10000;
1606
1607         spin_lock(&fs->lock);
1608         if (score > SCORE_MAX)
1609                 score = SCORE_MAX;
1610
1611         ss = segfind(fs->segtrack, dev, seg, where);
1612         if (ss) {
1613                 /* already in the table.  Just update the usage.
1614                  * It must be on the right list.
1615                  */
1616                 if (ss->score == SCORE_ACTIVE)
1617                         ; /* still in use */
1618                 else if (ss->usage == 0 && ss->score > 0)
1619                         ; /* on free list, leave it alone */
1620                 else {
1621                         ss->usage = usage;
1622                         ss->score = score;
1623                 }
1624                 lafs_check_seg_cnt(fs->segtrack);
1625                 spin_unlock(&fs->lock);
1626                 return 0;
1627         }
1628
1629         if (fs->segtrack->cleanable.cnt >= fs->segtrack->total/2) {
1630                 /* Table of cleanable segments is full.  Sort and discard
1631                  * half
1632                  */
1633                 int ssnum, prev;
1634                 struct segstat *ss;
1635                 int keep, i;
1636                 segsort(fs->segtrack, &fs->segtrack->cleanable);
1637                 ssnum = fs->segtrack->cleanable.first;
1638                 keep = (fs->segtrack->cleanable.cnt+1) / 2;
1639                 for (i = 0; i < keep; i++) {
1640                         prev = ssnum;
1641                         ss = segfollow(fs->segtrack, ssnum);
1642                         ssnum = ss->next;
1643                 }
1644                 fs->segtrack->max_score = ss->score;
1645                 ss->next = 0xffff;
1646                 fs->segtrack->cleanable.last = prev;
1647                 fs->segtrack->cleanable.cnt = keep;
1648                 ss = segfollow(fs->segtrack, ssnum);
1649                 while (ss) {
1650                         ss->score = SCORE_DEAD;
1651                         ss = segfollow(fs->segtrack, ss->next);
1652                 }
1653                 segdelete_all(fs->segtrack, fs);
1654                 fs->segtrack->sorted_size = fs->segtrack->total/2;
1655         }
1656         if (fs->segtrack->cleanable.cnt <= fs->segtrack->total/4)
1657                 fs->segtrack->max_score = 0;
1658         if (fs->segtrack->max_score &&
1659             fs->segtrack->max_score < score) {
1660                 /* score too high to bother with */
1661                 lafs_check_seg_cnt(fs->segtrack);
1662                 spin_unlock(&fs->lock);
1663                 return 0;
1664         }
1665
1666         seg_add_new(fs->segtrack, &fs->segtrack->cleanable, 1,
1667                     dev, seg, score, usage, where);
1668         spin_unlock(&fs->lock);
1669         return 1;
1670 }
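
/* The weight computed in add_cleanable() above prefers old, lightly used
 * segments: normally score = youth * usage / segment_size + 0x10000, while
 * under EmergencyClean it is just the usage, and since a u16 usage is
 * always below 0x10000 every emergency score sorts ahead of every normal
 * score (lower score means cleaned sooner).  A stand-alone sketch (the
 * names and the clamp value are illustrative, not the lafs definitions):
 */
#if 0	/* illustrative sketch, not built */
static unsigned int toy_clean_score(unsigned int youth, unsigned int usage,
				    unsigned int segsize, int emergency)
{
	unsigned int score;

	if (emergency)
		score = usage;				/* < 0x10000 for a u16 usage */
	else
		score = youth * usage / segsize + 0x10000;

	if (score > 0xFFFF0000)				/* stand-in for SCORE_MAX */
		score = 0xFFFF0000;
	return score;
}
#endif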
1671
1672 static void merge_usage(struct fs *fs, u16 *d)
1673 {
1674         u16 *u = fs->scan.free_usages;
1675         int segperblk = fs->blocksize / 2;
1676         int i;
1677
1678         for (i = 0; i < segperblk; i++)
1679                 if (le16_to_cpu(d[i]) > le16_to_cpu(u[i]))
1680                         u[i] = d[i];
1681 }
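
/* merge_usage() folds one snapshot's usage block into the running maximum:
 * for every segment the largest usage seen in the live filesystem or any
 * snapshot is kept, since blocks referenced by any snapshot must be
 * preserved when the segment is cleaned.  For example, if a segment's
 * usage is 7 in the live tree, 0 in one snapshot and 12 in another, the
 * merged value is 12, and that is what add_cleanable() scores against.
 */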
1682
1683 unsigned long lafs_scan_seg(struct fs *fs)
1684 {
1685         /* Process one block of youth or segment-usage data.  We
1686          * collect free segments (youth==0) into a table that is kept
1687          * sorted to guard against duplicates.  It is treated like a
1688          * ring buffer with a head and a tail to distinguish free
1689          * space from used space.  head/tail point to the free space
1690          * (which is never empty).  As we scan all segments
1691          * sequentially any free segment we find is placed at the head
1692          * of the free list and the entry just after the tail might
1693          * get discarded if we run out of room, or if that entry
1694          * matches the entry we just inserted.  New free segments are
1695          * allocated from just after the tail.  i.e. we increment the
1696          * tail and return the entry recorded there.  If tail+1 ==
1697          * head, there are no segments in the buffer.
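          * (A stand-alone sketch of this ring appears after this function.)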
1698          *
1699          * We also collect potentially cleanable segments.  These are any
1700          * segments which are not empty and not non-logged (i.e. youth >= 8).
1701          * We record them in a table and whenever the table gets full, we
1702          * sort the table by score, and discard the less-cleanable half.
1703          * We only add segments to the table when it is less than half full,
1704          * or when the new segment has a higher score than the segment at
1705          * the half-way mark.
1706          *
1707          * To get the score for a segment, we need its youth and its usage.
1708          * The usage is the maximum usage from all snapshots.
1709          * We allocate a block to hold the current max usage.
1710          * We:
1711          *   load the youth table, find free segments, decay youth if needed.
1712          *   load the first usage table and copy to temporary block
1713          *   repeat: load other usage tables and merge
1714          *   calculate score for each segment and add those with a good score
1715          *
1716          * On the first pass through, we count all the free segments into
1717          * fs->free_blocks.
1718          *
1719          * On the first pass, we make a full sweep without deliberately
1720          * waiting.  In subsequent passes ... only once per checkpoint???
1721          */
1722
1723         if (fs->scan.done)
1724                 return MAX_SCHEDULE_TIMEOUT;
1725
1726         dprintk("scan: dev=%d block=%d stage=%d\n",
1727                 fs->scan.free_dev, fs->scan.free_block, fs->scan.free_stage);
1728         if (fs->scan.free_stage == 0) {
1729                 /* Need to find the youth block for next dev/offset.
1730                  * Possibly we are finished with this dev and must go
1731                  * to next.  Possibly we are finished altogether.
1732                  */
1733                 int dev = fs->scan.free_dev;
1734                 int block = fs->scan.free_block + 1;
1735                 int err;
1736
1737                 while (dev < 0 ||
1738                        block > (fs->devs[dev].segment_count
1739                                 >> (fs->blocksize_bits - 1))) {
1740                         dev++;
1741                         block = 0;
1742                         if (dev >= fs->devices) {
1743                                 fs->scan.free_dev = -1;
1744                                 dev = -1;
1745                                 fs->scan.do_decay = 0;
1746                                 fs->scan.trace = 0;
1747                                 fs->total_free_prev = fs->total_free;
1748                                 fs->total_free = 0;
1749                                 fs->scan.first_free_pass = 0;
1750                                 break;
1751                         }
1752                 }
1753                 if (fs->scan.youth_db)
1754                         if (fs->scan.youth_db->b.fileaddr != block ||
1755                             dev < 0 ||
1756                             fs->scan.youth_db->b.inode != fs->devs[dev].segsum) {
1757                                 putdref(fs->scan.youth_db, MKREF(youth_scan));
1758                                 fs->scan.youth_db = NULL;
1759                         }
1760                 if (dev == -1) {
1761                         fs->scan.done = 1;
1762                         lafs_wake_thread(fs);
1763                         return MAX_SCHEDULE_TIMEOUT;
1764                 }
1765
1766                 if (fs->scan.youth_db == NULL)
1767                         fs->scan.youth_db =
1768                                 lafs_get_block(fs->devs[dev].segsum,
1769                                                block,
1770                                                NULL, GFP_KERNEL, MKREF(youth_scan));
1771                 if (!fs->scan.youth_db) {
1772                         printk("EEEEEKKKKK get_block failed\n");
1773                         return MAX_SCHEDULE_TIMEOUT;
1774                 } else
1775                         switch (err = lafs_read_block_async(fs->scan.youth_db)) {
1776                         default:
1777                         case -EIO:
1778                                 printk("EEEEEKKKKK read of youth block failed\n");
1779                                 break;
1780                         case -EAGAIN:
1781                                 return MAX_SCHEDULE_TIMEOUT;
1782                         case 0:
1783                                 break;
1784                         }
1785
1786                 if (fs->scan.do_decay) {
1787                         /* youth_db must be writable */
1788                         struct datablock *db = fs->scan.youth_db;
1789                         lafs_checkpoint_lock(fs);
1790                         set_bit(B_PinPending, &db->b.flags);
1791                         lafs_pin_dblock(db, AccountSpace);
1792                 }
1793                 spin_lock(&fs->lock);
1794                 fs->scan.free_block = block;
1795                 fs->scan.free_dev = dev;
1796                 if (!err && fs->scan.do_decay) {
1797                         u16 *yp = map_dblock(fs->scan.youth_db);
1798                         int i;
1799                         int segperblk = fs->blocksize / 2;
1800
1801                         for (i = 0 ; i < segperblk ; i++) {
1802                                 int y = le16_to_cpu(yp[i]);
1803                                 if (y >= 8)
1804                                         yp[i] = cpu_to_le16(decay_youth(y));
1805                         }
1806                         unmap_dblock(fs->scan.youth_db, yp);
1807                         lafs_dirty_dblock(fs->scan.youth_db);
1808                 }
1809                 spin_unlock(&fs->lock);
1810                 if (fs->scan.do_decay)
1811                         lafs_checkpoint_unlock(fs);
1812                 if (err)
1813                         return 1;
1814                 fs->scan.free_stage = 1;
1815         }
1816         lafs_check_seg_cnt(fs->segtrack);
1817         if (fs->scan.free_stage == 1) {
1818                 /* Find the main usage block and copy the data into
1819                  * our temp block
1820                  */
1821                 struct datablock *db;
1822                 char *d;
1823                 u16 *yp;
1824
1825                 int i;
1826                 int firstseg;
1827                 int segperblk = fs->blocksize / 2;
1828                 int segments = segperblk;
1829                 int segcount;
1830                 int blks;
1831
1832                 db = lafs_get_block(fs->devs[fs->scan.free_dev].segsum,
1833                                     fs->scan.free_block +
1834                                     fs->devs[fs->scan.free_dev].tablesize,
1835                                     NULL, GFP_KERNEL, MKREF(usage0));
1836                 if (!db) {
1837                         printk("EEEEKKK get_block for first usage failed\n");
1838                 abort:
1839                         fs->scan.free_stage = 0;
1840                         return 1;
1841                 }
1842                 switch (lafs_read_block_async(db)) {
1843                 default:
1844                 case -EIO:
1845                         printk("EEEEKKK read of usage block failed\n");
1846                         putdref(db, MKREF(usage0));
1847                         goto abort;
1848                 case -EAGAIN:
1849                         putdref(db, MKREF(usage0));
1850                         return MAX_SCHEDULE_TIMEOUT;
1851                 case 0:
1852                         break;
1853                 }
1854                 d = map_dblock(db);
1855                 memcpy(fs->scan.free_usages, d, fs->blocksize);
1856                 unmap_dblock(db, d);
1857
1858                 /* Now record any free segments */
1859                 segcount = fs->devs[fs->scan.free_dev].segment_count;
1860                 blks = segcount / segments;
1861                 if (fs->scan.free_block == blks)
1862                         segments = segcount % segperblk;
1863                 firstseg = fs->scan.free_block * segperblk;
1864
1865                 yp = map_dblock(fs->scan.youth_db);
1866                 for (i = 0; i < segments ; i++)
1867                         if (yp[i] == cpu_to_le16(0)) {
1868                                 if (fs->scan.first_free_pass) {
1869                                         spin_lock(&fs->lock);
1870                                         fs->free_blocks +=
1871                                                 fs->devs[fs->scan.free_dev]
1872                                                 .segment_size;
1873                                         spin_unlock(&fs->lock);
1874                                 }
1875                                 if (add_free(fs, fs->scan.free_dev, firstseg + i,
1876                                              &yp[i])) {
1877                                         /* Everything in the table owns a reference
1878                                          * to youth and segusage[0]
1879                                          */
1880                                         (void)getdref(fs->scan.youth_db, MKREF(intable));
1881                                         (void)getdref(db, MKREF(intable));
1882                                 }
1883                                 fs->total_free +=
1884                                         fs->devs[fs->scan.free_dev]
1885                                         .segment_size /*- 1*/;
1886                         }
1887                 unmap_dblock(fs->scan.youth_db, yp);
1888
1889                 fs->scan.usage0_db = db;
1890                 fs->scan.free_stage = 2;
1891         }
1892         lafs_check_seg_cnt(fs->segtrack);
1893         while (fs->scan.free_stage > 1 &&
1894                fs->scan.free_stage < fs->maxsnapshot + 1) {
1895                 struct datablock *db;
1896                 u16 *d;
1897
1898                 if (fs->ss[fs->scan.free_stage-1].root == NULL) {
1899                         fs->scan.free_stage++;
1900                         continue;
1901                 }
1902                 /* Load the usage block for this snapshot and merge */
1903
1904                 db = lafs_get_block(fs->devs[fs->scan.free_dev].segsum,
1905                                     fs->scan.free_block +
1906                                     fs->devs[fs->scan.free_dev].tablesize *
1907                                     LAFSI(fs->ss[fs->scan.free_stage-1].root)
1908                                     ->md.fs.usagetable,
1909                                     NULL, GFP_KERNEL, MKREF(usage_ss));
1910                 if (!db) {
1911                         printk("EEEEKKK get_block for subsequent usage failed\n");
1912                 abort2:
1913                         fs->scan.free_stage = 0;
1914                         putdref(fs->scan.usage0_db, MKREF(usage0));
1915                         fs->scan.usage0_db = NULL;
1916                         return 1;
1917                 }
1918                 switch (lafs_read_block_async(db)) {
1919                 default:
1920                 case -EIO:
1921                         printk("EEEEKKK read of subsequent usage block failed\n");
1922                         putdref(db, MKREF(usage_ss));
1923                         goto abort2;
1924                 case -EAGAIN:
1925                         putdref(db, MKREF(usage_ss));
1926                         return MAX_SCHEDULE_TIMEOUT;
1927                 case 0:
1928                         break;
1929                 }
1930                 d = map_dblock(db);
1931                 merge_usage(fs, d);
1932                 unmap_dblock(db, d);
1933                 fs->scan.free_stage++;
1934                 putdref(db, MKREF(usage_ss));
1935
1936                 lafs_check_seg_cnt(fs->segtrack);
1937         }
1938         if (fs->scan.free_stage == fs->maxsnapshot + 1) {
1939                 /* All usage data has been merged, so we can now record
1940                  * the cleanable segments
1941                  */
1942                 u16 *yp = map_dblock(fs->scan.youth_db);
1943                 u16 *up = fs->scan.free_usages;
1944                 int i;
1945                 int segperblk = fs->blocksize / 2;
1946                 int segments = segperblk;
1947                 int segcount = fs->devs[fs->scan.free_dev].segment_count;
1948                 int blks = segcount / segments;
1949                 if (fs->scan.free_block == blks)
1950                         segments = segcount % segperblk;
1951
1952                 for (i = 0; i < segments; i++)
1953                         if (add_cleanable(fs, fs->scan.free_dev,
1954                                           i + fs->scan.free_block * segperblk,
1955                                           le16_to_cpu(yp[i]),
1956                                           le16_to_cpu(up[i]))) {
1957                                 /* Every segment in table owns these
1958                                  * references
1959                                  */
1960                                 (void)getdref(fs->scan.youth_db, MKREF(intable));
1961                                 (void)getdref(fs->scan.usage0_db, MKREF(intable));
1962                         }
1963
1964                 unmap_dblock(fs->scan.youth_db, yp);
1965                 putdref(fs->scan.usage0_db, MKREF(usage0));
1966                 fs->scan.usage0_db = NULL;
1967                 fs->scan.free_stage = 0;
1968         }
1969         lafs_check_seg_cnt(fs->segtrack);
1970         if (fs->scan.trace)
1971                 return HZ/10;
1972         else if (fs->scan.free_stage == 0)
1973                 return 1;
1974         else
1975                 return MAX_SCHEDULE_TIMEOUT;
1976 }
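
/* A stand-alone sketch of the free-segment ring described at the top of
 * lafs_scan_seg() above (toy types and a hypothetical size, not the real
 * table): entries live between tail+1 and head-1, the scan records a newly
 * found free segment at the head (discarding the oldest entry when no room
 * is left), consumers advance the tail and take the entry found there, and
 * tail + 1 == head means the ring holds nothing.  The duplicate check
 * mentioned above is omitted here for brevity.
 */
#if 0	/* illustrative sketch, not built */
#define TOY_RING_SIZE 512		/* hypothetical capacity */

struct toy_free_ring {
	unsigned short dev[TOY_RING_SIZE];
	unsigned int   seg[TOY_RING_SIZE];
	unsigned int   head, tail;	/* start with head == tail + 1 (empty) */
};

/* record a free segment found by the scan; drops the oldest entry if full */
static void toy_ring_record(struct toy_free_ring *r,
			    unsigned short dev, unsigned int seg)
{
	if (r->head == r->tail)		/* full: discard the entry after tail */
		r->tail = (r->tail + 1) % TOY_RING_SIZE;
	r->dev[r->head] = dev;
	r->seg[r->head] = seg;
	r->head = (r->head + 1) % TOY_RING_SIZE;
}

/* hand out the oldest recorded free segment to a consumer */
static int toy_ring_take(struct toy_free_ring *r,
			 unsigned short *dev, unsigned int *seg)
{
	if ((r->tail + 1) % TOY_RING_SIZE == r->head)
		return 0;		/* nothing recorded */
	r->tail = (r->tail + 1) % TOY_RING_SIZE;
	*dev = r->dev[r->tail];
	*seg = r->seg[r->tail];
	return 1;
}
#endif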
1977
1978 void lafs_empty_segment_table(struct fs *fs)
1979 {
1980         /* remove everything from the table. */
1981         int ssnum;
1982         struct segstat *ss;
1983
1984         ssnum = fs->segtrack->cleanable.first;
1985         fs->segtrack->cleanable.first = 0xffff;
1986         fs->segtrack->cleanable.last = 0xffff;
1987         fs->segtrack->cleanable.cnt = 0;
1988         while (ssnum != 0xffff) {
1989                 ss = segfollow(fs->segtrack, ssnum);
1990                 ss->score = SCORE_DEAD;
1991                 ssnum = ss->next;
1992         }
1993
1994         ssnum = fs->segtrack->clean.first;
1995         fs->segtrack->clean.first = 0xffff;
1996         fs->segtrack->clean.last = 0xffff;
1997         fs->segtrack->clean.cnt = 0;
1998         while (ssnum != 0xffff) {
1999                 ss = segfollow(fs->segtrack, ssnum);
2000                 ss->score = SCORE_DEAD;
2001                 ssnum = ss->next;
2002         }
2003
2004         ssnum = fs->segtrack->free.first;
2005         fs->segtrack->free.first = 0xffff;
2006         fs->segtrack->free.last = 0xffff;
2007         fs->segtrack->free.cnt = 0;
2008         while (ssnum != 0xffff) {
2009                 ss = segfollow(fs->segtrack, ssnum);
2010                 ss->score = SCORE_DEAD;
2011                 ssnum = ss->next;
2012         }
2013
2014         segdelete_all(fs->segtrack, fs);
2015 }
2016
2017 void lafs_dump_usage(void)
2018 {
2019         if (!dfs)
2020                 return;
2021         dfs->scan.trace = 1;
2022         dfs->scan.done = 0;
2023         lafs_wake_thread(dfs);
2024 }