1
2 /*
3  * segment tracking routines for LaFS
4  * fs/lafs/segments.c
5  * Copyright (C) 2006-2009
6  * NeilBrown <neilb@suse.de>
7  * Released under the GPL, version 2
8  */
9
10 /* In here we manage:
11  * - active blocks in segment usage files
12  * - next-phase usage updates
13  * - youth tracking and decay
14  * - list of segments that could be cleaned
15  * - list of free segments that could be reused
16  * - tracking space allocation and available free space
17  *
18  * Segment usage blocks:
19  *  Every Pinned, Dirty or Realloc block with physaddr != 0 owns a reference
20  *  on the block in the segment usage file that counts that
21  *  block.  When the physaddr is changed, the new block will always
22  *  be available so the reference is simply moved.
23  *
24  *  Normally when ->physaddr changes, the counts in the relevant
25  *  segusage blocks are updated directly.  However when the change is
26  *  for a write in the next phase (during a checkpoint) that is not
27  *  possible as the segusage blocks need to be written with the old value.
28  *  So we track these updates in a simple data structure that
29  *  records delta against each ss+dev+seg.  If such a data structure
30  *  cannot be allocated (-ENOMEM) the associated write is blocked until
31  *  the checkpoint completes. FIXME - implement that.
32  *
33  *  Every Pinned or Dirty block holds a number of space credits,
34  *  following a pattern described elsewhere.  The space allocation
35  *  unit identifies segments to be used in advance and holds a reference
36  *  on every block.
37  *
38  * Youth
39  *  Youth numbers are handed out sequentially to new write clusters.
40  *  When the available space approaches being full all youth numbers are
41  *  decayed (as differentiation between old youth values is uninteresting).
42  *  This decay is done in a staged fashion, one block at a time so that
43  *  a checkpoint can happen at any time and the current block in the
44  *  decay sweep is recorded.
45  *
46  * Free Segments
47  *  Holding a record of all free segments is impractical in general.  Instead
48  *  we record a limited number of free segments (1 page worth - 512 entries,
49  *  possibly with ranges) and when needed we scan the segment usage
50  *  files for free segments and add them to the list.  This scan needs to
51  *  be rate limited - not sure how best to do that.
52  *
53  * Cleanable Segments
54  *  Similar to free segments we need to keep track of cleanable segments.
55  *  Almost all non-free segments are cleanable, and we assign a weight to
56  *  each based on youth and usage.  We want to keep the most cleanable
57  *  segments based on this weight.
58  *  We keep an unordered list of cleanable segments.  When it becomes
59  *  full we sort it by weight and discard the worst half.
60  *
61  * Scanning
62  *  As there could be hundreds of snapshots (hopefully not thousands)
63  *  we cannot load the segusage block for each snapshot and then parse them
64  *  in parallel.  Instead we allocate space to store a max usage and
65  *  merge each block one at a time into that max.  We then combine the
66  *  max with the youth to get a 64bit weight... I wonder if that is good.
67  *
68  */
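/* A minimal sketch (illustrative only) of the delayed-update rule described
 * above.  When a usage change belongs to the current phase it is applied to
 * the segusage block immediately; otherwise the delta is parked on the
 * per-segment summary entry and flushed at the end of the checkpoint:
 *
 *	if (fs->qphase == phase)
 *		apply the delta to the segusage block now
 *	else
 *		atomic_add(diff, &ss->delayed);  (applied by lafs_seg_apply_all)
 *
 * This is the split that seg_inc() below implements.
 */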
69
70 /* Every block that could change its ->physaddr holds a reference
71  * on the segment summary block through this structure
72  */
73
74 #include        "lafs.h"
75 #include        <linux/hash.h>
76 #include        <linux/random.h>
77 #include        <linux/slab.h>
78
79 struct segsum {
80         u32     segnum;
81         int     devnum;
82         int     ssnum;
83
84         struct hlist_node hash;
85
86         atomic_t        refcnt;
87         atomic_t        delayed;
88
89         struct datablock *ssblk;
90         /* youthblk is only set when ssnum is 0 */
91         struct datablock *youthblk;
92 };
93
94 static int shash(u32 segnum, int devnum, int ssnum)
95 {
96         unsigned long h = hash_long(segnum, BITS_PER_LONG);
97         return hash_long(h ^ (devnum | (ssnum << 16)), SHASHBITS);
98 }
99
100 static inline void ss_get(struct segsum *ss)
101 {
102         BUG_ON(atomic_read(&ss->refcnt) == 0);
103         atomic_inc(&ss->refcnt);
104 }
105
106 static void ss_put(struct segsum *ss, struct fs *fs)
107 {
108         if (atomic_dec_and_lock(&ss->refcnt, &fs->stable_lock)) {
109                 if (atomic_read(&ss->delayed) != 0)
110                         spin_unlock(&fs->stable_lock);
111                 else {
112                         if (!hlist_unhashed(&ss->hash))
113                                 hlist_del(&ss->hash);
114                         if (!fs->stable_changed)
115                                 fs->stable_changed = 1;
116                         spin_unlock(&fs->stable_lock);
117                         putdref(ss->ssblk, MKREF(ss));
118                         putdref(ss->youthblk, MKREF(ssyouth));
119                         kfree(ss);
120                 }
121         }
122 }
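/* Note on the interplay of 'refcnt' and 'delayed': if the last reference is
 * dropped while a next-phase usage delta is still pending, the entry is left
 * in the hash table (with refcnt 0) rather than freed, so that
 * lafs_seg_apply_all() can still find it and apply the delta at the end of
 * the checkpoint; the ss_put() there then frees it.
 */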
123
124 static struct segsum *segsum_find(struct fs *fs, u32 segnum,
125                                   int devnum, int ssnum)
126 {
127         struct hlist_head *head = &fs->stable[shash(segnum, devnum, ssnum)];
128         struct segsum *ss, *new = NULL;
129         struct hlist_node *n;
130         struct fs_dev *dv;
131         u32 addr;
132         int err;
133
134 retry:
135         spin_lock(&fs->stable_lock);
136         hlist_for_each_entry(ss, n, head, hash)
137                 if (ss->segnum == segnum &&
138                     ss->ssnum == ssnum &&
139                     ss->devnum == devnum) {
140                         atomic_inc(&ss->refcnt);
141                         spin_unlock(&fs->stable_lock);
142                         if (new)
143                                 ss_put(new, fs);
144                         if (!fs->rolled && fs->qphase != fs->phase)
145                                 return ss;
146                         if (ss->ssblk &&
147                             !test_bit(B_Valid, &ss->ssblk->b.flags) ) {
148                                 /* need to read this now that roll forward
149                                  * has progressed.
150                                  */
151                                 err = lafs_read_block(ss->ssblk);
152                                 if (err) {
153                                         ss_put(ss, fs);
154                                         return ERR_PTR(err);
155                                 }
156                         }
157                         if (ss->youthblk &&
158                             !test_bit(B_Valid, &ss->youthblk->b.flags)) {
159                                 /* need to read this now that roll forward
160                                  * has progressed.
161                                  */
162                                 err = lafs_read_block(ss->youthblk);
163                                 if (err) {
164                                         ss_put(ss, fs);
165                                         return ERR_PTR(err);
166                                 }
167                         }
168                         return ss;
169                 }
170         if (new) {
171                 hlist_add_head(&new->hash, head);
172                 spin_unlock(&fs->stable_lock);
173                 return new;
174         }
175         spin_unlock(&fs->stable_lock);
176
177         /* seems we must allocate something */
178
179         new = kmalloc(sizeof(*new), GFP_KERNEL | __GFP_NOFAIL);
180         new->segnum = segnum;
181         new->devnum = devnum;
182         new->ssnum = ssnum;
183         atomic_set(&new->refcnt, 1);
184         atomic_set(&new->delayed, 0);
185         INIT_HLIST_NODE(&new->hash);
186         dv = fs->devs + devnum;
187         addr = LAFSI(fs->ss[ssnum].root)->md.fs.usagetable * dv->tablesize;
188         addr += segnum >> (fs->blocksize_bits - USAGE_SHIFT);
189         new->ssblk = lafs_get_block(dv->segsum, addr, NULL,
190                                     GFP_KERNEL,
191                                     MKREF(ss));
192         if (ssnum == 0)
193                 new->youthblk = lafs_get_block(dv->segsum,
194                                                segnum >> (fs->blocksize_bits
195                                                           - YOUTH_SHIFT),
196                                                NULL,
197                                                GFP_KERNEL,
198                                                MKREF(ssyouth));
199         else
200                 new->youthblk = NULL;
201         BUG_ON(!new->ssblk || IS_ERR(new->ssblk) || IS_ERR(new->youthblk));
202         if (!fs->rolled && fs->qphase != fs->phase) {
203                 /* Too early to safely read segusage blocks,
204                  * leave it until later
205                  */
206                 goto retry;
207         }
208         err = lafs_read_block(new->ssblk);
209         if (new->youthblk && err == 0)
210                 err = lafs_read_block(new->youthblk);
211         if (err) {
212                 ss_put(new, fs);
213                 return ERR_PTR(err);
214         }
215         goto retry;
216 }
217
218 static struct segsum *segsum_byaddr(struct fs *fs, u64 addr, int ssnum)
219 {
220         int dev;
221         u32 offset;
222         u32 seg;
223         virttoseg(fs, addr, &dev, &seg, &offset);
224
225         return segsum_find(fs, seg, dev, ssnum);
226 }
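/* Typical pattern for callers below (illustrative only, not a real call
 * site): look up the summary entry for a block's device address, use or
 * adjust it, then drop the lookup reference again:
 *
 *	struct segsum *ss = segsum_byaddr(fs, b->physaddr, ssnum);
 *	if (!IS_ERR(ss)) {
 *		... use ss->ssblk / ss->youthblk, or seg_inc(fs, ss, diff, 1) ...
 *		ss_put(ss, fs);
 *	}
 */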
227
228 void lafs_seg_put_all(struct fs *fs)
229 {
230         /* remove all the ssblk and youthblk references from
231          * all segsums.
232          * This is called when the filesystem is being unmounted
233          */
234         int i;
235         for (i = 0; i < SHASHSIZE; i++)
236                 if (!hlist_empty(&fs->stable[i])) {
237                         struct segsum *ss;
238                         struct datablock *b, *y;
239                         struct hlist_node *n, *pos;
240
241                         spin_lock(&fs->stable_lock);
242                 retry:
243                         hlist_for_each_entry_safe(ss, pos, n, &fs->stable[i], hash) {
244                                 b = ss->ssblk;
245                                 y = ss->youthblk;
246                                 if (!b && !y)
247                                         continue;
248                                 ss->ssblk = NULL;
249                                 ss->youthblk = NULL;
250                                 spin_unlock(&fs->stable_lock);
251                                 if (b && test_and_clear_bit(B_Async, &b->b.flags))
252                                         putdref(b, MKREF(async));
253                                 if (y && test_and_clear_bit(B_Async, &y->b.flags))
254                                         putdref(y, MKREF(async));
255                                 putdref(b, MKREF(ss));
256                                 putdref(y, MKREF(ssyouth));
257                                 spin_lock(&fs->stable_lock);
258                                 if (fs->stable_changed)
259                                         goto retry;
260                         }
261                         spin_unlock(&fs->stable_lock);
262                 }
263 }
264
265 /* We need to set SegRef on this block, and all parents,
266  * and the segsum block and all parents, and so on.
267  * We find the first ancestor that does not have SegRef and
268  * find the matching segsum block.  If that segsum block already
269  * has SegRef, we add the reference and retry from the start.
270  * If it doesn't have SegRef, we prealloc and then tail recurse.
271  * Note: we never set SegRef on B_Root blocks as their usage
272  * isn't recorded.
273  */
274 int lafs_seg_ref_block(struct block *b, int ssnum)
275 {
276         struct fs *fs = fs_from_inode(b->inode);
277
278         LAFS_BUG(test_bit(B_InoIdx, &b->flags), b);
279         getref(b, MKREF(segref));
280
281         while (!test_bit(B_SegRef, &b->flags)) {
282                 struct block *p;
283                 struct block *b2, *to_put = NULL;
284                 struct segsum *ss;
285                 struct inode *ino;
286                 int err;
287
288                 BUG_ON(test_bit(B_Root, &b->flags));
289
290                 /* The extra reference of 'b2' and the use for 'to_put'
291                  * deserves some explanation.
292                  * We will normally hold an implicit reference to b2 due to
293                  * our reference on 'b' and the fact that b2 is an ancestor
294                  * of 'b'.  However at this point we have no locks on the file
295                  * at all so it is possible that a btree manipulation could
296                  * split b2 resulting in b2 not being the parent of b any more
297                  * so our reference guarantee is lost.  Even in that case,
298                  * the ancestor of b would probably have a B_PrimaryRef on
299                  * b2.  However it is hard to prove that will stay around long
300                  * enough so we take an extra ref just in case.
301                  * We only need the ref when we drop private_lock so only take
302                  * it then.  We cannot drop an old ref at that point, so
303                  * store the old ref in 'to_put' and drop it when releasing
304                  * private_lock.
305                  */
306
307                 b2 = b;
308                 /* Need to check parents */
309                 ino = b->inode;
310                 spin_lock(&ino->i_data.private_lock);
311                 for (p = b;
312                      p && !test_bit(B_SegRef, &p->flags);
313                      p = &(p->parent)->b) {
314                         if (test_bit(B_InoIdx, &p->flags)) {
315                                 struct datablock *db = LAFSI(p->inode)->dblock;
316                                 p = &db->b;
317                                 getref(b2, MKREF(segref2));
318                                 spin_unlock(&ino->i_data.private_lock);
319                                 ino = p->inode;
320                                 if (to_put)
321                                         putref(to_put, MKREF(segref2));
322                                 spin_lock(&ino->i_data.private_lock);
323                                 to_put = b2;
324                         }
325                         if (test_bit(B_SegRef, &p->flags))
326                                 break;
327                         if (!test_bit(B_Root, &p->flags))
328                                 b2 = p;
329                 }
330                 getref(b2, MKREF(segref2));
331                 spin_unlock(&ino->i_data.private_lock);
332                 if (to_put)
333                         putref(to_put, MKREF(segref2));
334                 /* b2 is the first ancestor (closest to root) without SegRef */
335
336                 if (b2->physaddr == 0) {
337                         /* There is no segsum to reference */
338                         set_bit(B_SegRef, &b2->flags);
339                         putref(b2, MKREF(segref2));
340                         continue;
341                 }
342
343                 ss = segsum_byaddr(fs, b2->physaddr, ssnum);
344                 if (IS_ERR(ss)) {
345                         putref(b, MKREF(segref));
346                         putref(b2, MKREF(segref2));
347                         return PTR_ERR(ss);
348                 }
349
350                 if (&ss->ssblk->b == b) {
351                         /* Don't need to check for Prealloc or
352                          * SegRef, just take a reference now
353                          */
354                         if (test_and_set_bit(B_SegRef, &b2->flags))
355                                 ss_put(ss, fs);
356                         putref(b2, MKREF(segref2));
357                         continue;
358                 }
359
360                 /* Don't need iolock here as segusage is very special */
361                 set_bit(B_PinPending, &ss->ssblk->b.flags);
362                 err = lafs_pin_dblock(ss->ssblk, AccountSpace);
363                 if (err) {
364                         ss_put(ss, fs);
365                         putref(b, MKREF(segref));
366                         putref(b2, MKREF(segref2));
367                         return err;
368                 }
369                 if (!test_bit(B_SegRef, &ss->ssblk->b.flags)) {
370                         putref(b, MKREF(segref));
371                         b = getref(&ss->ssblk->b, MKREF(segref));
372                         ss_put(ss, fs);
373                 } else if (test_and_set_bit(B_SegRef, &b2->flags))
374                         ss_put(ss, fs);
375                 putref(b2, MKREF(segref2));
376         }
377         putref(b, MKREF(segref));
378         return 0;
379 }
380
381 void lafs_seg_deref(struct fs *fs, u64 physaddr, int ssnum)
382 {
383         if (physaddr) {
384                 struct segsum *ss = segsum_byaddr(fs, physaddr, ssnum);
385                 BUG_ON(IS_ERR(ss));
386                 BUG_ON(atomic_read(&ss->refcnt) < 2);
387                 ss_put(ss, fs);
388                 ss_put(ss, fs);
389         }
390 }
391
392 static void seg_inc(struct fs *fs, struct segsum *ss, int diff, int in_phase)
393 {
394         if (!in_phase)
395                 atomic_add(diff, &ss->delayed);
396         else {
397                 u32 *b, *p;
398                 b = map_dblock(ss->ssblk);
399                 spin_lock(&fs->stable_lock);
400                 p = &b[ss->segnum & ((fs->blocksize-1)>>USAGE_SHIFT)];
401                 //BUG_ON(diff < 0 && le32_to_cpu(*p) < -diff);
402                 if (diff < 0 && le32_to_cpu(*p) < -diff) {
403                         printk("diff=%d p=%d segnum=%d\n", diff, le32_to_cpu(*p),
404                                ss->segnum);
405                         BUG();
406                 }
407                 *p = cpu_to_le32(le32_to_cpu(*p) + diff);
408                 spin_unlock(&fs->stable_lock);
409                 unmap_dblock(ss->ssblk, b);
410                 lafs_dirty_dblock(ss->ssblk);
411         }
412 }
413
414 void lafs_seg_move(struct fs *fs, u64 oldaddr, u64 newaddr,
415                    int ssnum, int phase, int moveref)
416 {
417         struct segsum *ss;
418
419         dprintk("SEGMOVE %llu %llu\n",
420                 (unsigned long long) oldaddr,
421                 (unsigned long long) newaddr);
422
423         if (newaddr) {
424                 ss = segsum_byaddr(fs, newaddr, ssnum);
425                 seg_inc(fs, ss, 1, fs->qphase == phase);
426
427                 if (!moveref)
428                         ss_put(ss, fs);
429         }
430
431         if (oldaddr) {
432                 ss = segsum_byaddr(fs, oldaddr, ssnum);
433                 seg_inc(fs, ss, -1, fs->qphase == phase);
434                 ss_put(ss, fs);
435                 if (moveref)
436                         ss_put(ss, fs);
437         }
438 }
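/* Illustrative call (not a real call site): when a block that owns a segsum
 * reference for its old location is rewritten in the current phase, the
 * reference is carried across to the new segment:
 *
 *	lafs_seg_move(fs, b->physaddr, newaddr, ssnum, fs->phase, 1);
 *
 * With moveref == 0 only the usage counts are adjusted.
 */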
439
440 static void set_youth(struct fs *fs, struct segsum *ss)
441 {
442         u16 y;
443         u16 *ybuf, *youthp;
444         int dirty = 0;
445
446         /* HACK youth_next should always be at least 0x8000 so that
447          * cleanable score differentiates well for new segments.
448          * Old code would sometimes set youth_next very low, so
449          * override that
450          */
451         if (fs->youth_next < 0x8000)
452                 fs->youth_next = 0x8000;
453         y = fs->youth_next;
454         if (fs->scan.do_decay &&
455             (fs->scan.free_dev < ss->devnum
456              || (fs->scan.free_dev == ss->devnum
457                  && fs->scan.free_block < (ss->segnum
458                                            / (fs->blocksize / 2)))
459                     ))
460                 /* Haven't decayed this block yet - revert decay */
461                 y = decay_undo(y);
462         ybuf = map_dblock(ss->youthblk);
463         youthp = ybuf + (ss->segnum & ((1 << (fs->blocksize_bits
464                                               - YOUTH_SHIFT)) - 1));
465         if (le16_to_cpu(*youthp) < 8) {
466                 *youthp = cpu_to_le16(y);
467                 fs->youth_next++;
468                 dirty = 1;
469         }
470         unmap_dblock(ss->youthblk, youthp);
471         if (dirty)
472                 lafs_dirty_dblock(ss->youthblk);
473 }
474
475 static void seg_apply(struct fs *fs, struct segsum *ss)
476 {
477         int err;
478         int diff = atomic_read(&ss->delayed);
479         if (diff == 0)
480                 return;
481         atomic_set(&ss->delayed, 0);
482         dprintk("Seg apply %d %d\n", (int)ss->segnum, diff);
483         err = lafs_read_block(ss->ssblk);
484         BUG_ON(err); // FIXME do something useful here
485         if (ss->youthblk)
486                 err = lafs_read_block(ss->youthblk);
487         BUG_ON(err);
488         seg_inc(fs, ss, diff, 1);
489
490         if (diff > 0 && ss->youthblk)
491                 set_youth(fs, ss);
492 }
493
494 /* lafs_seg_apply_all
495  * During a checkpoint, blocks written to the next phase cannot immediately
496  * update the segment usage tables so those updates need to be delayed.
497  * This function applies all the delayed updates to the segment usage
498  * files at the end of a checkpoint.
499  */
500 void lafs_seg_apply_all(struct fs *fs)
501 {
502         int i;
503
504         for (i = 0 ; i < SHASHSIZE ; i++) {
505                 struct hlist_head *head = &fs->stable[i];
506                 struct segsum *ss;
507                 struct hlist_node *n, *pos;
508                 spin_lock(&fs->stable_lock);
509         retry:
510                 fs->stable_changed = 0;
511                 hlist_for_each_entry_safe(ss, pos, n, head, hash) {
512                         if (atomic_read(&ss->delayed) == 0)
513                                 continue;
514                         atomic_inc(&ss->refcnt);
515                         spin_unlock(&fs->stable_lock);
516                         seg_apply(fs, ss);
517                         ss_put(ss, fs);
518                         spin_lock(&fs->stable_lock);
519                         if (fs->stable_changed)
520                                 goto retry;
521                 }
522                 spin_unlock(&fs->stable_lock);
523         }
524         /* Now any clean segments found earlier are free. */
525 }
526
527 /*
528  * When creating a snapshot, we need to duplicate table[1] in
529  * each segment usage file into table[n].
530  * It would be nice to just copy block pointers but this might
531  * confuse cleaning etc. so duplicate the data for now.
532  */
533 int lafs_seg_dup(struct fs *fs, int newtable)
534 {
535         /* for each device,
536          * get blocks for each table and memcpy
537          */
538         int d;
539         int err = 0;
540         for (d=0; d < fs->devices ; d++) {
541                 struct fs_dev *dv = &fs->devs[d];
542                 int b;
543                 for (b=0 ; b < dv->tablesize ; b++) {
544                         struct datablock *obl, *nbl;
545                         u32 addr;
546                         void *from, *to;
547
548                         addr = dv->tablesize + b;
549                         obl = lafs_get_block(dv->segsum, addr, 0,
550                                              GFP_KERNEL | __GFP_NOFAIL,
551                                              MKREF(ss));
552                         err = lafs_read_block(obl);
553                         if (err) {
554                                 putdref(obl, MKREF(ss));
555                                 goto out;
556                         }
557
558                         addr = dv->tablesize * newtable + b;
559                         nbl = lafs_get_block(dv->segsum, addr, 0,
560                                              GFP_KERNEL | __GFP_NOFAIL,
561                                              MKREF(ss));
562
563                         from = map_dblock(obl);
564                         to = map_dblock_2(nbl);
565                         memcpy(to, from, fs->blocksize);
566                         unmap_dblock_2(nbl, to);
567                         unmap_dblock(obl, from);
568                         lafs_dirty_dblock(nbl);
569                         putdref(obl, MKREF(ss));
570                         putdref(nbl, MKREF(ss));
571                 }
572         }
573 out:
574         return err;
575 }
576
577 /*************************************************************
578  * Space management:  allocate, use, free
579  */
580
581 static int count_credits(struct block *b)
582 {
583         int credits = 0;
584         struct indexblock *ib = iblk(b);
585         struct inode *ino = NULL;
586
587         if (test_bit(B_Credit, &b->flags))
588                 credits++;
589         if (test_bit(B_ICredit, &b->flags))
590                 credits++;
591         if (test_bit(B_NCredit, &b->flags))
592                 credits++;
593         if (test_bit(B_NICredit, &b->flags))
594                 credits++;
595         if (test_bit(B_UnincCredit, &b->flags))
596                 credits++;
597         if (test_bit(B_Dirty, &b->flags))
598                 credits++;
599         if (test_bit(B_Realloc, &b->flags))
600                 credits++;
601
602         if (test_bit(B_Index, &b->flags)) {
603                 list_for_each_entry(b, &ib->children, siblings) {
604                         LAFS_BUG(b->parent != ib, b);
605                         credits += count_credits(b);
606                 }
607                 credits += ib->uninc_table.credits;
608         } else if ((ino = rcu_my_inode(dblk(b))) != NULL &&
609                    LAFSI(ino)->iblock &&
610                    LAFSI(ino)->iblock->b.parent
611                 ) {
612                 LAFS_BUG(LAFSI(ino)->iblock->b.parent != b->parent, b);
613                 credits += count_credits(&LAFSI(ino)->iblock->b);
614         }
615         rcu_iput(ino);
616         return credits;
617 }
618
619 int temp_credits;
620 static void check_credits(struct fs *fs)
621 {
622         /* DEBUGGING AID
623          * count credits in tree and make sure they match allocated_blocks
624          */
625         int credits;
626         if (fs->ss[0].root == NULL ||
627             LAFSI(fs->ss[0].root)->iblock == NULL ||
628             LAFSI(fs->ss[0].root)->dblock == NULL)
629                 return;
630         credits = count_credits(&LAFSI(fs->ss[0].root)->iblock->b);
631         credits += count_credits(&LAFSI(fs->ss[0].root)->dblock->b);
632         if (credits + fs->cluster_updates + temp_credits != fs->allocated_blocks) {
633                 printk("credits=%d updates=%d temp=%d allocated=%d\n", credits,
634                        fs->cluster_updates, temp_credits,
635                        (int)fs->allocated_blocks);
636                 lafs_dump_tree();
637                 WARN_ON(1);
638         }
639 }
640
641 void lafs_space_return(struct fs *fs, int credits)
642 {
643         spin_lock(&fs->alloc_lock);
644         BUG_ON(credits < 0);
645         if (credits > fs->allocated_blocks)
646                 printk("cred=%d ab=%d\n", credits, (int)fs->allocated_blocks);
647         BUG_ON(credits > fs->allocated_blocks);
648         fs->allocated_blocks -= credits;
649         spin_unlock(&fs->alloc_lock);
650         check_credits(fs);
651 }
652
653 int lafs_alloc_cleaner_segs(struct fs *fs, int max)
654 {
655         /* cleaner is about to start.
656          * See how many segments of space can safely
657          * be reserved for its use.
658          * We don't allocate more to the cleaner than we
659          * leave spare for new allocations.
660          */
661         int watermark = fs->max_segment * 4;
662         int rv = 0;
663         spin_lock(&fs->alloc_lock);
664         while (fs->clean_reserved < max * fs->max_segment &&
665                fs->free_blocks > 0 &&
666                (u64)fs->free_blocks > (fs->clean_reserved
667                                        + fs->allocated_blocks
668                                        + watermark)) {
669                 fs->clean_reserved += fs->max_segment;
670                 fs->free_blocks -= fs->max_segment;
671                 rv++;
672         }
673         spin_unlock(&fs->alloc_lock);
674         return rv;
675 }
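/* Worked example (illustrative): with max_segment == 1024 blocks the
 * watermark above is 4096 blocks, so a segment is only moved into
 * clean_reserved while free_blocks still exceeds clean_reserved +
 * allocated_blocks by more than four whole segments.
 */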
676
677 int lafs_space_alloc(struct fs *fs, int credits, int why)
678 {
679         /* 'why's for space reservation.
680          * NewSpace means we want to write a block which didn't exist
681          *   before.  This is allowed to fail or block if the cleaner
682          *   appears able to make progress.
683          * ReleaseSpace means we want to write a block which does currently
684          *   exist, so doing this will eventually free up space.  This must
685          *   never fail, but can block.
686          * CleanSpace means we want to write a block to relocate it to
687          *   a 'cleaning' segment.   This may never fail.
688          * AccountSpace means we absolutely need this block now, and it is
689          *   a BUG if there is no space available.
690          */
691         u64 watermark = 0;
692
693         check_credits(fs);
694         spin_lock(&fs->alloc_lock);
695         switch(why) {
696         case NewSpace:
697                 watermark += RELEASE_RESERVED * fs->max_segment;
698                 /* FALL THROUGH */
699         case ReleaseSpace:
700                 watermark += ACCOUNT_RESERVED * fs->max_segment;
701                 /* FALL THROUGH */
702         case CleanSpace:
703         case AccountSpace:
704                 /* Definitely no water mark here. */
705                 break;
706         }
707
708         if (fs->rolled && watermark) {
709                 /* We cannot account properly before roll-forward has
710                  * completed. FIXME once it has completed we need to
711                  * check and invalidate the FS if there was a problem.
712                  */
713                 if (fs->free_blocks < 0 ||
714                     (u64)fs->free_blocks < (fs->allocated_blocks
715                                             + credits + watermark))
716                         credits = 0; /* Sorry, no room */
717         }
718         if (fs->rolled && watermark == 0) {
719                 /* When including the clean_reserved space, there should
720                  * be room for these controlled allocations
721                  */
722                 BUG_ON(fs->free_blocks < 0);
723                 if (fs->free_blocks + fs->clean_reserved <
724                     fs->allocated_blocks + credits)
725                         BUG();
726         }
727
728         if (credits == 0) {
729                 if (why == NewSpace && !test_bit(EmergencyClean, &fs->fsstate))
730                         set_bit(EmergencyPending, &fs->fsstate);
731                 if (why >= ReleaseSpace || !test_bit(EmergencyClean, &fs->fsstate))
732                         if (!test_bit(CleanerBlocks, &fs->fsstate) ||
733                             fs->cleaner.need > watermark + fs->max_segment) {
734                                 fs->cleaner.need = watermark + fs->max_segment;
735                                 set_bit(CleanerBlocks, &fs->fsstate);
736                                 lafs_wake_thread(fs);
737                         }
738         } else if (why == NewSpace)
739                 if (test_bit(EmergencyClean, &fs->fsstate) ||
740                     test_bit(EmergencyPending, &fs->fsstate)) {
741                         clear_bit(EmergencyPending, &fs->fsstate);
742                         clear_bit(EmergencyClean, &fs->fsstate);
743                 }
744
745         fs->allocated_blocks += credits;
746         BUG_ON(fs->free_blocks + fs->clean_reserved < fs->allocated_blocks);
747         spin_unlock(&fs->alloc_lock);
748         return credits;
749 }
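/* Illustrative usage (not a real call site): a caller about to dirty a
 * brand-new block might do
 *
 *	int got = lafs_space_alloc(fs, 1, NewSpace);
 *	if (got == 0)
 *		... back off and let the cleaner make progress ...
 *	...
 *	lafs_space_return(fs, got);   (if the credit ends up unused)
 *
 * CleanSpace and AccountSpace requests are never trimmed to zero here.
 */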
750
751 /********************************************************************
752  *
753  * Segment usage / youth tracking
754  */
755
756 /* Information about free and cleanable segments are stored in a table
757  * comprised of a few pages.  Indexes into this table are 16bit.  4bits
758  * for page number, 12 bits for index in the page.
759  * Different pages have different sized entries to allow for different
760  * depths in the skiplist that sorts entries by dev/segment.
761  *  Entries are at least 16 bytes, so 12 bits can index up to 64KB of entries.
762  * Each entry is also on one of four lists:
763  *  - unused:  this entry is not used
764  *  - free: this entry is for a free segment
765  *  - cleanable:  this entry is for a cleanable segment.
766  *  - clean: segment has been cleaned but is not yet free (awaiting checkpoint).
767  * These are singly linked lists (via 'next').  We record head and tail.
768  * The end of this list has a pointer to 0xFFFF, not 0.
769  * We usually remove entries from the head, but when the table is
770  * full, we might walk part way down a list and discard all the rest.
771  * We discard them by setting score to 0xFFFFFFFF.  We then walk the
772  * skip list moving anything with a score of 0xFFFFFFFF to the unused list.
773  *
774  * We occasionally sort the cleanable list using a merge sort.
775  *
776  * There is a progression from 'cleanable' to 'clean' to 'free', though
777  * segments can enter at any point.
778  * When we choose a segment to clean, we remove the entry from the cleanable
779  * list. We do this by setting the score to 0xFFFFFFFE and unlinking it.
780  * After cleaning has completed a scan should find that it is clean and so
781  * add it to the 'clean' list.
782  * When we record a 'clean' segment as 'free' (after a checkpoint) we
783  * move it from the clean list to the free list, from where it will be
784  * removed when needed.
785  * We don't add new free segments when the total of free+clean segments
786  * is more than half the size of the table.
787  * Similarly when the cleanable table reaches half the available size
788  * we remove the least-interesting half.
789  * When we choose a free segment to start filling, we remove it from the
790  * free list but not from the table.  The score is set to 0xFFFFFFFD to
791  * record that it is unlinked but busy.  If it is found to be cleanable,
792  * it will be ignored.  When we finish filling a segment, we find the entry
793  * again and remove it properly so it can become cleanable later.
794  */
795
796 #define SCORE_MAX       0xFFFFFFFFFFFFFFFCULL   /* Maximum normal score */
797 #define SCORE_ACTIVE    0xFFFFFFFFFFFFFFFDULL   /* This segment is being written to */
798 #define SCORE_CLEANING  0xFFFFFFFFFFFFFFFEULL   /* This segment in being cleaned */
799 #define SCORE_DEAD      0xFFFFFFFFFFFFFFFFULL   /* This segment is to be removed */
800
801 struct segstat {
802         u16     next;
803         u16     dev;
804         u32     segment;
805         u64     score;
806         u32     usage;
807         u16     skip[0]; /* or larger... */
808 };
809
810 static inline struct segstat *segfollow(struct segtracker *st, u16 link)
811 {
812         void *a;
813         if (link == 0xFFFF)
814                 return NULL;
815         a = st->page[link >> 12];
816         a += (link & 0xFFF) *
817                 (st->size[link >> 12] * sizeof(u16) + sizeof(struct segstat));
818         return (struct segstat *) a;
819 }
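/* Worked example (illustrative): with the sizes set up in
 * lafs_segtrack_init() below (st->size[] = {1, 1, 5, 9}), the 16-bit index
 * 0x2003 means page 2, entry 3, i.e. a byte offset of
 * 3 * (sizeof(struct segstat) + 5 * sizeof(u16)) into st->page[2].
 */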
820
821 int lafs_segtrack_init(struct segtracker *st)
822 {
823         int found;
824         int h, i;
825         int n[4];
826
827         st->page[0] = kmalloc(PAGE_SIZE, GFP_KERNEL);
828         st->page[1] = kmalloc(PAGE_SIZE, GFP_KERNEL);
829         st->page[2] = kmalloc(PAGE_SIZE, GFP_KERNEL);
830         st->page[3] = kmalloc(PAGE_SIZE, GFP_KERNEL);
831
832         if (st->page[0] == NULL ||
833             st->page[1] == NULL ||
834             st->page[2] == NULL ||
835             st->page[3] == NULL) {
836                 lafs_segtrack_free(st);
837                 return -ENOMEM;
838         }
839         st->size[0] = 1;
840         st->size[1] = 1;
841         st->size[2] = 5;
842         st->size[3] = 9;
843         BUG_ON(SEG_NUM_HEIGHTS <= 9);
844
845         st->unused.first = st->unused.last = 0xffff;
846         st->cleanable.first = st->cleanable.last = 0xffff;
847         st->free.first = st->free.last = 0xffff;
848         st->clean.first = st->clean.last = 0xffff;
849         st->unused.cnt = st->free.cnt = st->cleanable.cnt = st->clean.cnt = 0;
850
851         for (h = 0; h < SEG_NUM_HEIGHTS; h++)
852                 st->head[h] = 0xFFFF;
853
854         /* how many entries in each page */
855         for (i = 0 ; i < 4; i++)
856                 n[i] = PAGE_SIZE /  (sizeof(struct segstat) +
857                                      st->size[i] * sizeof(u16));
858
859         do {
860                 char rand;
861                 int p;
862                 found = 0;
863                 get_random_bytes(&rand, 1);
864                 p = rand & 3;
865
866                 while (p < 3 && n[p] == 0)
867                         p++;
868                 for ( ; p >= 0; p--)
869                         if (n[p]) {
870                                 int sn;
871                                 struct segstat *ss;
872                                 n[p]--;
873                                 sn = (p<<12) + n[p];
874
875                                 ss = segfollow(st, sn);
876                                 ss->next = st->unused.first;
877                                 st->unused.first = sn;
878                                 if (st->unused.cnt == 0)
879                                         st->unused.last = sn;
880                                 st->unused.cnt++;
881                                 found = 1;
882                         }
883         } while (found);
884         st->total = st->unused.cnt;
885         return 0;
886 }
887
888 void lafs_segtrack_free(struct segtracker *st)
889 {
890         kfree(st->page[0]);
891         kfree(st->page[1]);
892         kfree(st->page[2]);
893         kfree(st->page[3]);
894 }
895
896 int lafs_check_seg_cnt(struct segtracker *st)
897 {
898         /* check that the unused, free, cleanable and clean lists have correct counts.
899          */
900         int sn, cnt, prev;
901
902         cnt = 0;
903         for (prev = sn = st->unused.first ;
904              sn != 0xFFFF ;
905              sn = segfollow(st, (prev = sn))->next)
906                 cnt++;
907         if (cnt != st->unused.cnt) {
908                 printk("%d != %d\n", cnt, st->unused.cnt); WARN_ON(1);
909                 return 1;
910         }
911         if (st->unused.last != prev) {
912                 printk("L%d != %d\n", prev, st->unused.last); WARN_ON(1);
913                 return 1;
914         }
915
916         cnt = 0;
917         for (prev = sn = st->free.first ;
918              sn != 0xFFFF;
919              sn = segfollow(st, (prev = sn))->next)
920                 cnt++;
921         if (cnt != st->free.cnt) {
922                 printk("%d != %d\n", cnt, st->free.cnt); WARN_ON(1);
923                 return 1;
924         }
925         if (st->free.last != prev) {
926                 printk("L%d != %d\n", prev, st->free.last); WARN_ON(1);
927                 return 1;
928         }
929
930         cnt = 0;
931         for (prev = sn = st->clean.first ;
932              sn != 0xFFFF;
933              sn = segfollow(st, (prev = sn))->next)
934                 cnt++;
935         if (cnt != st->clean.cnt) {
936                 printk("%d != %d\n", cnt, st->clean.cnt); WARN_ON(1);
937                 return 1;
938         }
939         if (st->clean.last != prev) {
940                 printk("L%d != %d\n", prev, st->clean.last); WARN_ON(1);
941                 return 1;
942         }
943
944         cnt = 0;
945         for (prev = sn = st->cleanable.first ;
946              sn != 0xFFFF;
947              sn = segfollow(st, (prev = sn))->next)
948                 cnt++;
949         if (cnt != st->cleanable.cnt) {
950                 printk("%d != %d\n", cnt, st->cleanable.cnt); WARN_ON(1);
951                 return 1;
952         }
953         if (st->cleanable.last != prev) {
954                 printk("L%d != %d\n", prev, st->cleanable.last); WARN_ON(1);
955                 return 1;
956         }
957
958         return 0;
959 }
960
961 static void segsort(struct segtracker *st, struct slist *l)
962 {
963         /* sort the 'l' list of st by score - lowest first */
964         /* use a natural merge sort: each pass merges runs from head[0] and head[1] */
965
966         u16 last = 0xFFFF;
967         u16 head[2];
968         head[0] = l->first;
969         head[1] = 0xFFFF;
970
971         do {
972                 u16 *hp[2], h[2];
973                 int curr = 0;
974                 u32 prev = 0;
975                 int next = 0;
976
977                 hp[0] = &head[0];
978                 hp[1] = &head[1];
979
980                 h[0] = head[0];
981                 h[1] = head[1];
982
983                 /* Find the least of h[0] and h[1] that is not less than
984                  * prev, and add it to hp[curr].  If both are less than prev
985                  * (both runs have ended), flip 'curr' and add the smallest.
986                  */
987                 while (h[0] != 0xFFFF || h[1] != 0xFFFF) {
988                         if (h[next] == 0xFFFF ||
989                             (h[1-next] != 0xFFFF &&
990                              !((prev <= segfollow(st, h[1-next])->score)
991                                ^ (segfollow(st, h[1-next])->score
992                                   <= segfollow(st, h[next])->score)
993                                ^ (segfollow(st, h[next])->score <= prev))
994                                     ))
995                                 next = 1 - next;
996
997                         if (segfollow(st, h[next])->score < prev)
998                                 curr = 1 - curr;
999                         prev = segfollow(st, h[next])->score;
1000                         *hp[curr] = h[next];
1001                         hp[curr] = &segfollow(st, h[next])->next;
1002                         last = h[next];
1003                         h[next] = segfollow(st, h[next])->next;
1004                 }
1005                 *hp[0] = 0xFFFF;
1006                 *hp[1] = 0xFFFF;
1007         } while (head[0] != 0xFFFF && head[1] != 0xFFFF);
1008         if (head[0] == 0xFFFF)
1009                 l->first = head[1];
1010         else
1011                 l->first = head[0];
1012         l->last = last;
1013 }
1014
1015 static int statcmp(struct segstat *ss, u16 dev, u32 seg)
1016 {
1017         if (ss->dev < dev)
1018                 return -1;
1019         if (ss->dev > dev)
1020                 return 1;
1021         if (ss->segment < seg)
1022                 return -1;
1023         if (ss->segment > seg)
1024                 return 1;
1025         return 0;
1026 }
1027
1028 static struct segstat *segfind(struct segtracker *st, u16 dev,
1029                                u32 segnum, u16 *where[SEG_NUM_HEIGHTS])
1030 {
1031         /* Find the segment entry for dev/segnum.
1032          * Return link info in where to allow insertion/deletion.
1033          */
1034         int level;
1035         u16 *head = st->head;
1036         for (level = SEG_NUM_HEIGHTS-1 ; level >= 0; level--) {
1037                 while (head[level] != 0xffff &&
1038                        statcmp(segfollow(st, head[level]), dev, segnum) < 0) {
1039                         head = segfollow(st, head[level])->skip;
1040                 }
1041                 where[level] = &head[level];
1042         }
1043         if (head[0] == 0xFFFF ||
1044             statcmp(segfollow(st, head[0]), dev, segnum) != 0)
1045                 return NULL;
1046
1047         return segfollow(st, head[0]);
1048 }
1049
1050 static int segchoose_height(struct segtracker *st, u16 ssn)
1051 {
1052         unsigned long rand[DIV_ROUND_UP(SEG_NUM_HEIGHTS * 2 / 8 + 1,
1053                                         sizeof(unsigned long))];
1054         int max = st->size[ssn>>12];
1055         int h = 0;
1056         get_random_bytes(rand, sizeof(rand));
1057
1058         if (max > 1)
1059                 h = 1;
1060
1061         while (h < max &&
1062                test_bit(h*2, rand) &&
1063                test_bit(h*2+1, rand))
1064                 h++;
1065         return h;
1066 }
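/* Note: each additional level requires two random bits to both be set, so
 * the height grows with probability 1/4 per level - an expected skip-list
 * fan-out of 4 - capped at st->size[] for the page the entry lives in.
 */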
1067
1068 static void seginsert(struct segtracker *st, u16 ssn,
1069                       u16 *where[SEG_NUM_HEIGHTS])
1070 {
1071         /* We looked for 'ss' but didn't find it.  'where' is the
1072          * result of looking.  Now insert 'ss'
1073          */
1074         struct segstat *ss = segfollow(st, ssn);
1075         int h = segchoose_height(st, ssn);
1076         while (h >= 0) {
1077                 ss->skip[h] = *where[h];
1078                 *where[h] = ssn;
1079                 h--;
1080         }
1081 }
1082
1083 static void segdelete(struct segtracker *st, struct segstat *ss)
1084 {
1085         /* This segstat has just been removed from a list (free or cleanable).
1086          * Remove it from the skiplist as well
1087          */
1088         u16 *where[SEG_NUM_HEIGHTS];
1089         struct segstat *ss2 = segfind(st, ss->dev, ss->segment, where);
1090         int h;
1091         int pos;
1092
1093         BUG_ON(ss2 == NULL);
1094
1095         pos = *where[0];
1096         for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos ; h++)
1097                 *where[h] = ss2->skip[h];
1098         ss2->next = st->unused.first;
1099         st->unused.first = pos;
1100         if (st->unused.cnt == 0)
1101                 st->unused.last = pos;
1102         st->unused.cnt++;
1103         lafs_check_seg_cnt(st);
1104 }
1105
1106 static void segdelete_all(struct segtracker *st, struct fs *fs)
1107 {
1108         /* walk entire skip list.  Any entry with score of SCORE_DEAD
1109          * is deleted
1110          */
1111         u16 *where[SEG_NUM_HEIGHTS];
1112         int h;
1113
1114         for (h = SEG_NUM_HEIGHTS-1; h >= 0; h--)
1115                 where[h] = &st->head[h];
1116
1117         while (*where[0] != 0xFFFF) {
1118                 u16 pos = *where[0];
1119                 struct segstat *ss = segfollow(st, pos);
1120                 if (ss->score == SCORE_DEAD) {
1121                         /* delete this entry */
1122
1123                         /* FIXME Locking is really bad here */
1124                         struct segsum *ssm;
1125                         ssm = segsum_find(fs, ss->segment, ss->dev, 0);
1126                         putdref(ssm->ssblk, MKREF(intable));
1127                         putdref(ssm->youthblk, MKREF(intable));
1128                         ss_put(ssm, fs);
1129
1130                         for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos; h++)
1131                                 *where[h] = ss->skip[h];
1132                         ss->next = st->unused.first;
1133                         st->unused.first = pos;
1134                         if (st->unused.cnt == 0)
1135                                 st->unused.last = pos;
1136                         st->unused.cnt++;
1137                         lafs_check_seg_cnt(st);
1138                 } else {
1139                         /* advance 'where' to here */
1140                         for (h = 0; h < SEG_NUM_HEIGHTS && *where[h] == pos; h++)
1141                                 where[h] = &ss->skip[h];
1142                 }
1143         }
1144 }
1145
1146 static u16 seg_pop(struct segtracker *st, struct slist *which)
1147 {
1148         struct segstat *ss;
1149         u16 rv = which->first;
1150         if (rv == 0xFFFF)
1151                 return rv;
1152         ss = segfollow(st, rv);
1153         which->first = ss->next;
1154         which->cnt--;
1155         if (ss->next == 0xFFFF)
1156                 which->last = 0xFFFF;
1157         return rv;
1158 }
1159
1160 static struct segstat *seg_add_new(struct segtracker *st, struct slist *which, int atend,
1161                                    int dev, u32 seg, long long score, int usage,
1162                                    u16 *where[SEG_NUM_HEIGHTS])
1163 {
1164         int ssn;
1165         struct segstat *ss;
1166
1167         ssn = seg_pop(st, &st->unused);
1168         ss = segfollow(st, ssn);
1169         if (!ss)
1170                 return ss;
1171
1172         if (which) {
1173                 if (atend) {
1174                         int l = which->last;
1175                         ss->next = 0xFFFF;
1176                         if (l == 0xFFFF) {
1177                                 which->first = ssn;
1178                                 which->last = ssn;
1179                         } else {
1180                                 which->last = ssn;
1181                                 BUG_ON(segfollow(st, l)->next != 0xFFFF);
1182                                 segfollow(st, l)->next = ssn;
1183                         }
1184                 } else {
1185                         ss->next = which->first;
1186                         which->first = ssn;
1187                         if (which->last == 0xFFFF)
1188                                 which->last = ssn;
1189                 }
1190                 which->cnt++;
1191         }
1192         ss->dev = dev;
1193         ss->segment = seg;
1194         ss->score = score;
1195         ss->usage = usage;
1196         seginsert(st, ssn, where);
1197         lafs_check_seg_cnt(st);
1198         return ss;
1199 }
1200
1201 void lafs_free_get(struct fs *fs, unsigned int *dev, u32 *seg,
1202                    int nonlogged)
1203 {
1204         /* Select and return a free segment.
1205          * The youth value must have been zero, and we set it to the
1206          * current youth number.
1207          */
1208         struct segstat *ss;
1209         struct segsum *ssum = NULL;
1210         int ssnum;
1211
1212         BUG_ON(nonlogged); // FIXME should handle this case, not ignore it
1213
1214 again:
1215         spin_lock(&fs->lock);
1216
1217         wait_event_lock(fs->phase_wait,
1218                         !fs->scan.first_free_pass ||
1219                         fs->segtrack->free.first != 0xFFFF,
1220                         fs->lock);
1221
1222         ss = segfollow(fs->segtrack, fs->segtrack->free.first);
1223         BUG_ON(!ss);
1224
1225         *dev = ss->dev;
1226         *seg = ss->segment;
1227
1228         if (!test_bit(DelayYouth, &fs->fsstate) &&
1229             !(ssum &&
1230               ssum->devnum == ss->dev &&
1231               ssum->segnum == ss->segment)) {
1232                 spin_unlock(&fs->lock);
1233                 if (ssum)
1234                         ss_put(ssum, fs);
1235
1236                 ssum = segsum_find(fs, ss->segment, ss->dev, 0);
1237                 /* As we hold a ref on youth block for anything in the
1238                  * table, and that block was loaded at the time, it must
1239                  * still be valid.
1240                  */
1241                 BUG_ON(!ssum || !ssum->youthblk);
1242                 BUG_ON(!test_bit(B_Valid, &ssum->youthblk->b.flags));
1243                 set_bit(B_PinPending, &ssum->youthblk->b.flags);
1244                 lafs_pin_dblock(ssum->youthblk, AccountSpace);
1245                 goto again;
1246         }
1247         seg_pop(fs->segtrack, &fs->segtrack->free);
1248
1249         ss->score = SCORE_ACTIVE;
1250         /* still in table, but unlinked */
1251
1252         if (ssum && test_bit(DelayYouth, &fs->fsstate)) {
1253                 ss_put(ssum, fs);
1254                 ssum = NULL;
1255         }
1256
1257         if (ssum)
1258                 set_youth(fs, ssum);
1259
1260         spin_unlock(&fs->lock);
1261
1262         /* now need to reserve/dirty/reference the youth and
1263          * segsum block for each snapshot that could possibly
1264          * get written here.
1265          * NOTE: this needs fixing to support snapshots. Later.
1266          */
1267         for (ssnum = 0; ssnum < 1 ; ssnum++) {
1268                 struct segsum *ssum2;
1269                 ssum2 = segsum_find(fs, ss->segment, ss->dev, ssnum);
1270                 if (IS_ERR(ssum2))
1271                         /* ?? what do I need to release etc */
1272                         /* Maybe this cannot fail because we own references
1273                          * to the two blocks !! */
1274                         LAFS_BUG(1, NULL);
1275                 lafs_checkpoint_lock(fs);
1276                 set_bit(B_PinPending, &ssum2->ssblk->b.flags);
1277                 (void)lafs_pin_dblock(ssum2->ssblk, AccountSpace);
1278                 lafs_checkpoint_unlock(fs);
1279         }
1280         if (ssum)
1281                 ss_put(ssum, fs);
1282
1283         dprintk("NEXT segment found %d/%d youth %d\n",
1284                 *dev, *seg, fs->youth_next - 1);
1285         /* Note that we return an implicit reference to the ssum */
1286 }
1287
1288 void lafs_update_youth(struct fs *fs, int dev, u32 seg)
1289 {
1290         struct segsum *ssum = NULL;
1291         ssum = segsum_find(fs, seg, dev, 0);
1292         set_bit(B_PinPending, &ssum->youthblk->b.flags);
1293         lafs_pin_dblock(ssum->youthblk, AccountSpace);
1294         spin_lock(&fs->lock);
1295         set_youth(fs, ssum);
1296         spin_unlock(&fs->lock);
1297         ss_put(ssum, fs);
1298 }
1299
1300 void lafs_seg_forget(struct fs *fs, int dev, u32 seg)
1301 {
1302         /* this segment was being filled and is now full.
1303          * We need to drop it from the table, and drop
1304          * references to the blocks
1305          */
1306         struct segstat tmp;
1307         struct segsum *ss;
1308
1309         spin_lock(&fs->lock);
1310         tmp.dev = dev;
1311         tmp.segment = seg;
1312         segdelete(fs->segtrack, &tmp);
1313         spin_unlock(&fs->lock);
1314
1315         ss = segsum_find(fs, seg, dev, 0);
1316         BUG_ON(IS_ERR(ss));
1317         BUG_ON(atomic_read(&ss->refcnt) < 2);
1318         /* Removed from table so ... */
1319         putdref(ss->ssblk, MKREF(intable));
1320         putdref(ss->youthblk, MKREF(intable));
1321         ss_put(ss, fs);
1322         ss_put(ss, fs);
1323 }
1324
1325 int lafs_clean_count(struct fs *fs, int *any_clean)
1326 {
1327         int rv;
1328         spin_lock(&fs->lock);
1329         *any_clean = fs->segtrack->clean.cnt != 0;
1330         rv = fs->segtrack->free.cnt + fs->segtrack->clean.cnt;
1331         spin_unlock(&fs->lock);
1332         return rv;
1333 }
1334
1335 static int add_free(struct fs *fs, unsigned int dev, u32 seg, u16 *youthp)
1336 {
1337         /* This dev/seg is known to be free. add it to the list */
1338         struct segstat *ss;
1339         u16 *where[SEG_NUM_HEIGHTS];
1340         dprintk("add_free %d %d\n", (int)dev, (int)seg);
1341         spin_lock(&fs->lock);
1342         if (*youthp) {
1343                 /* not really free any more */
1344                 spin_unlock(&fs->lock);
1345                 return 0;
1346         }
1347
1348         ss = segfind(fs->segtrack, dev, seg, where);
1349         if (ss) {
1350                 /* already in the table.  If it happens to be
1351                  * on the cleanable list, we have to wait for
1352                  * the cleaner to find it and add it to our list.
1353                  */
1354                 spin_unlock(&fs->lock);
1355                 return 0;
1356         }
1357         if (fs->segtrack->free.cnt + fs->segtrack->clean.cnt >=
1358             fs->segtrack->total / 2) {
1359                 /* Have enough free/clean entries already */
1360                 spin_unlock(&fs->lock);
1361                 return 0;
1362         }
1363
1364         seg_add_new(fs->segtrack, &fs->segtrack->free, 0,
1365                     dev, seg, 0, 0, where);
1366
1367         spin_unlock(&fs->lock);
1368         return 1;
1369 }
1370
1371 static int add_clean(struct fs *fs, unsigned int dev, u32 seg, bool if_present)
1372 {
1373         /* This dev/seg is now clean.  Make sure it is on the list.
1374          * Chances are that this is already present in the table as
1375          * a recently cleaned segment.  If 'if_present', then insist
1376          * on this.
1377          * Return TRUE if segment was added to the table (as this
1378          * implies a reference count).
1379          */
1380         struct segstat *ss;
1381         u16 *where[SEG_NUM_HEIGHTS];
1382
1383         dprintk("ADD CLEAN %d/%d %d #####################################\n",
1384                 dev, seg, fs->segtrack->clean.cnt);
1385         spin_lock(&fs->lock);
1386         ss = segfind(fs->segtrack, dev, seg, where);
1387         if (ss) {
1388                 /* already in the table.  If loose, attach it to
1389                  * the clean list.  If cleanable, set score to 0 to make
1390                  * sure it is found soon
1391                  */
1392                 if (ss->score == SCORE_CLEANING) {
1393                         ss->next = fs->segtrack->clean.first;
1394                         fs->segtrack->clean.first = where[0][0];
1395                         if (fs->segtrack->clean.last == 0xFFFF)
1396                                 fs->segtrack->clean.last =
1397                                         fs->segtrack->clean.first;
1398                         fs->segtrack->clean.cnt++;
1399                 }
1400                 if (ss->score != SCORE_ACTIVE) {
1401                         /* Must be on the clean list now */
1402                         ss->score = 0;
1403                         ss->usage = 0;
1404                 }
1405                 spin_unlock(&fs->lock);
1406                 return 0;
1407         }
1408         if (if_present) {
1409                 spin_unlock(&fs->lock);
1410                 return 0;
1411         }
1412
1413         if (fs->segtrack->free.cnt + fs->segtrack->clean.cnt >=
1414             fs->segtrack->total / 2) {
1415                 /* Have enough free/clean entries already */
1416                 spin_unlock(&fs->lock);
1417                 return 0;
1418         }
1419
1420         ss = seg_add_new(fs->segtrack, &fs->segtrack->clean, 0,
1421                          dev, seg, 0, 0, where);
1422
1423         spin_unlock(&fs->lock);
1424         if (ss)
1425                 return 1;
1426         else
1427                 return 0;
1428 }
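/* Illustrative note: the free, clean and cleanable lists are threaded
 * through the segstat slots by u16 index rather than by pointer, with
 * 0xFFFF as the end-of-list sentinel.  A minimal sketch of how add_clean
 * links an existing entry onto the clean list, where 'st' stands for the
 * segtracker and 'slot' for the index found in where[0][0] (both names
 * are placeholders for this sketch only):
 *
 *      ss->next = st->clean.first;
 *      st->clean.first = slot;
 *      if (st->clean.last == 0xFFFF)
 *              st->clean.last = slot;
 *      st->clean.cnt++;
 */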
1429
1430 void lafs_add_active(struct fs *fs, u64 addr)
1431 {
1432         /* add this segment to the table as 'active'.
1433          * This is only called once at mount time for the
1434          * active segments.  Other segments become active
1435          * via free_get
1436          */
1437         struct segstat *ss;
1438         u16 *where[SEG_NUM_HEIGHTS];
1439         struct segsum *ssum = segsum_byaddr(fs, addr, 0);
1440         (void)getdref(ssum->ssblk, MKREF(intable));
1441         (void)getdref(ssum->youthblk, MKREF(intable));
1442         spin_lock(&fs->lock);
1443         ss = segfind(fs->segtrack, ssum->devnum, ssum->segnum, where);
1444         BUG_ON(ss);
1445         ss = seg_add_new(fs->segtrack, NULL, 0,
1446                          ssum->devnum, ssum->segnum,
1447                          SCORE_ACTIVE, 0, where);
1448         BUG_ON(!ss);
1449
1450         spin_unlock(&fs->lock);
1451 }
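/* Reference convention used here and elsewhere: every entry in the
 * tracking table holds one getdref(..., MKREF(intable)) reference on
 * both the segment-usage block and the youth block.  They are taken
 * when the entry is created (above, and in the scan code) and must be
 * dropped with matching putdref() calls by whoever removes the entry,
 * as lafs_seg_forget() and lafs_get_cleanable() do.
 */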
1452
1453 void lafs_clean_free(struct fs *fs)
1454 {
1455         /* We are finishing off a checkpoint.  Move all from 'clean'
1456          * list to 'free' list, and set the youth for each to 0.
1457          * Note that we can walk the 'clean' list without locking as
1458          * items are only ever removed by this thread.
1459          */
1460         int ssn;
1461         struct segstat *ss;
1462         if (fs->segtrack->clean.cnt == 0)
1463                 return;
1464         for (ssn = fs->segtrack->clean.first ; ssn != 0xffff; ssn = ss->next) {
1465                 struct datablock *db;
1466                 int err;
1467                 ss = segfollow(fs->segtrack, ssn);
1468                 spin_lock(&fs->lock);
1469                 fs->free_blocks += fs->devs[ss->dev].segment_size;
1470                 spin_unlock(&fs->lock);
1471                 db = lafs_get_block(fs->devs[ss->dev].segsum,
1472                                     ss->segment >> (fs->blocksize_bits
1473                                                     - YOUTH_SHIFT),
1474                                     NULL, GFP_KERNEL | __GFP_NOFAIL,
1475                                     MKREF(cleanfree));
1476                 err = lafs_read_block(db);
1477                 if (err == 0)
1478                         err = lafs_reserve_block(&db->b, AccountSpace);
1479                 if (err == 0) {
1480                         u16 *b = map_dblock(db);
1481                         spin_lock(&fs->stable_lock);
1482                         b[ss->segment & ((fs->blocksize-1)>>YOUTH_SHIFT)] = 0;
1483                         spin_unlock(&fs->stable_lock);
1484                         unmap_dblock(db, b);
1485                         lafs_dirty_dblock(db);
1486                 } else
1487                         /* FIXME make filesystem read-only */
1488                         BUG();
1489                 putdref(db, MKREF(cleanfree));
1490         }
1491         /* Now move them to the 'free' list */
1492         spin_lock(&fs->lock);
1493         fs->segtrack->free.cnt += fs->segtrack->clean.cnt;
1494         if (fs->segtrack->free.last == 0xFFFF)
1495                 fs->segtrack->free.first = fs->segtrack->clean.first;
1496         else
1497                 segfollow(fs->segtrack, fs->segtrack->free.last)->next
1498                         = fs->segtrack->clean.first;
1499         fs->segtrack->free.last = fs->segtrack->clean.last;
1500         fs->segtrack->clean.first = 0xFFFF;
1501         fs->segtrack->clean.last = 0xFFFF;
1502         fs->segtrack->clean.cnt = 0;
1503         spin_unlock(&fs->lock);
1504 }
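/* Worked example of the youth-table addressing used above.  The numbers
 * are illustrative only and assume YOUTH_SHIFT == 1 (2-byte youth
 * entries) and a 4096-byte block (blocksize_bits == 12):
 *
 *      entries per youth block:  4096 >> 1                  = 2048
 *      block for segment 10000:  10000 >> (12 - 1)          = 4
 *      slot within that block:   10000 & ((4096 - 1) >> 1)  = 1808
 *
 * so segment 10000's youth lives in slot 1808 of youth block 4, and that
 * is the entry zeroed above once the segment is free.
 */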
1505
1506 void lafs_dump_cleanable(void)
1507 {
1508         int i;
1509         struct segtracker *st;
1510         u16 ssn;
1511         struct segstat *ss;
1512         if (!dfs) {
1513                 printk("No cleanable table\n");
1514                 return;
1515         }
1516         st = dfs->segtrack;
1517         printk("============= Cleanable table (%d) =================\n",
1518                st->cleanable.cnt);
1519         printk("pos: dev/seg  usage score\n");
1520
1521         i = 0;
1522         for (ssn = st->cleanable.first; ssn != 0xffff; ssn = ss->next) {
1523                 ss = segfollow(st, ssn);
1524
1525                 printk("%3d: %3d/%-4d %5d %lld\n",
1526                        i, ss->dev,
1527                        ss->segment,
1528                        ss->usage,
1529                        ss->score);
1530                 i++;
1531         }
1532         printk("...sorted....\n");
1533         segsort(st, &st->cleanable);
1534         i = 0;
1535         for (ssn = st->cleanable.first; ssn != 0xffff; ssn = ss->next) {
1536                 ss = segfollow(st, ssn);
1537
1538                 printk("%3d: %3d/%-4d %5d %lld\n",
1539                        i, ss->dev,
1540                        ss->segment,
1541                        ss->usage,
1542                        ss->score);
1543                 i++;
1544         }
1545         printk("--------------- Free table (%d) ---------------\n",
1546                st->free.cnt);
1547         i = 0;
1548         for (ssn = st->free.first; ssn != 0xffff; ssn = ss->next) {
1549                 ss = segfollow(st, ssn);
1550
1551                 printk("%3d: %3d/%-4d %5d %lld\n",
1552                        ssn, ss->dev,
1553                        ss->segment,
1554                        ss->usage,
1555                        ss->score);
1556                 i++;
1557         }
1558         printk("--------------- Clean table (%d) ---------------\n",
1559                st->clean.cnt);
1560         i = 0;
1561         for (ssn = st->clean.first; ssn != 0xffff; ssn = ss->next) {
1562                 ss = segfollow(st, ssn);
1563
1564                 printk("%3d: %3d/%-4d %5d %lld\n",
1565                        ssn, ss->dev,
1566                        ss->segment,
1567                        ss->usage,
1568                        ss->score);
1569                 i++;
1570         }
1571         printk("--------\n");
1572         printk("free_blocks=%lld allocated=%llu max_seg=%llu clean_reserved=%llu\n",
1573                dfs->free_blocks, dfs->allocated_blocks, dfs->max_segment,
1574                dfs->clean_reserved);
1575 }
1576
1577 int lafs_get_cleanable(struct fs *fs, u16 *dev, u32 *seg)
1578 {
1579         /* Return the most cleanable segment on the list.
1580          * The segstat is removed from the 'cleanable' list
1581          * and left in the table.  This prevents it being re-added.
1582          * When it is later found to be clean, it will be added
1583          * to the 'clean' list.
1584          */
1585         struct segtracker *st = fs->segtrack;
1586         struct segsum *ssm;
1587         struct segstat *ss;
1588
1589         lafs_check_seg_cnt(st);
1590
1591 retry:
1592         spin_lock(&fs->lock);
1593         if (st->cleanable.first == 0xFFFF) {
1594                 spin_unlock(&fs->lock);
1595                 return 0;
1596         }
1597
1598         if (st->sorted_size < st->cleanable.cnt / 2) {
1599                 segsort(st, &st->cleanable);
1600                 st->sorted_size = st->cleanable.cnt;
1601                 st->max_score = segfollow(st, st->cleanable.last)->score;
1602         }
1603
1604         lafs_check_seg_cnt(fs->segtrack);
1605
1606         ss = segfollow(st, seg_pop(st, &st->cleanable));
1607
1608         if (lafs_check_seg_cnt(fs->segtrack)) {
1609                 int sj;
1610                 printk("first=%x last=%x cnt=%d\n", st->cleanable.first,
1611                        st->cleanable.last, st->cleanable.cnt);
1612                 for (sj = st->cleanable.first ;
1613                      sj != 0xFFFF;
1614                      sj = segfollow(st, sj)->next)
1615                         printk("  %x\n", sj);
1616
1617                 WARN_ON(1);
1618         }
1619
1620         st->sorted_size--;
1621
1622         *dev = ss->dev;
1623         *seg = ss->segment;
1624
1625         dprintk("SEG: cleanable %d/%d score=%llu usage=%d\n",
1626                 ss->dev, ss->segment, (unsigned long long)ss->score, ss->usage);
1627
1628         ss->score = SCORE_CLEANING;
1629         if (ss->usage == 0) {
1630                 int rv;
1631                 spin_unlock(&fs->lock);
1632                 rv = add_clean(fs, *dev, *seg, true);
1633                 BUG_ON(rv); /* passing 'true' makes this impossible */
1634                 goto retry;
1635         }
1636         segdelete(fs->segtrack, ss);
1637         spin_unlock(&fs->lock);
1638
1639         ssm = segsum_find(fs, ss->segment, ss->dev, 0);
1640         putdref(ssm->ssblk, MKREF(intable));
1641         putdref(ssm->youthblk, MKREF(intable));
1642         ss_put(ssm, fs);
1643
1644         return 1;
1645 }
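/* Sketch of how a caller might drive this; clean_one_segment() is a
 * hypothetical helper, not a function in this file:
 *
 *      u16 dev;
 *      u32 seg;
 *
 *      while (lafs_get_cleanable(fs, &dev, &seg))
 *              clean_one_segment(fs, dev, seg);
 *
 * Each successful call returns one dev/seg pair with its score set to
 * SCORE_CLEANING; a return of 0 means the cleanable list is empty.
 */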
1646
1647 static int add_cleanable(struct fs *fs, unsigned int dev, u32 seg,
1648                    u16 youth, u32 usage)
1649 {
1650         u64 score;
1651         struct segstat *ss;
1652         u32 segsize;
1653         u16 *where[SEG_NUM_HEIGHTS];
1654
1655         lafs_check_seg_cnt(fs->segtrack);
1656         if (fs->scan.trace || lafs_trace || 1)
1657                 printk("CLEANABLE: %u/%lu y=%d u=%d\n",
1658                        dev, (unsigned long)seg, (int)youth, (int)usage);
1659         if (youth < 8)
1660                 return 0;
1661         if (youth >= fs->checkpoint_youth)
1662                 return 0;
1663
1664         segsize = fs->devs[dev].segment_size;
1665         fs->total_free += segsize - usage /* - 1 */;
1666
1667         if (usage == 0) {
1668                 int rv = add_clean(fs, dev, seg, false);
1669                 lafs_check_seg_cnt(fs->segtrack);
1670                 return rv;
1671         }
1672
1673         if (test_bit(EmergencyClean, &fs->fsstate))
1674                 score = usage;
1675         else {
1676                 /* 0x100000000 is to ensure this score is always
1677                  * more than the above score */
1678                 score = (u64)youth * usage;
1679                 do_div(score, segsize);
1680                 score += 0x100000000;
1681         }
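        /* Worked example with illustrative numbers: youth == 100,
         * usage == 1000, segsize == 2048 gives
         *      score = 100 * 1000 / 2048 + 0x100000000 = 48 + 0x100000000.
         * Lower scores are cleaned first, and the 0x100000000 offset keeps
         * every youth-weighted score above any plain-usage (EmergencyClean)
         * score, which is at most 0xFFFFFFFF.
         */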
1682
1683         spin_lock(&fs->lock);
1684         if (score > SCORE_MAX)
1685                 score = SCORE_MAX;
1686
1687         ss = segfind(fs->segtrack, dev, seg, where);
1688         if (ss) {
1689                 /* already in the table.  Just update the usage.
1690                  * It must be on the right list.
1691                  */
1692                 if (ss->score == SCORE_ACTIVE)
1693                         ; /* still in use */
1694                 else if (ss->usage == 0 && ss->score > 0)
1695                         ; /* on free list, leave it alone */
1696                 else {
1697                         ss->usage = usage;
1698                         ss->score = score;
1699                 }
1700                 lafs_check_seg_cnt(fs->segtrack);
1701                 spin_unlock(&fs->lock);
1702                 return 0;
1703         }
1704
1705         if (fs->segtrack->cleanable.cnt >= fs->segtrack->total/2) {
1706                 /* Table of cleanable segments is full.  Sort and discard
1707                  * half
1708                  */
1709                 int ssnum, prev;
1710                 struct segstat *ss;
1711                 int keep, i;
1712                 segsort(fs->segtrack, &fs->segtrack->cleanable);
1713                 ssnum = fs->segtrack->cleanable.first;
1714                 keep = (fs->segtrack->cleanable.cnt+1) / 2;
1715                 for (i = 0; i < keep; i++) {
1716                         prev = ssnum;
1717                         ss = segfollow(fs->segtrack, ssnum);
1718                         ssnum = ss->next;
1719                 }
1720                 fs->segtrack->max_score = ss->score;
1721                 ss->next = 0xffff;
1722                 fs->segtrack->cleanable.last = prev;
1723                 fs->segtrack->cleanable.cnt = keep;
1724                 ss = segfollow(fs->segtrack, ssnum);
1725                 while (ss) {
1726                         ss->score = SCORE_DEAD;
1727                         ss = segfollow(fs->segtrack, ss->next);
1728                 }
1729                 segdelete_all(fs->segtrack, fs);
1730                 fs->segtrack->sorted_size = fs->segtrack->total/2;
1731         }
1732         if (fs->segtrack->cleanable.cnt <= fs->segtrack->total/4)
1733                 fs->segtrack->max_score = 0;
1734         if (fs->segtrack->max_score &&
1735             fs->segtrack->max_score < score) {
1736                 /* score too high to bother with */
1737                 lafs_check_seg_cnt(fs->segtrack);
1738                 spin_unlock(&fs->lock);
1739                 return 0;
1740         }
1741
1742         seg_add_new(fs->segtrack, &fs->segtrack->cleanable, 1,
1743                     dev, seg, score, usage, where);
1744         spin_unlock(&fs->lock);
1745         return 1;
1746 }
1747
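/* merge_usage: fold one snapshot's usage block into the running
 * per-segment maximum kept in fs->scan.free_usages.  Both arrays hold
 * little-endian u32 usage counts; values are compared in CPU byte order
 * but stored back still little-endian, so further snapshot blocks can be
 * merged into the same buffer without conversion.
 */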
1748 static void merge_usage(struct fs *fs, u32 *d)
1749 {
1750         u32 *u = fs->scan.free_usages;
1751         int segperblk = fs->blocksize >> USAGE_SHIFT;
1752         int i;
1753
1754         for (i = 0; i < segperblk; i++)
1755                 if (le32_to_cpu(d[i]) > le32_to_cpu(u[i]))
1756                         u[i] = d[i];
1757 }
1758
1759 unsigned long lafs_scan_seg(struct fs *fs)
1760 {
1761         /* FIXME this comment is very out-dated */
1762         /* Process one block of youth or segment-usage data.  We
1763          * collect free segments (youth==0) into a table that is kept
1764          * sorted to ensure against duplicates.  It is treated like a
1765          * ring buffer with a head and a tail to distinguish free
1766          * space from used space.  head/tail point to the free space
1767          * (which is never empty).  As we scan all segments
1768          * sequentially any free segment we find is placed at the head
1769          * of the free list and the entry just after the tail might
1770          * get discarded if we run out of room, or if that entry
1771          * matches the entry we just inserted.  New free segments are
1772          * allocated from just after the tail.  i.e. we increment the
1773          * tail and return the entry recorded there.  If tail+1 ==
1774          * head, there are no segments in the buffer.
1775          *
1776          * We also collect potentially cleanable segments.  These are any
1777          * segments which are not empty and not non-logged (i.e. youth >= 8).
1778          * We record them in a table and whenever the table gets full, we
1779          * sort the table by score, and discard the less-cleanable half.
1780          * We only add segments to the table when it is less than half full,
1781          * or when the new segment has a higher score than the segment at
1782          * the half-way mark.
1783          *
1784          * To get the score for a segment, we need its youth and its usage.
1785          * The usage is the maximum usage from all snapshots.
1786          * We allocate a block to hold the current max usage.
1787          * We:
1788          *   load the youth table, find free segments, decay youth if needed.
1789          *   load the first usage table and copy to temporary block
1790          *   repeat: load other usage tables and merge
1791          *   calculate score for each segment and add those with a good score
1792          *
1793          * On the first pass through, we count all the free segments into
1794          * the fs->free_blocks.
1795          *
1796          * In the first pass, we continue to make a full pass without deliberately
1797          * waiting.  In subsequent passes ... only once per checkpoint???
1798          */
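/* Note on the block arithmetic below, assuming USAGE_SHIFT == 2 (4-byte
 * usage entries) and YOUTH_SHIFT == 1 (2-byte youth entries), which is
 * what the u32/u16 mappings in this file suggest:
 *
 *      youthblock = block >> (USAGE_SHIFT - YOUTH_SHIFT) = block >> 1
 *
 * i.e. one youth block covers the same segments as two usage blocks, so
 * the decay pass in stage 0 only rewrites the youth block when 'block'
 * is the first (aligned) usage block for it, which is what the
 * "youthblock << (USAGE_SHIFT - YOUTH_SHIFT) == block" test checks.
 */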
1799
1800         if (fs->scan.done)
1801                 return MAX_SCHEDULE_TIMEOUT;
1802
1803         dprintk("scan: dev=%d block=%d stage=%d\n",
1804                 fs->scan.free_dev, fs->scan.free_block, fs->scan.free_stage);
1805         if (fs->scan.free_stage == 0) {
1806                 /* Need to find the youth block for next dev/offset.
1807                  * Possibly we are finished with this dev and must go
1808                  * to next.  Possibly we are finished altogether.
1809                  */
1810                 int dev = fs->scan.free_dev;
1811                 int block = fs->scan.free_block + 1;
1812                 int youthblock;
1813                 int err;
1814
1815                 while (dev < 0 ||
1816                        block > (fs->devs[dev].segment_count
1817                                 >> (fs->blocksize_bits - 1))) {
1818                         dev++;
1819                         block = 0;
1820                         if (dev >= fs->devices) {
1821                                 fs->scan.free_dev = -1;
1822                                 dev = -1;
1823                                 fs->scan.do_decay = 0;
1824                                 fs->scan.trace = 0;
1825                                 fs->total_free_prev = fs->total_free;
1826                                 fs->total_free = 0;
1827                                 fs->scan.first_free_pass = 0;
1828                                 break;
1829                         }
1830                 }
                youthblock = block >> (USAGE_SHIFT - YOUTH_SHIFT);
1831                 if (fs->scan.youth_db)
1832                         if (fs->scan.youth_db->b.fileaddr != youthblock ||
1833                             dev < 0 ||
1834                             fs->scan.youth_db->b.inode != fs->devs[dev].segsum) {
1835                                 putdref(fs->scan.youth_db, MKREF(youth_scan));
1836                                 fs->scan.youth_db = NULL;
1837                         }
1838                 if (dev == -1) {
1839                         fs->scan.done = 1;
1840                         lafs_wake_thread(fs);
1841                         return MAX_SCHEDULE_TIMEOUT;
1842                 }
1843
1844                 if (fs->scan.youth_db == NULL)
1845                         fs->scan.youth_db =
1846                                 lafs_get_block(fs->devs[dev].segsum,
1847                                                youthblock,
1848                                                NULL, GFP_KERNEL, MKREF(youth_scan));
1849                 if (!fs->scan.youth_db) {
1850                         printk("EEEEEKKKKK get_block failed\n");
1851                         return MAX_SCHEDULE_TIMEOUT;
1852                 } else
1853                         switch (err = lafs_read_block_async(fs->scan.youth_db)) {
1854                         default:
1855                         case -EIO:
1856                                 printk("EEEEEKKKKK read of youth block failed\n");
1857                                 break;
1858                         case -EAGAIN:
1859                                 return MAX_SCHEDULE_TIMEOUT;
1860                         case 0:
1861                                 break;
1862                         }
1863
1864                 if (fs->scan.do_decay) {
1865                         /* youth_db must be writable */
1866                         struct datablock *db = fs->scan.youth_db;
1867                         lafs_checkpoint_lock(fs);
1868                         set_bit(B_PinPending, &db->b.flags);
1869                         lafs_pin_dblock(db, AccountSpace);
1870                 }
1871                 spin_lock(&fs->lock);
1872                 fs->scan.free_block = block;
1873                 fs->scan.free_dev = dev;
1874                 if (!err && fs->scan.do_decay &&
1875                     youthblock << (USAGE_SHIFT - YOUTH_SHIFT) == block) {
1876                         u16 *yp = map_dblock(fs->scan.youth_db);
1877                         int i;
1878                         int segperblk = fs->blocksize >> YOUTH_SHIFT;
1879
1880                         for (i = 0 ; i < segperblk ; i++) {
1881                                 int y = le16_to_cpu(yp[i]);
1882                                 if (y >= 8)
1883                                         yp[i] = cpu_to_le16(decay_youth(y));
1884                         }
1885                         unmap_dblock(fs->scan.youth_db, yp);
1886                         lafs_dirty_dblock(fs->scan.youth_db);
1887                 }
1888                 spin_unlock(&fs->lock);
1889                 if (fs->scan.do_decay)
1890                         lafs_checkpoint_unlock(fs);
1891                 if (err)
1892                         return 1;
1893                 fs->scan.free_stage = 1;
1894         }
1895         lafs_check_seg_cnt(fs->segtrack);
1896         if (fs->scan.free_stage == 1) {
1897                 /* Find the main usage block and copy the data into
1898                  * our temp block
1899                  */
1900                 struct datablock *db;
1901                 char *d;
1902                 u16 *yp, *yp0;
1903
1904                 int i;
1905                 int firstseg;
1906                 int segperblk = fs->blocksize >> USAGE_SHIFT;
1907                 int segments = segperblk;
1908                 int segcount;
1909                 int blks;
1910
1911                 db = lafs_get_block(fs->devs[fs->scan.free_dev].segsum,
1912                                     fs->scan.free_block +
1913                                     fs->devs[fs->scan.free_dev].tablesize,
1914                                     NULL, GFP_KERNEL, MKREF(usage0));
1915                 if (!db) {
1916                         printk("EEEEKKK get_block for first usage failed\n");
1917                 abort:
1918                         fs->scan.free_stage = 0;
1919                         return 1;
1920                 }
1921                 switch (lafs_read_block_async(db)) {
1922                 default:
1923                 case -EIO:
1924                         printk("EEEEKKK read of usage block failed\n");
1925                         putdref(db, MKREF(usage0));
1926                         goto abort;
1927                 case -EAGAIN:
1928                         putdref(db, MKREF(usage0));
1929                         return MAX_SCHEDULE_TIMEOUT;
1930                 case 0:
1931                         break;
1932                 }
1933                 d = map_dblock(db);
1934                 memcpy(fs->scan.free_usages, d, fs->blocksize);
1935                 unmap_dblock(db, d);
1936
1937                 /* Now add any free blocks */
1938                 segcount = fs->devs[fs->scan.free_dev].segment_count;
1939                 blks = segcount / segments;
1940                 if (fs->scan.free_block == blks)
1941                         segments = segcount % segperblk;
1942                 firstseg = fs->scan.free_block * segperblk;
1943
1944                 yp0 = yp = map_dblock(fs->scan.youth_db);
1945                 yp += (fs->scan.free_block -
1946                        (fs->scan.youth_db->b.fileaddr << (USAGE_SHIFT - YOUTH_SHIFT)))
1947                         * segperblk;
1948                 for (i = 0; i < segments ; i++)
1949                         if (yp[i] == cpu_to_le16(0)) {
1950                                 if (fs->scan.first_free_pass) {
1951                                         spin_lock(&fs->lock);
1952                                         fs->free_blocks +=
1953                                                 fs->devs[fs->scan.free_dev]
1954                                                 .segment_size;
1955                                         spin_unlock(&fs->lock);
1956                                 }
1957                                 if (add_free(fs, fs->scan.free_dev, firstseg + i,
1958                                              &yp[i])) {
1959                                         /* Everything in the table owns a reference
1960                                          * to youth and segusage[0]
1961                                          */
1962                                         (void)getdref(fs->scan.youth_db, MKREF(intable));
1963                                         (void)getdref(db, MKREF(intable));
1964                                 }
1965                                 fs->total_free +=
1966                                         fs->devs[fs->scan.free_dev]
1967                                         .segment_size /*- 1*/;
1968                         }
1969                 unmap_dblock(fs->scan.youth_db, yp0);
1970
1971                 fs->scan.usage0_db = db;
1972                 fs->scan.free_stage = 2;
1973         }
1974         lafs_check_seg_cnt(fs->segtrack);
1975         while (fs->scan.free_stage > 1 &&
1976                fs->scan.free_stage < fs->maxsnapshot + 1) {
1977                 struct datablock *db;
1978                 u32 *d;
1979
1980                 if (fs->ss[fs->scan.free_stage-1].root == NULL) {
1981                         fs->scan.free_stage++;
1982                         continue;
1983                 }
1984                 /* Load the usage block for this snapshot and merge */
1985
1986                 db = lafs_get_block(fs->devs[fs->scan.free_dev].segsum,
1987                                     fs->scan.free_block +
1988                                     fs->devs[fs->scan.free_dev].tablesize *
1989                                     LAFSI(fs->ss[fs->scan.free_stage-1].root)
1990                                     ->md.fs.usagetable,
1991                                     NULL, GFP_KERNEL, MKREF(usage_ss));
1992                 if (!db) {
1993                         printk("EEEEKKK get_block for subsequent usage failed\n");
1994                 abort2:
1995                         fs->scan.free_stage = 0;
1996                         putdref(fs->scan.usage0_db, MKREF(usage0));
1997                         fs->scan.usage0_db = NULL;
1998                         return 1;
1999                 }
2000                 switch (lafs_read_block_async(db)) {
2001                 default:
2002                 case -EIO:
2003                         printk("EEEEKKK read of subsequent usage block failed\n");
2004                         putdref(db, MKREF(usage_ss));
2005                         goto abort2;
2006                 case -EAGAIN:
2007                         putdref(db, MKREF(usage_ss));
2008                         return MAX_SCHEDULE_TIMEOUT;
2009                 case 0:
2010                         break;
2011                 }
2012                 d = map_dblock(db);
2013                 merge_usage(fs, d);
2014                 unmap_dblock(db, d);
2015                 fs->scan.free_stage++;
2016                 putdref(db, MKREF(usage_ss));
2017
2018                 lafs_check_seg_cnt(fs->segtrack);
2019         }
2020         if (fs->scan.free_stage == fs->maxsnapshot + 1) {
2021                 /* All usage data has been merged, we can record all these
2022                  * cleanable segments now
2023                  */
2024                 u16 *yp = map_dblock(fs->scan.youth_db);
2025                 u16 *yp0 = yp;
2026                 u32 *up = fs->scan.free_usages;
2027                 int i;
2028                 int segperblk = fs->blocksize >> USAGE_SHIFT;
2029                 int segments = segperblk;
2030                 int segcount = fs->devs[fs->scan.free_dev].segment_count;
2031                 int blks = segcount / segments;
2032                 if (fs->scan.free_block == blks)
2033                         segments = segcount % segperblk;
2034
2035                 yp += (fs->scan.free_block -
2036                        (fs->scan.youth_db->b.fileaddr << (USAGE_SHIFT - YOUTH_SHIFT)))
2037                         * segperblk;
2038                 for (i = 0; i < segments; i++)
2039                         if (add_cleanable(fs, fs->scan.free_dev,
2040                                           i + fs->scan.free_block * segperblk,
2041                                           le16_to_cpu(yp[i]),
2042                                           le16_to_cpu(up[i]))) {
2043                                 /* Every segment in table owns these
2044                                  * references
2045                                  */
2046                                 (void)getdref(fs->scan.youth_db, MKREF(intable));
2047                                 (void)getdref(fs->scan.usage0_db, MKREF(intable));
2048                         }
2049
2050                 unmap_dblock(fs->scan.youth_db, yp0);
2051                 putdref(fs->scan.usage0_db, MKREF(usage0));
2052                 fs->scan.usage0_db = NULL;
2053                 fs->scan.free_stage = 0;
2054         }
2055         lafs_check_seg_cnt(fs->segtrack);
2056         if (fs->scan.trace)
2057                 return HZ/10;
2058         else if (fs->scan.free_stage == 0)
2059                 return 1;
2060         else
2061                 return MAX_SCHEDULE_TIMEOUT;
2062 }
2063
2064 void lafs_empty_segment_table(struct fs *fs)
2065 {
2066         /* remove everything from the table. */
2067         int ssnum;
2068         struct segstat *ss;
2069
2070         ssnum = fs->segtrack->cleanable.first;
2071         fs->segtrack->cleanable.first = 0xffff;
2072         fs->segtrack->cleanable.last = 0xffff;
2073         fs->segtrack->cleanable.cnt = 0;
2074         while (ssnum != 0xffff) {
2075                 ss = segfollow(fs->segtrack, ssnum);
2076                 ss->score = SCORE_DEAD;
2077                 ssnum = ss->next;
2078         }
2079
2080         ssnum = fs->segtrack->clean.first;
2081         fs->segtrack->clean.first = 0xffff;
2082         fs->segtrack->clean.last = 0xffff;
2083         fs->segtrack->clean.cnt = 0;
2084         while (ssnum != 0xffff) {
2085                 ss = segfollow(fs->segtrack, ssnum);
2086                 ss->score = SCORE_DEAD;
2087                 ssnum = ss->next;
2088         }
2089
2090         ssnum = fs->segtrack->free.first;
2091         fs->segtrack->free.first = 0xffff;
2092         fs->segtrack->free.last = 0xffff;
2093         fs->segtrack->free.cnt = 0;
2094         while (ssnum != 0xffff) {
2095                 ss = segfollow(fs->segtrack, ssnum);
2096                 ss->score = SCORE_DEAD;
2097                 ssnum = ss->next;
2098         }
2099
2100         segdelete_all(fs->segtrack, fs);
2101 }
2102
2103 void lafs_dump_usage(void)
2104 {
2105         if (!dfs)
2106                 return;
2107         dfs->scan.trace = 1;
2108         dfs->scan.done = 0;
2109         lafs_wake_thread(dfs);
2110 }