/*
 * Handle index blocks for LaFS.
 * fs/lafs/index.c
 * Copyright (C) 2005-2009
 * Neil Brown <neilb@suse.de>
 * Released under the GPL, version 2
 */

#include        "lafs.h"
#include        <linux/types.h>
#include        <linux/hash.h>
#include        <linux/slab.h>
/*
 * For each index-block there is an 'indexblock' data structure,
 * and a 'block' of memory, which is a page or part thereof.
 *
 * Index-blocks are stored in a hash table, indexed by
 * (struct inode *, fileaddr, depth).  There is one hash table
 * for all filesystems.
 *
 * There is a single free-list for all index blocks from all
 * filesystems, of all sizes.  Index blocks only get on this list when
 * they are unpinned leafs, i.e. their 'children' list is empty.  As they
 * are freed, their parents may get added to the list.
 *
 * It's not clear yet how memory pressure will flush out dirty index blocks.
 *
 * All indexblocks for an inode are either attached under the inode via
 * the children/siblings tree, or on a per-inode 'free' list also using
 * the 'siblings' linkage.  Thus we are sure that they can be removed when
 * the inode is removed.  Note that data blocks do *not* necessarily live
 * in this tree (though they will when they are dirty).  They can always be
 * found in the radix_tree for the inode's address-space.
 *
 * STILL TO DO:
 *   Consider how 'rcu' can be used for
 *     - hash table
 *     - freelist
 *   Consider gang-freeing for adding to head of lru lists.
 */
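
/* Illustrative sketch (not part of the code): the cache key described
 * above is the triple (inode, file address, depth), so a lookup is just
 * a walk of one hash chain.  This mirrors ihash_lookup() below; 'bucket'
 * is a hypothetical local name:
 *
 *	struct hlist_head *bucket =
 *		&hash_table[ihash(inode, addr, depth)];
 *	hlist_for_each_entry(ib, tmp, bucket, hash)
 *		if (ib->b.fileaddr == addr && ib->b.inode == inode &&
 *		    ib->depth == depth)
 *			break;		// found in cache
 *
 * Note that two index blocks for the same file address can coexist
 * provided their depths differ.
 */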

/* FIXME these should be configured at runtime based on memory */
#define HASHBITS        10
static struct hlist_head hash_table[1<<HASHBITS];
spinlock_t lafs_hash_lock;

/* static */ struct freelists freelist;

static int lafs_shrinker(int nr_to_scan, /*gfp_t*/unsigned int gfp_mask)
{
        if (nr_to_scan) {
                LIST_HEAD(togo);
                if (!(gfp_mask & __GFP_FS))
                        return -1;
                dprintk("scan %d\n", nr_to_scan);
                spin_lock(&lafs_hash_lock);
                while (nr_to_scan && !list_empty(&freelist.lru)) {
                        struct indexblock *ib = list_entry(freelist.lru.next,
                                                           struct indexblock,
                                                           b.lru);
                        /* Check if it is really free */

                        if (atomic_read(&ib->b.refcnt)) {
                                list_del_init(&ib->b.lru);
                                clear_bit(B_OnFree, &ib->b.flags);
                                freelist.freecnt--;
                                continue;
                        }
                        if (test_bit(B_InoIdx, &ib->b.flags))
                                LAFSI(ib->b.inode)->iblock = NULL;

                        /* Any pinned iblock must have children, be
                         * refcounted, or be on a 'leaf' lru, which gives
                         * a refcount.  So this unreffed block cannot be
                         * pinned.  This BUG_ON is here because previously
                         * the above 'if' also tested for 'Pinned', and that
                         * seems unnecessary, but I want to be sure.
                         */
                        LAFS_BUG(test_bit(B_Pinned, &ib->b.flags), &ib->b);
                        if (!hlist_unhashed(&ib->hash))
                                hlist_del_init(&ib->hash);
                        /* delete from inode->free_index */
                        list_del_init(&ib->b.siblings);
                        list_move(&ib->b.lru, &togo);
                        nr_to_scan--;
                        freelist.freecnt--;
                }
                spin_unlock(&lafs_hash_lock);
                while (!list_empty(&togo)) {
                        struct indexblock *ib = list_entry(togo.next,
                                                           struct indexblock,
                                                           b.lru);
                        list_del(&ib->b.lru);
                        lafs_iblock_free(ib);
                }
        }
        return freelist.freecnt;
}

void lafs_release_index(struct list_head *head)
{
        /* This is the other place that index blocks get freed.
         * They are freed either by memory pressure in lafs_shrinker
         * above, or when an inode is destroyed by this code.
         * The two paths must obey the same locking rules, and ensure
         * blocks are removed from each of siblings, lru, and hash.
         * 'head' is an inode's free_index list, linked on b.siblings.
         * The blocks are all on freelist.lru.
         */
        LIST_HEAD(togo);
        spin_lock(&lafs_hash_lock);
        while (!list_empty(head)) {
                struct indexblock *ib = list_entry(head->next,
                                                   struct indexblock,
                                                   b.siblings);
                list_del_init(&ib->b.siblings);
                LAFS_BUG(!test_bit(B_OnFree, &ib->b.flags), &ib->b);
                if (!hlist_unhashed(&ib->hash))
                        hlist_del_init(&ib->hash);
                list_move(&ib->b.lru, &togo);
                freelist.freecnt--;
        }
        spin_unlock(&lafs_hash_lock);
        while (!list_empty(&togo)) {
                struct indexblock *ib = list_entry(togo.next,
                                                   struct indexblock,
                                                   b.lru);
                list_del(&ib->b.lru);
                lafs_iblock_free(ib);
        }
}

struct indexblock *
lafs_iblock_alloc(struct fs *fs, int gfp, int with_buffer, REFARG)
{
        struct indexblock *ib;

        ib = kzalloc(sizeof(*ib), gfp);
        if (!ib)
                return NULL;
        if (with_buffer) {
                ib->data = kmalloc(fs->blocksize, gfp);
                if (!ib->data) {
                        kfree(ib);
                        return NULL;
                }
        } else
                ib->data = NULL;

        ib->b.flags = 0;
        set_bit(B_Index, &ib->b.flags);
        atomic_set(&ib->b.refcnt, 1);
        add_ref(&ib->b, REF, __FILE__, __LINE__);

        INIT_LIST_HEAD(&ib->children);
        atomic_set(&ib->pincnt[0], 0);
        atomic_set(&ib->pincnt[1], 0);
        ib->uninc = NULL;
        ib->uninc_next = NULL;

        ib->b.parent = NULL;
        INIT_LIST_HEAD(&ib->b.siblings);
        INIT_LIST_HEAD(&ib->b.lru);
        INIT_LIST_HEAD(&ib->b.peers);
        INIT_HLIST_NODE(&ib->hash);
        ib->b.chain = NULL;
        ib->uninc_table.pending_cnt = 0;
        ib->uninc_table.credits = 0;
        ib->b.inode = NULL;
        return ib;
}

void lafs_iblock_free(struct indexblock *ib)
{
        kfree(ib->data);
        kfree(ib);
}

static struct shrinker hash_shrink = {
        .shrink = lafs_shrinker,
        .seeks = DEFAULT_SEEKS,
};

int lafs_ihash_init(void)
{
        int i;
        for (i = 0 ; i < (1<<HASHBITS); i++)
                INIT_HLIST_HEAD(hash_table+i);
        spin_lock_init(&lafs_hash_lock);
        INIT_LIST_HEAD(&freelist.lru);
        register_shrinker(&hash_shrink);

        return 0;
}

void lafs_ihash_free(void)
{
        unregister_shrinker(&hash_shrink);
}

static int
ihash(struct inode *inode, faddr_t addr, int depth)
{
        unsigned long h = hash_ptr(inode, BITS_PER_LONG);
        h = hash_long(h ^ addr, BITS_PER_LONG);
        return hash_long(h ^ depth, HASHBITS);
}

static struct indexblock *
ihash_lookup(struct inode *inode, faddr_t addr, int depth,
             struct indexblock *new, REFARG)
{
        struct indexblock *ib = NULL;
        struct hlist_node *tmp;
        struct hlist_head *head = &hash_table[ihash(inode, addr, depth)];

        spin_lock(&lafs_hash_lock);
        hlist_for_each_entry(ib, tmp, head, hash)
                if (ib->b.fileaddr == addr && ib->b.inode == inode &&
                    ib->depth == depth)
                        break;
        if (!tmp) {
                if (new) {
                        hlist_add_head(&new->hash, head);
                        ib = new;
                } else
                        ib = NULL;
        }
        if (ib)
                getiref_locked_needsync(ib, REF);
        spin_unlock(&lafs_hash_lock);
        if (ib)
                sync_ref(&ib->b);
        return ib;
}

struct indexblock *lafs_getiref_locked(struct indexblock *ib)
{
        int ok = 0;
        if (!ib)
                return ib;
        LAFS_BUG(!test_bit(B_InoIdx, &ib->b.flags), &ib->b);
        if (spin_is_locked(&ib->b.inode->i_data.private_lock))
                ok = 1;
        if (spin_is_locked(&lafs_hash_lock))
                ok = 1;
        if (spin_is_locked(&fs_from_inode(ib->b.inode)->lock))
                ok = 1;
        LAFS_BUG(!ok, &ib->b);

        if (atomic_inc_return(&ib->b.refcnt) == 1) {
                /* First reference to an InoIdx block implies a reference
                 * to the dblock.
                 * If two different threads come here under different locks,
                 * one could exit before the ref on the dblock is taken.
                 * However as each thread must own an indirect reference to
                 * be here, the last indirect ref to the dblock cannot
                 * disappear before all threads have exited this function,
                 * by which time this direct ref will be active.
                 */
                struct datablock *db;
                db = getdref(LAFSI(ib->b.inode)->dblock, MKREF(iblock));
                LAFS_BUG(db == NULL, &ib->b);
        }
        return ib;
}

struct indexblock *
lafs_iblock_get(struct inode *ino, faddr_t addr, int depth, paddr_t phys, REFARG)
{
        /* Find the index block with this inode/depth/address.
         * If it isn't in the cache, create it with the given
         * phys address.
         */
        struct indexblock *ib = ihash_lookup(ino, addr, depth, NULL, REF);
        struct indexblock *new;
        if (ib)
                return ib;

        if (!phys)
                return NULL;

        new = lafs_iblock_alloc(fs_from_inode(ino), GFP_KERNEL, 1, REF);
        if (!new)
                return NULL;
        new->b.fileaddr = addr;
        new->b.inode = ino;
        new->depth = depth;
        new->b.physaddr = phys;
        set_bit(B_PhysValid, &new->b.flags);

        ib = ihash_lookup(ino, addr, depth, new, REF);
        if (ib != new)
                lafs_iblock_free(new);
        return ib;
}
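
/* Usage sketch (illustration only): to find-or-create the depth-1
 * index block covering file address 'addr', with on-disk location
 * 'phys' if it is not cached.  MKREF(example) is a hypothetical
 * reference tag for the ref-tracking debug build:
 *
 *	struct indexblock *ib =
 *		lafs_iblock_get(inode, addr, 1, phys, MKREF(example));
 *	if (!ib)
 *		return -ENOENT;	// not cached and no physical address
 *	...use ib...
 *	putiref(ib, MKREF(example));
 *
 * Note the allocate-then-insert pattern above: a freshly allocated
 * block is handed to ihash_lookup(), which either inserts it or
 * returns the block that won the race, in which case ours is freed.
 */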

void lafs_hash_iblock(struct indexblock *ib)
{
        /* This block was an InoIdx block but has just become a real
         * index block, so hash it.  There cannot be a block with this
         * address already as we haven't increased the inode depth yet.
         * Alternatively this block was recently split off another,
         * and has now been incorporated, so a lookup might need
         * to find it.  Either way, it had better be in the hash table.
         */
        struct hlist_head *head =
                &hash_table[ihash(ib->b.inode, ib->b.fileaddr, ib->depth)];
        spin_lock(&lafs_hash_lock);
        LAFS_BUG(!hlist_unhashed(&ib->hash), &ib->b);
        hlist_add_head(&ib->hash, head);
        spin_unlock(&lafs_hash_lock);
}

void lafs_unhash_iblock(struct indexblock *ib)
{
        /* This block has been emptied and deleted and is no longer
         * wanted in the hash tables.
         */
        spin_lock(&lafs_hash_lock);
        LAFS_BUG(hlist_unhashed(&ib->hash), &ib->b);
        hlist_del_init(&ib->hash);
        spin_unlock(&lafs_hash_lock);
}

/* Adopt blk into parent if possible.
 */
static void
block_adopt(struct block *blk, struct indexblock *parent)
{
        struct address_space *as;

        if (parent == NULL) {
                LAFS_BUG(!test_bit(B_Root, &blk->flags), blk);
                return;
        }

        if (blk->parent) {
                LAFS_BUG(blk->parent != parent, blk);
                return;
        }

        LAFS_BUG(!test_bit(B_Valid, &parent->b.flags), &parent->b);

        as = &blk->inode->i_data;

        spin_lock(&as->private_lock);
        if (blk->parent) {
                if (blk->parent != parent) {
                        printk("blk = %s\n", strblk(blk));
                        printk("blk->p = %s\n", strblk(&blk->parent->b));
                        printk("parent = %s\n", strblk(&parent->b));
                        lafs_dump_tree();
                }
                LAFS_BUG(blk->parent != parent, blk);
        } else {
                if (test_bit(B_EmptyIndex, &parent->b.flags)) {
                        LAFS_BUG(!test_bit(B_Index, &parent->b.flags), &parent->b);
                        LAFS_BUG(test_bit(B_InoIdx, &parent->b.flags), &parent->b);
                        LAFS_BUG(parent->b.fileaddr != parent->b.parent->b.fileaddr, &parent->b);

                        clear_bit(B_EmptyIndex, &parent->b.flags);
                }
                LAFS_BUG(parent == iblk(blk), blk);
                blk->parent = parent;
                getiref(parent, MKREF(child));
                /* Remove from the per-inode free list */
                spin_lock(&lafs_hash_lock);
                if (!test_bit(B_InoIdx, &blk->flags))
                        list_move(&blk->siblings, &parent->children);
                else {
                        /* Remove from ->free_index list */
                        list_del_init(&blk->siblings);
                        /* Having set ->parent, we take a ref on the dblock.
                         * The dblock must exist because we can only reach
                         * here from lafs_make_iblock, which can only be
                         * called while holding a ref on the dblock.
                         */
                        (void)getdref(LAFSI(blk->inode)->dblock, MKREF(pdblock));
                }
                spin_unlock(&lafs_hash_lock);
        }
        spin_unlock(&as->private_lock);
}

static int __must_check
find_block(struct datablock *b, int adopt, int async);

static int __must_check
setparent(struct datablock *blk, int async)
{
        int err = 0;
        if (blk->b.parent == NULL && !test_bit(B_Root, &blk->b.flags))
                err = find_block(blk, 1, async);
        return err;
}

int __must_check
lafs_setparent(struct datablock *blk)
{
        return setparent(blk, 0);
}

void lafs_pin_block_ph(struct block *b, int ph)
{
        struct fs *fs = fs_from_inode(b->inode);
        struct block *p;
        struct inode *ino;

        LAFS_BUG(b->parent == NULL && !test_bit(B_Root, &b->flags), b);
        if (!test_bit(B_Realloc, &b->flags) &&
            LAFSI(b->inode)->type != TypeSegmentMap)
                LAFS_BUG(!test_phase_locked(fs), b);

        ino = b->inode;
        spin_lock(&ino->i_data.private_lock);
        for (p = b ;
             p && !test_bit(B_Pinned, &p->flags) ;
             p = &(p->parent)->b) {
                set_phase(p, ph);
                if (test_bit(B_InoIdx, &p->flags)) {
                        struct datablock *db = LAFSI(p->inode)->dblock;

                        spin_unlock(&ino->i_data.private_lock);
                        lafs_inode_checkpin(p->inode);
                        ino = db->b.inode;
                        spin_lock(&ino->i_data.private_lock);
                }
        }
        spin_unlock(&ino->i_data.private_lock);
        lafs_refile(b, 0);
}

static void pin_all_children(struct indexblock *ib)
{
        /* If we cannot get new credits for phase_flip, we need to
         * pin all dirty data-block children so that they get
         * written and don't remain to require a phase_flip
         * on the next checkpoint (at which point there will be
         * no credit left).
         * So if any dirty child is found, set its phase.  We don't
         * need PinPending as it is already Dirty.  All other
         * requirements for pinning will already be met.
         */
        struct block *b;
        struct indexblock *start = NULL;
        int depth = 0;
        struct fs *fs = fs_from_inode(ib->b.inode);
        int ph = fs->phase;

        /* Locking:
         * This is called with an IOLock on ib, so it is unlikely
         * that children will be added or removed(?), but we really
         * should hold the inode private_lock anyway.
         */
        getiref(ib, MKREF(pinall));
        b = NULL;
restart:
        /* We hold a reference on ib, and on 'start' if non-NULL */
        spin_lock(&ib->b.inode->i_data.private_lock);
        b = list_prepare_entry(b, &ib->children, siblings);
        list_for_each_entry_continue(b, &ib->children, siblings) {
                /* FIXME I don't descend through inode blocks to the file below! */
                if (test_bit(B_Index, &b->flags)) {
                        /* restart down */
                        depth++;
                        getref_locked(b, MKREF(pinall));
                        spin_unlock(&ib->b.inode->i_data.private_lock);
                        putiref(ib, MKREF(pinall));
                        ib = iblk(b);
                        b = NULL;
                        goto restart;
                } else if (test_bit(B_Dirty, &b->flags) &&
                           !test_bit(B_Pinned, &b->flags)) {
                        getref_locked(b, MKREF(pinall));
                        spin_unlock(&ib->b.inode->i_data.private_lock);
                        set_phase(b, ph);
                        putref(b, MKREF(pinall));
                        /* Slightly safer to restart this level */
                        b = NULL;
                        goto restart;
                }
        }
        spin_unlock(&ib->b.inode->i_data.private_lock);
        putiref(start, MKREF(pinall));
        if (depth) {
                depth--;
                start = ib;
                ib = getiref(ib->b.parent, MKREF(pinall));
                b = &start->b;
                goto restart;
        }
        putiref(ib, MKREF(pinall));
}

static void flip_phase(struct block *b)
{
        struct indexblock *p;
        int oldphase = !!test_bit(B_Phase1, &b->flags);

        LAFS_BUG(!test_bit(B_Pinned, &b->flags), b);
        if (oldphase)
                clear_bit(B_Phase1, &b->flags);
        else
                set_bit(B_Phase1, &b->flags);

        p = b->parent;

        if (p) {
                atomic_inc(&p->pincnt[1-oldphase]);
                atomic_dec(&p->pincnt[oldphase]);
        } else
                LAFS_BUG(!test_bit(B_Root, &b->flags), b);

        /* move 'next' credits to 'here' */
        if (!test_bit(B_Credit, &b->flags) &&
            test_and_clear_bit(B_NCredit, &b->flags))
                set_bit(B_Credit, &b->flags);

        if (!test_bit(B_ICredit, &b->flags) &&
            test_and_clear_bit(B_NICredit, &b->flags))
                set_bit(B_ICredit, &b->flags);
}

/* When the pinning of a block needs to be carried across a
 * checkpoint, we need to 'flip' the phase.
 * This only applies to blocks that can be pinned by a block that
 * may not be written until the next phase.
 * This includes any index block (as it may have some children in one
 * phase and some in the next) and any space-accounting block,
 * for the same reason.
 * So index blocks need to flip, and use lafs_phase_flip.
 * TypeSegmentMap and TypeQuota also need to flip, and use lafs_flip_dblock.
 * TypeInodeFile blocks don't need to be phase-flipped, though their InoIdx
 * block might.  InodeFile blocks are only pinned by metadata transactions,
 * which happen inside a checkpoint lock.
 */
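
/* A hypothetical checkpoint-time dispatch, just to illustrate how the
 * two flip routines divide the work described above (the real caller
 * is the checkpoint code, which is not in this file):
 *
 *	if (test_bit(B_Index, &b->flags))
 *		lafs_phase_flip(fs, iblk(b));	// index blocks
 *	else
 *		lafs_flip_dblock(dblk(b));	// SegmentMap/Quota blocks
 */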

void lafs_flip_dblock(struct datablock *db)
{
        /* This is an accounting block (SegmentMap or Quota)
         * which we need to write out after ->phase has changed
         * in the tail of the checkpoint.
         * We always flip the phase and reallocate from AccountSpace.
         * If all references get dropped, it will then get unpinned
         * before the next phase finishes - we don't unpin here
         * (unlike lafs_phase_flip for index blocks).
         */
        flip_phase(&db->b);
        lafs_prealloc(&db->b, AccountSpace);
        /* Parent might need to be on a leaf list now */
        lafs_refile(&db->b.parent->b, 0);
}

void lafs_phase_flip(struct fs *fs, struct indexblock *ib)
{
        /* We are performing a checkpoint; this block has been written
         * out and now needs to be flipped into the next phase.
         *
         * This involves:
         *  - processing all uninc_next blocks into uninc_table.
         *  - adjusting counts on the parent.
         *  - moving credits from 'next' to 'this' phase.
         *  - updating block counts to include phase-delayed updates.
         */
        int oldphase = !!test_bit(B_Phase1, &ib->b.flags);
        struct block *ulist;

        dprintk("FLIP %s\n", strblk(&ib->b));

        LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
        LAFS_BUG(!test_bit(B_Index, &ib->b.flags), &ib->b);
        LAFS_BUG(ib->b.parent == NULL && !test_bit(B_Root, &ib->b.flags), &ib->b);
        LAFS_BUG(ib->uninc_table.pending_cnt, &ib->b);
        LAFS_BUG(atomic_read(&ib->pincnt[oldphase]), &ib->b);
        LAFS_BUG(!test_bit(B_IOLock, &ib->b.flags), &ib->b);

        /* If this is the only reference, and the pincnts are zero
         * and all relevant flags are clear, then don't flip
         * phase but unpin.
         */
        if (test_bit(B_Pinned, &ib->b.flags) &&
            !test_bit(B_Dirty, &ib->b.flags) &&
            !test_bit(B_Realloc, &ib->b.flags) &&
            atomic_read(&ib->b.refcnt) == 1 && /* This is us */

            ib->uninc_table.pending_cnt == 0 &&
            ib->uninc == NULL &&
            ib->uninc_next == NULL &&
            atomic_read(&ib->pincnt[0]) == 0 &&
            atomic_read(&ib->pincnt[1]) == 0 &&

            (!test_bit(B_InoIdx, &ib->b.flags) ||
             !(test_bit(B_PinPending, &LAFSI(ib->b.inode)->dblock->b.flags)
               || test_bit(B_Dirty, &LAFSI(ib->b.inode)->dblock->b.flags)
               || test_bit(B_Realloc, &LAFSI(ib->b.inode)->dblock->b.flags)))
                ) {
                if (test_and_clear_bit(B_Pinned, &ib->b.flags)) {
                        if (!test_bit(B_Root, &ib->b.flags)) {
                                atomic_dec(&ib->b.parent->pincnt[oldphase]);
                                lafs_refile(&ib->b.parent->b, 0);
                        }
                        lafs_inode_checkpin(ib->b.inode);
                }
                if (test_bit(B_InoIdx, &ib->b.flags))
                        /* maybe the data block should go on a leaf list now */
                        lafs_refile(&LAFSI(ib->b.inode)->dblock->b, 0);
                lafs_iounlock_block(&ib->b);
                return;
        }

        flip_phase(&ib->b);

        if (test_bit(B_InoIdx, &ib->b.flags)) {
                struct lafs_inode *lai = LAFSI(ib->b.inode);
                struct datablock *db = lai->dblock;
                struct inode *ino = db->b.inode;

                spin_lock(&ino->i_data.private_lock);
                lai->cblocks += lai->pblocks;
                lai->pblocks = 0;
                lai->ciblocks += lai->piblocks;
                lai->piblocks = 0;
                if (lai->type == TypeInodeFile) {
                        /* Root of filesystem */
                        lai->md.fs.cblocks_used +=
                                lai->md.fs.pblocks_used;
                        lai->md.fs.pblocks_used = 0;
                }
                spin_unlock(&ino->i_data.private_lock);

                /* maybe the data block needs to be on a leaf list */
                lafs_refile(&db->b, 0);
        }

        if (lafs_prealloc(&ib->b, ReleaseSpace) < 0) {
                /* Couldn't get all N*Credit, so
                 * pin all children to this phase, so they
                 * won't be needed.
                 */
                pin_all_children(ib);
        }

        spin_lock(&ib->b.inode->i_data.private_lock);
        ulist = ib->uninc_next;
        ib->uninc_next = NULL;
        spin_unlock(&ib->b.inode->i_data.private_lock);

        lafs_iounlock_block(&ib->b);

        if (ulist)
                lafs_dirty_iblock(ib);
        while (ulist) {
                struct block *b2 = ulist;
                ulist = b2->chain;
                b2->chain = NULL;
                clear_bit(B_Uninc, &b2->flags);
                while (lafs_add_block_address(fs, b2) == 0)
                        /* We had to incorporate b2->parent, which might
                         * have split the parent; but as that will have made
                         * it dirty, there is nothing extra that we need to
                         * do here.
                         */
                        ;
                putref(b2, MKREF(uninc));
        }
        LAFS_BUG(ib->uninc_next, &ib->b);

        lafs_refile(&ib->b, 0);
        if (ib->b.parent)
                /* Parent may need to be attached to a phase_leafs now */
                lafs_refile(&ib->b.parent->b, 0);
}

/*
 * lafs_refile is called whenever we finish doing stuff
 * to a block.  It checks the state of the block and
 * makes sure the various lists etc. are correct.
 * When finished with a block we often want to drop a reference,
 * and this functionality is included in refile so that the
 * decref and associated refiling can be done under a lock.
 *
 * Key issues that lafs_refile deals with are:
 *   Make sure ->lru is on the correct list.
 *   Remove the ->parent link if no longer needed.
 *   Remove B_Lock if no longer needed.
 *   Trigger a phase change when appropriate.
 *   Remove the dir parent link if no longer needed.
 *
 * Fleshing these out a bit:
 *   possible lru lists are:
 *      phase_leafs[phase] - for dirty/pinned blocks with no pinned children
 *      clean_leafs - for blocks that are being relocated for cleaning
 *                       and that have no pinned children
 *   The parent link is only needed if this block has a refcount or one
 *     of B_Pinned, B_Dirty, B_Uninc.
 *   B_Lock is only needed if one of
 *      B_Dirty
 *    or refcount,
 *  though a refcount on a pinned datablock is only significant
 *         while the current phase is locked.
 *
 *  a phase change can (and should) happen for index blocks
 *  in the 'other' phase that are
 *    not Dirty, not Alloc, pincnt[oldphase]==0, uninc-table empty
 *
 * Refiling one block may cause a change in another block which requires
 * a recursive refile.  Naturally we unroll the tail-recursion, but
 * we need to be sure there are a limited number of tails.
 * The causes of recursion are:
 *   A datablock which no longer has a PagePrivate page holds a ref
 *      on block0 of that page, which must be dropped on last put.
 *   When any block becomes unpinned, we must refile the parent.
 *   When we clear ->parent we must deref the parent.
 *   When an InoIdx block drops ->parent it is dropped from inode->iblock,
 *      so we need to drop the implied refcnt on ->dblock.
 * So there can only be 2 blocks to recurse on to:
 *  a block0 or dblock, and a parent.
 * A block pointing to a block0 never has a parent, and an iblock has the same
 * parent as the dblock, so there can never be more than 2 outstanding blocks.
 */
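
/* The unrolled recursion in lafs_refile() therefore has this shape
 * (a sketch only; see the function below for the real logic):
 *
 *	while (b) {
 *		...refile b, recording at most one 'next' block
 *		   (a block0 or dblock) and one 'next_parent'...
 *		b = next ? next : next_parent;
 *		dec = 1;	// any follow-on block was ref'ed for us
 *	}
 */
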
int lafs_is_leaf(struct block *b, int ph)
{
        /* This is pinned and not iolocked.  Should it be on a leaf
         * list?
         */
        struct inode *myi;
        struct lafs_inode *lai;
        int rv = 1;

        if (!test_bit(B_Pinned, &b->flags) ||
            test_bit(B_Writeback, &b->flags))
                return 0;

        if (test_bit(B_Index, &b->flags)) {
                if (ph >= 0)
                        return atomic_read(&iblk(b)->pincnt[ph]) == 0;
                return (atomic_read(&iblk(b)->pincnt[0]) +
                        atomic_read(&iblk(b)->pincnt[1])) == 0;
        }

        /* This is a data block */
        myi = rcu_my_inode(dblk(b));
        if (!myi) {
                /* Non-inode data blocks are always leafs */
                return 1;
        }
        lai = LAFSI(myi);
        spin_lock(&lai->vfs_inode.i_data.private_lock);
        if (ph < 0)
                ph = !!test_bit(B_Phase1, &b->flags);
        if (lai->iblock &&
            test_bit(B_Pinned, &lai->iblock->b.flags) &&
            !!test_bit(B_Phase1, &lai->iblock->b.flags) == ph)
                /* iblock still pinned in the same phase, so
                 * this block isn't a leaf
                 */
                rv = 0;

        spin_unlock(&lai->vfs_inode.i_data.private_lock);
        rcu_iput(myi);
        return rv;
}

/*
 * This is for debugging; it is racy and probably only safe on UP.
 */
static void check_consistency(struct block *b)
{
        /* sanity tests.
         * 1/ make sure pincnt is right
         */
        if (test_bit(B_Index, &b->flags)) {
                int c[2];
                struct block *cb;
                c[0] = c[1] = 0;
                list_for_each_entry(cb, &iblk(b)->children, siblings) {
                        struct inode *myi = NULL;
                        if (test_bit(B_Pinned, &cb->flags)) {
                                int pp = !!test_bit(B_Phase1, &cb->flags);
                                c[pp]++;
                        }
                        /* InoIdx block might be pinned but is not a direct
                         * child */
                        if (!test_bit(B_Index, &cb->flags) &&
                            (myi = rcu_my_inode(dblk(cb))) != NULL &&
                            LAFSI(myi)->iblock &&
                            test_bit(B_Pinned, &LAFSI(myi)->iblock->b.flags)) {
                                int pp = !!test_bit(B_Phase1,
                                                    &LAFSI(myi)->iblock->b.flags);
                                c[pp]++;
                        }
                        rcu_iput(myi);
                }
                if (c[0] != atomic_read(&iblk(b)->pincnt[0]) ||
                    c[1] != atomic_read(&iblk(b)->pincnt[1])) {
                        printk("%d %d\n", c[0], c[1]);
                        lafs_dump_tree();
                        LAFS_BUG(1, b);
                }
        }

        if (b->parent) {
                struct indexblock *ib = b->parent;
                int c[2];
                struct block *cb;
                c[0] = c[1] = 0;
                list_for_each_entry(cb, &ib->children, siblings) {
                        struct inode *myi = NULL;
                        if (test_bit(B_Pinned, &cb->flags)) {
                                int pp = !!test_bit(B_Phase1, &cb->flags);
                                c[pp]++;
                        }
                        /* InoIdx block might be pinned but is not a direct
                         * child */
                        if (!test_bit(B_Index, &cb->flags) &&
                            (myi = rcu_my_inode(dblk(cb))) != NULL &&
                            LAFSI(myi)->iblock &&
                            test_bit(B_Pinned, &LAFSI(myi)->iblock->b.flags)) {
                                int pp = !!test_bit(B_Phase1,
                                                    &LAFSI(myi)->iblock->b.flags);
                                c[pp]++;
                        }
                        rcu_iput(myi);
                }
                if (c[0] != atomic_read(&ib->pincnt[0]) ||
                    c[1] != atomic_read(&ib->pincnt[1])) {
                        printk("badcnt %d %d\n", c[0], c[1]);
                        LAFS_BUG(1, &ib->b);
                }
        }
}

static void set_lru(struct block *b)
{
        struct fs *fs;
        int ph = !!test_bit(B_Phase1, &b->flags);

        if (!test_bit(B_OnFree, &b->flags) && !list_empty_careful(&b->lru))
                return;
        if (test_bit(B_IOLock, &b->flags) || !lafs_is_leaf(b, ph))
                return;

        /* This is close enough to a leaf that we should put it on a list.
         * If we raced and it isn't, then it will be found and removed.
         */
        if (test_bit(B_OnFree, &b->flags)) {
                spin_lock(&lafs_hash_lock);
                if (test_and_clear_bit(B_OnFree, &b->flags)) {
                        list_del_init(&b->lru);
                        freelist.freecnt--;
                }
                spin_unlock(&lafs_hash_lock);
        }
        fs = fs_from_inode(b->inode);
        spin_lock(&fs->lock);
        if (list_empty(&b->lru)) {
                if (test_bit(B_Realloc, &b->flags))
                        list_add(&b->lru, &fs->clean_leafs);
                else {
                        ph = !!test_bit(B_Phase1, &b->flags);
                        list_add(&b->lru, &fs->phase_leafs[ph]);
                }
        }
        spin_unlock(&fs->lock);
        /* allow lafs_iolock_block-empty to complete */
        lafs_io_wake(b);
}

void lafs_refile(struct block *b, int dec)
{
        struct block *next = NULL, *next_parent = NULL;
        struct fs *fs = NULL;
        u64 physref = 0;

        if (!b)
                return;

        fs = fs_from_inode(b->inode);
        check_consistency(b);

        /* To (mostly) avoid recursion, we have a loop which may
         * walk up to the parent.
         * The main reason for holding lafs_hash_lock is that it protects
         * ->lru - i.e. all the lists.  FIXME that should be fs->lock.
         */
        LAFS_BUG(atomic_read(&b->refcnt) == 0, b);
        while (b) {
                int ph;
                int free_me = 0;
                struct inode *checkpin = NULL;
                struct inode *myi = NULL;
                struct datablock *db;

                set_lru(b);

                if (test_bit(B_Pinned, &b->flags) &&
                    !test_bit(B_Index, &b->flags) &&
                    !test_bit(B_PinPending, &b->flags) &&
                    !test_bit(B_Dirty, &b->flags) &&
                    !test_bit(B_Realloc, &b->flags)
                        ) {
                        /* Don't need to be Pinned any more */
                        /* FIXME do I need to lock access to ->parent? */
                        lafs_checkpoint_lock(fs);
                        ph = !!test_bit(B_Phase1, &b->flags);
                        if (test_and_clear_bit(B_Pinned, &b->flags)) {
                                if (test_bit(B_Dirty, &b->flags) ||
                                    test_bit(B_Realloc, &b->flags) ||
                                    test_bit(B_PinPending, &b->flags))
                                        /* lost a race */
                                        set_phase(b, ph);
                                else if (!test_bit(B_Root, &b->flags)) {
                                        struct block *nb;
                                        atomic_dec(&b->parent->pincnt[ph]);
                                        nb = &b->parent->b;
                                        if (next_parent != nb) {
                                                LAFS_BUG(next_parent, b);
                                                next_parent = nb;
                                                atomic_inc(&nb->refcnt);
                                        }
                                }
                        }
                        lafs_checkpoint_unlock(fs);
                }

                if (dec && atomic_dec_and_lock(&b->refcnt,
                                               &b->inode->i_data.private_lock)) {
                        /* PinPending disappears with the last non-lru
                         * reference (or possibly at other times).
                         */
                        clear_bit(B_PinPending, &b->flags);

                        ph = !!test_bit(B_Phase1, &b->flags);
                        /* See if we still need to be pinned */
                        if (test_bit(B_Pinned, &b->flags) &&
                            !test_bit(B_Dirty, &b->flags) &&
                            !test_bit(B_Realloc, &b->flags)) {
                                /* Don't need to be Pinned any more */
                                if (test_and_clear_bit(B_Pinned, &b->flags)) {
                                        if (!list_empty_careful(&b->lru)) {
                                                spin_lock(&fs->lock);
                                                list_del_init(&b->lru);
                                                spin_unlock(&fs->lock);
                                        }
                                        if (test_bit(B_InoIdx, &b->flags) &&
                                            b->inode->i_nlink)
                                                checkpin = b->inode;
                                        if (!test_bit(B_Root, &b->flags)) {
                                                struct block *nb;
                                                atomic_dec(&b->parent->pincnt[ph]);
                                                if (test_bit(B_InoIdx, &b->flags))
                                                        nb = &LAFSI(b->inode)->dblock->b;
                                                else
                                                        nb = &b->parent->b;
                                                if (next_parent != nb) {
                                                        LAFS_BUG(next_parent, b);
                                                        next_parent = nb;
                                                        atomic_inc(&nb->refcnt);
                                                }
                                        }
                                }
                        }

                        /* check the ->parent link */
                        if (b->parent &&
                            !(b->flags & (
                                      (1<<B_Pinned) |
                                      (1<<B_Uninc) |
                                      (1<<B_Realloc) |
                                      (1<<B_Dirty)))
                                ) {
                                int credits;
                                /* Don't need ->parent any more */
                                if (next_parent == NULL)
                                        next_parent = &b->parent->b;
                                else if (next_parent == &b->parent->b ||
                                         next_parent->parent == b->parent) {
                                        if (atomic_dec_and_test(&b->parent->b.refcnt))
                                                LAFS_BUG(1, b);
                                } else
                                        LAFS_BUG(1, b);

                                del_ref(&b->parent->b, MKREF(child),
                                        __FILE__, __LINE__);
                                b->parent = NULL;

                                if (test_bit(B_InoIdx, &b->flags)) {
                                        /* While an InoIdx has a parent we hold
                                         * a count on the dblock.  Now that we
                                         * have dropped one, we must drop the
                                         * other.
                                         */
                                        struct datablock *db = LAFSI(b->inode)->dblock;
                                        if (&db->b.parent->b != next_parent &&
                                            &db->b != next_parent) {
                                                printk("db    = %s\n", strblk(&db->b));
                                                printk("dp->p = %s\n", strblk(&db->b.parent->b));
                                                printk("np    = %s\n", strblk(next_parent));
                                                printk("b     = %s\n", strblk(b));
                                        }
                                        BUG_ON(&db->b.parent->b != next_parent &&
                                               &db->b != next_parent);
                                        if (atomic_dec_and_test(&next_parent->refcnt))
                                                LAFS_BUG(1, b);
                                        next_parent = &db->b;
                                        del_ref(&db->b, MKREF(pdblock), __FILE__, __LINE__);
                                }
                                if (test_and_clear_bit(B_SegRef, &b->flags))
                                        physref = b->physaddr;

                                LAFS_BUG(test_bit(B_PrimaryRef, &b->flags), b);
                                list_del_init(&b->siblings);

                                if (test_bit(B_Index, &b->flags))
                                        list_add(&b->siblings,
                                                 &LAFSI(b->inode)->free_index);

                                if (test_and_clear_bit(B_Prealloc, &b->flags) &&
                                    b->physaddr == 0)
                                        lafs_summary_allocate(fs, b->inode, -1);
                                credits = 0;
                                if (test_and_clear_bit(B_Credit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_ICredit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_NCredit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_NICredit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_UnincCredit, &b->flags))
                                        credits++;
                                lafs_space_return(fs, credits);
                        }
                        if (test_bit(B_Index, &b->flags) &&
                            !(b->flags & (
                                      (1<<B_Pinned) |
                                      (1<<B_Uninc) |
                                      (1<<B_Root) |
                                      (1<<B_Realloc) |
                                      (1<<B_Dirty)))
                                ) {
                                if (b->parent != NULL)
                                        /* Could this be uninc - where
                                         * is the refcnt? */
                                        printk("Problem: %s\n", strblk(b));
                                LAFS_BUG(b->parent != NULL, b);
                                /* put it on the free lru */
                                spin_lock(&lafs_hash_lock);
                                if (!test_and_set_bit(B_OnFree, &b->flags)) {
                                        LAFS_BUG(!list_empty(&b->lru), b);
                                        list_move(&b->lru, &freelist.lru);
                                        freelist.freecnt++;
                                }
                                spin_unlock(&lafs_hash_lock);
                        }
                        /* The last reference to a dblock with no page
                         * requires special handling.
                         * The first block on a page must be freed;
                         * the other blocks hold a reference on that
                         * first block which must be dropped.
                         * However it is possible that a new reference was
                         * taken via _leafs.  If so, we have now cleared
                         * Pinned, so get_flushable will immediately put
                         * this again, so we should leave this cleanup
                         * until later.  Unlike the other tests, this one
                         * isn't idempotent, so we need the check on refcnt.
                         */
                        db = dblk(b);
                        if (!test_bit(B_Index, &b->flags) &&
                            !PagePrivate(db->page) &&
                            atomic_read(&b->refcnt) == 0) {
                                int bits = (PAGE_SHIFT -
                                            b->inode->i_blkbits);
                                int mask = (1<<bits) - 1;
                                int bnum = b->fileaddr & mask;

                                LAFS_BUG(next, next);
                                if (bnum) {
                                        next = &db[-bnum].b;
                                        del_ref(next, "lafs_release_0",
                                                __FILE__, __LINE__);
                                } else {
                                        free_me = 1;
                                        put_page(db->page);
                                }
                        }
                        /* The last reference to an iblock requires that we
                         * deref the dblock.  We don't need to re-check
                         * refcnt here as a racing getiref_locked will itself
                         * take an extra dblock reference.
                         */
                        if (test_bit(B_InoIdx, &b->flags)) {
                                LAFS_BUG(LAFSI(b->inode)->iblock
                                         != iblk(b), b);
                                LAFS_BUG(next, next);
                                next = &LAFSI(b->inode)->dblock->b;
                                del_ref(next, MKREF(iblock),
                                        __FILE__, __LINE__);
                        }

                        /* Free a delayed-release inode */
                        if (!test_bit(B_Index, &b->flags) &&
                            (myi = rcu_my_inode(dblk(b))) != NULL &&
                            (!PagePrivate(dblk(b)->page) ||
                             test_bit(I_Destroyed, &LAFSI(myi)->iflags))) {
                                dblk(b)->my_inode = NULL;
                                LAFSI(myi)->dblock = NULL;
                                /* Don't need lafs_hash_lock to clear iblock,
                                 * as new refs on iblock are only taken while
                                 * holding dblock, and we know dblock has no
                                 * references.  You would need an iget to get
                                 * a ref on dblock now, and because of
                                 * I_Destroyed, we know that isn't possible.
                                 */
                                LAFSI(myi)->iblock = NULL;
                                spin_unlock(&b->inode->i_data.private_lock);
                                if (test_bit(I_Destroyed, &LAFSI(myi)->iflags))
                                        lafs_destroy_inode(myi);
                        } else
                                spin_unlock(&b->inode->i_data.private_lock);
                }
                rcu_iput(myi);

                if (physref) {
                        lafs_seg_deref(fs, physref, 0);
                        physref = 0;
                }

                if (checkpin)
                        lafs_inode_checkpin(checkpin);

                if (free_me)
                        kfree(dblk(b));
                b = NULL;
                if (next) {
                        b = next;
                        next = NULL;
                } else if (next_parent) {
                        b = next_parent;
                        next_parent = NULL;
                }
                dec = 1;
        }
}

/*
 * Create (if it doesn't already exist) the 'iblock' for an inode.
 * This is a shadow of the dblock but comes into its own if/when
 * the inode's indexing tree needs to grow.
 * Return a counted reference to the index block.
 * NOTE: we must hold a reference to ->dblock when we call
 * lafs_make_iblock.
 */
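
/* Call-pattern sketch (illustration only): the dblock reference must
 * be held across the call.  MKREF(example) is a hypothetical
 * reference tag:
 *
 *	// caller already holds a counted ref on LAFSI(ino)->dblock
 *	ib = lafs_make_iblock(ino, 1, 0, MKREF(example));
 *	if (IS_ERR(ib))
 *		return PTR_ERR(ib);
 *	...use ib...
 *	putiref(ib, MKREF(example));
 */
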
struct indexblock * __must_check
lafs_make_iblock(struct inode *ino, int adopt, int async, REFARG)
{
        struct lafs_inode *lai = LAFSI(ino);
        /* ib must start as NULL: on the first pass through 'retry',
         * neither branch may set it.
         */
        struct indexblock *ib = NULL, *new = NULL;
        struct fs *fs = fs_from_inode(ino);
        int err = 0;

        dprintk("MAKE_IBLOCK %d\n", (int)ino->i_ino);

        BUG_ON(lai->dblock == NULL);
        BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);
retry:
        spin_lock(&lafs_hash_lock);
        if (lai->iblock)
                ib = getiref_locked_needsync(lai->iblock, REF);
        else if (new) {
                lai->iblock = new;
                ib = new;
                new = NULL;
                (void)getdref(lai->dblock, MKREF(iblock));
        }
        spin_unlock(&lafs_hash_lock);
        if (new)
                lafs_iblock_free(new);
        if (ib) {
                sync_ref(&ib->b);
                if (adopt) {
                        err = setparent(lai->dblock, async);
                        if (err == 0)
                                block_adopt(&ib->b, lai->dblock->b.parent);
                }
                if (err == 0)
                        return ib;
                putiref(ib, REF);
                return ERR_PTR(err);
        }

        new = lafs_iblock_alloc(fs, GFP_KERNEL, 0, REF);
        if (!new)
                return ERR_PTR(-ENOMEM);

        new->b.fileaddr = 0;
        new->b.physaddr = lai->dblock->b.physaddr;
        if (test_bit(B_PhysValid, &lai->dblock->b.flags))
                set_bit(B_PhysValid, &new->b.flags);
        LAFS_BUG(!test_bit(B_Valid, &lai->dblock->b.flags), &lai->dblock->b);
        set_bit(B_Valid, &new->b.flags);
        new->b.inode = ino;
        new->depth = lai->depth;
        /* Note: this doesn't get hashed until the index
         * tree grows and this block is disconnected from
         * the inode.
         */
        set_bit(B_InoIdx, &new->b.flags);
        if (test_bit(B_Root, &lai->dblock->b.flags))
                set_bit(B_Root, &new->b.flags);
        new->b.parent = NULL;

        goto retry;
}
1216
1217 static u64
1218 leaf_lookup(void *bf, int len, u32 startaddr, u32 target, u32 *nextp)
1219 {
1220         /* buf starts with a 2byte header
1221          * if 1, then 6byte littleending indirect entries.
1222          * if 2, then 12byte extent entries
1223          */
1224         unsigned char *buf = bf;
1225         u64 p;
1226         unsigned char *cp;
1227         int elen;
1228         int hi, lo;
1229         u32 addr, next;
1230
1231         if (nextp)
1232                 *nextp = 0xffffffff;
1233
1234         if (buf[1])
1235                 return 0;
1236         switch (buf[0]) {
1237         default:
1238                 p = 0;
1239                 break;
1240
1241         case IBLK_INDIRECT: /* indirect */
1242                 dprintk("indirect lookup for %lu from %lu, len %d\n",
1243                         (unsigned long)target, (unsigned long)startaddr, len);
1244                 len -= 2;
1245                 buf += 2;
1246
1247                 if (target < startaddr)
1248                         return 0;
1249
1250                 next = target;
1251                 target -= startaddr;
1252
1253                 /* Need both tests as target could be v.large and
1254                  * multiply by 6 could overflow
1255                  */
1256                 if (target > len ||
1257                     target*6 + 6 > len)
1258                         return 0;
1259                 buf += target*6;
1260                 p = decode48(buf);
1261                 if (nextp) {
1262                         /* find the next allocated block */
1263                         u64 p2;
1264
1265                         len -= target*6;
1266                         len -= 6;
1267                         next++;
1268                         *nextp = 0xffffffff;
1269                         while (len >= 6) {
1270                                 p2 = decode48(buf);
1271                                 if (p2) {
1272                                         *nextp = next;
1273                                         break;
1274                                 }
1275                                 len -= 6;
1276                                 next++;
1277                         }
1278                 }
1279                 break;
1280
1281         case IBLK_EXTENT: /* extent */
1282                 /* 12-byte records: {6-byte device address, 2-byte length,
1283                  * 4-byte file address} */
1284
1285                 len -= 2;
1286                 buf += 2;
1287                 lo = 0;
1288                 hi = len/12;
1289                 while (hi > lo+1) {
1290                         int mid = (lo+hi)/2;
1291                         cp = buf;
1292                         cp += mid*12 + 6;
1293
1294                         elen = decode16(cp);
1295                         addr = decode32(cp);
1296
1297                         if (elen && addr <= target)
1298                                 lo = mid;
1299                         else
1300                                 hi = mid;
1301                 }
1302
1303                 cp = buf;
1304                 cp += lo*12;
1305                 p = decode48(cp);
1306                 elen = decode16(cp);
1307                 addr = decode32(cp);
1308                 if (target < addr) {
1309                         p = 0;
1310                         cp -= 12;
1311                 } else if (target >= addr + elen)
1312                         p = 0;
1313                 else
1314                         p += target - addr;
1315
1316                 if (nextp) {
1317                         if (lo == len/12)
1318                                 *nextp = 0xffffffff; /* no further extents */
1319                         else {
1320                                 u64 p2;
1321                                 p2 = decode48(cp);
1322                                 elen = decode16(cp);
1323                                 addr = decode32(cp);
1324                                 if (elen == 0)
1325                                         /* no more meaningful extents */
1326                                         *nextp = 0xffffffff;
1327                                 else
1328                                         *nextp = addr;
1329                         }
1330                 }
1331         }
1332         return p;
1333 }
1334
1335 u32 lafs_leaf_next(struct indexblock *ib, u32 start)
1336 {
1337         /* In this leaf index block, find the first address
1338          * at-or-after 'start'.  If none, return 0xFFFFFFFF
1339          */
1340         char *buf = map_iblock(ib);
1341         int blocksize = ib->b.inode->i_sb->s_blocksize;
1342         struct lafs_inode *li = LAFSI(ib->b.inode);
1343         u32 next = 0xffffffff;
1344         u64 phys;
1345
1346         if (test_bit(B_InoIdx, &ib->b.flags))
1347                 phys = leaf_lookup(buf + li->metadata_size,
1348                                    blocksize - li->metadata_size,
1349                                    ib->b.fileaddr, start, &next);
1350         else
1351                 phys = leaf_lookup(buf, blocksize,
1352                                    ib->b.fileaddr, start, &next);
1353         unmap_iblock(ib, buf);
1354         if (phys)
1355                 return start;
1356         else
1357                 return next;
1358 }
1359
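/*
 * Look up 'target' in an internal index block: the body is sorted
 * 10-byte records of {6-byte physaddr of a child index block, 4-byte
 * fileaddr}, with unused (zero physaddr) records at the end.  Returns
 * the physaddr of the child covering 'target', or 0 if none does;
 * *addrp is set to that child's file address when one is found, and
 * *nextp (if non-NULL) to the file address of the following child,
 * when there is one.
 */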
1360 static u64
1361 index_lookup(void *bf, int len, u32 target, u32 *addrp, u32 *nextp)
1362 {
1363         unsigned char *buf = bf;
1364         int lo, hi;
1365         u32 addr;
1366         u64 p;
1367         unsigned char *cp;
1368
1369         if (buf[1] || buf[0]) {
1370                 dprintk("WARNING: not an index block %02x %02x\n",
1371                         buf[0], buf[1]);
1372                 return 0;
1373         }
1374
1375         buf += 2;
1376         len -= 2;
1377
1378         /* 10 byte records: 6 byte Device address, 4 byte file address */
1379         lo = 0; hi = (len/10);
1380         while (hi > lo+1) {
1381                 int mid = (lo+hi)/2;
1382                 cp = buf;
1383                 cp += mid*10;
1384                 p = decode48(cp);
1385                 addr = decode32(cp);
1386                 dprintk("...addr=%lu target=%lu lo=%d mid=%d hi=%d\n",
1387                         (unsigned long)addr, (unsigned long)target,
1388                         lo, mid, hi);
1389                 if (p && addr <= target)
1390                         lo = mid;
1391                 else
1392                         hi = mid;
1393         }
1394
1395         cp = buf;
1396         cp += lo*10;
1397         p = decode48(cp);
1398         addr = decode32(cp);
1399
1400         if (p == 0)
1401                 /* Nothing in this block */
1402                 return 0;
1403
1404         if (addr > target) {
1405                 /* target is before first address */
1406                 p = 0;
1407                 cp -= 10;
1408         } else
1409                 *addrp = addr;
1410
1411         if (cp+10 <= (unsigned char *) buf + len && nextp) {
1412                 u64 p2 = decode48(cp);
1413                 u32 nxt = decode32(cp);
1414                 if (p2)
1415                         *nextp = nxt; /* address of next index block */
1416         }
1417         return p;
1418 }
1419
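/*
 * An index block is empty when no record in it is populated: a
 * lookup for the highest possible address then finds nothing.
 */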
1420 int lafs_index_empty(struct indexblock *ib)
1421 {
1422         char *buf = map_iblock(ib);
1423         int blocksize = ib->b.inode->i_sb->s_blocksize;
1424         struct lafs_inode *li = LAFSI(ib->b.inode);
1425         u32 addr;
1426         u64 phys;
1427
1428         if (test_bit(B_InoIdx, &ib->b.flags))
1429                 phys = index_lookup(buf + li->metadata_size,
1430                                     blocksize - li->metadata_size,
1431                                     0xFFFFFFFF, &addr, NULL);
1432         else
1433                 phys = index_lookup(buf, blocksize,
1434                                     0xFFFFFFFF, &addr, NULL);
1435         unmap_iblock(ib, buf);
1436
1437         return phys == 0;
1438 }
1439
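/*
 * From 'ib', scan later siblings under the same parent for a
 * non-empty index block that still starts at or before 'addr' -
 * the last such sibling is a better block to search.  A sibling
 * starting beyond 'addr' bounds *next instead.  The scan stops at
 * the first sibling not marked B_PrimaryRef.  The chosen block is
 * returned with a reference held.
 */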
1440 static struct indexblock *find_better(struct inode *ino,
1441                                       struct indexblock *ib,
1442                                       u32 addr, u32 *next, REFARG)
1443 {
1444         struct indexblock *rv = ib;
1445         spin_lock(&ino->i_data.private_lock);
1446         list_for_each_entry_continue(ib, &ib->b.parent->children, b.siblings) {
1447                 if (!test_bit(B_PrimaryRef, &ib->b.flags))
1448                         break;
1449                 if (test_bit(B_EmptyIndex, &ib->b.flags))
1450                         continue;
1451                 if (ib->b.fileaddr > addr) {
1452                         if (next && *next > ib->b.fileaddr)
1453                                 *next = ib->b.fileaddr;
1454                         break;
1455                 }
1456                 /* This appears to be better. */
1457                 rv = ib;
1458         }
1459         getref_locked(&rv->b, REF);
1460         spin_unlock(&ino->i_data.private_lock);
1461         return rv;
1462 }
1463
1464 struct indexblock *
1465 lafs_leaf_find(struct inode *inode, u32 addr, int adopt, u32 *next,
1466                int async, REFARG)
1467 {
1468         /* Find the leaf index block which refers to this address (if any do)
1469          * Returns the leaf IOLocked.
1470          * *next will be set to an address that is higher than addr such
1471          * that there are no other leafs before *next.  There might not
1472          * actually be a leaf at *next though.
1473          */
1474         struct indexblock *ib, *ib2;
1475         struct indexblock *hold = NULL;
1476         u64 iphys;
1477         u32 iaddr = 0;
1478         void *buf;
1479         struct lafs_inode *li = LAFSI(inode);
1480         int blocksize = inode->i_sb->s_blocksize;
1481         int err = 0;
1482         int offset;
1483
1484         /* Must own a reference to ->dblock */
1485         BUG_ON(li->dblock == NULL);
1486         BUG_ON(atomic_read(&li->dblock->b.refcnt) == 0);
1487
1488         /* We come back to restart if we needed to unlock to
1489          * perform IO and we cannot be sure the parent didn't change.
1490          * We hold the last seen index block in 'hold' so that if
1491          * no changes happened (the common case) we are certain
1492          * that no IO will be required to get back to where
1493          * we were.
1494          */
1495 restart:
1496         if (next)
1497                 *next = 0xFFFFFFFF;
1498
1499         ib = lafs_make_iblock(inode, adopt, async, REF);
1500         err = PTR_ERR(ib);
1501         if (IS_ERR(ib))
1502                 goto err;
1503         offset = li->metadata_size;
1504
1505         err = -EAGAIN;
1506         if (!async)
1507                 lafs_iolock_block(&ib->b);
1508         else if (!lafs_iolock_block_async(&ib->b))
1509                 goto err_ib;
1510
1511         while (ib->depth > 1) {
1512                 /* internal index block */
1513                 /* NOTE: the index block could be empty, or could have
1514                  * nothing as early as 'addr'.  In that case index_lookup
1515                  * returns a physaddr of 0, meaning the child block must
1516                  * either be in cache already, or be synthesised as an
1517                  * empty index block (one level down).
1518                  */
1519                 struct indexblock *better;
1520                 u32 target = addr;
1521
1522         retry:
1523                 buf = map_iblock(ib);
1524                 iaddr = ib->b.fileaddr;
1525                 iphys = index_lookup(buf + offset, blocksize - offset,
1526                                      target, &iaddr,
1527                                      target == addr ? next : NULL);
1528                 unmap_iblock(ib, buf);
1529
1530                 ib2 = lafs_iblock_get(inode, iaddr, ib->depth-1, iphys, REF);
1531
1532                 if (!ib2) {
1533                         err = -ENOMEM;
1534                         lafs_iounlock_block(&ib->b);
1535                         goto err_ib;
1536                 }
1537
1538                 better = find_better(inode, ib2, addr, next, REF);
1539                 putiref(ib2, REF);
1540                 ib2 = better;
1541
1542                 if (!test_bit(B_Valid, &ib2->b.flags)) {
1543                         lafs_iounlock_block(&ib->b);
1544                         err = lafs_load_block(&ib2->b, NULL);
1545                         if (err)
1546                                 goto err_ib2;
1547                         if (async)
1548                                 err = lafs_wait_block_async(&ib2->b);
1549                         else
1550                                 err = lafs_wait_block(&ib2->b);
1551                         if (err)
1552                                 goto err_ib2;
1553
1554                         err = -EAGAIN;
1555                         if (!async)
1556                                 lafs_iolock_block(&ib->b);
1557                         else if (!lafs_iolock_block_async(&ib->b))
1558                                 goto err_ib2;
1559
1560                         /* While we did IO, the parent could have been
1561                          * split, so it is now the wrong parent, or it
1562                          * could have become EmptyIndex, though that is
1563                          * much less likely.  If it did, then it must
1564                          * have had a parent, so the ref we hold means
1565                          * it still has a parent.
1566                          * So if ib->b.parent, then we might need to
1567                          * retry from the top, and holding a ref on ib
1568                          * means that we don't risk live-locking if
1569                          * memory for index blocks is very tight.
1570                          * If there is no parent, then there is
1571                          * no risk that ib changed.
1572                          */
1573                         if (ib->b.parent) {
1574                                 if (hold)
1575                                         putiref(hold, MKREF(hold));
1576                                 hold = getiref(ib, MKREF(hold));
1577                                 putiref(ib2, REF);
1578                                 putiref(ib, REF);
1579                                 goto restart;
1580                         }
1581                 }
1582
1583                 err = -EAGAIN;
1584                 if (!async)
1585                         lafs_iolock_block(&ib2->b);
1586                 else if (!lafs_iolock_block_async(&ib2->b))
1587                         goto err_ib2;
1588
1589                 if (ib2->b.fileaddr != ib->b.fileaddr &&
1590                     test_bit(B_EmptyIndex, &ib2->b.flags)) {
1591                         /* need to look earlier in the parent */
1592                         target = ib2->b.fileaddr - 1;
1593                         lafs_iounlock_block(&ib2->b);
1594                         putiref(ib2, REF);
1595                         goto retry;
1596                 }
1597
1598 #ifdef FIXME
1599                 if (!(ib2->b.flags & B_Linked)) {
1600                         struct block *b2 = peer_find(fs,
1601                                                      inode, iaddr,
1602                                                      depth, iphys);
1603                         if (b2)
1604                                 list_add(&ib2->peers, &b2->peers);
1605                         set_bit(B_Linked, &ib2->b.flags);
1606                 }
1607 #endif
1608                 if (adopt)
1609                         block_adopt(&ib2->b, ib);
1610                 lafs_iounlock_block(&ib->b);
1611                 putiref(ib, REF);
1612                 ib = ib2;
1613                 offset = 0;
1614         }
1615
1616         return ib;
1617
1618 err_ib2:
1619         putiref(ib2, REF);
1620 err_ib:
1621         putiref(ib, REF);
1622 err:
1623         if (hold)
1624                 putiref(hold, MKREF(hold));
1625         return ERR_PTR(err);
1626 }
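/*
 * A sketch of the calling pattern (mirroring find_block() below):
 * the leaf comes back IOLocked with a reference held, so the caller
 * unlocks and drops it when done:
 *
 *      ib = lafs_leaf_find(inode, addr, adopt, NULL, SYNC, REF);
 *      if (!IS_ERR(ib)) {
 *              ... consult the leaf ...
 *              lafs_iounlock_block(&ib->b);
 *              putiref(ib, REF);
 *      }
 */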
1627
1628 static int table_find_first(struct uninc *tbl, u32 min, u32 *addr);
1629
1630 static int __lafs_find_next(struct inode *ino, loff_t *addrp)
1631 {
1632         /* Find the next allocated block in inode at or after *addrp
1633          * and set *addrp.
1634          * Return:
1635          *  <0 -  error
1636          *   0 -  there is no block at or after '*addrp'
1637          *   1 -  It is worth checking the block at *addrp.
1638          *   2 -  *addrp has been updated, check again.
1639          */
1640         struct lafs_inode *lai = LAFSI(ino);
1641         struct indexblock *leaf;
1642         struct fs *fs = fs_from_inode(ino);
1643         struct block *b;
1644         u32 nexti, nextd, nextl;
1645         int rv = 0;
1646         int offset;
1647         char *buf;
1648         u64 p;
1649
1650         /* Must hold a reference to ->dblock */
1651         BUG_ON(lai->dblock == NULL);
1652         BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);
1653
1654         if (lai->depth == 0) {
1655                 if (*addrp == 0)
1656                         return 1;
1657                 leaf = lafs_make_iblock(ino, NOADOPT, SYNC, MKREF(find_next));
1658                 if (IS_ERR(leaf))
1659                         return PTR_ERR(leaf);
1660                 nextd = 0xFFFFFFFF;
1661         } else {
1662                 leaf = lafs_leaf_find(ino, *addrp, NOADOPT, &nexti, SYNC,
1663                                       MKREF(find_next));
1664                 /* FIXME what if 'leaf' is an error?? */
1665                 offset = 0;
1666                 if (lai->depth == 1)
1667                         offset = lai->metadata_size;
1668                 buf = map_iblock(leaf);
1669                 p = leaf_lookup(buf+offset, fs->blocksize-offset,
1670                                 leaf->b.fileaddr, (u32)*addrp, &nextd);
1671                 if (p && *addrp == 0xFFFFFFFF) { /* dump while still mapped */
1672                         int j;
1673                         printk("at %s\n", strblk(&lai->dblock->b));
1674                         printk("offset = %d depth=%d\n", offset,
1675                                lai->depth);
1676                         for (j = 0; j < 16; j++)
1677                                 printk("%04x ", buf[offset+j] & 0xffff);
1678                         for (j = 0; j < 16; j++)
1679                                 printk("%04x ", buf[1024-16+j] & 0xffff);
1680                         printk("\n");
1681                 }
1682                 unmap_iblock(leaf, buf);
1683                 if (p) {
1684                         lafs_iounlock_block(&leaf->b);
1685                         putiref(leaf, MKREF(find_next));
1686                         BUG_ON(*addrp == 0xFFFFFFFF);
1687                         return 1;
1688                 }
1689                 if (nextd != 0xFFFFFFFF)
1690                         rv = 1;
1691                 else if (nexti != 0xFFFFFFFF) {
1692                         nextd = nexti;
1693                         rv = 2;
1694                 }
1695         }
1696         /* If rv==2, nextd may not be high enough as it might
1697          * be the address of an EmptyIndex block.  So only
1698          * use it if nothing better is found.
1699          */
1700         if (rv == 2)
1701                 nextl = 0xFFFFFFFF;
1702         else
1703                 nextl = nextd;
1704         list_for_each_entry(b, &leaf->children, siblings)
1705                 if (b->fileaddr >= *addrp &&
1706                     b->fileaddr <= nextl &&
1707                     test_bit(B_Valid, &b->flags)) {
1708                         nextd = nextl = b->fileaddr;
1709                         rv = 1;
1710                 }
1711
1712         if (leaf->uninc_table.pending_cnt) {
1713                 spin_lock(&leaf->b.inode->i_data.private_lock);
1714                 if (table_find_first(&leaf->uninc_table, *addrp, &nextl)) {
1715                         nextd = nextl;
1716                         rv = 1;
1717                 }
1718                 spin_unlock(&leaf->b.inode->i_data.private_lock);
1719         }
1720         lafs_iounlock_block(&leaf->b);
1721         putiref(leaf, MKREF(find_next));
1722         *addrp = nextd;
1723         return rv;
1724 }
1725
1726 /*
1727  * find the first data block at or after *bnump, and
1728  * store the address in *bnump.
1729  * Return 0 if nothing found, a negative error code on error, or
1730  * 1 if *bnump is a valid data block address
1731  *
1732  * The block could be in the indexing tree, or in
1733  * an uninc_table, or totally unincorporated.
1734  * Blocks that are really part of the file, but are not in
1735  * the indexing tree will still be on a ->children list
1736  * from an index block, and the index block will be very close
1737  * to the indexing tree.
1738  * So once we have found a suitable index block, we need to check
1739  * all its children.  This could possibly be optimised using
1740  * find_get_pages if it were exported.
1741  */
1742 int lafs_find_next(struct inode *ino, loff_t *bnump)
1743 {
1744         int rv;
1745         /* Need to hold a reference to dblock while calling
1746          * __lafs_find_next
1747          */
1748         struct datablock *db = lafs_inode_dblock(ino, SYNC, MKREF(find_nexti));
1749
1750         if (IS_ERR(db))
1751                 return PTR_ERR(db);
1752         while (1) {
1753                 struct datablock *b;
1754                 int hole;
1755
1756                 rv = __lafs_find_next(ino, bnump);
1757
1758                 if (rv <= 0)
1759                         break;
1760                 if (rv == 2)
1761                         continue;
1762
1763                 b = lafs_get_block(ino, *bnump, NULL, GFP_KERNEL,
1764                                    MKREF(find_next));
1765                 if (!b)
1766                         return -ENOMEM;
1767
1768                 rv = lafs_find_block(b, NOADOPT);
1769                 if (rv) {
1770                         putdref(b, MKREF(find_next));
1771                         return rv;
1772                 }
1773                 hole = (b->b.physaddr == 0 ||
1774                         !test_bit(B_PhysValid, &b->b.flags)) &&
1775                         !test_bit(B_Valid, &b->b.flags);
1776                 if (LAFSI(ino)->depth == 0 &&
1777                     b->b.fileaddr == 0)
1778                         hole = 0; /* first block lives in inode */
1779                 putdref(b, MKREF(find_next));
1780
1781                 rv = 1;
1782                 if (!hole)
1783                         break;
1784                 *bnump = (*bnump)+1;
1785         }
1786         putdref(db, MKREF(find_nexti));
1787         BUG_ON(rv > 0 && *bnump == 0xFFFFFFFF);
1788         return rv;
1789 }
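/*
 * Usage sketch (an illustration, not code from this file): walking
 * every allocated block of an inode:
 *
 *      loff_t bnum = 0;
 *      while (lafs_find_next(ino, &bnum) == 1) {
 *              ... bnum is an allocated block address ...
 *              bnum++;
 *      }
 */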
1790
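/*
 * Search the pending (unincorporated) extents for one covering
 * 'addr'; if found, store the matching physical address in *phys
 * and return 1, else return 0.
 */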
1791 static int table_lookup(struct uninc *tbl, u32 addr, u64 *phys)
1792 {
1793         int i;
1794         for (i = tbl->pending_cnt; i; ) {
1795                 struct addr *a = &tbl->pending_addr[--i];
1796                 if (a->fileaddr <= addr &&
1797                     a->fileaddr + a->cnt > addr) {
1798                         *phys = a->physaddr + (addr - a->fileaddr);
1799                         return 1;
1800                 }
1801         }
1802         return 0;
1803 }
1804
1805 static int table_find_first(struct uninc *tbl, u32 min, u32 *addr)
1806 {
1807         /* Find the earliest allocated block in tbl which is
1808          * at-or-after 'min' and before '*addr', and store it in
1809          * *addr, returning 1 if something was found.
1810          */
1811         int rv = 0;
1812         int i;
1813
1814         for (i = tbl->pending_cnt ; i ; ) {
1815                 struct addr *a = &tbl->pending_addr[--i];
1816                 if (a->fileaddr >= *addr)
1817                         continue;
1818                 if (a->fileaddr + a->cnt <= min)
1819                         continue; /* extent ends before 'min' */
1820                 if (a->physaddr == 0)
1821                         continue;
1822                 if (a->fileaddr >= min)
1823                         *addr = a->fileaddr;
1824                 else
1825                         *addr = min;
1826                 rv = 1;
1827         }
1828         return rv;
1829 }
1830
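/*
 * Block 0 of a depth-0 file lives in the inode's block, after the
 * metadata: copy that tail into the datablock and zero the remainder.
 */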
1831 static void fill_block_zero(struct datablock *db, struct datablock *inob)
1832 {
1833         struct lafs_inode *lai = LAFSI(db->b.inode);
1834         void *baddr = map_dblock(db);
1835         void *iaddr = map_dblock_2(inob);
1836         int offset = lai->metadata_size;
1837         int len = (1<<db->b.inode->i_blkbits) - offset;
1838
1839         memcpy(baddr, iaddr+offset, len);
1840         unmap_dblock_2(inob, iaddr);
1841         memset(baddr+len, 0, (1<<db->b.inode->i_blkbits)-len);
1842         unmap_dblock(db, baddr);
1843         set_bit(B_Valid, &db->b.flags);
1844 }
1845
1846 static int __must_check
1847 find_block(struct datablock *b, int adopt, int async)
1848 {
1849         /* Find where this block is in storage, and
1850          * set b->b.physaddr.
1851          * If the block lives in the inode, physaddr
1852          * gets set to zero, and lafs_load_block works with this.
1853          * Otherwise we walk down the index tree
1854          * Possible errors:
1855          *  -EIO
1856          *  -ENOMEM
1857          */
1858         struct lafs_inode *lai;
1859         struct indexblock *ib;
1860         struct datablock *db;
1861         void *buf;
1862         int  offset;
1863
1864         LAFS_BUG(test_bit(B_Pinned, &b->b.flags) &&
1865                  b->b.parent == NULL,
1866                  &b->b);
1867         if (b->b.inode == NULL)
1868                 return 0;
1869         lai = LAFSI(b->b.inode);
1870
1871         if (lai->depth == 0) {
1872                 /* Data is in the inode if anywhere */
1873                 if (!test_bit(B_PhysValid, &b->b.flags)) {
1874                         b->b.physaddr = 0;
1875                         set_bit(B_PhysValid, &b->b.flags);
1876                 }
1877                 db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));
1878                 if (IS_ERR(db))
1879                         return PTR_ERR(db);
1880                 if (!test_bit(B_Valid, &b->b.flags) && b->b.fileaddr == 0) {
1881                         /* Now is the best time to copy data from inode into
1882                          * datablock, as we know we have a valid inode block
1883                          * now.
1884                          */
1885                         LAFS_BUG(b->b.physaddr != 0, &b->b);
1886                         fill_block_zero(b, db);
1887                 }
1888                 if (adopt) {
1889                         ib = lafs_make_iblock(b->b.inode, adopt, async,
1890                                               MKREF(findblock));
1891                         putdref(db, MKREF(findblock));
1892                         if (IS_ERR(ib))
1893                                 return PTR_ERR(ib);
1894                         block_adopt(&b->b, ib);
1895                         putiref(ib, MKREF(findblock));
1896                 } else
1897                         putdref(db, MKREF(findblock));
1898                 return 0;
1899         }
1900
1901         if (test_bit(B_PhysValid, &b->b.flags)
1902             && (!adopt || b->b.parent || test_bit(B_Root, &b->b.flags))) {
1903                 return 0;
1904         }
1905
1906         db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));
1907         if (IS_ERR(db))
1908                 return PTR_ERR(db);
1909
1910         /* Ok, there is some indexing information we need to
1911          * look through.  Find the leaf first
1912          */
1913         ib = lafs_leaf_find(b->b.inode, b->b.fileaddr, adopt, NULL, async,
1914                             MKREF(findblock));
1915         putdref(db, MKREF(findblock));
1916
1917         if (IS_ERR(ib)) {
1918                 dprintk("lafs_leaf_find returned error to lafs_find_block\n");
1919                 goto out;
1920         }
1921
1922         if (!test_bit(B_PhysValid, &b->b.flags)
1923             && ib->uninc_table.pending_cnt) {
1924                 spin_lock(&b->b.inode->i_data.private_lock);
1925                 if (table_lookup(&ib->uninc_table, b->b.fileaddr,
1926                                  &b->b.physaddr))
1927                         set_bit(B_PhysValid, &b->b.flags);
1928                 spin_unlock(&b->b.inode->i_data.private_lock);
1929         }
1930         if (!test_bit(B_PhysValid, &b->b.flags)) {
1931                 buf = map_iblock(ib);
1932                 offset = lai->depth > 1 ? 0 : lai->metadata_size;
1933                 if (test_bit(B_InoIdx, &ib->b.flags))
1934                         dprintk("InoIdx!!!\n");
1935                 dprintk("offset %d: %02x %02x\n", offset,
1936                         ((char *)buf)[offset], ((char *)buf)[offset+1]);
1937                 b->b.physaddr = leaf_lookup(buf+offset,
1938                                             b->b.inode->i_sb->s_blocksize
1939                                             - offset,
1940                                             ib->b.fileaddr, b->b.fileaddr,
1941                                             NULL);
1942                 set_bit(B_PhysValid, &b->b.flags);
1943                 unmap_iblock(ib, buf);
1944         }
1945         dprintk("lafs_find_block: lookup for %lu of %lu at %lu found %llu\n",
1946                 (unsigned long)b->b.fileaddr,
1947                 (unsigned long)b->b.inode->i_ino,
1948                 (unsigned long)ib->b.fileaddr,
1949                 b->b.physaddr);
1950         /* FIXME should peer_find go here or in load_block? */
1951
1952         if (adopt)
1953                 block_adopt(&b->b, ib);
1954         lafs_iounlock_block(&ib->b);
1955 out:
1956         if (IS_ERR(ib))
1957                 return PTR_ERR(ib);
1958         if (ib)
1959                 putiref(ib, MKREF(findblock));
1960         return 0;
1961 }
1962
1963 int __must_check
1964 lafs_find_block(struct datablock *b, int adopt)
1965 {
1966         return find_block(b, adopt, SYNC);
1967 }
1968
1969 int __must_check
1970 lafs_find_block_async(struct datablock *b)
1971 {
1972         return find_block(b, ADOPT, ASYNC);
1973 }
1974
1975 /*
1976  * lafs_allocated_block records that a block has been written at a new
1977  * address (or found at an address during roll-forward). This involves
1978  * updating some summary information (quota etc) and making sure
1979  * the change gets recorded in the parent block.
1980  * This must be called before B_Dirty or B_Realloc are cleared,
1981  * to ensure that the index block is marked accordingly.
1982  *  B_UnincCredit should still be set, and we clear it.
1983  */
1984
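/*
 * Helper for lafs_allocated_block() below: record blk's new address
 * with its parent - on the parent's 'uninc' chain for index blocks,
 * or in the parent's uninc_table for data blocks.  Returns 1 when
 * the address has been recorded, or 0 when the table was full (or
 * ordering could not be preserved) and had to be incorporated first,
 * in which case the caller must retry against the possibly-changed
 * parent.
 */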
1985 int lafs_add_block_address(struct fs *fs, struct block *blk)
1986 {
1987         struct indexblock *p = blk->parent;
1988         struct addr *a;
1989
1990         spin_lock(&blk->inode->i_data.private_lock);
1991         if (test_bit(B_Index, &blk->flags)) {
1992                 if (!test_and_set_bit(B_Uninc, &blk->flags)) {
1993                         blk->chain = p->uninc;
1994                         LAFS_BUG(p->depth == 1, blk);
1995                         p->uninc = blk;
1996                         getref(blk, MKREF(uninc));
1997                 }
1998                 spin_unlock(&blk->inode->i_data.private_lock);
1999                 return 1;
2000         }
2001
2002         /* data block in the same phase as its leaf: record the address in uninc_table */
2003
2004         /* The only place we try to merge is at the end
2005          * of the embedded table, and then only for data
2006          * blocks.
2007          * It is essential that uninc_table is sorted and,
2008          * in particular, has no overlapping extents.
2009          * If this block threatens to break that, we incorporate first.
2010          */
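        /* For example (illustrative numbers): a final entry
         * {fileaddr 100, physaddr 5000, cnt 2} simply grows to cnt 3
         * when blk arrives with fileaddr 102 and physaddr 5002.
         */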
2011
2012         LAFS_BUG(!test_bit(B_Dirty, &p->b.flags) &&
2013                  !test_bit(B_Realloc, &p->b.flags), blk);
2014
2015         a = &p->uninc_table.pending_addr
2016                 [p->uninc_table.pending_cnt-1];
2017         if (p->uninc_table.pending_cnt >= 1 &&
2018             a->fileaddr+a->cnt == blk->fileaddr &&
2019             a->physaddr != 0 &&
2020             a->physaddr+a->cnt == blk->physaddr) {
2021                 a->cnt++;
2022                 if (test_and_clear_bit(B_UnincCredit, &blk->flags))
2023                         p->uninc_table.credits++;
2024                 spin_unlock(&blk->inode->i_data.private_lock);
2025                 return 1;
2026         } else if ((p->uninc_table.pending_cnt == 0 ||
2027                     a->fileaddr + a->cnt <= blk->fileaddr) &&
2028                    /* properly ordered, maybe add */
2029                    (p->uninc_table.pending_cnt <
2030                     ARRAY_SIZE(p->uninc_table.pending_addr))) {
2031                 a++;
2032                 a->fileaddr = blk->fileaddr;
2033                 a->physaddr = blk->physaddr;
2034                 a->cnt = 1;
2035                 p->uninc_table.pending_cnt++;
2036                 if (test_and_clear_bit(B_UnincCredit,
2037                                        &blk->flags))
2038                         p->uninc_table.credits++;
2039                 spin_unlock(&blk->inode->i_data.private_lock);
2040
2041                 return 1;
2042         } else {
2043                 spin_unlock(&blk->inode->i_data.private_lock);
2044                 /* We own a ref through blk->parent now, but
2045                  * after lafs_incorporate, we might not
2046                  */
2047                 getiref(p, MKREF(addblock));
2048                 lafs_iolock_written(&p->b);
2049                 lafs_incorporate(fs, p);
2050                 lafs_iounlock_block(&p->b);
2051                 putiref(p, MKREF(addblock));
2052                 return 0;
2053         }
2054 }
2055
2056 int lafs_allocated_block(struct fs *fs, struct block *blk, u64 phys)
2057 {
2058         int ph;
2059         struct indexblock *p;
2060
2061         dprintk("Allocated %s -> %llu\n",  strblk(blk), phys);
2062
2063         /* If phys is 0, I don't need an uninc credit.
2064          * Also, index blocks that are not nearly full do not need UnincCredit.
2065          */
2066         LAFS_BUG(test_bit(B_InoIdx, &blk->flags), blk);
2067         if (!test_bit(B_Dirty, &blk->flags) &&
2068             !test_bit(B_Realloc, &blk->flags) &&
2069             phys != 0)
2070                 printk("something missing %s\n", strblk(blk));
2071         LAFS_BUG(!test_bit(B_UnincCredit, &blk->flags) && phys &&
2072                  !test_bit(B_Index, &blk->flags), blk);
2073         LAFS_BUG(!test_bit(B_Dirty, &blk->flags) &&
2074                  !test_bit(B_Realloc, &blk->flags) &&
2075                  phys != 0, blk);
2076
2077         LAFS_BUG(!test_bit(B_Writeback, &blk->flags), blk);
2078
2079         if (test_bit(B_Root, &blk->flags)) {
2080                 int i;
2081                 struct lafs_inode *lai;
2082
2083                 LAFS_BUG(test_bit(B_Index, &blk->flags), blk);
2084
2085                 for (i = 0; i < fs->maxsnapshot; i++)
2086                         if (fs->ss[i].root == blk->inode)
2087                                 break;
2088                 LAFS_BUG(i == fs->maxsnapshot, blk);
2089                 blk->physaddr = phys; /* superblock doesn't get
2090                                          counted in summaries */
2091                 LAFS_BUG(test_bit(B_SegRef, &blk->flags), blk);
2092                 set_bit(B_PhysValid, &blk->flags);
2093                 fs->ss[i].root_addr = phys;
2094                 lai = LAFSI(blk->inode);
2095                 if (i == 0 && lai->md.fs.usagetable > 1)
2096                         /* snapshot checkpoint is happening.  Record root
2097                          * address in main fs and in snapshot
2098                          */
2099                         fs->ss[lai->md.fs.usagetable-1].root_addr = phys;
2100
2101                 /* Don't bother to clear the B_UnincCredit in the root.
2102                  * It'll just get set again.  This way there is less churn.
2103                  */
2104                 return 0;
2105         }
2106         if (test_bit(FinalCheckpoint, &fs->fsstate) &&
2107             !test_bit(B_Index, &blk->flags) &&
2108             (LAFSI(blk->inode)->type == TypeSegmentMap ||
2109              LAFSI(blk->inode)->type == TypeQuota)) {
2110                 /* These blocks will be picked up by roll-forward,
2111                  * so don't need to have their address recorded,
2112                  * and doing so would leave the index blocks
2113                  * dirty after the checkpoint.
2114                  * So just return
2115                  */
2116                 if (test_and_clear_bit(B_SegRef, &blk->flags))
2117                         lafs_seg_deref(fs, blk->physaddr, 0);
2118                 blk->physaddr = phys;
2119                 set_bit(B_PhysValid, &blk->flags);
2120                 return 0;
2121         }
2122         p = blk->parent;
2123
2124         LAFS_BUG(!p, blk);
2125
2126         /* We need to work out if this block is in the current phase or
2127          * the next one.  This can only be an issue if a phase change
2128          * (aka checkpoint) is underway. But if it is, we need to
2129          * ensure the right phase is used
2130          */
2131         lafs_checkpoint_lock(fs);
2132         if (test_bit(B_Pinned, &blk->flags))
2133                 ph = !!test_bit(B_Phase1, &blk->flags);
2134         else if (test_bit(B_Index, &blk->flags))
2135                 LAFS_BUG(1, blk); /* Index blocks must always be pinned when dirty */
2136         else if (p->uninc_table.pending_cnt) {
2137                 /* attach data block into previous phase as incorporation of
2138                  * this phase is still to happen.
2139                  * This is needed for flush-if-cannot-preallocate to work.
2140                  */
2141                 LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), &p->b);
2142                 ph = !!test_bit(B_Phase1, &p->b.flags);
2143                 lafs_pin_block_ph(blk, ph);
2144         } else {
2145                 lafs_pin_block(blk);
2146                 ph = !!test_bit(B_Phase1, &blk->flags);
2147         }
2148         lafs_checkpoint_unlock(fs);
2149         if (!test_bit(B_PhysValid, &blk->flags))
2150                 blk->physaddr = 0;
2151         lafs_summary_update(fs, blk->inode, blk->physaddr, phys,
2152                             test_bit(B_Index, &blk->flags), ph,
2153                             test_bit(B_SegRef, &blk->flags));
2154         clear_bit(B_Prealloc, &blk->flags);
2155
2156         blk->physaddr = phys;
2157         set_bit(B_PhysValid, &blk->flags);
2158
2159         if (ph != !!test_bit(B_Phase1, &p->b.flags)) {
2160                 /* in the next phase */
2161                 dprintk("uninc next phase\n");
2162                 if (test_and_set_bit(B_Uninc, &blk->flags))
2163                         /* This block was already on the 'unincorporated'
2164                          * list, (it must be changing quickly) so there
2165                          * is nothing else to do.
2166                          */
2167                         return 0;
2168                 getref(blk, MKREF(uninc));
2169
2170                 spin_lock(&blk->inode->i_data.private_lock);
2171                 blk->chain = p->uninc_next;
2172                 p->uninc_next = blk;
2173                 spin_unlock(&blk->inode->i_data.private_lock);
2174                 /* We will try again after the phase change.
2175                  * That will be when we clean B_UnincCredit
2176                  */
2177                 return 0;
2178         }
2179
2180 new_parent:
2181         if (test_bit(B_Dirty, &blk->flags) || phys == 0) {
2182                 if (!test_bit(B_Dirty, &p->b.flags)
2183                     && !test_bit(B_Credit, &p->b.flags)) {
2184                         printk("Oh dear: %s\n", strblk(blk));
2185                         printk(".......: %s\n", strblk(&p->b));
2186                 }
2187                 lafs_dirty_iblock(p);
2188         } else {
2189                 if (!test_bit(B_Valid, &p->b.flags)) {
2190                         printk("Not valid in lafs_allocated_block: %s\n",
2191                                strblk(&p->b));
2192                         printk("blk is %s\n", strblk(blk));
2193                         BUG();
2194                 }
2195                 LAFS_BUG(!test_bit(B_Valid, &p->b.flags), &p->b);
2196                 if (!test_bit(B_Realloc, &p->b.flags)) {
2197                         /* I cannot use B_Credit to fill B_Realloc as that
2198                          * might still be needed for B_Dirty.
2199                          * So if we cannot allocate a new credit,
2200                          * just set the block as 'dirty' now.
2201                          */
2202                         if (lafs_space_alloc(fs, 1, CleanSpace) == 1) {
2203                                 if (test_and_set_bit(B_Realloc, &p->b.flags))
2204                                         lafs_space_return(fs, 1);
2205                         } else
2206                                 lafs_dirty_iblock(p);
2207                 }
2208                 if (!test_and_set_bit(B_UnincCredit, &p->b.flags))
2209                         if (!test_and_clear_bit(B_ICredit, &p->b.flags))
2210                                 /* ICredit should be set before we dirty a block */
2211                                 LAFS_BUG(1, &p->b);
2212
2213         }
2214
2215         dprintk("Uninc same phase\n");
2216
2217         BUG_ON(!test_bit(B_Pinned, &blk->parent->b.flags));
2218
2219         if (!lafs_add_block_address(fs, blk)) {
2220                 p = blk->parent;
2221
2222                 LAFS_BUG(!p, blk);
2223                 LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), blk);
2224                 goto new_parent;
2225         }
2226
2227         /* That credit has now been added to the uninc_table and
2228          * thus made available to the parent
2229          */
2230
2231         lafs_refile(&p->b, 0);
2232         return 0;
2233 }