
/*
 * Handle index block for LaFS.
 * fs/lafs/index.c
 * Copyright (C) 2005-2009
 * Neil Brown <neilb@suse.de>
 * Released under the GPL, version 2
 */

#include        "lafs.h"
#include        <linux/types.h>
#include        <linux/hash.h>
#include        <linux/slab.h>
/*
 * For each index-block there is an 'indexblock' data structure,
 * and a 'block' of memory, which is a page or part thereof.
 *
 * Index-blocks are stored in a hash table, indexed by
 * (struct inode *, fileaddr, depth).  There is one hash table
 * for all filesystems.
 *
 * There is a single free-list for all index blocks from all
 * filesystems, of all sizes.  Index blocks only get on this list when
 * they are unpinned leafs, i.e. their 'children' list is empty.  As they
 * are freed, their parents may get added to the list.
 *
 * It's not clear yet how memory pressure will flush out dirty index blocks.
 *
 * All indexblocks for an inode are either attached under the inode via
 * the children/siblings tree, or on a per-inode 'free' list also using
 * the 'siblings' linkage.  Thus we are sure that they can be removed when
 * the inode is removed.  Note that data blocks do *not* necessarily live
 * in this tree (though they will when they are dirty).  They can always be
 * found in the radix_tree for the inode's address-space.
 *
 * STILL TO DO:
 *   Consider how 'rcu' can be used for
 *     - hash table
 *     - freelist
 *   Consider gang-freeing for adding to head of lru lists.
 */

/* FIXME these should be configured at runtime based on memory */
#define HASHBITS        10
static struct hlist_head hash_table[1<<HASHBITS];
spinlock_t lafs_hash_lock;

/* static */ struct freelists freelist;

static int lafs_shrinker(int nr_to_scan, /*gfp_t*/unsigned int gfp_mask)
{
        if (nr_to_scan) {
                LIST_HEAD(togo);
                if (!(gfp_mask & __GFP_FS))
                        return -1;
                dprintk("scan %d\n", nr_to_scan);
                spin_lock(&lafs_hash_lock);
                while (nr_to_scan && !list_empty(&freelist.lru)) {
                        struct indexblock *ib = list_entry(freelist.lru.next,
                                                           struct indexblock,
                                                           b.lru);
                        /* Check if it is really free */

                        if (atomic_read(&ib->b.refcnt)) {
                                list_del_init(&ib->b.lru);
                                clear_bit(B_OnFree, &ib->b.flags);
                                freelist.freecnt--;
                                continue;
                        }
                        if (test_bit(B_InoIdx, &ib->b.flags))
                                LAFSI(ib->b.inode)->iblock = NULL;

                        /* Any pinned iblock must have children, be
                         * refcounted, or be on a 'leaf' lru, which gives
                         * a refcount.  So this unreffed block cannot be
                         * pinned.  This BUG_ON is here because previously
                         * the above 'if' also tested for 'Pinned', and that
                         * seems unnecessary, but I want to be sure.
                         */
                        LAFS_BUG(test_bit(B_Pinned, &ib->b.flags), &ib->b);
                        if (!hlist_unhashed(&ib->hash))
                                hlist_del_init(&ib->hash);
                        /* delete from inode->free_index */
                        list_del_init(&ib->b.siblings);
                        list_move(&ib->b.lru, &togo);
                        nr_to_scan--;
                        freelist.freecnt--;
                }
                spin_unlock(&lafs_hash_lock);
                while (!list_empty(&togo)) {
                        struct indexblock *ib = list_entry(togo.next,
                                                           struct indexblock,
                                                           b.lru);
                        list_del(&ib->b.lru);
                        lafs_iblock_free(ib);
                }
        }
        return freelist.freecnt;
}

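/* Editor's note (not in the original source): with the shrinker API of
 * this era, ->shrink() is also called with nr_to_scan == 0 purely to
 * query the pool size, which is why lafs_shrinker() falls through to
 * return freelist.freecnt even when it scans nothing.
 */
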
void lafs_release_index(struct list_head *head)
{
        /* This is the other place that index blocks get freed.
         * They are freed either by memory pressure in lafs_shrinker
         * above, or when an inode is destroyed by this code.
         * Both paths need to obey the same locking rules, and to ensure
         * blocks are removed from each of siblings, lru, and hash.
         * The list passed in is an inode's free_index, linked on
         * b.siblings.  All of these blocks are also on freelist.lru.
         */
        LIST_HEAD(togo);
        spin_lock(&lafs_hash_lock);
        while (!list_empty(head)) {
                struct indexblock *ib = list_entry(head->next,
                                                   struct indexblock,
                                                   b.siblings);
                list_del_init(&ib->b.siblings);
                LAFS_BUG(!test_bit(B_OnFree, &ib->b.flags), &ib->b);
                if (!hlist_unhashed(&ib->hash))
                        hlist_del_init(&ib->hash);
                list_move(&ib->b.lru, &togo);
                freelist.freecnt--;
        }
        spin_unlock(&lafs_hash_lock);
        while (!list_empty(&togo)) {
                struct indexblock *ib = list_entry(togo.next,
                                                   struct indexblock,
                                                   b.lru);
                list_del(&ib->b.lru);
                lafs_iblock_free(ib);
        }
}

struct indexblock *
lafs_iblock_alloc(struct fs *fs, int gfp, int with_buffer, REFARG)
{
        struct indexblock *ib;

        ib = kzalloc(sizeof(*ib), gfp);
        if (!ib)
                return NULL;
        if (with_buffer) {
                ib->data = kmalloc(fs->blocksize, gfp);
                if (!ib->data) {
                        kfree(ib);
                        return NULL;
                }
        } else
                ib->data = NULL;

        ib->b.flags = 0;
        set_bit(B_Index, &ib->b.flags);
        atomic_set(&ib->b.refcnt, 1);
        add_ref(&ib->b, REF, __FILE__, __LINE__);

        INIT_LIST_HEAD(&ib->children);
        atomic_set(&ib->pincnt[0], 0);
        atomic_set(&ib->pincnt[1], 0);
        ib->uninc = NULL;
        ib->uninc_next = NULL;

        ib->b.parent = NULL;
        INIT_LIST_HEAD(&ib->b.siblings);
        INIT_LIST_HEAD(&ib->b.lru);
        INIT_LIST_HEAD(&ib->b.peers);
        INIT_HLIST_NODE(&ib->hash);
        ib->b.chain = NULL;
        ib->uninc_table.pending_cnt = 0;
        ib->uninc_table.credits = 0;
        ib->b.inode = NULL;
        return ib;
}

void lafs_iblock_free(struct indexblock *ib)
{
        kfree(ib->data);
        kfree(ib);
}

static struct shrinker hash_shrink = {
        .shrink = lafs_shrinker,
        .seeks = DEFAULT_SEEKS,
};

int lafs_ihash_init(void)
{
        int i;
        for (i = 0 ; i < (1<<HASHBITS); i++)
                INIT_HLIST_HEAD(hash_table+i);
        spin_lock_init(&lafs_hash_lock);
        INIT_LIST_HEAD(&freelist.lru);
        register_shrinker(&hash_shrink);

        return 0;
}

void lafs_ihash_free(void)
{
        unregister_shrinker(&hash_shrink);
}

static int
ihash(struct inode *inode, faddr_t addr, int depth)
{
        unsigned long h = hash_ptr(inode, BITS_PER_LONG);
        h = hash_long(h ^ addr, BITS_PER_LONG);
        return hash_long(h ^ depth, HASHBITS);
}

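/* Illustrative sketch (editor's addition, not in the original source):
 * walk the bucket that ihash() selects for one (inode, addr, depth)
 * key and report the chain length.  It uses the same 4-argument
 * hlist_for_each_entry() convention as ihash_lookup() below; the
 * function name is hypothetical and the code is compiled out.
 */
#ifdef LAFS_HASH_EXAMPLE
static int ihash_chain_len(struct inode *inode, faddr_t addr, int depth)
{
        struct indexblock *ib;
        struct hlist_node *tmp;
        int n = 0;

        spin_lock(&lafs_hash_lock);
        hlist_for_each_entry(ib, tmp, &hash_table[ihash(inode, addr, depth)],
                             hash)
                n++;
        spin_unlock(&lafs_hash_lock);
        return n;
}
#endif
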
static struct indexblock *
ihash_lookup(struct inode *inode, faddr_t addr, int depth,
             struct indexblock *new, REFARG)
{
        struct indexblock *ib = NULL;
        struct hlist_node *tmp;
        struct hlist_head *head = &hash_table[ihash(inode, addr, depth)];

        spin_lock(&lafs_hash_lock);
        hlist_for_each_entry(ib, tmp, head, hash)
                if (ib->b.fileaddr == addr && ib->b.inode == inode &&
                    ib->depth == depth)
                        break;
        if (!tmp) {
                if (new) {
                        hlist_add_head(&new->hash, head);
                        ib = new;
                } else
                        ib = NULL;
        }
        if (ib)
                getiref_locked_needsync(ib, REF);
        spin_unlock(&lafs_hash_lock);
        if (ib)
                sync_ref(&ib->b);
        return ib;
}

struct indexblock *lafs_getiref_locked(struct indexblock *ib)
{
        int ok = 0;
        if (!ib)
                return ib;
        LAFS_BUG(!test_bit(B_InoIdx, &ib->b.flags), &ib->b);
        if (spin_is_locked(&ib->b.inode->i_data.private_lock))
                ok = 1;
        if (spin_is_locked(&lafs_hash_lock))
                ok = 1;
        if (spin_is_locked(&fs_from_inode(ib->b.inode)->lock))
                ok = 1;
        LAFS_BUG(!ok, &ib->b);

        if (atomic_inc_return(&ib->b.refcnt) == 1) {
                /* First reference to an InoIdx block implies a reference
                 * to the dblock.
                 * If two different threads come here under different
                 * locks, one could exit before the ref on the dblock is
                 * taken.  However, as each thread must own an indirect
                 * reference to be here, the last indirect ref to the
                 * dblock cannot disappear before all threads have exited
                 * this function, by which time this direct ref will be
                 * active.
                 */
                struct datablock *db;
                db = getdref(LAFSI(ib->b.inode)->dblock, MKREF(iblock));
                LAFS_BUG(db == NULL, &ib->b);
        }
        return ib;
}

struct indexblock *
lafs_iblock_get(struct inode *ino, faddr_t addr, int depth, paddr_t phys, REFARG)
{
        /* Find the index block with this inode/depth/address.
         * If it isn't in the cache, create it with the given
         * phys address.
         */
        struct indexblock *ib = ihash_lookup(ino, addr, depth, NULL, REF);
        struct indexblock *new;
        if (ib)
                return ib;

        if (!phys)
                return NULL;

        new = lafs_iblock_alloc(fs_from_inode(ino), GFP_KERNEL, 1, REF);
        if (!new)
                return NULL;
        new->b.fileaddr = addr;
        new->b.inode = ino;
        new->depth = depth;
        new->b.physaddr = phys;
        set_bit(B_PhysValid, &new->b.flags);

        ib = ihash_lookup(ino, addr, depth, new, REF);
        if (ib != new)
                lafs_iblock_free(new);
        return ib;
}

void lafs_hash_iblock(struct indexblock *ib)
{
        /* This block was an InoIdx block but has just become a real
         * index block, so hash it.  There cannot be a block with this
         * address already as we haven't increased the inode depth yet.
         * Alternatively this block was recently split off another,
         * and has now been incorporated, so a lookup might need
         * to find it.  So it had better be in the hash table.
         */
        struct hlist_head *head =
                &hash_table[ihash(ib->b.inode, ib->b.fileaddr, ib->depth)];
        spin_lock(&lafs_hash_lock);
        LAFS_BUG(!hlist_unhashed(&ib->hash), &ib->b);
        hlist_add_head(&ib->hash, head);
        spin_unlock(&lafs_hash_lock);
}

void lafs_unhash_iblock(struct indexblock *ib)
{
        /* This block has been emptied and deleted and is no longer
         * wanted in the hash table.
         */
        spin_lock(&lafs_hash_lock);
        LAFS_BUG(hlist_unhashed(&ib->hash), &ib->b);
        hlist_del_init(&ib->hash);
        spin_unlock(&lafs_hash_lock);
}

/* Adopt blk into parent if possible.
 */
static void
block_adopt(struct block *blk, struct indexblock *parent)
{
        struct address_space *as;

        if (parent == NULL) {
                LAFS_BUG(!test_bit(B_Root, &blk->flags), blk);
                return;
        }

        if (blk->parent) {
                LAFS_BUG(blk->parent != parent, blk);
                return;
        }

        LAFS_BUG(!test_bit(B_Valid, &parent->b.flags), &parent->b);

        as = &blk->inode->i_data;

        spin_lock(&as->private_lock);
        if (blk->parent) {
                if (blk->parent != parent) {
                        printk("blk = %s\n", strblk(blk));
                        printk("blk->p = %s\n", strblk(&blk->parent->b));
                        printk("parent = %s\n", strblk(&parent->b));
                        lafs_dump_tree();
                }
                LAFS_BUG(blk->parent != parent, blk);
        } else {
                if (test_bit(B_EmptyIndex, &parent->b.flags)) {
                        LAFS_BUG(!test_bit(B_Index, &parent->b.flags), &parent->b);
                        LAFS_BUG(test_bit(B_InoIdx, &parent->b.flags), &parent->b);
                        LAFS_BUG(parent->b.fileaddr != parent->b.parent->b.fileaddr, &parent->b);

                        clear_bit(B_EmptyIndex, &parent->b.flags);
                }
                LAFS_BUG(parent == iblk(blk), blk);
                blk->parent = parent;
                getiref(parent, MKREF(child));
                /* Remove from the per-inode free list */
                spin_lock(&lafs_hash_lock);
                if (!test_bit(B_InoIdx, &blk->flags))
                        list_move(&blk->siblings, &parent->children);
                else {
                        /* Remove from ->free_index list */
                        list_del_init(&blk->siblings);
                        /* Having set ->parent, we take a ref on the dblock.
                         * The dblock must exist because we can only reach
                         * here from lafs_make_iblock, which can only be
                         * called while holding a ref on the dblock.
                         */
                        (void)getdref(LAFSI(blk->inode)->dblock, MKREF(pdblock));
                }
                spin_unlock(&lafs_hash_lock);
        }
        spin_unlock(&as->private_lock);
}

static int __must_check
find_block(struct datablock *b, int adopt, int async);

static int __must_check
setparent(struct datablock *blk, int async)
{
        int err = 0;
        if (blk->b.parent == NULL && !test_bit(B_Root, &blk->b.flags))
                err = find_block(blk, 1, async);
        return err;
}

int __must_check
lafs_setparent(struct datablock *blk)
{
        return setparent(blk, 0);
}

void lafs_pin_block_ph(struct block *b, int ph)
{
        struct fs *fs = fs_from_inode(b->inode);
        struct block *p;
        struct inode *ino;

        LAFS_BUG(b->parent == NULL && !test_bit(B_Root, &b->flags), b);
        if (!test_bit(B_Realloc, &b->flags) &&
            LAFSI(b->inode)->type != TypeSegmentMap)
                LAFS_BUG(!test_phase_locked(fs), b);

        ino = b->inode;
        spin_lock(&ino->i_data.private_lock);
        for (p = b ;
             p && !test_bit(B_Pinned, &p->flags) ;
             p = &(p->parent)->b) {
                set_phase(p, ph);
                if (test_bit(B_InoIdx, &p->flags)) {
                        struct datablock *db = LAFSI(p->inode)->dblock;

                        spin_unlock(&ino->i_data.private_lock);
                        lafs_inode_checkpin(p->inode);
                        ino = db->b.inode;
                        spin_lock(&ino->i_data.private_lock);
                }
        }
        spin_unlock(&ino->i_data.private_lock);
        lafs_refile(b, 0);
}

static void pin_all_children(struct indexblock *ib)
{
        /* If we cannot get new credits for phase_flip, we need to
         * pin all dirty data-block children so that they get
         * written and don't remain to require a phase_flip
         * on the next checkpoint (at which point there will be
         * no credit left).
         * So if any dirty child is found, set its phase.  We don't
         * need PinPending as it is already Dirty.  All other
         * requirements for pinning will already be met.
         */
        struct block *b;
        struct indexblock *start = NULL;
        int depth = 0;
        struct fs *fs = fs_from_inode(ib->b.inode);
        int ph = fs->phase;

        /* Locking:
         * This is called with an IOLock on ib, so it is unlikely
         * that children will be added or removed(?), but we really
         * should hold the inode private_lock anyway.
         */
        getiref(ib, MKREF(pinall));
        b = NULL;
restart:
        /* We hold a reference on ib, and on 'start' if non-NULL */
        spin_lock(&ib->b.inode->i_data.private_lock);
        b = list_prepare_entry(b, &ib->children, siblings);
        list_for_each_entry_continue(b, &ib->children, siblings) {
                /* FIXME I don't descend through inode blocks to the file below! */
                if (test_bit(B_Index, &b->flags)) {
                        /* restart down */
                        depth++;
                        getref_locked(b, MKREF(pinall));
                        spin_unlock(&ib->b.inode->i_data.private_lock);
                        putiref(ib, MKREF(pinall));
                        ib = iblk(b);
                        b = NULL;
                        goto restart;
                } else if (test_bit(B_Dirty, &b->flags) &&
                           !test_bit(B_Pinned, &b->flags)) {
                        getref_locked(b, MKREF(pinall));
                        spin_unlock(&ib->b.inode->i_data.private_lock);
                        set_phase(b, ph);
                        putref(b, MKREF(pinall));
                        /* Slightly safer to restart this level */
                        b = NULL;
                        goto restart;
                }
        }
        spin_unlock(&ib->b.inode->i_data.private_lock);
        putiref(start, MKREF(pinall));
        if (depth) {
                depth--;
                start = ib;
                ib = getiref(ib->b.parent, MKREF(pinall));
                b = &start->b;
                goto restart;
        }
        putiref(ib, MKREF(pinall));
}

static void flip_phase(struct block *b)
{
        struct indexblock *p;
        int oldphase = !!test_bit(B_Phase1, &b->flags);

        LAFS_BUG(!test_bit(B_Pinned, &b->flags), b);
        if (oldphase)
                clear_bit(B_Phase1, &b->flags);
        else
                set_bit(B_Phase1, &b->flags);

        p = b->parent;

        if (p) {
                atomic_inc(&p->pincnt[1-oldphase]);
                atomic_dec(&p->pincnt[oldphase]);
        } else
                LAFS_BUG(!test_bit(B_Root, &b->flags), b);

        /* move 'next' credits to 'here' */
        if (!test_bit(B_Credit, &b->flags) &&
            test_and_clear_bit(B_NCredit, &b->flags))
                set_bit(B_Credit, &b->flags);

        if (!test_bit(B_ICredit, &b->flags) &&
            test_and_clear_bit(B_NICredit, &b->flags))
                set_bit(B_ICredit, &b->flags);
}

/* When the pinning of a block needs to be carried across a
 * checkpoint, we need to 'flip' the phase.
 * This only applies to blocks that can be pinned by a block that
 * may not be written until the next phase.
 * That includes any index block (as it may have some children in one
 * phase and some in the next) and any space-accounting block,
 * for the same reason.
 * So index blocks need to flip, and use lafs_phase_flip.
 * TypeSegmentMap and TypeQuota also need to flip and use lafs_flip_dblock.
 * TypeInodeFile blocks don't need to be phase-flipped, though their InoIdx
 * block might.  InodeFile blocks are only pinned by metadata transactions,
 * which happen inside a checkpoint lock.
 */

void lafs_flip_dblock(struct datablock *db)
{
        /* This is an accounting block (SegmentMap or Quota)
         * which we need to write out after ->phase has changed,
         * in the tail of the checkpoint.
         * We always flip the phase and reallocate from AccountSpace.
         * If all references get dropped, it will then get unpinned
         * before the next phase finishes - we don't unpin here
         * (unlike lafs_phase_flip for index blocks).
         */
        flip_phase(&db->b);
        lafs_prealloc(&db->b, AccountSpace);
        /* Parent might need to be on a leaf list now */
        lafs_refile(&db->b.parent->b, 0);
}

void lafs_phase_flip(struct fs *fs, struct indexblock *ib)
{
        /* We are performing a checkpoint; this block has been written
         * out and now needs to be flipped into the next phase.
         *
         * This involves:
         *  - processing all uninc_next blocks into uninc_table.
         *  - adjusting counts on the parent.
         *  - moving credits from 'next' to 'this' phase.
         *  - updating block counts to include phase-delayed updates.
         */
        int oldphase = !!test_bit(B_Phase1, &ib->b.flags);
        struct block *ulist;

        dprintk("FLIP %s\n", strblk(&ib->b));

        LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
        LAFS_BUG(!test_bit(B_Index, &ib->b.flags), &ib->b);
        LAFS_BUG(ib->b.parent == NULL && !test_bit(B_Root, &ib->b.flags), &ib->b);
        LAFS_BUG(ib->uninc_table.pending_cnt, &ib->b);
        LAFS_BUG(atomic_read(&ib->pincnt[oldphase]), &ib->b);
        LAFS_BUG(!test_bit(B_IOLock, &ib->b.flags), &ib->b);

        /* If this is the only reference, the pincnts are zero,
         * and all relevant flags are clear, then don't flip
         * phase but unpin.
         */
        if (test_bit(B_Pinned, &ib->b.flags) &&
            !test_bit(B_Dirty, &ib->b.flags) &&
            !test_bit(B_Realloc, &ib->b.flags) &&
            atomic_read(&ib->b.refcnt) == 1 && /* This is us */

            ib->uninc_table.pending_cnt == 0 &&
            ib->uninc == NULL &&
            ib->uninc_next == NULL &&
            atomic_read(&ib->pincnt[0]) == 0 &&
            atomic_read(&ib->pincnt[1]) == 0 &&

            (!test_bit(B_InoIdx, &ib->b.flags) ||
             !(test_bit(B_PinPending, &LAFSI(ib->b.inode)->dblock->b.flags)
               || test_bit(B_Dirty, &LAFSI(ib->b.inode)->dblock->b.flags)
               || test_bit(B_Realloc, &LAFSI(ib->b.inode)->dblock->b.flags)))
                ) {
                if (test_and_clear_bit(B_Pinned, &ib->b.flags)) {
                        if (!test_bit(B_Root, &ib->b.flags)) {
                                atomic_dec(&ib->b.parent->pincnt[oldphase]);
                                lafs_refile(&ib->b.parent->b, 0);
                        }
                        lafs_inode_checkpin(ib->b.inode);
                }
                if (test_bit(B_InoIdx, &ib->b.flags))
                        /* maybe data block should go on a leaf list now */
                        lafs_refile(&LAFSI(ib->b.inode)->dblock->b, 0);
                lafs_iounlock_block(&ib->b);
                return;
        }

        flip_phase(&ib->b);

        if (test_bit(B_InoIdx, &ib->b.flags)) {
                struct lafs_inode *lai = LAFSI(ib->b.inode);
                struct datablock *db = lai->dblock;
                struct inode *ino = db->b.inode;

                spin_lock(&ino->i_data.private_lock);
                lai->cblocks += lai->pblocks;
                lai->pblocks = 0;
                lai->ciblocks += lai->piblocks;
                lai->piblocks = 0;
                if (lai->type == TypeInodeFile) {
                        /* Root of filesystem */
                        lai->md.fs.cblocks_used +=
                                lai->md.fs.pblocks_used;
                        lai->md.fs.pblocks_used = 0;
                }
                spin_unlock(&ino->i_data.private_lock);

                /* maybe data block needs to be on a leaf list */
                lafs_refile(&db->b, 0);
        }

        if (lafs_prealloc(&ib->b, ReleaseSpace) < 0) {
                /* Couldn't get all N*Credit, so
                 * pin all children to this phase, so the credits
                 * won't be needed.
                 */
                pin_all_children(ib);
        }

        spin_lock(&ib->b.inode->i_data.private_lock);
        ulist = ib->uninc_next;
        ib->uninc_next = NULL;
        spin_unlock(&ib->b.inode->i_data.private_lock);

        lafs_iounlock_block(&ib->b);

        if (ulist)
                lafs_dirty_iblock(ib, 0);
        while (ulist) {
                struct block *b2 = ulist;
                ulist = b2->chain;
                b2->chain = NULL;
                clear_bit(B_Uninc, &b2->flags);
                while (lafs_add_block_address(fs, b2) == 0)
                        /* We had to incorporate b2->parent, which might
                         * have split the parent; but as that will have made
                         * it dirty, there is nothing extra that we need to
                         * do here.
                         */
                        ;
                putref(b2, MKREF(uninc));
        }
        LAFS_BUG(ib->uninc_next, &ib->b);

        lafs_refile(&ib->b, 0);
        if (ib->b.parent)
                /* Parent may need to be attached to a phase_leafs now */
                lafs_refile(&ib->b.parent->b, 0);
}

/*
 * lafs_refile is called whenever we finish doing stuff
 * to a block.  lafs_refile checks the state of the block and
 * makes sure the various lists etc are correct.
 * When finished with a block we often want to drop a reference,
 * and this functionality is included in refile so that the
 * decref and associated refiling can be done under a lock.
 *
 * Key issues that lafs_refile deals with are:
 *   Make sure ->lru is on the correct list.
 *   Remove the ->parent link if no longer needed.
 *   Remove B_Lock if no longer needed.
 *   Trigger a phase change when appropriate.
 *   Remove the dir parent link if no longer needed.
 *
 * Fleshing these out a bit:
 *   possible lru lists are:
 *      phase_leafs[phase] - for dirty/pinned blocks with no pinned children
 *      clean_leafs - for blocks that are being relocated for cleaning
 *                       and that have no pinned children
 *   The parent link is only needed if this block has a refcount or one
 *     of B_Pinned, B_Dirty, B_Uninc.
 *   B_Lock is only needed if one of
 *      B_Dirty
 *    or refcount
 *  though a refcount on a pinned datablock is only significant
 *         while the current phase is locked.
 *
 *  a phase change can (and should) happen for index blocks
 *  in the 'other' phase that are
 *    not Dirty, not Alloc, pincnt[oldphase]==0, uninc-table empty
 *
 * Refiling one block may cause a change in another block which requires
 * a recursive refile.  Naturally we unroll the tail-recursion, but
 * we need to be sure there are a limited number of tails.
 * The causes of recursion are:
 *   A datablock which no longer has a PagePrivate page holds a ref
 *      on block0 of that page, which must be dropped on last put.
 *   When any block becomes unpinned, we must refile the parent.
 *   When we clear ->parent we must deref the parent.
 *   When an InoIdx block drops ->parent it is dropped from inode->iblock,
 *      so we need to drop the implied refcnt on ->dblock.
 * So there can only be 2 blocks to be recursed onto:
 *  a block0 or dblock, and a parent.
 * A block pointing to a block0 never has a parent, and an iblock has the same
 * parent as the dblock, so there can never be more than 2 outstanding blocks.
 */
int lafs_is_leaf(struct block *b, int ph)
{
        /* This is pinned and not iolocked.  Should it be on a leaf
         * list?
         */
        struct inode *myi;
        struct lafs_inode *lai;
        int rv = 1;

        if (!test_bit(B_Pinned, &b->flags) ||
            test_bit(B_Writeback, &b->flags))
                return 0;

        if (test_bit(B_Index, &b->flags)) {
                if (ph >= 0)
                        return atomic_read(&iblk(b)->pincnt[ph]) == 0;
                return (atomic_read(&iblk(b)->pincnt[0]) +
                        atomic_read(&iblk(b)->pincnt[1])) == 0;
        }

        /* This is a data block */
        myi = rcu_my_inode(dblk(b));
        if (!myi) {
                /* Non-inode data blocks are always leafs */
                return 1;
        }
        lai = LAFSI(myi);
        spin_lock(&lai->vfs_inode.i_data.private_lock);
        if (ph < 0)
                ph = !!test_bit(B_Phase1, &b->flags);
        if (lai->iblock &&
            test_bit(B_Pinned, &lai->iblock->b.flags) &&
            !!test_bit(B_Phase1, &lai->iblock->b.flags) == ph)
                /* iblock still pinned in the same phase, so
                 * this block isn't a leaf
                 */
                rv = 0;

        spin_unlock(&lai->vfs_inode.i_data.private_lock);
        rcu_iput(myi);
        return rv;
}

/*
 * This is for debugging; it is racy and probably only safe on UP.
 */
static void check_consistency(struct block *b)
{
        /* sanity tests.
         * 1/ make sure pincnt is right
         */
        if (test_bit(B_Index, &b->flags)) {
                int c[2];
                struct block *cb;
                c[0] = c[1] = 0;
                list_for_each_entry(cb, &iblk(b)->children, siblings) {
                        struct inode *myi = NULL;
                        if (test_bit(B_Pinned, &cb->flags)) {
                                int pp = !!test_bit(B_Phase1, &cb->flags);
                                c[pp]++;
                        }
                        /* InoIdx block might be pinned but is not a direct
                         * child */
                        if (!test_bit(B_Index, &cb->flags) &&
                            (myi = rcu_my_inode(dblk(cb))) != NULL &&
                            LAFSI(myi)->iblock &&
                            test_bit(B_Pinned, &LAFSI(myi)->iblock->b.flags)) {
                                int pp = !!test_bit(B_Phase1,
                                                    &LAFSI(myi)->iblock->b.flags);
                                c[pp]++;
                        }
                        rcu_iput(myi);
                }
                if (c[0] != atomic_read(&iblk(b)->pincnt[0]) ||
                    c[1] != atomic_read(&iblk(b)->pincnt[1])) {
                        printk("%d %d\n", c[0], c[1]);
                        lafs_dump_tree();
                        LAFS_BUG(1, b);
                }
        }

        if (b->parent) {
                struct indexblock *ib = b->parent;
                int c[2];
                struct block *cb;
                c[0] = c[1] = 0;
                list_for_each_entry(cb, &ib->children, siblings) {
                        struct inode *myi = NULL;
                        if (test_bit(B_Pinned, &cb->flags)) {
                                int pp = !!test_bit(B_Phase1, &cb->flags);
                                c[pp]++;
                        }
                        /* InoIdx block might be pinned but is not a direct
                         * child */
                        if (!test_bit(B_Index, &cb->flags) &&
                            (myi = rcu_my_inode(dblk(cb))) != NULL &&
                            LAFSI(myi)->iblock &&
                            test_bit(B_Pinned, &(LAFSI(myi)
                                                 ->iblock->b.flags))) {
                                int pp = !!test_bit(B_Phase1,
                                                    &LAFSI(myi)
                                                    ->iblock->b.flags);
                                c[pp]++;
                        }
                        rcu_iput(myi);
                }
                if (c[0] != atomic_read(&ib->pincnt[0]) ||
                    c[1] != atomic_read(&ib->pincnt[1])) {
                        printk("badcnt %d %d\n", c[0], c[1]);
                        LAFS_BUG(1, &ib->b);
                }
        }
}

static void set_lru(struct block *b)
{
        struct fs *fs;
        int ph = !!test_bit(B_Phase1, &b->flags);

        if (!test_bit(B_OnFree, &b->flags) && !list_empty_careful(&b->lru))
                return;
        if (test_bit(B_IOLock, &b->flags) || !lafs_is_leaf(b, ph))
                return;

        /* This is close enough to a leaf that we should put it on a list.
         * If we raced and it isn't, then it will be found and removed.
         */
        if (test_bit(B_OnFree, &b->flags)) {
                spin_lock(&lafs_hash_lock);
                if (test_and_clear_bit(B_OnFree, &b->flags)) {
                        list_del_init(&b->lru);
                        freelist.freecnt--;
                }
                spin_unlock(&lafs_hash_lock);
        }
        fs = fs_from_inode(b->inode);
        spin_lock(&fs->lock);
        if (list_empty(&b->lru)) {
                if (test_bit(B_Realloc, &b->flags))
                        list_add(&b->lru, &fs->clean_leafs);
                else {
                        ph = !!test_bit(B_Phase1, &b->flags);
                        list_add(&b->lru, &fs->phase_leafs[ph]);
                }
        }
        spin_unlock(&fs->lock);
}

void lafs_refile(struct block *b, int dec)
{
        struct block *next = NULL, *next_parent = NULL;
        struct fs *fs = NULL;
        u64 physref = 0;

        if (!b)
                return;

        fs = fs_from_inode(b->inode);
        check_consistency(b);

/* To (mostly) avoid recursion, we have a loop which may
 * walk up to the parent.
 * The main reason for holding lafs_hash_lock is that it protects
 * ->lru - i.e. all the lists. FIXME that should be fs->lock
 */
        LAFS_BUG(atomic_read(&b->refcnt) == 0, b);
        while (b) {
                int ph;
                int free_me = 0;
                struct inode *checkpin = NULL;
                struct inode *myi = NULL;
                struct datablock *db;

                set_lru(b);

                if (test_bit(B_Pinned, &b->flags) &&
                    !test_bit(B_Index, &b->flags) &&
                    !test_bit(B_PinPending, &b->flags) &&
                    !test_bit(B_Dirty, &b->flags) &&
                    !test_bit(B_Realloc, &b->flags)
                        ) {
                        /* Don't need to be Pinned any more */
                        /* FIXME do I need to lock access to ->parent */
                        lafs_checkpoint_lock(fs);
                        ph = !!test_bit(B_Phase1, &b->flags);
                        if (test_and_clear_bit(B_Pinned, &b->flags)) {
                                if (test_bit(B_Dirty, &b->flags) ||
                                    test_bit(B_Realloc, &b->flags) ||
                                    test_bit(B_PinPending, &b->flags))
                                        /* lost a race */
                                        set_phase(b, ph);
                                else if (!test_bit(B_Root, &b->flags)) {
                                        struct block *nb;
                                        atomic_dec(&b->parent->pincnt[ph]);
                                        nb = &b->parent->b;
                                        if (next_parent != nb) {
                                                LAFS_BUG(next_parent, b);
                                                next_parent = nb;
                                                atomic_inc(&nb->refcnt);
                                        }
                                }
                        }
                        lafs_checkpoint_unlock(fs);
                }

                if (dec && atomic_dec_and_lock(&b->refcnt, &b->inode->i_data.private_lock)) {
                        /* PinPending disappears with the last non-lru reference
                         * (or possibly at other times).
                         */
                        clear_bit(B_PinPending, &b->flags);

                        ph = !!test_bit(B_Phase1, &b->flags);
                        /* See if we still need to be pinned */
                        if (test_bit(B_Pinned, &b->flags) &&
                            !test_bit(B_Dirty, &b->flags) &&
                            !test_bit(B_Realloc, &b->flags)) {
                                /* Don't need to be Pinned any more */
                                if (test_and_clear_bit(B_Pinned, &b->flags)) {
                                        if (!list_empty_careful(&b->lru)) {
                                                spin_lock(&fs->lock);
                                                list_del_init(&b->lru);
                                                spin_unlock(&fs->lock);
                                        }
                                        if (test_bit(B_InoIdx, &b->flags) &&
                                            b->inode->i_nlink)
                                                checkpin = b->inode;
                                        if (!test_bit(B_Root, &b->flags)) {
                                                struct block *nb;
                                                atomic_dec(&b->parent->pincnt[ph]);
                                                if (test_bit(B_InoIdx, &b->flags))
                                                        nb = &LAFSI(b->inode)->dblock->b;
                                                else
                                                        nb = &b->parent->b;
                                                if (next_parent != nb) {
                                                        LAFS_BUG(next_parent, b);
                                                        next_parent = nb;
                                                        atomic_inc(&nb->refcnt);
                                                }
                                        }
                                }
                        }

                        /* check the ->parent link */
                        if (b->parent &&
                            !(b->flags & (
                                      (1<<B_Pinned) |
                                      (1<<B_Uninc) |
                                      (1<<B_Realloc) |
                                      (1<<B_Dirty)))
                                ) {
                                int credits;
                                /* Don't need ->parent any more */
                                if (next_parent == NULL)
                                        next_parent = &b->parent->b;
                                else if (next_parent == &b->parent->b ||
                                         next_parent->parent == b->parent) {
                                        if (atomic_dec_and_test(&b->parent->b.refcnt))
                                                LAFS_BUG(1, b);
                                } else
                                        LAFS_BUG(1, b);

                                del_ref(&b->parent->b, MKREF(child),
                                        __FILE__, __LINE__);
                                b->parent = NULL;

                                if (test_bit(B_InoIdx, &b->flags)) {
                                        /* While an InoIdx block has a parent we
                                         * hold a count on the dblock.  Now we have
                                         * dropped one, we must drop the other.
                                         */
                                        struct datablock *db = LAFSI(b->inode)->dblock;
                                        if (&db->b.parent->b != next_parent &&
                                            &db->b != next_parent) {
                                                printk("db    = %s\n", strblk(&db->b));
                                                printk("dp->p = %s\n", strblk(&db->b.parent->b));
                                                printk("np    = %s\n", strblk(next_parent));
                                                printk("b     = %s\n", strblk(b));
                                        }
                                        BUG_ON(&db->b.parent->b != next_parent && &db->b != next_parent);
                                        if (atomic_dec_and_test(&next_parent->refcnt))
                                                LAFS_BUG(1, b);
                                        next_parent = &db->b;
                                        del_ref(&db->b, MKREF(pdblock), __FILE__, __LINE__);
                                }
                                if (test_and_clear_bit(B_SegRef, &b->flags))
                                        physref = b->physaddr;

                                LAFS_BUG(test_bit(B_PrimaryRef, &b->flags), b);
                                list_del_init(&b->siblings);

                                if (test_bit(B_Index, &b->flags))
                                        list_add(&b->siblings,
                                                 &LAFSI(b->inode)->free_index);

                                if (test_and_clear_bit(B_Prealloc, &b->flags) &&
                                    b->physaddr == 0)
                                        lafs_summary_allocate(fs, b->inode, -1);
                                credits = 0;
                                if (test_and_clear_bit(B_Credit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_ICredit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_NCredit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_NICredit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_UnincCredit, &b->flags))
                                        credits++;
                                lafs_space_return(fs, credits);
                        }
                        if (test_bit(B_Index, &b->flags) &&
                            !(b->flags & (
                                      (1<<B_Pinned) |
                                      (1<<B_Uninc) |
                                      (1<<B_Root) |
                                      (1<<B_Realloc) |
                                      (1<<B_Dirty)))
                                ) {
                                if (b->parent != NULL)
                                        /* Could this be uninc - where
                                         * is the refcnt? */
                                        printk("Problem: %s\n", strblk(b));
                                LAFS_BUG(b->parent != NULL, b);
                                /* put it on the free lru */
                                spin_lock(&lafs_hash_lock);
                                if (!test_and_set_bit(B_OnFree, &b->flags)) {
                                        LAFS_BUG(!list_empty(&b->lru), b);
                                        list_move(&b->lru, &freelist.lru);
                                        freelist.freecnt++;
                                }
                                spin_unlock(&lafs_hash_lock);
                        }
                        /* The last reference to a dblock with no page
                         * requires special handling.
                         * The first block on a page must be freed;
                         * the other blocks hold a reference on that
                         * first block which must be dropped.
                         * However it is possible that a new reference was
                         * taken via _leafs.  If so, we have now cleared
                         * Pinned, so get_flushable will immediately put
                         * this again, and we should leave this cleanup
                         * until later.  Unlike the other tests, this one
                         * isn't idempotent, so we need the check on refcnt.
                         */
                        db = dblk(b);
                        if (!test_bit(B_Index, &b->flags) &&
                            !PagePrivate(db->page) &&
                            atomic_read(&b->refcnt) == 0) {
                                int bits = (PAGE_SHIFT -
                                            b->inode->i_blkbits);
                                int mask = (1<<bits) - 1;
                                int bnum = b->fileaddr & mask;

                                LAFS_BUG(next, next);
                                if (bnum) {
                                        next = &db[-bnum].b;
                                        del_ref(next, "lafs_release_0",
                                                __FILE__, __LINE__);
                                } else {
                                        free_me = 1;
                                        put_page(db->page);
                                }
                        }
                        /* The last reference to an iblock requires that we
                         * deref the dblock.  We don't need to re-check
                         * refcnt here as a racing getiref_locked will take
                         * an extra dblock reference.
                         */
                        if (test_bit(B_InoIdx, &b->flags)) {
                                LAFS_BUG(LAFSI(b->inode)->iblock
                                         != iblk(b), b);
                                LAFS_BUG(next, next);
                                next = &LAFSI(b->inode)->dblock->b;
                                del_ref(next, MKREF(iblock),
                                        __FILE__, __LINE__);
                        }

                        /* Free a delayed-release inode */
                        if (!test_bit(B_Index, &b->flags) &&
                            (myi = rcu_my_inode(dblk(b))) != NULL &&
                            (!PagePrivate(dblk(b)->page) ||
                             test_bit(I_Destroyed, &LAFSI(myi)->iflags))) {
                                dblk(b)->my_inode = NULL;
                                LAFSI(myi)->dblock = NULL;
                                /* Don't need lafs_hash_lock to clear iblock, as
                                 * new refs on iblock are only taken while holding
                                 * dblock, and we know dblock has no references.
                                 * You would need an iget to get a ref on dblock now,
                                 * and because of I_Destroyed, we know that isn't
                                 * possible.
                                 */
                                LAFSI(myi)->iblock = NULL;
                                spin_unlock(&b->inode->i_data.private_lock);
                                if (test_bit(I_Destroyed, &LAFSI(myi)->iflags))
                                        lafs_destroy_inode(myi);
                        } else
                                spin_unlock(&b->inode->i_data.private_lock);
                }
                rcu_iput(myi);

                if (physref) {
                        lafs_seg_deref(fs, physref, 0);
                        physref = 0;
                }

                if (checkpin)
                        lafs_inode_checkpin(checkpin);

                if (free_me)
                        kfree(dblk(b));
                b = NULL;
                if (next) {
                        b = next;
                        next = NULL;
                } else if (next_parent) {
                        b = next_parent;
                        next_parent = NULL;
                }
                dec = 1;
        }
}


/*
 * Create (if it doesn't already exist) the 'iblock' for an inode.
 * This is a shadow of the dblock but comes into its own if/when
 * the inode's indexing tree needs to grow.
 * Return a counted reference to the index block.
 * NOTE: we must hold a reference to ->dblock when we call
 * lafs_make_iblock.
 */
struct indexblock * __must_check
lafs_make_iblock(struct inode *ino, int adopt, int async, REFARG)
{
        struct lafs_inode *lai = LAFSI(ino);
        struct indexblock *ib = NULL, *new = NULL;
        struct fs *fs = fs_from_inode(ino);
        int err = 0;

        dprintk("MAKE_IBLOCK %d\n", (int)ino->i_ino);

        BUG_ON(lai->dblock == NULL);
        BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);
retry:
        spin_lock(&lafs_hash_lock);
        if (lai->iblock)
                ib = getiref_locked_needsync(lai->iblock, REF);
        else if (new) {
                lai->iblock = new;
                ib = new;
                new = NULL;
                (void)getdref(lai->dblock, MKREF(iblock));
        }
        spin_unlock(&lafs_hash_lock);
        if (new)
                lafs_iblock_free(new);
        if (ib) {
                sync_ref(&ib->b);
                if (adopt) {
                        err = setparent(lai->dblock, async);
                        if (err == 0)
                                block_adopt(&ib->b, lai->dblock->b.parent);
                }
                if (err == 0)
                        return ib;
                putiref(ib, REF);
                return ERR_PTR(err);
        }

        new = lafs_iblock_alloc(fs, GFP_KERNEL, 0, REF);
        if (!new)
                return ERR_PTR(-ENOMEM);

        new->b.fileaddr = 0;
        new->b.physaddr = lai->dblock->b.physaddr;
        if (test_bit(B_PhysValid, &lai->dblock->b.flags))
                set_bit(B_PhysValid, &new->b.flags);
        LAFS_BUG(!test_bit(B_Valid, &lai->dblock->b.flags), &lai->dblock->b);
        set_bit(B_Valid, &new->b.flags);
        new->b.inode = ino;
        new->depth = lai->depth;
        /* Note: this doesn't get hashed until the index
         * tree grows and this block is disconnected from
         * the inode.
         */
        set_bit(B_InoIdx, &new->b.flags);
        if (test_bit(B_Root, &lai->dblock->b.flags))
                set_bit(B_Root, &new->b.flags);
        new->b.parent = NULL;

        goto retry;
}

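/* Usage sketch (editor's addition, not in the original source): honour
 * the NOTE above by pinning ->dblock before asking for the iblock.
 * The function name and the 'example' ref name are hypothetical, and
 * the code is compiled out.
 */
#ifdef LAFS_IBLOCK_EXAMPLE
static struct indexblock *get_iblock_example(struct inode *ino)
{
        struct datablock *db = getdref(LAFSI(ino)->dblock, MKREF(example));
        struct indexblock *ib;

        if (!db)
                return ERR_PTR(-EINVAL);
        ib = lafs_make_iblock(ino, 1, 0, MKREF(example));
        putref(&db->b, MKREF(example));
        return ib;      /* counted reference, or ERR_PTR on failure */
}
#endif
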
1215 static u64
1216 leaf_lookup(void *bf, int len, u32 startaddr, u32 target, u32 *nextp)
1217 {
1218         /* buf starts with a 2byte header
1219          * if 1, then 6byte littleending indirect entries.
1220          * if 2, then 12byte extent entries
1221          */
1222         unsigned char *buf = bf;
1223         u64 p;
1224         unsigned char *cp;
1225         int elen;
1226         int hi, lo;
1227         u32 addr, next;
1228
1229         if (nextp)
1230                 *nextp = 0xffffffff;
1231
1232         if (buf[1])
1233                 return 0;
1234         switch (buf[0]) {
1235         default:
1236                 p = 0;
1237                 break;
1238
1239         case IBLK_INDIRECT: /* indirect */
1240                 dprintk("indirect lookup for %lu from %lu, len %d\n",
1241                         (unsigned long)target, (unsigned long)startaddr, len);
1242                 len -= 2;
1243                 buf += 2;
1244
1245                 if (target < startaddr)
1246                         return 0;
1247
1248                 next = target;
1249                 target -= startaddr;
1250
1251                 /* Need both tests as target could be v.large and
1252                  * multiply by 6 could overflow
1253                  */
1254                 if (target > len ||
1255                     target*6 + 6 > len)
1256                         return 0;
1257                 buf += target*6;
1258                 p = decode48(buf);
1259                 if (nextp) {
1260                         /* find the next allocated block */
1261                         u64 p2;
1262
1263                         len -= target*6;
1264                         len -= 6;
1265                         next++;
1266                         *nextp = 0xffffffff;
1267                         while (len >= 6) {
1268                                 p2 = decode48(buf);
1269                                 if (p2) {
1270                                         *nextp = next;
1271                                         break;
1272                                 }
1273                                 len -= 6;
1274                                 next++;
1275                         }
1276                 }
1277                 break;
1278
1279         case IBLK_EXTENT: /* extent */
1280                 /* 12 byte records: 6byte device, 2 byte length, 4
1281                  * byte fileaddr */
1282
1283                 len -= 2;
1284                 buf += 2;
1285                 lo = 0;
1286                 hi = len/12;
1287                 while (hi > lo+1) {
1288                         int mid = (lo+hi)/2;
1289                         cp = buf;
1290                         cp += mid*12 + 6;
1291
1292                         elen = decode16(cp);
1293                         addr = decode32(cp);
1294
1295                         if (elen && addr <= target)
1296                                 lo = mid;
1297                         else
1298                                 hi = mid;
1299                 }
1300
1301                 cp = buf;
1302                 cp += lo*12;
1303                 p = decode48(cp);
1304                 elen = decode16(cp);
1305                 addr = decode32(cp);
1306                 if (target < addr) {
1307                         p = 0;
1308                         cp -= 12;
1309                 } else if (target >= addr + elen)
1310                         p = 0;
1311                 else
1312                         p += target - addr;
1313
                if (nextp) {
                        if (lo == len/12)
                                *nextp = 0xffffffff; /* no next extent */
                        else {
                                u64 p2;
                                /* this decode48 just steps cp past the
                                 * device address; p2 itself is unused */
                                p2 = decode48(cp);
                                elen = decode16(cp);
                                addr = decode32(cp);
                                if (elen == 0)
                                        /* no more meaningful extents */
                                        *nextp = 0xffffffff;
                                else
                                        *nextp = addr;
                        }
                }
1329         }
1330         return p;
1331 }
1332
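/*
 * To make the header comment in leaf_lookup() concrete, here is a
 * sketch of decoding one 6-byte little-endian value.  It is only an
 * illustration: the real decode48/decode32/decode16 helpers come from
 * lafs.h and, as their back-to-back use above suggests, also advance
 * the cursor they are given.
 *
 *      static inline u64 example_decode48(const unsigned char *cp)
 *      {
 *              u64 v = 0;
 *              int i;
 *
 *              for (i = 5; i >= 0; i--)        // cp[0] is the LSB
 *                      v = (v << 8) | cp[i];
 *              return v;
 *      }
 *
 * So an IBLK_INDIRECT leaf is a flat array indexed by file address:
 *
 *      physaddr = example_decode48(buf + 2 + (target - startaddr) * 6);
 *
 * while an IBLK_EXTENT leaf holds sorted records of the form
 *      { u48 physaddr; u16 len; u32 fileaddr; }
 * each mapping fileaddr..fileaddr+len-1 to physaddr..physaddr+len-1.
 */
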
1333 u32 lafs_leaf_next(struct indexblock *ib, u32 start)
1334 {
1335         /* In this leaf index block, find the first address
1336          * at-or-after 'start'.  If none, return 0xFFFFFFFF
1337          */
1338         char *buf = map_iblock(ib);
1339         int blocksize = ib->b.inode->i_sb->s_blocksize;
1340         struct lafs_inode *li = LAFSI(ib->b.inode);
1341         u32 next = 0xffffffff;
1342         u64 phys;
1343
1344         if (test_bit(B_InoIdx, &ib->b.flags))
1345                 phys = leaf_lookup(buf + li->metadata_size,
1346                                    blocksize - li->metadata_size,
1347                                    ib->b.fileaddr, start, &next);
1348         else
1349                 phys = leaf_lookup(buf, blocksize,
1350                                    ib->b.fileaddr, start, &next);
1351         unmap_iblock(ib, buf);
1352         if (phys)
1353                 return start;
1354         else
1355                 return next;
1356 }
1357
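/*
 * A sketch of walking every allocated address covered by a leaf with
 * lafs_leaf_next(); the processing step is hypothetical, and the
 * corner case of 0xFFFFFFFE being a real address is ignored:
 *
 *      u32 a;
 *
 *      for (a = lafs_leaf_next(ib, 0);
 *           a != 0xFFFFFFFF;
 *           a = lafs_leaf_next(ib, a + 1))
 *              handle_address(ib, a);  // hypothetical callback
 */
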
1358 static u64
1359 index_lookup(void *bf, int len, u32 target, u32 *addrp, u32 *nextp)
1360 {
1361         unsigned char *buf = bf;
1362         int lo, hi;
1363         u32 addr;
1364         u64 p;
1365         unsigned char *cp;
1366
1367         if (buf[1] || buf[0]) {
1368                 dprintk("WARNING: not an index block %02x %02x\n",
1369                         buf[0], buf[1]);
1370                 return 0;
1371         }
1372
1373         buf += 2;
1374         len -= 2;
1375
1376         /* 10 byte records: 6 byte Device address, 4 byte file address */
1377         lo = 0; hi = (len/10);
1378         while (hi > lo+1) {
1379                 int mid = (lo+hi)/2;
1380                 cp = buf;
1381                 cp += mid*10;
1382                 p = decode48(cp);
1383                 addr = decode32(cp);
1384                 dprintk("...addr=%lu target=%lu lo=%d mid=%d hi=%d\n",
                        (unsigned long)addr, (unsigned long)target,
1386                         lo, mid, hi);
1387                 if (p && addr <= target)
1388                         lo = mid;
1389                 else
1390                         hi = mid;
1391         }
1392
1393         cp = buf;
1394         cp += lo*10;
1395         p = decode48(cp);
1396         addr = decode32(cp);
1397
1398         if (p == 0)
1399                 /* Nothing in this block */
1400                 return 0;
1401
1402         if (addr > target) {
1403                 /* target is before first address */
1404                 p = 0;
1405                 cp -= 10;
1406         } else
1407                 *addrp = addr;
1408
        if (cp + 10 <= buf + len && nextp) {
1410                 u64 p2 = decode48(cp);
1411                 u32 nxt = decode32(cp);
1412                 if (p2)
1413                         *nextp = nxt; /* address of next index block */
1414         }
1415         return p;
1416 }
1417
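/*
 * A hypothetical dump of an internal index block's 10-byte records
 * (after the 2-byte all-zero header), to illustrate index_lookup():
 *
 *      slot   physaddr (u48)   fileaddr (u32)
 *      0      4096             0
 *      1      8200             1000
 *      2      0                -            <- unused tail
 *
 * index_lookup(..., target=1500, ...) leaves the binary search at
 * slot 1 (the last slot with a non-zero physaddr and fileaddr <=
 * target), returns 8200 with *addrp = 1000, and leaves *nextp
 * untouched because the following slot has physaddr 0.
 */
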
1418 int lafs_index_empty(struct indexblock *ib)
1419 {
1420         char *buf = map_iblock(ib);
1421         int blocksize = ib->b.inode->i_sb->s_blocksize;
1422         struct lafs_inode *li = LAFSI(ib->b.inode);
1423         u32 addr;
1424         u64 phys;
1425
1426         if (test_bit(B_InoIdx, &ib->b.flags))
1427                 phys = index_lookup(buf + li->metadata_size,
1428                                     blocksize - li->metadata_size,
1429                                     0xFFFFFFFF, &addr, NULL);
1430         else
1431                 phys = index_lookup(buf, blocksize,
1432                                     0xFFFFFFFF, &addr, NULL);
1433         unmap_iblock(ib, buf);
1434
1435         return phys == 0;
1436 }
1437
1438 static struct indexblock *find_better(struct inode *ino,
1439                                       struct indexblock *ib,
1440                                       u32 addr, u32 *next, REFARG)
1441 {
        struct indexblock *rv = ib;
        /* list_for_each_entry_continue() re-evaluates its 'head'
         * argument on every iteration, so it must not be an
         * expression that depends on the iterator.
         */
        struct list_head *head = &ib->b.parent->children;

        spin_lock(&ino->i_data.private_lock);
        list_for_each_entry_continue(ib, head, b.siblings) {
1445                 if (!test_bit(B_PrimaryRef, &ib->b.flags))
1446                         break;
1447                 if (test_bit(B_EmptyIndex, &ib->b.flags))
1448                         continue;
1449                 if (ib->b.fileaddr > addr) {
1450                         if (next && *next > ib->b.fileaddr)
1451                                 *next = ib->b.fileaddr;
1452                         break;
1453                 }
1454                 /* This appears to be better. */
1455                 rv = ib;
1456         }
1457         getref_locked(&rv->b, REF);
1458         spin_unlock(&ino->i_data.private_lock);
1459         return rv;
1460 }
1461
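/*
 * Example of find_better() at work: suppose the parent's in-memory
 * children sit at file addresses 0, 100 and 200 (all carrying
 * B_PrimaryRef, none B_EmptyIndex), and the on-disk lookup for
 * addr == 150 only found the block at 0.  The scan advances rv to
 * the block at 100, which covers 150 more closely, then sees 200,
 * clamps *next down to 200, and stops.
 */
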
1462 struct indexblock *
1463 lafs_leaf_find(struct inode *inode, u32 addr, int adopt, u32 *next,
1464                int async, REFARG)
1465 {
1466         /* Find the leaf index block which refers to this address (if any do)
1467          * Returns the leaf IOLocked.
1468          * *next will be set to an address that is higher than addr such
1469          * that there are no other leafs before *next.  There might not
1470          * actually be a leaf at *next though.
1471          */
1472         struct indexblock *ib, *ib2;
1473         struct indexblock *hold = NULL;
1474         u64 iphys;
1475         u32 iaddr = 0;
1476         void *buf;
1477         struct lafs_inode *li = LAFSI(inode);
1478         int blocksize = inode->i_sb->s_blocksize;
1479         int err = 0;
1480         int offset;
1481
1482         /* Must own a reference to ->dblock */
1483         BUG_ON(li->dblock == NULL);
1484         BUG_ON(atomic_read(&li->dblock->b.refcnt) == 0);
1485
1486         /* We come back to restart if we needed to unlock to
1487          * perform IO and we cannot be sure the parent didn't change.
1488          * We hold the last seen index block in 'hold' so that if
1489          * no changes happened (the common case) we are certain
1490          * that no IO will be required to get back to where
1491          * we were.
1492          */
1493 restart:
1494         if (next)
1495                 *next = 0xFFFFFFFF;
1496
1497         ib = lafs_make_iblock(inode, adopt, async, REF);
1498         err = PTR_ERR(ib);
1499         if (IS_ERR(ib))
1500                 goto err;
1501         offset = li->metadata_size;
1502
1503         err = -EAGAIN;
1504         if (!async)
1505                 lafs_iolock_block(&ib->b);
1506         else if (!lafs_iolock_block_async(&ib->b))
1507                 goto err_ib;
1508
1509         while (ib->depth > 1) {
1510                 /* internal index block */
                /* NOTE: the index block could be empty, or could have
                 * nothing as early as 'addr'.  In that case index_lookup
                 * returns 0 as the physaddr, implying that the child is
                 * either in cache, or needs to be synthesised as an
                 * empty index block (one level down).
                 */
1517                 struct indexblock *better;
1518                 u32 target = addr;
1519
1520         retry:
1521                 buf = map_iblock(ib);
1522                 iaddr = ib->b.fileaddr;
1523                 iphys = index_lookup(buf + offset, blocksize - offset,
1524                                      target, &iaddr,
1525                                      target == addr ? next : NULL);
1526                 unmap_iblock(ib, buf);
1527
1528                 ib2 = lafs_iblock_get(inode, iaddr, ib->depth-1, iphys, REF);
1529
1530                 if (!ib2) {
1531                         err = -ENOMEM;
1532                         lafs_iounlock_block(&ib->b);
1533                         goto err_ib;
1534                 }
1535
1536                 better = find_better(inode, ib2, addr, next, REF);
1537                 putiref(ib2, REF);
1538                 ib2 = better;
1539
1540                 if (!test_bit(B_Valid, &ib2->b.flags)) {
1541                         lafs_iounlock_block(&ib->b);
1542                         err = lafs_load_block(&ib2->b, NULL);
1543                         if (err)
1544                                 goto err_ib2;
1545                         if (async)
1546                                 err = lafs_wait_block_async(&ib2->b);
1547                         else
1548                                 err = lafs_wait_block(&ib2->b);
1549                         if (err)
1550                                 goto err_ib2;
1551
1552                         err = -EAGAIN;
1553                         if (!async)
1554                                 lafs_iolock_block(&ib->b);
1555                         else if (!lafs_iolock_block_async(&ib->b))
1556                                 goto err_ib2;
1557
1558                         /* While we did IO, the parent could have been
1559                          * split, so it is now the wrong parent, or it
1560                          * could have become EmptyIndex, though that is
1561                          * much less likely.  If it did, then it must
1562                          * have had a parent, so the ref we hold means
1563                          * it still has a parent.
1564                          * So if ib->b.parent, then we might need to
1565                          * retry from the top, and holding a ref on ib
1566                          * means that we don't risk live-locking if
1567                          * memory for index blocks is very tight.
1568                          * If there is no parent, then there is
1569                          * no risk that ib changed.
1570                          */
1571                         if (ib->b.parent) {
1572                                 if (hold)
1573                                         putiref(hold, MKREF(hold));
1574                                 hold = getiref(ib, MKREF(hold));
1575                                 putiref(ib2, REF);
1576                                 putiref(ib, REF);
1577                                 goto restart;
1578                         }
1579                 }
1580
1581                 err = -EAGAIN;
1582                 if (!async)
1583                         lafs_iolock_block(&ib2->b);
1584                 else if (!lafs_iolock_block_async(&ib2->b))
1585                         goto err_ib2;
1586
1587                 if (ib2->b.fileaddr != ib->b.fileaddr &&
1588                     test_bit(B_EmptyIndex, &ib2->b.flags)) {
1589                         /* need to look earlier in the parent */
1590                         target = ib2->b.fileaddr - 1;
1591                         lafs_iounlock_block(&ib2->b);
1592                         putiref(ib2, REF);
1593                         goto retry;
1594                 }
1595
1596 #ifdef FIXME
1597                 if (!(ib2->b.flags & B_Linked)) {
1598                         struct block *b2 = peer_find(fs,
1599                                                      inode, iaddr,
1600                                                      depth, iphys);
1601                         if (b2)
1602                                 list_add(&ib2->peers, &b2->peers);
1603                         set_bit(B_Linked, &ib2->b.flags);
1604                 }
1605 #endif
1606                 if (adopt)
1607                         block_adopt(&ib2->b, ib);
1608                 lafs_iounlock_block(&ib->b);
1609                 putiref(ib, REF);
1610                 ib = ib2;
1611                 offset = 0;
1612         }
1613
1614         return ib;
1615
1616 err_ib2:
1617         putiref(ib2, REF);
1618 err_ib:
1619         putiref(ib, REF);
1620 err:
1621         if (hold)
1622                 putiref(hold, MKREF(hold));
1623         return ERR_PTR(err);
1624 }
1625
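/*
 * The descent in lafs_leaf_find() is lock-coupling on IOLocks.  A
 * minimal sketch of the pattern, with the async, retry and EmptyIndex
 * handling stripped out (pseudocode, not compilable):
 *
 *      lafs_iolock_block(node);
 *      while (node->depth > 1) {
 *              child = lookup_child(node, addr);       // may need IO
 *              lafs_iolock_block(child);
 *              lafs_iounlock_block(node);
 *              node = child;
 *      }
 *      return node;                                    // still IOLocked
 *
 * All the complications come from the "may need IO" step: the parent
 * must be dropped while the child is read in, so the tree can change
 * meanwhile; hence the 'restart' path and the 'hold' reference that
 * keeps the last-seen block in memory to avoid live-lock.
 */
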
1626 static int table_find_first(struct uninc *tbl, u32 min, u32 *addr);
1627
1628 static int __lafs_find_next(struct inode *ino, loff_t *addrp)
1629 {
1630         /* Find the next allocated block in inode at or after *addrp
1631          * and set *addrp.
1632          * Return:
         *  <0 -  error (a negative errno)
1634          *   0 -  there is no block at or after '*addrp'
1635          *   1 -  It is worth checking the block at *addrp.
1636          *   2 -  *addrp has been updated, check again.
1637          */
1638         struct lafs_inode *lai = LAFSI(ino);
1639         struct indexblock *leaf;
1640         struct fs *fs = fs_from_inode(ino);
1641         struct block *b;
1642         u32 nexti, nextd, nextl;
1643         int rv = 0;
1644         int offset;
1645         char *buf;
1646         u64 p;
1647
1648         /* Must hold a reference to ->dblock */
1649         BUG_ON(lai->dblock == NULL);
1650         BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);
1651
1652         if (lai->depth == 0) {
1653                 if (*addrp == 0)
1654                         return 1;
1655                 leaf = lafs_make_iblock(ino, NOADOPT, SYNC, MKREF(find_next));
1656                 if (IS_ERR(leaf))
1657                         return PTR_ERR(leaf);
1658                 nextd = 0xFFFFFFFF;
1659         } else {
                leaf = lafs_leaf_find(ino, *addrp, NOADOPT, &nexti, SYNC,
                                      MKREF(find_next));
                if (IS_ERR(leaf))
                        return PTR_ERR(leaf);
                offset = 0;
1664                 if (lai->depth == 1)
1665                         offset = lai->metadata_size;
1666                 buf = map_iblock(leaf);
1667                 p = leaf_lookup(buf+offset, fs->blocksize-offset,
1668                                 leaf->b.fileaddr, (u32)*addrp, &nextd);
                if (p && *addrp == 0xFFFFFFFF) {
                        /* Debug dump of the leaf before the BUG_ON
                         * below; 'buf' must still be mapped here.
                         */
                        int j;
                        printk("at %s\n", strblk(&lai->dblock->b));
                        printk("offset = %d depth=%d\n", offset,
                               lai->depth);
                        for (j = 0; j < 16; j++)
                                printk("%02x ", buf[offset+j] & 0xff);
                        for (j = 0; j < 16; j++)
                                printk("%02x ",
                                       buf[fs->blocksize-16+j] & 0xff);
                        printk("\n");
                }
                unmap_iblock(leaf, buf);
                if (p) {
                        lafs_iounlock_block(&leaf->b);
                        putiref(leaf, MKREF(find_next));
                        BUG_ON(*addrp == 0xFFFFFFFF);
                        return 1;
                }
1687                 if (nextd != 0xFFFFFFFF)
1688                         rv = 1;
1689                 else if (nexti != 0xFFFFFFFF) {
1690                         nextd = nexti;
1691                         rv = 2;
1692                 }
1693         }
1694         /* If rv==2, nextd may not be high enough as it might
1695          * be the address of an EmptyIndex block.  So only
1696          * use it if nothing better is found.
1697          */
1698         if (rv == 2)
1699                 nextl = 0xFFFFFFFF;
1700         else
1701                 nextl = nextd;
1702         list_for_each_entry(b, &leaf->children, siblings)
1703                 if (b->fileaddr >= *addrp &&
1704                     b->fileaddr <= nextl &&
1705                     test_bit(B_Valid, &b->flags)) {
1706                         nextd = nextl = b->fileaddr;
1707                         rv = 1;
1708                 }
1709
1710         if (leaf->uninc_table.pending_cnt) {
1711                 spin_lock(&leaf->b.inode->i_data.private_lock);
1712                 if (table_find_first(&leaf->uninc_table, *addrp, &nextl)) {
1713                         nextd = nextl;
1714                         rv = 1;
1715                 }
1716                 spin_unlock(&leaf->b.inode->i_data.private_lock);
1717         }
1718         lafs_iounlock_block(&leaf->b);
1719         putiref(leaf, MKREF(find_next));
1720         *addrp = nextd;
1721         return rv;
1722 }
1723
/*
 * Find the first data block at or after *bnump, and
 * store the address in *bnump.
 * Return 0 if nothing found, a negative errno on error, or
 * 1 if *bnump is a valid data block address.
 *
 * The block could be in the indexing tree, or in
 * an uninc_table, or totally unincorporated.
 * Blocks that are really part of the file, but are not in
 * the indexing tree, will still be on a ->children list
 * of an index block, and that index block will be very close
 * to the indexing tree.
 * So once we have found a suitable index block, we need to check
 * all its children.  This could possibly be optimised using
 * find_get_pages if that were exported.
 */
1740 int lafs_find_next(struct inode *ino, loff_t *bnump)
1741 {
1742         int rv;
1743         /* Need to hold a reference to dblock while calling
1744          * __lafs_find_next
1745          */
1746         struct datablock *db = lafs_inode_dblock(ino, SYNC, MKREF(find_nexti));
1747
1748         if (IS_ERR(db))
1749                 return PTR_ERR(db);
1750         while (1) {
1751                 struct datablock *b;
1752                 int hole;
1753
1754                 rv = __lafs_find_next(ino, bnump);
1755
1756                 if (rv <= 0)
1757                         break;
1758                 if (rv == 2)
1759                         continue;
1760
                b = lafs_get_block(ino, *bnump, NULL, GFP_KERNEL,
                                   MKREF(find_next));
                if (!b) {
                        /* break rather than return, so the ref on
                         * 'db' taken above is not leaked */
                        rv = -ENOMEM;
                        break;
                }

                rv = lafs_find_block(b, NOADOPT);
                if (rv) {
                        putdref(b, MKREF(find_next));
                        break;
                }
1771                 hole = (b->b.physaddr == 0 ||
1772                         !test_bit(B_PhysValid, &b->b.flags)) &&
1773                         !test_bit(B_Valid, &b->b.flags);
1774                 if (LAFSI(ino)->depth == 0 &&
1775                     b->b.fileaddr == 0)
1776                         hole = 0; /* first block lives in inode */
1777                 putdref(b, MKREF(find_next));
1778
1779                 rv = 1;
1780                 if (!hole)
1781                         break;
1782                 *bnump = (*bnump)+1;
1783         }
1784         putdref(db, MKREF(find_nexti));
1785         BUG_ON(rv > 0 && *bnump == 0xFFFFFFFF);
1786         return rv;
1787 }
1788
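/*
 * A sketch of the intended use, e.g. a SEEK_DATA-style scan (the
 * surrounding code and error choices are hypothetical):
 *
 *      loff_t bnum = start;
 *      int rv = lafs_find_next(ino, &bnum);
 *
 *      if (rv < 0)
 *              return rv;      // error from the index walk
 *      if (rv == 0)
 *              return -ENXIO;  // no data at or after 'start'
 *      // otherwise 'bnum' is the first allocated block >= start
 */
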
1789 static int table_lookup(struct uninc *tbl, u32 addr, u64 *phys)
1790 {
1791         int i;
1792         for (i = tbl->pending_cnt; i; ) {
1793                 struct addr *a = &tbl->pending_addr[--i];
1794                 if (a->fileaddr <= addr &&
1795                     a->fileaddr + a->cnt > addr) {
                        *phys = a->physaddr + (addr - a->fileaddr);
1797                         return 1;
1798                 }
1799         }
1800         return 0;
1801 }
1802
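/*
 * Worked example: with an entry
 *      { .fileaddr = 10, .physaddr = 500, .cnt = 4 }
 * covering file addresses 10..13, table_lookup(tbl, 12, &phys)
 * returns 1 and sets phys to 502, while address 14 misses and the
 * caller must fall back to the on-disk leaf.
 */
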
1803 static int table_find_first(struct uninc *tbl, u32 min, u32 *addr)
1804 {
1805         /* Find the earliest allocated block in tbl which is
1806          * at-or-after 'min' and before '*addr', and store it in
1807          * *addr, returning 1 if something was found.
1808          */
1809         int rv = 0;
1810         int i;
1811
1812         for (i = tbl->pending_cnt ; i ; ) {
1813                 struct addr *a = &tbl->pending_addr[--i];
1814                 if (a->fileaddr >= *addr)
1815                         continue;
1816                 if (a->fileaddr + a->cnt < min)
1817                         continue;
1818                 if (a->physaddr == 0)
1819                         continue;
1820                 if (a->fileaddr >= min)
1821                         *addr = a->fileaddr;
1822                 else
1823                         *addr = min;
1824                 rv = 1;
1825         }
1826         return rv;
1827 }
1828
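/*
 * Worked example: with a single entry
 *      { .fileaddr = 10, .physaddr = 500, .cnt = 4 }
 * table_find_first(tbl, 5, &addr) with addr == 20 sets addr to 10,
 * while table_find_first(tbl, 12, &addr) sets addr to 12, because
 * the extent still covers 12 even though it starts earlier.  Both
 * return 1.
 */
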
1829 static void fill_block_zero(struct datablock *db, struct datablock *inob)
1830 {
1831         struct lafs_inode *lai = LAFSI(db->b.inode);
1832         void *baddr = map_dblock(db);
1833         void *iaddr = map_dblock_2(inob);
1834         int offset = lai->metadata_size;
1835         int len = (1<<db->b.inode->i_blkbits) - offset;
1836
1837         memcpy(baddr, iaddr+offset, len);
1838         unmap_dblock_2(inob, iaddr);
1839         memset(baddr+len, 0, (1<<db->b.inode->i_blkbits)-len);
1840         unmap_dblock(db, baddr);
1841         set_bit(B_Valid, &db->b.flags);
1842 }
1843
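/*
 * fill_block_zero() reflects how a depth-0 file is laid out: block 0
 * of the file lives in the tail of the inode's block.  Roughly:
 *
 *      inode block: [ metadata      ][ data of file block 0 ]
 *                   0   metadata_size                blocksize
 *      data block:  [ data of file block 0 ][ zero padding  ]
 *                   0           blocksize - metadata_size
 */
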
1844 static int __must_check
1845 find_block(struct datablock *b, int adopt, int async)
1846 {
1847         /* Find where this block is in storage, and
1848          * set b->b.physaddr.
1849          * If the block lives in the inode, physaddr
1850          * gets set to zero, and lafs_load_block works with this.
1851          * Otherwise we walk down the index tree
1852          * Possible errors:
1853          *  -EIO
1854          *  -ENOMEM
1855          */
1856         struct lafs_inode *lai;
1857         struct indexblock *ib;
1858         struct datablock *db;
1859         void *buf;
1860         int  offset;
1861
1862         LAFS_BUG(test_bit(B_Pinned, &b->b.flags) &&
1863                  b->b.parent == NULL,
1864                  &b->b);
1865         if (b->b.inode == NULL)
1866                 return 0;
1867         lai = LAFSI(b->b.inode);
1868
1869         if (lai->depth == 0) {
1870                 /* Data is in the inode if anywhere */
1871                 if (!test_bit(B_PhysValid, &b->b.flags)) {
1872                         b->b.physaddr = 0;
1873                         set_bit(B_PhysValid, &b->b.flags);
1874                 }
1875                 db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));
1876                 if (IS_ERR(db))
1877                         return PTR_ERR(db);
1878                 if (!test_bit(B_Valid, &b->b.flags) && b->b.fileaddr == 0) {
1879                         /* Now is the best time to copy data from inode into
1880                          * datablock, as we know we have a valid inode block
1881                          * now.
1882                          */
1883                         LAFS_BUG(b->b.physaddr != 0, &b->b);
1884                         fill_block_zero(b, db);
1885                 }
1886                 if (adopt) {
1887                         ib = lafs_make_iblock(b->b.inode, adopt, async,
1888                                               MKREF(findblock));
1889                         putdref(db, MKREF(findblock));
1890                         if (IS_ERR(ib))
1891                                 return PTR_ERR(ib);
1892                         block_adopt(&b->b, ib);
1893                         putiref(ib, MKREF(findblock));
1894                 } else
1895                         putdref(db, MKREF(findblock));
1896                 return 0;
1897         }
1898
1899         if (test_bit(B_PhysValid, &b->b.flags)
1900             && (!adopt || b->b.parent || test_bit(B_Root, &b->b.flags))) {
1901                 return 0;
1902         }
1903
1904         db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));
1905         if (IS_ERR(db))
1906                 return PTR_ERR(db);
1907
1908         /* Ok, there is some indexing information we need to
1909          * look through.  Find the leaf first
1910          */
1911         ib = lafs_leaf_find(b->b.inode, b->b.fileaddr, adopt, NULL, async,
1912                             MKREF(findblock));
1913         putdref(db, MKREF(findblock));
1914
1915         if (IS_ERR(ib)) {
1916                 dprintk("lafs_leaf_find returned error to lafs_find_block\n");
1917                 goto out;
1918         }
1919
1920         if (!test_bit(B_PhysValid, &b->b.flags)
1921             && ib->uninc_table.pending_cnt) {
1922                 spin_lock(&b->b.inode->i_data.private_lock);
1923                 if (table_lookup(&ib->uninc_table, b->b.fileaddr,
1924                                  &b->b.physaddr))
1925                         set_bit(B_PhysValid, &b->b.flags);
1926                 spin_unlock(&b->b.inode->i_data.private_lock);
1927         }
1928         if (!test_bit(B_PhysValid, &b->b.flags)) {
1929                 buf = map_iblock(ib);
1930                 offset = lai->depth > 1 ? 0 : lai->metadata_size;
1931                 if (test_bit(B_InoIdx, &ib->b.flags))
1932                         dprintk("InoIdx!!!\n");
1933                 dprintk("offset %d: %02x %02x\n", offset,
1934                         ((char *)buf)[offset], ((char *)buf)[offset+1]);
1935                 b->b.physaddr = leaf_lookup(buf+offset,
1936                                             b->b.inode->i_sb->s_blocksize
1937                                             - offset,
1938                                             ib->b.fileaddr, b->b.fileaddr,
1939                                             NULL);
1940                 set_bit(B_PhysValid, &b->b.flags);
1941                 unmap_iblock(ib, buf);
1942         }
1943         dprintk("lafs_find_block: lookup for %lu of %lu at %lu found %llu\n",
1944                 (unsigned long)b->b.fileaddr,
1945                 (unsigned long)b->b.inode->i_ino,
1946                 (unsigned long)ib->b.fileaddr,
1947                 b->b.physaddr);
1948         /* FIXME should peer_find go here or in load_block? */
1949
1950         if (adopt)
1951                 block_adopt(&b->b, ib);
1952         lafs_iounlock_block(&ib->b);
1953 out:
1954         if (IS_ERR(ib))
1955                 return PTR_ERR(ib);
1956         if (ib)
1957                 putiref(ib, MKREF(findblock));
1958         return 0;
1959 }
1960
1961 int __must_check
1962 lafs_find_block(struct datablock *b, int adopt)
1963 {
1964         return find_block(b, adopt, SYNC);
1965 }
1966
1967 int __must_check
1968 lafs_find_block_async(struct datablock *b)
1969 {
1970         return find_block(b, ADOPT, ASYNC);
1971 }
1972
1973 /*
1974  * lafs_allocated_block records that a block has been written at a new
1975  * address (or found at an address during roll-forward). This involves
1976  * updating some summary information (quota etc) and making sure
 * the change gets recorded in the parent block.
1978  * This must be called before B_Dirty or B_Realloc are cleared,
1979  * to ensure that the index block is marked accordingly.
1980  *  B_UnincCredit should still be set, and we clear it.
1981  */
1982
1983 int lafs_add_block_address(struct fs *fs, struct block *blk)
1984 {
1985         struct indexblock *p = blk->parent;
1986         struct addr *a;
1987
1988         spin_lock(&blk->inode->i_data.private_lock);
1989         if (test_bit(B_Index, &blk->flags)) {
1990                 if (!test_and_set_bit(B_Uninc, &blk->flags)) {
1991                         blk->chain = p->uninc;
1992                         LAFS_BUG(p->depth == 1, blk);
1993                         p->uninc = blk;
1994                         getref(blk, MKREF(uninc));
1995                 }
1996                 spin_unlock(&blk->inode->i_data.private_lock);
1997                 return 1;
1998         }
1999
        /* Data block in the same phase as the leaf: add the
         * address to the leaf's uninc_table.
         *
         * The only place we try to merge is at the end
         * of the embedded table, and then only for data
         * blocks.
         * It is essential that uninc_table is sorted and,
         * in particular, has no overlapping extents.
         * If this block would threaten either property, we
         * incorporate the table first.
         */
2009
2010         LAFS_BUG(!test_bit(B_Dirty, &p->b.flags) &&
2011                  !test_bit(B_Realloc, &p->b.flags), blk);
2012
        /* 'a' initially points at the last pending entry; it is
         * only dereferenced once pending_cnt >= 1 has been checked.
         */
        a = &p->uninc_table.pending_addr
                [p->uninc_table.pending_cnt-1];
        if (p->uninc_table.pending_cnt >= 1 &&
            a->fileaddr+a->cnt == blk->fileaddr &&
            a->physaddr != 0 &&
            a->physaddr+a->cnt == blk->physaddr) {
2019                 a->cnt++;
2020                 if (test_and_clear_bit(B_UnincCredit, &blk->flags))
2021                         p->uninc_table.credits++;
2022                 spin_unlock(&blk->inode->i_data.private_lock);
2023                 return 1;
2024         } else if ((p->uninc_table.pending_cnt == 0 ||
2025                     a->fileaddr + a->cnt <= blk->fileaddr) &&
2026                    /* properly ordered, maybe add */
2027                    (p->uninc_table.pending_cnt <
2028                     ARRAY_SIZE(p->uninc_table.pending_addr))) {
2029                 a++;
2030                 a->fileaddr = blk->fileaddr;
2031                 a->physaddr = blk->physaddr;
2032                 a->cnt = 1;
2033                 p->uninc_table.pending_cnt++;
2034                 if (test_and_clear_bit(B_UnincCredit,
2035                                        &blk->flags))
2036                         p->uninc_table.credits++;
2037                 spin_unlock(&blk->inode->i_data.private_lock);
2038
2039                 return 1;
2040         } else {
2041                 spin_unlock(&blk->inode->i_data.private_lock);
2042                 /* We own a ref through blk->parent now, but
2043                  * after lafs_incorporate, we might not
2044                  */
                getiref(p, MKREF(addblock));
                lafs_iolock_written(&p->b);
                lafs_incorporate(fs, p);
                lafs_iounlock_block(&p->b);
                putiref(p, MKREF(addblock));
2050                 return 0;
2051         }
2052 }
2053
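/*
 * Worked example of the merge case above: if the last pending entry is
 *      { .fileaddr = 10, .physaddr = 500, .cnt = 4 }
 * and a data block arrives with fileaddr 14 and physaddr 504, both the
 * file and device ranges extend contiguously, so the entry simply
 * becomes cnt = 5 rather than consuming a new slot.
 */
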
2054 int lafs_allocated_block(struct fs *fs, struct block *blk, u64 phys)
2055 {
2056         int ph;
2057         struct indexblock *p;
2058
        dprintk("Allocated %s -> %llu\n", strblk(blk), phys);
2060
        /* If phys is 0, I don't need uninc credit.
         * Also, index blocks that are not nearly full do not
         * need UnincCredits.
         */
2064         LAFS_BUG(test_bit(B_InoIdx, &blk->flags), blk);
2065         if (!test_bit(B_Dirty, &blk->flags) &&
2066             !test_bit(B_Realloc, &blk->flags) &&
2067             phys != 0)
2068                 printk("something missing %s\n", strblk(blk));
2069         LAFS_BUG(!test_bit(B_UnincCredit, &blk->flags) && phys &&
2070                  !test_bit(B_Index, &blk->flags), blk);
2071         LAFS_BUG(!test_bit(B_Dirty, &blk->flags) &&
2072                  !test_bit(B_Realloc, &blk->flags) &&
2073                  phys != 0, blk);
2074
2075         LAFS_BUG(!test_bit(B_Writeback, &blk->flags), blk);
2076
2077         if (test_bit(B_Root, &blk->flags)) {
2078                 int i;
2079                 struct lafs_inode *lai;
2080
2081                 LAFS_BUG(test_bit(B_Index, &blk->flags), blk);
2082
2083                 for (i = 0; i < fs->maxsnapshot; i++)
2084                         if (fs->ss[i].root == blk->inode)
2085                                 break;
2086                 LAFS_BUG(i == fs->maxsnapshot, blk);
2087                 blk->physaddr = phys; /* superblock doesn't get
2088                                          counted in summaries */
2089                 LAFS_BUG(test_bit(B_SegRef, &blk->flags), blk);
2090                 set_bit(B_PhysValid, &blk->flags);
2091                 fs->ss[i].root_addr = phys;
2092                 lai = LAFSI(blk->inode);
2093                 if (i == 0 && lai->md.fs.usagetable > 1)
2094                         /* snapshot checkpoint is happening.  Record root
2095                          * address in main fs and in snapshot
2096                          */
2097                         fs->ss[lai->md.fs.usagetable-1].root_addr = phys;
2098
2099                 /* Don't bother to clear the B_UnincCredit in the root.
2100                  * It'll just get set again.  This way there is less churn.
2101                  */
2102                 return 0;
2103         }
2104         if (test_bit(FinalCheckpoint, &fs->fsstate) &&
2105             !test_bit(B_Index, &blk->flags) &&
2106             (LAFSI(blk->inode)->type == TypeSegmentMap ||
2107              LAFSI(blk->inode)->type == TypeQuota)) {
2108                 /* These blocks will be picked up by roll-forward,
2109                  * so don't need to have their address recorded,
2110                  * and doing so would leave the index blocks
2111                  * dirty after the checkpoint.
2112                  * So just return
2113                  */
2114                 if (test_and_clear_bit(B_SegRef, &blk->flags))
2115                         lafs_seg_deref(fs, blk->physaddr, 0);
2116                 blk->physaddr = phys;
2117                 set_bit(B_PhysValid, &blk->flags);
2118                 return 0;
2119         }
2120         p = blk->parent;
2121
2122         LAFS_BUG(!p, blk);
2123
2124         /* We need to work out if this block is in the current phase or
2125          * the next one.  This can only be an issue if a phase change
2126          * (aka checkpoint) is underway. But if it is, we need to
2127          * ensure the right phase is used
2128          */
2129         lafs_checkpoint_lock(fs);
2130         if (test_bit(B_Pinned, &blk->flags))
2131                 ph = !!test_bit(B_Phase1, &blk->flags);
2132         else if (test_bit(B_Index, &blk->flags))
                LAFS_BUG(1, blk); /* Index blocks must always be pinned when dirty */
2134         else if (p->uninc_table.pending_cnt) {
2135                 /* attach data block into previous phase as incorporation of
2136                  * this phase is still to happen.
2137                  * This is needed for flush-if-cannot-preallocate to work.
2138                  */
2139                 LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), &p->b);
2140                 ph = !!test_bit(B_Phase1, &p->b.flags);
2141                 lafs_pin_block_ph(blk, ph);
2142         } else {
2143                 lafs_pin_block(blk);
2144                 ph = !!test_bit(B_Phase1, &blk->flags);
2145         }
2146         lafs_checkpoint_unlock(fs);
2147         if (!test_bit(B_PhysValid, &blk->flags))
2148                 blk->physaddr = 0;
2149         lafs_summary_update(fs, blk->inode, blk->physaddr, phys,
2150                             test_bit(B_Index, &blk->flags), ph,
2151                             test_bit(B_SegRef, &blk->flags));
2152         clear_bit(B_Prealloc, &blk->flags);
2153
2154         blk->physaddr = phys;
2155         set_bit(B_PhysValid, &blk->flags);
2156
2157         if (ph != !!test_bit(B_Phase1, &p->b.flags)) {
2158                 /* in the next phase */
2159                 dprintk("uninc next phase\n");
2160                 if (test_and_set_bit(B_Uninc, &blk->flags))
2161                         /* This block was already on the 'unincorporated'
2162                          * list, (it must be changing quickly) so there
2163                          * is nothing else to do.
2164                          */
2165                         return 0;
2166                 getref(blk, MKREF(uninc));
2167
2168                 spin_lock(&blk->inode->i_data.private_lock);
2169                 blk->chain = p->uninc_next;
2170                 p->uninc_next = blk;
2171                 spin_unlock(&blk->inode->i_data.private_lock);
2172                 /* We will try again after the phase change.
2173                  * That will be when we clean B_UnincCredit
2174                  */
2175                 return 0;
2176         }
2177
2178 new_parent:
2179         lafs_dirty_iblock(p, !(test_bit(B_Dirty, &blk->flags) || phys == 0));
2180         dprintk("Uninc same phase\n");
2181
2182         BUG_ON(!test_bit(B_Pinned, &blk->parent->b.flags));
2183
2184         if (!lafs_add_block_address(fs, blk)) {
2185                 p = blk->parent;
2186
2187                 LAFS_BUG(!p, blk);
2188                 LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), blk);
2189                 goto new_parent;
2190         }
2191
2192         /* That credit has now been added to the uninc_table and
2193          * thus made available to the parent
2194          */
2195
2196         lafs_refile(&p->b, 0);
2197         return 0;
2198 }