3 * Handle index blocks for LaFS.
5 * Copyright (C) 2005-2009
6 * Neil Brown <neilb@suse.de>
7 * Released under the GPL, version 2
11 #include <linux/types.h>
12 #include <linux/hash.h>
13 #include <linux/slab.h>
15 * For each index-block there is an 'indexblock' data structure,
16 * and a 'block' of memory, which is a page or part thereof.
18 * Index-blocks are stored in a hash table, indexed by
19 * (struct inode *, fileaddr, depth). There is one hash table
20 * for all filesystems.
22 * There is a single free-list for all index blocks from all
23 * filesystems, of all sizes. Index blocks only get on this list when
24 * they are unpinned leafs, i.e. their 'children' list is empty. As they
25 * are freed, their parents may get added to the list.
27 * It's not clear yet how memory pressure will flush out dirty index blocks.
29 * All indexblocks for an inode are either attached under the inode via
30 * the children/siblings tree, or on a per-inode 'free' list also using
31 * the 'siblings' linkage. Thus we are sure that they can be removed when
32 * the inode is removed. Note that data blocks do *not* necessarily live
33 * in this tree (though they will when they are dirty). They can always be
34 * found in the radix_tree for the inode's address-space.
37 * Consider how 'rcu' can be used for
40 * Consider gang-freeing for adding to head of lru lists.
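 *
 * For orientation, a condensed sketch of the structures this file
 * manipulates. This is not the authoritative definition (that lives
 * in the shared header); every field shown is inferred from its use
 * below:
 *
 *	struct indexblock {
 *		struct block	b;		// part shared with data blocks:
 *						// flags, refcnt, lru, siblings,
 *						// parent, inode, fileaddr,
 *						// physaddr, peers, chain
 *		struct hlist_node hash;		// (inode, fileaddr, depth) hash
 *		struct list_head children;	// adopted child blocks
 *		atomic_t	pincnt[2];	// pinned children, per phase
 *		int		depth;		// 1 == leaf index block
 *		void		*data;		// blocksize bytes of index data
 *		struct block	*uninc;		// unincorporated index children
 *		struct block	*uninc_next;	// children for the next phase
 *		struct uninc	uninc_table;	// pending addresses + credits
 *	};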
43 /* FIXME these should be configured at runtime based on memory */
45 static struct hlist_head hash_table[1<<HASHBITS];
46 spinlock_t lafs_hash_lock;
48 /* static */ struct freelists freelist;
50 static int lafs_shrinker(int nr_to_scan, /*gfp_t*/unsigned int gfp_mask)
54 if (!(gfp_mask & __GFP_FS))
56 dprintk("scan %d\n", nr_to_scan);
57 spin_lock(&lafs_hash_lock);
58 while (nr_to_scan && !list_empty(&freelist.lru)) {
59 struct indexblock *ib = list_entry(freelist.lru.next,
62 /* Check if it is really free */
64 if (atomic_read(&ib->b.refcnt)) {
65 list_del_init(&ib->b.lru);
66 clear_bit(B_OnFree, &ib->b.flags);
70 if (test_bit(B_InoIdx, &ib->b.flags))
71 LAFSI(ib->b.inode)->iblock = NULL;
73 /* Any pinned iblock must have children, be
74 * refcounted, or be on a 'leaf' lru, which gives
75 * a refcount. So this unreffed block cannot be
76 * pinned. This BUG_ON is here because previously
77 * the above 'if' also tested for 'Pinned', and that
78 * seems unnecessary, but I want to be sure.
80 LAFS_BUG(test_bit(B_Pinned, &ib->b.flags), &ib->b);
81 if (!hlist_unhashed(&ib->hash))
82 hlist_del_init(&ib->hash);
83 /* delete from inode->free_index */
84 list_del_init(&ib->b.siblings);
85 list_move(&ib->b.lru, &togo);
89 spin_unlock(&lafs_hash_lock);
90 while (!list_empty(&togo)) {
91 struct indexblock *ib = list_entry(togo.next,
98 return freelist.freecnt;
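/*
 * Note on the (old-style) shrinker contract assumed above; this
 * description of the API is inferred, not stated in this file: the
 * core may call with nr_to_scan == 0 purely as a query, and the
 * return value reports how many entries remain available for
 * freeing - hence returning freelist.freecnt.
 */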
101 void lafs_release_index(struct list_head *head)
103 /* This is the other place that index blocks get freed.
104 * They are freed either by memory pressure in lafs_shrinker
105 * above, or when an inode is destroyed by this code.
106 * They need to both obey the same locking rules, and ensure
107 * blocks are removed from each of siblings, lru, and hash.
108 * The list passed in is an inode's free_index; the blocks are linked
109 * on b.siblings. They are all on the freelist.lru
112 spin_lock(&lafs_hash_lock);
113 while (!list_empty(head)) {
114 struct indexblock *ib = list_entry(head->next,
117 list_del_init(&ib->b.siblings);
118 LAFS_BUG(!test_bit(B_OnFree, &ib->b.flags), &ib->b);
119 if (!hlist_unhashed(&ib->hash))
120 hlist_del_init(&ib->hash);
121 list_move(&ib->b.lru, &togo);
124 spin_unlock(&lafs_hash_lock);
125 while (!list_empty(&togo)) {
126 struct indexblock *ib = list_entry(togo.next,
129 list_del(&ib->b.lru);
130 lafs_iblock_free(ib);
135 lafs_iblock_alloc(struct fs *fs, int gfp, int with_buffer, REFARG)
137 struct indexblock *ib;
139 ib = kzalloc(sizeof(*ib), gfp);
143 ib->data = kmalloc(fs->blocksize, gfp);
152 set_bit(B_Index, &ib->b.flags);
153 atomic_set(&ib->b.refcnt, 1);
154 add_ref(&ib->b, REF, __FILE__, __LINE__);
156 INIT_LIST_HEAD(&ib->children);
157 atomic_set(&ib->pincnt[0], 0);
158 atomic_set(&ib->pincnt[1], 0);
160 ib->uninc_next = NULL;
163 INIT_LIST_HEAD(&ib->b.siblings);
164 INIT_LIST_HEAD(&ib->b.lru);
165 INIT_LIST_HEAD(&ib->b.peers);
166 INIT_HLIST_NODE(&ib->hash);
168 ib->uninc_table.pending_cnt = 0;
169 ib->uninc_table.credits = 0;
174 void lafs_iblock_free(struct indexblock *ib)
180 static struct shrinker hash_shrink = {
181 .shrink = lafs_shrinker,
182 .seeks = DEFAULT_SEEKS,
185 int lafs_ihash_init(void)
188 for (i = 0 ; i < (1<<HASHBITS); i++)
189 INIT_HLIST_HEAD(hash_table+i);
190 spin_lock_init(&lafs_hash_lock);
191 INIT_LIST_HEAD(&freelist.lru);
192 register_shrinker(&hash_shrink);
197 void lafs_ihash_free(void)
199 unregister_shrinker(&hash_shrink);
203 ihash(struct inode *inode, faddr_t addr, int depth)
205 unsigned long h = hash_ptr(inode, BITS_PER_LONG);
206 h = hash_long(h ^ addr, BITS_PER_LONG);
207 return hash_long(h ^ depth, HASHBITS);
210 static struct indexblock *
211 ihash_lookup(struct inode *inode, faddr_t addr, int depth,
212 struct indexblock *new, REFARG)
214 struct indexblock *ib = NULL;
215 struct hlist_node *tmp;
216 struct hlist_head *head = &hash_table[ihash(inode, addr, depth)];
218 spin_lock(&lafs_hash_lock);
219 hlist_for_each_entry(ib, tmp, head, hash)
220 if (ib->b.fileaddr == addr && ib->b.inode == inode &&
225 hlist_add_head(&new->hash, head);
231 getiref_locked_needsync(ib, REF);
232 spin_unlock(&lafs_hash_lock);
238 struct indexblock *lafs_getiref_locked(struct indexblock *ib)
243 LAFS_BUG(!test_bit(B_InoIdx, &ib->b.flags), &ib->b);
244 if (spin_is_locked(&ib->b.inode->i_data.private_lock))
246 if (spin_is_locked(&lafs_hash_lock))
248 if (spin_is_locked(&fs_from_inode(ib->b.inode)->lock))
250 LAFS_BUG(!ok, &ib->b);
252 if (atomic_inc_return(&ib->b.refcnt) == 1) {
253 /* First reference to an InoIdx block implies a reference to the dblock.
255 * If two different threads come here under different locks, one could
256 * exit before the ref on the dblock is taken. However as each
257 * thread must own an indirect reference to be here, the last indirect ref to
258 * the dblock cannot disappear before all threads have exited this function,
259 * by which time this direct ref will be active.
261 struct datablock *db;
262 db = getdref(LAFSI(ib->b.inode)->dblock, MKREF(iblock));
263 LAFS_BUG(db == NULL, &ib->b);
269 lafs_iblock_get(struct inode *ino, faddr_t addr, int depth, paddr_t phys, REFARG)
271 /* Find the index block with this inode/depth/address.
272 * If it isn't in the cache, create it with the given physical address.
275 struct indexblock *ib = ihash_lookup(ino, addr, depth, NULL, REF);
276 struct indexblock *new;
283 new = lafs_iblock_alloc(fs_from_inode(ino), GFP_KERNEL, 1, REF);
286 new->b.fileaddr = addr;
289 new->b.physaddr = phys;
290 set_bit(B_PhysValid, &new->b.flags);
292 ib = ihash_lookup(ino, addr, depth, new, REF);
294 lafs_iblock_free(new);
298 void lafs_hash_iblock(struct indexblock *ib)
300 /* This block was an InoIdx block but has just become a real
301 * index block, so hash it. There cannot be a block with this
302 * address already as we haven't increased the inode depth yet.
303 * Alternatively, this block was recently split off another,
304 * and has now been incorporated, so a lookup might need
305 * to find it. So it better be in the hash table.
307 struct hlist_head *head =
308 &hash_table[ihash(ib->b.inode, ib->b.fileaddr, ib->depth)];
309 spin_lock(&lafs_hash_lock);
310 LAFS_BUG(!hlist_unhashed(&ib->hash), &ib->b);
311 hlist_add_head(&ib->hash, head);
312 spin_unlock(&lafs_hash_lock);
315 void lafs_unhash_iblock(struct indexblock *ib)
317 /* This block has been emptied and deleted and is no longer
318 * wanted in the hash tables
320 spin_lock(&lafs_hash_lock);
321 LAFS_BUG(hlist_unhashed(&ib->hash), &ib->b);
322 hlist_del_init(&ib->hash);
323 spin_unlock(&lafs_hash_lock);
326 /* adopt blk into parent if possible.
329 block_adopt(struct block *blk, struct indexblock *parent)
331 struct address_space *as;
333 if (parent == NULL) {
334 LAFS_BUG(!test_bit(B_Root, &blk->flags), blk);
339 LAFS_BUG(blk->parent != parent, blk);
343 LAFS_BUG(!test_bit(B_Valid, &parent->b.flags), &parent->b);
345 as = &blk->inode->i_data;
347 spin_lock(&as->private_lock);
349 if (blk->parent != parent) {
350 printk("blk = %s\n", strblk(blk));
351 printk("blk->p = %s\n", strblk(&blk->parent->b));
352 printk("parent = %s\n", strblk(&parent->b));
355 LAFS_BUG(blk->parent != parent, blk);
357 if (test_bit(B_EmptyIndex, &parent->b.flags)) {
358 LAFS_BUG(!test_bit(B_Index, &parent->b.flags), &parent->b);
359 LAFS_BUG(test_bit(B_InoIdx, &parent->b.flags), &parent->b);
360 LAFS_BUG(parent->b.fileaddr != parent->b.parent->b.fileaddr, &parent->b);
362 clear_bit(B_EmptyIndex, &parent->b.flags);
364 LAFS_BUG(parent == iblk(blk), blk);
365 blk->parent = parent;
366 getiref(parent, MKREF(child));
367 /* Remove from the per-inode free list */
368 spin_lock(&lafs_hash_lock);
369 if (!test_bit(B_InoIdx, &blk->flags))
370 list_move(&blk->siblings, &parent->children);
372 /* Remove from ->free_index list */
373 list_del_init(&blk->siblings);
374 /* Having set ->parent, we take a ref on the dblock.
375 * The dblock must exist because we can only reach here from
376 * lafs_make_iblock which can only be called while holding a ref
379 (void)getdref(LAFSI(blk->inode)->dblock, MKREF(pdblock));
381 spin_unlock(&lafs_hash_lock);
383 spin_unlock(&as->private_lock);
386 static int __must_check
387 find_block(struct datablock *b, int adopt, int async);
389 static int __must_check
390 setparent(struct datablock *blk, int async)
393 if (blk->b.parent == NULL && !test_bit(B_Root, &blk->b.flags))
394 err = find_block(blk, 1, async);
399 lafs_setparent(struct datablock *blk)
401 return setparent(blk, 0);
404 void lafs_pin_block_ph(struct block *b, int ph)
406 struct fs *fs = fs_from_inode(b->inode);
410 LAFS_BUG(b->parent == NULL && !test_bit(B_Root, &b->flags), b);
411 if (!test_bit(B_Realloc, &b->flags) &&
412 LAFSI(b->inode)->type != TypeSegmentMap)
413 LAFS_BUG(!test_phase_locked(fs), b);
416 spin_lock(&ino->i_data.private_lock);
418 p && !test_bit(B_Pinned, &p->flags) ;
419 p = &(p->parent)->b) {
421 if (test_bit(B_InoIdx, &p->flags)) {
422 struct datablock *db = LAFSI(p->inode)->dblock;
424 spin_unlock(&ino->i_data.private_lock);
425 lafs_inode_checkpin(p->inode);
427 spin_lock(&ino->i_data.private_lock);
430 spin_unlock(&ino->i_data.private_lock);
434 static void pin_all_children(struct indexblock *ib)
436 /* If we cannot get new credits for phase_flip, we need to
437 * pin all dirty data-block children so that they get
438 * written and don't remain to require a phase_flip
439 * on the next checkpoint (at which point there will be
441 * So if any dirty child is found, set phase. We don't
442 * need PinPending as it is already Dirty. All other
443 * requirements for pinning will already be met.
446 struct indexblock *start = NULL;
448 struct fs *fs = fs_from_inode(ib->b.inode);
452 * This is called with an IOLock on ib, so it is unlikely
453 * that children will be added or removed(?), but we really
454 * should hold the inode private_lock anyway.
456 getiref(ib, MKREF(pinall));
459 /* We hold a reference on ib, and on 'start' if non-NULL */
460 spin_lock(&ib->b.inode->i_data.private_lock);
461 b = list_prepare_entry(b, &ib->children, siblings);
462 list_for_each_entry_continue(b, &ib->children, siblings) {
463 /* FIXME I don't descend through inode blocks to the file below! */
464 if (test_bit(B_Index, &b->flags)) {
467 getref_locked(b, MKREF(pinall));
468 spin_unlock(&ib->b.inode->i_data.private_lock);
469 putiref(ib, MKREF(pinall));
473 } else if (test_bit(B_Dirty, &b->flags) &&
474 !test_bit(B_Pinned, &b->flags)) {
475 getref_locked(b, MKREF(pinall));
476 spin_unlock(&ib->b.inode->i_data.private_lock);
478 putref(b, MKREF(pinall));
479 /* Slightly safer to restart this level */
484 spin_unlock(&ib->b.inode->i_data.private_lock);
485 putiref(start, MKREF(pinall));
489 ib = getiref(ib->b.parent, MKREF(pinall));
493 putiref(ib, MKREF(pinall));
496 static void flip_phase(struct block *b)
498 struct indexblock *p;
499 int oldphase = !!test_bit(B_Phase1, &b->flags);
501 LAFS_BUG(!test_bit(B_Pinned, &b->flags), b);
503 clear_bit(B_Phase1, &b->flags);
505 set_bit(B_Phase1, &b->flags);
510 atomic_inc(&p->pincnt[1-oldphase]);
511 atomic_dec(&p->pincnt[oldphase]);
513 LAFS_BUG(!test_bit(B_Root, &b->flags), b);
515 /* move 'next' credits to 'here' */
516 if (!test_bit(B_Credit, &b->flags) &&
517 test_and_clear_bit(B_NCredit, &b->flags))
518 set_bit(B_Credit, &b->flags);
520 if (!test_bit(B_ICredit, &b->flags) &&
521 test_and_clear_bit(B_NICredit, &b->flags))
522 set_bit(B_ICredit, &b->flags);
525 /* When the pinning of a block needs to be carried across a
526 * checkpoint, we need to 'flip' the phase.
527 * This only applies to blocks that can be pinned by a block that
528 * may not be written until the next phase.
529 * This includes any index block (as it may have some children in one
530 * phase and some in the next) and any space-accounting block
531 * for the same reason.
532 * So indexblocks will need to flip, and use lafs_phase_flip.
533 * TypeSegmentMap and TypeQuota also need to flip and use lafs_flip_dblock.
534 * TypeInodeFile blocks don't need to be phase_flipped, though their InoIdx
535 * block might. InodeFile blocks are only pinned by metadata transactions
536 * which happen inside a checkpoint lock.
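 *
 * A minimal sketch of that selection rule (illustrative only;
 * 'needs_flip' is a hypothetical helper, but the flags, LAFSI() and
 * the Type* constants are the ones used throughout this file):
 *
 *	static int needs_flip(struct block *b)
 *	{
 *		if (test_bit(B_Index, &b->flags))
 *			return 1;		// lafs_phase_flip
 *		if (LAFSI(b->inode)->type == TypeSegmentMap ||
 *		    LAFSI(b->inode)->type == TypeQuota)
 *			return 1;		// lafs_flip_dblock
 *		return 0;			// e.g. TypeInodeFile data
 *	}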
539 void lafs_flip_dblock(struct datablock *db)
541 /* This is an accounting block (SegmentMap or Quota)
542 * which we need to write out after ->phase has changed
543 * in the tail of the checkpoint.
544 * We always flip the phase and reallocate from AccountSpace.
545 * If all references get dropped, it will then get unpinned
546 * before the next phase finishes - we don't unpin here
547 * (unlike lafs_phase_flip for index blocks).
550 lafs_prealloc(&db->b, AccountSpace);
551 /* Parent might need to be on a leaflist now */
552 lafs_refile(&db->b.parent->b, 0);
555 void lafs_phase_flip(struct fs *fs, struct indexblock *ib)
557 /* We are performing a checkpoint; this block has been written
558 * out and now needs to be flipped into the next phase.
561 * - Processing all uninc_next blocks into uninc_table.
562 * - adjusting counts on parent
563 * - moving credits from 'next' to 'this' phase.
564 * - updating block counts to include phase-delayed updates.
566 int oldphase = !!test_bit(B_Phase1, &ib->b.flags);
569 dprintk("FLIP %s\n", strblk(&ib->b));
571 LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
572 LAFS_BUG(!test_bit(B_Index, &ib->b.flags), &ib->b);
573 LAFS_BUG(ib->b.parent == NULL && !test_bit(B_Root, &ib->b.flags), &ib->b);
574 LAFS_BUG(ib->uninc_table.pending_cnt, &ib->b);
575 LAFS_BUG(atomic_read(&ib->pincnt[oldphase]), &ib->b);
576 LAFS_BUG(!test_bit(B_IOLock, &ib->b.flags), &ib->b);
578 /* If this is the only reference, and pincnts are zero
579 * and all relevant flags are clear, then don't flip
582 if (test_bit(B_Pinned, &ib->b.flags) &&
583 !test_bit(B_Dirty, &ib->b.flags) &&
584 !test_bit(B_Realloc, &ib->b.flags) &&
585 atomic_read(&ib->b.refcnt) == 1 && /* This is us */
587 ib->uninc_table.pending_cnt == 0 &&
589 ib->uninc_next == NULL &&
590 atomic_read(&ib->pincnt[0]) == 0 &&
591 atomic_read(&ib->pincnt[1]) == 0 &&
593 (!test_bit(B_InoIdx, &ib->b.flags) ||
594 !(test_bit(B_PinPending, &LAFSI(ib->b.inode)->dblock->b.flags)
595 || test_bit(B_Dirty, &LAFSI(ib->b.inode)->dblock->b.flags)
596 || test_bit(B_Realloc, &LAFSI(ib->b.inode)->dblock->b.flags)))
598 if (test_and_clear_bit(B_Pinned, &ib->b.flags)) {
599 if (!test_bit(B_Root, &ib->b.flags)) {
600 atomic_dec(&ib->b.parent->pincnt[oldphase]);
601 lafs_refile(&ib->b.parent->b, 0);
603 lafs_inode_checkpin(ib->b.inode);
605 if (test_bit(B_InoIdx, &ib->b.flags))
606 /* maybe data block should go in leaf list now */
607 lafs_refile(&LAFSI(ib->b.inode)->dblock->b, 0);
608 lafs_iounlock_block(&ib->b);
614 if (test_bit(B_InoIdx, &ib->b.flags)) {
615 struct lafs_inode *lai = LAFSI(ib->b.inode);
616 struct datablock *db = lai->dblock;
617 struct inode *ino = db->b.inode;
619 spin_lock(&ino->i_data.private_lock);
620 lai->cblocks += lai->pblocks;
622 lai->ciblocks += lai->piblocks;
624 if (lai->type == TypeInodeFile) {
625 /* Root of filesystem */
626 lai->md.fs.cblocks_used +=
627 lai->md.fs.pblocks_used;
628 lai->md.fs.pblocks_used = 0;
630 spin_unlock(&ino->i_data.private_lock);
632 /* maybe data block needs to be on leaf list */
633 lafs_refile(&db->b, 0);
636 if (lafs_prealloc(&ib->b, ReleaseSpace) < 0) {
637 /* Couldn't get all N*Credit, so
638 * Pin all children to this phase, so they
641 pin_all_children(ib);
644 spin_lock(&ib->b.inode->i_data.private_lock);
645 ulist = ib->uninc_next;
646 ib->uninc_next = NULL;
647 spin_unlock(&ib->b.inode->i_data.private_lock);
649 lafs_iounlock_block(&ib->b);
652 lafs_dirty_iblock(ib);
654 struct block *b2 = ulist;
657 clear_bit(B_Uninc, &b2->flags);
658 while (lafs_add_block_address(fs, b2) == 0)
659 /* We had to incorporate b2->parent which might
660 * have split the parent but as that will have made
661 * it dirty, there is nothing extra that we need to
665 putref(b2, MKREF(uninc));
667 LAFS_BUG(ib->uninc_next, &ib->b);
669 lafs_refile(&ib->b, 0);
671 /* Parent may need to be attached to a phase_leafs now */
672 lafs_refile(&ib->b.parent->b, 0);
676 * lafs_refile is called whenever we finish doing stuff
677 * to a block. lafs_refile checks the state of the block and
678 * makes sure various lists etc are correct.
679 * When finished with a block we often want to drop a reference,
680 * and this functionality is included in refile so that the
681 * decref and associated refiling can be done under a lock.
683 * Key issues that lafs_refile deals with are:
684 * Make sure ->lru is on correct list.
685 * Remove ->parent link if no longer needed.
686 * remove B_Lock if no longer needed
687 * trigger a phase change when appropriate
688 * Remove dir parent link if no longer needed.
690 * Fleshing these out a bit:
691 * possible lru lists are:
692 * phase_leafs[phase] - for dirty/pinned blocks with no pinned children
693 * clean_leafs - for blocks that are being relocated for cleaning
694 * and that have no pinned children
695 * The parent link is only needed if this block has a refcount or one
696 * of B_Pinned, B_Dirty, B_Uninc
697 * B_Lock is only needed if one of
700 * though a refcount on a pinned datablock is only significant
701 * while the current phase is locked.
703 * a phase change can (and should) happen for index blocks
704 * in the 'other' phase that are
705 * not Dirty, not Alloc, pincnt[oldphase]==0, uninc-table empty
708 * Refiling one block may cause a change in another block which requires
709 * a recursive refile. Naturally we unroll the tail-recursion, but we
710 * need to be sure there are a limited number of tails.
711 * The causes of recursion are:
712 * A datablock which no longer has a PagePrivate page holds a ref
713 * on block0 of that page, which must be dropped on last put.
714 * When any block becomes unpinned, we must refile the parent.
715 * When we clear ->parent we must deref the parent.
716 * When an InoIdx block drops ->parent it is dropped from inode->iblock
717 * so we need to drop the implied refcnt on ->dblock
718 * So there can only be 2 blocks to be recursed onto:
719 * a block0 or dblock, and a parent.
720 * A block pointing to a block0 never has a parent, and an iblock has the same
721 * parent as the dblock, so there can never be more than 2 outstanding blocks.
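 *
 * In miniature, the loop shape that unrolls that recursion (purely
 * illustrative; the real loop in lafs_refile below carries much more
 * state):
 *
 *	while (b) {
 *		struct block *next = NULL, *next_parent = NULL;
 *		// refile b; this may record a block0/dblock in 'next'
 *		// and/or a parent needing a deref in 'next_parent'
 *		if (next)
 *			b = next;		// block0/dblock first
 *		else if (next_parent)
 *			b = next_parent;	// then walk up
 *		else
 *			b = NULL;		// nothing left to refile
 *	}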
723 int lafs_is_leaf(struct block *b, int ph)
725 /* This is pinned and not iolocked. Should it be on a leaf list?
729 struct lafs_inode *lai;
732 if (!test_bit(B_Pinned, &b->flags) ||
733 test_bit(B_Writeback, &b->flags))
736 if (test_bit(B_Index, &b->flags)) {
738 return atomic_read(&iblk(b)->pincnt[ph]) == 0;
739 return (atomic_read(&iblk(b)->pincnt[0]) +
740 atomic_read(&iblk(b)->pincnt[1])) == 0;
743 /* This is a data block */
744 myi = rcu_my_inode(dblk(b));
746 /* Non-inode data blocks are always leafs */
750 spin_lock(&lai->vfs_inode.i_data.private_lock);
752 ph = !!test_bit(B_Phase1, &b->flags);
754 test_bit(B_Pinned, &lai->iblock->b.flags) &&
755 !!test_bit(B_Phase1, &lai->iblock->b.flags) == ph)
756 /* iblock still pinned in the same phase, so
757 * this block isn't a leaf
761 spin_unlock(&lai->vfs_inode.i_data.private_lock);
767 * This is for debugging; it is racy and probably only safe on UP
769 static void check_consistency(struct block *b)
772 * 1/ make sure pincnt is right
774 if (test_bit(B_Index, &b->flags)) {
778 list_for_each_entry(cb, &iblk(b)->children, siblings) {
779 struct inode *myi = NULL;
780 if (test_bit(B_Pinned, &cb->flags)) {
781 int pp = !!test_bit(B_Phase1, &cb->flags);
784 /* InoIdx block might be pinned but is not a direct child.
786 if (!test_bit(B_Index, &cb->flags) &&
787 (myi = rcu_my_inode(dblk(cb))) != NULL &&
788 LAFSI(myi)->iblock &&
789 test_bit(B_Pinned, &LAFSI(myi)->iblock->b.flags)) {
790 int pp = !!test_bit(B_Phase1,
791 &LAFSI(myi)->iblock->b.flags);
796 if (c[0] != atomic_read(&iblk(b)->pincnt[0]) ||
797 c[1] != atomic_read(&iblk(b)->pincnt[1])) {
798 printk("%d %d\n", c[0], c[1]);
805 struct indexblock *ib = b->parent;
809 list_for_each_entry(cb, &ib->children, siblings) {
810 struct inode *myi = NULL;
811 if (test_bit(B_Pinned, &cb->flags)) {
812 int pp = !!test_bit(B_Phase1, &cb->flags);
815 /* InoIdx block might be pinned but is not a direct child.
817 if (!test_bit(B_Index, &cb->flags) &&
818 (myi = rcu_my_inode(dblk(cb))) != NULL &&
819 LAFSI(myi)->iblock &&
820 test_bit(B_Pinned, &(LAFSI(myi)
821 ->iblock->b.flags))) {
822 int pp = !!test_bit(B_Phase1,
829 if (c[0] != atomic_read(&ib->pincnt[0]) ||
830 c[1] != atomic_read(&ib->pincnt[1])) {
831 printk("badcnt %d %d\n", c[0], c[1]);
837 static void set_lru(struct block *b)
840 int ph = !!test_bit(B_Phase1, &b->flags);
842 if (!test_bit(B_OnFree, &b->flags) && !list_empty_careful(&b->lru))
844 if (test_bit(B_IOLock, &b->flags) || !lafs_is_leaf(b, ph))
847 /* This is close enough to a leaf that we should put it on a list.
848 * If we raced and it isn't then it will be found and removed
850 if (test_bit(B_OnFree, &b->flags)) {
851 spin_lock(&lafs_hash_lock);
852 if (test_and_clear_bit(B_OnFree, &b->flags)) {
853 list_del_init(&b->lru);
856 spin_unlock(&lafs_hash_lock);
858 fs = fs_from_inode(b->inode);
859 spin_lock(&fs->lock);
860 if (list_empty(&b->lru)) {
861 if (test_bit(B_Realloc, &b->flags))
862 list_add(&b->lru, &fs->clean_leafs);
864 ph = !!test_bit(B_Phase1, &b->flags);
865 list_add(&b->lru, &fs->phase_leafs[ph]);
869 spin_unlock(&fs->lock);
870 /* allow lafs_iolock_block-empty to complete */
874 void lafs_refile(struct block *b, int dec)
876 struct block *next = NULL, *next_parent = NULL;
877 struct fs *fs = NULL;
883 fs = fs_from_inode(b->inode);
884 check_consistency(b);
886 /* To (mostly) avoid recursion, we have a loop which may
887 * walk up to the parent.
888 * The main reason for holding lafs_hash_lock is that it protects
889 * ->lru - i.e. all the lists. FIXME that should be fs->lock
891 LAFS_BUG(atomic_read(&b->refcnt) == 0, b);
895 struct inode *checkpin = NULL;
896 struct inode *myi = NULL;
897 struct datablock *db;
901 if (test_bit(B_Pinned, &b->flags) &&
902 !test_bit(B_Index, &b->flags) &&
903 !test_bit(B_PinPending, &b->flags) &&
904 !test_bit(B_Dirty, &b->flags) &&
905 !test_bit(B_Realloc, &b->flags)
907 /* Don't need to be Pinned any more */
908 /* FIXME do I need to lock access to ->parent */
909 lafs_checkpoint_lock(fs);
910 ph = !!test_bit(B_Phase1, &b->flags);
911 if (test_and_clear_bit(B_Pinned, &b->flags)) {
912 if (test_bit(B_Dirty, &b->flags) ||
913 test_bit(B_Realloc, &b->flags) ||
914 test_bit(B_PinPending, &b->flags))
917 else if (!test_bit(B_Root, &b->flags)) {
919 atomic_dec(&b->parent->pincnt[ph]);
921 if (next_parent != nb) {
922 LAFS_BUG(next_parent, b);
924 atomic_inc(&nb->refcnt);
928 lafs_checkpoint_unlock(fs);
931 if (dec && atomic_dec_and_lock(&b->refcnt, &b->inode->i_data.private_lock)) {
932 /* PinPending disappears with the last non-lru reference
933 * (or possibly at other times).
935 clear_bit(B_PinPending, &b->flags);
937 ph = !!test_bit(B_Phase1, &b->flags);
938 /* See if we still need to be pinned */
939 if (test_bit(B_Pinned, &b->flags) &&
940 !test_bit(B_Dirty, &b->flags) &&
941 !test_bit(B_Realloc, &b->flags)) {
942 /* Don't need to be Pinned any more */
943 if (test_and_clear_bit(B_Pinned, &b->flags)) {
944 if (!list_empty_careful(&b->lru)) {
945 spin_lock(&fs->lock);
946 list_del_init(&b->lru);
947 spin_unlock(&fs->lock);
949 if (test_bit(B_InoIdx, &b->flags) &&
952 if (!test_bit(B_Root, &b->flags)) {
954 atomic_dec(&b->parent->pincnt[ph]);
955 if (test_bit(B_InoIdx, &b->flags))
956 nb = &LAFSI(b->inode)->dblock->b;
959 if (next_parent != nb) {
960 LAFS_BUG(next_parent, b);
962 atomic_inc(&nb->refcnt);
968 /* check the ->parent link */
977 /* Don't need ->parent any more */
978 if (next_parent == NULL)
979 next_parent = &b->parent->b;
980 else if (next_parent == &b->parent->b ||
981 next_parent->parent == b->parent) {
982 if (atomic_dec_and_test(&b->parent->b.refcnt))
987 del_ref(&b->parent->b, MKREF(child),
991 if (test_bit(B_InoIdx, &b->flags)) {
992 /* While an InoIdx has a parent we hold a count on
993 * the dblock. Now we have dropped one, we must drop the
996 struct datablock *db = LAFSI(b->inode)->dblock;
997 if (&db->b.parent->b != next_parent &&
998 &db->b != next_parent) {
999 printk("db = %s\n", strblk(&db->b));
1000 printk("dp->p = %s\n", strblk(&db->b.parent->b));
1001 printk("np = %s\n", strblk(next_parent));
1002 printk("b = %s\n", strblk(b));
1004 BUG_ON(&db->b.parent->b != next_parent && &db->b != next_parent);
1005 if (atomic_dec_and_test(&next_parent->refcnt))
1007 next_parent = &db->b;
1008 del_ref(&db->b, MKREF(pdblock), __FILE__, __LINE__);
1010 if (test_and_clear_bit(B_SegRef, &b->flags))
1011 physref = b->physaddr;
1013 LAFS_BUG(test_bit(B_PrimaryRef, &b->flags), b);
1014 list_del_init(&b->siblings);
1016 if (test_bit(B_Index, &b->flags))
1017 list_add(&b->siblings,
1018 &LAFSI(b->inode)->free_index);
1020 if (test_and_clear_bit(B_Prealloc, &b->flags) &&
1022 lafs_summary_allocate(fs, b->inode, -1);
1024 if (test_and_clear_bit(B_Credit, &b->flags))
1026 if (test_and_clear_bit(B_ICredit, &b->flags))
1028 if (test_and_clear_bit(B_NCredit, &b->flags))
1030 if (test_and_clear_bit(B_NICredit, &b->flags))
1032 if (test_and_clear_bit(B_UnincCredit, &b->flags))
1034 lafs_space_return(fs, credits);
1036 if (test_bit(B_Index, &b->flags) &&
1044 if (b->parent != NULL)
1045 /* Could this be uninc - where
1047 printk("Problem: %s\n", strblk(b));
1048 LAFS_BUG(b->parent != NULL, b);
1049 /* put it on the lru */
1050 spin_lock(&lafs_hash_lock);
1051 if (!test_and_set_bit(B_OnFree, &b->flags)) {
1052 LAFS_BUG(!list_empty(&b->lru), b);
1053 list_move(&b->lru, &freelist.lru);
1056 spin_unlock(&lafs_hash_lock);
1058 /* last reference to a dblock with no page
1059 * requires special handling.
1060 * The first block on a page must be freed,
1061 * the other blocks hold a reference on that
1062 * first block which must be dropped.
1063 * However it is possible that a new reference was taken
1064 * via _leafs. If so we have now cleared Pinned so
1065 * get_flushable will immediately put this again,
1066 * so we should leave this cleanup to later.
1067 * Unlike other tests, this one isn't idempotent, so
1068 * need the check on refcnt
1071 if (!test_bit(B_Index, &b->flags) &&
1072 !PagePrivate(db->page) &&
1073 atomic_read(&b->refcnt) == 0) {
1074 int bits = (PAGE_SHIFT -
1075 b->inode->i_blkbits);
1076 int mask = (1<<bits) - 1;
1077 int bnum = b->fileaddr & mask;
1079 LAFS_BUG(next, next);
1081 next = &db[-bnum].b;
1082 del_ref(next, "lafs_release_0",
1083 __FILE__, __LINE__);
1089 /* last reference to an iblock requires that we
1090 * deref the dblock. We don't need to re-check
1091 * refcnt here as a racing getiref_locked will take an
1092 * extra dblock reference.
1094 if (test_bit(B_InoIdx, &b->flags)) {
1095 LAFS_BUG(LAFSI(b->inode)->iblock
1097 LAFS_BUG(next, next);
1098 next = &LAFSI(b->inode)->dblock->b;
1099 del_ref(next, MKREF(iblock),
1100 __FILE__, __LINE__);
1103 /* Free a delayed-release inode */
1104 if (!test_bit(B_Index, &b->flags) &&
1105 (myi = rcu_my_inode(dblk(b))) != NULL &&
1106 (!PagePrivate(dblk(b)->page) ||
1107 test_bit(I_Destroyed, &LAFSI(myi)->iflags))) {
1108 dblk(b)->my_inode = NULL;
1109 LAFSI(myi)->dblock = NULL;
1110 /* Don't need lafs_hash_lock to clear iblock as
1111 * new refs on iblock are only taken while holding
1112 * dblock, and we know dblock has no references.
1113 * You would need an iget to get a ref on dblock now,
1114 * and because I_Destroyed, we know that isn't possible.
1116 LAFSI(myi)->iblock = NULL;
1117 spin_unlock(&b->inode->i_data.private_lock);
1118 if (test_bit(I_Destroyed, &LAFSI(myi)->iflags))
1119 lafs_destroy_inode(myi);
1121 spin_unlock(&b->inode->i_data.private_lock);
1126 lafs_seg_deref(fs, physref, 0);
1131 lafs_inode_checkpin(checkpin);
1139 } else if (next_parent) {
1148 * create (if it doesn't already exist) the 'iblock' for an inode.
1149 * This is a shadow of the dblock but comes into its own if/when
1150 * the inode's indexing tree needs to grow.
1151 * Return a counted reference to the index block.
1152 * NOTE: we must hold a reference to ->dblock when we call
1155 struct indexblock * __must_check
1156 lafs_make_iblock(struct inode *ino, int adopt, int async, REFARG)
1158 struct lafs_inode *lai = LAFSI(ino);
1159 struct indexblock *ib, *new = NULL;
1160 struct fs *fs = fs_from_inode(ino);
1163 dprintk("MAKE_IBLOCK %d\n", (int)ino->i_ino);
1165 BUG_ON(lai->dblock == NULL);
1166 BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);
1168 spin_lock(&lafs_hash_lock);
1170 ib = getiref_locked_needsync(lai->iblock, REF);
1175 (void)getdref(lai->dblock, MKREF(iblock));
1177 spin_unlock(&lafs_hash_lock);
1179 lafs_iblock_free(new);
1183 err = setparent(lai->dblock, async);
1185 block_adopt(&ib->b, lai->dblock->b.parent);
1190 return ERR_PTR(err);
1193 new = lafs_iblock_alloc(fs, GFP_KERNEL, 0, REF);
1195 return ERR_PTR(-ENOMEM);
1197 new->b.fileaddr = 0;
1198 new->b.physaddr = lai->dblock->b.physaddr;
1199 if (test_bit(B_PhysValid, &lai->dblock->b.flags))
1200 set_bit(B_PhysValid, &new->b.flags);
1201 LAFS_BUG(!test_bit(B_Valid, &lai->dblock->b.flags), &lai->dblock->b);
1202 set_bit(B_Valid, &new->b.flags);
1204 new->depth = lai->depth;
1205 /* Note: this doesn't get hashed until the index
1206 * tree grows and this block is disconnected from
1209 set_bit(B_InoIdx, &new->b.flags);
1210 if (test_bit(B_Root, &lai->dblock->b.flags))
1211 set_bit(B_Root, &new->b.flags);
1212 new->b.parent = NULL;
1218 leaf_lookup(void *bf, int len, u32 startaddr, u32 target, u32 *nextp)
1220 /* buf starts with a 2-byte header:
1221 * if 1, then 6-byte little-endian indirect entries;
1222 * if 2, then 12-byte extent entries.
1224 unsigned char *buf = bf;
1232 *nextp = 0xffffffff;
1241 case IBLK_INDIRECT: /* indirect */
1242 dprintk("indirect lookup for %lu from %lu, len %d\n",
1243 (unsigned long)target, (unsigned long)startaddr, len);
1247 if (target < startaddr)
1251 target -= startaddr;
1253 /* Need both tests as target could be very large and
1254 * multiplying by 6 could overflow
1262 /* find the next allocated block */
1268 *nextp = 0xffffffff;
1281 case IBLK_EXTENT: /* extent */
1282 /* 12 byte records: 6 byte device, 2 byte length, 4 byte file address.
1290 int mid = (lo+hi)/2;
1294 elen = decode16(cp);
1295 addr = decode32(cp);
1297 if (elen && addr <= target)
1306 elen = decode16(cp);
1307 addr = decode32(cp);
1308 if (target < addr) {
1311 } else if (target >= addr + elen)
1318 *nextp = 0xffffffff; /* next extent */
1322 elen = decode16(cp);
1323 addr = decode32(cp);
1325 *nextp = 0xffffffff; /* no more meaningful
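/*
 * For reference, the two leaf entry layouts handled above, restated
 * with explicit offsets (illustrative; the real code uses the
 * decode16/decode32/decode48 helpers, whose cursor-advancing calling
 * convention is inferred from their use in this file):
 *
 *	IBLK_INDIRECT:	hdr[2]  then  dev[6] * N
 *		entry N is the 6-byte little-endian device address of
 *		file block (startaddr + N); an address of 0 reads as
 *		a hole
 *	IBLK_EXTENT:	hdr[2]  then  { dev[6] | len[2] | fileaddr[4] } * N
 *		records are sorted by file address and do not overlap,
 *		so the binary search above applies
 */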
1335 u32 lafs_leaf_next(struct indexblock *ib, u32 start)
1337 /* In this leaf index block, find the first address
1338 * at-or-after 'start'. If none, return 0xFFFFFFFF
1340 char *buf = map_iblock(ib);
1341 int blocksize = ib->b.inode->i_sb->s_blocksize;
1342 struct lafs_inode *li = LAFSI(ib->b.inode);
1343 u32 next = 0xffffffff;
1346 if (test_bit(B_InoIdx, &ib->b.flags))
1347 phys = leaf_lookup(buf + li->metadata_size,
1348 blocksize - li->metadata_size,
1349 ib->b.fileaddr, start, &next);
1351 phys = leaf_lookup(buf, blocksize,
1352 ib->b.fileaddr, start, &next);
1353 unmap_iblock(ib, buf);
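/*
 * Hypothetical walk over every allocated address in one leaf using
 * lafs_leaf_next() (illustrative only, not code from this file):
 *
 *	u32 a = ib->b.fileaddr;
 *	while ((a = lafs_leaf_next(ib, a)) != 0xFFFFFFFF) {
 *		// 'a' is an allocated file address in this leaf
 *		a += 1;		// continue past it
 *	}
 */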
1361 index_lookup(void *bf, int len, u32 target, u32 *addrp, u32 *nextp)
1363 unsigned char *buf = bf;
1369 if (buf[1] || buf[0]) {
1370 dprintk("WARNING: not an index block %02x %02x\n",
1378 /* 10 byte records: 6 byte Device address, 4 byte file address */
1379 lo = 0; hi = (len/10);
1381 int mid = (lo+hi)/2;
1385 addr = decode32(cp);
1386 dprintk("...addr=%lu target=%lu lo=%d mid=%d hi=%d\n",
1387 (unsigned long)addr, (unsigned long)target,
1389 if (p && addr <= target)
1398 addr = decode32(cp);
1401 /* Nothing in this block */
1404 if (addr > target) {
1405 /* target is before first address */
1411 if (cp+10 <= (unsigned char *) buf + len && nextp) {
1412 u64 p2 = decode48(cp);
1413 u32 nxt = decode32(cp);
1415 *nextp = nxt; /* address of next index block */
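/*
 * So, in contrast to leaf blocks, internal index blocks carry a zero
 * 2-byte header (the check at the top of index_lookup rejects anything
 * else) followed by sorted 10-byte records:
 *
 *	{ dev[6] | fileaddr[4] } * N
 *
 * i.e. the 6-byte device address of a child index block plus the
 * 4-byte file address where that child's coverage starts; the binary
 * search above relies on the file addresses increasing.
 */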
1420 int lafs_index_empty(struct indexblock *ib)
1422 char *buf = map_iblock(ib);
1423 int blocksize = ib->b.inode->i_sb->s_blocksize;
1424 struct lafs_inode *li = LAFSI(ib->b.inode);
1428 if (test_bit(B_InoIdx, &ib->b.flags))
1429 phys = index_lookup(buf + li->metadata_size,
1430 blocksize - li->metadata_size,
1431 0xFFFFFFFF, &addr, NULL);
1433 phys = index_lookup(buf, blocksize,
1434 0xFFFFFFFF, &addr, NULL);
1435 unmap_iblock(ib, buf);
1440 static struct indexblock *find_better(struct inode *ino,
1441 struct indexblock *ib,
1442 u32 addr, u32 *next, REFARG)
1444 struct indexblock *rv = ib;
1445 spin_lock(&ino->i_data.private_lock);
1446 list_for_each_entry_continue(ib, &ib->b.parent->children, b.siblings) {
1447 if (!test_bit(B_PrimaryRef, &ib->b.flags))
1449 if (test_bit(B_EmptyIndex, &ib->b.flags))
1451 if (ib->b.fileaddr > addr) {
1452 if (next && *next > ib->b.fileaddr)
1453 *next = ib->b.fileaddr;
1456 /* This appears to be better. */
1459 getref_locked(&rv->b, REF);
1460 spin_unlock(&ino->i_data.private_lock);
1465 lafs_leaf_find(struct inode *inode, u32 addr, int adopt, u32 *next,
1468 /* Find the leaf index block which refers to this address (if any do)
1469 * Returns the leaf IOLocked.
1470 * *next will be set to an address that is higher than addr such
1471 * that there are no other leafs before *next. There might not
1472 * actually be a leaf at *next though.
1474 struct indexblock *ib, *ib2;
1475 struct indexblock *hold = NULL;
1479 struct lafs_inode *li = LAFSI(inode);
1480 int blocksize = inode->i_sb->s_blocksize;
1484 /* Must own a reference to ->dblock */
1485 BUG_ON(li->dblock == NULL);
1486 BUG_ON(atomic_read(&li->dblock->b.refcnt) == 0);
1488 /* We come back to restart if we needed to unlock to
1489 * perform IO and we cannot be sure the parent didn't change.
1490 * We hold the last seen index block in 'hold' so that if
1491 * no changes happened (the common case) we are certain
1492 * that no IO will be required to get back to where we were.
1499 ib = lafs_make_iblock(inode, adopt, async, REF);
1503 offset = li->metadata_size;
1507 lafs_iolock_block(&ib->b);
1508 else if (!lafs_iolock_block_async(&ib->b))
1511 while (ib->depth > 1) {
1512 /* internal index block */
1513 /* NOTE: index block could be empty, or could have
1514 * nothing as early as 'addr'. In that case '0' is
1515 * returned implying that the block is either in
1516 * cache, or needs to be synthesised as an empty
1517 * index block (one level down).
1519 struct indexblock *better;
1523 buf = map_iblock(ib);
1524 iaddr = ib->b.fileaddr;
1525 iphys = index_lookup(buf + offset, blocksize - offset,
1527 target == addr ? next : NULL);
1528 unmap_iblock(ib, buf);
1530 ib2 = lafs_iblock_get(inode, iaddr, ib->depth-1, iphys, REF);
1534 lafs_iounlock_block(&ib->b);
1538 better = find_better(inode, ib2, addr, next, REF);
1542 if (!test_bit(B_Valid, &ib2->b.flags)) {
1543 lafs_iounlock_block(&ib->b);
1544 err = lafs_load_block(&ib2->b, NULL);
1548 err = lafs_wait_block_async(&ib2->b);
1550 err = lafs_wait_block(&ib2->b);
1556 lafs_iolock_block(&ib->b);
1557 else if (!lafs_iolock_block_async(&ib->b))
1560 /* While we did IO, the parent could have been
1561 * split, so it is now the wrong parent, or it
1562 * could have become EmptyIndex, though that is
1563 * much less likely. If it did, then it must
1564 * have had a parent, so the ref we hold means
1565 * it still has a parent.
1566 * So if ib->b.parent, then we might need to
1567 * retry from the top, and holding a ref on ib
1568 * means that we don't risk live-locking if
1569 * memory for index blocks is very tight.
1570 * If there is no parent, then there is
1571 * no risk that ib changed.
1575 putiref(hold, MKREF(hold));
1576 hold = getiref(ib, MKREF(hold));
1585 lafs_iolock_block(&ib2->b);
1586 else if (!lafs_iolock_block_async(&ib2->b))
1589 if (ib2->b.fileaddr != ib->b.fileaddr &&
1590 test_bit(B_EmptyIndex, &ib2->b.flags)) {
1591 /* need to look earlier in the parent */
1592 target = ib2->b.fileaddr - 1;
1593 lafs_iounlock_block(&ib2->b);
1599 if (!test_bit(B_Linked, &ib2->b.flags)) {
1600 struct block *b2 = peer_find(fs,
1604 list_add(&ib2->peers, &b2->peers);
1605 set_bit(B_Linked, &ib2->b.flags);
1609 block_adopt(&ib2->b, ib);
1610 lafs_iounlock_block(&ib->b);
1624 putiref(hold, MKREF(hold));
1625 return ERR_PTR(err);
1628 static int table_find_first(struct uninc *tbl, u32 min, u32 *addr);
1630 static int __lafs_find_next(struct inode *ino, loff_t *addrp)
1632 /* Find the next allocated block in inode at or after *addrp
1636 * 0 - there is no block at or after '*addrp'
1637 * 1 - It is worth checking the block at *addrp.
1638 * 2 - *addrp has been updated, check again.
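 *
 * Illustrative caller protocol for those return codes (hypothetical
 * snippet, not code from this file; lafs_find_next below is the real
 * caller):
 *
 *	loff_t addr = 0;
 *	int rv;
 *	do
 *		rv = __lafs_find_next(ino, &addr);
 *	while (rv == 2);	// addr was advanced, ask again
 *	if (rv == 1)
 *		;		// inspect the block at 'addr' - it may
 *				// still turn out to be a hole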
1640 struct lafs_inode *lai = LAFSI(ino);
1641 struct indexblock *leaf;
1642 struct fs *fs = fs_from_inode(ino);
1644 u32 nexti, nextd, nextl;
1650 /* Must hold a reference to ->dblock */
1651 BUG_ON(lai->dblock == NULL);
1652 BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);
1654 if (lai->depth == 0) {
1657 leaf = lafs_make_iblock(ino, NOADOPT, SYNC, MKREF(find_next));
1659 return PTR_ERR(leaf);
1662 leaf = lafs_leaf_find(ino, *addrp, NOADOPT, &nexti, SYNC,
1664 /* FIXME what if 'leaf' is an error?? */
1666 if (lai->depth == 1)
1667 offset = lai->metadata_size;
1668 buf = map_iblock(leaf);
1669 p = leaf_lookup(buf+offset, fs->blocksize-offset,
1670 leaf->b.fileaddr, (u32)*addrp, &nextd);
1671 unmap_iblock(leaf, buf);
1673 lafs_iounlock_block(&leaf->b);
1674 putiref(leaf, MKREF(find_next));
1675 if (*addrp == 0xFFFFFFFF) {
1677 printk("at %s\n", strblk(&lai->dblock->b));
1678 printk("offset = %d depth=%d\n", offset,
1680 for (j = 0; j < 16; j++)
1681 printk("%04x ", buf[offset+j] & 0xffff);
1682 for (j = 0; j < 16; j++)
1683 printk("%04x ", buf[1024-16+j] & 0xffff);
1686 BUG_ON(*addrp == 0xFFFFFFFF);
1689 if (nextd != 0xFFFFFFFF)
1691 else if (nexti != 0xFFFFFFFF) {
1696 /* If rv==2, nextd may not be high enough as it might
1697 * be the address of an EmptyIndex block. So only
1698 * use it if nothing better is found.
1704 list_for_each_entry(b, &leaf->children, siblings)
1705 if (b->fileaddr >= *addrp &&
1706 b->fileaddr <= nextl &&
1707 test_bit(B_Valid, &b->flags)) {
1708 nextd = nextl = b->fileaddr;
1712 if (leaf->uninc_table.pending_cnt) {
1713 spin_lock(&leaf->b.inode->i_data.private_lock);
1714 if (table_find_first(&leaf->uninc_table, *addrp, &nextl)) {
1718 spin_unlock(&leaf->b.inode->i_data.private_lock);
1720 lafs_iounlock_block(&leaf->b);
1721 putiref(leaf, MKREF(find_next));
1727 * find the first data block at or after *bnump, and
1728 * store the address in *bnump.
1729 * Return 0 if nothing found, -1 on error, or
1730 * 1 if *bnump is a valid data block address
1732 * The block could be in the indexing tree, or in
1733 * an uninc_table, or totally unincorporated.
1734 * Blocks that are really part of the file, but are not in
1735 * the indexing tree will still be on a ->children list
1736 * from an index block, and the index block will be very close
1737 * to the indexing tree.
1738 * So once we have found a suitable index block, we need to check
1739 * all its children. This could possibly be optimised using
1740 * find_get_pages if that was exported.
1742 int lafs_find_next(struct inode *ino, loff_t *bnump)
1745 /* Need to hold a reference to dblock while calling
1748 struct datablock *db = lafs_inode_dblock(ino, SYNC, MKREF(find_nexti));
1753 struct datablock *b;
1756 rv = __lafs_find_next(ino, bnump);
1763 b = lafs_get_block(ino, *bnump, NULL, GFP_KERNEL,
1768 rv = lafs_find_block(b, NOADOPT);
1770 putdref(b, MKREF(find_next));
1773 hole = (b->b.physaddr == 0 ||
1774 !test_bit(B_PhysValid, &b->b.flags)) &&
1775 !test_bit(B_Valid, &b->b.flags);
1776 if (LAFSI(ino)->depth == 0 &&
1778 hole = 0; /* first block lives in inode */
1779 putdref(b, MKREF(find_next));
1784 *bnump = (*bnump)+1;
1786 putdref(db, MKREF(find_nexti));
1787 BUG_ON(rv > 0 && *bnump == 0xFFFFFFFF);
1791 static int table_lookup(struct uninc *tbl, u32 addr, u64 *phys)
1794 for (i = tbl->pending_cnt; i; ) {
1795 struct addr *a = &tbl->pending_addr[--i];
1796 if (a->fileaddr <= addr &&
1797 a->fileaddr + a->cnt > addr) {
1798 *phys = a->physaddr + (addr - a->fileaddr);
1805 static int table_find_first(struct uninc *tbl, u32 min, u32 *addr)
1807 /* Find the earliest allocated block in tbl which is
1808 * at-or-after 'min' and before '*addr', and store it in
1809 * *addr, returning 1 if something was found.
1814 for (i = tbl->pending_cnt ; i ; ) {
1815 struct addr *a = &tbl->pending_addr[--i];
1816 if (a->fileaddr >= *addr)
1818 if (a->fileaddr + a->cnt < min)
1820 if (a->physaddr == 0)
1822 if (a->fileaddr >= min)
1823 *addr = a->fileaddr;
1831 static void fill_block_zero(struct datablock *db, struct datablock *inob)
1833 struct lafs_inode *lai = LAFSI(db->b.inode);
1834 void *baddr = map_dblock(db);
1835 void *iaddr = map_dblock_2(inob);
1836 int offset = lai->metadata_size;
1837 int len = (1<<db->b.inode->i_blkbits) - offset;
1839 memcpy(baddr, iaddr+offset, len);
1840 unmap_dblock_2(inob, iaddr);
1841 memset(baddr+len, 0, (1<<db->b.inode->i_blkbits)-len);
1842 unmap_dblock(db, baddr);
1843 set_bit(B_Valid, &db->b.flags);
1846 static int __must_check
1847 find_block(struct datablock *b, int adopt, int async)
1849 /* Find where this block is in storage, and
1850 * set b->b.physaddr.
1851 * If the block lives in the inode, physaddr
1852 * gets set to zero, and lafs_load_block works with this.
1853 * Otherwise we walk down the index tree
1858 struct lafs_inode *lai;
1859 struct indexblock *ib;
1860 struct datablock *db;
1864 LAFS_BUG(test_bit(B_Pinned, &b->b.flags) &&
1865 b->b.parent == NULL,
1867 if (b->b.inode == NULL)
1869 lai = LAFSI(b->b.inode);
1871 if (lai->depth == 0) {
1872 /* Data is in the inode if anywhere */
1873 if (!test_bit(B_PhysValid, &b->b.flags)) {
1875 set_bit(B_PhysValid, &b->b.flags);
1877 db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));
1880 if (!test_bit(B_Valid, &b->b.flags) && b->b.fileaddr == 0) {
1881 /* Now is the best time to copy data from inode into
1882 * datablock, as we know we have a valid inode block
1885 LAFS_BUG(b->b.physaddr != 0, &b->b);
1886 fill_block_zero(b, db);
1889 ib = lafs_make_iblock(b->b.inode, adopt, async,
1891 putdref(db, MKREF(findblock));
1894 block_adopt(&b->b, ib);
1895 putiref(ib, MKREF(findblock));
1897 putdref(db, MKREF(findblock));
1901 if (test_bit(B_PhysValid, &b->b.flags)
1902 && (!adopt || b->b.parent || test_bit(B_Root, &b->b.flags))) {
1906 db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));
1910 /* Ok, there is some indexing information we need to
1911 * look through. Find the leaf first
1913 ib = lafs_leaf_find(b->b.inode, b->b.fileaddr, adopt, NULL, async,
1915 putdref(db, MKREF(findblock));
1918 dprintk("lafs_leaf_find returned error to lafs_find_block\n");
1922 if (!test_bit(B_PhysValid, &b->b.flags)
1923 && ib->uninc_table.pending_cnt) {
1924 spin_lock(&b->b.inode->i_data.private_lock);
1925 if (table_lookup(&ib->uninc_table, b->b.fileaddr,
1927 set_bit(B_PhysValid, &b->b.flags);
1928 spin_unlock(&b->b.inode->i_data.private_lock);
1930 if (!test_bit(B_PhysValid, &b->b.flags)) {
1931 buf = map_iblock(ib);
1932 offset = lai->depth > 1 ? 0 : lai->metadata_size;
1933 if (test_bit(B_InoIdx, &ib->b.flags))
1934 dprintk("InoIdx!!!\n");
1935 dprintk("offset %d: %02x %02x\n", offset,
1936 ((char *)buf)[offset], ((char *)buf)[offset+1]);
1937 b->b.physaddr = leaf_lookup(buf+offset,
1938 b->b.inode->i_sb->s_blocksize
1940 ib->b.fileaddr, b->b.fileaddr,
1942 set_bit(B_PhysValid, &b->b.flags);
1943 unmap_iblock(ib, buf);
1945 dprintk("lafs_find_block: lookup for %lu of %lu at %lu found %llu\n",
1946 (unsigned long)b->b.fileaddr,
1947 (unsigned long)b->b.inode->i_ino,
1948 (unsigned long)ib->b.fileaddr,
1950 /* FIXME should peer_find go here or in load_block? */
1953 block_adopt(&b->b, ib);
1954 lafs_iounlock_block(&ib->b);
1959 putiref(ib, MKREF(findblock));
1964 lafs_find_block(struct datablock *b, int adopt)
1966 return find_block(b, adopt, SYNC);
1970 lafs_find_block_async(struct datablock *b)
1972 return find_block(b, ADOPT, ASYNC);
1976 * lafs_allocated_block records that a block has been written at a new
1977 * address (or found at an address during roll-forward). This involves
1978 * updating some summary information (quota etc) and making sure
1979 * the change gets recorded in the parent block.
1980 * This must be called before B_Dirty or B_Realloc are cleared,
1981 * to ensure that the index block is marked accordingly.
1982 * B_UnincCredit should still be set, and we clear it.
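 *
 * The ordering that implies for a (hypothetical) writeout path:
 *
 *	err = lafs_allocated_block(fs, blk, new_physaddr);
 *	...
 *	clear_bit(B_Dirty, &blk->flags);	// only after the call,
 *						// likewise B_Realloc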
1985 int lafs_add_block_address(struct fs *fs, struct block *blk)
1987 struct indexblock *p = blk->parent;
1990 spin_lock(&blk->inode->i_data.private_lock);
1991 if (test_bit(B_Index, &blk->flags)) {
1992 if (!test_and_set_bit(B_Uninc, &blk->flags)) {
1993 blk->chain = p->uninc;
1994 LAFS_BUG(p->depth == 1, blk);
1996 getref(blk, MKREF(uninc));
1998 spin_unlock(&blk->inode->i_data.private_lock);
2002 /* same phase and leaf; add the address to uninc_table */
2004 /* the only place we try to merge is at the end
2005 * of the embedded table, and then only for data blocks.
2007 * It is essential that uninc_table is sorted and,
2008 * in particular, has no overlapping extents.
2009 * If this block threatens problems we incorporate first.
2012 LAFS_BUG(!test_bit(B_Dirty, &p->b.flags) &&
2013 !test_bit(B_Realloc, &p->b.flags), blk);
2015 a = &p->uninc_table.pending_addr
2016 [p->uninc_table.pending_cnt-1];
2017 if (p->uninc_table.pending_cnt >= 1 &&
2018 a->fileaddr+a->cnt == blk->fileaddr &&
2020 a->physaddr+a->cnt == blk->physaddr) {
2022 if (test_and_clear_bit(B_UnincCredit, &blk->flags))
2023 p->uninc_table.credits++;
2024 spin_unlock(&blk->inode->i_data.private_lock);
2026 } else if ((p->uninc_table.pending_cnt == 0 ||
2027 a->fileaddr + a->cnt <= blk->fileaddr) &&
2028 /* properly ordered, maybe add */
2029 (p->uninc_table.pending_cnt <
2030 ARRAY_SIZE(p->uninc_table.pending_addr))) {
2032 a->fileaddr = blk->fileaddr;
2033 a->physaddr = blk->physaddr;
2035 p->uninc_table.pending_cnt++;
2036 if (test_and_clear_bit(B_UnincCredit,
2038 p->uninc_table.credits++;
2039 spin_unlock(&blk->inode->i_data.private_lock);
2043 spin_unlock(&blk->inode->i_data.private_lock);
2044 /* We own a ref through blk->parent now, but
2045 * after lafs_incorporate, we might not
2047 getiref(p,MKREF(addblock));
2048 lafs_iolock_written(&p->b);
2049 lafs_incorporate(fs, p);
2050 lafs_iounlock_block(&p->b);
2051 putiref(p,MKREF(addblock));
2056 int lafs_allocated_block(struct fs *fs, struct block *blk, u64 phys)
2059 struct indexblock *p;
2061 dprintk("Allocated %s -> %llu\n", strblk(blk), phys);
2063 /* If phys is 0, I don't need uninc credit.
2064 * Also, index blocks that are not nearly full do not need UnincCredits
2066 LAFS_BUG(test_bit(B_InoIdx, &blk->flags), blk);
2067 if (!test_bit(B_Dirty, &blk->flags) &&
2068 !test_bit(B_Realloc, &blk->flags) &&
2070 printk("something missing %s\n", strblk(blk));
2071 LAFS_BUG(!test_bit(B_UnincCredit, &blk->flags) && phys &&
2072 !test_bit(B_Index, &blk->flags), blk);
2073 LAFS_BUG(!test_bit(B_Dirty, &blk->flags) &&
2074 !test_bit(B_Realloc, &blk->flags) &&
2077 LAFS_BUG(!test_bit(B_Writeback, &blk->flags), blk);
2079 if (test_bit(B_Root, &blk->flags)) {
2081 struct lafs_inode *lai;
2083 LAFS_BUG(test_bit(B_Index, &blk->flags), blk);
2085 for (i = 0; i < fs->maxsnapshot; i++)
2086 if (fs->ss[i].root == blk->inode)
2088 LAFS_BUG(i == fs->maxsnapshot, blk);
2089 blk->physaddr = phys; /* superblock doesn't get
2090 counted in summaries */
2091 LAFS_BUG(test_bit(B_SegRef, &blk->flags), blk);
2092 set_bit(B_PhysValid, &blk->flags);
2093 fs->ss[i].root_addr = phys;
2094 lai = LAFSI(blk->inode);
2095 if (i == 0 && lai->md.fs.usagetable > 1)
2096 /* snapshot checkpoint is happening. Record root
2097 * address in main fs and in snapshot
2099 fs->ss[lai->md.fs.usagetable-1].root_addr = phys;
2101 /* Don't bother to clear the B_UnincCredit in the root.
2102 * It'll just get set again. This way there is less churn.
2106 if (test_bit(FinalCheckpoint, &fs->fsstate) &&
2107 !test_bit(B_Index, &blk->flags) &&
2108 (LAFSI(blk->inode)->type == TypeSegmentMap ||
2109 LAFSI(blk->inode)->type == TypeQuota)) {
2110 /* These blocks will be picked up by roll-forward,
2111 * so don't need to have their address recorded,
2112 * and doing so would leave the index blocks
2113 * dirty after the checkpoint.
2116 if (test_and_clear_bit(B_SegRef, &blk->flags))
2117 lafs_seg_deref(fs, blk->physaddr, 0);
2118 blk->physaddr = phys;
2119 set_bit(B_PhysValid, &blk->flags);
2126 /* We need to work out if this block is in the current phase or
2127 * the next one. This can only be an issue if a phase change
2128 * (aka checkpoint) is underway. But if it is, we need to
2129 * ensure the right phase is used
2131 lafs_checkpoint_lock(fs);
2132 if (test_bit(B_Pinned, &blk->flags))
2133 ph = !!test_bit(B_Phase1, &blk->flags);
2134 else if (test_bit(B_Index, &blk->flags))
2135 LAFS_BUG(1, blk); /* Index blocks must always be pinned when dirty */
2136 else if (p->uninc_table.pending_cnt) {
2137 /* attach data block into previous phase as incorporation of
2138 * this phase is still to happen.
2139 * This is needed for flush-if-cannot-preallocate to work.
2141 LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), &p->b);
2142 ph = !!test_bit(B_Phase1, &p->b.flags);
2143 lafs_pin_block_ph(blk, ph);
2145 lafs_pin_block(blk);
2146 ph = !!test_bit(B_Phase1, &blk->flags);
2148 lafs_checkpoint_unlock(fs);
2149 if (!test_bit(B_PhysValid, &blk->flags))
2151 lafs_summary_update(fs, blk->inode, blk->physaddr, phys,
2152 test_bit(B_Index, &blk->flags), ph,
2153 test_bit(B_SegRef, &blk->flags));
2154 clear_bit(B_Prealloc, &blk->flags);
2156 blk->physaddr = phys;
2157 set_bit(B_PhysValid, &blk->flags);
2159 if (ph != !!test_bit(B_Phase1, &p->b.flags)) {
2160 /* in the next phase */
2161 dprintk("uninc next phase\n");
2162 if (test_and_set_bit(B_Uninc, &blk->flags))
2163 /* This block was already on the 'unincorporated'
2164 * list (it must be changing quickly), so there
2165 * is nothing else to do.
2168 getref(blk, MKREF(uninc));
2170 spin_lock(&blk->inode->i_data.private_lock);
2171 blk->chain = p->uninc_next;
2172 p->uninc_next = blk;
2173 spin_unlock(&blk->inode->i_data.private_lock);
2174 /* We will try again after the phase change.
2175 * That will be when we clean B_UnincCredit
2181 if (test_bit(B_Dirty, &blk->flags) || phys == 0) {
2182 if (!test_bit(B_Dirty, &p->b.flags)
2183 && !test_bit(B_Credit, &p->b.flags)) {
2184 printk("Oh dear: %s\n", strblk(blk));
2185 printk(".......: %s\n", strblk(&p->b));
2187 lafs_dirty_iblock(p);
2189 if (!test_bit(B_Valid, &p->b.flags)) {
2190 printk("Not valid in lafs_allocated_block: %s\n",
2192 printk("blk is %s\n", strblk(blk));
2195 LAFS_BUG(!test_bit(B_Valid, &p->b.flags), &p->b);
2196 if (!test_bit(B_Realloc, &p->b.flags)) {
2197 /* I cannot use B_Credit to fill B_Realloc as that
2198 * might still be needed for B_Dirty.
2199 * So if we cannot allocate a new credit,
2200 * just set the block as 'dirty' now.
2202 if (lafs_space_alloc(fs, 1, CleanSpace) == 1) {
2203 if (test_and_set_bit(B_Realloc, &p->b.flags))
2204 lafs_space_return(fs, 1);
2206 lafs_dirty_iblock(p);
2208 if (!test_and_set_bit(B_UnincCredit, &p->b.flags))
2209 if (!test_and_clear_bit(B_ICredit, &p->b.flags))
2210 LAFS_BUG(1, &p->b); // ICredit should be set before
2215 dprintk("Uninc same phase\n");
2217 BUG_ON(!test_bit(B_Pinned, &blk->parent->b.flags));
2219 if (!lafs_add_block_address(fs, blk)) {
2223 LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), blk);
2227 /* That credit has now been added to the uninc_table and
2228 * thus made available to the parent
2231 lafs_refile(&p->b, 0);