/*
 * Handle index blocks for LaFS.
 *
 * Copyright (C) 2005-2009
 * Neil Brown <neilb@suse.de>
 * Released under the GPL, version 2
 */
#include <linux/types.h>
#include <linux/hash.h>
#include <linux/slab.h>
/*
 * For each index-block there is an 'indexblock' data structure,
 * and a 'block' of memory, which is a page or part thereof.
 *
 * Index-blocks are stored in a hash table, indexed by
 * (struct inode *, fileaddr, depth). There is one hash table
 * for all filesystems.
 *
 * There is a single free-list for all index blocks from all
 * filesystems, of all sizes. Index blocks only get on this list when
 * they are unpinned leaves, i.e. their 'children' list is empty. As they
 * are freed, their parents may get added to the list.
 *
 * It is not yet clear how memory pressure will flush out dirty index blocks.
 *
 * All indexblocks for an inode are either attached under the inode via
 * the children/siblings tree, or on a per-inode 'free' list also using
 * the 'siblings' linkage. Thus we are sure that they can be removed when
 * the inode is removed. Note that data blocks do *not* necessarily live
 * in this tree (though they will when they are dirty). They can always be
 * found in the radix_tree for the inode's address-space.
 *
 * Consider how 'rcu' can be used for
 * Consider gang-freeing for adding to head of lru lists.
 */
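/*
 * 'struct freelists' itself is declared in a header that is not part of
 * this file.  A minimal sketch of the fields relied on below
 * (freelist.lru and freelist.freecnt) would be:
 */
struct freelists {
        struct list_head lru;   /* global LRU of unpinned leaf index blocks */
        int freecnt;            /* number of blocks currently on that list */
};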
/* FIXME these should be configured at runtime based on memory */
static struct hlist_head hash_table[1<<HASHBITS];
spinlock_t lafs_hash_lock;

/* static */ struct freelists freelist;
static int lafs_shrinker(int nr_to_scan, /*gfp_t*/unsigned int gfp_mask)
        if (!(gfp_mask & __GFP_FS))
        dprintk("scan %d\n", nr_to_scan);
        spin_lock(&lafs_hash_lock);
        while (nr_to_scan && !list_empty(&freelist.lru)) {
                struct indexblock *ib = list_entry(freelist.lru.next,
                /* Check if it is really free */
                if (atomic_read(&ib->b.refcnt)) {
                        list_del_init(&ib->b.lru);
                        clear_bit(B_OnFree, &ib->b.flags);
                if (test_bit(B_InoIdx, &ib->b.flags))
                        LAFSI(ib->b.inode)->iblock = NULL;
                /* Any pinned iblock must have children, be
                 * refcounted, or be on a 'leaf' lru, which gives
                 * a refcount. So this unreffed block cannot be
                 * pinned. This BUG_ON is here because previously
                 * the above 'if' also tested for 'Pinned', and that
                 * seems unnecessary, but I want to be sure.
                 */
                LAFS_BUG(test_bit(B_Pinned, &ib->b.flags), &ib->b);
                if (!hlist_unhashed(&ib->hash))
                        hlist_del_init(&ib->hash);
                /* delete from inode->free_index */
                list_del_init(&ib->b.siblings);
                list_move(&ib->b.lru, &togo);
        spin_unlock(&lafs_hash_lock);
        while (!list_empty(&togo)) {
                struct indexblock *ib = list_entry(togo.next,
        return freelist.freecnt;
void lafs_release_index(struct list_head *head)
        /* This is the other place that index blocks get freed.
         * They are freed either by memory pressure in lafs_shrinker
         * above, or when an inode is destroyed by this code.
         * Both need to obey the same locking rules, and ensure
         * blocks are removed from each of siblings, lru, and hash.
         * The blocks on 'head' come from an inode's free_index list
         * and are linked on b.siblings. They are all on freelist.lru.
         */
        spin_lock(&lafs_hash_lock);
        while (!list_empty(head)) {
                struct indexblock *ib = list_entry(head->next,
                list_del_init(&ib->b.siblings);
                LAFS_BUG(!test_bit(B_OnFree, &ib->b.flags), &ib->b);
                if (!hlist_unhashed(&ib->hash))
                        hlist_del_init(&ib->hash);
                list_move(&ib->b.lru, &togo);
        spin_unlock(&lafs_hash_lock);
        while (!list_empty(&togo)) {
                struct indexblock *ib = list_entry(togo.next,
                list_del(&ib->b.lru);
                lafs_iblock_free(ib);
lafs_iblock_alloc(struct fs *fs, int gfp, int with_buffer, REFARG)
        struct indexblock *ib;

        ib = kzalloc(sizeof(*ib), gfp);
        ib->data = kmalloc(fs->blocksize, gfp);
        set_bit(B_Index, &ib->b.flags);
        atomic_set(&ib->b.refcnt, 1);
        add_ref(&ib->b, REF, __FILE__, __LINE__);
        INIT_LIST_HEAD(&ib->children);
        atomic_set(&ib->pincnt[0], 0);
        atomic_set(&ib->pincnt[1], 0);
        ib->uninc_next = NULL;
        INIT_LIST_HEAD(&ib->b.siblings);
        INIT_LIST_HEAD(&ib->b.lru);
        INIT_LIST_HEAD(&ib->b.peers);
        INIT_HLIST_NODE(&ib->hash);
        ib->uninc_table.pending_cnt = 0;
        ib->uninc_table.credits = 0;
void lafs_iblock_free(struct indexblock *ib)

static struct shrinker hash_shrink = {
        .shrink = lafs_shrinker,
        .seeks = DEFAULT_SEEKS,
int lafs_ihash_init(void)
        for (i = 0 ; i < (1<<HASHBITS); i++)
                INIT_HLIST_HEAD(hash_table+i);
        spin_lock_init(&lafs_hash_lock);
        INIT_LIST_HEAD(&freelist.lru);
        register_shrinker(&hash_shrink);

void lafs_ihash_free(void)
        unregister_shrinker(&hash_shrink);
ihash(struct inode *inode, faddr_t addr, int depth)
        unsigned long h = hash_ptr(inode, BITS_PER_LONG);
        h = hash_long(h ^ addr, BITS_PER_LONG);
        return hash_long(h ^ depth, HASHBITS);
static struct indexblock *
ihash_lookup(struct inode *inode, faddr_t addr, int depth,
             struct indexblock *new, REFARG)
        struct indexblock *ib = NULL;
        struct hlist_node *tmp;
        struct hlist_head *head = &hash_table[ihash(inode, addr, depth)];

        spin_lock(&lafs_hash_lock);
        hlist_for_each_entry(ib, tmp, head, hash)
                if (ib->b.fileaddr == addr && ib->b.inode == inode &&
        hlist_add_head(&new->hash, head);
        getiref_locked_needsync(ib, REF);
        spin_unlock(&lafs_hash_lock);
struct indexblock *lafs_getiref_locked(struct indexblock *ib)
        LAFS_BUG(!test_bit(B_InoIdx, &ib->b.flags), &ib->b);
        if (spin_is_locked(&ib->b.inode->i_data.private_lock))
        if (spin_is_locked(&lafs_hash_lock))
        if (spin_is_locked(&fs_from_inode(ib->b.inode)->lock))
        LAFS_BUG(!ok, &ib->b);
        if (atomic_inc_return(&ib->b.refcnt) == 1) {
                /* First reference to an InoIdx block implies a reference
                 * If two different threads come here under different locks, one could
                 * exit before the ref on the dblock is taken. However, as each
                 * thread must own an indirect reference to be here, the last indirect ref
                 * to the dblock cannot disappear before all threads have exited this
                 * function, by which time this direct ref will be active.
                 */
                struct datablock *db;
                db = getdref(LAFSI(ib->b.inode)->dblock, MKREF(iblock));
                LAFS_BUG(db == NULL, &ib->b);
lafs_iblock_get(struct inode *ino, faddr_t addr, int depth, paddr_t phys, REFARG)
        /* Find the index block with this inode/depth/address.
         * If it isn't in the cache, create it with given
         */
        struct indexblock *ib = ihash_lookup(ino, addr, depth, NULL, REF);
        struct indexblock *new;

        new = lafs_iblock_alloc(fs_from_inode(ino), GFP_KERNEL, 1, REF);
        new->b.fileaddr = addr;
        new->b.physaddr = phys;
        set_bit(B_PhysValid, &new->b.flags);
        ib = ihash_lookup(ino, addr, depth, new, REF);
        lafs_iblock_free(new);
void lafs_hash_iblock(struct indexblock *ib)
        /* This block was an InoIdx block but has just become a real
         * index block, so hash it. There cannot be a block with this
         * address already, as we haven't increased the inode depth yet.
         * Alternatively, this block was recently split off another
         * and has now been incorporated, so a lookup might need
         * to find it. So it had better be in the hash table.
         */
        struct hlist_head *head =
                &hash_table[ihash(ib->b.inode, ib->b.fileaddr, ib->depth)];
        spin_lock(&lafs_hash_lock);
        LAFS_BUG(!hlist_unhashed(&ib->hash), &ib->b);
        hlist_add_head(&ib->hash, head);
        spin_unlock(&lafs_hash_lock);
void lafs_unhash_iblock(struct indexblock *ib)
        /* This block has been emptied and deleted and is no longer
         * wanted in the hash table.
         */
        spin_lock(&lafs_hash_lock);
        LAFS_BUG(hlist_unhashed(&ib->hash), &ib->b);
        hlist_del_init(&ib->hash);
        spin_unlock(&lafs_hash_lock);
/* adopt blk into parent if possible.
 */
block_adopt(struct block *blk, struct indexblock *parent)
        struct address_space *as;

        if (parent == NULL) {
                LAFS_BUG(!test_bit(B_Root, &blk->flags), blk);
        LAFS_BUG(blk->parent != parent, blk);
        LAFS_BUG(!test_bit(B_Valid, &parent->b.flags), &parent->b);
        as = &blk->inode->i_data;
        spin_lock(&as->private_lock);
        if (blk->parent != parent) {
                printk("blk = %s\n", strblk(blk));
                printk("blk->p = %s\n", strblk(&blk->parent->b));
                printk("parent = %s\n", strblk(&parent->b));
        LAFS_BUG(blk->parent != parent, blk);
        if (test_bit(B_EmptyIndex, &parent->b.flags)) {
                LAFS_BUG(!test_bit(B_Index, &parent->b.flags), &parent->b);
                LAFS_BUG(test_bit(B_InoIdx, &parent->b.flags), &parent->b);
                LAFS_BUG(parent->b.fileaddr != parent->b.parent->b.fileaddr, &parent->b);
                clear_bit(B_EmptyIndex, &parent->b.flags);
        LAFS_BUG(parent == iblk(blk), blk);
        blk->parent = parent;
        getiref(parent, MKREF(child));
        /* Remove from the per-inode free list */
        spin_lock(&lafs_hash_lock);
        if (!test_bit(B_InoIdx, &blk->flags))
                list_move(&blk->siblings, &parent->children);
        else
                /* Remove from ->free_index list */
                list_del_init(&blk->siblings);
        /* Having set ->parent, we take a ref on the dblock.
         * The dblock must exist because we can only reach here from
         * lafs_make_iblock, which can only be called while holding a ref
         */
        (void)getdref(LAFSI(blk->inode)->dblock, MKREF(pdblock));
        spin_unlock(&lafs_hash_lock);
        spin_unlock(&as->private_lock);
static int __must_check
find_block(struct datablock *b, int adopt, int async);

static int __must_check
setparent(struct datablock *blk, int async)
        if (blk->b.parent == NULL && !test_bit(B_Root, &blk->b.flags))
                err = find_block(blk, 1, async);

lafs_setparent(struct datablock *blk)
        return setparent(blk, 0);
void lafs_pin_block_ph(struct block *b, int ph)
        struct fs *fs = fs_from_inode(b->inode);

        LAFS_BUG(b->parent == NULL && !test_bit(B_Root, &b->flags), b);
        if (!test_bit(B_Realloc, &b->flags) &&
            LAFSI(b->inode)->type != TypeSegmentMap)
                LAFS_BUG(!test_phase_locked(fs), b);
        spin_lock(&ino->i_data.private_lock);
             p && !test_bit(B_Pinned, &p->flags) ;
             p = &(p->parent)->b) {
                if (test_bit(B_InoIdx, &p->flags)) {
                        struct datablock *db = LAFSI(p->inode)->dblock;

                        spin_unlock(&ino->i_data.private_lock);
                        lafs_inode_checkpin(p->inode);
                        spin_lock(&ino->i_data.private_lock);
        spin_unlock(&ino->i_data.private_lock);
static void pin_all_children(struct indexblock *ib)
        /* If we cannot get new credits for phase_flip, we need to
         * pin all dirty data-block children so that they get
         * written and don't remain to require a phase_flip
         * on the next checkpoint (at which point there will be
         * So if any dirty child is found, set its phase. We don't
         * need PinPending as it is already Dirty. All other
         * requirements for pinning will already be met.
         */
        struct indexblock *start = NULL;
        struct fs *fs = fs_from_inode(ib->b.inode);

        /*
         * This is called with an IOLock on ib, so it is unlikely
         * that children will be added or removed(?), but we really
         * should hold the inode private_lock anyway.
         */
        getiref(ib, MKREF(pinall));
        /* We hold a reference on ib, and on 'start' if non-NULL */
        spin_lock(&ib->b.inode->i_data.private_lock);
        b = list_prepare_entry(b, &ib->children, siblings);
        list_for_each_entry_continue(b, &ib->children, siblings) {
                /* FIXME I don't descend through inode blocks to the file below! */
                if (test_bit(B_Index, &b->flags)) {
                        getref_locked(b, MKREF(pinall));
                        spin_unlock(&ib->b.inode->i_data.private_lock);
                        putiref(ib, MKREF(pinall));
                } else if (test_bit(B_Dirty, &b->flags) &&
                           !test_bit(B_Pinned, &b->flags)) {
                        getref_locked(b, MKREF(pinall));
                        spin_unlock(&ib->b.inode->i_data.private_lock);
                        putref(b, MKREF(pinall));
                        /* Slightly safer to restart this level */
        spin_unlock(&ib->b.inode->i_data.private_lock);
        putiref(start, MKREF(pinall));
        ib = getiref(ib->b.parent, MKREF(pinall));
        putiref(ib, MKREF(pinall));
static void flip_phase(struct block *b)
        struct indexblock *p;
        int oldphase = !!test_bit(B_Phase1, &b->flags);

        LAFS_BUG(!test_bit(B_Pinned, &b->flags), b);
        clear_bit(B_Phase1, &b->flags);
        set_bit(B_Phase1, &b->flags);
        atomic_inc(&p->pincnt[1-oldphase]);
        atomic_dec(&p->pincnt[oldphase]);
        LAFS_BUG(!test_bit(B_Root, &b->flags), b);

        /* move 'next' credits to 'here' */
        if (!test_bit(B_Credit, &b->flags) &&
            test_and_clear_bit(B_NCredit, &b->flags))
                set_bit(B_Credit, &b->flags);

        if (!test_bit(B_ICredit, &b->flags) &&
            test_and_clear_bit(B_NICredit, &b->flags))
                set_bit(B_ICredit, &b->flags);
/* When the pinning of a block needs to be carried across a
 * checkpoint, we need to 'flip' the phase.
 * This only applies to blocks that can be pinned by a block that
 * may not be written until the next phase.
 * This includes any index block (as it may have some children in one
 * phase and some in the next) and any space-accounting block,
 * for the same reason.
 * So indexblocks will need to flip, and use lafs_phase_flip.
 * TypeSegmentMap and TypeQuota also need to flip and use lafs_flip_dblock.
 * TypeInodeFile don't need to be phase_flipped, though their InoIdx
 * block might. InodeFile blocks are only pinned by metadata transactions,
 * which happen inside a checkpoint lock.
 */
void lafs_flip_dblock(struct datablock *db)
        /* This is an accounting block (SegmentMap or Quota)
         * which we need to write out after ->phase has changed
         * in the tail of the checkpoint.
         * We always flip the phase and reallocate from AccountSpace.
         * If all references get dropped, it will then get unpinned
         * before the next phase finishes - we don't unpin here
         * (unlike lafs_phase_flip for index blocks).
         */
        lafs_prealloc(&db->b, AccountSpace);
        /* Parent might need to be on a leaflist now */
        lafs_refile(&db->b.parent->b, 0);
void lafs_phase_flip(struct fs *fs, struct indexblock *ib)
        /* We are performing a checkpoint; this block has been written
         * out and now needs to be flipped into the next phase.
         * - Processing all uninc_next blocks into uninc_table.
         * - adjusting counts on parent
         * - moving credits from 'next' to 'this' phase.
         * - updating block counts to include phase-delayed updates.
         */
        int oldphase = !!test_bit(B_Phase1, &ib->b.flags);

        dprintk("FLIP %s\n", strblk(&ib->b));

        LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
        LAFS_BUG(!test_bit(B_Index, &ib->b.flags), &ib->b);
        LAFS_BUG(ib->b.parent == NULL && !test_bit(B_Root, &ib->b.flags), &ib->b);
        LAFS_BUG(ib->uninc_table.pending_cnt, &ib->b);
        LAFS_BUG(atomic_read(&ib->pincnt[oldphase]), &ib->b);
        LAFS_BUG(!test_bit(B_IOLock, &ib->b.flags), &ib->b);

        /* If this is the only reference, and pincnts are zero,
         * and all relevant flags are clear, then don't flip
         */
        if (test_bit(B_Pinned, &ib->b.flags) &&
            !test_bit(B_Dirty, &ib->b.flags) &&
            !test_bit(B_Realloc, &ib->b.flags) &&
            atomic_read(&ib->b.refcnt) == 1 && /* This is us */
            ib->uninc_table.pending_cnt == 0 &&
            ib->uninc_next == NULL &&
            atomic_read(&ib->pincnt[0]) == 0 &&
            atomic_read(&ib->pincnt[1]) == 0 &&
            (!test_bit(B_InoIdx, &ib->b.flags) ||
             !(test_bit(B_PinPending, &LAFSI(ib->b.inode)->dblock->b.flags)
               || test_bit(B_Dirty, &LAFSI(ib->b.inode)->dblock->b.flags)
               || test_bit(B_Realloc, &LAFSI(ib->b.inode)->dblock->b.flags)))
                if (test_and_clear_bit(B_Pinned, &ib->b.flags)) {
                        if (!test_bit(B_Root, &ib->b.flags)) {
                                atomic_dec(&ib->b.parent->pincnt[oldphase]);
                                lafs_refile(&ib->b.parent->b, 0);
                        lafs_inode_checkpin(ib->b.inode);
                if (test_bit(B_InoIdx, &ib->b.flags))
                        /* maybe data block should go in leaf list now */
                        lafs_refile(&LAFSI(ib->b.inode)->dblock->b, 0);
                lafs_iounlock_block(&ib->b);

        if (test_bit(B_InoIdx, &ib->b.flags)) {
                struct lafs_inode *lai = LAFSI(ib->b.inode);
                struct datablock *db = lai->dblock;
                struct inode *ino = db->b.inode;

                spin_lock(&ino->i_data.private_lock);
                lai->cblocks += lai->pblocks;
                lai->ciblocks += lai->piblocks;
                if (lai->type == TypeInodeFile) {
                        /* Root of filesystem */
                        lai->md.fs.cblocks_used +=
                                lai->md.fs.pblocks_used;
                        lai->md.fs.pblocks_used = 0;
                spin_unlock(&ino->i_data.private_lock);

                /* maybe data block needs to be on leaf list */
                lafs_refile(&db->b, 0);

        if (lafs_prealloc(&ib->b, ReleaseSpace) < 0) {
                /* Couldn't get all N*Credit, so
                 * pin all children to this phase, so they
                 */
                pin_all_children(ib);

        spin_lock(&ib->b.inode->i_data.private_lock);
        ulist = ib->uninc_next;
        ib->uninc_next = NULL;
        spin_unlock(&ib->b.inode->i_data.private_lock);
        lafs_iounlock_block(&ib->b);
        lafs_dirty_iblock(ib);
                struct block *b2 = ulist;
                clear_bit(B_Uninc, &b2->flags);
                while (lafs_add_block_address(fs, b2) == 0)
                        /* We had to incorporate b2->parent, which might
                         * have split the parent, but as that will have made
                         * it dirty, there is nothing extra that we need to
                         */
                putref(b2, MKREF(uninc));
        LAFS_BUG(ib->uninc_next, &ib->b);
        lafs_refile(&ib->b, 0);
        /* Parent may need to be attached to a phase_leafs now */
        lafs_refile(&ib->b.parent->b, 0);
/*
 * lafs_refile is called whenever we finish doing stuff
 * to a block. lafs_refile checks the state of the block and
 * makes sure various lists etc. are correct.
 * When finished with a block we often want to drop a reference,
 * and this functionality is included in refile so that the
 * decref and associated refiling can be done under a lock.
 *
 * Key issues that lafs_refile deals with are:
 *   Make sure ->lru is on the correct list.
 *   Remove the ->parent link if no longer needed.
 *   Remove B_Lock if no longer needed.
 *   Trigger a phase change when appropriate.
 *   Remove the dir parent link if no longer needed.
 *
 * Fleshing these out a bit, the possible lru lists are:
 *   phase_leafs[phase] - for dirty/pinned blocks with no pinned children
 *   clean_leafs - for blocks that are being relocated for cleaning
 *           and that have no pinned children
 * The parent link is only needed if this block has a refcount or one
 * of B_Pinned, B_Dirty, B_Uninc.
 * B_Lock is only needed if one of
 * though a refcount on a pinned datablock is only significant
 * while the current phase is locked.
 * A phase change can (and should) happen for index blocks
 * in the 'other' phase that are
 * not Dirty, not Alloc, pincnt[oldphase]==0, uninc-table empty.
 *
 * Refiling one block may cause a change in another block which requires
 * a recursive refile. Naturally we unroll the tail-recursion, but
 * we need to be sure there are a limited number of tails.
 * The causes of recursion are:
 *   A datablock which no longer has a PagePrivate page holds a ref
 *   on block0 of that page, which must be dropped on the last put.
 *   When any block becomes unpinned, we must refile the parent.
 *   When we clear ->parent we must deref the parent.
 *   When an InoIdx block drops ->parent it is dropped from inode->iblock,
 *   so we need to drop the implied refcnt on ->dblock.
 * So there can only be two blocks to recurse onto:
 * a block0 or dblock, and a parent.
 * A block pointing to a block0 never has a parent, and an iblock has the same
 * parent as the dblock, so there can never be more than 2 outstanding blocks.
 */
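/*
 * Concretely, the unrolling gives lafs_refile the overall shape sketched
 * here (illustration only; the real function below carries much more
 * state):
 *
 *      void refile_sketch(struct block *b, int dec)
 *      {
 *              struct block *next = NULL, *next_parent = NULL;
 *
 *              while (b) {
 *                      // ... refile 'b'.  Dropping its last reference can
 *                      // expose at most two further blocks needing work:
 *                      // a block0/dblock ('next') and a parent
 *                      // ('next_parent') ...
 *                      if (next) {
 *                              b = next;               // block0/dblock first
 *                              next = NULL;
 *                      } else if (next_parent) {
 *                              b = next_parent;        // then walk up
 *                              next_parent = NULL;
 *                      } else
 *                              b = NULL;
 *                      dec = 1;        // carried references must be dropped
 *              }
 *      }
 */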
int lafs_is_leaf(struct block *b, int ph)
        /* This is pinned and not iolocked. Should it be on a leaf
         */
        struct lafs_inode *lai;

        if (!test_bit(B_Pinned, &b->flags) ||
            test_bit(B_Writeback, &b->flags))
        if (test_bit(B_Index, &b->flags)) {
                        return atomic_read(&iblk(b)->pincnt[ph]) == 0;
                return (atomic_read(&iblk(b)->pincnt[0]) +
                        atomic_read(&iblk(b)->pincnt[1])) == 0;

        /* This is a data block */
        myi = rcu_my_inode(dblk(b));
        /* Non-inode data blocks are always leaves */
        spin_lock(&lai->vfs_inode.i_data.private_lock);
        ph = !!test_bit(B_Phase1, &b->flags);
            test_bit(B_Pinned, &lai->iblock->b.flags) &&
            !!test_bit(B_Phase1, &lai->iblock->b.flags) == ph)
                /* iblock still pinned in the same phase, so
                 * this block isn't a leaf
                 */
        spin_unlock(&lai->vfs_inode.i_data.private_lock);
/*
 * This is for debugging; it is racy and probably only safe on UP.
 */
static void check_consistency(struct block *b)
        /*
         * 1/ make sure pincnt is right
         */
        if (test_bit(B_Index, &b->flags)) {
                list_for_each_entry(cb, &iblk(b)->children, siblings) {
                        struct inode *myi = NULL;
                        if (test_bit(B_Pinned, &cb->flags)) {
                                int pp = !!test_bit(B_Phase1, &cb->flags);
                        /* InoIdx block might be pinned but is not a direct
                         */
                        if (!test_bit(B_Index, &cb->flags) &&
                            (myi = rcu_my_inode(dblk(cb))) != NULL &&
                            LAFSI(myi)->iblock &&
                            test_bit(B_Pinned, &LAFSI(myi)->iblock->b.flags)) {
                                int pp = !!test_bit(B_Phase1,
                                                    &LAFSI(myi)->iblock->b.flags);
                if (c[0] != atomic_read(&iblk(b)->pincnt[0]) ||
                    c[1] != atomic_read(&iblk(b)->pincnt[1])) {
                        printk("%d %d\n", c[0], c[1]);
                struct indexblock *ib = b->parent;

                list_for_each_entry(cb, &ib->children, siblings) {
                        struct inode *myi = NULL;
                        if (test_bit(B_Pinned, &cb->flags)) {
                                int pp = !!test_bit(B_Phase1, &cb->flags);
                        /* InoIdx block might be pinned but is not a direct
                         */
                        if (!test_bit(B_Index, &cb->flags) &&
                            (myi = rcu_my_inode(dblk(cb))) != NULL &&
                            LAFSI(myi)->iblock &&
                            test_bit(B_Pinned, &(LAFSI(myi)
                                                 ->iblock->b.flags))) {
                                int pp = !!test_bit(B_Phase1,
                if (c[0] != atomic_read(&ib->pincnt[0]) ||
                    c[1] != atomic_read(&ib->pincnt[1])) {
                        printk("badcnt %d %d\n", c[0], c[1]);
static void set_lru(struct block *b)
        int ph = !!test_bit(B_Phase1, &b->flags);

        if (!test_bit(B_OnFree, &b->flags) && !list_empty_careful(&b->lru))
        if (test_bit(B_IOLock, &b->flags) || !lafs_is_leaf(b, ph))

        /* This is close enough to a leaf that we should put it on a list.
         * If we raced and it isn't, then it will be found and removed
         */
        if (test_bit(B_OnFree, &b->flags)) {
                spin_lock(&lafs_hash_lock);
                if (test_and_clear_bit(B_OnFree, &b->flags)) {
                        list_del_init(&b->lru);
                spin_unlock(&lafs_hash_lock);
        fs = fs_from_inode(b->inode);
        spin_lock(&fs->lock);
        if (list_empty(&b->lru)) {
                if (test_bit(B_Realloc, &b->flags))
                        list_add(&b->lru, &fs->clean_leafs);
                        ph = !!test_bit(B_Phase1, &b->flags);
                        list_add(&b->lru, &fs->phase_leafs[ph]);
        spin_unlock(&fs->lock);
void lafs_refile(struct block *b, int dec)
        struct block *next = NULL, *next_parent = NULL;
        struct fs *fs = NULL;

        fs = fs_from_inode(b->inode);
        check_consistency(b);

        /* To (mostly) avoid recursion, we have a loop which may
         * walk up to the parent.
         * The main reason for holding lafs_hash_lock is that it protects
         * ->lru - i.e. all the lists. FIXME that should be fs->lock
         */
        LAFS_BUG(atomic_read(&b->refcnt) == 0, b);
        struct inode *checkpin = NULL;
        struct inode *myi = NULL;
        struct datablock *db;

        if (test_bit(B_Pinned, &b->flags) &&
            !test_bit(B_Index, &b->flags) &&
            !test_bit(B_PinPending, &b->flags) &&
            !test_bit(B_Dirty, &b->flags) &&
            !test_bit(B_Realloc, &b->flags)
                /* Don't need to be Pinned any more */
                /* FIXME do I need to lock access to ->parent */
                lafs_checkpoint_lock(fs);
                ph = !!test_bit(B_Phase1, &b->flags);
                if (test_and_clear_bit(B_Pinned, &b->flags)) {
                        if (test_bit(B_Dirty, &b->flags) ||
                            test_bit(B_Realloc, &b->flags) ||
                            test_bit(B_PinPending, &b->flags))
                        else if (!test_bit(B_Root, &b->flags)) {
                                atomic_dec(&b->parent->pincnt[ph]);
                                if (next_parent != nb) {
                                        LAFS_BUG(next_parent, b);
                                        atomic_inc(&nb->refcnt);
                lafs_checkpoint_unlock(fs);

        if (dec && atomic_dec_and_lock(&b->refcnt, &b->inode->i_data.private_lock)) {
                /* PinPending disappears with the last non-lru reference
                 * (or possibly at other times).
                 */
                clear_bit(B_PinPending, &b->flags);
                ph = !!test_bit(B_Phase1, &b->flags);
                /* See if we still need to be pinned */
                if (test_bit(B_Pinned, &b->flags) &&
                    !test_bit(B_Dirty, &b->flags) &&
                    !test_bit(B_Realloc, &b->flags)) {
                        /* Don't need to be Pinned any more */
                        if (test_and_clear_bit(B_Pinned, &b->flags)) {
                                if (!list_empty_careful(&b->lru)) {
                                        spin_lock(&fs->lock);
                                        list_del_init(&b->lru);
                                        spin_unlock(&fs->lock);
                                if (test_bit(B_InoIdx, &b->flags) &&
                                if (!test_bit(B_Root, &b->flags)) {
                                        atomic_dec(&b->parent->pincnt[ph]);
                                        if (test_bit(B_InoIdx, &b->flags))
                                                nb = &LAFSI(b->inode)->dblock->b;
                                        if (next_parent != nb) {
                                                LAFS_BUG(next_parent, b);
                                                atomic_inc(&nb->refcnt);

                /* check the ->parent link */
                /* Don't need ->parent any more */
                if (next_parent == NULL)
                        next_parent = &b->parent->b;
                else if (next_parent == &b->parent->b ||
                         next_parent->parent == b->parent) {
                        if (atomic_dec_and_test(&b->parent->b.refcnt))
                        del_ref(&b->parent->b, MKREF(child),
                if (test_bit(B_InoIdx, &b->flags)) {
                        /* While an InoIdx has a parent we hold a count on
                         * the dblock. Now we have dropped one, we must drop the
                         */
                        struct datablock *db = LAFSI(b->inode)->dblock;
                        if (&db->b.parent->b != next_parent &&
                            &db->b != next_parent) {
                                printk("db = %s\n", strblk(&db->b));
                                printk("dp->p = %s\n", strblk(&db->b.parent->b));
                                printk("np = %s\n", strblk(next_parent));
                                printk("b = %s\n", strblk(b));
                        BUG_ON(&db->b.parent->b != next_parent && &db->b != next_parent);
                        if (atomic_dec_and_test(&next_parent->refcnt))
                        next_parent = &db->b;
                        del_ref(&db->b, MKREF(pdblock), __FILE__, __LINE__);
                if (test_and_clear_bit(B_SegRef, &b->flags))
                        physref = b->physaddr;
                LAFS_BUG(test_bit(B_PrimaryRef, &b->flags), b);
                list_del_init(&b->siblings);
                if (test_bit(B_Index, &b->flags))
                        list_add(&b->siblings,
                                 &LAFSI(b->inode)->free_index);
                if (test_and_clear_bit(B_Prealloc, &b->flags) &&
                        lafs_summary_allocate(fs, b->inode, -1);
                if (test_and_clear_bit(B_Credit, &b->flags))
                if (test_and_clear_bit(B_ICredit, &b->flags))
                if (test_and_clear_bit(B_NCredit, &b->flags))
                if (test_and_clear_bit(B_NICredit, &b->flags))
                if (test_and_clear_bit(B_UnincCredit, &b->flags))
                lafs_space_return(fs, credits);
                if (test_bit(B_Index, &b->flags) &&
                        if (b->parent != NULL)
                                /* Could this be uninc - where
                                 */
                                printk("Problem: %s\n", strblk(b));
                        LAFS_BUG(b->parent != NULL, b);
                        /* put it on the lru */
                        spin_lock(&lafs_hash_lock);
                        if (!test_and_set_bit(B_OnFree, &b->flags)) {
                                LAFS_BUG(!list_empty(&b->lru), b);
                                list_move(&b->lru, &freelist.lru);
                        spin_unlock(&lafs_hash_lock);

                /* The last reference to a dblock with no page
                 * requires special handling.
                 * The first block on a page must be freed;
                 * the other blocks hold a reference on that
                 * first block, which must be dropped.
                 * However it is possible that a new reference was taken
                 * via _leafs. If so, we have now cleared Pinned, so
                 * get_flushable will immediately put this again,
                 * so we should leave this cleanup until later.
                 * Unlike the other tests, this one isn't idempotent, so we
                 * need the check on refcnt.
                 */
                if (!test_bit(B_Index, &b->flags) &&
                    !PagePrivate(db->page) &&
                    atomic_read(&b->refcnt) == 0) {
                        int bits = (PAGE_SHIFT -
                                    b->inode->i_blkbits);
                        int mask = (1<<bits) - 1;
                        int bnum = b->fileaddr & mask;

                        LAFS_BUG(next, next);
                        next = &db[-bnum].b;
                        del_ref(next, "lafs_release_0",
                                __FILE__, __LINE__);

                /* The last reference to an iblock requires that we
                 * deref the dblock. We don't need to re-check
                 * refcnt here, as a racing getiref_locked will take an
                 * extra dblock reference itself.
                 */
                if (test_bit(B_InoIdx, &b->flags)) {
                        LAFS_BUG(LAFSI(b->inode)->iblock
                        LAFS_BUG(next, next);
                        next = &LAFSI(b->inode)->dblock->b;
                        del_ref(next, MKREF(iblock),
                                __FILE__, __LINE__);

                /* Free a delayed-release inode */
                if (!test_bit(B_Index, &b->flags) &&
                    (myi = rcu_my_inode(dblk(b))) != NULL &&
                    (!PagePrivate(dblk(b)->page) ||
                     test_bit(I_Destroyed, &LAFSI(myi)->iflags))) {
                        dblk(b)->my_inode = NULL;
                        LAFSI(myi)->dblock = NULL;
                        /* We don't need lafs_hash_lock to clear iblock, as
                         * new refs on iblock are only taken while holding
                         * dblock, and we know dblock has no references.
                         * You would need an iget to get a ref on dblock now,
                         * and because of I_Destroyed, we know that isn't possible.
                         */
                        LAFSI(myi)->iblock = NULL;
                        spin_unlock(&b->inode->i_data.private_lock);
                        if (test_bit(I_Destroyed, &LAFSI(myi)->iflags))
                                lafs_destroy_inode(myi);
                spin_unlock(&b->inode->i_data.private_lock);
                lafs_seg_deref(fs, physref, 0);
                lafs_inode_checkpin(checkpin);
        } else if (next_parent) {
/*
 * Create (if it doesn't already exist) the 'iblock' for an inode.
 * This is a shadow of the dblock but comes into its own if/when
 * the inode's indexing tree needs to grow.
 * Return a counted reference to the index block.
 * NOTE: we must hold a reference to ->dblock when we call
 */
struct indexblock * __must_check
lafs_make_iblock(struct inode *ino, int adopt, int async, REFARG)
        struct lafs_inode *lai = LAFSI(ino);
        struct indexblock *ib, *new = NULL;
        struct fs *fs = fs_from_inode(ino);

        dprintk("MAKE_IBLOCK %d\n", (int)ino->i_ino);

        BUG_ON(lai->dblock == NULL);
        BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);

        spin_lock(&lafs_hash_lock);
                ib = getiref_locked_needsync(lai->iblock, REF);
                (void)getdref(lai->dblock, MKREF(iblock));
                spin_unlock(&lafs_hash_lock);
                        lafs_iblock_free(new);
                err = setparent(lai->dblock, async);
                        block_adopt(&ib->b, lai->dblock->b.parent);
                return ERR_PTR(err);

        new = lafs_iblock_alloc(fs, GFP_KERNEL, 0, REF);
                return ERR_PTR(-ENOMEM);
        new->b.fileaddr = 0;
        new->b.physaddr = lai->dblock->b.physaddr;
        if (test_bit(B_PhysValid, &lai->dblock->b.flags))
                set_bit(B_PhysValid, &new->b.flags);
        LAFS_BUG(!test_bit(B_Valid, &lai->dblock->b.flags), &lai->dblock->b);
        set_bit(B_Valid, &new->b.flags);
        new->depth = lai->depth;
        /* Note: this doesn't get hashed until the index
         * tree grows and this block is disconnected from
         */
        set_bit(B_InoIdx, &new->b.flags);
        if (test_bit(B_Root, &lai->dblock->b.flags))
                set_bit(B_Root, &new->b.flags);
        new->b.parent = NULL;
leaf_lookup(void *bf, int len, u32 startaddr, u32 target, u32 *nextp)
        /* buf starts with a 2-byte header:
         * if 1, then 6-byte little-endian indirect entries;
         * if 2, then 12-byte extent entries.
         */
        unsigned char *buf = bf;

        *nextp = 0xffffffff;
        case IBLK_INDIRECT: /* indirect */
                dprintk("indirect lookup for %lu from %lu, len %d\n",
                        (unsigned long)target, (unsigned long)startaddr, len);
                if (target < startaddr)
                target -= startaddr;
                /* Need both tests, as target could be very large and
                 * multiplying by 6 could overflow
                 */
                /* find the next allocated block */
                *nextp = 0xffffffff;
        case IBLK_EXTENT: /* extent */
                /* 12-byte records: 6-byte device, 2-byte length, 4
                 */
                        int mid = (lo+hi)/2;
                        elen = decode16(cp);
                        addr = decode32(cp);
                        if (elen && addr <= target)
                elen = decode16(cp);
                addr = decode32(cp);
                if (target < addr) {
                } else if (target >= addr + elen)
                *nextp = 0xffffffff; /* next extent */
                elen = decode16(cp);
                addr = decode32(cp);
                *nextp = 0xffffffff; /* no more meaningful
                                      */
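/*
 * A sketch of the two leaf entry formats handled above (an assumption
 * drawn from the header comment and the decode calls; the real code
 * reads the fields byte-wise with decode16/decode32 rather than
 * through structs):
 */
struct leaf_indirect_entry_sketch {    /* header type IBLK_INDIRECT */
        unsigned char phys[6];         /* 48-bit LE device address; slot N
                                        * maps file address startaddr+N */
} __attribute__((packed));

struct leaf_extent_entry_sketch {      /* header type IBLK_EXTENT */
        unsigned char phys[6];         /* 48-bit LE device address of start */
        unsigned char len[2];          /* 16-bit LE extent length in blocks */
        unsigned char addr[4];         /* 32-bit LE file address of start */
} __attribute__((packed));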
u32 lafs_leaf_next(struct indexblock *ib, u32 start)
        /* In this leaf index block, find the first address
         * at-or-after 'start'. If none, return 0xFFFFFFFF.
         */
        char *buf = map_iblock(ib);
        int blocksize = ib->b.inode->i_sb->s_blocksize;
        struct lafs_inode *li = LAFSI(ib->b.inode);
        u32 next = 0xffffffff;

        if (test_bit(B_InoIdx, &ib->b.flags))
                phys = leaf_lookup(buf + li->metadata_size,
                                   blocksize - li->metadata_size,
                                   ib->b.fileaddr, start, &next);
        else
                phys = leaf_lookup(buf, blocksize,
                                   ib->b.fileaddr, start, &next);
        unmap_iblock(ib, buf);
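/*
 * Example use (a sketch): walking every address present in one leaf.
 *
 *      u32 a = ib->b.fileaddr;
 *      while ((a = lafs_leaf_next(ib, a)) != 0xFFFFFFFF) {
 *              // 'a' has an entry in this leaf
 *              a += 1;
 *      }
 */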
index_lookup(void *bf, int len, u32 target, u32 *addrp, u32 *nextp)
        unsigned char *buf = bf;

        if (buf[1] || buf[0]) {
                dprintk("WARNING: not an index block %02x %02x\n",
        /* 10-byte records: 6-byte device address, 4-byte file address */
        lo = 0; hi = (len/10);
                int mid = (lo+hi)/2;
                addr = decode32(cp);
                dprintk("...addr=%lu target=%lu lo=%d mid=%d hi=%d\n",
                        (unsigned long)addr, (unsigned long)target,
                if (p && addr <= target)
        addr = decode32(cp);
        /* Nothing in this block */
        if (addr > target) {
                /* target is before first address */
        if (cp+10 <= (unsigned char *) buf + len && nextp) {
                u64 p2 = decode48(cp);
                u32 nxt = decode32(cp);

                *nextp = nxt; /* address of next index block */
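/*
 * A sketch of one such internal-index entry (an assumption matching the
 * "10-byte records" comment and the decode48/decode32 calls above):
 */
struct index_entry_sketch {
        unsigned char phys[6];         /* 48-bit LE device address of child */
        unsigned char addr[4];         /* 32-bit LE first file address covered */
} __attribute__((packed));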
int lafs_index_empty(struct indexblock *ib)
        char *buf = map_iblock(ib);
        int blocksize = ib->b.inode->i_sb->s_blocksize;
        struct lafs_inode *li = LAFSI(ib->b.inode);

        if (test_bit(B_InoIdx, &ib->b.flags))
                phys = index_lookup(buf + li->metadata_size,
                                    blocksize - li->metadata_size,
                                    0xFFFFFFFF, &addr, NULL);
        else
                phys = index_lookup(buf, blocksize,
                                    0xFFFFFFFF, &addr, NULL);
        unmap_iblock(ib, buf);
static struct indexblock *find_better(struct inode *ino,
                                      struct indexblock *ib,
                                      u32 addr, u32 *next, REFARG)
        struct indexblock *rv = ib;

        spin_lock(&ino->i_data.private_lock);
        list_for_each_entry_continue(ib, &ib->b.parent->children, b.siblings) {
                if (!test_bit(B_PrimaryRef, &ib->b.flags))
                if (test_bit(B_EmptyIndex, &ib->b.flags))
                if (ib->b.fileaddr > addr) {
                        if (next && *next > ib->b.fileaddr)
                                *next = ib->b.fileaddr;
                /* This appears to be better. */
        getref_locked(&rv->b, REF);
        spin_unlock(&ino->i_data.private_lock);
lafs_leaf_find(struct inode *inode, u32 addr, int adopt, u32 *next,
        /* Find the leaf index block which refers to this address (if any do).
         * Returns the leaf IOLocked.
         * *next will be set to an address that is higher than addr such
         * that there are no other leaves before *next. There might not
         * actually be a leaf at *next though.
         */
        struct indexblock *ib, *ib2;
        struct indexblock *hold = NULL;
        struct lafs_inode *li = LAFSI(inode);
        int blocksize = inode->i_sb->s_blocksize;

        /* Must own a reference to ->dblock */
        BUG_ON(li->dblock == NULL);
        BUG_ON(atomic_read(&li->dblock->b.refcnt) == 0);

        /* We come back to restart if we needed to unlock to
         * perform IO and we cannot be sure the parent didn't change.
         * We hold the last-seen index block in 'hold' so that if
         * no changes happened (the common case) we are certain
         * that no IO will be required to get back to where we were.
         */
        ib = lafs_make_iblock(inode, adopt, async, REF);
        offset = li->metadata_size;
                lafs_iolock_block(&ib->b);
        else if (!lafs_iolock_block_async(&ib->b))
        while (ib->depth > 1) {
                /* internal index block */
                /* NOTE: the index block could be empty, or could have
                 * nothing as early as 'addr'. In that case '0' is
                 * returned, implying that the block is either in
                 * cache, or needs to be synthesised as an empty
                 * index block (one level down).
                 */
                struct indexblock *better;

                buf = map_iblock(ib);
                iaddr = ib->b.fileaddr;
                iphys = index_lookup(buf + offset, blocksize - offset,
                                     target == addr ? next : NULL);
                unmap_iblock(ib, buf);
                ib2 = lafs_iblock_get(inode, iaddr, ib->depth-1, iphys, REF);
                        lafs_iounlock_block(&ib->b);
                better = find_better(inode, ib2, addr, next, REF);
                if (!test_bit(B_Valid, &ib2->b.flags)) {
                        lafs_iounlock_block(&ib->b);
                        err = lafs_load_block(&ib2->b, NULL);
                                err = lafs_wait_block_async(&ib2->b);
                                err = lafs_wait_block(&ib2->b);
                                lafs_iolock_block(&ib->b);
                        else if (!lafs_iolock_block_async(&ib->b))
                        /* While we did IO, the parent could have been
                         * split, so it is now the wrong parent, or it
                         * could have become EmptyIndex, though that is
                         * much less likely. If it did, then it must
                         * have had a parent, so the ref we hold means
                         * it still has a parent.
                         * So if ib->b.parent, then we might need to
                         * retry from the top, and holding a ref on ib
                         * means that we don't risk live-locking if
                         * memory for index blocks is very tight.
                         * If there is no parent, then there is
                         * no risk that ib changed.
                         */
                        putiref(hold, MKREF(hold));
                        hold = getiref(ib, MKREF(hold));
                        lafs_iolock_block(&ib2->b);
                else if (!lafs_iolock_block_async(&ib2->b))
                if (ib2->b.fileaddr != ib->b.fileaddr &&
                    test_bit(B_EmptyIndex, &ib2->b.flags)) {
                        /* need to look earlier in the parent */
                        target = ib2->b.fileaddr - 1;
                        lafs_iounlock_block(&ib2->b);
                if (!test_bit(B_Linked, &ib2->b.flags)) {
                        struct block *b2 = peer_find(fs,
                        list_add(&ib2->peers, &b2->peers);
                        set_bit(B_Linked, &ib2->b.flags);
                block_adopt(&ib2->b, ib);
                lafs_iounlock_block(&ib->b);
        putiref(hold, MKREF(hold));
        return ERR_PTR(err);
static int table_find_first(struct uninc *tbl, u32 min, u32 *addr);

static int __lafs_find_next(struct inode *ino, loff_t *addrp)
        /* Find the next allocated block in inode at or after *addrp.
         * 0 - there is no block at or after '*addrp'
         * 1 - it is worth checking the block at *addrp.
         * 2 - *addrp has been updated; check again.
         */
        struct lafs_inode *lai = LAFSI(ino);
        struct indexblock *leaf;
        struct fs *fs = fs_from_inode(ino);
        u32 nexti, nextd, nextl;

        /* Must hold a reference to ->dblock */
        BUG_ON(lai->dblock == NULL);
        BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);

        if (lai->depth == 0) {
                leaf = lafs_make_iblock(ino, NOADOPT, SYNC, MKREF(find_next));
                        return PTR_ERR(leaf);
        leaf = lafs_leaf_find(ino, *addrp, NOADOPT, &nexti, SYNC,
        /* FIXME what if 'leaf' is an error?? */
        if (lai->depth == 1)
                offset = lai->metadata_size;
        buf = map_iblock(leaf);
        p = leaf_lookup(buf+offset, fs->blocksize-offset,
                        leaf->b.fileaddr, (u32)*addrp, &nextd);
        unmap_iblock(leaf, buf);
                lafs_iounlock_block(&leaf->b);
                putiref(leaf, MKREF(find_next));
                if (*addrp == 0xFFFFFFFF) {
                        printk("at %s\n", strblk(&lai->dblock->b));
                        printk("offset = %d depth=%d\n", offset,
                        for (j = 0; j < 16; j++)
                                printk("%04x ", buf[offset+j] & 0xffff);
                        for (j = 0; j < 16; j++)
                                printk("%04x ", buf[1024-16+j] & 0xffff);
                BUG_ON(*addrp == 0xFFFFFFFF);
        if (nextd != 0xFFFFFFFF)
        else if (nexti != 0xFFFFFFFF) {
        /* If rv==2, nextd may not be high enough, as it might
         * be the address of an EmptyIndex block. So only
         * use it if nothing better is found.
         */
        list_for_each_entry(b, &leaf->children, siblings)
                if (b->fileaddr >= *addrp &&
                    b->fileaddr <= nextl &&
                    test_bit(B_Valid, &b->flags)) {
                        nextd = nextl = b->fileaddr;
        if (leaf->uninc_table.pending_cnt) {
                spin_lock(&leaf->b.inode->i_data.private_lock);
                if (table_find_first(&leaf->uninc_table, *addrp, &nextl)) {
                spin_unlock(&leaf->b.inode->i_data.private_lock);
        lafs_iounlock_block(&leaf->b);
        putiref(leaf, MKREF(find_next));
/*
 * Find the first data block at or after *bnump, and
 * store the address in *bnump.
 * Return 0 if nothing found, -1 on error, or
 * 1 if *bnump is a valid data block address.
 *
 * The block could be in the indexing tree, or in
 * an uninc_table, or totally unincorporated.
 * Blocks that are really part of the file, but are not in
 * the indexing tree, will still be on a ->children list
 * from an index block, and the index block will be very close
 * to the indexing tree.
 * So once we have found a suitable index block, we need to check
 * all its children. This could possibly be optimised using
 * find_get_pages if that were exported.
 */
int lafs_find_next(struct inode *ino, loff_t *bnump)
        /* Need to hold a reference to dblock while calling
         */
        struct datablock *db = lafs_inode_dblock(ino, SYNC, MKREF(find_nexti));
        struct datablock *b;

        rv = __lafs_find_next(ino, bnump);
        b = lafs_get_block(ino, *bnump, NULL, GFP_KERNEL,
        rv = lafs_find_block(b, NOADOPT);
                putdref(b, MKREF(find_next));
        hole = (b->b.physaddr == 0 ||
                !test_bit(B_PhysValid, &b->b.flags)) &&
                !test_bit(B_Valid, &b->b.flags);
        if (LAFSI(ino)->depth == 0 &&
                hole = 0; /* first block lives in inode */
        putdref(b, MKREF(find_next));
        *bnump = (*bnump)+1;
        putdref(db, MKREF(find_nexti));
        BUG_ON(rv > 0 && *bnump == 0xFFFFFFFF);
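/*
 * Example use (a sketch): enumerating the allocated blocks of a file.
 *
 *      loff_t bnum = 0;
 *      int rv;
 *      while ((rv = lafs_find_next(ino, &bnum)) > 0) {
 *              // bnum is the file address of an allocated data block
 *              bnum += 1;
 *      }
 *      // rv == 0: no more blocks; rv < 0: error
 */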
static int table_lookup(struct uninc *tbl, u32 addr, u64 *phys)
        for (i = tbl->pending_cnt; i; ) {
                struct addr *a = &tbl->pending_addr[--i];
                if (a->fileaddr <= addr &&
                    a->fileaddr + a->cnt > addr) {
                        *phys = a->physaddr + (addr - a->fileaddr);

static int table_find_first(struct uninc *tbl, u32 min, u32 *addr)
        /* Find the earliest allocated block in tbl which is
         * at-or-after 'min' and before '*addr', and store it in
         * *addr, returning 1 if something was found.
         */
        for (i = tbl->pending_cnt ; i ; ) {
                struct addr *a = &tbl->pending_addr[--i];
                if (a->fileaddr >= *addr)
                if (a->fileaddr + a->cnt < min)
                if (a->physaddr == 0)
                if (a->fileaddr >= min)
                        *addr = a->fileaddr;
static void fill_block_zero(struct datablock *db, struct datablock *inob)
        struct lafs_inode *lai = LAFSI(db->b.inode);
        void *baddr = map_dblock(db);
        void *iaddr = map_dblock_2(inob);
        int offset = lai->metadata_size;
        int len = (1<<db->b.inode->i_blkbits) - offset;

        memcpy(baddr, iaddr+offset, len);
        unmap_dblock_2(inob, iaddr);
        memset(baddr+len, 0, (1<<db->b.inode->i_blkbits)-len);
        unmap_dblock(db, baddr);
        set_bit(B_Valid, &db->b.flags);
static int __must_check
find_block(struct datablock *b, int adopt, int async)
        /* Find where this block is in storage, and
         * set b->b.physaddr.
         * If the block lives in the inode, physaddr
         * gets set to zero, and lafs_load_block works with this.
         * Otherwise we walk down the index tree.
         */
        struct lafs_inode *lai;
        struct indexblock *ib;
        struct datablock *db;

        LAFS_BUG(test_bit(B_Pinned, &b->b.flags) &&
                 b->b.parent == NULL,
        if (b->b.inode == NULL)
        lai = LAFSI(b->b.inode);
        if (lai->depth == 0) {
                /* Data is in the inode if anywhere */
                if (!test_bit(B_PhysValid, &b->b.flags)) {
                        set_bit(B_PhysValid, &b->b.flags);
                db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));
                if (!test_bit(B_Valid, &b->b.flags) && b->b.fileaddr == 0) {
                        /* Now is the best time to copy data from the inode into
                         * the datablock, as we know we have a valid inode block
                         */
                        LAFS_BUG(b->b.physaddr != 0, &b->b);
                        fill_block_zero(b, db);
                ib = lafs_make_iblock(b->b.inode, adopt, async,
                putdref(db, MKREF(findblock));
                block_adopt(&b->b, ib);
                putiref(ib, MKREF(findblock));
                putdref(db, MKREF(findblock));

        if (test_bit(B_PhysValid, &b->b.flags)
            && (!adopt || b->b.parent || test_bit(B_Root, &b->b.flags))) {
        db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));

        /* OK, there is some indexing information we need to
         * look through. Find the leaf first.
         */
        ib = lafs_leaf_find(b->b.inode, b->b.fileaddr, adopt, NULL, async,
        putdref(db, MKREF(findblock));
                dprintk("lafs_leaf_find returned error to lafs_find_block\n");

        if (!test_bit(B_PhysValid, &b->b.flags)
            && ib->uninc_table.pending_cnt) {
                spin_lock(&b->b.inode->i_data.private_lock);
                if (table_lookup(&ib->uninc_table, b->b.fileaddr,
                        set_bit(B_PhysValid, &b->b.flags);
                spin_unlock(&b->b.inode->i_data.private_lock);
        if (!test_bit(B_PhysValid, &b->b.flags)) {
                buf = map_iblock(ib);
                offset = lai->depth > 1 ? 0 : lai->metadata_size;
                if (test_bit(B_InoIdx, &ib->b.flags))
                        dprintk("InoIdx!!!\n");
                dprintk("offset %d: %02x %02x\n", offset,
                        ((char *)buf)[offset], ((char *)buf)[offset+1]);
                b->b.physaddr = leaf_lookup(buf+offset,
                                            b->b.inode->i_sb->s_blocksize
                                            ib->b.fileaddr, b->b.fileaddr,
                set_bit(B_PhysValid, &b->b.flags);
                unmap_iblock(ib, buf);
        dprintk("lafs_find_block: lookup for %lu of %lu at %lu found %llu\n",
                (unsigned long)b->b.fileaddr,
                (unsigned long)b->b.inode->i_ino,
                (unsigned long)ib->b.fileaddr,
        /* FIXME should peer_find go here or in load_block? */
        block_adopt(&b->b, ib);
        lafs_iounlock_block(&ib->b);
        putiref(ib, MKREF(findblock));
lafs_find_block(struct datablock *b, int adopt)
        return find_block(b, adopt, SYNC);

lafs_find_block_async(struct datablock *b)
        return find_block(b, ADOPT, ASYNC);
/*
 * lafs_allocated_block records that a block has been written at a new
 * address (or found at an address during roll-forward). This involves
 * updating some summary information (quota etc.) and making sure
 * the change gets recorded in the parent block.
 * This must be called before B_Dirty or B_Realloc are cleared,
 * to ensure that the index block is marked accordingly.
 * B_UnincCredit should still be set, and we clear it.
 */
int lafs_add_block_address(struct fs *fs, struct block *blk)
        struct indexblock *p = blk->parent;

        spin_lock(&blk->inode->i_data.private_lock);
        if (test_bit(B_Index, &blk->flags)) {
                if (!test_and_set_bit(B_Uninc, &blk->flags)) {
                        blk->chain = p->uninc;
                        LAFS_BUG(p->depth == 1, blk);
                        getref(blk, MKREF(uninc));
                spin_unlock(&blk->inode->i_data.private_lock);
                /* same phase and leaf; add the address to uninc_table */
                /* The only place we try to merge is at the end
                 * of the embedded table, and then only for data
                 * It is essential that uninc_table is sorted and,
                 * in particular, has no overlapping extents.
                 * If this block threatens problems, we incorporate first.
                 */
                LAFS_BUG(!test_bit(B_Dirty, &p->b.flags) &&
                         !test_bit(B_Realloc, &p->b.flags), blk);
                a = &p->uninc_table.pending_addr
                        [p->uninc_table.pending_cnt-1];
                if (p->uninc_table.pending_cnt >= 1 &&
                    a->fileaddr+a->cnt == blk->fileaddr &&
                    a->physaddr+a->cnt == blk->physaddr) {
                        if (test_and_clear_bit(B_UnincCredit, &blk->flags))
                                p->uninc_table.credits++;
                        spin_unlock(&blk->inode->i_data.private_lock);
                } else if ((p->uninc_table.pending_cnt == 0 ||
                            a->fileaddr + a->cnt <= blk->fileaddr) &&
                           /* properly ordered, maybe add */
                           (p->uninc_table.pending_cnt <
                            ARRAY_SIZE(p->uninc_table.pending_addr))) {
                        a->fileaddr = blk->fileaddr;
                        a->physaddr = blk->physaddr;
                        p->uninc_table.pending_cnt++;
                        if (test_and_clear_bit(B_UnincCredit,
                                p->uninc_table.credits++;
                        spin_unlock(&blk->inode->i_data.private_lock);
                spin_unlock(&blk->inode->i_data.private_lock);
                /* We own a ref through blk->parent now, but
                 * after lafs_incorporate we might not.
                 */
                getiref(p, MKREF(addblock));
                lafs_iolock_written(&p->b);
                lafs_incorporate(fs, p);
                lafs_iounlock_block(&p->b);
                putiref(p, MKREF(addblock));
int lafs_allocated_block(struct fs *fs, struct block *blk, u64 phys)
        struct indexblock *p;

        dprintk("Allocated %s -> %llu\n", strblk(blk), phys);

        /* If phys is 0, I don't need an uninc credit.
         * Also, index blocks that are not nearly full do not need UnincCredits.
         */
        LAFS_BUG(test_bit(B_InoIdx, &blk->flags), blk);
        if (!test_bit(B_Dirty, &blk->flags) &&
            !test_bit(B_Realloc, &blk->flags) &&
                printk("something missing %s\n", strblk(blk));
        LAFS_BUG(!test_bit(B_UnincCredit, &blk->flags) && phys &&
                 !test_bit(B_Index, &blk->flags), blk);
        LAFS_BUG(!test_bit(B_Dirty, &blk->flags) &&
                 !test_bit(B_Realloc, &blk->flags) &&
        LAFS_BUG(!test_bit(B_Writeback, &blk->flags), blk);

        if (test_bit(B_Root, &blk->flags)) {
                struct lafs_inode *lai;

                LAFS_BUG(test_bit(B_Index, &blk->flags), blk);
                for (i = 0; i < fs->maxsnapshot; i++)
                        if (fs->ss[i].root == blk->inode)
                LAFS_BUG(i == fs->maxsnapshot, blk);
                blk->physaddr = phys; /* superblock doesn't get
                                       * counted in summaries */
                LAFS_BUG(test_bit(B_SegRef, &blk->flags), blk);
                set_bit(B_PhysValid, &blk->flags);
                fs->ss[i].root_addr = phys;
                lai = LAFSI(blk->inode);
                if (i == 0 && lai->md.fs.usagetable > 1)
                        /* A snapshot checkpoint is happening. Record the root
                         * address in the main fs and in the snapshot.
                         */
                        fs->ss[lai->md.fs.usagetable-1].root_addr = phys;
                /* Don't bother to clear B_UnincCredit in the root.
                 * It'll just get set again. This way there is less churn.
                 */

        if (test_bit(FinalCheckpoint, &fs->fsstate) &&
            !test_bit(B_Index, &blk->flags) &&
            (LAFSI(blk->inode)->type == TypeSegmentMap ||
             LAFSI(blk->inode)->type == TypeQuota)) {
                /* These blocks will be picked up by roll-forward,
                 * so they don't need to have their address recorded,
                 * and doing so would leave the index blocks
                 * dirty after the checkpoint.
                 */
                if (test_and_clear_bit(B_SegRef, &blk->flags))
                        lafs_seg_deref(fs, blk->physaddr, 0);
                blk->physaddr = phys;
                set_bit(B_PhysValid, &blk->flags);

        /* We need to work out if this block is in the current phase or
         * the next one. This can only be an issue if a phase change
         * (aka checkpoint) is underway. But if it is, we need to
         * ensure the right phase is used.
         */
        lafs_checkpoint_lock(fs);
        if (test_bit(B_Pinned, &blk->flags))
                ph = !!test_bit(B_Phase1, &blk->flags);
        else if (test_bit(B_Index, &blk->flags))
                LAFS_BUG(1, blk); /* Index blocks must always be pinned when dirty */
        else if (p->uninc_table.pending_cnt) {
                /* Attach the data block into the previous phase, as incorporation
                 * of this phase is still to happen.
                 * This is needed for flush-if-cannot-preallocate to work.
                 */
                LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), &p->b);
                ph = !!test_bit(B_Phase1, &p->b.flags);
                lafs_pin_block_ph(blk, ph);
                lafs_pin_block(blk);
                ph = !!test_bit(B_Phase1, &blk->flags);
        lafs_checkpoint_unlock(fs);
        if (!test_bit(B_PhysValid, &blk->flags))
        lafs_summary_update(fs, blk->inode, blk->physaddr, phys,
                            test_bit(B_Index, &blk->flags), ph,
                            test_bit(B_SegRef, &blk->flags));
        clear_bit(B_Prealloc, &blk->flags);
        blk->physaddr = phys;
        set_bit(B_PhysValid, &blk->flags);

        if (ph != !!test_bit(B_Phase1, &p->b.flags)) {
                /* in the next phase */
                dprintk("uninc next phase\n");
                if (test_and_set_bit(B_Uninc, &blk->flags))
                        /* This block was already on the 'unincorporated'
                         * list (it must be changing quickly), so there
                         * is nothing else to do.
                         */
                getref(blk, MKREF(uninc));
                spin_lock(&blk->inode->i_data.private_lock);
                blk->chain = p->uninc_next;
                p->uninc_next = blk;
                spin_unlock(&blk->inode->i_data.private_lock);
                /* We will try again after the phase change.
                 * That will be when we clear B_UnincCredit.
                 */

        if (test_bit(B_Dirty, &blk->flags) || phys == 0) {
                if (!test_bit(B_Dirty, &p->b.flags)
                    && !test_bit(B_Credit, &p->b.flags)) {
                        printk("Oh dear: %s\n", strblk(blk));
                        printk(".......: %s\n", strblk(&p->b));
                lafs_dirty_iblock(p);
        if (!test_bit(B_Valid, &p->b.flags)) {
                printk("Not valid in lafs_allocated_block: %s\n",
                printk("blk is %s\n", strblk(blk));
        LAFS_BUG(!test_bit(B_Valid, &p->b.flags), &p->b);
        if (!test_bit(B_Realloc, &p->b.flags)) {
                /* I cannot use B_Credit to fill B_Realloc, as that
                 * might still be needed for B_Dirty.
                 * So if we cannot allocate a new credit,
                 * just set the block as 'dirty' now.
                 */
                if (lafs_space_alloc(fs, 1, CleanSpace) == 1) {
                        if (test_and_set_bit(B_Realloc, &p->b.flags))
                                lafs_space_return(fs, 1);
                        lafs_dirty_iblock(p);
        if (!test_and_set_bit(B_UnincCredit, &p->b.flags))
                if (!test_and_clear_bit(B_ICredit, &p->b.flags))
                        LAFS_BUG(1, &p->b); /* ICredit should be set before */

        dprintk("Uninc same phase\n");
        BUG_ON(!test_bit(B_Pinned, &blk->parent->b.flags));
        if (!lafs_add_block_address(fs, blk)) {
        LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), blk);
        /* That credit has now been added to the uninc_table and
         * thus made available to the parent.
         */
        lafs_refile(&p->b, 0);