diff --git a/block.c b/block.c
index f74a0f43af9eadbf15b707b3916de7d8f7fc7b77..9dd562915442903e83cc6669305593cc10a1aae7 100644
--- a/block.c
+++ b/block.c
@@ -2,13 +2,14 @@
 /*
  * Handle data blocks for LaFS
  * fs/lafs/block.c
- * Copyright (C) 2004-2006
+ * Copyright (C) 2004-2009
  * NeilBrown <neilb@suse.de>
  * Released under the GPL, version 2
  */
 
 #include       "lafs.h"
 #include       <linux/buffer_head.h> /* for try_to_release_page */
+#include       <linux/slab.h>
 /*
  * Data blocks are stored in a regular address space for the
  * relevant file.  A page may have multiple datablocks, but
  *
  */
 
+#if DEBUG_REF
+static DEFINE_SPINLOCK(refl);
+void add_ref(struct block *b, char *ref, char *file, int line)
+{
+       int i;
+       int z = -1;
+       spin_lock(&refl);
+       for (i = 0; i < 16; i++) {
+               if (b->holders[i].cnt) {
+                       if (strcmp(b->holders[i].name, ref) == 0) {
+                               b->holders[i].cnt++;
+                               spin_unlock(&refl);
+                               return;
+                       }
+               } else
+                       z = i;
+       }
+       if (z < 0) {
+               spin_unlock(&refl);
+               printk(KERN_ERR "LaFS: add_ref all holders are in use"
+                      " at %s:%d\nblock: %s\n",
+                      file, line, strblk(b));
+               BUG();
+       }
+       b->holders[z].cnt = 1;
+       b->holders[z].name = ref;
+       spin_unlock(&refl);
+}
+
+void del_ref(struct block *b, char *ref, char *file, int line)
+{
+       int i;
+       spin_lock(&refl);
+       for (i = 0; i < 16; i++) {
+               if (b->holders[i].cnt &&
+                   strcmp(b->holders[i].name, ref) == 0) {
+                       b->holders[i].cnt--;
+                       spin_unlock(&refl);
+                       return;
+               }
+
+       }
+       spin_unlock(&refl);
+       printk(KERN_ERR "LaFS: holder %s not found at %s:%d\nblk: %s\n",
+              ref, file, line, strblk(b));
+       BUG();
+}
+
+int has_ref(struct block *b, char *ref)
+{
+       int i;
+       spin_lock(&refl);
+       for (i = 0; i < 16; i++)
+               if (b->holders[i].cnt &&
+                   strcmp(b->holders[i].name, ref) == 0)
+                       break;
+       spin_unlock(&refl);
+       return i < 16;
+}
+#endif
+
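
The DEBUG_REF helpers above give every block a small fixed table of named
holders, so a leaked or double-dropped reference can be reported together
with the name of the code path that took it.  Below is a minimal userspace
sketch of the same bookkeeping, assuming a simplified block structure; the
pthread mutex stands in for the kernel spinlock and the 16-slot limit
mirrors the code above, but everything else is illustrative:

/* Build with: cc -pthread ... */
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define MAX_HOLDERS 16

struct holder { const char *name; int cnt; };
struct blk { struct holder holders[MAX_HOLDERS]; };

static pthread_mutex_t refl = PTHREAD_MUTEX_INITIALIZER;

/* Record one more reference held under 'ref'; reuse a slot with the same name. */
static void add_ref(struct blk *b, const char *ref)
{
        int i, z = -1;

        pthread_mutex_lock(&refl);
        for (i = 0; i < MAX_HOLDERS; i++) {
                if (b->holders[i].cnt) {
                        if (strcmp(b->holders[i].name, ref) == 0) {
                                b->holders[i].cnt++;
                                pthread_mutex_unlock(&refl);
                                return;
                        }
                } else
                        z = i;
        }
        assert(z >= 0);         /* table full: too many distinct holder names */
        b->holders[z].name = ref;
        b->holders[z].cnt = 1;
        pthread_mutex_unlock(&refl);
}

/* Drop one reference held under 'ref'; complain loudly if it was never taken. */
static void del_ref(struct blk *b, const char *ref)
{
        int i;

        pthread_mutex_lock(&refl);
        for (i = 0; i < MAX_HOLDERS; i++)
                if (b->holders[i].cnt &&
                    strcmp(b->holders[i].name, ref) == 0) {
                        b->holders[i].cnt--;
                        pthread_mutex_unlock(&refl);
                        return;
                }
        pthread_mutex_unlock(&refl);
        fprintf(stderr, "holder %s was never taken\n", ref);
        assert(0);
}

int main(void)
{
        struct blk b;

        memset(&b, 0, sizeof(b));
        add_ref(&b, "lookup");
        add_ref(&b, "lookup");
        del_ref(&b, "lookup");
        del_ref(&b, "lookup");
        return 0;
}

As in the kernel version, one global lock keeps the sketch simple at the
cost of serialising all reference traffic; that is acceptable because the
whole mechanism compiles away unless DEBUG_REF is set.
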
 /* Based on grow_dev_page */
 struct datablock *
-lafs_get_block(struct inode *ino, unsigned long index, struct page *p, int gfp)
+lafs_get_block(struct inode *ino, unsigned long index, struct page *p,
+              int gfp, REFARG)
 {
-       struct datablock *b;
+       struct datablock *b = NULL;
        int bits = PAGE_SHIFT - ino->i_blkbits;
        int unlock = !p;
+       if (!p) {
+               p = find_get_page(&ino->i_data, index >> bits);
+               if (p) {
+                       spin_lock(&ino->i_data.private_lock);
+                       if (PagePrivate(p)) {
+                               b = (struct datablock *)p->private;
+                               b += index & ((1<<bits)-1);
+                               getdref_locked(b, REF);
+                       }
+                       spin_unlock(&ino->i_data.private_lock);
+                       page_cache_release(p);
+                       p = NULL;
+               }
+               if (b)
+                       return b;
+       }
        if (!p)
-               p = find_or_create_page(ino->i_mapping, index>>bits, gfp);
+               p = find_or_create_page(&ino->i_data, index>>bits, gfp);
        if (!p) {
                dprintk("find or create returned NULL\n");
                return NULL;
        }
 
        if (!PagePrivate(p)) {
-
                unsigned long ind = p->index << bits;
                int i;
                /* New page, need to set up attribute blocks */
                /* FIXME use kmem_cache */
                dprintk("setting up %p for %lu\n", p, index);
-               BUG_ON(index == 0x5a5a5a5a);
-               b = kmalloc(sizeof(struct datablock)<<bits, gfp);
+               b = kzalloc(sizeof(struct datablock)<<bits, gfp);
                if (!b) {
-                       if (unlock)
+                       if (unlock) {
                                unlock_page(p);
-                       page_cache_release(p);
+                               page_cache_release(p);
+                       }
                        return NULL;
                }
 
-               for (i=0; i< (1<<bits); i++) {
+               for (i = 0; i < (1<<bits); i++) {
                        b[i].page = p;
                        atomic_set(&b[i].b.refcnt, 0);
                        b[i].b.flags = 0;
+                       if (PageUptodate(p))
+                               set_bit(B_Valid, &b[i].b.flags);
                        b[i].b.fileaddr = ind++;
                        b[i].b.inode = ino;
                        b[i].b.physaddr = 0;
@@ -64,11 +144,7 @@ lafs_get_block(struct inode *ino, unsigned long index, struct page *p, int gfp)
                        INIT_LIST_HEAD(&b[i].cleaning);
                        b[i].b.chain = NULL;
 
-                       b[i].my_inode = NULL;  /* FIXME does this belong here? */
-                       if (fs_from_inode(ino)->orphans == ino)
-                               atomic_set(&b[i].pincnt, 0);
-
-                       /* FIXME what else needs to be initialised? */
+                       b[i].my_inode = NULL;
                }
 
                spin_lock(&ino->i_data.private_lock);
@@ -81,12 +157,18 @@ lafs_get_block(struct inode *ino, unsigned long index, struct page *p, int gfp)
                kfree(b);
        }
 
-       b = (struct datablock*) p->private;
+       b = (struct datablock *)p->private;
        b += index & ((1<<bits)-1);
-       getdref(b);
-
-       if (unlock) unlock_page(p);
-       BUG_ON(b->b.inode != ino);
+       /* spinlock is just to sync with lafs_refile */
+       spin_lock(&ino->i_data.private_lock);
+       getdref_locked(b, REF);
+       spin_unlock(&ino->i_data.private_lock);
+
+       if (unlock) {
+               unlock_page(p);
+               page_cache_release(p);
+       }
+       LAFS_BUG(b->b.inode != ino, &b->b);
        return b;
 }
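
lafs_get_block() maps a file block number onto a page in the inode's
address space and a slot in the per-page datablock array hung off
page->private, using bits = PAGE_SHIFT - i_blkbits.  A standalone sketch
of just that arithmetic, with the page and block sizes hard-coded for
illustration:

#include <stdio.h>

#define PAGE_SHIFT 12                   /* 4096-byte pages */

int main(void)
{
        unsigned int blkbits = 10;                 /* 1024-byte blocks, say */
        unsigned int bits = PAGE_SHIFT - blkbits;  /* 1 << bits blocks per page */
        unsigned long index = 13;                  /* file block number */

        unsigned long page_index = index >> bits;           /* page in the mapping */
        unsigned long slot = index & ((1UL << bits) - 1);    /* datablock within it */

        printf("block %lu -> page %lu, slot %lu of %u\n",
               index, page_index, slot, 1U << bits);
        /* prints: block 13 -> page 3, slot 1 of 4 */
        return 0;
}
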
 
@@ -105,10 +187,11 @@ lafs_get_block(struct inode *ino, unsigned long index, struct page *p, int gfp)
 
 void lafs_invalidate_page(struct page *page, unsigned long offset)
 {
-       struct super_block *sb = page->mapping->host->i_sb;
+       struct inode *ino = page->mapping->host;
+       struct super_block *sb = ino->i_sb;
        int bits = PAGE_SHIFT - sb->s_blocksize_bits;
-       loff_t size = i_size_read(page->mapping->host);
-       loff_t start = page_index(page) << PAGE_SHIFT;
+       loff_t size = i_size_read(ino);
+       loff_t start = (loff_t)page_index(page) << PAGE_SHIFT;
 
        if (PagePrivate(page)) {
                int i;
@@ -119,20 +202,34 @@ void lafs_invalidate_page(struct page *page, unsigned long offset)
                 *   erase any blocks beyond end-of-file
                 *   wait for any pending IO to complete (so page can be freed)
                 */
-               for (i=0; i<(1<<bits); i++) {
-                       if (start >= size) {
-                               if (test_bit(B_Dirty, &b[i].b.flags) ||
-                                   test_bit(B_Uninc, &b[i].b.flags) ||
-                                   test_bit(B_PinPending, &b[i].b.flags))
-                                       lafs_erase_dblock(&b[i]);
-                       } else if (b_start >= offset) {
-                               lafs_iolock_block(&b[i].b);
+               for (i = 0; i < (1<<bits); i++) {
+                       spin_lock(&ino->i_data.private_lock);
+                       (void)getdref_locked(&b[i], MKREF(inval));
+                       spin_unlock(&ino->i_data.private_lock);
+
+                       if (b_start >= offset &&
+                           test_and_clear_bit(B_Async, &b[i].b.flags))
+                               putdref(&b[i], MKREF(Async));
+
+                       if (LAFSI(ino)->type >= TypeBase && start >= size)
+                               /* Remove block from mapping and file */
+                               lafs_erase_dblock(&b[i]);
+                       else if (b_start >= offset) {
+                               /* Just remove block from mapping */
+                               lafs_iolock_written(&b[i].b);
+                               LAFS_BUG(test_bit(B_Dirty, &b[i].b.flags),
+                                        &b[i].b);
+                               LAFS_BUG(test_bit(B_Realloc, &b[i].b.flags),
+                                        &b[i].b);
                                clear_bit(B_Valid, &b[i].b.flags);
-                               lafs_iounlock_block(&b[i].b, 0);
+                               lafs_iounlock_block(&b[i].b);
                        }
                        b_start += sb->s_blocksize;
                        start += sb->s_blocksize;
-                       BUG_ON(offset == 0 && test_bit(B_IOLock, &b[i].b.flags));
+                       LAFS_BUG(offset == 0 &&
+                                test_bit(B_IOLock, &b[i].b.flags),
+                                &b[i].b);
+                       putdref(&b[i], MKREF(inval));
                }
        }
        if (offset == 0) {
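
In lafs_invalidate_page() the offset argument is the byte offset within the
page at which invalidation starts: blocks that now sit beyond end-of-file
are erased outright (the real code additionally checks the inode type),
blocks whose start lies at or after offset are merely dropped from the
mapping, and earlier blocks are kept.  A simplified sketch of that
per-block decision, with all flag and locking detail omitted and the sizes
chosen purely for illustration:

#include <stdio.h>

int main(void)
{
        const unsigned int blocksize = 1024, blocks_per_page = 4;
        unsigned long page_start = 8192;   /* byte offset of this page in the file */
        unsigned long offset = 2048;       /* invalidate from here within the page */
        long long isize = 10240;           /* current i_size */
        unsigned int i, b_start;

        for (i = 0, b_start = 0; i < blocks_per_page; i++, b_start += blocksize) {
                long long start = page_start + b_start;

                if (start >= isize)
                        printf("block %u: beyond EOF, erase\n", i);
                else if (b_start >= offset)
                        printf("block %u: invalidated, drop from mapping\n", i);
                else
                        printf("block %u: keep\n", i);
        }
        return 0;
}
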
@@ -146,6 +243,8 @@ int lafs_release_page(struct page *page, gfp_t gfp_flags)
        struct address_space * const mapping = page->mapping;
        int bits = PAGE_SHIFT - mapping->host->i_blkbits;
        int i;
+       int credits = 0;
+       struct indexblock *parents[1<<bits];
        struct datablock *b = NULL;
 
        if (PageWriteback(page)) {
@@ -166,51 +265,56 @@ int lafs_release_page(struct page *page, gfp_t gfp_flags)
         * - free the data structures
         */
        b = (struct datablock *)page->private;
-       for (i=0; i<(1<<bits); i++) {
+       for (i = 0; i < (1<<bits); i++) {
                if (test_bit(B_WriteError, &b[i].b.flags))
                        set_bit(AS_EIO, &mapping->flags);
                if (test_bit(B_Dirty, &b[i].b.flags) ||
                    test_bit(B_Pinned, &b[i].b.flags) ||
-                   test_bit(B_Alloc, &b[i].b.flags)
+                   test_bit(B_IOLock, &b[i].b.flags) ||
+                   test_bit(B_Writeback, &b[i].b.flags)
                    /* NOTE: if we find an Uninc is set when we
-                    * need to invalidate the page, then we 
+                    * need to invalidate the page, then we
                     * should be waiting for all pages to be gone
                     * properly before allowing truncate to complete.
                     * The whole file doesn't need to be truncated yet,
                     * that can continue lazily. but all the pages must
                     * be incorporated.  Maybe we just need to
-                    * wait for a checkpoint here.?? 
+                    * wait for a checkpoint here.??
                     */
                    || test_bit(B_Uninc, &b[i].b.flags)
-                   /* || atomic_read(&b[i].b.refcnt) */
                        ) {
-                       printk("Cannot release %s\n", strblk(&b[i].b));
-                       if (!list_empty(&b[i].b.lru))
-                               printk("lru NOT empty\n");
                        spin_unlock(&mapping->private_lock);
-                       BUG();
-                       /* This not really a bug, but bugs can lead here, and this is
-                        * an unusual situation (currently) so we BUG here to be
-                        * safe.  When we find a situation that does fail
-                        * a release_page with good reason, we should remove
-                        * this BUG().
+                       /* This can happen in various valid situations
+                        * such as when cleaning a file that is only
+                        * read-only to the VM so the VM feels free
+                        * to try to release it.
                         */
                        return 0;
                }
        }
        /* OK, we are good to go. */
-       for (i=0; i<(1<<bits); i++) {
-               list_del_init(&b[i].b.siblings); // FIXME do I still want this here??
+       for (i = 0; i < (1<<bits); i++) {
+               parents[i] = b[i].b.parent;
+               b[i].b.parent = NULL;
+               list_del_init(&b[i].b.siblings);
                list_del_init(&b[i].b.lru);
                list_del_init(&b[i].b.peers);
-               getdref(&b[i]);
-               /* When !PagePrivate(page), && refcnt, we hold a ref on the 
-                * first block which hold a ref on the page.
+               (void)getdref_locked(&b[i], MKREF(lafs_release));
+               if (test_and_clear_bit(B_Credit, &b[i].b.flags))
+                       credits++;
+               if (test_and_clear_bit(B_ICredit, &b[i].b.flags))
+                       credits++;
+               if (test_and_clear_bit(B_NCredit, &b[i].b.flags))
+                       credits++;
+               if (test_and_clear_bit(B_NICredit, &b[i].b.flags))
+                       credits++;
+               /* When !PagePrivate(page), && refcnt, we hold a ref on the
+                * first block which holds a ref on the page.
                 * When ref on firstblock with !PagePrivate(page) becomes zero,
                 * we free
                 */
                if (i)
-                       getdref(&b[0]);
+                       getdref_locked(&b[0], MKREF(lafs_release_0));
                else
                        get_page(page);
        }
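
The loop above sets up the ownership chain described in the comment: once
PagePrivate is cleared, each later block in the array pins block 0, and
block 0 pins the page, so nothing is freed until the last individual block
reference is dropped.  A freestanding sketch of that chain, with plain
integers standing in for the kernel refcounts (the structure and names are
illustrative only):

#include <stdio.h>
#include <stdlib.h>

struct page_s { int refcnt; };
struct blockv { struct page_s *page; int refcnt[4]; };

static void put_block(struct blockv *v, int i)
{
        if (--v->refcnt[i] > 0)
                return;
        if (i != 0) {
                /* last ref on block i: drop the ref it held on block 0 */
                put_block(v, 0);
                return;
        }
        /* last ref on block 0: release the page and free the array */
        if (--v->page->refcnt == 0) {
                free(v->page);
                printf("page released\n");
        }
        free(v);
        printf("block array freed\n");
}

int main(void)
{
        struct page_s *p = malloc(sizeof(*p));
        struct blockv *v = malloc(sizeof(*v));
        int i;

        if (!p || !v)
                return 1;
        p->refcnt = 1;                  /* held on behalf of the block array */
        v->page = p;
        for (i = 0; i < 4; i++)
                v->refcnt[i] = 1;       /* one caller reference per block */
        for (i = 1; i < 4; i++)
                v->refcnt[0]++;         /* each later block pins block 0 */

        for (i = 3; i >= 0; i--)
                put_block(v, i);        /* the final put frees page and array */
        return 0;
}
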
@@ -219,15 +323,16 @@ int lafs_release_page(struct page *page, gfp_t gfp_flags)
        ClearPagePrivate(page);
 
        spin_unlock(&mapping->private_lock);
-       
-       for (i=0; i<(1<<bits); i++)
-               if (b[i].b.parent)
-                       putref(&b[i].b.parent->b);
+       lafs_space_return(fs_from_inode(mapping->host), credits);
+
+       for (i = 0; i < (1<<bits); i++) {
+               putdref(&b[i], MKREF(lafs_release));
+               putiref(parents[i], MKREF(child));
+       }
 
        return 1;
 }
 
-
 /* Pinning and dirtying of datablocks.
  * Before a modification of a datablock can be allowed we must be sure there
  * will be room to write it out.  Thus suitable pre-allocations are required.
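
The comment above states the invariant the rest of this file relies on:
space must be reserved (credits taken) while an error can still be
returned to the caller, so that actually dirtying a block can never fail.
A small sketch of that reserve-then-commit pattern; the function names and
the free-block counter are illustrative, not the LaFS API:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static long free_blocks = 2;            /* pretend free-space counter */

/* May fail: take a credit while the caller can still report an error. */
static int reserve_credit(bool *credit)
{
        if (*credit)
                return 0;               /* already reserved */
        if (free_blocks == 0)
                return -ENOSPC;
        free_blocks--;
        *credit = true;
        return 0;
}

/* Must not fail: consume the credit that reserve_credit() guaranteed. */
static void mark_dirty(bool *credit, bool *dirty)
{
        if (*dirty)
                return;
        if (!*credit) {
                fprintf(stderr, "BUG: dirtying without a credit\n");
                return;
        }
        *credit = false;                /* the credit now backs the dirty block */
        *dirty = true;
}

int main(void)
{
        bool credit = false, dirty = false;

        if (reserve_credit(&credit) == 0)       /* e.g. while pinning */
                mark_dirty(&credit, &dirty);    /* later; cannot fail */
        printf("dirty=%d credit=%d free=%ld\n", dirty, credit, free_blocks);
        return 0;
}
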
@@ -273,221 +378,330 @@ lafs_reserve_block(struct block *b, int alloc_type)
 {
        int err = 0;
        struct fs *fs = fs_from_inode(b->inode);
+       int in_emergency;
+
+       if (!test_bit(B_PhysValid, &b->flags))
+               b->physaddr = 0;
 
        if (test_bit(B_Index, &b->flags))
-               BUG_ON(b->parent == NULL && !test_bit(B_Root, &b->flags));
+               LAFS_BUG(b->parent == NULL && !test_bit(B_Root, &b->flags),
+                        b);
        else
                err = lafs_setparent(dblk(b));
+       if (err)
+               return err;
 
-       if (err) printk("EA err=%d\n", err);
-       if (test_bit(B_Realloc, &b->flags))
-               alloc_type = CleanSpace;
-       if (alloc_type == NewSpace && b->physaddr)
+       /* If there is already a physaddr, or the data is
+        * stored in the inode, then we aren't really allocating
+        * new space.
+        * When unlinking from a small directory, this can
+        * be an issue.
+        */
+       if (alloc_type == NewSpace &&
+           (b->physaddr || (b->fileaddr == 0
+                            && LAFSI(b->inode)->depth == 0)))
                alloc_type = ReleaseSpace;
 
-       if (alloc_type == NewSpace && !b->physaddr &&
-           b->fileaddr == 0 &&
-           LAFSI(b->inode)->depth == 0) {
-               /* HACK. If the file isn't growing, we can use
-                * ReleaseSpace.  But we don't really know.
-                * This is needed for unlink
-                */
-               alloc_type = ReleaseSpace;
-       }
-       if (alloc_type == NewSpace && test_bit(B_InoIdx, &b->flags)) {
-               /* HACK, phys isn't necessarily set for the InoIdx block.
-                * Is this a good way to handle that?
+       if (alloc_type == NewSpace && test_bit(B_InoIdx, &b->flags))
+               /* physaddr isn't necessarily set for the InoIdx block.
                 */
                alloc_type = ReleaseSpace;
-       }
 
-       err = err ?: lafs_prealloc(b, alloc_type);
-       if (err) printk("EB err=%d %s\n", err, strblk(b));
+       /* Important to test EmergencyClean before we
+        * call in to lafs_space_alloc, to avoid races:
+        * space becomes available and EmergencyClean is
+        * set at the same time (strange, but true).
+        */
+       in_emergency = test_bit(EmergencyClean, &fs->fsstate);
+       /* Allocate space in the filesystem */
+       err = lafs_prealloc(b, alloc_type);
+       if (err) {
+               if (alloc_type == NewSpace) {
+                       if (in_emergency)
+                               return -ENOSPC;
+                       return -EAGAIN;
+               }
+               if (alloc_type == ReleaseSpace)
+                       return -EAGAIN;
+               LAFS_BUG(1, b);
+       }
 
-       if (err == 0 && b->physaddr == 0 &&
+       /* Allocate space in the file (and quota set) */
+       if (b->physaddr == 0 &&
+           !test_bit(B_Index, &b->flags) &&
            !test_and_set_bit(B_Prealloc, &b->flags)) {
                err = lafs_summary_allocate(fs, b->inode, 1);
-       if (err) printk("EC err=%d\n", err);
                if (err)
                        clear_bit(B_Prealloc, &b->flags);
        }
-       if (err)
+       if (err) {
+               LAFS_BUG(alloc_type == AccountSpace, b);
                return err;
-
-       if ( ! test_and_set_bit(B_SegRef, &b->flags) && b->physaddr) {
-               lafs_seg_ref(fs, b->physaddr, 0);
-               /* FIXME can that fail?? */
-               /* It shouldn't fail, but might block */
-               /* setting SegRef before getting the ref is needed to
-                * avoid infinite recursion.  but it isn't really
-                * safe....
-                */
        }
 
-       return 0;
+       /* Having reserved the block, we need to get a segref,
+        * which will involve reserving those blocks too.
+        * However we never get a segref for Root or any
+        * InoIdx block.
+        */
+
+       if (test_bit(B_InoIdx, &b->flags))
+               b = &LAFSI(b->inode)->dblock->b;
+
+       while (err == 0
+              && !test_bit(B_Root, &b->flags)
+              && !test_bit(B_SegRef, &b->flags))
+               err = lafs_seg_ref_block(b, 0);
+
+       return err;
 }
 
 int
-lafs_pin_dblock(struct datablock *b)
+lafs_pin_dblock(struct datablock *b, int alloc_type)
 {
        /* We need to:
         * - pin parents and inode
         * - preallocate as needed
-        * - reference the old segment 
+        * - reference the old segment
         * - update flags and pointers.
         */
+       /* FIXME I probably need an iolock here to avoid racing with
+        * lafs_cluster_allocate which can clear dirty and so lose credits.
+        */
        int err;
        struct fs *fs = fs_from_inode(b->b.inode);
-       struct block *blk;
 
-       /* We don't pin a datablock of an inode if there is an
-        * InoIdx block. We pin the InoIdx block instead.
-        * They might both be pinned at the same time, but
-        * only when the index block has swapped phase and the 
-        * data block is waiting to be written.
-        */
-       if (LAFSI(b->b.inode)->type == TypeInodeFile &&
-           b->my_inode &&
-           LAFSI(b->my_inode)->iblock) {
-               struct indexblock *ib = lafs_make_iblock(b->my_inode, 1, 0);
-               if (IS_ERR(ib))
-                       blk = getref(&b->b);
-               else
-                       blk = &ib->b;
-       } else
-               blk = getref(&b->b);
-
-       BUG_ON(!test_phase_locked(fs));
-
-       lafs_phase_wait(blk);
+       LAFS_BUG(!test_bit(B_PinPending, &b->b.flags), &b->b);
+       if (LAFSI(b->b.inode)->type != TypeSegmentMap) {
+               LAFS_BUG(!test_phase_locked(fs), &b->b);
+               lafs_iolock_written(&b->b);
+               /* If this block is already pinned in the previous
+                * phase, now is a good time to flip it - we know it has
+                * been written and we want to flip it before it
+                * can be dirtied.
+                */
+               if (test_bit(B_Pinned, &b->b.flags) &&
+                   !!test_bit(B_Phase1, &b->b.flags) != fs->phase) {
+                       clear_bit(B_PinPending, &b->b.flags);
+                       lafs_refile(&b->b, 0);
+                       set_bit(B_PinPending, &b->b.flags);
+               }
+               lafs_iounlock_block(&b->b);
+       }
 
-       set_bit(B_PinPending, &b->b.flags);
-       err = lafs_reserve_block(blk, NewSpace);
+       err = lafs_reserve_block(&b->b, alloc_type);
 
-       if (err) {
-               clear_bit(B_PinPending, &b->b.flags);
-               putref(blk);
+       if (err)
                return err;
-       }
 
-       lafs_pin_block(blk);
-       putref(blk);
+       lafs_pin_block(&b->b);
        return 0;
 }
 
 /* lafs_dirty_dblock
  * This cannot fail.  The block is already 'pinned' for writing
  * so any preallocations and other checks have passed.
- * We mark the block as being dirty and possibly attach it to the
- * current phase.
  */
 void
 lafs_dirty_dblock(struct datablock *b)
 {
-       /* FIXME is this all I have to do here?
-        * Do I need to put it on a list, or lock or something?
-
-        * Note, only need to set that phase if locked.
-        * Then no-one may change it while in phase transition.
+       LAFS_BUG(!test_bit(B_Valid, &b->b.flags), &b->b);
+       /*
         * FIXME maybe check we aren't dirtying a dirty block
         * in the previous phase.
         */
-
-       set_bit(B_Valid, &b->b.flags);
-
-       if (!test_and_set_bit(B_Dirty, &b->b.flags))
-               if (!test_and_clear_bit(B_Realloc, &b->b.flags))
-                       if (!test_and_clear_bit(B_Credit, &b->b.flags))
-                               if (!test_and_clear_bit(B_NCredit, &b->b.flags))
-                               BUG(); // Credit should have been set.
-
+//     LAFS_BUG(b->b.inode->i_ino == 0 && !test_bit(B_Pinned, &b->b.flags), &b->b);
+       if (!test_and_set_bit(B_Dirty, &b->b.flags)) {
+               if (!test_and_clear_bit(B_Credit, &b->b.flags))
+                       if (!test_and_clear_bit(B_NCredit, &b->b.flags))
+                               LAFS_BUG(1, &b->b); // Credit should have been set.
+               __set_page_dirty_nobuffers(b->page);
+       }
        if (!test_and_set_bit(B_UnincCredit, &b->b.flags))
                if (!test_and_clear_bit(B_ICredit, &b->b.flags))
                        if (!test_and_clear_bit(B_NICredit, &b->b.flags))
-                               BUG(); // ICredit should be set before we dirty a block.
-
-       if (test_and_clear_bit(B_Realloc, &b->b.flags))
-               lafs_space_return(fs_from_inode(b->b.inode), 1);
-
-       // FIXME Do I need to do something with PinPending??
-
+                               LAFS_BUG(1, &b->b);     /* ICredit should be set before we dirty
+                                                        * a block. */
 }
 
-void
-lafs_erase_dblock(struct datablock *b)
+static void
+erase_dblock_locked(struct datablock *b)
 {
        struct fs *fs = fs_from_inode(b->b.inode);
-       if (!b->b.parent) {
-               printk("erase with no parent: %s\n", strblk(&b->b));
-       }
+
+       dprintk("Eraseblock for %s\n", strblk(&b->b));
        if (b->b.physaddr == 0 &&
            b->b.fileaddr == 0 &&
            LAFSI(b->b.inode)->depth == 0) {
-               /* This lives in the inode, so we cannot drop the
-                * block, we have to zero it.
+               /* We need to clear out the index block that this
+                * block lives in.
+                * Need private_lock to be allowed to dereference ->iblock
+                * though if b was dirty we shouldn't.... FIXME.
+                * We need to hold the ref to idb for the getiref_locked_needsync to
+                * be safe.
                 */
-               char *buf = map_dblock(b);
-               memset(buf, 0, (1<<b->b.inode->i_blkbits));
-               unmap_dblock(b, buf);
-               return;
+               struct indexblock *ib;
+               struct datablock *idb = lafs_inode_dblock(b->b.inode, SYNC, MKREF(erase));
+               if (IS_ERR(idb)) {
+                       /* not much we can do here */
+                       BUG();
+                       goto skip;
+               }
+               spin_lock(&lafs_hash_lock);
+               ib = LAFSI(b->b.inode)->iblock;
+               if (ib)
+                       getiref_locked_needsync(ib, MKREF(erasedblock));
+               spin_unlock(&lafs_hash_lock);
+               if (ib)
+                       sync_ref(&ib->b);
+               putdref(idb, MKREF(erase));
+               if (ib) {
+                       lafs_iolock_written(&ib->b);
+                       if (ib->depth == 0) {
+                               LAFS_BUG(LAFSI(b->b.inode)->depth !=
+                                        ib->depth, &b->b);
+                               ib->depth = 1;
+                               LAFSI(b->b.inode)->depth = 1;
+                               lafs_clear_index(ib);
+                               clear_bit(B_PhysValid, &b->b.flags);
+                               clear_bit(B_SegRef, &b->b.flags); /* Just in case */
+                       }
+                       lafs_iounlock_block(&ib->b);
+                       putiref(ib, MKREF(erasedblock));
+               }
+       skip:;
        }
-       lafs_iolock_block(&b->b);
+
+       if (LAFSI(b->b.inode)->type == TypeInodeFile) {
+               struct inode *ino = rcu_my_inode(b);
+               if (ino && LAFSI(ino)->iblock)
+                       LAFS_BUG(LAFSI(ino)->iblock->depth > 1,
+                                &b->b);
+               rcu_iput(ino);
+       }
+
        clear_bit(B_Valid, &b->b.flags);
+       lafs_unclean(b);
        if (test_and_clear_bit(B_Dirty, &b->b.flags))
                lafs_space_return(fs, 1);
+       if (test_and_clear_bit(B_Realloc, &b->b.flags))
+               lafs_space_return(fs, 1);
        if (test_and_clear_bit(B_Prealloc, &b->b.flags))
                if (b->b.physaddr == 0)
                        lafs_summary_allocate(fs, b->b.inode, -1);
-       lafs_iounlock_block(&b->b, 0);
 
-       if (b->b.physaddr)
+       spin_lock(&fs->lock);
+       if (test_bit(B_Pinned, &b->b.flags)) {
+               /* When erasing a pinned dblock it will usually be on a
+                * leaf list, so we must remove it.
+                * However it is IOLocked so it might not be on the leaf list.
+                */
+               LAFS_BUG(test_bit(B_Writeback, &b->b.flags), &b->b);
+               if (!list_empty(&b->b.lru)) {
+                       list_del_init(&b->b.lru);
+               }
+               if (!test_bit(B_Root, &b->b.flags))
+                       atomic_dec(&b->b.parent->pincnt
+                                  [!!test_bit(B_Phase1, &b->b.flags)]);
+               clear_bit(B_Pinned, &b->b.flags);
+               spin_unlock(&fs->lock);
+               if (!test_bit(B_Root, &b->b.flags))
+                       lafs_refile(&b->b.parent->b, 0);
+       } else
+               spin_unlock(&fs->lock);
+
+       /* we set Writeback to validate the call to lafs_allocated_block */
+       set_bit(B_Writeback, &b->b.flags);
+       lafs_iounlock_block(&b->b);
+       if (b->b.parent == NULL)
+               /* Erasing a block that isn't in the indexing tree only
+                * happens when truncating and lafs_invalidate_page is called
+                * on some clean page.
+                * So we don't clear out physaddr here, but instead leave that
+                * to the core truncate code.
+                * Just remove B_PhysValid to avoid confusion.
+                */
+               clear_bit(B_PhysValid, &b->b.flags);
+       else if (b->b.physaddr)
                lafs_allocated_block(fs, &b->b, 0);
-       else if (test_and_clear_bit(B_UnincCredit, &b->b.flags))
-               lafs_space_return(fs, 1);
+       else
+               if (test_and_clear_bit(B_UnincCredit, &b->b.flags))
+                       lafs_space_return(fs, 1);
+       lafs_writeback_done(&b->b);
 }
 
 void
-lafs_erase_iblock(struct indexblock *b)
+lafs_erase_dblock(struct datablock *b)
 {
-       struct fs *fs = fs_from_inode(b->b.inode);
-
-       clear_bit(B_Valid, &b->b.flags);
-       if (test_and_clear_bit(B_Dirty, &b->b.flags))
-               lafs_space_return(fs, 1);
+       lafs_iolock_written(&b->b);
+       erase_dblock_locked(b);
+}
 
-       if (b->b.physaddr)
-               lafs_allocated_block(fs, &b->b, 0);
+int
+lafs_erase_dblock_async(struct datablock *b)
+{
+       int rv;
+       rv = lafs_iolock_written_async(&b->b);
+       if (rv)
+               erase_dblock_locked(b);
+       return rv;
 }
 
 void
-lafs_dirty_iblock(struct indexblock *b)
+lafs_dirty_iblock(struct indexblock *b, int want_realloc)
 {
-
-       /* FIXME is this all I have to do here?
-        * Do I need to put it on a list, or lock or something?
-
-        * Note, only need to set that phase if locked.
+       /* Note, only need to set the phase if locked.
         * Then no-one may change it while in phase transition.
         * FIXME maybe check we aren't dirtying a dirty block
         * in the previous phase.
         */
 
-       set_bit(B_Valid, &b->b.flags);
+       LAFS_BUG(!test_bit(B_Pinned, &b->b.flags), &b->b);
+       LAFS_BUG(!test_bit(B_Valid, &b->b.flags) && b->depth > 0, &b->b);
+
+       if (want_realloc) {
+               /* Try to mark for Realloc instead.  If we cannot get the
+                * credits, fall back on Dirty
+                */
+               struct fs *fs = fs_from_inode(b->b.inode);
+               if (!test_bit(B_Realloc, &b->b.flags)) {
+                       /* I cannot use B_Credit to fill B_Realloc as that
+                        * might still be needed for B_Dirty.
+                        * So if we cannot allocate a new credit,
+                        * just set the block as 'dirty' now.
+                        */
+                       if (lafs_space_alloc(fs, 1, CleanSpace) == 1) {
+                               if (test_and_set_bit(B_Realloc, &b->b.flags))
+                                       lafs_space_return(fs, 1);
+                       } else
+                               goto dirty;
+               }
+               if (!test_bit(B_UnincCredit, &b->b.flags)) {
+                       /* Ditto for UnincCredit */
+                       if (lafs_space_alloc(fs, 1, CleanSpace) == 1) {
+                               if (test_and_set_bit(B_UnincCredit, &b->b.flags))
+                                       lafs_space_return(fs, 1);
+                       } else
+                               goto dirty;
+               }
+               return;
+       }
+dirty:
        if (!test_and_set_bit(B_Dirty, &b->b.flags)) {
-               /* FIXME is it completely safe to just clear Realloc here??? */
-               if (!test_and_clear_bit(B_Realloc, &b->b.flags))
-                       if (!test_and_clear_bit(B_Credit, &b->b.flags)) {
-                               printk("Why have I no credits? %s\n", strblk(&b->b));
-                               BUG(); // Credit should have been set.
-                       }
+               if (!test_and_clear_bit(B_Credit, &b->b.flags)) {
+                       printk(KERN_ERR "Why have I no credits?\n");
+                       LAFS_BUG(1, &b->b); // Credit should have been set.
+               }
        }
 
-       if (!test_and_set_bit(B_UnincCredit, &b->b.flags))
-               if (!test_and_clear_bit(B_ICredit, &b->b.flags))
-                       BUG(); // ICredit should be set before we dirty a block.
-       if (test_and_clear_bit(B_Realloc, &b->b.flags))
-               lafs_space_return(fs_from_inode(b->b.inode), 1);
-
-       // FIXME Do I need to do something with PinPending??
-
+       /* Iblocks don't always have ICredits.  If they have room
+        * for 3 new addresses, the ICredit is not essential.  But
+        * it is preferred.
+        */
+       if (!test_bit(B_UnincCredit, &b->b.flags))
+               /* We would like a credit */
+               if (test_and_clear_bit(B_ICredit, &b->b.flags))
+                       /* We have a credit */
+                       if (test_and_set_bit(B_UnincCredit, &b->b.flags))
+                               /* race - we didn't need it after all */
+                               lafs_space_return(fs_from_inode(b->b.inode), 1);
 }