/*
 * Handle index blocks for LaFS.
 * fs/lafs/index.c
 * Copyright (C) 2005-2009
 * Neil Brown <neilb@suse.de>
 * Released under the GPL, version 2
 */

#include        "lafs.h"
#include        <linux/types.h>
#include        <linux/hash.h>
#include        <linux/slab.h>
/*
 * For each index-block there is an 'indexblock' data structure,
 * and a 'block' of memory, which is a page or part thereof.
 *
 * Index-blocks are stored in a hash table, indexed by
 * (struct inode *, fileaddr, depth).  There is one hash table
 * for all filesystems.
 *
 * There is a single free-list for all index blocks from all
 * filesystems, of all sizes.  Index blocks only get on this list when
 * they are unpinned leafs, i.e. their 'children' list is empty.  As they
 * are freed, their parents may get added to the list.
 *
 * It's not clear yet how memory pressure will flush out dirty index blocks.
 *
 * All indexblocks for an inode are either attached under the inode via
 * the children/siblings tree, or on a per-inode 'free' list also using
 * the 'siblings' linkage.  Thus we are sure that they can be removed when
 * the inode is removed.  Note that data blocks do *not* necessarily live
 * in this tree (though they will when they are dirty).  They can always be
 * found in the radix_tree for the inode's address-space.
 *
 * STILL TO DO:
 *   Consider how 'rcu' can be used for
 *     - hash table
 *     - freelist
 *   Consider gang-freeing for adding to head of lru lists.
 */
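
/* Illustrative sketch (not part of the code): the cache key described
 * above is the triple (inode, file address, depth), so a lookup is just
 * a walk of one hash chain.  This mirrors ihash_lookup() below; 'bucket'
 * is a hypothetical local name:
 *
 *	struct hlist_head *bucket =
 *		&hash_table[ihash(inode, addr, depth)];
 *	hlist_for_each_entry(ib, tmp, bucket, hash)
 *		if (ib->b.fileaddr == addr && ib->b.inode == inode &&
 *		    ib->depth == depth)
 *			break;		// found in cache
 *
 * Note that two index blocks for the same file address can coexist
 * provided their depths differ.
 */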

/* FIXME these should be configured at runtime based on memory */
#define HASHBITS        10
static struct hlist_head hash_table[1<<HASHBITS];
spinlock_t lafs_hash_lock;

/* static */ struct freelists freelist;

static int lafs_shrinker(int nr_to_scan, /*gfp_t*/unsigned int gfp_mask)
{
        if (nr_to_scan) {
                LIST_HEAD(togo);
                if (!(gfp_mask & __GFP_FS))
                        return -1;
                dprintk("scan %d\n", nr_to_scan);
                spin_lock(&lafs_hash_lock);
                while (nr_to_scan && !list_empty(&freelist.lru)) {
                        struct indexblock *ib = list_entry(freelist.lru.next,
                                                           struct indexblock,
                                                           b.lru);
                        /* Check if it is really free */

                        if (atomic_read(&ib->b.refcnt)) {
                                list_del_init(&ib->b.lru);
                                clear_bit(B_OnFree, &ib->b.flags);
                                freelist.freecnt--;
                                continue;
                        }
                        if (test_bit(B_InoIdx, &ib->b.flags))
                                LAFSI(ib->b.inode)->iblock = NULL;

                        /* Any pinned iblock must have children, be
                         * refcounted, or be on a 'leaf' lru, which gives
                         * a refcount.  So this unreffed block cannot be
                         * pinned.  This BUG_ON is here because previously
                         * the above 'if' also tested for 'Pinned', and that
                         * seems unnecessary, but I want to be sure.
                         */
                        LAFS_BUG(test_bit(B_Pinned, &ib->b.flags), &ib->b);
                        if (!hlist_unhashed(&ib->hash))
                                hlist_del_init(&ib->hash);
                        /* delete from inode->free_index */
                        list_del_init(&ib->b.siblings);
                        list_move(&ib->b.lru, &togo);
                        nr_to_scan--;
                        freelist.freecnt--;
                }
                spin_unlock(&lafs_hash_lock);
                while (!list_empty(&togo)) {
                        struct indexblock *ib = list_entry(togo.next,
                                                           struct indexblock,
                                                           b.lru);
                        list_del(&ib->b.lru);
                        lafs_iblock_free(ib);
                }
        }
        return freelist.freecnt;
}

void lafs_release_index(struct list_head *head)
{
        /* This is the other place that index blocks get freed.
         * They are freed either by memory pressure in lafs_shrinker
         * above, or when an inode is destroyed by this code.
         * The two paths must obey the same locking rules, and ensure
         * blocks are removed from each of siblings, lru, and hash.
         * 'head' is an inode's free_index list, linked on b.siblings.
         * The blocks are all on freelist.lru.
         */
        LIST_HEAD(togo);
        spin_lock(&lafs_hash_lock);
        while (!list_empty(head)) {
                struct indexblock *ib = list_entry(head->next,
                                                   struct indexblock,
                                                   b.siblings);
                list_del_init(&ib->b.siblings);
                LAFS_BUG(!test_bit(B_OnFree, &ib->b.flags), &ib->b);
                if (!hlist_unhashed(&ib->hash))
                        hlist_del_init(&ib->hash);
                list_move(&ib->b.lru, &togo);
                freelist.freecnt--;
        }
        spin_unlock(&lafs_hash_lock);
        while (!list_empty(&togo)) {
                struct indexblock *ib = list_entry(togo.next,
                                                   struct indexblock,
                                                   b.lru);
                list_del(&ib->b.lru);
                lafs_iblock_free(ib);
        }
}

struct indexblock *
lafs_iblock_alloc(struct fs *fs, int gfp, int with_buffer, REFARG)
{
        struct indexblock *ib;

        ib = kzalloc(sizeof(*ib), gfp);
        if (!ib)
                return NULL;
        if (with_buffer) {
                ib->data = kmalloc(fs->blocksize, gfp);
                if (!ib->data) {
                        kfree(ib);
                        return NULL;
                }
        } else
                ib->data = NULL;

        ib->b.flags = 0;
        set_bit(B_Index, &ib->b.flags);
        atomic_set(&ib->b.refcnt, 1);
        add_ref(&ib->b, REF, __FILE__, __LINE__);

        INIT_LIST_HEAD(&ib->children);
        atomic_set(&ib->pincnt[0], 0);
        atomic_set(&ib->pincnt[1], 0);
        ib->uninc = NULL;
        ib->uninc_next = NULL;

        ib->b.parent = NULL;
        INIT_LIST_HEAD(&ib->b.siblings);
        INIT_LIST_HEAD(&ib->b.lru);
        INIT_LIST_HEAD(&ib->b.peers);
        INIT_HLIST_NODE(&ib->hash);
        ib->b.chain = NULL;
        ib->uninc_table.pending_cnt = 0;
        ib->uninc_table.credits = 0;
        ib->b.inode = NULL;
        return ib;
}

void lafs_iblock_free(struct indexblock *ib)
{
        kfree(ib->data);
        kfree(ib);
}

static struct shrinker hash_shrink = {
        .shrink = lafs_shrinker,
        .seeks = DEFAULT_SEEKS,
};

int lafs_ihash_init(void)
{
        int i;
        for (i = 0 ; i < (1<<HASHBITS); i++)
                INIT_HLIST_HEAD(hash_table+i);
        spin_lock_init(&lafs_hash_lock);
        INIT_LIST_HEAD(&freelist.lru);
        register_shrinker(&hash_shrink);

        return 0;
}

void lafs_ihash_free(void)
{
        unregister_shrinker(&hash_shrink);
}

static int
ihash(struct inode *inode, faddr_t addr, int depth)
{
        unsigned long h = hash_ptr(inode, BITS_PER_LONG);
        h = hash_long(h ^ addr, BITS_PER_LONG);
        return hash_long(h ^ depth, HASHBITS);
}

static struct indexblock *
ihash_lookup(struct inode *inode, faddr_t addr, int depth,
             struct indexblock *new, REFARG)
{
        struct indexblock *ib = NULL;
        struct hlist_node *tmp;
        struct hlist_head *head = &hash_table[ihash(inode, addr, depth)];

        spin_lock(&lafs_hash_lock);
        hlist_for_each_entry(ib, tmp, head, hash)
                if (ib->b.fileaddr == addr && ib->b.inode == inode &&
                    ib->depth == depth)
                        break;
        if (!tmp) {
                if (new) {
                        hlist_add_head(&new->hash, head);
                        ib = new;
                } else
                        ib = NULL;
        }
        if (ib)
                getiref_locked_needsync(ib, REF);
        spin_unlock(&lafs_hash_lock);
        if (ib)
                sync_ref(&ib->b);
        return ib;
}

struct indexblock *lafs_getiref_locked(struct indexblock *ib)
{
        int ok = 0;
        if (!ib)
                return ib;
        LAFS_BUG(!test_bit(B_InoIdx, &ib->b.flags), &ib->b);
        if (spin_is_locked(&ib->b.inode->i_data.private_lock))
                ok = 1;
        if (spin_is_locked(&lafs_hash_lock))
                ok = 1;
        if (spin_is_locked(&fs_from_inode(ib->b.inode)->lock))
                ok = 1;
        LAFS_BUG(!ok, &ib->b);

        if (atomic_inc_return(&ib->b.refcnt) == 1) {
                /* First reference to an InoIdx block implies a reference
                 * to the dblock.
                 * If two different threads come here under different locks,
                 * one could exit before the ref on the dblock is taken.
                 * However as each thread must own an indirect reference to
                 * be here, the last indirect ref to the dblock cannot
                 * disappear before all threads have exited this function,
                 * by which time this direct ref will be active.
                 */
                struct datablock *db;
                db = getdref(LAFSI(ib->b.inode)->dblock, MKREF(iblock));
                LAFS_BUG(db == NULL, &ib->b);
        }
        return ib;
}

struct indexblock *
lafs_iblock_get(struct inode *ino, faddr_t addr, int depth, paddr_t phys, REFARG)
{
        /* Find the index block with this inode/depth/address.
         * If it isn't in the cache, create it with the given
         * phys address.
         */
        struct indexblock *ib = ihash_lookup(ino, addr, depth, NULL, REF);
        struct indexblock *new;
        if (ib)
                return ib;

        if (!phys)
                return NULL;

        new = lafs_iblock_alloc(fs_from_inode(ino), GFP_KERNEL, 1, REF);
        if (!new)
                return NULL;
        new->b.fileaddr = addr;
        new->b.inode = ino;
        new->depth = depth;
        new->b.physaddr = phys;
        set_bit(B_PhysValid, &new->b.flags);

        ib = ihash_lookup(ino, addr, depth, new, REF);
        if (ib != new)
                lafs_iblock_free(new);
        return ib;
}
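
/* Usage sketch (illustration only): to find-or-create the depth-1
 * index block covering file address 'addr', with on-disk location
 * 'phys' if it is not cached.  MKREF(example) is a hypothetical
 * reference tag for the ref-tracking debug build:
 *
 *	struct indexblock *ib =
 *		lafs_iblock_get(inode, addr, 1, phys, MKREF(example));
 *	if (!ib)
 *		return -ENOENT;	// not cached and no physical address
 *	...use ib...
 *	putiref(ib, MKREF(example));
 *
 * Note the allocate-then-insert pattern above: a freshly allocated
 * block is handed to ihash_lookup(), which either inserts it or
 * returns the block that won the race, in which case ours is freed.
 */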

void lafs_hash_iblock(struct indexblock *ib)
{
        /* This block was an InoIdx block but has just become a real
         * index block, so hash it.  There cannot be a block with this
         * address already as we haven't increased the inode depth yet.
         * Alternatively this block was recently split off another,
         * and has now been incorporated, so a lookup might need
         * to find it.  Either way, it had better be in the hash table.
         */
        struct hlist_head *head =
                &hash_table[ihash(ib->b.inode, ib->b.fileaddr, ib->depth)];
        spin_lock(&lafs_hash_lock);
        LAFS_BUG(!hlist_unhashed(&ib->hash), &ib->b);
        hlist_add_head(&ib->hash, head);
        spin_unlock(&lafs_hash_lock);
}

void lafs_unhash_iblock(struct indexblock *ib)
{
        /* This block has been emptied and deleted and is no longer
         * wanted in the hash tables.
         */
        spin_lock(&lafs_hash_lock);
        LAFS_BUG(hlist_unhashed(&ib->hash), &ib->b);
        hlist_del_init(&ib->hash);
        spin_unlock(&lafs_hash_lock);
}

/* Adopt blk into parent if possible.
 */
static void
block_adopt(struct block *blk, struct indexblock *parent)
{
        struct address_space *as;

        if (parent == NULL) {
                LAFS_BUG(!test_bit(B_Root, &blk->flags), blk);
                return;
        }

        if (blk->parent) {
                LAFS_BUG(blk->parent != parent, blk);
                return;
        }

        LAFS_BUG(!test_bit(B_Valid, &parent->b.flags), &parent->b);

        as = &blk->inode->i_data;

        spin_lock(&as->private_lock);
        if (blk->parent) {
                if (blk->parent != parent) {
                        printk("blk = %s\n", strblk(blk));
                        printk("blk->p = %s\n", strblk(&blk->parent->b));
                        printk("parent = %s\n", strblk(&parent->b));
                        lafs_dump_tree();
                }
                LAFS_BUG(blk->parent != parent, blk);
        } else {
                if (test_bit(B_EmptyIndex, &parent->b.flags)) {
                        LAFS_BUG(!test_bit(B_Index, &parent->b.flags), &parent->b);
                        LAFS_BUG(test_bit(B_InoIdx, &parent->b.flags), &parent->b);
                        LAFS_BUG(parent->b.fileaddr != parent->b.parent->b.fileaddr, &parent->b);

                        clear_bit(B_EmptyIndex, &parent->b.flags);
                }
                LAFS_BUG(parent == iblk(blk), blk);
                blk->parent = parent;
                getiref(parent, MKREF(child));
                /* Remove from the per-inode free list */
                spin_lock(&lafs_hash_lock);
                if (!test_bit(B_InoIdx, &blk->flags))
                        list_move(&blk->siblings, &parent->children);
                else {
                        /* Remove from ->free_index list */
                        list_del_init(&blk->siblings);
                        /* Having set ->parent, we take a ref on the dblock.
                         * The dblock must exist because we can only reach
                         * here from lafs_make_iblock, which can only be
                         * called while holding a ref on the dblock.
                         */
                        (void)getdref(LAFSI(blk->inode)->dblock, MKREF(pdblock));
                }
                spin_unlock(&lafs_hash_lock);
        }
        spin_unlock(&as->private_lock);
}

static int __must_check
find_block(struct datablock *b, int adopt, int async);

static int __must_check
setparent(struct datablock *blk, int async)
{
        int err = 0;
        if (blk->b.parent == NULL && !test_bit(B_Root, &blk->b.flags))
                err = find_block(blk, 1, async);
        return err;
}

int __must_check
lafs_setparent(struct datablock *blk)
{
        return setparent(blk, 0);
}

void lafs_pin_block_ph(struct block *b, int ph)
{
        struct fs *fs = fs_from_inode(b->inode);
        struct block *p;
        struct inode *ino;

        LAFS_BUG(b->parent == NULL && !test_bit(B_Root, &b->flags), b);
        if (!test_bit(B_Realloc, &b->flags) &&
            LAFSI(b->inode)->type != TypeSegmentMap)
                LAFS_BUG(!test_phase_locked(fs), b);

        ino = b->inode;
        spin_lock(&ino->i_data.private_lock);
        for (p = b ;
             p && !test_bit(B_Pinned, &p->flags) ;
             p = &(p->parent)->b) {
                set_phase(p, ph);
                if (test_bit(B_InoIdx, &p->flags)) {
                        struct datablock *db = LAFSI(p->inode)->dblock;

                        spin_unlock(&ino->i_data.private_lock);
                        lafs_inode_checkpin(p->inode);
                        ino = db->b.inode;
                        spin_lock(&ino->i_data.private_lock);
                }
        }
        spin_unlock(&ino->i_data.private_lock);
        lafs_refile(b, 0);
}

static void pin_all_children(struct indexblock *ib)
{
        /* If we cannot get new credits for phase_flip, we need to
         * pin all dirty data-block children so that they get
         * written and don't remain to require a phase_flip
         * on the next checkpoint (at which point there will be
         * no credit left).
         * So if any dirty child is found, set its phase.  We don't
         * need PinPending as it is already Dirty.  All other
         * requirements for pinning will already be met.
         */
        struct block *b;
        struct indexblock *start = NULL;
        int depth = 0;
        struct fs *fs = fs_from_inode(ib->b.inode);
        int ph = fs->phase;

        /* Locking:
         * This is called with an IOLock on ib, so it is unlikely
         * that children will be added or removed(?), but we really
         * should hold the inode private_lock anyway.
         */
        getiref(ib, MKREF(pinall));
        b = NULL;
restart:
        /* We hold a reference on ib, and on 'start' if non-NULL */
        spin_lock(&ib->b.inode->i_data.private_lock);
        b = list_prepare_entry(b, &ib->children, siblings);
        list_for_each_entry_continue(b, &ib->children, siblings) {
                /* FIXME I don't descend through inode blocks to the file below! */
                if (test_bit(B_Index, &b->flags)) {
                        /* restart down */
                        depth++;
                        getref_locked(b, MKREF(pinall));
                        spin_unlock(&ib->b.inode->i_data.private_lock);
                        putiref(ib, MKREF(pinall));
                        ib = iblk(b);
                        b = NULL;
                        goto restart;
                } else if (test_bit(B_Dirty, &b->flags) &&
                           !test_bit(B_Pinned, &b->flags)) {
                        getref_locked(b, MKREF(pinall));
                        spin_unlock(&ib->b.inode->i_data.private_lock);
                        set_phase(b, ph);
                        putref(b, MKREF(pinall));
                        /* Slightly safer to restart this level */
                        b = NULL;
                        goto restart;
                }
        }
        spin_unlock(&ib->b.inode->i_data.private_lock);
        putiref(start, MKREF(pinall));
        if (depth) {
                depth--;
                start = ib;
                ib = getiref(ib->b.parent, MKREF(pinall));
                b = &start->b;
                goto restart;
        }
        putiref(ib, MKREF(pinall));
}

static void flip_phase(struct block *b)
{
        struct indexblock *p;
        int oldphase = !!test_bit(B_Phase1, &b->flags);

        LAFS_BUG(!test_bit(B_Pinned, &b->flags), b);
        if (oldphase)
                clear_bit(B_Phase1, &b->flags);
        else
                set_bit(B_Phase1, &b->flags);

        p = b->parent;

        if (p) {
                atomic_inc(&p->pincnt[1-oldphase]);
                atomic_dec(&p->pincnt[oldphase]);
        } else
                LAFS_BUG(!test_bit(B_Root, &b->flags), b);

        /* move 'next' credits to 'here' */
        if (!test_bit(B_Credit, &b->flags) &&
            test_and_clear_bit(B_NCredit, &b->flags))
                set_bit(B_Credit, &b->flags);

        if (!test_bit(B_ICredit, &b->flags) &&
            test_and_clear_bit(B_NICredit, &b->flags))
                set_bit(B_ICredit, &b->flags);
}

/* When the pinning of a block needs to be carried across a
 * checkpoint, we need to 'flip' the phase.
 * This only applies to blocks that can be pinned by a block that
 * may not be written until the next phase.
 * This includes any index block (as it may have some children in one
 * phase and some in the next) and any space-accounting block,
 * for the same reason.
 * So index blocks need to flip, and use lafs_phase_flip.
 * TypeSegmentMap and TypeQuota also need to flip, and use lafs_flip_dblock.
 * TypeInodeFile blocks don't need to be phase-flipped, though their InoIdx
 * block might.  InodeFile blocks are only pinned by metadata transactions,
 * which happen inside a checkpoint lock.
 */
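
/* A hypothetical checkpoint-time dispatch, just to illustrate how the
 * two flip routines divide the work described above (the real caller
 * is the checkpoint code, which is not in this file):
 *
 *	if (test_bit(B_Index, &b->flags))
 *		lafs_phase_flip(fs, iblk(b));	// index blocks
 *	else
 *		lafs_flip_dblock(dblk(b));	// SegmentMap/Quota blocks
 */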

void lafs_flip_dblock(struct datablock *db)
{
        /* This is an accounting block (SegmentMap or Quota)
         * which we need to write out after ->phase has changed
         * in the tail of the checkpoint.
         * We always flip the phase and reallocate from AccountSpace.
         * If all references get dropped, it will then get unpinned
         * before the next phase finishes - we don't unpin here
         * (unlike lafs_phase_flip for index blocks).
         */
        flip_phase(&db->b);
        lafs_prealloc(&db->b, AccountSpace);
        /* Parent might need to be on a leaf list now */
        lafs_refile(&db->b.parent->b, 0);
}

void lafs_phase_flip(struct fs *fs, struct indexblock *ib)
{
        /* We are performing a checkpoint; this block has been written
         * out and now needs to be flipped into the next phase.
         *
         * This involves:
         *  - processing all uninc_next blocks into uninc_table.
         *  - adjusting counts on the parent.
         *  - moving credits from 'next' to 'this' phase.
         *  - updating block counts to include phase-delayed updates.
         */
        int oldphase = !!test_bit(B_Phase1, &ib->b.flags);
        struct block *ulist;

        dprintk("FLIP %s\n", strblk(&ib->b));

        LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
        LAFS_BUG(!test_bit(B_Index, &ib->b.flags), &ib->b);
        LAFS_BUG(ib->b.parent == NULL && !test_bit(B_Root, &ib->b.flags), &ib->b);
        LAFS_BUG(ib->uninc_table.pending_cnt, &ib->b);
        LAFS_BUG(atomic_read(&ib->pincnt[oldphase]), &ib->b);
        LAFS_BUG(!test_bit(B_IOLock, &ib->b.flags), &ib->b);

        /* If this is the only reference, and the pincnts are zero
         * and all relevant flags are clear, then don't flip
         * phase but unpin.
         */
        if (test_bit(B_Pinned, &ib->b.flags) &&
            !test_bit(B_Dirty, &ib->b.flags) &&
            !test_bit(B_Realloc, &ib->b.flags) &&
            atomic_read(&ib->b.refcnt) == 1 && /* This is us */

            ib->uninc_table.pending_cnt == 0 &&
            ib->uninc == NULL &&
            ib->uninc_next == NULL &&
            atomic_read(&ib->pincnt[0]) == 0 &&
            atomic_read(&ib->pincnt[1]) == 0 &&

            (!test_bit(B_InoIdx, &ib->b.flags) ||
             !(test_bit(B_PinPending, &LAFSI(ib->b.inode)->dblock->b.flags)
               || test_bit(B_Dirty, &LAFSI(ib->b.inode)->dblock->b.flags)
               || test_bit(B_Realloc, &LAFSI(ib->b.inode)->dblock->b.flags)))
                ) {
                if (test_and_clear_bit(B_Pinned, &ib->b.flags)) {
                        if (!test_bit(B_Root, &ib->b.flags)) {
                                atomic_dec(&ib->b.parent->pincnt[oldphase]);
                                lafs_refile(&ib->b.parent->b, 0);
                        }
                        lafs_inode_checkpin(ib->b.inode);
                }
                if (test_bit(B_InoIdx, &ib->b.flags))
                        /* maybe the data block should go on a leaf list now */
                        lafs_refile(&LAFSI(ib->b.inode)->dblock->b, 0);
                lafs_iounlock_block(&ib->b);
                return;
        }

        flip_phase(&ib->b);

        if (test_bit(B_InoIdx, &ib->b.flags)) {
                struct lafs_inode *lai = LAFSI(ib->b.inode);
                struct datablock *db = lai->dblock;
                struct inode *ino = db->b.inode;

                spin_lock(&ino->i_data.private_lock);
                lai->cblocks += lai->pblocks;
                lai->pblocks = 0;
                lai->ciblocks += lai->piblocks;
                lai->piblocks = 0;
                if (lai->type == TypeInodeFile) {
                        /* Root of filesystem */
                        lai->md.fs.cblocks_used +=
                                lai->md.fs.pblocks_used;
                        lai->md.fs.pblocks_used = 0;
                }
                spin_unlock(&ino->i_data.private_lock);

                /* maybe the data block needs to be on a leaf list */
                lafs_refile(&db->b, 0);
        }

        if (lafs_prealloc(&ib->b, ReleaseSpace) < 0) {
                /* Couldn't get all N*Credit, so
                 * pin all children to this phase, so they
                 * won't be needed.
                 */
                pin_all_children(ib);
        }

        spin_lock(&ib->b.inode->i_data.private_lock);
        ulist = ib->uninc_next;
        ib->uninc_next = NULL;
        spin_unlock(&ib->b.inode->i_data.private_lock);

        lafs_iounlock_block(&ib->b);

        if (ulist)
                lafs_dirty_iblock(ib);
        while (ulist) {
                struct block *b2 = ulist;
                ulist = b2->chain;
                b2->chain = NULL;
                clear_bit(B_Uninc, &b2->flags);
                while (lafs_add_block_address(fs, b2) == 0)
                        /* We had to incorporate b2->parent, which might
                         * have split the parent; but as that will have made
                         * it dirty, there is nothing extra that we need to
                         * do here.
                         */
                        ;
                putref(b2, MKREF(uninc));
        }
        LAFS_BUG(ib->uninc_next, &ib->b);

        lafs_refile(&ib->b, 0);
        if (ib->b.parent)
                /* Parent may need to be attached to a phase_leafs now */
                lafs_refile(&ib->b.parent->b, 0);
}

/*
 * lafs_refile is called whenever we finish doing stuff
 * to a block.  It checks the state of the block and
 * makes sure the various lists etc. are correct.
 * When finished with a block we often want to drop a reference,
 * and this functionality is included in refile so that the
 * decref and associated refiling can be done under a lock.
 *
 * Key issues that lafs_refile deals with are:
 *   Make sure ->lru is on the correct list.
 *   Remove the ->parent link if no longer needed.
 *   Remove B_Lock if no longer needed.
 *   Trigger a phase change when appropriate.
 *   Remove the dir parent link if no longer needed.
 *
 * Fleshing these out a bit:
 *   possible lru lists are:
 *      phase_leafs[phase] - for dirty/pinned blocks with no pinned children
 *      clean_leafs - for blocks that are being relocated for cleaning
 *                       and that have no pinned children
 *   The parent link is only needed if this block has a refcount or one
 *     of B_Pinned, B_Dirty, B_Uninc.
 *   B_Lock is only needed if one of
 *      B_Dirty
 *    or refcount,
 *  though a refcount on a pinned datablock is only significant
 *         while the current phase is locked.
 *
 *  a phase change can (and should) happen for index blocks
 *  in the 'other' phase that are
 *    not Dirty, not Alloc, pincnt[oldphase]==0, uninc-table empty
 *
 * Refiling one block may cause a change in another block which requires
 * a recursive refile.  Naturally we unroll the tail-recursion, but
 * we need to be sure there are a limited number of tails.
 * The causes of recursion are:
 *   A datablock which no longer has a PagePrivate page holds a ref
 *      on block0 of that page, which must be dropped on last put.
 *   When any block becomes unpinned, we must refile the parent.
 *   When we clear ->parent we must deref the parent.
 *   When an InoIdx block drops ->parent it is dropped from inode->iblock,
 *      so we need to drop the implied refcnt on ->dblock.
 * So there can only be 2 blocks to recurse on to:
 *  a block0 or dblock, and a parent.
 * A block pointing to a block0 never has a parent, and an iblock has the same
 * parent as the dblock, so there can never be more than 2 outstanding blocks.
 */
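
/* The unrolled recursion in lafs_refile() therefore has this shape
 * (a sketch only; see the function below for the real logic):
 *
 *	while (b) {
 *		...refile b, recording at most one 'next' block
 *		   (a block0 or dblock) and one 'next_parent'...
 *		b = next ? next : next_parent;
 *		dec = 1;	// any follow-on block was ref'ed for us
 *	}
 */
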
int lafs_is_leaf(struct block *b, int ph)
{
        /* This is pinned and not iolocked.  Should it be on a leaf
         * list?
         */
        struct inode *myi;
        struct lafs_inode *lai;
        int rv = 1;

        if (!test_bit(B_Pinned, &b->flags) ||
            test_bit(B_Writeback, &b->flags))
                return 0;

        if (test_bit(B_Index, &b->flags)) {
                if (ph >= 0)
                        return atomic_read(&iblk(b)->pincnt[ph]) == 0;
                return (atomic_read(&iblk(b)->pincnt[0]) +
                        atomic_read(&iblk(b)->pincnt[1])) == 0;
        }

        /* This is a data block */
        myi = rcu_my_inode(dblk(b));
        if (!myi) {
                /* Non-inode data blocks are always leafs */
                return 1;
        }
        lai = LAFSI(myi);
        spin_lock(&lai->vfs_inode.i_data.private_lock);
        if (ph < 0)
                ph = !!test_bit(B_Phase1, &b->flags);
        if (lai->iblock &&
            test_bit(B_Pinned, &lai->iblock->b.flags) &&
            !!test_bit(B_Phase1, &lai->iblock->b.flags) == ph)
                /* iblock still pinned in the same phase, so
                 * this block isn't a leaf
                 */
                rv = 0;

        spin_unlock(&lai->vfs_inode.i_data.private_lock);
        rcu_iput(myi);
        return rv;
}

/*
 * This is for debugging; it is racy and probably only safe on UP.
 */
static void check_consistency(struct block *b)
{
        /* sanity tests.
         * 1/ make sure pincnt is right
         */
        if (test_bit(B_Index, &b->flags)) {
                int c[2];
                struct block *cb;
                c[0] = c[1] = 0;
                list_for_each_entry(cb, &iblk(b)->children, siblings) {
                        struct inode *myi = NULL;
                        if (test_bit(B_Pinned, &cb->flags)) {
                                int pp = !!test_bit(B_Phase1, &cb->flags);
                                c[pp]++;
                        }
                        /* InoIdx block might be pinned but is not a direct
                         * child */
                        if (!test_bit(B_Index, &cb->flags) &&
                            (myi = rcu_my_inode(dblk(cb))) != NULL &&
                            LAFSI(myi)->iblock &&
                            test_bit(B_Pinned, &LAFSI(myi)->iblock->b.flags)) {
                                int pp = !!test_bit(B_Phase1,
                                                    &LAFSI(myi)->iblock->b.flags);
                                c[pp]++;
                        }
                        rcu_iput(myi);
                }
                if (c[0] != atomic_read(&iblk(b)->pincnt[0]) ||
                    c[1] != atomic_read(&iblk(b)->pincnt[1])) {
                        printk("%d %d\n", c[0], c[1]);
                        lafs_dump_tree();
                        LAFS_BUG(1, b);
                }
        }

        if (b->parent) {
                struct indexblock *ib = b->parent;
                int c[2];
                struct block *cb;
                c[0] = c[1] = 0;
                list_for_each_entry(cb, &ib->children, siblings) {
                        struct inode *myi = NULL;
                        if (test_bit(B_Pinned, &cb->flags)) {
                                int pp = !!test_bit(B_Phase1, &cb->flags);
                                c[pp]++;
                        }
                        /* InoIdx block might be pinned but is not a direct
                         * child */
                        if (!test_bit(B_Index, &cb->flags) &&
                            (myi = rcu_my_inode(dblk(cb))) != NULL &&
                            LAFSI(myi)->iblock &&
                            test_bit(B_Pinned, &LAFSI(myi)->iblock->b.flags)) {
                                int pp = !!test_bit(B_Phase1,
                                                    &LAFSI(myi)->iblock->b.flags);
                                c[pp]++;
                        }
                        rcu_iput(myi);
                }
                if (c[0] != atomic_read(&ib->pincnt[0]) ||
                    c[1] != atomic_read(&ib->pincnt[1])) {
                        printk("badcnt %d %d\n", c[0], c[1]);
                        LAFS_BUG(1, &ib->b);
                }
        }
}

static void set_lru(struct block *b)
{
        struct fs *fs;
        int ph = !!test_bit(B_Phase1, &b->flags);

        if (!test_bit(B_OnFree, &b->flags) && !list_empty_careful(&b->lru))
                return;
        if (test_bit(B_IOLock, &b->flags) || !lafs_is_leaf(b, ph))
                return;

        /* This is close enough to a leaf that we should put it on a list.
         * If we raced and it isn't, then it will be found and removed.
         */
        if (test_bit(B_OnFree, &b->flags)) {
                spin_lock(&lafs_hash_lock);
                if (test_and_clear_bit(B_OnFree, &b->flags)) {
                        list_del_init(&b->lru);
                        freelist.freecnt--;
                }
                spin_unlock(&lafs_hash_lock);
        }
        fs = fs_from_inode(b->inode);
        spin_lock(&fs->lock);
        if (list_empty(&b->lru)) {
                if (test_bit(B_Realloc, &b->flags))
                        list_add(&b->lru, &fs->clean_leafs);
                else {
                        ph = !!test_bit(B_Phase1, &b->flags);
                        list_add(&b->lru, &fs->phase_leafs[ph]);
                }
        }
        spin_unlock(&fs->lock);
        /* allow lafs_iolock_block-empty to complete */
        lafs_io_wake(b);
}

void lafs_refile(struct block *b, int dec)
{
        struct block *next = NULL, *next_parent = NULL;
        struct fs *fs = NULL;
        u64 physref = 0;

        if (!b)
                return;

        fs = fs_from_inode(b->inode);
        check_consistency(b);

        /* To (mostly) avoid recursion, we have a loop which may
         * walk up to the parent.
         * The main reason for holding lafs_hash_lock is that it protects
         * ->lru - i.e. all the lists.  FIXME that should be fs->lock.
         */
        LAFS_BUG(atomic_read(&b->refcnt) == 0, b);
        while (b) {
                int ph;
                int free_me = 0;
                struct inode *checkpin = NULL;
                struct inode *myi = NULL;
                struct datablock *db;

                set_lru(b);

                if (test_bit(B_Pinned, &b->flags) &&
                    !test_bit(B_Index, &b->flags) &&
                    !test_bit(B_PinPending, &b->flags) &&
                    !test_bit(B_Dirty, &b->flags) &&
                    !test_bit(B_Realloc, &b->flags)
                        ) {
                        /* Don't need to be Pinned any more */
                        /* FIXME do I need to lock access to ->parent? */
                        lafs_checkpoint_lock(fs);
                        ph = !!test_bit(B_Phase1, &b->flags);
                        if (test_and_clear_bit(B_Pinned, &b->flags)) {
                                if (test_bit(B_Dirty, &b->flags) ||
                                    test_bit(B_Realloc, &b->flags) ||
                                    test_bit(B_PinPending, &b->flags))
                                        /* lost a race */
                                        set_phase(b, ph);
                                else if (!test_bit(B_Root, &b->flags)) {
                                        struct block *nb;
                                        atomic_dec(&b->parent->pincnt[ph]);
                                        nb = &b->parent->b;
                                        if (next_parent != nb) {
                                                LAFS_BUG(next_parent, b);
                                                next_parent = nb;
                                                atomic_inc(&nb->refcnt);
                                        }
                                }
                        }
                        lafs_checkpoint_unlock(fs);
                }

                if (dec && atomic_dec_and_lock(&b->refcnt,
                                               &b->inode->i_data.private_lock)) {
                        /* PinPending disappears with the last non-lru
                         * reference (or possibly at other times).
                         */
                        clear_bit(B_PinPending, &b->flags);

                        ph = !!test_bit(B_Phase1, &b->flags);
                        /* See if we still need to be pinned */
                        if (test_bit(B_Pinned, &b->flags) &&
                            !test_bit(B_Dirty, &b->flags) &&
                            !test_bit(B_Realloc, &b->flags)) {
                                /* Don't need to be Pinned any more */
                                if (test_and_clear_bit(B_Pinned, &b->flags)) {
                                        if (!list_empty_careful(&b->lru)) {
                                                spin_lock(&fs->lock);
                                                list_del_init(&b->lru);
                                                spin_unlock(&fs->lock);
                                        }
                                        if (test_bit(B_InoIdx, &b->flags) &&
                                            b->inode->i_nlink)
                                                checkpin = b->inode;
                                        if (!test_bit(B_Root, &b->flags)) {
                                                struct block *nb;
                                                atomic_dec(&b->parent->pincnt[ph]);
                                                if (test_bit(B_InoIdx, &b->flags))
                                                        nb = &LAFSI(b->inode)->dblock->b;
                                                else
                                                        nb = &b->parent->b;
                                                if (next_parent != nb) {
                                                        LAFS_BUG(next_parent, b);
                                                        next_parent = nb;
                                                        atomic_inc(&nb->refcnt);
                                                }
                                        }
                                }
                        }

                        /* check the ->parent link */
                        if (b->parent &&
                            !(b->flags & (
                                      (1<<B_Pinned) |
                                      (1<<B_Uninc) |
                                      (1<<B_Realloc) |
                                      (1<<B_Dirty)))
                                ) {
                                int credits;
                                /* Don't need ->parent any more */
                                if (next_parent == NULL)
                                        next_parent = &b->parent->b;
                                else if (next_parent == &b->parent->b ||
                                         next_parent->parent == b->parent) {
                                        if (atomic_dec_and_test(&b->parent->b.refcnt))
                                                LAFS_BUG(1, b);
                                } else
                                        LAFS_BUG(1, b);

                                del_ref(&b->parent->b, MKREF(child),
                                        __FILE__, __LINE__);
                                b->parent = NULL;

                                if (test_bit(B_InoIdx, &b->flags)) {
                                        /* While an InoIdx has a parent we hold
                                         * a count on the dblock.  Now that we
                                         * have dropped one, we must drop the
                                         * other.
                                         */
                                        struct datablock *db = LAFSI(b->inode)->dblock;
                                        if (&db->b.parent->b != next_parent &&
                                            &db->b != next_parent) {
                                                printk("db    = %s\n", strblk(&db->b));
                                                printk("dp->p = %s\n", strblk(&db->b.parent->b));
                                                printk("np    = %s\n", strblk(next_parent));
                                                printk("b     = %s\n", strblk(b));
                                        }
                                        BUG_ON(&db->b.parent->b != next_parent &&
                                               &db->b != next_parent);
                                        if (atomic_dec_and_test(&next_parent->refcnt))
                                                LAFS_BUG(1, b);
                                        next_parent = &db->b;
                                        del_ref(&db->b, MKREF(pdblock), __FILE__, __LINE__);
                                }
                                if (test_and_clear_bit(B_SegRef, &b->flags))
                                        physref = b->physaddr;

                                LAFS_BUG(test_bit(B_PrimaryRef, &b->flags), b);
                                list_del_init(&b->siblings);

                                if (test_bit(B_Index, &b->flags))
                                        list_add(&b->siblings,
                                                 &LAFSI(b->inode)->free_index);

                                if (test_and_clear_bit(B_Prealloc, &b->flags) &&
                                    b->physaddr == 0)
                                        lafs_summary_allocate(fs, b->inode, -1);
                                credits = 0;
                                if (test_and_clear_bit(B_Credit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_ICredit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_NCredit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_NICredit, &b->flags))
                                        credits++;
                                if (test_and_clear_bit(B_UnincCredit, &b->flags))
                                        credits++;
                                lafs_space_return(fs, credits);
                        }
                        if (test_bit(B_Index, &b->flags) &&
                            !(b->flags & (
                                      (1<<B_Pinned) |
                                      (1<<B_Uninc) |
                                      (1<<B_Root) |
                                      (1<<B_Realloc) |
                                      (1<<B_Dirty)))
                                ) {
                                if (b->parent != NULL)
                                        /* Could this be uninc - where
                                         * is the refcnt? */
                                        printk("Problem: %s\n", strblk(b));
                                LAFS_BUG(b->parent != NULL, b);
                                /* put it on the free lru */
                                spin_lock(&lafs_hash_lock);
                                if (!test_and_set_bit(B_OnFree, &b->flags)) {
                                        LAFS_BUG(!list_empty(&b->lru), b);
                                        list_move(&b->lru, &freelist.lru);
                                        freelist.freecnt++;
                                }
                                spin_unlock(&lafs_hash_lock);
                        }
                        /* The last reference to a dblock with no page
                         * requires special handling.
                         * The first block on a page must be freed;
                         * the other blocks hold a reference on that
                         * first block which must be dropped.
                         * However it is possible that a new reference was
                         * taken via _leafs.  If so, we have now cleared
                         * Pinned, so get_flushable will immediately put
                         * this again, so we should leave this cleanup
                         * until later.  Unlike the other tests, this one
                         * isn't idempotent, so we need the check on refcnt.
                         */
                        db = dblk(b);
                        if (!test_bit(B_Index, &b->flags) &&
                            !PagePrivate(db->page) &&
                            atomic_read(&b->refcnt) == 0) {
                                int bits = (PAGE_SHIFT -
                                            b->inode->i_blkbits);
                                int mask = (1<<bits) - 1;
                                int bnum = b->fileaddr & mask;

                                LAFS_BUG(next, next);
                                if (bnum) {
                                        next = &db[-bnum].b;
                                        del_ref(next, "lafs_release_0",
                                                __FILE__, __LINE__);
                                } else {
                                        free_me = 1;
                                        put_page(db->page);
                                }
                        }
                        /* The last reference to an iblock requires that we
                         * deref the dblock.  We don't need to re-check
                         * refcnt here as a racing getiref_locked will itself
                         * take an extra dblock reference.
                         */
                        if (test_bit(B_InoIdx, &b->flags)) {
                                LAFS_BUG(LAFSI(b->inode)->iblock
                                         != iblk(b), b);
                                LAFS_BUG(next, next);
                                next = &LAFSI(b->inode)->dblock->b;
                                del_ref(next, MKREF(iblock),
                                        __FILE__, __LINE__);
                        }

                        /* Free a delayed-release inode */
                        if (!test_bit(B_Index, &b->flags) &&
                            (myi = rcu_my_inode(dblk(b))) != NULL &&
                            (!PagePrivate(dblk(b)->page) ||
                             test_bit(I_Destroyed, &LAFSI(myi)->iflags))) {
                                dblk(b)->my_inode = NULL;
                                LAFSI(myi)->dblock = NULL;
                                /* Don't need lafs_hash_lock to clear iblock,
                                 * as new refs on iblock are only taken while
                                 * holding dblock, and we know dblock has no
                                 * references.  You would need an iget to get
                                 * a ref on dblock now, and because of
                                 * I_Destroyed, we know that isn't possible.
                                 */
                                LAFSI(myi)->iblock = NULL;
                                spin_unlock(&b->inode->i_data.private_lock);
                                if (test_bit(I_Destroyed, &LAFSI(myi)->iflags))
                                        lafs_destroy_inode(myi);
                        } else
                                spin_unlock(&b->inode->i_data.private_lock);
                }
                rcu_iput(myi);

                if (physref) {
                        lafs_seg_deref(fs, physref, 0);
                        physref = 0;
                }

                if (checkpin)
                        lafs_inode_checkpin(checkpin);

                if (free_me)
                        kfree(dblk(b));
                b = NULL;
                if (next) {
                        b = next;
                        next = NULL;
                } else if (next_parent) {
                        b = next_parent;
                        next_parent = NULL;
                }
                dec = 1;
        }
}

/*
 * Create (if it doesn't already exist) the 'iblock' for an inode.
 * This is a shadow of the dblock but comes into its own if/when
 * the inode's indexing tree needs to grow.
 * Return a counted reference to the index block.
 * NOTE: we must hold a reference to ->dblock when we call
 * lafs_make_iblock.
 */
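
/* Call-pattern sketch (illustration only): the dblock reference must
 * be held across the call.  MKREF(example) is a hypothetical
 * reference tag:
 *
 *	// caller already holds a counted ref on LAFSI(ino)->dblock
 *	ib = lafs_make_iblock(ino, 1, 0, MKREF(example));
 *	if (IS_ERR(ib))
 *		return PTR_ERR(ib);
 *	...use ib...
 *	putiref(ib, MKREF(example));
 */
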
struct indexblock * __must_check
lafs_make_iblock(struct inode *ino, int adopt, int async, REFARG)
{
        struct lafs_inode *lai = LAFSI(ino);
        /* ib must start as NULL: on the first pass through 'retry',
         * neither branch may set it.
         */
        struct indexblock *ib = NULL, *new = NULL;
        struct fs *fs = fs_from_inode(ino);
        int err = 0;

        dprintk("MAKE_IBLOCK %d\n", (int)ino->i_ino);

        BUG_ON(lai->dblock == NULL);
        BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);
retry:
        spin_lock(&lafs_hash_lock);
        if (lai->iblock)
                ib = getiref_locked_needsync(lai->iblock, REF);
        else if (new) {
                lai->iblock = new;
                ib = new;
                new = NULL;
                (void)getdref(lai->dblock, MKREF(iblock));
        }
        spin_unlock(&lafs_hash_lock);
        if (new)
                lafs_iblock_free(new);
        if (ib) {
                sync_ref(&ib->b);
                if (adopt) {
                        err = setparent(lai->dblock, async);
                        if (err == 0)
                                block_adopt(&ib->b, lai->dblock->b.parent);
                }
                if (err == 0)
                        return ib;
                putiref(ib, REF);
                return ERR_PTR(err);
        }

        new = lafs_iblock_alloc(fs, GFP_KERNEL, 0, REF);
        if (!new)
                return ERR_PTR(-ENOMEM);

        new->b.fileaddr = 0;
        new->b.physaddr = lai->dblock->b.physaddr;
        if (test_bit(B_PhysValid, &lai->dblock->b.flags))
                set_bit(B_PhysValid, &new->b.flags);
        LAFS_BUG(!test_bit(B_Valid, &lai->dblock->b.flags), &lai->dblock->b);
        set_bit(B_Valid, &new->b.flags);
        new->b.inode = ino;
        new->depth = lai->depth;
        /* Note: this doesn't get hashed until the index
         * tree grows and this block is disconnected from
         * the inode.
         */
        set_bit(B_InoIdx, &new->b.flags);
        if (test_bit(B_Root, &lai->dblock->b.flags))
                set_bit(B_Root, &new->b.flags);
        new->b.parent = NULL;

        goto retry;
}
1216
1217 static u64
1218 leaf_lookup(void *bf, int len, u32 startaddr, u32 target, u32 *nextp)
1219 {
1220         /* buf starts with a 2byte header
1221          * if 1, then 6byte littleending indirect entries.
1222          * if 2, then 12byte extent entries
1223          */
1224         unsigned char *buf = bf;
1225         u64 p;
1226         unsigned char *cp;
1227         int elen;
1228         int hi, lo;
1229         u32 addr, next;
1230
1231         if (nextp)
1232                 *nextp = 0xffffffff;
1233
1234         if (buf[1])
1235                 return 0;
1236         switch (buf[0]) {
1237         default:
1238                 p = 0;
1239                 break;
1240
1241         case IBLK_INDIRECT: /* indirect */
1242                 dprintk("indirect lookup for %lu from %lu, len %d\n",
1243                         (unsigned long)target, (unsigned long)startaddr, len);
1244                 len -= 2;
1245                 buf += 2;
1246
1247                 if (target < startaddr)
1248                         return 0;
1249
1250                 next = target;
1251                 target -= startaddr;
1252
1253                 /* Need both tests as target could be v.large and
1254                  * multiply by 6 could overflow
1255                  */
1256                 if (target > len ||
1257                     target*6 + 6 > len)
1258                         return 0;
1259                 buf += target*6;
1260                 p = decode48(buf);
1261                 if (nextp) {
1262                         /* find the next allocated block */
1263                         u64 p2;
1264
1265                         len -= target*6;
1266                         len -= 6;
1267                         next++;
1268                         *nextp = 0xffffffff;
1269                         while (len >= 6) {
1270                                 p2 = decode48(buf);
1271                                 if (p2) {
1272                                         *nextp = next;
1273                                         break;
1274                                 }
1275                                 len -= 6;
1276                                 next++;
1277                         }
1278                 }
1279                 break;
1280
1281         case IBLK_EXTENT: /* extent */
1282                 /* 12-byte records: {6-byte device address, 2-byte length,
1283                  * 4-byte file address} */
1284
1285                 len -= 2;
1286                 buf += 2;
1287                 lo = 0;
1288                 hi = len/12;
1289                 while (hi > lo+1) {
1290                         int mid = (lo+hi)/2;
1291                         cp = buf;
1292                         cp += mid*12 + 6;
1293
1294                         elen = decode16(cp);
1295                         addr = decode32(cp);
1296
1297                         if (elen && addr <= target)
1298                                 lo = mid;
1299                         else
1300                                 hi = mid;
1301                 }
1302
1303                 cp = buf;
1304                 cp += lo*12;
1305                 p = decode48(cp);
1306                 elen = decode16(cp);
1307                 addr = decode32(cp);
1308                 if (target < addr) {
1309                         p = 0;
1310                         cp -= 12;
1311                 } else if (target >= addr + elen)
1312                         p = 0;
1313                 else
1314                         p += target - addr;
1315
1316                 if (nextp) {
1317                         if (lo == len/12)
1318                                 *nextp = 0xffffffff; /* no further extents */
1319                         else {
1320                                 u64 p2;
1321                                 p2 = decode48(cp);
1322                                 elen = decode16(cp);
1323                                 addr = decode32(cp);
1324                                 if (elen == 0)
1325                                         /* no more meaningful extents */
1326                                         *nextp = 0xffffffff;
1327                                 else
1328                                         *nextp = addr;
1329                         }
1330                 }
1331         }
1332         return p;
1333 }
1334
1335 u32 lafs_leaf_next(struct indexblock *ib, u32 start)
1336 {
1337         /* In this leaf index block, find the first address
1338          * at-or-after 'start'.  If none, return 0xFFFFFFFF
1339          */
1340         char *buf = map_iblock(ib);
1341         int blocksize = ib->b.inode->i_sb->s_blocksize;
1342         struct lafs_inode *li = LAFSI(ib->b.inode);
1343         u32 next = 0xffffffff;
1344         u64 phys;
1345
1346         if (test_bit(B_InoIdx, &ib->b.flags))
1347                 phys = leaf_lookup(buf + li->metadata_size,
1348                                    blocksize - li->metadata_size,
1349                                    ib->b.fileaddr, start, &next);
1350         else
1351                 phys = leaf_lookup(buf, blocksize,
1352                                    ib->b.fileaddr, start, &next);
1353         unmap_iblock(ib, buf);
1354         if (phys)
1355                 return start;
1356         else
1357                 return next;
1358 }
1359
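/*
 * Look up 'target' in an internal index block: the body is sorted
 * 10-byte records of {6-byte physaddr of a child index block, 4-byte
 * fileaddr}, with unused (zero physaddr) records at the end.  Returns
 * the physaddr of the child covering 'target', or 0 if none does;
 * *addrp is set to that child's file address when one is found, and
 * *nextp (if non-NULL) to the file address of the following child,
 * when there is one.
 */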
1360 static u64
1361 index_lookup(void *bf, int len, u32 target, u32 *addrp, u32 *nextp)
1362 {
1363         unsigned char *buf = bf;
1364         int lo, hi;
1365         u32 addr;
1366         u64 p;
1367         unsigned char *cp;
1368
1369         if (buf[1] || buf[0]) {
1370                 dprintk("WARNING: not an index block %02x %02x\n",
1371                         buf[0], buf[1]);
1372                 return 0;
1373         }
1374
1375         buf += 2;
1376         len -= 2;
1377
1378         /* 10 byte records: 6 byte Device address, 4 byte file address */
1379         lo = 0; hi = (len/10);
1380         while (hi > lo+1) {
1381                 int mid = (lo+hi)/2;
1382                 cp = buf;
1383                 cp += mid*10;
1384                 p = decode48(cp);
1385                 addr = decode32(cp);
1386                 dprintk("...addr=%lu target=%lu lo=%d mid=%d hi=%d\n",
1387                         (unsigned long)addr, (unsigned long)target,
1388                         lo, mid, hi);
1389                 if (p && addr <= target)
1390                         lo = mid;
1391                 else
1392                         hi = mid;
1393         }
1394
1395         cp = buf;
1396         cp += lo*10;
1397         p = decode48(cp);
1398         addr = decode32(cp);
1399
1400         if (p == 0)
1401                 /* Nothing in this block */
1402                 return 0;
1403
1404         if (addr > target) {
1405                 /* target is before first address */
1406                 p = 0;
1407                 cp -= 10;
1408         } else
1409                 *addrp = addr;
1410
1411         if (cp+10 <= (unsigned char *) buf + len && nextp) {
1412                 u64 p2 = decode48(cp);
1413                 u32 nxt = decode32(cp);
1414                 if (p2)
1415                         *nextp = nxt; /* address of next index block */
1416         }
1417         return p;
1418 }
1419
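/*
 * An index block is empty when no record in it is populated: a
 * lookup for the highest possible address then finds nothing.
 */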
1420 int lafs_index_empty(struct indexblock *ib)
1421 {
1422         char *buf = map_iblock(ib);
1423         int blocksize = ib->b.inode->i_sb->s_blocksize;
1424         struct lafs_inode *li = LAFSI(ib->b.inode);
1425         u32 addr;
1426         u64 phys;
1427
1428         if (test_bit(B_InoIdx, &ib->b.flags))
1429                 phys = index_lookup(buf + li->metadata_size,
1430                                     blocksize - li->metadata_size,
1431                                     0xFFFFFFFF, &addr, NULL);
1432         else
1433                 phys = index_lookup(buf, blocksize,
1434                                     0xFFFFFFFF, &addr, NULL);
1435         unmap_iblock(ib, buf);
1436
1437         return phys == 0;
1438 }
1439
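/*
 * From 'ib', scan later siblings under the same parent for a
 * non-empty index block that still starts at or before 'addr' -
 * the last such sibling is a better block to search.  A sibling
 * starting beyond 'addr' bounds *next instead.  The scan stops at
 * the first sibling not marked B_PrimaryRef.  The chosen block is
 * returned with a reference held.
 */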
1440 static struct indexblock *find_better(struct inode *ino,
1441                                       struct indexblock *ib,
1442                                       u32 addr, u32 *next, REFARG)
1443 {
1444         struct indexblock *rv = ib;
1445         spin_lock(&ino->i_data.private_lock);
1446         list_for_each_entry_continue(ib, &ib->b.parent->children, b.siblings) {
1447                 if (!test_bit(B_PrimaryRef, &ib->b.flags))
1448                         break;
1449                 if (test_bit(B_EmptyIndex, &ib->b.flags))
1450                         continue;
1451                 if (ib->b.fileaddr > addr) {
1452                         if (next && *next > ib->b.fileaddr)
1453                                 *next = ib->b.fileaddr;
1454                         break;
1455                 }
1456                 /* This appears to be better. */
1457                 rv = ib;
1458         }
1459         getref_locked(&rv->b, REF);
1460         spin_unlock(&ino->i_data.private_lock);
1461         return rv;
1462 }
1463
1464 struct indexblock *
1465 lafs_leaf_find(struct inode *inode, u32 addr, int adopt, u32 *next,
1466                int async, REFARG)
1467 {
1468         /* Find the leaf index block which refers to this address (if any do)
1469          * Returns the leaf IOLocked.
1470          * *next will be set to an address that is higher than addr such
1471          * that there are no other leafs before *next.  There might not
1472          * actually be a leaf at *next though.
1473          */
1474         struct indexblock *ib, *ib2;
1475         struct indexblock *hold = NULL;
1476         u64 iphys;
1477         u32 iaddr = 0;
1478         void *buf;
1479         struct lafs_inode *li = LAFSI(inode);
1480         int blocksize = inode->i_sb->s_blocksize;
1481         int err = 0;
1482         int offset;
1483
1484         /* Must own a reference to ->dblock */
1485         BUG_ON(li->dblock == NULL);
1486         BUG_ON(atomic_read(&li->dblock->b.refcnt) == 0);
1487
1488         /* We come back to restart if we needed to unlock to
1489          * perform IO and we cannot be sure the parent didn't change.
1490          * We hold the last seen index block in 'hold' so that if
1491          * no changes happened (the common case) we are certain
1492          * that no IO will be required to get back to where
1493          * we were.
1494          */
1495 restart:
1496         if (next)
1497                 *next = 0xFFFFFFFF;
1498
1499         ib = lafs_make_iblock(inode, adopt, async, REF);
1500         err = PTR_ERR(ib);
1501         if (IS_ERR(ib))
1502                 goto err;
1503         offset = li->metadata_size;
1504
1505         err = -EAGAIN;
1506         if (!async)
1507                 lafs_iolock_block(&ib->b);
1508         else if (!lafs_iolock_block_async(&ib->b))
1509                 goto err_ib;
1510
1511         while (ib->depth > 1) {
1512                 /* internal index block */
1513                 /* NOTE: the index block could be empty, or could have
1514                  * nothing as early as 'addr'.  In that case index_lookup
1515                  * returns a physaddr of 0, meaning the child block must
1516                  * either be in cache already, or be synthesised as an
1517                  * empty index block (one level down).
1518                  */
1519                 struct indexblock *better;
1520                 u32 target = addr;
1521
1522         retry:
1523                 buf = map_iblock(ib);
1524                 iaddr = ib->b.fileaddr;
1525                 iphys = index_lookup(buf + offset, blocksize - offset,
1526                                      target, &iaddr,
1527                                      target == addr ? next : NULL);
1528                 unmap_iblock(ib, buf);
1529
1530                 ib2 = lafs_iblock_get(inode, iaddr, ib->depth-1, iphys, REF);
1531
1532                 if (!ib2) {
1533                         err = -ENOMEM;
1534                         lafs_iounlock_block(&ib->b);
1535                         goto err_ib;
1536                 }
1537
1538                 better = find_better(inode, ib2, addr, next, REF);
1539                 putiref(ib2, REF);
1540                 ib2 = better;
1541
1542                 if (!test_bit(B_Valid, &ib2->b.flags)) {
1543                         lafs_iounlock_block(&ib->b);
1544                         err = lafs_load_block(&ib2->b, NULL);
1545                         if (err)
1546                                 goto err_ib2;
1547                         if (async)
1548                                 err = lafs_wait_block_async(&ib2->b);
1549                         else
1550                                 err = lafs_wait_block(&ib2->b);
1551                         if (err)
1552                                 goto err_ib2;
1553
1554                         err = -EAGAIN;
1555                         if (!async)
1556                                 lafs_iolock_block(&ib->b);
1557                         else if (!lafs_iolock_block_async(&ib->b))
1558                                 goto err_ib2;
1559
1560                         /* While we did IO, the parent could have been
1561                          * split, so it is now the wrong parent, or it
1562                          * could have become EmptyIndex, though that is
1563                          * much less likely.  If it did, then it must
1564                          * have had a parent, so the ref we hold means
1565                          * it still has a parent.
1566                          * So if ib->b.parent, then we might need to
1567                          * retry from the top, and holding a ref on ib
1568                          * means that we don't risk live-locking if
1569                          * memory for index blocks is very tight.
1570                          * If there is no parent, then there is
1571                          * no risk that ib changed.
1572                          */
1573                         if (ib->b.parent) {
1574                                 if (hold)
1575                                         putiref(hold, MKREF(hold));
1576                                 hold = getiref(ib, MKREF(hold));
1577                                 putiref(ib2, REF);
1578                                 putiref(ib, REF);
1579                                 goto restart;
1580                         }
1581                 }
1582
1583                 err = -EAGAIN;
1584                 if (!async)
1585                         lafs_iolock_block(&ib2->b);
1586                 else if (!lafs_iolock_block_async(&ib2->b))
1587                         goto err_ib2;
1588
1589                 if (ib2->b.fileaddr != ib->b.fileaddr &&
1590                     test_bit(B_EmptyIndex, &ib2->b.flags)) {
1591                         /* need to look earlier in the parent */
1592                         target = ib2->b.fileaddr - 1;
1593                         lafs_iounlock_block(&ib2->b);
1594                         putiref(ib2, REF);
1595                         goto retry;
1596                 }
1597
1598 #ifdef FIXME
1599                 if (!(ib2->b.flags & B_Linked)) {
1600                         struct block *b2 = peer_find(fs,
1601                                                      inode, iaddr,
1602                                                      depth, iphys);
1603                         if (b2)
1604                                 list_add(&ib2->peers, &b2->peers);
1605                         set_bit(B_Linked, &ib2->b.flags);
1606                 }
1607 #endif
1608                 if (adopt)
1609                         block_adopt(&ib2->b, ib);
1610                 lafs_iounlock_block(&ib->b);
1611                 putiref(ib, REF);
1612                 ib = ib2;
1613                 offset = 0;
1614         }
1615
1616         return ib;
1617
1618 err_ib2:
1619         putiref(ib2, REF);
1620 err_ib:
1621         putiref(ib, REF);
1622 err:
1623         if (hold)
1624                 putiref(hold, MKREF(hold));
1625         return ERR_PTR(err);
1626 }
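/*
 * A sketch of the calling pattern (mirroring find_block() below):
 * the leaf comes back IOLocked with a reference held, so the caller
 * unlocks and drops it when done:
 *
 *      ib = lafs_leaf_find(inode, addr, adopt, NULL, SYNC, REF);
 *      if (!IS_ERR(ib)) {
 *              ... consult the leaf ...
 *              lafs_iounlock_block(&ib->b);
 *              putiref(ib, REF);
 *      }
 */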
1627
1628 static int table_find_first(struct uninc *tbl, u32 min, u32 *addr);
1629
1630 static int __lafs_find_next(struct inode *ino, loff_t *addrp)
1631 {
1632         /* Find the next allocated block in inode at or after *addrp
1633          * and set *addrp.
1634          * Return:
1635          *  <0 -  error
1636          *   0 -  there is no block at or after '*addrp'
1637          *   1 -  It is worth checking the block at *addrp.
1638          *   2 -  *addrp has been updated, check again.
1639          */
1640         struct lafs_inode *lai = LAFSI(ino);
1641         struct indexblock *leaf;
1642         struct fs *fs = fs_from_inode(ino);
1643         struct block *b;
1644         u32 nexti, nextd, nextl;
1645         int rv = 0;
1646         int offset;
1647         char *buf;
1648         u64 p;
1649
1650         /* Must hold a reference to ->dblock */
1651         BUG_ON(lai->dblock == NULL);
1652         BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);
1653
1654         if (lai->depth == 0) {
1655                 if (*addrp == 0)
1656                         return 1;
1657                 leaf = lafs_make_iblock(ino, NOADOPT, SYNC, MKREF(find_next));
1658                 if (IS_ERR(leaf))
1659                         return PTR_ERR(leaf);
1660                 nextd = 0xFFFFFFFF;
1661         } else {
1662                 leaf = lafs_leaf_find(ino, *addrp, NOADOPT, &nexti, SYNC,
1663                                       MKREF(find_next));
1664                 /* FIXME what if 'leaf' is an error?? */
1665                 offset = 0;
1666                 if (lai->depth == 1)
1667                         offset = lai->metadata_size;
1668                 buf = map_iblock(leaf);
1669                 p = leaf_lookup(buf+offset, fs->blocksize-offset,
1670                                 leaf->b.fileaddr, (u32)*addrp, &nextd);
1671                 if (p && *addrp == 0xFFFFFFFF) { /* dump while still mapped */
1672                         int j;
1673                         printk("at %s\n", strblk(&lai->dblock->b));
1674                         printk("offset = %d depth=%d\n", offset,
1675                                lai->depth);
1676                         for (j = 0; j < 16; j++)
1677                                 printk("%04x ", buf[offset+j] & 0xffff);
1678                         for (j = 0; j < 16; j++)
1679                                 printk("%04x ", buf[1024-16+j] & 0xffff);
1680                         printk("\n");
1681                 }
1682                 unmap_iblock(leaf, buf);
1683                 if (p) {
1684                         lafs_iounlock_block(&leaf->b);
1685                         putiref(leaf, MKREF(find_next));
1686                         BUG_ON(*addrp == 0xFFFFFFFF);
1687                         return 1;
1688                 }
1689                 if (nextd != 0xFFFFFFFF)
1690                         rv = 1;
1691                 else if (nexti != 0xFFFFFFFF) {
1692                         nextd = nexti;
1693                         rv = 2;
1694                 }
1695         }
1696         /* If rv==2, nextd may not be high enough as it might
1697          * be the address of an EmptyIndex block.  So only
1698          * use it if nothing better is found.
1699          */
1700         if (rv == 2)
1701                 nextl = 0xFFFFFFFF;
1702         else
1703                 nextl = nextd;
1704         list_for_each_entry(b, &leaf->children, siblings)
1705                 if (b->fileaddr >= *addrp &&
1706                     b->fileaddr <= nextl &&
1707                     test_bit(B_Valid, &b->flags)) {
1708                         nextd = nextl = b->fileaddr;
1709                         rv = 1;
1710                 }
1711
1712         if (leaf->uninc_table.pending_cnt) {
1713                 spin_lock(&leaf->b.inode->i_data.private_lock);
1714                 if (table_find_first(&leaf->uninc_table, *addrp, &nextl)) {
1715                         nextd = nextl;
1716                         rv = 1;
1717                 }
1718                 spin_unlock(&leaf->b.inode->i_data.private_lock);
1719         }
1720         lafs_iounlock_block(&leaf->b);
1721         putiref(leaf, MKREF(find_next));
1722         *addrp = nextd;
1723         return rv;
1724 }
1725
1726 /*
1727  * find the first data block at or after *bnump, and
1728  * store the address in *bnump.
1729  * Return 0 if nothing found, a negative error code on error, or
1730  * 1 if *bnump is a valid data block address
1731  *
1732  * The block could be in the indexing tree, or in
1733  * an uninc_table, or totally unincorporated.
1734  * Blocks that are really part of the file, but are not in
1735  * the indexing tree will still be on a ->children list
1736  * from an index block, and the index block will be very close
1737  * to the indexing tree.
1738  * So once we have found a suitable index block, we need to check
1739  * all its children.  This could possibly be optimised using
1740  * find_get_pages if it were exported.
1741  */
1742 int lafs_find_next(struct inode *ino, loff_t *bnump)
1743 {
1744         int rv;
1745         /* Need to hold a reference to dblock while calling
1746          * __lafs_find_next
1747          */
1748         struct datablock *db = lafs_inode_dblock(ino, SYNC, MKREF(find_nexti));
1749
1750         if (IS_ERR(db))
1751                 return PTR_ERR(db);
1752         while (1) {
1753                 struct datablock *b;
1754                 int hole;
1755
1756                 rv = __lafs_find_next(ino, bnump);
1757
1758                 if (rv <= 0)
1759                         break;
1760                 if (rv == 2)
1761                         continue;
1762
1763                 b = lafs_get_block(ino, *bnump, NULL, GFP_KERNEL,
1764                                    MKREF(find_next));
1765                 if (!b)
1766                         return -ENOMEM;
1767
1768                 rv = lafs_find_block(b, NOADOPT);
1769                 if (rv) {
1770                         putdref(b, MKREF(find_next));
1771                         return rv;
1772                 }
1773                 hole = (b->b.physaddr == 0 ||
1774                         !test_bit(B_PhysValid, &b->b.flags)) &&
1775                         !test_bit(B_Valid, &b->b.flags);
1776                 if (LAFSI(ino)->depth == 0 &&
1777                     b->b.fileaddr == 0)
1778                         hole = 0; /* first block lives in inode */
1779                 putdref(b, MKREF(find_next));
1780
1781                 rv = 1;
1782                 if (!hole)
1783                         break;
1784                 *bnump = (*bnump)+1;
1785         }
1786         putdref(db, MKREF(find_nexti));
1787         BUG_ON(rv > 0 && *bnump == 0xFFFFFFFF);
1788         return rv;
1789 }
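/*
 * Usage sketch (an illustration, not code from this file): walking
 * every allocated block of an inode:
 *
 *      loff_t bnum = 0;
 *      while (lafs_find_next(ino, &bnum) == 1) {
 *              ... bnum is an allocated block address ...
 *              bnum++;
 *      }
 */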
1790
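/*
 * Search the pending (unincorporated) extents for one covering
 * 'addr'; if found, store the matching physical address in *phys
 * and return 1, else return 0.
 */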
1791 static int table_lookup(struct uninc *tbl, u32 addr, u64 *phys)
1792 {
1793         int i;
1794         for (i = tbl->pending_cnt; i; ) {
1795                 struct addr *a = &tbl->pending_addr[--i];
1796                 if (a->fileaddr <= addr &&
1797                     a->fileaddr + a->cnt > addr) {
1798                         *phys = a->physaddr + (addr - a->fileaddr);
1799                         return 1;
1800                 }
1801         }
1802         return 0;
1803 }
1804
1805 static int table_find_first(struct uninc *tbl, u32 min, u32 *addr)
1806 {
1807         /* Find the earliest allocated block in tbl which is
1808          * at-or-after 'min' and before '*addr', and store it in
1809          * *addr, returning 1 if something was found.
1810          */
1811         int rv = 0;
1812         int i;
1813
1814         for (i = tbl->pending_cnt ; i ; ) {
1815                 struct addr *a = &tbl->pending_addr[--i];
1816                 if (a->fileaddr >= *addr)
1817                         continue;
1818                 if (a->fileaddr + a->cnt <= min)
1819                         continue; /* extent ends before 'min' */
1820                 if (a->physaddr == 0)
1821                         continue;
1822                 if (a->fileaddr >= min)
1823                         *addr = a->fileaddr;
1824                 else
1825                         *addr = min;
1826                 rv = 1;
1827         }
1828         return rv;
1829 }
1830
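/*
 * Block 0 of a depth-0 file lives in the inode's block, after the
 * metadata: copy that tail into the datablock and zero the remainder.
 */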
1831 static void fill_block_zero(struct datablock *db, struct datablock *inob)
1832 {
1833         struct lafs_inode *lai = LAFSI(db->b.inode);
1834         void *baddr = map_dblock(db);
1835         void *iaddr = map_dblock_2(inob);
1836         int offset = lai->metadata_size;
1837         int len = (1<<db->b.inode->i_blkbits) - offset;
1838
1839         memcpy(baddr, iaddr+offset, len);
1840         unmap_dblock_2(inob, iaddr);
1841         memset(baddr+len, 0, (1<<db->b.inode->i_blkbits)-len);
1842         unmap_dblock(db, baddr);
1843         set_bit(B_Valid, &db->b.flags);
1844 }
1845
1846 static int __must_check
1847 find_block(struct datablock *b, int adopt, int async)
1848 {
1849         /* Find where this block is in storage, and
1850          * set b->b.physaddr.
1851          * If the block lives in the inode, physaddr
1852          * gets set to zero, and lafs_load_block works with this.
1853          * Otherwise we walk down the index tree
1854          * Possible errors:
1855          *  -EIO
1856          *  -ENOMEM
1857          */
1858         struct lafs_inode *lai;
1859         struct indexblock *ib;
1860         struct datablock *db;
1861         void *buf;
1862         int  offset;
1863
1864         LAFS_BUG(test_bit(B_Pinned, &b->b.flags) &&
1865                  b->b.parent == NULL,
1866                  &b->b);
1867         if (b->b.inode == NULL)
1868                 return 0;
1869         lai = LAFSI(b->b.inode);
1870
1871         if (lai->depth == 0) {
1872                 /* Data is in the inode if anywhere */
1873                 if (!test_bit(B_PhysValid, &b->b.flags)) {
1874                         b->b.physaddr = 0;
1875                         set_bit(B_PhysValid, &b->b.flags);
1876                 }
1877                 db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));
1878                 if (IS_ERR(db))
1879                         return PTR_ERR(db);
1880                 if (!test_bit(B_Valid, &b->b.flags) && b->b.fileaddr == 0) {
1881                         /* Now is the best time to copy data from inode into
1882                          * datablock, as we know we have a valid inode block
1883                          * now.
1884                          */
1885                         LAFS_BUG(b->b.physaddr != 0, &b->b);
1886                         fill_block_zero(b, db);
1887                 }
1888                 if (adopt) {
1889                         ib = lafs_make_iblock(b->b.inode, adopt, async,
1890                                               MKREF(findblock));
1891                         putdref(db, MKREF(findblock));
1892                         if (IS_ERR(ib))
1893                                 return PTR_ERR(ib);
1894                         block_adopt(&b->b, ib);
1895                         putiref(ib, MKREF(findblock));
1896                 } else
1897                         putdref(db, MKREF(findblock));
1898                 return 0;
1899         }
1900
1901         if (test_bit(B_PhysValid, &b->b.flags)
1902             && (!adopt || b->b.parent || test_bit(B_Root, &b->b.flags))) {
1903                 return 0;
1904         }
1905
1906         db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));
1907         if (IS_ERR(db))
1908                 return PTR_ERR(db);
1909
1910         /* Ok, there is some indexing information we need to
1911          * look through.  Find the leaf first
1912          */
1913         ib = lafs_leaf_find(b->b.inode, b->b.fileaddr, adopt, NULL, async,
1914                             MKREF(findblock));
1915         putdref(db, MKREF(findblock));
1916
1917         if (IS_ERR(ib)) {
1918                 dprintk("lafs_leaf_find returned error to lafs_find_block\n");
1919                 goto out;
1920         }
1921
1922         if (!test_bit(B_PhysValid, &b->b.flags)
1923             && ib->uninc_table.pending_cnt) {
1924                 spin_lock(&b->b.inode->i_data.private_lock);
1925                 if (table_lookup(&ib->uninc_table, b->b.fileaddr,
1926                                  &b->b.physaddr))
1927                         set_bit(B_PhysValid, &b->b.flags);
1928                 spin_unlock(&b->b.inode->i_data.private_lock);
1929         }
1930         if (!test_bit(B_PhysValid, &b->b.flags)) {
1931                 buf = map_iblock(ib);
1932                 offset = lai->depth > 1 ? 0 : lai->metadata_size;
1933                 if (test_bit(B_InoIdx, &ib->b.flags))
1934                         dprintk("InoIdx!!!\n");
1935                 dprintk("offset %d: %02x %02x\n", offset,
1936                         ((char *)buf)[offset], ((char *)buf)[offset+1]);
1937                 b->b.physaddr = leaf_lookup(buf+offset,
1938                                             b->b.inode->i_sb->s_blocksize
1939                                             - offset,
1940                                             ib->b.fileaddr, b->b.fileaddr,
1941                                             NULL);
1942                 set_bit(B_PhysValid, &b->b.flags);
1943                 unmap_iblock(ib, buf);
1944         }
1945         dprintk("lafs_find_block: lookup for %lu of %lu at %lu found %llu\n",
1946                 (unsigned long)b->b.fileaddr,
1947                 (unsigned long)b->b.inode->i_ino,
1948                 (unsigned long)ib->b.fileaddr,
1949                 b->b.physaddr);
1950         /* FIXME should peer_find go here or in load_block? */
1951
1952         if (adopt)
1953                 block_adopt(&b->b, ib);
1954         lafs_iounlock_block(&ib->b);
1955 out:
1956         if (IS_ERR(ib))
1957                 return PTR_ERR(ib);
1958         if (ib)
1959                 putiref(ib, MKREF(findblock));
1960         return 0;
1961 }
1962
1963 int __must_check
1964 lafs_find_block(struct datablock *b, int adopt)
1965 {
1966         return find_block(b, adopt, SYNC);
1967 }
1968
1969 int __must_check
1970 lafs_find_block_async(struct datablock *b)
1971 {
1972         return find_block(b, ADOPT, ASYNC);
1973 }
1974
1975 /*
1976  * lafs_allocated_block records that a block has been written at a new
1977  * address (or found at an address during roll-forward). This involves
1978  * updating some summary information (quota etc) and making sure
1979  * the change gets recorded in the parent block.
1980  * This must be called before B_Dirty or B_Realloc are cleared,
1981  * to ensure that the index block is marked accordingly.
1982  *  B_UnincCredit should still be set, and we clear it.
1983  */
1984
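/*
 * Helper for lafs_allocated_block() below: record blk's new address
 * with its parent - on the parent's 'uninc' chain for index blocks,
 * or in the parent's uninc_table for data blocks.  Returns 1 when
 * the address has been recorded, or 0 when the table was full (or
 * ordering could not be preserved) and had to be incorporated first,
 * in which case the caller must retry against the possibly-changed
 * parent.
 */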
1985 int lafs_add_block_address(struct fs *fs, struct block *blk)
1986 {
1987         struct indexblock *p = blk->parent;
1988         struct addr *a;
1989
1990         spin_lock(&blk->inode->i_data.private_lock);
1991         if (test_bit(B_Index, &blk->flags)) {
1992                 if (!test_and_set_bit(B_Uninc, &blk->flags)) {
1993                         blk->chain = p->uninc;
1994                         LAFS_BUG(p->depth == 1, blk);
1995                         p->uninc = blk;
1996                         getref(blk, MKREF(uninc));
1997                 }
1998                 spin_unlock(&blk->inode->i_data.private_lock);
1999                 return 1;
2000         }
2001
2002         /* data block in the same phase as its leaf: record the address in uninc_table */
2003
2004         /* The only place we try to merge is at the end
2005          * of the embedded table, and then only for data
2006          * blocks.
2007          * It is essential that uninc_table is sorted and,
2008          * in particular, has no overlapping extents.
2009          * If this block threatens to break that, we incorporate first.
2010          */
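        /* For example (illustrative numbers): a final entry
         * {fileaddr 100, physaddr 5000, cnt 2} simply grows to cnt 3
         * when blk arrives with fileaddr 102 and physaddr 5002.
         */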
2011
2012         LAFS_BUG(!test_bit(B_Dirty, &p->b.flags) &&
2013                  !test_bit(B_Realloc, &p->b.flags), blk);
2014
2015         a = &p->uninc_table.pending_addr
2016                 [p->uninc_table.pending_cnt-1];
2017         if (p->uninc_table.pending_cnt >= 1 &&
2018             a->fileaddr+a->cnt == blk->fileaddr &&
2019             a->physaddr != 0 &&
2020             a->physaddr+a->cnt == blk->physaddr) {
2021                 a->cnt++;
2022                 if (test_and_clear_bit(B_UnincCredit, &blk->flags))
2023                         p->uninc_table.credits++;
2024                 spin_unlock(&blk->inode->i_data.private_lock);
2025                 return 1;
2026         } else if ((p->uninc_table.pending_cnt == 0 ||
2027                     a->fileaddr + a->cnt <= blk->fileaddr) &&
2028                    /* properly ordered, maybe add */
2029                    (p->uninc_table.pending_cnt <
2030                     ARRAY_SIZE(p->uninc_table.pending_addr))) {
2031                 a++;
2032                 a->fileaddr = blk->fileaddr;
2033                 a->physaddr = blk->physaddr;
2034                 a->cnt = 1;
2035                 p->uninc_table.pending_cnt++;
2036                 if (test_and_clear_bit(B_UnincCredit,
2037                                        &blk->flags))
2038                         p->uninc_table.credits++;
2039                 spin_unlock(&blk->inode->i_data.private_lock);
2040
2041                 return 1;
2042         } else {
2043                 spin_unlock(&blk->inode->i_data.private_lock);
2044                 /* We own a ref through blk->parent now, but
2045                  * after lafs_incorporate, we might not
2046                  */
2047                 getiref(p, MKREF(addblock));
2048                 lafs_iolock_written(&p->b);
2049                 lafs_incorporate(fs, p);
2050                 lafs_iounlock_block(&p->b);
2051                 putiref(p, MKREF(addblock));
2052                 return 0;
2053         }
2054 }
2055
2056 int lafs_allocated_block(struct fs *fs, struct block *blk, u64 phys)
2057 {
2058         int ph;
2059         struct indexblock *p;
2060
2061         dprintk("Allocated %s -> %llu\n",  strblk(blk), phys);
2062
2063         /* If phys is 0, I don't need an uninc credit.
2064          * Also, index blocks that are not nearly full do not need UnincCredit.
2065          */
2066         LAFS_BUG(test_bit(B_InoIdx, &blk->flags), blk);
2067         if (!test_bit(B_Dirty, &blk->flags) &&
2068             !test_bit(B_Realloc, &blk->flags) &&
2069             phys != 0)
2070                 printk("something missing %s\n", strblk(blk));
2071         LAFS_BUG(!test_bit(B_UnincCredit, &blk->flags) && phys &&
2072                  !test_bit(B_Index, &blk->flags), blk);
2073         LAFS_BUG(!test_bit(B_Dirty, &blk->flags) &&
2074                  !test_bit(B_Realloc, &blk->flags) &&
2075                  phys != 0, blk);
2076
2077         LAFS_BUG(!test_bit(B_Writeback, &blk->flags), blk);
2078
2079         if (test_bit(B_Root, &blk->flags)) {
2080                 int i;
2081                 struct lafs_inode *lai;
2082
2083                 LAFS_BUG(test_bit(B_Index, &blk->flags), blk);
2084
2085                 for (i = 0; i < fs->maxsnapshot; i++)
2086                         if (fs->ss[i].root == blk->inode)
2087                                 break;
2088                 LAFS_BUG(i == fs->maxsnapshot, blk);
2089                 blk->physaddr = phys; /* superblock doesn't get
2090                                          counted in summaries */
2091                 LAFS_BUG(test_bit(B_SegRef, &blk->flags), blk);
2092                 set_bit(B_PhysValid, &blk->flags);
2093                 fs->ss[i].root_addr = phys;
2094                 lai = LAFSI(blk->inode);
2095                 if (i == 0 && lai->md.fs.usagetable > 1)
2096                         /* snapshot checkpoint is happening.  Record root
2097                          * address in main fs and in snapshot
2098                          */
2099                         fs->ss[lai->md.fs.usagetable-1].root_addr = phys;
2100
2101                 /* Don't bother to clear the B_UnincCredit in the root.
2102                  * It'll just get set again.  This way there is less churn.
2103                  */
2104                 return 0;
2105         }
2106         if (test_bit(FinalCheckpoint, &fs->fsstate) &&
2107             !test_bit(B_Index, &blk->flags) &&
2108             (LAFSI(blk->inode)->type == TypeSegmentMap ||
2109              LAFSI(blk->inode)->type == TypeQuota)) {
2110                 /* These blocks will be picked up by roll-forward,
2111                  * so don't need to have their address recorded,
2112                  * and doing so would leave the index blocks
2113                  * dirty after the checkpoint.
2114                  * So just return
2115                  */
2116                 if (test_and_clear_bit(B_SegRef, &blk->flags))
2117                         lafs_seg_deref(fs, blk->physaddr, 0);
2118                 blk->physaddr = phys;
2119                 set_bit(B_PhysValid, &blk->flags);
2120                 return 0;
2121         }
2122         p = blk->parent;
2123
2124         LAFS_BUG(!p, blk);
2125
2126         /* We need to work out if this block is in the current phase or
2127          * the next one.  This can only be an issue if a phase change
2128          * (aka checkpoint) is underway. But if it is, we need to
2129          * ensure the right phase is used
2130          */
2131         lafs_checkpoint_lock(fs);
2132         if (test_bit(B_Pinned, &blk->flags))
2133                 ph = !!test_bit(B_Phase1, &blk->flags);
2134         else if (test_bit(B_Index, &blk->flags))
2135                 LAFS_BUG(1, blk); /* Index blocks must always be pinned when dirty */
2136         else if (p->uninc_table.pending_cnt) {
2137                 /* attach data block into previous phase as incorporation of
2138                  * this phase is still to happen.
2139                  * This is needed for flush-if-cannot-preallocate to work.
2140                  */
2141                 LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), &p->b);
2142                 ph = !!test_bit(B_Phase1, &p->b.flags);
2143                 lafs_pin_block_ph(blk, ph);
2144         } else {
2145                 lafs_pin_block(blk);
2146                 ph = !!test_bit(B_Phase1, &blk->flags);
2147         }
2148         lafs_checkpoint_unlock(fs);
2149         if (!test_bit(B_PhysValid, &blk->flags))
2150                 blk->physaddr = 0;
2151         lafs_summary_update(fs, blk->inode, blk->physaddr, phys,
2152                             test_bit(B_Index, &blk->flags), ph,
2153                             test_bit(B_SegRef, &blk->flags));
2154         clear_bit(B_Prealloc, &blk->flags);
2155
2156         blk->physaddr = phys;
2157         set_bit(B_PhysValid, &blk->flags);
2158
2159         if (ph != !!test_bit(B_Phase1, &p->b.flags)) {
2160                 /* in the next phase */
2161                 dprintk("uninc next phase\n");
2162                 if (test_and_set_bit(B_Uninc, &blk->flags))
2163                         /* This block was already on the 'unincorporated'
2164                          * list, (it must be changing quickly) so there
2165                          * is nothing else to do.
2166                          */
2167                         return 0;
2168                 getref(blk, MKREF(uninc));
2169
2170                 spin_lock(&blk->inode->i_data.private_lock);
2171                 blk->chain = p->uninc_next;
2172                 p->uninc_next = blk;
2173                 spin_unlock(&blk->inode->i_data.private_lock);
2174                 /* We will try again after the phase change.
2175                  * That will be when we clean B_UnincCredit
2176                  */
2177                 return 0;
2178         }
2179
2180 new_parent:
2181         if (test_bit(B_Dirty, &blk->flags) || phys == 0) {
2182                 if (!test_bit(B_Dirty, &p->b.flags)
2183                     && !test_bit(B_Credit, &p->b.flags)) {
2184                         printk("Oh dear: %s\n", strblk(blk));
2185                         printk(".......: %s\n", strblk(&p->b));
2186                 }
2187                 lafs_dirty_iblock(p);
2188         } else {
2189                 if (!test_bit(B_Valid, &p->b.flags)) {
2190                         printk("Not valid in lafs_allocated_block: %s\n",
2191                                strblk(&p->b));
2192                         printk("blk is %s\n", strblk(blk));
2193                         BUG();
2194                 }
2195                 LAFS_BUG(!test_bit(B_Valid, &p->b.flags), &p->b);
2196                 if (!test_bit(B_Realloc, &p->b.flags)) {
2197                         /* I cannot use B_Credit to fill B_Realloc as that
2198                          * might still be needed for B_Dirty.
2199                          * So if we cannot allocate a new credit,
2200                          * just set the block as 'dirty' now.
2201                          */
2202                         if (lafs_space_alloc(fs, 1, CleanSpace) == 1) {
2203                                 if (test_and_set_bit(B_Realloc, &p->b.flags))
2204                                         lafs_space_return(fs, 1);
2205                         } else
2206                                 lafs_dirty_iblock(p);
2207                 }
2208                 if (!test_and_set_bit(B_UnincCredit, &p->b.flags))
2209                         if (!test_and_clear_bit(B_ICredit, &p->b.flags))
2210                                 /* ICredit should be set before we dirty a block */
2211                                 LAFS_BUG(1, &p->b);
2212
2213         }
2214
2215         dprintk("Uninc same phase\n");
2216
2217         BUG_ON(!test_bit(B_Pinned, &blk->parent->b.flags));
2218
2219         if (!lafs_add_block_address(fs, blk)) {
2220                 p = blk->parent;
2221
2222                 LAFS_BUG(!p, blk);
2223                 LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), blk);
2224                 goto new_parent;
2225         }
2226
2227         /* That credit has now been added to the uninc_table and
2228          * thus made available to the parent
2229          */
2230
2231         lafs_refile(&p->b, 0);
2232         return 0;
2233 }