index.c

   1
   2 /*
   3  * Handle index block for LaFS.
   4  * fs/lafs/index.c
   5  * Copyright (C) 2005-2009
   6  * Neil Brown <neilb@suse.de>
   7  * Released under the GPL, version 2
   8  */
   9
  10 #include        "lafs.h"
  11 #include        <linux/types.h>
  12 #include        <linux/hash.h>
  13 #include        <linux/slab.h>
  14 /*
  15  * For each index-block there is an 'indexblock' data structure,
  16  * and a 'block' of memory, which is a page or part there-of.
  17  *
  18  * Index-blocks are stored in a hash table, indexed by
  19  * (struct inode *, fileaddr, depth).  There is one hash table
  20  * for all filesystems.
  21  *
  22  * There is a single free-list for all index blocks from all
  23  * filesystems, of all sizes.  Index blocks only get on this list when
  24  * they are unpinned leafs.  i.e. their 'children' list is empty.  As they
  25  * are freed, their parents may get added to the list.
  26  *
  27  * It's not clear yet how memory pressure will flush out dirty index blocks.
  28  *
  29  * All indexblocks for an inode are either attached under the inode via
  30  * the children/siblings tree, or on a per-inode 'free' list also using
  31  * the 'siblings' linkage. Thus we are sure that they can be removed when
  32  * the inode is removed.  Note that data blocks do *not* necessarily live
  33  * in this tree (though they will when they are dirty).  They can always be
  34  * found in the radix_tree for the inode's address-space.
  35  *
  36  * STILL TO DO:
  37  *   Consider how 'rcu' can be used for
  38  *     - hash table
  39  *     - freelist
  40  *   Consider gang-freeing for adding to head of lru lists.
  41  */
  42
  43 /* FIXME these should be configured at runtime based on memory */
  44 #define HASHBITS        10
  45 static struct hlist_head hash_table[1<<HASHBITS];
  46 spinlock_t lafs_hash_lock;
  47
  48 /* static */ struct freelists freelist;
  49
  50 static int lafs_shrinker(int nr_to_scan, /*gfp_t*/unsigned int gfp_mask)
  51 {
  52         if (nr_to_scan) {
  53                 LIST_HEAD(togo);
  54                 if (!(gfp_mask & __GFP_FS))
  55                         return -1;
  56                 dprintk("scan %d\n", nr_to_scan);
  57                 spin_lock(&lafs_hash_lock);
  58                 while (nr_to_scan && !list_empty(&freelist.lru)) {
  59                         struct indexblock *ib = list_entry(freelist.lru.next,
  60                                                            struct indexblock,
  61                                                            b.lru);
  62                         /* Check if it is really free */
  63
  64                         if (atomic_read(&ib->b.refcnt)) {
  65                                 list_del_init(&ib->b.lru);
  66                                 clear_bit(B_OnFree, &ib->b.flags);
  67                                 freelist.freecnt--;
  68                                 continue;
  69                         }
  70                         if (test_bit(B_InoIdx, &ib->b.flags))
  71                                 LAFSI(ib->b.inode)->iblock = NULL;
  72
  73                         /* Any pinned iblock must have children, be
  74                          * refcounted, or be on a 'leaf' lru, which gives
  75                          * a refcount.  So this unreffed block cannot be
  76                          * pinned.  This BUG_ON is here because previously
  77                          * the above 'if' also tested for 'Pinned', and that
  78                          * seems unnecessary, but I want to be sure.
  79                          */
  80                         LAFS_BUG(test_bit(B_Pinned, &ib->b.flags), &ib->b);
  81                         if (!hlist_unhashed(&ib->hash))
  82                                 hlist_del_init(&ib->hash);
  83                         /* delete from inode->free_index */
  84                         list_del_init(&ib->b.siblings);
  85                         list_move(&ib->b.lru, &togo);
  86                         nr_to_scan--;
  87                         freelist.freecnt--;
  88                 }
  89                 spin_unlock(&lafs_hash_lock);
  90                 while (!list_empty(&togo)) {
  91                         struct indexblock *ib = list_entry(togo.next,
  92                                                            struct indexblock,
  93                                                            b.lru);
  94                         list_del(&ib->b.lru);
  95                         lafs_iblock_free(ib);
  96                 }
  97         }
  98         return freelist.freecnt;
  99 }
 100
 101 void lafs_release_index(struct list_head *head)
 102 {
 103         /* This is the other place that index blocks get freed.
 104          * They are freed either by memory pressure in lafs_shrinker
 105          * above, or when an inode is destroyed by this code.
 106          * They need to both obey the same locking rules, and ensure
 107          * blocks are removed from each of siblings, lru, and hash.
 108          * This list is an inode's free_index and are linked on
 109          * b.siblings.  They are all on the freelist.lru
 110          */
 111         LIST_HEAD(togo);
 112         spin_lock(&lafs_hash_lock);
 113         while (!list_empty(head)) {
 114                 struct indexblock *ib = list_entry(head->next,
 115                                                    struct indexblock,
 116                                                    b.siblings);
 117                 list_del_init(&ib->b.siblings);
 118                 LAFS_BUG(!test_bit(B_OnFree, &ib->b.flags), &ib->b);
 119                 if (!hlist_unhashed(&ib->hash))
 120                         hlist_del_init(&ib->hash);
 121                 list_move(&ib->b.lru, &togo);
 122                 freelist.freecnt--;
 123         }
 124         spin_unlock(&lafs_hash_lock);
 125         while (!list_empty(&togo)) {
 126                 struct indexblock *ib = list_entry(togo.next,
 127                                                    struct indexblock,
 128                                                    b.lru);
 129                 list_del(&ib->b.lru);
 130                 lafs_iblock_free(ib);
 131         }
 132 }
 133
 134 struct indexblock *
 135 lafs_iblock_alloc(struct fs *fs, int gfp, int with_buffer, REFARG)
 136 {
 137         struct indexblock *ib;
 138
 139         ib = kzalloc(sizeof(*ib), gfp);
 140         if (!ib)
 141                 return NULL;
 142         if (with_buffer) {
 143                 ib->data = kmalloc(fs->blocksize, gfp);
 144                 if (!ib->data) {
 145                         kfree(ib);
 146                         return NULL;
 147                 }
 148         } else
 149                 ib->data = NULL;
 150
 151         ib->b.flags = 0;
 152         set_bit(B_Index, &ib->b.flags);
 153         atomic_set(&ib->b.refcnt, 1);
 154         add_ref(&ib->b, REF, __FILE__, __LINE__);
 155
 156         INIT_LIST_HEAD(&ib->children);
 157         atomic_set(&ib->pincnt[0], 0);
 158         atomic_set(&ib->pincnt[1], 0);
 159         ib->uninc = NULL;
 160         ib->uninc_next = NULL;
 161
 162         ib->b.parent = NULL;
 163         INIT_LIST_HEAD(&ib->b.siblings);
 164         INIT_LIST_HEAD(&ib->b.lru);
 165         INIT_LIST_HEAD(&ib->b.peers);
 166         INIT_HLIST_NODE(&ib->hash);
 167         ib->b.chain = NULL;
 168         ib->uninc_table.pending_cnt = 0;
 169         ib->uninc_table.credits = 0;
 170         ib->b.inode = NULL;
 171         return ib;
 172 }
 173
 174 void lafs_iblock_free(struct indexblock *ib)
 175 {
 176         kfree(ib->data);
 177         kfree(ib);
 178 }
 179
 180 static struct shrinker hash_shrink = {
 181         .shrink = lafs_shrinker,
 182         .seeks = DEFAULT_SEEKS,
 183 };
 184
 185 int lafs_ihash_init(void)
 186 {
 187         int i;
 188         for (i = 0 ; i < (1<<HASHBITS); i++)
 189                 INIT_HLIST_HEAD(hash_table+i);
 190         spin_lock_init(&lafs_hash_lock);
 191         INIT_LIST_HEAD(&freelist.lru);
 192         register_shrinker(&hash_shrink);
 193
 194         return 0;
 195 }
 196
 197 void lafs_ihash_free(void)
 198 {
 199         unregister_shrinker(&hash_shrink);
 200 }
 201
 202 static int
 203 ihash(struct inode *inode, faddr_t addr, int depth)
 204 {
 205         unsigned long h = hash_ptr(inode, BITS_PER_LONG);
 206         h = hash_long(h ^ addr, BITS_PER_LONG);
 207         return hash_long(h ^ depth, HASHBITS);
 208 }
 209
 210 static struct indexblock *
 211 ihash_lookup(struct inode *inode, faddr_t addr, int depth,
 212              struct indexblock *new, REFARG)
 213 {
 214         struct indexblock *ib = NULL;
 215         struct hlist_node *tmp;
 216         struct hlist_head *head =  &hash_table[ihash(inode, addr, depth)];
 217
 218         spin_lock(&lafs_hash_lock);
 219         hlist_for_each_entry(ib, tmp, head, hash)
 220                 if (ib->b.fileaddr == addr && ib->b.inode == inode &&
 221                     ib->depth == depth)
 222                         break;
 223         if (!tmp) {
 224                 if (new) {
 225                         hlist_add_head(&new->hash, head);
 226                         ib = new;
 227                 } else
 228                         ib = NULL;
 229         }
 230         if (ib)
 231                 getiref_locked_needsync(ib, REF);
 232         spin_unlock(&lafs_hash_lock);
 233         if (ib)
 234                 sync_ref(&ib->b);
 235         return ib;
 236 }
 237
 238 struct indexblock *lafs_getiref_locked(struct indexblock *ib)
 239 {
 240         int ok = 0;
 241         if (!ib)
 242                 return ib;
 243         LAFS_BUG(!test_bit(B_InoIdx, &ib->b.flags), &ib->b);
 244         if (spin_is_locked(&ib->b.inode->i_data.private_lock))
 245                 ok = 1;
 246         if (spin_is_locked(&lafs_hash_lock))
 247                 ok = 1;
 248         if (spin_is_locked(&fs_from_inode(ib->b.inode)->lock))
 249                 ok = 1;
 250         LAFS_BUG(!ok, &ib->b);
 251
 252         if (atomic_inc_return(&ib->b.refcnt) == 1) {
 253                 /* First reference to an InoIdx block implies a reference
 254                  * to the dblock
 255                  * If two different threads come here under different locks, one could
 256                  * exit before the ref on the dblock is taken.  However as each
 257                  * thread must own an indirect reference to be here, the last indirect ref to
 258                  * the dblock cannot disappear before all threads have exited this function,
 259                  * by which time this direct ref will be active.
 260                  */
 261                 struct datablock *db;
 262                 db = getdref(LAFSI(ib->b.inode)->dblock, MKREF(iblock));
 263                 LAFS_BUG(db == NULL, &ib->b);
 264         }
 265         return ib;
 266 }
 267
 268 struct indexblock *
 269 lafs_iblock_get(struct inode *ino, faddr_t addr, int depth, paddr_t phys, REFARG)
 270 {
 271         /* Find the index block with this inode/depth/address.
 272          * If it isn't in the cache, create it with given
 273          * phys address
 274          */
 275         struct indexblock *ib = ihash_lookup(ino, addr, depth, NULL, REF);
 276         struct indexblock *new;
 277         if (ib)
 278                 return ib;
 279
 280         if (!phys)
 281                 return NULL;
 282
 283         new = lafs_iblock_alloc(fs_from_inode(ino), GFP_KERNEL, 1, REF);
 284         if (!new)
 285                 return NULL;
 286         new->b.fileaddr = addr;
 287         new->b.inode = ino;
 288         new->depth = depth;
 289         new->b.physaddr = phys;
 290         set_bit(B_PhysValid, &new->b.flags);
 291
 292         ib = ihash_lookup(ino, addr, depth, new, REF);
 293         if (ib != new)
 294                 lafs_iblock_free(new);
 295         return ib;
 296 }
 297
 298 void lafs_hash_iblock(struct indexblock *ib)
 299 {
 300         /* This block was an InoIdx block but has just become a real
 301          * index block, so hash it.  There cannot be a block with this
 302          * address already as we haven't increased the inode depth yet.
 303          * Alternately this block was recently split off another,
 304          * and has now been incorporated, so a lookup might need
 305          * to find it.  So it better be in the hash table.
 306          */
 307         struct hlist_head *head =
 308                 &hash_table[ihash(ib->b.inode, ib->b.fileaddr, ib->depth)];
 309         spin_lock(&lafs_hash_lock);
 310         LAFS_BUG(!hlist_unhashed(&ib->hash), &ib->b);
 311         hlist_add_head(&ib->hash, head);
 312         spin_unlock(&lafs_hash_lock);
 313 }
 314
 315 void lafs_unhash_iblock(struct indexblock *ib)
 316 {
 317         /* This block has been emptied and deleted and is no longer
 318          * wanted in the hash tables
 319          */
 320         spin_lock(&lafs_hash_lock);
 321         LAFS_BUG(hlist_unhashed(&ib->hash), &ib->b);
 322         hlist_del_init(&ib->hash);
 323         spin_unlock(&lafs_hash_lock);
 324 }
 325
 326 /* adopt blk into parent if possible.
 327  */
 328 static void
 329 block_adopt(struct block *blk, struct indexblock *parent)
 330 {
 331         struct address_space *as;
 332
 333         if (parent == NULL) {
 334                 LAFS_BUG(!test_bit(B_Root, &blk->flags), blk);
 335                 return;
 336         }
 337
 338         if (blk->parent) {
 339                 LAFS_BUG(blk->parent != parent, blk);
 340                 return;
 341         }
 342
 343         LAFS_BUG(!(parent->b.flags & B_Valid), &parent->b);
 344
 345         as = &blk->inode->i_data;
 346
 347         spin_lock(&as->private_lock);
 348         if (blk->parent) {
 349                 if (blk->parent != parent) {
 350                         printk("blk = %s\n", strblk(blk));
 351                         printk("blk->p = %s\n", strblk(&blk->parent->b));
 352                         printk("parent = %s\n", strblk(&parent->b));
 353                         lafs_dump_tree();
 354                 }
 355                 LAFS_BUG(blk->parent != parent, blk);
 356         } else {
 357                 if (test_bit(B_EmptyIndex, &parent->b.flags)) {
 358                         LAFS_BUG(!test_bit(B_Index, &parent->b.flags), &parent->b);
 359                         LAFS_BUG(test_bit(B_InoIdx, &parent->b.flags), &parent->b);
 360                         LAFS_BUG(parent->b.fileaddr != parent->b.parent->b.fileaddr, &parent->b);
 361
 362                         clear_bit(B_EmptyIndex, &parent->b.flags);
 363                 }
 364                 LAFS_BUG(parent == iblk(blk), blk);
 365                 blk->parent = parent;
 366                 getiref(parent, MKREF(child));
 367                 /* Remove from the per-inode free list */
 368                 spin_lock(&lafs_hash_lock);
 369                 if (!test_bit(B_InoIdx, &blk->flags))
 370                         list_move(&blk->siblings, &parent->children);
 371                 else {
 372                         /* Remove from ->free_index list */
 373                         list_del_init(&blk->siblings);
 374                         /* Having set ->parent, we take a ref on the dblock.
 375                          * The dblock must exist because we can only reach here from
 376                          * lafs_make_iblock which can only be called while holding a ref
 377                          * on the dblock
 378                          */
 379                         (void)getdref(LAFSI(blk->inode)->dblock, MKREF(pdblock));
 380                 }
 381                 spin_unlock(&lafs_hash_lock);
 382         }
 383         spin_unlock(&as->private_lock);
 384 }
 385
 386 static int __must_check
 387 find_block(struct datablock *b, int adopt, int async);
 388
 389 static int __must_check
 390 setparent(struct datablock *blk, int async)
 391 {
 392         int err = 0;
 393         if (blk->b.parent == NULL && !test_bit(B_Root, &blk->b.flags))
 394                 err = find_block(blk, 1, async);
 395         return err;
 396 }
 397
 398 int __must_check
 399 lafs_setparent(struct datablock *blk)
 400 {
 401         return setparent(blk, 0);
 402 }
 403
 404 void lafs_pin_block_ph(struct block *b, int ph)
 405 {
 406         struct fs *fs = fs_from_inode(b->inode);
 407         struct block *p;
 408         struct inode *ino;
 409
 410         LAFS_BUG(b->parent == NULL && !test_bit(B_Root, &b->flags), b);
 411         if (!test_bit(B_Realloc, &b->flags) &&
 412             LAFSI(b->inode)->type != TypeSegmentMap)
 413                 LAFS_BUG(!test_phase_locked(fs), b);
 414
 415         ino = b->inode;
 416         spin_lock(&ino->i_data.private_lock);
 417         for (p = b ;
 418              p && !test_bit(B_Pinned, &p->flags) ;
 419              p = &(p->parent)->b) {
 420                 set_phase(p, ph);
 421                 if (test_bit(B_InoIdx, &p->flags)) {
 422                         struct datablock *db = LAFSI(p->inode)->dblock;
 423
 424                         spin_unlock(&ino->i_data.private_lock);
 425                         lafs_inode_checkpin(p->inode);
 426                         ino = db->b.inode;
 427                         spin_lock(&ino->i_data.private_lock);
 428                 }
 429         }
 430         spin_unlock(&ino->i_data.private_lock);
 431         lafs_refile(b, 0);
 432 }
 433
 434 static void pin_all_children(struct indexblock *ib)
 435 {
 436         /* If we cannot get new credits for phase_flip, we need to
 437          * pin all dirty data-block children so that they get
 438          * written and don't remain to require a phase_flip
 439          * on the next checkpoint (at which point there will be
 440          * no credit left)
 441          * So if any dirty child is found, set phase.  We don't
 442          * need PinPending as it is already Dirty.  All other
 443          * requirement for pinning will already be met.
 444          */
 445         struct block *b;
 446         struct indexblock *start = NULL;
 447         int depth = 0;
 448         struct fs *fs = fs_from_inode(ib->b.inode);
 449         int ph = fs->phase;
 450
 451         /* Locking:
 452          * This is called with an IOLock on ib,  So it is unlikely
 453          * that children will be added or removed(?), but we really
 454          * should hole the inode private_lock anyway.
 455          */
 456         getiref(ib, MKREF(pinall));
 457         b = NULL;
 458 restart:
 459         /* We hold a reference on ib, and on 'start' if non-NULL */
 460         spin_lock(&ib->b.inode->i_data.private_lock);
 461         b = list_prepare_entry(b, &ib->children, siblings);
 462         list_for_each_entry_continue(b, &ib->children, siblings) {
 463                 /* FIXME I don't descend through inode blocks to the file below! */
 464                 if (test_bit(B_Index, &b->flags)) {
 465                         /* restart down */
 466                         depth++;
 467                         getref_locked(b, MKREF(pinall));
 468                         spin_unlock(&ib->b.inode->i_data.private_lock);
 469                         putiref(ib, MKREF(pinall));
 470                         ib = iblk(b);
 471                         b = NULL;
 472                         goto restart;
 473                 } else if (test_bit(B_Dirty, &b->flags) &&
 474                            !test_bit(B_Pinned, &b->flags)) {
 475                         getref_locked(b, MKREF(pinall));
 476                         spin_unlock(&ib->b.inode->i_data.private_lock);
 477                         set_phase(b, ph);
 478                         putref(b, MKREF(pinall));
 479                         /* Slightly safer to restart this level */
 480                         b = NULL;
 481                         goto restart;
 482                 }
 483         }
 484         spin_unlock(&ib->b.inode->i_data.private_lock);
 485         putiref(start, MKREF(pinall));
 486         if (depth) {
 487                 depth--;
 488                 start = ib;
 489                 ib = getiref(ib->b.parent, MKREF(pinall));
 490                 b = &start->b;
 491                 goto restart;
 492         }
 493         putiref(ib, MKREF(pinall));
 494 }
 495
 496 static void flip_phase(struct block *b)
 497 {
 498         struct indexblock *p;
 499         int oldphase = !!test_bit(B_Phase1, &b->flags);
 500
 501         LAFS_BUG(!test_bit(B_Pinned, &b->flags), b);
 502         if (oldphase)
 503                 clear_bit(B_Phase1, &b->flags);
 504         else
 505                 set_bit(B_Phase1, &b->flags);
 506
 507         p = b->parent;
 508
 509         if (p) {
 510                 atomic_inc(&p->pincnt[1-oldphase]);
 511                 atomic_dec(&p->pincnt[oldphase]);
 512         } else
 513                 LAFS_BUG(!test_bit(B_Root, &b->flags), b);
 514
 515         /* move 'next' credits to 'here' */
 516         if (!test_bit(B_Credit, &b->flags) &&
 517             test_and_clear_bit(B_NCredit, &b->flags))
 518                 set_bit(B_Credit, &b->flags);
 519
 520         if (!test_bit(B_ICredit, &b->flags) &&
 521             test_and_clear_bit(B_NICredit, &b->flags))
 522                 set_bit(B_ICredit, &b->flags);
 523 }
 524
 525 /* When the pinning of a block needs to be carried across a
 526  * checkpoint, we need to 'flip' the phase.
 527  * This only applies to blocks that can be pinned by a block that
 528  * may not be written until the next phase.
 529  * This includes any index block (is it may have some children in one
 530  * phase and some in the next) and any space-accounting block
 531  * for the same reason.
 532  * So indexblocks will need to flip, and use lafs_phase_flip.
 533  * TypeSegmentMap and TypeQuota also need to flip and use lafs_flip_dblock.
 534  * TypeInodeFile don't need to be phase_flipped, though their InoIdx
 535  * block might.  InodeFile blocks are only pinned by metadata transactions
 536  * which happen inside a checkpoint lock.
 537  */
 538
 539 void lafs_flip_dblock(struct datablock *db)
 540 {
 541         /* This is an accounting block (SegmentMap or Quota)
 542          * which we need to write out after ->phase has changed
 543          * in the tail of the checkpoint.
 544          * We always flip the phase and reallocate from AccountSpace.
 545          * If all references get dropped, it will then get unpinned
 546          * before the next phase finished - we don't unpin here
 547          * (unlike lafs_phase_flip for index blocks).
 548          */
 549         flip_phase(&db->b);
 550         lafs_prealloc(&db->b, AccountSpace);
 551         /* Parent might need to be on a leaflist now */
 552         lafs_refile(&db->b.parent->b, 0);
 553 }
 554
 555 void lafs_phase_flip(struct fs *fs, struct indexblock *ib)
 556 {
 557         /* We are performing a checkpoint, this block has been written
 558          * out and now needs to be flipped into the next phase.
 559          *
 560          * It involves.
 561          *  - Processing all uninc_next blocks into uninc_table.
 562          *  - adjusting counts on parent
 563          *  - moving credits from 'next' to 'this' phase.
 564          *  - update block counts to included phase-delayed updates.
 565          */
 566         int oldphase = !!test_bit(B_Phase1, &ib->b.flags);
 567         struct block *ulist;
 568
 569         dprintk("FLIP %s\n", strblk(&ib->b));
 570
 571         LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
 572         LAFS_BUG(!test_bit(B_Index, &ib->b.flags), &ib->b);
 573         LAFS_BUG(ib->b.parent == NULL && !test_bit(B_Root, &ib->b.flags), &ib->b);
 574         LAFS_BUG(ib->uninc_table.pending_cnt, &ib->b);
 575         LAFS_BUG(atomic_read(&ib->pincnt[oldphase]), &ib->b);
 576         LAFS_BUG(!test_bit(B_IOLock, &ib->b.flags), &ib->b);
 577
 578         /* If this is the only reference, and pincnts are zero
 579          * and all relevant flags are clear, then don't flip
 580          * phase but unpin.
 581          */
 582         if (test_bit(B_Pinned, &ib->b.flags) &&
 583             !test_bit(B_Dirty, &ib->b.flags) &&
 584             !test_bit(B_Realloc, &ib->b.flags) &&
 585             atomic_read(&ib->b.refcnt) == 1 && /* This is us */
 586
 587             ib->uninc_table.pending_cnt == 0 &&
 588             ib->uninc == NULL &&
 589             ib->uninc_next == NULL &&
 590             atomic_read(&ib->pincnt[0]) == 0 &&
 591             atomic_read(&ib->pincnt[1]) == 0 &&
 592
 593             (!test_bit(B_InoIdx, &ib->b.flags) ||
 594              !(test_bit(B_PinPending, &LAFSI(ib->b.inode)->dblock->b.flags)
 595                || test_bit(B_Dirty, &LAFSI(ib->b.inode)->dblock->b.flags)
 596                || test_bit(B_Realloc, &LAFSI(ib->b.inode)->dblock->b.flags)))
 597                 ) {
 598                 if (test_and_clear_bit(B_Pinned, &ib->b.flags)) {
 599                         if (!test_bit(B_Root, &ib->b.flags)) {
 600                                 atomic_dec(&ib->b.parent->pincnt[oldphase]);
 601                                 lafs_refile(&ib->b.parent->b, 0);
 602                         }
 603                         lafs_inode_checkpin(ib->b.inode);
 604                 }
 605                 if (test_bit(B_InoIdx, &ib->b.flags))
 606                         /* maybe data block should go in leaf list now */
 607                         lafs_refile(&LAFSI(ib->b.inode)->dblock->b, 0);
 608                 lafs_iounlock_block(&ib->b);
 609                 return;
 610         }
 611
 612         flip_phase(&ib->b);
 613
 614         if (test_bit(B_InoIdx, &ib->b.flags)) {
 615                 struct lafs_inode *lai = LAFSI(ib->b.inode);
 616                 struct datablock *db = lai->dblock;
 617                 struct inode *ino = db->b.inode;
 618
 619                 spin_lock(&ino->i_data.private_lock);
 620                 lai->cblocks += lai->pblocks;
 621                 lai->pblocks = 0;
 622                 lai->ciblocks += lai->piblocks;
 623                 lai->piblocks = 0;
 624                 if (lai->type == TypeInodeFile) {
 625                         /* Root of filesystem */
 626                         lai->md.fs.cblocks_used +=
 627                                 lai->md.fs.pblocks_used;
 628                         lai->md.fs.pblocks_used = 0;
 629                 }
 630                 spin_unlock(&ino->i_data.private_lock);
 631
 632                 /* maybe data block needs to be on leaf list */
 633                 lafs_refile(&db->b, 0);
 634         }
 635
 636         if (lafs_prealloc(&ib->b, ReleaseSpace) < 0) {
 637                 /* Couldn't get all N*Credit, so
 638                  * Pin all children to this phase, so they
 639                  * won't be needed.
 640                  */
 641                 pin_all_children(ib);
 642         }
 643
 644         spin_lock(&ib->b.inode->i_data.private_lock);
 645         ulist = ib->uninc_next;
 646         ib->uninc_next = NULL;
 647         spin_unlock(&ib->b.inode->i_data.private_lock);
 648
 649         lafs_iounlock_block(&ib->b);
 650
 651         if (ulist)
 652                 lafs_dirty_iblock(ib);
 653         while (ulist) {
 654                 struct block *b2 = ulist;
 655                 ulist = b2->chain;
 656                 b2->chain = NULL;
 657                 clear_bit(B_Uninc, &b2->flags);
 658                 while (lafs_add_block_address(fs, b2) == 0)
 659                         /* We had to incorporate b2->parent which might
 660                          * have split the parent but as that will have made
 661                          * it dirty, there is nothing extra that we need to
 662                          * do here
 663                          */
 664                         ;
 665                 putref(b2, MKREF(uninc));
 666         }
 667         LAFS_BUG(ib->uninc_next, &ib->b);
 668
 669         lafs_refile(&ib->b, 0);
 670         if (ib->b.parent)
 671                 /* Parent may need to be attached to a phase_leafs now */
 672                 lafs_refile(&ib->b.parent->b, 0);
 673 }
 674
 675 /*
 676  * lafs_refile is called whenever we finish doing stuff
 677  * to a block.  lafs_refile checks the state of the block and
 678  * make sure various lists etc are correct.
 679  * When finished with a block we often want to drop a reference,
 680  * and this functionality is included in refile so that the
 681  * decref and associated refiling can be done under a lock.
 682  *
 683  * Key issues that lafs_refile deals with are:
 684  *   Make sure ->lru is on correct list.
 685  *   Remove ->parent link if no longer needed.
 686  *   remove B_Lock if no longer needed
 687  *   trigger a phase change when appropriate
 688  *   Remove dir parent link if no longer needed.
 689  *
 690  * Fleshing these out a bit:
 691  *   possible lru lists are:
 692  *      phase_leafs[phase] - for dirty/pinned blocks with no pinned children
 693  *      clean_leafs - for blocks that are being relocated for cleaning
 694  *                       and that have no pinned children
 695  *   The parent link is only needed if this block has a refcount or one
 696  *     of B_Pinned, B_Dirty, B_Uninc
 697  *   B_Lock is only needed if one of
 698  *      B_Dirty
 699  *    or refcount
 700  *  though a refcount on a pinned datablock is only significant
 701  *         while the current phase is locked.
 702  *
 703  *  a phase change can (and should) happen for index blocks
 704  *  in the 'other' phase that are
 705  *    not Dirty, not Alloc, pincnt[oldphase]==0, uninc-table empty
 706  *
 707  *
 708  * Refiling one block may cause a change in another block which requires
 709  * a recursive refile.  Naturally we unroll the tail-recursion, but we
 710  * we need to be sure there are a limited number of tails.
 711  * The causes of recursion are:
 712  *   A datablock which no-longer has a PagePrivate page holds a ref
 713  *      on block0 of that page, which must be dropped on last put.
 714  *   When any block becomes unpinned, we must refile the parent.
 715  *   When we clear ->parent we must deref the parent.
 716  *   When an InoIdx block drops ->parent it is dropped from inode->iblock
 717  *      so we need to drop the implied refcnt on ->dblock
 718  * So there can only be 2 block to be recursed on to.
 719  *  A block0 or dblock, and a parent.
 720  * A block pointing to a block0 never has a parent, and an iblock has the same
 721  * parent as the dblock, so there can never be more than 2 outstanding blocks.
 722  */
 723 int lafs_is_leaf(struct block *b, int ph)
 724 {
 725         /* This is pinned and not iolocked.  Should it be on a leaf
 726          * list?
 727          */
 728         struct inode *myi;
 729         struct lafs_inode *lai;
 730         int rv = 1;
 731
 732         if (!test_bit(B_Pinned, &b->flags) ||
 733             test_bit(B_Writeback, &b->flags))
 734                 return 0;
 735
 736         if (test_bit(B_Index, &b->flags)) {
 737                 if (ph >= 0)
 738                         return atomic_read(&iblk(b)->pincnt[ph]) == 0;
 739                 return (atomic_read(&iblk(b)->pincnt[0]) +
 740                         atomic_read(&iblk(b)->pincnt[1])) == 0;
 741         }
 742
 743         /* This is a data block */
 744         myi = rcu_my_inode(dblk(b));
 745         if (!myi) {
 746                 /* Non-inode data blocks are always leafs */
 747                 return 1;
 748         }
 749         lai = LAFSI(myi);
 750         spin_lock(&lai->vfs_inode.i_data.private_lock);
 751         if (ph < 0)
 752                 ph = !!test_bit(B_Phase1, &b->flags);
 753         if (lai->iblock &&
 754             test_bit(B_Pinned, &lai->iblock->b.flags) &&
 755             !!test_bit(B_Phase1, &lai->iblock->b.flags) == ph)
 756                 /* iblock still pinned in the same phase, so
 757                  * this block isn't a leaf
 758                  */
 759                 rv = 0;
 760
 761         spin_unlock(&lai->vfs_inode.i_data.private_lock);
 762         rcu_iput(myi);
 763         return rv;
 764 }
 765
 766 /*
 767  * This for debugging and is racy and is probably only safe on UP
 768  */
 769 static void check_consistency(struct block *b)
 770 {
 771         /* sanity tests.
 772          * 1/ make sure pincnt is right
 773          */
 774         if (test_bit(B_Index, &b->flags)) {
 775                 int c[2];
 776                 struct block *cb;
 777                 c[0] = c[1] = 0;
 778                 list_for_each_entry(cb, &iblk(b)->children, siblings) {
 779                         struct inode *myi = NULL;
 780                         if (test_bit(B_Pinned, &cb->flags)) {
 781                                 int pp = !!test_bit(B_Phase1, &cb->flags);
 782                                 c[pp]++;
 783                         }
 784                         /* InoIdx block might be pinned but is not a direct
 785                          * child */
 786                         if (!test_bit(B_Index, &cb->flags) &&
 787                             (myi = rcu_my_inode(dblk(cb))) != NULL &&
 788                             LAFSI(myi)->iblock &&
 789                             test_bit(B_Pinned, &LAFSI(myi)->iblock->b.flags)) {
 790                                 int pp = !!test_bit(B_Phase1,
 791                                                     &LAFSI(myi)->iblock->b.flags);
 792                                 c[pp]++;
 793                         }
 794                         rcu_iput(myi);
 795                 }
 796                 if (c[0] != atomic_read(&iblk(b)->pincnt[0]) ||
 797                     c[1] != atomic_read(&iblk(b)->pincnt[1])) {
 798                         printk("%d %d\n", c[0], c[1]);
 799                         lafs_dump_tree();
 800                         LAFS_BUG(1, b);
 801                 }
 802         }
 803
 804         if (b->parent) {
 805                 struct indexblock *ib = b->parent;
 806                 int c[2];
 807                 struct block *cb;
 808                 c[0] = c[1] = 0;
 809                 list_for_each_entry(cb, &ib->children, siblings) {
 810                         struct inode *myi = NULL;
 811                         if (test_bit(B_Pinned, &cb->flags)) {
 812                                 int pp = !!test_bit(B_Phase1, &cb->flags);
 813                                 c[pp]++;
 814                         }
 815                         /* InoIdx block might be pinned but is not a direct
 816                          * child */
 817                         if (!test_bit(B_Index, &cb->flags) &&
 818                             (myi = rcu_my_inode(dblk(cb))) != NULL &&
 819                             LAFSI(myi)->iblock &&
 820                             test_bit(B_Pinned, &(LAFSI(myi)
 821                                                  ->iblock->b.flags))) {
 822                                 int pp = !!test_bit(B_Phase1,
 823                                                     &LAFSI(myi)
 824                                                     ->iblock->b.flags);
 825                                 c[pp]++;
 826                         }
 827                         rcu_iput(myi);
 828                 }
 829                 if (c[0] != atomic_read(&ib->pincnt[0]) ||
 830                     c[1] != atomic_read(&ib->pincnt[1])) {
 831                         printk("badcnt %d %d\n", c[0], c[1]);
 832                         LAFS_BUG(1, &ib->b);
 833                 }
 834         }
 835 }
 836
 837 static void set_lru(struct block *b)
 838 {
 839         struct fs *fs;
 840         int ph = !!test_bit(B_Phase1, &b->flags);
 841
 842         if (!test_bit(B_OnFree, &b->flags) && !list_empty_careful(&b->lru))
 843                 return;
 844         if (test_bit(B_IOLock, &b->flags) || !lafs_is_leaf(b, ph))
 845                 return;
 846
 847         /* This is close enough to a leaf that we should put it on a list.
 848          * If we raced and it isn't then it will be found and removed
 849          */
 850         if (test_bit(B_OnFree, &b->flags)) {
 851                 spin_lock(&lafs_hash_lock);
 852                 if (test_and_clear_bit(B_OnFree, &b->flags)) {
 853                         list_del_init(&b->lru);
 854                         freelist.freecnt--;
 855                 }
 856                 spin_unlock(&lafs_hash_lock);
 857         }
 858         fs = fs_from_inode(b->inode);
 859         spin_lock(&fs->lock);
 860         if (list_empty(&b->lru)) {
 861                 if (test_bit(B_Realloc, &b->flags))
 862                         list_add(&b->lru, &fs->clean_leafs);
 863                 else {
 864                         ph = !!test_bit(B_Phase1, &b->flags);
 865                         list_add(&b->lru, &fs->phase_leafs[ph]);
 866                 }
 867
 868         }
 869         spin_unlock(&fs->lock);
 870 }
 871
 872 void lafs_refile(struct block *b, int dec)
 873 {
 874         struct block *next = NULL, *next_parent = NULL;
 875         struct fs *fs = NULL;
 876         u64 physref = 0;
 877
 878         if (!b)
 879                 return;
 880
 881         fs = fs_from_inode(b->inode);
 882         check_consistency(b);
 883
 884 /* To (mostly) avoid recursion, we have a loop which may
 885  * walk up to the parent.
 886  * The main reason for holding lafs_hash_lock is that it protects
 887  * ->lru - i.e. all the lists. FIXME that should be fs->lock
 888  */
 889         LAFS_BUG(atomic_read(&b->refcnt) == 0, b);
 890         while (b) {
 891                 int ph;
 892                 int free_me = 0;
 893                 struct inode *checkpin = NULL;
 894                 struct inode *myi = NULL;
 895                 struct datablock *db;
 896
 897                 set_lru(b);
 898
 899                 if (test_bit(B_Pinned, &b->flags) &&
 900                     !test_bit(B_Index, &b->flags) &&
 901                     !test_bit(B_PinPending, &b->flags) &&
 902                     !test_bit(B_Dirty, &b->flags) &&
 903                     !test_bit(B_Realloc, &b->flags)
 904                         ) {
 905                         /* Don't need to be Pinned any more */
 906                         /* FIXME do I need to lock access to ->parent */
 907                         lafs_checkpoint_lock(fs);
 908                         ph = !!test_bit(B_Phase1, &b->flags);
 909                         if (test_and_clear_bit(B_Pinned, &b->flags)) {
 910                                 if (test_bit(B_Dirty, &b->flags) ||
 911                                     test_bit(B_Realloc, &b->flags) ||
 912                                     test_bit(B_PinPending, &b->flags))
 913                                         /* lost a race */
 914                                         set_phase(b, ph);
 915                                 else if (!test_bit(B_Root, &b->flags)) {
 916                                         struct block *nb;
 917                                         atomic_dec(&b->parent->pincnt[ph]);
 918                                         nb = &b->parent->b;
 919                                         if (next_parent != nb) {
 920                                                 LAFS_BUG(next_parent, b);
 921                                                 next_parent = nb;
 922                                                 atomic_inc(&nb->refcnt);
 923                                         }
 924                                 }
 925                         }
 926                         lafs_checkpoint_unlock(fs);
 927                 }
 928
 929                 if (dec && atomic_dec_and_lock(&b->refcnt, &b->inode->i_data.private_lock)) {
 930                         /* PinPending disappears with the last non-lru reference,
 931                          * (or possibly at other times).
 932                          */
 933                         clear_bit(B_PinPending, &b->flags);
 934
 935                         ph = !!test_bit(B_Phase1, &b->flags);
 936                         /* See if we still need to be pinned */
 937                         if (test_bit(B_Pinned, &b->flags) &&
 938                             !test_bit(B_Dirty, &b->flags) &&
 939                             !test_bit(B_Realloc, &b->flags)) {
 940                                 /* Don't need to be Pinned any more */
 941                                 if (test_and_clear_bit(B_Pinned, &b->flags)) {
 942                                         if (!list_empty_careful(&b->lru)) {
 943                                                 spin_lock(&fs->lock);
 944                                                 list_del_init(&b->lru);
 945                                                 spin_unlock(&fs->lock);
 946                                         }
 947                                         if (test_bit(B_InoIdx, &b->flags) &&
 948                                             b->inode->i_nlink)
 949                                                 checkpin = b->inode;
 950                                         if (!test_bit(B_Root, &b->flags)) {
 951                                                 struct block *nb;
 952                                                 atomic_dec(&b->parent->pincnt[ph]);
 953                                                 if (test_bit(B_InoIdx, &b->flags))
 954                                                         nb = &LAFSI(b->inode)->dblock->b;
 955                                                 else
 956                                                         nb = &b->parent->b;
 957                                                 if (next_parent != nb) {
 958                                                         LAFS_BUG(next_parent, b);
 959                                                         next_parent = nb;
 960                                                         atomic_inc(&nb->refcnt);
 961                                                 }
 962                                         }
 963                                 }
 964                         }
 965
 966                         /* check the ->parent link */
 967                         if (b->parent &&
 968                             !(b->flags & (
 969                                       (1<<B_Pinned) |
 970                                       (1<<B_Uninc) |
 971                                       (1<<B_Realloc) |
 972                                       (1<<B_Dirty)))
 973                                 ) {
 974                                 int credits;
 975                                 /* Don't need ->parent any more */
 976                                 if (next_parent == NULL)
 977                                         next_parent = &b->parent->b;
 978                                 else if (next_parent == &b->parent->b ||
 979                                          next_parent->parent == b->parent) {
 980                                         if (atomic_dec_and_test(&b->parent->b.refcnt))
 981                                                 LAFS_BUG(1, b);
 982                                 } else
 983                                         LAFS_BUG(1, b);
 984
 985                                 del_ref(&b->parent->b, MKREF(child),
 986                                         __FILE__, __LINE__);
 987                                 b->parent = NULL;
 988
 989                                 if (test_bit(B_InoIdx, &b->flags)) {
 990                                         /* While an InoIdx has a parent we hold a count on
 991                                          * the dblock.  Now we have dropped one, we must drop the
 992                                          * other
 993                                          */
 994                                         struct datablock *db = LAFSI(b->inode)->dblock;
 995                                         if (&db->b.parent->b != next_parent &&
 996                                             &db->b != next_parent) {
 997                                                 printk("db    = %s\n", strblk(&db->b));
 998                                                 printk("dp->p = %s\n", strblk(&db->b.parent->b));
 999                                                 printk("np    = %s\n", strblk(next_parent));
1000                                                 printk("b     = %s\n", strblk(b));
1001                                         }
1002                                         BUG_ON(&db->b.parent->b != next_parent && &db->b != next_parent);
1003                                         if (atomic_dec_and_test(&next_parent->refcnt))
1004                                                 LAFS_BUG(1, b);
1005                                         next_parent = &db->b;
1006                                         del_ref(&db->b, MKREF(pdblock), __FILE__, __LINE__);
1007                                 }
1008                                 if (test_and_clear_bit(B_SegRef, &b->flags))
1009                                         physref = b->physaddr;
1010
1011                                 LAFS_BUG(test_bit(B_PrimaryRef, &b->flags), b);
1012                                 list_del_init(&b->siblings);
1013
1014                                 if (test_bit(B_Index, &b->flags))
1015                                         list_add(&b->siblings,
1016                                                  &LAFSI(b->inode)->free_index);
1017
1018                                 if (test_and_clear_bit(B_Prealloc, &b->flags) &&
1019                                     b->physaddr == 0)
1020                                         lafs_summary_allocate(fs, b->inode, -1);
1021                                 credits = 0;
1022                                 if (test_and_clear_bit(B_Credit, &b->flags))
1023                                         credits++;
1024                                 if (test_and_clear_bit(B_ICredit, &b->flags))
1025                                         credits++;
1026                                 if (test_and_clear_bit(B_NCredit, &b->flags))
1027                                         credits++;
1028                                 if (test_and_clear_bit(B_NICredit, &b->flags))
1029                                         credits++;
1030                                 if (test_and_clear_bit(B_UnincCredit, &b->flags))
1031                                         credits++;
1032                                 lafs_space_return(fs, credits);
1033                         }
1034                         if (test_bit(B_Index, &b->flags) &&
1035                             !(b->flags & (
1036                                       (1<<B_Pinned) |
1037                                       (1<<B_Uninc) |
1038                                       (1<<B_Root) |
1039                                       (1<<B_Realloc) |
1040                                       (1<<B_Dirty)))
1041                                 ) {
1042                                 if (b->parent != NULL)
1043                                         /* Could this be uninc - where
1044                                          * is refcnt */
1045                                         printk("Problem: %s\n", strblk(b));
1046                                 LAFS_BUG(b->parent != NULL, b);
1047                                 /* put it on the lru */
1048                                 spin_lock(&lafs_hash_lock);
1049                                 if (!test_and_set_bit(B_OnFree, &b->flags)) {
1050                                         LAFS_BUG(!list_empty(&b->lru), b);
1051                                         list_move(&b->lru, &freelist.lru);
1052                                         freelist.freecnt++;
1053                                 }
1054                                 spin_unlock(&lafs_hash_lock);
1055                         }
1056                         /* last reference to a dblock with no page
1057                          * requires special handling
1058                          * The first block on a page must be freed,
1059                          * the other blocks hold a reference on that
1060                          * first block which must be dropped.
1061                          * However it is possible that a new reference was taken
1062                          * via _leafs. If so we have now cleared Pinned so
1063                          * get_flushable will immediately put this again.
1064                          * so we should leave this cleanup to later.
1065                          * Unlike other tests, this one isn't idempotent, so
1066                          * need the check on refcnt
1067                          */
1068                         db = dblk(b);
1069                         if (!test_bit(B_Index, &b->flags) &&
1070                             !PagePrivate(db->page) &&
1071                             atomic_read(&b->refcnt) == 0) {
1072                                 int bits = (PAGE_SHIFT -
1073                                             b->inode->i_blkbits);
1074                                 int mask = (1<<bits) - 1;
1075                                 int bnum = b->fileaddr & mask;
1076
1077                                 LAFS_BUG(next, next);
1078                                 if (bnum) {
1079                                         next = &db[-bnum].b;
1080                                         del_ref(next, "lafs_release_0",
1081                                                 __FILE__, __LINE__);
1082                                 } else {
1083                                         free_me = 1;
1084                                         put_page(db->page);
1085                                 }
1086                         }
1087                         /* last reference to an iblock requires that we
1088                          * deref the dblock.  We don't need to re-check
1089                          * refcnt here as a racing getiref_locked will take an
1090                          * extra dblock reference it.
1091                          */
1092                         if (test_bit(B_InoIdx, &b->flags)) {
1093                                 LAFS_BUG(LAFSI(b->inode)->iblock
1094                                          != iblk(b), b);
1095                                 LAFS_BUG(next, next);
1096                                 next = &LAFSI(b->inode)->dblock->b;
1097                                 del_ref(next, MKREF(iblock),
1098                                         __FILE__, __LINE__);
1099                         }
1100
1101                         /* Free a delayed-release inode */
1102                         if (!test_bit(B_Index, &b->flags) &&
1103                             (myi = rcu_my_inode(dblk(b))) != NULL &&
1104                             (!PagePrivate(dblk(b)->page) ||
1105                              test_bit(I_Destroyed, &LAFSI(myi)->iflags))) {
1106                                 dblk(b)->my_inode = NULL;
1107                                 LAFSI(myi)->dblock = NULL;
1108                                 /* Don't need lafs_hash_lock to clear iblock as
1109                                  * new refs on iblock are only taken while holding
1110                                  * dblock, and we know dblock has no references.
1111                                  * You would need an iget to get a ref on dblock now,
1112                                  * and because I_Destroyed, we know that isn't possible.
1113                                  */
1114                                 LAFSI(myi)->iblock = NULL;
1115                                 spin_unlock(&b->inode->i_data.private_lock);
1116                                 if (test_bit(I_Destroyed, &LAFSI(myi)->iflags))
1117                                         lafs_destroy_inode(myi);
1118                         } else
1119                                 spin_unlock(&b->inode->i_data.private_lock);
1120                 }
1121                 rcu_iput(myi);
1122
1123                 if (physref) {
1124                         lafs_seg_deref(fs, physref, 0);
1125                         physref = 0;
1126                 }
1127
1128                 if (checkpin)
1129                         lafs_inode_checkpin(checkpin);
1130
1131                 if (free_me)
1132                         kfree(dblk(b));
1133                 b = NULL;
1134                 if (next) {
1135                         b = next;
1136                         next = NULL;
1137                 } else if (next_parent) {
1138                         b = next_parent;
1139                         next_parent = NULL;
1140                 }
1141                 dec = 1;
1142         }
1143 }
1144
1145 /*
1146  * create (if it doesn't already exist) the 'iblock' for an inode.
1147  * This is a shadow of the dblock but comes into it's own if/when
1148  * the inode's indexing tree needs to grow.
1149  * Return a counted reference to the index block.
1150  * NOTE: we must hold a reference to ->dblock when we call
1151  * lafs_make_iblock.
1152  */
1153 struct indexblock * __must_check
1154 lafs_make_iblock(struct inode *ino, int adopt, int async, REFARG)
1155 {
1156         struct lafs_inode *lai = LAFSI(ino);
1157         struct indexblock *ib, *new = NULL;
1158         struct fs *fs = fs_from_inode(ino);
1159         int err = 0;
1160
1161         dprintk("MAKE_IBLOCK %d\n", (int)ino->i_ino);
1162
1163         BUG_ON(lai->dblock == NULL);
1164         BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);
1165 retry:
1166         spin_lock(&lafs_hash_lock);
1167         if (lai->iblock)
1168                 ib = getiref_locked_needsync(lai->iblock, REF);
1169         else if (new) {
1170                 lai->iblock = new;
1171                 ib = new;
1172                 new = NULL;
1173                 (void)getdref(lai->dblock, MKREF(iblock));
1174         }
1175         spin_unlock(&lafs_hash_lock);
1176         if (new)
1177                 lafs_iblock_free(new);
1178         if (ib) {
1179                 sync_ref(&ib->b);
1180                 if (adopt) {
1181                         err = setparent(lai->dblock, async);
1182                         if (err == 0)
1183                                 block_adopt(&ib->b, lai->dblock->b.parent);
1184                 }
1185                 if (err == 0)
1186                         return ib;
1187                 putiref(ib, REF);
1188                 return ERR_PTR(err);
1189         }
1190
1191         new = lafs_iblock_alloc(fs, GFP_KERNEL, 0, REF);
1192         if (!new)
1193                 return ERR_PTR(-ENOMEM);
1194
1195         new->b.fileaddr = 0;
1196         new->b.physaddr = lai->dblock->b.physaddr;
1197         if (test_bit(B_PhysValid, &lai->dblock->b.flags))
1198                 set_bit(B_PhysValid, &new->b.flags);
1199         LAFS_BUG(!test_bit(B_Valid, &lai->dblock->b.flags), &lai->dblock->b);
1200         set_bit(B_Valid, &new->b.flags);
1201         new->b.inode = ino;
1202         new->depth = lai->depth;
1203         /* Note: this doesn't get hashed until the index
1204          * tree grows and this block is disconnected from
1205          * the inode.
1206          */
1207         set_bit(B_InoIdx, &new->b.flags);
1208         if (test_bit(B_Root, &lai->dblock->b.flags))
1209                 set_bit(B_Root, &new->b.flags);
1210         new->b.parent = NULL;
1211
1212         goto retry;
1213 }
1214
1215 static u64
1216 leaf_lookup(void *bf, int len, u32 startaddr, u32 target, u32 *nextp)
1217 {
1218         /* buf starts with a 2byte header
1219          * if 1, then 6byte littleending indirect entries.
1220          * if 2, then 12byte extent entries
1221          */
1222         unsigned char *buf = bf;
1223         u64 p;
1224         unsigned char *cp;
1225         int elen;
1226         int hi, lo;
1227         u32 addr, next;
1228
1229         if (nextp)
1230                 *nextp = 0xffffffff;
1231
1232         if (buf[1])
1233                 return 0;
1234         switch (buf[0]) {
1235         default:
1236                 p = 0;
1237                 break;
1238
1239         case IBLK_INDIRECT: /* indirect */
1240                 dprintk("indirect lookup for %lu from %lu, len %d\n",
1241                         (unsigned long)target, (unsigned long)startaddr, len);
1242                 len -= 2;
1243                 buf += 2;
1244
1245                 if (target < startaddr)
1246                         return 0;
1247
1248                 next = target;
1249                 target -= startaddr;
1250
1251                 /* Need both tests as target could be v.large and
1252                  * multiply by 6 could overflow
1253                  */
1254                 if (target > len ||
1255                     target*6 + 6 > len)
1256                         return 0;
1257                 buf += target*6;
1258                 p = decode48(buf);
1259                 if (nextp) {
1260                         /* find the next allocated block */
1261                         u64 p2;
1262
1263                         len -= target*6;
1264                         len -= 6;
1265                         next++;
1266                         *nextp = 0xffffffff;
1267                         while (len >= 6) {
1268                                 p2 = decode48(buf);
1269                                 if (p2) {
1270                                         *nextp = next;
1271                                         break;
1272                                 }
1273                                 len -= 6;
1274                                 next++;
1275                         }
1276                 }
1277                 break;
1278
1279         case IBLK_EXTENT: /* extent */
1280                 /* 12 byte records: 6byte device, 2 byte length, 4
1281                  * byte fileaddr */
1282
1283                 len -= 2;
1284                 buf += 2;
1285                 lo = 0;
1286                 hi = len/12;
1287                 while (hi > lo+1) {
1288                         int mid = (lo+hi)/2;
1289                         cp = buf;
1290                         cp += mid*12 + 6;
1291
1292                         elen = decode16(cp);
1293                         addr = decode32(cp);
1294
1295                         if (elen && addr <= target)
1296                                 lo = mid;
1297                         else
1298                                 hi = mid;
1299                 }
1300
1301                 cp = buf;
1302                 cp += lo*12;
1303                 p = decode48(cp);
1304                 elen = decode16(cp);
1305                 addr = decode32(cp);
1306                 if (target < addr) {
1307                         p = 0;
1308                         cp -= 12;
1309                 } else if (target >= addr + elen)
1310                         p = 0;
1311                 else
1312                         p += target - addr;
1313
1314                 if (nextp) {
1315                         if (lo == len/12)
1316                                 *nextp = 0xffffffff; /* next extent */
1317                         else {
1318                                 u64 p2;
1319                                 p2 = decode48(cp);
1320                                 elen = decode16(cp);
1321                                 addr = decode32(cp);
1322                                 if (elen == 0)
1323                                         *nextp = 0xffffffff; /* no more meaningful
1324                                                           extents*/
1325                                 else
1326                                         *nextp = addr;
1327                         }
1328                 }
1329         }
1330         return p;
1331 }
1332
1333 u32 lafs_leaf_next(struct indexblock *ib, u32 start)
1334 {
1335         /* In this leaf index block, find the first address
1336          * at-or-after 'start'.  If none, return 0xFFFFFFFF
1337          */
1338         char *buf = map_iblock(ib);
1339         int blocksize = ib->b.inode->i_sb->s_blocksize;
1340         struct lafs_inode *li = LAFSI(ib->b.inode);
1341         u32 next = 0xffffffff;
1342         u64 phys;
1343
1344         if (test_bit(B_InoIdx, &ib->b.flags))
1345                 phys = leaf_lookup(buf + li->metadata_size,
1346                                    blocksize - li->metadata_size,
1347                                    ib->b.fileaddr, start, &next);
1348         else
1349                 phys = leaf_lookup(buf, blocksize,
1350                                    ib->b.fileaddr, start, &next);
1351         unmap_iblock(ib, buf);
1352         if (phys)
1353                 return start;
1354         else
1355                 return next;
1356 }
1357
1358 static u64
1359 index_lookup(void *bf, int len, u32 target, u32 *addrp, u32 *nextp)
1360 {
1361         unsigned char *buf = bf;
1362         int lo, hi;
1363         u32 addr;
1364         u64 p;
1365         unsigned char *cp;
1366
1367         if (buf[1] || buf[0]) {
1368                 dprintk("WARNING: not an index block %02x %02x\n",
1369                         buf[0], buf[1]);
1370                 return 0;
1371         }
1372
1373         buf += 2;
1374         len -= 2;
1375
1376         /* 10 byte records: 6 byte Device address, 4 byte file address */
1377         lo = 0; hi = (len/10);
1378         while (hi > lo+1) {
1379                 int mid = (lo+hi)/2;
1380                 cp = buf;
1381                 cp += mid*10;
1382                 p = decode48(cp);
1383                 addr = decode32(cp);
1384                 dprintk("...addr=%lu target=%lu lo=%d mid=%d hi=%d\n",
1385                         (unsigned long)addr, (unsigned  long)target,
1386                         lo, mid, hi);
1387                 if (p && addr <= target)
1388                         lo = mid;
1389                 else
1390                         hi = mid;
1391         }
1392
1393         cp = buf;
1394         cp += lo*10;
1395         p = decode48(cp);
1396         addr = decode32(cp);
1397
1398         if (p == 0)
1399                 /* Nothing in this block */
1400                 return 0;
1401
1402         if (addr > target) {
1403                 /* target is before first address */
1404                 p = 0;
1405                 cp -= 10;
1406         } else
1407                 *addrp = addr;
1408
1409         if (cp+10 <= (unsigned char *) buf + len && nextp) {
1410                 u64 p2 = decode48(cp);
1411                 u32 nxt = decode32(cp);
1412                 if (p2)
1413                         *nextp = nxt; /* address of next index block */
1414         }
1415         return p;
1416 }
1417
1418 int lafs_index_empty(struct indexblock *ib)
1419 {
1420         char *buf = map_iblock(ib);
1421         int blocksize = ib->b.inode->i_sb->s_blocksize;
1422         struct lafs_inode *li = LAFSI(ib->b.inode);
1423         u32 addr;
1424         u64 phys;
1425
1426         if (test_bit(B_InoIdx, &ib->b.flags))
1427                 phys = index_lookup(buf + li->metadata_size,
1428                                     blocksize - li->metadata_size,
1429                                     0xFFFFFFFF, &addr, NULL);
1430         else
1431                 phys = index_lookup(buf, blocksize,
1432                                     0xFFFFFFFF, &addr, NULL);
1433         unmap_iblock(ib, buf);
1434
1435         return phys == 0;
1436 }
1437
1438 static struct indexblock *find_better(struct inode *ino,
1439                                       struct indexblock *ib,
1440                                       u32 addr, u32 *next, REFARG)
1441 {
1442         struct indexblock *rv = ib;
1443         spin_lock(&ino->i_data.private_lock);
1444         list_for_each_entry_continue(ib, &ib->b.parent->children, b.siblings) {
1445                 if (!test_bit(B_PrimaryRef, &ib->b.flags))
1446                         break;
1447                 if (test_bit(B_EmptyIndex, &ib->b.flags))
1448                         continue;
1449                 if (ib->b.fileaddr > addr) {
1450                         if (next && *next > ib->b.fileaddr)
1451                                 *next = ib->b.fileaddr;
1452                         break;
1453                 }
1454                 /* This appears to be better. */
1455                 rv = ib;
1456         }
1457         getref_locked(&rv->b, REF);
1458         spin_unlock(&ino->i_data.private_lock);
1459         return rv;
1460 }
1461
1462 struct indexblock *
1463 lafs_leaf_find(struct inode *inode, u32 addr, int adopt, u32 *next,
1464                int async, REFARG)
1465 {
1466         /* Find the leaf index block which refers to this address (if any do)
1467          * Returns the leaf IOLocked.
1468          * *next will be set to an address that is higher than addr such
1469          * that there are no other leafs before *next.  There might not
1470          * actually be a leaf at *next though.
1471          */
1472         struct indexblock *ib, *ib2;
1473         struct indexblock *hold = NULL;
1474         u64 iphys;
1475         u32 iaddr = 0;
1476         void *buf;
1477         struct lafs_inode *li = LAFSI(inode);
1478         int blocksize = inode->i_sb->s_blocksize;
1479         int err = 0;
1480         int offset;
1481
1482         /* Must own a reference to ->dblock */
1483         BUG_ON(li->dblock == NULL);
1484         BUG_ON(atomic_read(&li->dblock->b.refcnt) == 0);
1485
1486         /* We come back to restart if we needed to unlock to
1487          * perform IO and we cannot be sure the parent didn't change.
1488          * We hold the last seen index block in 'hold' so that if
1489          * no changes happened (the common case) we are certain
1490          * that no IO will be required to get back to where
1491          * we were.
1492          */
1493 restart:
1494         if (next)
1495                 *next = 0xFFFFFFFF;
1496
1497         ib = lafs_make_iblock(inode, adopt, async, REF);
1498         err = PTR_ERR(ib);
1499         if (IS_ERR(ib))
1500                 goto err;
1501         offset = li->metadata_size;
1502
1503         err = -EAGAIN;
1504         if (!async)
1505                 lafs_iolock_block(&ib->b);
1506         else if (!lafs_iolock_block_async(&ib->b))
1507                 goto err_ib;
1508
1509         while (ib->depth > 1) {
1510                 /* internal index block */
1511                 /* NOTE:  index block could be empty, or could have
1512                  * nothing as early as 'addr'.  In that case '0' is
1513                  * returned implying that the block is either in
1514                  * cache, or needs to be synthesised as an empty
1515                  * index block (one level down).
1516                  */
1517                 struct indexblock *better;
1518                 u32 target = addr;
1519
1520         retry:
1521                 buf = map_iblock(ib);
1522                 iaddr = ib->b.fileaddr;
1523                 iphys = index_lookup(buf + offset, blocksize - offset,
1524                                      target, &iaddr,
1525                                      target == addr ? next : NULL);
1526                 unmap_iblock(ib, buf);
1527
1528                 ib2 = lafs_iblock_get(inode, iaddr, ib->depth-1, iphys, REF);
1529
1530                 if (!ib2) {
1531                         err = -ENOMEM;
1532                         lafs_iounlock_block(&ib->b);
1533                         goto err_ib;
1534                 }
1535
1536                 better = find_better(inode, ib2, addr, next, REF);
1537                 putiref(ib2, REF);
1538                 ib2 = better;
1539
1540                 if (!test_bit(B_Valid, &ib2->b.flags)) {
1541                         lafs_iounlock_block(&ib->b);
1542                         err = lafs_load_block(&ib2->b, NULL);
1543                         if (err)
1544                                 goto err_ib2;
1545                         if (async)
1546                                 err = lafs_wait_block_async(&ib2->b);
1547                         else
1548                                 err = lafs_wait_block(&ib2->b);
1549                         if (err)
1550                                 goto err_ib2;
1551
1552                         err = -EAGAIN;
1553                         if (!async)
1554                                 lafs_iolock_block(&ib->b);
1555                         else if (!lafs_iolock_block_async(&ib->b))
1556                                 goto err_ib2;
1557
1558                         /* While we did IO, the parent could have been
1559                          * split, so it is now the wrong parent, or it
1560                          * could have become EmptyIndex, though that is
1561                          * much less likely.  If it did, then it must
1562                          * have had a parent, so the ref we hold means
1563                          * it still has a parent.
1564                          * So if ib->b.parent, then we might need to
1565                          * retry from the top, and holding a ref on ib
1566                          * means that we don't risk live-locking if
1567                          * memory for index blocks is very tight.
1568                          * If there is no parent, then there is
1569                          * no risk that ib changed.
1570                          */
1571                         if (ib->b.parent) {
1572                                 if (hold)
1573                                         putiref(hold, MKREF(hold));
1574                                 hold = getiref(ib, MKREF(hold));
1575                                 putiref(ib2, REF);
1576                                 putiref(ib, REF);
1577                                 goto restart;
1578                         }
1579                 }
1580
1581                 err = -EAGAIN;
1582                 if (!async)
1583                         lafs_iolock_block(&ib2->b);
1584                 else if (!lafs_iolock_block_async(&ib2->b))
1585                         goto err_ib2;
1586
1587                 if (ib2->b.fileaddr != ib->b.fileaddr &&
1588                     test_bit(B_EmptyIndex, &ib2->b.flags)) {
1589                         /* need to look earlier in the parent */
1590                         target = ib2->b.fileaddr - 1;
1591                         lafs_iounlock_block(&ib2->b);
1592                         putiref(ib2, REF);
1593                         goto retry;
1594                 }
1595
1596 #ifdef FIXME
1597                 if (!(ib2->b.flags & B_Linked)) {
1598                         struct block *b2 = peer_find(fs,
1599                                                      inode, iaddr,
1600                                                      depth, iphys);
1601                         if (b2)
1602                                 list_add(&ib2->peers, &b2->peers);
1603                         set_bit(B_Linked, &ib2->b.flags);
1604                 }
1605 #endif
1606                 if (adopt)
1607                         block_adopt(&ib2->b, ib);
1608                 lafs_iounlock_block(&ib->b);
1609                 putiref(ib, REF);
1610                 ib = ib2;
1611                 offset = 0;
1612         }
1613
1614         return ib;
1615
1616 err_ib2:
1617         putiref(ib2, REF);
1618 err_ib:
1619         putiref(ib, REF);
1620 err:
1621         if (hold)
1622                 putiref(hold, MKREF(hold));
1623         return ERR_PTR(err);
1624 }
1625
1626 static int table_find_first(struct uninc *tbl, u32 min, u32 *addr);
1627
1628 static int __lafs_find_next(struct inode *ino, loff_t *addrp)
1629 {
1630         /* Find the next allocated block in inode at or after *addrp
1631          * and set *addrp.
1632          * Return:
1633          *  -1 -  error
1634          *   0 -  there is no block at or after '*addrp'
1635          *   1 -  It is worth checking the block at *addrp.
1636          *   2 -  *addrp has been updated, check again.
1637          */
1638         struct lafs_inode *lai = LAFSI(ino);
1639         struct indexblock *leaf;
1640         struct fs *fs = fs_from_inode(ino);
1641         struct block *b;
1642         u32 nexti, nextd, nextl;
1643         int rv = 0;
1644         int offset;
1645         char *buf;
1646         u64 p;
1647
1648         /* Must hold a reference to ->dblock */
1649         BUG_ON(lai->dblock == NULL);
1650         BUG_ON(atomic_read(&lai->dblock->b.refcnt) == 0);
1651
1652         if (lai->depth == 0) {
1653                 if (*addrp == 0)
1654                         return 1;
1655                 leaf = lafs_make_iblock(ino, NOADOPT, SYNC, MKREF(find_next));
1656                 if (IS_ERR(leaf))
1657                         return PTR_ERR(leaf);
1658                 nextd = 0xFFFFFFFF;
1659         } else {
1660                 leaf = lafs_leaf_find(ino, *addrp, NOADOPT, &nexti, SYNC,
1661                                       MKREF(find_next));
1662                 /* FIXME what if 'leaf' is an error?? */
1663                 offset = 0;
1664                 if (lai->depth == 1)
1665                         offset = lai->metadata_size;
1666                 buf = map_iblock(leaf);
1667                 p = leaf_lookup(buf+offset, fs->blocksize-offset,
1668                                 leaf->b.fileaddr, (u32)*addrp, &nextd);
1669                 unmap_iblock(leaf, buf);
1670                 if (p) {
1671                         lafs_iounlock_block(&leaf->b);
1672                         putiref(leaf, MKREF(find_next));
1673                         if (*addrp == 0xFFFFFFFF) {
1674                                 int j;
1675                                 printk("at %s\n", strblk(&lai->dblock->b));
1676                                 printk("offset = %d depth=%d\n", offset,
1677                                        lai->depth);
1678                                 for (j = 0; j < 16; j++)
1679                                         printk("%04x ", buf[offset+j] & 0xffff);
1680                                 for (j = 0; j < 16; j++)
1681                                         printk("%04x ", buf[1024-16+j] & 0xffff);
1682                                 printk("\n");
1683                         }
1684                         BUG_ON(*addrp == 0xFFFFFFFF);
1685                         return 1;
1686                 }
1687                 if (nextd != 0xFFFFFFFF)
1688                         rv = 1;
1689                 else if (nexti != 0xFFFFFFFF) {
1690                         nextd = nexti;
1691                         rv = 2;
1692                 }
1693         }
1694         /* If rv==2, nextd may not be high enough as it might
1695          * be the address of an EmptyIndex block.  So only
1696          * use it if nothing better is found.
1697          */
1698         if (rv == 2)
1699                 nextl = 0xFFFFFFFF;
1700         else
1701                 nextl = nextd;
1702         list_for_each_entry(b, &leaf->children, siblings)
1703                 if (b->fileaddr >= *addrp &&
1704                     b->fileaddr <= nextl &&
1705                     test_bit(B_Valid, &b->flags)) {
1706                         nextd = nextl = b->fileaddr;
1707                         rv = 1;
1708                 }
1709
1710         if (leaf->uninc_table.pending_cnt) {
1711                 spin_lock(&leaf->b.inode->i_data.private_lock);
1712                 if (table_find_first(&leaf->uninc_table, *addrp, &nextl)) {
1713                         nextd = nextl;
1714                         rv = 1;
1715                 }
1716                 spin_unlock(&leaf->b.inode->i_data.private_lock);
1717         }
1718         lafs_iounlock_block(&leaf->b);
1719         putiref(leaf, MKREF(find_next));
1720         *addrp = nextd;
1721         return rv;
1722 }
1723
1724 /*
1725  * find the first data block at or after *bnump, and
1726  * store the address in *bnump.
1727  * Return 0 if nothing found, -1 on error, or
1728  * 1 if *bnump is a valid data block address
1729  *
1730  * The block could be in the indexing tree, or in
1731  * an uninc_table, or totally unincorporated.
1732  * Blocks that are really part of the file, but are not in
1733  * the indexing tree will still be on a ->children list
1734  * from an index block, and the index block will be very close
1735  * to the indexing tree.
1736  * So once we have found a suitable index block, we need to check
1737  * all it's children.  This could possibly be optimised using
1738  * find_get_pages if that was exported.
1739  */
1740 int lafs_find_next(struct inode *ino, loff_t *bnump)
1741 {
1742         int rv;
1743         /* Need to hold a reference to dblock while calling
1744          * __lafs_find_next
1745          */
1746         struct datablock *db = lafs_inode_dblock(ino, SYNC, MKREF(find_nexti));
1747
1748         if (IS_ERR(db))
1749                 return PTR_ERR(db);
1750         while (1) {
1751                 struct datablock *b;
1752                 int hole;
1753
1754                 rv = __lafs_find_next(ino, bnump);
1755
1756                 if (rv <= 0)
1757                         break;
1758                 if (rv == 2)
1759                         continue;
1760
1761                 b = lafs_get_block(ino, *bnump, NULL, GFP_KERNEL,
1762                                    MKREF(find_next));
1763                 if (!b)
1764                         return -ENOMEM;
1765
1766                 rv = lafs_find_block(b, NOADOPT);
1767                 if (rv) {
1768                         putdref(b, MKREF(find_next));
1769                         return rv;
1770                 }
1771                 hole = (b->b.physaddr == 0 ||
1772                         !test_bit(B_PhysValid, &b->b.flags)) &&
1773                         !test_bit(B_Valid, &b->b.flags);
1774                 if (LAFSI(ino)->depth == 0 &&
1775                     b->b.fileaddr == 0)
1776                         hole = 0; /* first block lives in inode */
1777                 putdref(b, MKREF(find_next));
1778
1779                 rv = 1;
1780                 if (!hole)
1781                         break;
1782                 *bnump = (*bnump)+1;
1783         }
1784         putdref(db, MKREF(find_nexti));
1785         BUG_ON(rv > 0 && *bnump == 0xFFFFFFFF);
1786         return rv;
1787 }
1788
1789 static int table_lookup(struct uninc *tbl, u32 addr, u64 *phys)
1790 {
1791         int i;
1792         for (i = tbl->pending_cnt; i; ) {
1793                 struct addr *a = &tbl->pending_addr[--i];
1794                 if (a->fileaddr <= addr &&
1795                     a->fileaddr + a->cnt > addr) {
1796                         *phys =  a->physaddr + (addr - a->fileaddr);
1797                         return 1;
1798                 }
1799         }
1800         return 0;
1801 }
1802
1803 static int table_find_first(struct uninc *tbl, u32 min, u32 *addr)
1804 {
1805         /* Find the earliest allocated block in tbl which is
1806          * at-or-after 'min' and before '*addr', and store it in
1807          * *addr, returning 1 if something was found.
1808          */
1809         int rv = 0;
1810         int i;
1811
1812         for (i = tbl->pending_cnt ; i ; ) {
1813                 struct addr *a = &tbl->pending_addr[--i];
1814                 if (a->fileaddr >= *addr)
1815                         continue;
1816                 if (a->fileaddr + a->cnt < min)
1817                         continue;
1818                 if (a->physaddr == 0)
1819                         continue;
1820                 if (a->fileaddr >= min)
1821                         *addr = a->fileaddr;
1822                 else
1823                         *addr = min;
1824                 rv = 1;
1825         }
1826         return rv;
1827 }
1828
1829 static void fill_block_zero(struct datablock *db, struct datablock *inob)
1830 {
1831         struct lafs_inode *lai = LAFSI(db->b.inode);
1832         void *baddr = map_dblock(db);
1833         void *iaddr = map_dblock_2(inob);
1834         int offset = lai->metadata_size;
1835         int len = (1<<db->b.inode->i_blkbits) - offset;
1836
1837         memcpy(baddr, iaddr+offset, len);
1838         unmap_dblock_2(inob, iaddr);
1839         memset(baddr+len, 0, (1<<db->b.inode->i_blkbits)-len);
1840         unmap_dblock(db, baddr);
1841         set_bit(B_Valid, &db->b.flags);
1842 }
1843
1844 static int __must_check
1845 find_block(struct datablock *b, int adopt, int async)
1846 {
1847         /* Find where this block is in storage, and
1848          * set b->b.physaddr.
1849          * If the block lives in the inode, physaddr
1850          * gets set to zero, and lafs_load_block works with this.
1851          * Otherwise we walk down the index tree
1852          * Possible errors:
1853          *  -EIO
1854          *  -ENOMEM
1855          */
1856         struct lafs_inode *lai;
1857         struct indexblock *ib;
1858         struct datablock *db;
1859         void *buf;
1860         int  offset;
1861
1862         LAFS_BUG(test_bit(B_Pinned, &b->b.flags) &&
1863                  b->b.parent == NULL,
1864                  &b->b);
1865         if (b->b.inode == NULL)
1866                 return 0;
1867         lai = LAFSI(b->b.inode);
1868
1869         if (lai->depth == 0) {
1870                 /* Data is in the inode if anywhere */
1871                 if (!test_bit(B_PhysValid, &b->b.flags)) {
1872                         b->b.physaddr = 0;
1873                         set_bit(B_PhysValid, &b->b.flags);
1874                 }
1875                 db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));
1876                 if (IS_ERR(db))
1877                         return PTR_ERR(db);
1878                 if (!test_bit(B_Valid, &b->b.flags) && b->b.fileaddr == 0) {
1879                         /* Now is the best time to copy data from inode into
1880                          * datablock, as we know we have a valid inode block
1881                          * now.
1882                          */
1883                         LAFS_BUG(b->b.physaddr != 0, &b->b);
1884                         fill_block_zero(b, db);
1885                 }
1886                 if (adopt) {
1887                         ib = lafs_make_iblock(b->b.inode, adopt, async,
1888                                               MKREF(findblock));
1889                         putdref(db, MKREF(findblock));
1890                         if (IS_ERR(ib))
1891                                 return PTR_ERR(ib);
1892                         block_adopt(&b->b, ib);
1893                         putiref(ib, MKREF(findblock));
1894                 } else
1895                         putdref(db, MKREF(findblock));
1896                 return 0;
1897         }
1898
1899         if (test_bit(B_PhysValid, &b->b.flags)
1900             && (!adopt || b->b.parent || test_bit(B_Root, &b->b.flags))) {
1901                 return 0;
1902         }
1903
1904         db = lafs_inode_dblock(b->b.inode, async, MKREF(findblock));
1905         if (IS_ERR(db))
1906                 return PTR_ERR(db);
1907
1908         /* Ok, there is some indexing information we need to
1909          * look through.  Find the leaf first
1910          */
1911         ib = lafs_leaf_find(b->b.inode, b->b.fileaddr, adopt, NULL, async,
1912                             MKREF(findblock));
1913         putdref(db, MKREF(findblock));
1914
1915         if (IS_ERR(ib)) {
1916                 dprintk("lafs_leaf_find returned error to lafs_find_block\n");
1917                 goto out;
1918         }
1919
1920         if (!test_bit(B_PhysValid, &b->b.flags)
1921             && ib->uninc_table.pending_cnt) {
1922                 spin_lock(&b->b.inode->i_data.private_lock);
1923                 if (table_lookup(&ib->uninc_table, b->b.fileaddr,
1924                                  &b->b.physaddr))
1925                         set_bit(B_PhysValid, &b->b.flags);
1926                 spin_unlock(&b->b.inode->i_data.private_lock);
1927         }
1928         if (!test_bit(B_PhysValid, &b->b.flags)) {
1929                 buf = map_iblock(ib);
1930                 offset = lai->depth > 1 ? 0 : lai->metadata_size;
1931                 if (test_bit(B_InoIdx, &ib->b.flags))
1932                         dprintk("InoIdx!!!\n");
1933                 dprintk("offset %d: %02x %02x\n", offset,
1934                         ((char *)buf)[offset], ((char *)buf)[offset+1]);
1935                 b->b.physaddr = leaf_lookup(buf+offset,
1936                                             b->b.inode->i_sb->s_blocksize
1937                                             - offset,
1938                                             ib->b.fileaddr, b->b.fileaddr,
1939                                             NULL);
1940                 set_bit(B_PhysValid, &b->b.flags);
1941                 unmap_iblock(ib, buf);
1942         }
1943         dprintk("lafs_find_block: lookup for %lu of %lu at %lu found %llu\n",
1944                 (unsigned long)b->b.fileaddr,
1945                 (unsigned long)b->b.inode->i_ino,
1946                 (unsigned long)ib->b.fileaddr,
1947                 b->b.physaddr);
1948         /* FIXME should peer_find go here or in load_block? */
1949
1950         if (adopt)
1951                 block_adopt(&b->b, ib);
1952         lafs_iounlock_block(&ib->b);
1953 out:
1954         if (IS_ERR(ib))
1955                 return PTR_ERR(ib);
1956         if (ib)
1957                 putiref(ib, MKREF(findblock));
1958         return 0;
1959 }
1960
1961 int __must_check
1962 lafs_find_block(struct datablock *b, int adopt)
1963 {
1964         return find_block(b, adopt, SYNC);
1965 }
1966
1967 int __must_check
1968 lafs_find_block_async(struct datablock *b)
1969 {
1970         return find_block(b, ADOPT, ASYNC);
1971 }
1972
1973 /*
1974  * lafs_allocated_block records that a block has been written at a new
1975  * address (or found at an address during roll-forward). This involves
1976  * updating some summary information (quota etc) and making sure
1977  * the change gets recorded in the parent block
1978  * This must be called before B_Dirty or B_Realloc are cleared,
1979  * to ensure that the index block is marked accordingly.
1980  *  B_UnincCredit should still be set, and we clear it.
1981  */
1982
1983 int lafs_add_block_address(struct fs *fs, struct block *blk)
1984 {
1985         struct indexblock *p = blk->parent;
1986         struct addr *a;
1987
1988         spin_lock(&blk->inode->i_data.private_lock);
1989         if (test_bit(B_Index, &blk->flags)) {
1990                 if (!test_and_set_bit(B_Uninc, &blk->flags)) {
1991                         blk->chain = p->uninc;
1992                         LAFS_BUG(p->depth == 1, blk);
1993                         p->uninc = blk;
1994                         getref(blk, MKREF(uninc));
1995                 }
1996                 spin_unlock(&blk->inode->i_data.private_lock);
1997                 return 1;
1998         }
1999
2000         /* same phase and leaf; add the address to uninc_table */
2001
2002         /* the only place we try to merge is at the end
2003          * of the embedded table, and then only for data
2004          * blocks
2005          * It is essential that uninc_table is sorted and,
2006          * in particular, has no overlapping extents.
2007          * If this block threatens problems we incorporate first.
2008          */
2009
2010         LAFS_BUG(!test_bit(B_Dirty, &p->b.flags) &&
2011                  !test_bit(B_Realloc, &p->b.flags), blk);
2012
2013         a = &p->uninc_table.pending_addr
2014                 [p->uninc_table.pending_cnt-1];
2015         if (p->uninc_table.pending_cnt >= 1 &&
2016             a->fileaddr+a->cnt == blk->fileaddr &&
2017             a->physaddr != 0 &&
2018             a->physaddr+a->cnt == blk->physaddr) {
2019                 a->cnt++;
2020                 if (test_and_clear_bit(B_UnincCredit, &blk->flags))
2021                         p->uninc_table.credits++;
2022                 spin_unlock(&blk->inode->i_data.private_lock);
2023                 return 1;
2024         } else if ((p->uninc_table.pending_cnt == 0 ||
2025                     a->fileaddr + a->cnt <= blk->fileaddr) &&
2026                    /* properly ordered, maybe add */
2027                    (p->uninc_table.pending_cnt <
2028                     ARRAY_SIZE(p->uninc_table.pending_addr))) {
2029                 a++;
2030                 a->fileaddr = blk->fileaddr;
2031                 a->physaddr = blk->physaddr;
2032                 a->cnt = 1;
2033                 p->uninc_table.pending_cnt++;
2034                 if (test_and_clear_bit(B_UnincCredit,
2035                                        &blk->flags))
2036                         p->uninc_table.credits++;
2037                 spin_unlock(&blk->inode->i_data.private_lock);
2038
2039                 return 1;
2040         } else {
2041                 spin_unlock(&blk->inode->i_data.private_lock);
2042                 /* We own a ref through blk->parent now, but
2043                  * after lafs_incorporate, we might not
2044                  */
2045                 getiref(p,MKREF(addblock));
2046                 lafs_iolock_written(&p->b);
2047                 lafs_incorporate(fs, p);
2048                 lafs_iounlock_block(&p->b);
2049                 putiref(p,MKREF(addblock));
2050                 return 0;
2051         }
2052 }
2053
2054 int lafs_allocated_block(struct fs *fs, struct block *blk, u64 phys)
2055 {
2056         int ph;
2057         struct indexblock *p;
2058
2059         dprintk("Allocated %s -> %llu\n",  strblk(blk), phys);
2060
2061         /* If phys is 0, I don't need uninc credit.
2062          * Also, Index block that are not near full do not need UnincCredits
2063          */
2064         LAFS_BUG(test_bit(B_InoIdx, &blk->flags), blk);
2065         if (!test_bit(B_Dirty, &blk->flags) &&
2066             !test_bit(B_Realloc, &blk->flags) &&
2067             phys != 0)
2068                 printk("something missing %s\n", strblk(blk));
2069         LAFS_BUG(!test_bit(B_UnincCredit, &blk->flags) && phys &&
2070                  !test_bit(B_Index, &blk->flags), blk);
2071         LAFS_BUG(!test_bit(B_Dirty, &blk->flags) &&
2072                  !test_bit(B_Realloc, &blk->flags) &&
2073                  phys != 0, blk);
2074
2075         LAFS_BUG(!test_bit(B_Writeback, &blk->flags), blk);
2076
2077         if (test_bit(B_Root, &blk->flags)) {
2078                 int i;
2079                 struct lafs_inode *lai;
2080
2081                 LAFS_BUG(test_bit(B_Index, &blk->flags), blk);
2082
2083                 for (i = 0; i < fs->maxsnapshot; i++)
2084                         if (fs->ss[i].root == blk->inode)
2085                                 break;
2086                 LAFS_BUG(i == fs->maxsnapshot, blk);
2087                 blk->physaddr = phys; /* superblock doesn't get
2088                                          counted in summaries */
2089                 LAFS_BUG(test_bit(B_SegRef, &blk->flags), blk);
2090                 set_bit(B_PhysValid, &blk->flags);
2091                 fs->ss[i].root_addr = phys;
2092                 lai = LAFSI(blk->inode);
2093                 if (i == 0 && lai->md.fs.usagetable > 1)
2094                         /* snapshot checkpoint is happening.  Record root
2095                          * address in main fs and in snapshot
2096                          */
2097                         fs->ss[lai->md.fs.usagetable-1].root_addr = phys;
2098
2099                 /* Don't bother to clear the B_UnincCredit in the root.
2100                  * It'll just get set again.  This way there is less churn.
2101                  */
2102                 return 0;
2103         }
2104         if (test_bit(FinalCheckpoint, &fs->fsstate) &&
2105             !test_bit(B_Index, &blk->flags) &&
2106             (LAFSI(blk->inode)->type == TypeSegmentMap ||
2107              LAFSI(blk->inode)->type == TypeQuota)) {
2108                 /* These blocks will be picked up by roll-forward,
2109                  * so don't need to have their address recorded,
2110                  * and doing so would leave the index blocks
2111                  * dirty after the checkpoint.
2112                  * So just return
2113                  */
2114                 if (test_and_clear_bit(B_SegRef, &blk->flags))
2115                         lafs_seg_deref(fs, blk->physaddr, 0);
2116                 blk->physaddr = phys;
2117                 set_bit(B_PhysValid, &blk->flags);
2118                 return 0;
2119         }
2120         p = blk->parent;
2121
2122         LAFS_BUG(!p, blk);
2123
2124         /* We need to work out if this block is in the current phase or
2125          * the next one.  This can only be an issue if a phase change
2126          * (aka checkpoint) is underway. But if it is, we need to
2127          * ensure the right phase is used
2128          */
2129         lafs_checkpoint_lock(fs);
2130         if (test_bit(B_Pinned, &blk->flags))
2131                 ph = !!test_bit(B_Phase1, &blk->flags);
2132         else if (test_bit(B_Index, &blk->flags))
2133                 LAFS_BUG(1, blk); /* Index blocks must always be pinned when dirty (*/
2134         else if (p->uninc_table.pending_cnt) {
2135                 /* attach data block into previous phase as incorporation of
2136                  * this phase is still to happen.
2137                  * This is needed for flush-if-cannot-preallocate to work.
2138                  */
2139                 LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), &p->b);
2140                 ph = !!test_bit(B_Phase1, &p->b.flags);
2141                 lafs_pin_block_ph(blk, ph);
2142         } else {
2143                 lafs_pin_block(blk);
2144                 ph = !!test_bit(B_Phase1, &blk->flags);
2145         }
2146         lafs_checkpoint_unlock(fs);
2147         if (!test_bit(B_PhysValid, &blk->flags))
2148                 blk->physaddr = 0;
2149         lafs_summary_update(fs, blk->inode, blk->physaddr, phys,
2150                             test_bit(B_Index, &blk->flags), ph,
2151                             test_bit(B_SegRef, &blk->flags));
2152         clear_bit(B_Prealloc, &blk->flags);
2153
2154         blk->physaddr = phys;
2155         set_bit(B_PhysValid, &blk->flags);
2156
2157         if (ph != !!test_bit(B_Phase1, &p->b.flags)) {
2158                 /* in the next phase */
2159                 dprintk("uninc next phase\n");
2160                 if (test_and_set_bit(B_Uninc, &blk->flags))
2161                         /* This block was already on the 'unincorporated'
2162                          * list, (it must be changing quickly) so there
2163                          * is nothing else to do.
2164                          */
2165                         return 0;
2166                 getref(blk, MKREF(uninc));
2167
2168                 spin_lock(&blk->inode->i_data.private_lock);
2169                 blk->chain = p->uninc_next;
2170                 p->uninc_next = blk;
2171                 spin_unlock(&blk->inode->i_data.private_lock);
2172                 /* We will try again after the phase change.
2173                  * That will be when we clean B_UnincCredit
2174                  */
2175                 return 0;
2176         }
2177
2178 new_parent:
2179         if (test_bit(B_Dirty, &blk->flags) || phys == 0) {
2180                 if (!test_bit(B_Dirty, &p->b.flags)
2181                     && !test_bit(B_Credit, &p->b.flags)) {
2182                         printk("Oh dear: %s\n", strblk(blk));
2183                         printk(".......: %s\n", strblk(&p->b));
2184                 }
2185                 lafs_dirty_iblock(p);
2186         } else {
2187                 if (!test_bit(B_Valid, &p->b.flags)) {
2188                         printk("Not valid in lafs_allocated_block: %s\n",
2189                                strblk(&p->b));
2190                         printk("blk is %s\n", strblk(blk));
2191                         BUG();
2192                 }
2193                 LAFS_BUG(!test_bit(B_Valid, &p->b.flags), &p->b);
2194                 if (!test_bit(B_Realloc, &p->b.flags)) {
2195                         /* I cannot use B_Credit to fill B_Realloc as that
2196                          * might still be needed for B_Dirty.
2197                          * So if we cannot allocated a new credit,
2198                          * just set the block as 'dirty' now.
2199                          */
2200                         if (lafs_space_alloc(fs, 1, CleanSpace) == 1) {
2201                                 if (test_and_set_bit(B_Realloc, &p->b.flags))
2202                                         lafs_space_return(fs, 1);
2203                         } else
2204                                 lafs_dirty_iblock(p);
2205                 }
2206                 if (!test_and_set_bit(B_UnincCredit, &p->b.flags))
2207                         if (!test_and_clear_bit(B_ICredit, &p->b.flags))
2208                                 LAFS_BUG(1, &p->b); // ICredit should be set before
2209                 //we dirty a block.
2210
2211         }
2212
2213         dprintk("Uninc same phase\n");
2214
2215         BUG_ON(!test_bit(B_Pinned, &blk->parent->b.flags));
2216
2217         if (!lafs_add_block_address(fs, blk)) {
2218                 p = blk->parent;
2219
2220                 LAFS_BUG(!p, blk);
2221                 LAFS_BUG(!test_bit(B_Pinned, &p->b.flags), blk);
2222                 goto new_parent;
2223         }
2224
2225         /* That credit has now been added to the uninc_table and
2226          * thus made available to the parent
2227          */
2228
2229         lafs_refile(&p->b, 0);
2230         return 0;
2231 }