inode.c

   1
   2 /*
   3  * fs/lafs/inode.c
   4  * Copyright (C) 2005-2009
   5  * Neil Brown <neilb@suse.de>
   6  * Released under the GPL, version 2
   7  *
   8  * generic inode handling
   9  *
  10  */
  11
  12 #include        "lafs.h"
  13 #include <linux/random.h>
  14 #include <linux/delay.h>
  15 #include <linux/slab.h>
  16
  17 /* Supporting an async 'iget' - as required by the cleaner -
  18  * is slightly non-trivial.
  19  * iget*_locked will normally wait for any inode with one
  20  * of the flags I_FREEING I_CLEAR I_WILL_FREE I_NEW
  21  * to either be unhashed or has the flag cleared.
  22  * We cannot afford that wait in the cleaner as we could deadlock.
  23  * So we use iget5_locked and provide a test function that fails
  24  * if it finds the inode with any of those flags set.
   25  * If it does see the inode like that it sets the inum that is
   26  * passed in (by reference) to NO_INO so that it knows to continue
   27  * failing (for consistency) and so that the 'set' function
   28  * we provide can know to fail the 'set'.
   29  * The result of this is that if iget finds an inode it would
   30  * have to wait on, the inum is set to NO_INO and NULL is returned.
  31  * An unfortunate side effect is that an inode will be allocated
  32  * and then destroyed to no avail.
  33  * This is avoided by calling ilookup5 first.  This also allows
  34  * us to only allocate/load the data block if there really seems
  35  * to be a need.
  36  */
  37 #define NO_INO (~(ino_t)0)
  38 static int async_itest(struct inode *inode, void *data)
  39 {
  40         ino_t *inump = data;
  41         ino_t inum = *inump;
  42
  43         if (inum == NO_INO)
  44                 /* found and is freeing */
  45                 return 0;
  46         if (inode->i_ino != inum)
  47                 return 0;
  48         if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) {
  49                 *inump = NO_INO;
  50                 return 0;
  51         }
  52         return 1;
  53 }
  54
  55 static int async_iset(struct inode *inode, void *data)
  56 {
  57         ino_t *inump = data;
  58         if (!*inump)
  59                 return -EBUSY;
  60         inode->i_ino = *inump;
  61         return 0;
  62 }
  63
  64 struct inode *
  65 lafs_iget(struct super_block *sb, ino_t inum, int async)
  66 {
  67         /* find, and load if needed, this inum */
  68         struct inode *ino = NULL;
  69         struct inode *oldino;
  70         struct datablock *b = NULL;
  71         struct inode *inodefile;
  72         struct sb_key *k;
  73         int err = 0;
  74
  75         BUG_ON(inum == NO_INO);
  76
  77         k = sb->s_fs_info;
  78         inodefile = k->root;
  79
  80         if (async) {
  81                 /* We cannot afford to block on 'freeing_inode'
  82                  * So use iget5_locked and refuse to match such
  83                  * inodes.
  84                  * If the inode is 'freeing', inum gets set to NO_INO.
  85                  * ilookup5 is used first to avoid an unnecessary
  86                  * alloc/free if the inode is locked in some way.
  87                  */
  88                 while (!ino) {
  89                         ino_t inum2 = inum;
  90                         err = 0;
  91                         ino = ilookup5(sb, inum, async_itest, &inum2);
  92                         if (ino)
  93                                 break;
  94
  95                         if (inum2 == NO_INO)
  96                                 err = -EAGAIN;
  97
  98                         /* For async we will always want the dblock loaded,
  99                          * and we need to load it first as we cannot afford
 100                          * to fail -EAGAIN once we have an I_NEW inode.
 101                          */
 102                         if (!b)
 103                                 b = lafs_get_block(inodefile, inum, NULL,
 104                                                    GFP_NOFS, MKREF(iget));
 105                         if (!b)
 106                                 return ERR_PTR(-ENOMEM);
 107
 108                         if (!err)
 109                                 err = lafs_read_block_async(b);
 110
 111                         if (!err) {
 112                                 /* Have the block, so safe to iget */
 113                                 inum2 = inum;
 114                                 ino = iget5_locked(sb, inum,
 115                                                    async_itest, async_iset,
 116                                                    &inum2);
 117                                 if (!ino) {
 118                                         if (inum2 == NO_INO)
 119                                                 err = -EAGAIN;
 120                                         else
 121                                                 err = -ENOMEM;
 122                                 }
 123                         }
 124                         if (err) {
 125                                 if (test_and_set_bit(B_Async, &b->b.flags)) {
 126                                         putdref(b, MKREF(iget));
 127                                         return ERR_PTR(err);
 128                                 }
 129                                 getdref(b, MKREF(async));
 130                         }
 131                 }
 132         } else
 133                 ino = iget_locked(sb, inum);
 134
 135         if (!ino) {
 136                 putdref(b, MKREF(iget));
 137                 return ERR_PTR(-ENOMEM);
 138         }
 139
 140         if (!(ino->i_state & I_NEW)) {
 141                 putdref(b, MKREF(iget));
 142                 if (ino->i_mode)
 143                         return ino;
 144                 iput(ino);
 145                 return ERR_PTR(-ENOENT);
 146         }
 147
 148         /* Need to load block 'inum' from an inode file...
 149          */
 150         if (!b) {
 151                 b = lafs_get_block(inodefile, inum, NULL, GFP_KERNEL, MKREF(iget));
 152                 if (!b)
 153                         err = -ENOMEM;
 154                 else
 155                         err = lafs_read_block(b);
 156         }
 157         if (err)
 158                 goto err;
 159
 160         oldino = rcu_my_inode(b);
 161         if (oldino) {
 162                 /* The inode is new, but the block thinks it has an
 163                  * old inode, so we must be in the process of destroying
 164                  * the old one.
 165                  * So fail the lookup without even looking at the content
 166                  * of the block (Which might not be clear yet).
 167                  */
 168                 spin_lock(&oldino->i_data.private_lock);
 169                 if (!test_bit(I_Deleting, &LAFSI(oldino)->iflags)) {
 170                         b->my_inode = NULL;
 171                         LAFSI(oldino)->dblock = NULL;
 172                         LAFSI(oldino)->iblock = NULL;
 173                 }
 174                 spin_unlock(&oldino->i_data.private_lock);
 175         }
 176         rcu_iput(oldino);
 177         if (b->my_inode) {
 178                 err = -ENOENT;
 179                 goto err;
 180         }
 181
 182         err = lafs_import_inode(ino, b);
 183         if (err) {
 184                 if (err != -ENOENT)
 185                         printk("lafs_import_inode failed %d\n", err);
 186                 goto err;
 187         }
 188         unlock_new_inode(ino);
 189 out:
 190         if (b && test_and_clear_bit(B_Async, &b->b.flags)) {
 191                 putdref(b, MKREF(async));
 192                 lafs_wake_thread(fs_from_sb(sb));
 193         }
 194         putdref(b, MKREF(iget));
 195         return ino;
 196 err:
 197         ino->i_nlink = 0;
 198         unlock_new_inode(ino);
 199         iput(ino);
 200         ino = ERR_PTR(err);
 201         goto out;
 202 }
 203
 204 struct inode *
 205 lafs_iget_fs(struct fs *fs, int fsnum, int inum, int async)
 206 {
 207         struct super_block *sb;
 208         struct inode *rv;
 209
 210         sb = fs->prime_sb;
 211
 212         if (fsnum) {
 213                 /* Need to locate or load the superblock for this
 214                  * subordinate filesystem
 215                  */
 216                 struct inode *filesys;
 217                 struct super_block *sb2;
 218
 219                 filesys = lafs_iget(sb, fsnum, async);
 220                 if (IS_ERR(filesys))
 221                         return filesys;
 222                 if (LAFSI(filesys)->type != TypeInodeFile) {
 223                         iput(filesys);
 224                         return ERR_PTR(-ENOENT);
 225                 }
 226                 /* FIXME can get_subset_sb be async at all?? */
 227                 sb2 = lafs_get_subset_sb(filesys);
 228                 if (IS_ERR(sb2)) {
 229                         iput(filesys);
 230                         return ERR_PTR(PTR_ERR(sb2));
 231                 }
 232                 rv = lafs_iget(sb2, inum, async);
 233                 if (IS_ERR(rv))
 234                         deactivate_locked_super(sb2);
 235                 else
 236                         up_write(&sb2->s_umount);
 237         } else {
 238                 rv = lafs_iget(sb, inum, async);
 239                 atomic_inc(&sb->s_active);
 240         }
 241         return rv;
 242 }
 243
 244 int __must_check
 245 lafs_import_inode(struct inode *ino, struct datablock *b)
 246 {
 247         struct la_inode *lai = map_dblock(b);
 248         struct lafs_inode *li = LAFSI(ino);
 249         int err = -ENOENT;
 250
 251         if (lai->filetype == 0) {
 252                 li->type = 0;
 253                 ino->i_mode = 0;
 254                 ino->i_nlink = 0;
 255                 goto out;
 256         }
 257
 258         ino->i_mode = S_IFREG;
 259         ino->i_nlink = 1; /* For special file, set nlink so they
 260                            * never appear unlinked */
 261
 262         err = -EINVAL;
 263
 264         LAFS_BUG(ino->i_ino != b->b.fileaddr, &b->b);
 265         li->cblocks = le32_to_cpu(lai->data_blocks);
 266         li->pblocks = li->ablocks = 0;
 267         li->vfs_inode.i_blocks = ((blkcnt_t)li->cblocks
 268                                   << (ino->i_sb->s_blocksize_bits - 9));
 269         li->ciblocks = le32_to_cpu(lai->index_blocks);
 270         li->piblocks = 0;
 271         li->iflags = 0;
 272
 273         ino->i_generation = le16_to_cpu(lai->generation);
 274         li->trunc_gen = lai->trunc_gen;
 275         li->flags = lai->flags;
 276         li->type = lai->filetype;
 277         li->metadata_size = le16_to_cpu(lai->metadata_size);
 278         li->depth = lai->depth;
 279
 280         dprintk("inode %lu type is %d\n", (unsigned long)ino->i_ino, li->type);
 281
 282         ino->i_data.a_ops = &lafs_file_aops;
 283         li->trunc_next = 0;
 284
 285         switch (li->type) {
 286         case TypeInodeFile:
 287         {
 288                 struct fs_md *i = &li->md.fs;
 289                 struct fs_metadata *l = &lai->metadata[0].fs;
 290                 int nlen;
 291
 292                 i->usagetable = le16_to_cpu(l->snapshot_usage_table);
 293                 decode_time(&ino->i_mtime, le64_to_cpu(l->update_time));
 294                 i->cblocks_used = le64_to_cpu(l->blocks_used);
 295                 i->pblocks_used = i->ablocks_used = 0;
 296                 i->blocks_allowed = le64_to_cpu(l->blocks_allowed);
 297                 i->blocks_unalloc = 0;
 298                 i->creation_age = le64_to_cpu(l->creation_age);
 299                 i->inodes_used = le32_to_cpu(l->inodes_used);
 300                 i->quota_inums[0] = le32_to_cpu(l->quota_inodes[0]);
 301                 i->quota_inums[1] = le32_to_cpu(l->quota_inodes[1]);
 302                 i->quota_inums[2] = le32_to_cpu(l->quota_inodes[2]);
 303                 i->quota_inodes[0] = i->quota_inodes[1]
 304                         = i->quota_inodes[2] = NULL;
 305                 nlen = li->metadata_size - offsetof(struct la_inode,
 306                                                     metadata[0].fs.name);
 307                 if (i->name)
 308                         kfree(i->name);
 309                 if (nlen == 0)
 310                         i->name = NULL;
 311                 else {
 312                         /* Need to unmap the dblock to kmalloc because
 313                          * the mapping makes us 'atomic'
 314                          */
 315                         unmap_dblock(b, lai);
 316                         i->name = kmalloc(nlen+1, GFP_KERNEL);
 317                         lai = map_dblock(b);
 318                         l = &lai->metadata[0].fs;
 319
 320                         err = -ENOMEM;
 321                         if (!i->name)
 322                                 goto out;
 323                         memcpy(i->name, l->name, nlen);
 324                         i->name[nlen] = 0;
 325                 }
 326                 /* Make this look like a directory */
 327                 ino->i_mode = S_IFDIR;
 328                 ino->i_uid = 0;
 329                 ino->i_gid = 0;
 330                 ino->i_size = 0;
 331                 ino->i_op = &lafs_subset_ino_operations;
 332                 ino->i_fop = &lafs_subset_file_operations;
 333                 break;
 334         }
 335
 336         case TypeInodeMap:
 337         {
 338                 struct inodemap_md *m = &li->md.inodemap;
 339                 struct inodemap_metadata *s = &lai->metadata[0].inodemap;
 340                 m->size = le32_to_cpu(s->size);
 341                 m->thisblock = NoBlock;
 342                 m->nextbit = 0;
 343                 break;
 344         }
 345
 346         case TypeSegmentMap:
 347         {
 348                 struct su_md *m = &li->md.segmentusage;
 349                 struct su_metadata *s = &lai->metadata[0].segmentusage;
 350                 m->table_size = le32_to_cpu(s->table_size);
 351                 break;
 352         }
 353
 354         case TypeQuota:
 355         {
 356                 struct quota_md *m = &li->md.quota;
 357                 struct quota_metadata *s = &lai->metadata[0].quota;
 358                 m->gracetime = le32_to_cpu(s->gracetime);
 359                 m->graceunits = le32_to_cpu(s->graceunits);
 360                 break;
 361         }
 362         case TypeOrphanList:
 363         {
 364                 struct orphan_md *m = &li->md.orphan;
 365                 /* This will be set via lafs_count_orphans */
 366                 m->nextfree = 0;
 367                 m->reserved = 0;
 368                 break;
 369         }
 370         case TypeAccessTime:
 371                 break;
 372
 373         default: /* TypeBase or larger */
 374         {
 375                 struct file_md *i = &li->md.file;
 376                 struct file_metadata *l = &lai->metadata[0].file;
 377                 struct dir_metadata *d = &lai->metadata[0].dir;
 378                 struct special_metadata *s = &lai->metadata[0].special;
 379
 380                 if (li->type < TypeBase)
 381                         goto out;
 382                 i->flags = le16_to_cpu(l->flags);
 383                 ino->i_mode = le16_to_cpu(l->mode);
 384                 ino->i_uid = le32_to_cpu(l->userid);
 385                 ino->i_gid = le32_to_cpu(l->groupid);
 386                 i->treeid = le32_to_cpu(l->treeid);
 387                 i->creationtime = le64_to_cpu(l->creationtime);
 388                 decode_time(&ino->i_mtime, le64_to_cpu(l->modifytime));
 389                 decode_time(&ino->i_ctime, le64_to_cpu(l->ctime));
 390                 decode_time(&i->i_accesstime, le64_to_cpu(l->accesstime));
 391                 ino->i_atime = i->i_accesstime; /* FIXME load from
 392                                                  * accesstime file */
 393                 ino->i_size = le64_to_cpu(l->size);
 394                 i->parent = le32_to_cpu(l->parent);
 395                 ino->i_nlink = le32_to_cpu(l->linkcount);
 396                 if (ino->i_nlink == 0 && list_empty(&b->orphans) &&
 397                     fs_from_inode(ino)->rolled) {
 398                         /* This block should already be on the orphan
 399                          * list, otherwise there is a filesystem
 400                          * inconsistency.
 401                          * Either the orphan file is wrong, or the
 402                          * linkcount is wrong.
 403                          * It is safest to assume the later - either
 404                          * way an FS check would be needed to fix it.
 405                          * Note: while roll-forward is happening, this
 406                          * situation is perfectly possible and is handled
 407                          * correctly.
 408                          */
 409                         /* FIXME set a superblock flag requesting
 410                          * directory linkage checking
 411                          */
 412                         ino->i_nlink = 1;
 413                 }
 414
 415                 dprintk("  mode = 0%o uid %d size %lld\n",
 416                         ino->i_mode, ino->i_uid, ino->i_size);
 417                 switch (li->type) {
 418                 case TypeFile:
 419                         ino->i_op = &lafs_file_ino_operations;
 420                         ino->i_fop = &lafs_file_file_operations;
 421                         ino->i_mode = (ino->i_mode & 07777)  | S_IFREG;
 422                         break;
 423                 case TypeDir:
 424                         i->seed = le32_to_cpu(d->hash_seed);
 425                         ino->i_op = &lafs_dir_ino_operations;
 426                         ino->i_fop = &lafs_dir_file_operations;
 427                         ino->i_mode = (ino->i_mode & 07777)  | S_IFDIR;
 428                         {
 429                                 u32 *b = (u32 *)lai;
 430                                 dprintk("Hmm. %d %d %d\n",
 431                                         (int)b[24],
 432                                         (int)b[25],
 433                                         (int)b[26]);
 434                         }
 435                         break;
 436                 case TypeSymlink:
 437                         ino->i_op = &lafs_link_ino_operations;
 438                         ino->i_mode = (ino->i_mode & 07777)  | S_IFLNK;
 439                         break;
 440                 case TypeSpecial:
 441                         /* the data had better be in the inode ... */
 442                         ino->i_rdev = MKDEV(le32_to_cpu(s->major),
 443                                             le32_to_cpu(s->minor));
 444                         ino->i_op = &lafs_special_ino_operations;
 445                         init_special_inode(ino, ino->i_mode, ino->i_rdev);
 446                         break;
 447                 }
 448                 break;
 449         }
 450         }
 451
 452         ino->i_blkbits = ino->i_sb->s_blocksize_bits;
 453         /* FIXME i_blocks and i_byte - used for quota?? */
 454         err = 0;
 455
 456         /* Note: no refcount yet.  Either will remove the reference to the
 457          * other when freed
 458          */
 459         li->dblock = b;
 460         rcu_assign_pointer(b->my_inode, ino);
 461
 462 out:
 463         if (err && li->type)
 464                 printk("inode %lu type is %d\n",
 465                        (unsigned long)ino->i_ino, li->type);
 466         unmap_dblock(b, lai);
 467         return err;
 468 }
 469
 470 void lafs_inode_checkpin(struct inode *ino)
 471 {
 472         /* Make sure I_Pinned is set correctly.
 473          * It should be set precisely if i_nlink is non-zero,
 474          * and ->iblock is B_Pinned.
 475          * When it is set, we own a reference to the inode.
 476          *
 477          * This needs to be called whenever we change
 478          * i_nlink, and whenever we pin or unpin an InoIdx
 479          * block.
 480          */
 481         if (ino->i_nlink == 0) {
 482                 /* I_Pinned should not be set */
 483                 if (test_and_clear_bit(I_Pinned, &LAFSI(ino)->iflags))
 484                         lafs_iput_fs(ino);
 485         } else {
 486                 /* Need to check if iblock is Pinned. */
 487                 struct indexblock *ib = NULL;
 488                 if (LAFSI(ino)->iblock) {
 489                         spin_lock(&ino->i_data.private_lock);
 490                         ib = LAFSI(ino)->iblock;
 491                         if (ib && !test_bit(B_Pinned, &ib->b.flags))
 492                                 ib = NULL;
 493                         spin_unlock(&ino->i_data.private_lock);
 494                 }
 495                 if (ib) {
 496                         if (!test_and_set_bit(I_Pinned, &LAFSI(ino)->iflags))
 497                                 lafs_igrab_fs(ino);
 498                 } else {
 499                         if (test_and_clear_bit(I_Pinned, &LAFSI(ino)->iflags))
 500                                 lafs_iput_fs(ino);
 501                 }
 502         }
 503 }
 504
 505 struct datablock *lafs_inode_get_dblock(struct inode *ino, REFARG)
 506 {
 507         struct datablock *db;
 508
 509         spin_lock(&ino->i_data.private_lock);
 510         db = LAFSI(ino)->dblock;
 511         if (db) {
 512                 if (db->b.inode == ino)
 513                         getdref_locked(db, REF);
 514                 else {
 515                         spin_lock_nested(&db->b.inode->i_data.private_lock, 1);
 516                         getdref_locked(db, REF);
 517                         spin_unlock(&db->b.inode->i_data.private_lock);
 518                 }
 519         }
 520         spin_unlock(&ino->i_data.private_lock);
 521         return db;
 522 }
 523
 524 struct datablock *lafs_inode_dblock(struct inode *ino, int async, REFARG)
 525 {
 526         struct datablock *db;
 527         int err;
 528
 529         db = lafs_inode_get_dblock(ino, REF);
 530
 531         if (!db)
 532                 db = lafs_get_block(ino_from_sb(ino->i_sb), ino->i_ino, NULL,
 533                                     GFP_KERNEL, REF);
 534         if (!db)
 535                 return ERR_PTR(-ENOMEM);
 536
 537         LAFSI(ino)->dblock = db;
 538         rcu_assign_pointer(db->my_inode, ino);
 539         if (async)
 540                 err = lafs_read_block_async(db);
 541         else
 542                 err = lafs_read_block(db);
 543         if (err == 0)
 544                 return db;
 545
 546         putdref(db, REF);
 547         return ERR_PTR(err);
 548 }
 549
 550 void lafs_inode_init(struct datablock *b, int type, int mode, struct inode *dir)
 551 {
 552         /* A new block has been allocated in an inode file to hold an
 553          * inode.  We get to fill in initial values so that when
 554          * 'iget' calls lafs_import_inode, the correct inode is
 555          * loaded.
 556          */
 557         struct fs *fs = fs_from_inode(b->b.inode);
 558         struct la_inode *lai = map_dblock(b);
 559         int size;
 560
 561         lai->data_blocks = cpu_to_le32(0);
 562         lai->index_blocks = cpu_to_le32(0);
 563         get_random_bytes(&lai->generation, sizeof(lai->generation));
 564         lai->depth = 1;
 565         lai->trunc_gen = 0;
 566         lai->filetype = type;
 567         lai->flags = 0;
 568
 569         switch(type) {
 570         case TypeInodeFile:
 571         {
 572                 struct fs_metadata *l = &lai->metadata[0].fs;
 573                 size = sizeof(struct fs_metadata);
 574                 l->update_time = 0;
 575                 l->blocks_used = 0;
 576                 l->blocks_allowed = 0;
 577                 l->creation_age = fs->wc[0].cluster_seq;
 578                 l->inodes_used = 0;
 579                 l->quota_inodes[0] = 0;
 580                 l->quota_inodes[1] = 0;
 581                 l->quota_inodes[2] = 0;
 582                 l->snapshot_usage_table = 0;
 583                 l->pad = 0;
 584                 /* name will be zero length and not used */
 585                 break;
 586         }
 587         case TypeInodeMap:
 588         {
 589                 struct inodemap_metadata *l = &lai->metadata[0].inodemap;
 590                 l->size = 0;
 591                 size = sizeof(struct inodemap_metadata);
 592                 break;
 593         }
 594         case TypeSegmentMap:
 595                 size = sizeof(struct su_metadata);
 596                 break;
 597         case TypeQuota:
 598                 size = sizeof(struct quota_metadata);
 599                 break;
 600         case TypeOrphanList:
 601                 size = 0;
 602                 break;
 603         case TypeAccessTime:
 604                 size = 0;
 605                 break;
 606         default:
 607         {
 608                 struct file_metadata *l = &lai->metadata[0].file;
 609                 struct timespec now = CURRENT_TIME;
 610
 611                 l->flags = cpu_to_le16(0);
 612                 l->userid = cpu_to_le32(current->cred->fsuid);
 613                 if (dir && (dir->i_mode & S_ISGID)) {
 614                         l->groupid = cpu_to_le32(dir->i_gid);
 615                         if (type == TypeDir)
 616                                 mode |= S_ISGID;
 617                 } else
 618                         l->groupid = cpu_to_le32(current->cred->fsgid);
 619                 if (dir && LAFSI(dir)->md.file.treeid)
 620                         l->treeid = cpu_to_le32(LAFSI(dir)->md.file.treeid);
 621                 else
 622                         l->treeid = l->userid;
 623
 624                 l->mode = cpu_to_le16(mode);
 625                 l->creationtime = encode_time(&now);
 626                 l->modifytime = l->creationtime;
 627                 l->ctime = l->creationtime;
 628                 l->accesstime = l->creationtime;
 629                 l->size = 0;
 630                 l->parent = dir ? cpu_to_le32(dir->i_ino) : 0;
 631                 l->linkcount = 0;
 632                 l->attrinode = 0;
 633                 if (type == TypeDir) {
 634                         struct dir_metadata *l = &lai->metadata[0].dir;
 635                         u32 seed;
 636                         get_random_bytes(&seed,
 637                                          sizeof(seed));
 638                         seed = (seed & ~7) | 1;
 639                         l->hash_seed = cpu_to_le32(seed);
 640                         size = sizeof(struct dir_metadata);
 641                 } else if (type == TypeSpecial) {
 642                         struct special_metadata *s = &lai->metadata[0].special;
 643                         s->major = s->minor = 0;
 644                         size = sizeof(struct special_metadata);
 645                 } else
 646                         size = sizeof(struct file_metadata);
 647         }
 648         }
 649         size += sizeof(struct la_inode);
 650         lai->metadata_size = cpu_to_le32(size);
 651         memset(((char *)lai)+size, 0, fs->blocksize-size);
 652         *(u16 *)(((char *)lai)+size) = cpu_to_le16(IBLK_EXTENT);
 653
 654         unmap_dblock(b, lai);
 655         set_bit(B_Valid, &b->b.flags);
 656         LAFS_BUG(!test_bit(B_Pinned, &b->b.flags), &b->b);
 657         lafs_dirty_dblock(b);
 658 }
 659
 660 void lafs_clear_inode(struct inode *ino)
 661 {
 662         struct lafs_inode *li = LAFSI(ino);
 663         dprintk("CLEAR INODE %d\n", (int)ino->i_ino);
 664
 665         li->type = 0;
 666
 667         /* Now is a good time to break the linkage between
 668          * inode and dblock - but not if the file is
 669          * being deleted
 670          */
 671         if (!test_bit(I_Deleting, &LAFSI(ino)->iflags)) {
 672                 struct datablock *db;
 673                 spin_lock(&ino->i_data.private_lock);
 674                 db = LAFSI(ino)->dblock;
 675                 if (db) {
 676                         struct indexblock *ib = LAFSI(ino)->iblock;
 677                         LAFS_BUG(ib && atomic_read(&ib->b.refcnt), &db->b);
 678                         db->my_inode = NULL;
 679                         LAFSI(ino)->dblock = NULL;
 680                         LAFSI(ino)->iblock = NULL;
 681                 }
 682                 spin_unlock(&ino->i_data.private_lock);
 683         }
 684
 685         /* FIXME release quota inodes if filesystem */
 686 }
 687
 688 static int inode_map_free(struct fs *fs, struct super_block *sb, u32 inum);
 689
 690 void lafs_delete_inode(struct inode *ino)
 691 {
 692         struct fs *fs = fs_from_inode(ino);
 693         struct datablock *b;
 694
 695         if (ino->i_mode == 0) {
 696                 /* There never was an inode here,
 697                  * so nothing to do.
 698                  */
 699                 clear_inode(ino);
 700                 return;
 701         }
 702         dprintk("DELETE INODE %d\n", (int)ino->i_ino);
 703
 704         /* Normal truncation holds an igrab, so we cannot be
 705          * deleted until any truncation finishes
 706          */
 707         BUG_ON(test_bit(I_Trunc, &LAFSI(ino)->iflags));
 708
 709         b = lafs_inode_dblock(ino, SYNC, MKREF(delete_inode));
 710
 711         i_size_write(ino, 0);
 712         truncate_inode_pages(&ino->i_data, 0);
 713         LAFSI(ino)->trunc_next = 0;
 714         set_bit(I_Deleting, &LAFSI(ino)->iflags);
 715         set_bit(I_Trunc, &LAFSI(ino)->iflags);
 716         lafs_igrab_fs(ino);
 717
 718         if (!IS_ERR(b)) {
 719                 set_bit(B_Claimed, &b->b.flags);
 720                 lafs_add_orphan(fs, b);
 721                 dprintk("PUNCH hole for %d\n", (int)b->b.fileaddr);
 722                 putdref(b, MKREF(delete_inode));
 723         }
 724         inode_map_free(fs, ino->i_sb,  ino->i_ino);
 725
 726         clear_inode(ino);
 727 }
 728
 729 static int prune(void *data, u32 addr, u64 paddr, int len)
 730 {
 731         /* This whole index block is being pruned, just account
 732          * for everything and it will be cleared afterwards
 733          */
 734         struct indexblock *ib = data;
 735         struct inode *ino = ib->b.inode;
 736         struct fs *fs = fs_from_inode(ino);
 737         int ph = !!test_bit(B_Phase1, &ib->b.flags);
 738         int i;
 739         dprintk("PRUNE %d for %d at %lld\n", addr, len, (long long)paddr);
 740         if (paddr == 0 || len == 0)
 741                 return 0;
 742         for (i = 0 ; i < len ; i++)
 743                 lafs_summary_update(fs, ino, paddr+i, 0, 0, ph, 0);
 744         return len;
 745 }
 746
 747 static int prune_some(void *data, u32 addr, u64 paddr, int len)
 748 {
 749         /* Leaf-walk callback used when only part of an index block is
 750          * being pruned.  Each address at or beyond trunc_next is copied
 751          * into uninc_table with physaddr 0 so that it can be
 752          * 'incorporated'.  Returns how many addresses were consumed,
 753          * which is short of 'len' if uninc_table fills up.
 754          * We should probably share code with lafs_allocated_block?? */
 755         struct indexblock *ib = data;
 756         struct inode *ino = ib->b.inode;
 757         struct fs *fs = fs_from_inode(ino);
 758         int ph = !!test_bit(B_Phase1, &ib->b.flags);
 759         int i;
 760
 761         if (paddr == 0 || len == 0)
 762                 return 0;
 763         dprintk("PRUNE2 %d for %d at %lld\n", addr, len, (long long)paddr);
 764         for (i = 0 ; i < len ; i++) {
 765                 /* FIXME should allow longer truncation ranges in uninc_table
 766                  * as they are easy to handle.
 767                  */
 768                 struct addr *a;
 769                 if (addr + i < LAFSI(ino)->trunc_next)
 770                         continue;
 771                 spin_lock(&ino->i_data.private_lock);
 772                 if (ib->uninc_table.pending_cnt >=
 773                     ARRAY_SIZE(ib->uninc_table.pending_addr)) {
 774                         spin_unlock(&ino->i_data.private_lock);
 775                         break;
 776                 }
 777                 /* Index the free slot directly: pending_addr[pending_cnt - 1]
 778                  * would lie before the array while the table is empty. */
 779                 a = &ib->uninc_table.pending_addr
 780                         [ib->uninc_table.pending_cnt];
 781                 a->fileaddr = addr + i;
 782                 a->physaddr = 0;
 783                 a->cnt = 1;
 784                 LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
 785                 ib->uninc_table.pending_cnt++;
 786                 spin_unlock(&ino->i_data.private_lock);
 787                 lafs_summary_update(fs, ino, paddr+i, 0, 0, ph, 0);
 788         }
 789         return i;
 790 }
 791
 792 int lafs_inode_handle_orphan(struct datablock *b)
 793 {
 794         /* Perform one step of truncating or deleting the inode behind 'b'.
 795          * No rcu protection is needed for my_inode: run_orphan holds a
 796          * reference. */
 797         struct indexblock *ib, *ib2;
 798         struct inode *ino = b->my_inode;
 799         struct fs *fs = fs_from_inode(ino);
 800         u32 trunc_next, next_trunc;
 801         int loop_cnt = 20; /* NOTE(review): a local, so it restarts at 20 on every call */
 802         int err = -ENOMEM;
 803
 804         if (!test_bit(I_Trunc, &LAFSI(ino)->iflags)) { /* no truncate pending */
 805                 if (test_bit(I_Deleting, &LAFSI(ino)->iflags)) {
 806                         LAFS_BUG(ino->i_nlink, &b->b);
 807                         if (LAFSI(ino)->cblocks +
 808                             LAFSI(ino)->pblocks +
 809                             LAFSI(ino)->ablocks +
 810                             LAFSI(ino)->ciblocks +
 811                             LAFSI(ino)->piblocks) /* log the counts that trip the BUG_ON below */
 812                         printk("Deleting inode %lu: %ld+%ld+%ld %ld+%ld\n",
 813                                ino->i_ino,
 814                                LAFSI(ino)->cblocks,
 815                                LAFSI(ino)->pblocks,
 816                                LAFSI(ino)->ablocks,
 817                                LAFSI(ino)->ciblocks,
 818                                LAFSI(ino)->piblocks);
 819                         BUG_ON(LAFSI(ino)->cblocks +
 820                                LAFSI(ino)->pblocks +
 821                                LAFSI(ino)->ablocks +
 822                                LAFSI(ino)->ciblocks +
 823                                LAFSI(ino)->piblocks);
 824                         if (lafs_erase_dblock_async(b))
 825                                 lafs_orphan_release(fs, b);
 826                 } else if (ino->i_nlink || LAFSI(ino)->type == 0)
 827                         lafs_orphan_release(fs, b);
 828                 else
 829                         lafs_orphan_forget(fs, b);
 830                 return 0;
 831         }
 832
 833         ib = lafs_make_iblock(ino, ADOPT, SYNC, MKREF(inode_handle_orphan));
 834         if (IS_ERR(ib))
 835                 return PTR_ERR(ib);
 836
 837         /* Here is the guts of 'truncate'.  We find the next leaf index
 838          * block and discard all the addresses therein.
 839          */
 840         trunc_next = LAFSI(ino)->trunc_next;
 841
 842         if (trunc_next == 0xFFFFFFFF) {
 843                 /* truncate has finished in that all data blocks
 844                  * have been removed and all index blocks are either
 845                  * gone or pending incorporation at which point they will
 846                  * go.
 847                  * If we hit a phase change, we will need to postpone
 848                  * the rest of the cleaning until it completes.
 849                  * If there is a checkpoint happening, then all the work
 850                  * that we can do now, it will do for us.  So just
 851                  * let it.
 852                  */
 853                 struct indexblock *tmp;
 854                 struct indexblock *next;
 855                 u32 lastaddr;
 856
 857                 if (!test_bit(B_Pinned, &ib->b.flags)) {
 858                         /* must be finished */
 859                         LAFS_BUG(test_bit(B_Dirty, &ib->b.flags), &ib->b);
 860                         clear_bit(I_Trunc, &LAFSI(ino)->iflags);
 861                         lafs_iput_fs(ino);
 862                         wake_up(&fs->trunc_wait);
 863                         err = -ERESTARTSYS;
 864                         goto out2;
 865                 }
 866                 if (fs->checkpointing) {
 867                         /* This cannot happen with current code,
 868                          * but leave it in case we ever have
 869                          * orphan handling parallel with checkpoints
 870                          */
 871                         err = -EBUSY; /* Try again after the checkpoint */
 872                         goto out2;
 873                 }
 874
 875                 lastaddr = (i_size_read(ino) +
 876                             fs->blocksize - 1)
 877                         >> fs->blocksize_bits;
 878                 /* Find a Pinned descendant of ib which has no
 879                  * Pinned descendants and no PrimaryRef dependent
 880                  * (so take the last).
 881                  * Prefer blocks that are beyond EOF (again, take the last).
 882                  * If there are none, descend the last block that
 883                  * is not after EOF and look at its children.
 884                  */
 885                 ib2 = next = ib;
 886                 spin_lock(&ib->b.inode->i_data.private_lock);
 887                 while (next) {
 888                         ib2 = next;
 889                         next = NULL;
 890                         list_for_each_entry(tmp, &ib2->children, b.siblings) {
 891                                 if (!test_bit(B_Index, &tmp->b.flags) ||
 892                                     !test_bit(B_Pinned, &tmp->b.flags))
 893                                         continue;
 894                                 if (next == NULL ||
 895                                     tmp->b.fileaddr > next->b.fileaddr)
 896                                         next = tmp;
 897                         }
 898                 }
 899                 if (ib2->b.fileaddr < lastaddr) {
 900                         /* Must be all done */
 901                         spin_unlock(&ib->b.inode->i_data.private_lock);
 902                         clear_bit(I_Trunc, &LAFSI(ino)->iflags);
 903                         lafs_iput_fs(ino);
 904                         wake_up(&fs->trunc_wait);
 905                         err = -ERESTARTSYS;
 906                         goto out2;
 907                 }
 908                 getiref(ib2, MKREF(inode_handle_orphan2));
 909                 spin_unlock(&ib->b.inode->i_data.private_lock);
 910
 911                 /* ib2 is an index block beyond EOF with no
 912                  * Pinned children.
 913                  * Incorporating it should unpin it.
 914                  */
 915                 if (!list_empty(&ib2->children)) {
 916                         lafs_print_tree(&ib2->b, 3);
 917                         LAFS_BUG(1, &ib2->b);
 918                 }
 919
 920                 if (!lafs_iolock_written_async(&ib2->b)) {
 921                         putiref(ib2, MKREF(inode_handle_orphan2));
 922                         err = -EAGAIN;
 923                         goto out2;
 924                 }
 925                 while (ib2->uninc_table.pending_cnt || ib2->uninc)
 926                         lafs_incorporate(fs, ib2);
 927
 928                 if (test_bit(B_Dirty, &ib2->b.flags) ||
 929                     test_bit(B_Realloc, &ib2->b.flags))
 930                         lafs_cluster_allocate(&ib2->b, 0);
 931                 else
 932                         lafs_iounlock_block(&ib2->b);
 933
 934                 if (!list_empty(&ib2->b.siblings)) {
 935                         printk("looping on %s\n", strblk(&ib2->b));
 936                         loop_cnt--;
 937                         if (loop_cnt < 0) /* NOTE(review): never true, loop_cnt is only ever 19 here */
 938                                 BUG();
 939                 }
 940                 putiref(ib2, MKREF(inode_handle_orphan2));
 941                 err = -ERESTARTSYS;
 942                 if (ib->uninc) {
 943                         if (lafs_iolock_written_async(&ib->b)) {
 944                                 while (ib->uninc)
 945                                         lafs_incorporate(fs, ib);
 946                                 lafs_iounlock_block(&ib->b);
 947                         } else
 948                                 err = -EAGAIN;
 949                 }
 950         out2:
 951                 putiref(ib, MKREF(inode_handle_orphan));
 952                 return err;
 953         }
 954
 955         putiref(ib, MKREF(inode_handle_orphan));
 956
 957         ib = lafs_leaf_find(ino, trunc_next, ADOPT, &next_trunc,
 958                             ASYNC, MKREF(inode_handle_orphan3));
 959         if (IS_ERR(ib))
 960                 return PTR_ERR(ib);
 961         /* now hold an iolock on ib */
 962
 963         /* Ok, trunc_next seems to refer to a block that exists.
 964          * We need to erase it.
 965          *
 966          * So we open up the index block ourselves, call
 967          * lafs_summary_update with each block address, and then
 968          * erase the block.
 969          */
 970
 971         if (LAFSI(ino)->depth == 0) {
 972                 /* Nothing to truncate */
 973                 clear_bit(I_Trunc, &LAFSI(ino)->iflags);
 974                 lafs_iput_fs(ino);
 975                 if (test_bit(B_Pinned, &ib->b.flags))
 976                         /* Need to move the dirtiness which keeps this
 977                          * pinned to the data block.
 978                          */
 979                         lafs_cluster_allocate(&ib->b, 0);
 980                 else
 981                         lafs_iounlock_block(&ib->b);
 982                 err = -ERESTARTSYS;
 983                 goto out_put;
 984         }
 985
 986         lafs_checkpoint_lock(fs);
 987         err = lafs_reserve_block(&ib->b, ReleaseSpace);
 988         if (err < 0)
 989                 goto out;
 990
 991         if (!test_bit(B_Valid, &ib->b.flags) &&
 992             test_bit(B_InoIdx, &ib->b.flags)) {
 993                 /* still invalid, just re-erase to remove
 994                  * pinning */
 995                 LAFSI(ino)->trunc_next = next_trunc;
 996                 lafs_cluster_allocate(&ib->b, 0);
 997                 err = -ERESTARTSYS;
 998                 goto out_unlocked;
 999         }
1000
1001         lafs_pin_block(&ib->b);
1002
1003         /* It might be that this can happen, in which case
1004          * we simply update trunc_next and loop.  But I'd like
1005          * to be sure before I implement that
1006          */
1007         if (!test_bit(B_Valid, &ib->b.flags)) {
1008                 printk("Not Valid: %s\n", strblk(&ib->b));
1009                 printk("depth = %d\n", LAFSI(ino)->depth);
1010                 if (test_bit(B_InoIdx, &ib->b.flags))
1011                         printk("DB: %s\n", strblk(&LAFSI(ib->b.inode)->dblock->b));
1012                 LAFSI(ino)->trunc_next = next_trunc;
1013                 //BUG_ON(!test_bit(B_Valid, &ib->b.flags));
1014                 err = -ERESTARTSYS;
1015                 goto out;
1016         }
1017
1018         if (ib->b.fileaddr < trunc_next &&
1019             lafs_leaf_next(ib, 0) < trunc_next) {
1020                 /* We only want to truncate part of this index block.
1021                  * So we copy addresses into uninc_table and then
1022                  * call lafs_incorporate.
1023                  * This might cause the index tree to grow, so we
1024                  * cannot trust next_trunc
1025                  */
1026                 if (ib->uninc_table.pending_cnt == 0 &&
1027                     ib->uninc == NULL) {
1028                         lafs_dirty_iblock(ib, 0);
1029                         /* FIXME this just removes 8 blocks at a time,
1030                          * which is not enough
1031                          */
1032                         lafs_walk_leaf_index(ib, prune_some, ib);
1033                 }
1034                 if (test_bit(B_Dirty, &ib->b.flags))
1035                         lafs_incorporate(fs, ib);
1036                 err = -ERESTARTSYS;
1037                 goto out;
1038         }
1039         LAFSI(ino)->trunc_next = next_trunc; /* the whole leaf goes: advance past it */
1040
1041         while (ib->uninc_table.pending_cnt || ib->uninc) {
1042                 /* There should be no Realloc data blocks here
1043                  * but index blocks might be realloc still.
1044                  */
1045                 LAFS_BUG(!test_bit(B_Dirty, &ib->b.flags) &&
1046                          !test_bit(B_Realloc, &ib->b.flags), &ib->b);
1047                 lafs_incorporate(fs, ib);
1048         }
1049         if (test_bit(B_InoIdx, &ib->b.flags) ||
1050             !test_bit(B_PhysValid, &ib->b.flags) ||
1051             ib->b.physaddr != 0) {
1052                 lafs_walk_leaf_index(ib, prune, ib);
1053                 lafs_clear_index(ib);
1054                 lafs_dirty_iblock(ib, 0);
1055         }
1056         if (test_bit(B_Dirty, &ib->b.flags))
1057                 lafs_incorporate(fs, ib);
1058         if (!list_empty(&ib->children))
1059                 lafs_print_tree(&ib->b, 2);
1060         LAFS_BUG(!list_empty(&ib->children), &ib->b);
1061         err = -ERESTARTSYS;
1062 out: /* still holding the iolock and the checkpoint lock */
1063         lafs_iounlock_block(&ib->b);
1064 out_unlocked: /* iolock already gone, checkpoint lock still held */
1065         lafs_checkpoint_unlock(fs);
1066 out_put: /* only the inode_handle_orphan3 reference is left to drop */
1067         putiref(ib, MKREF(inode_handle_orphan3));
1068         return err;
1069 }
1070
1071 void lafs_dirty_inode(struct inode *ino)
1072 {
1073         /* Callers reach this in one of three situations:
1074          * 1/ lafs itself, once a dblock or iblock is pinned and about
1075          *    to be dirtied
1076          * 2/ writeout, just before requesting a write, to update mtime
1077          * 3/ read, to update atime
1078          *
1079          * We cannot tell which, so there is little we can safely do.
1080          * The data block must not be touched here as it may be in
1081          * writeout and we cannot always wait for it.
1082          * Anyone who really cares must therefore dirty the datablock
1083          * or one of its children themselves.
1084          * When cluster_allocate is eventually called it will refresh
1085          * the datablock from the inode.  If an update has to wait for
1086          * the next phase, lock_dblock (e.g. in setattr) handles that.
1087          *
1088          * The filesystem modify time is brought up to date here too.
1089          */
1090         struct super_block *sb = ino->i_sb;
1091         struct timespec stamp;
1092         struct inode *fsroot;
1093
1094         set_bit(I_Dirty, &LAFSI(ino)->iflags);
1095         sb->s_dirt = 1;
1096         stamp = current_fs_time(sb);
1097         fsroot = ino_from_sb(sb);
1098         if (timespec_equal(&fsroot->i_mtime, &stamp))
1099                 return;
1100         fsroot->i_mtime = stamp;
1101         set_bit(I_Dirty, &LAFSI(fsroot)->iflags);
1102 }
1103
1104 int lafs_sync_inode(struct inode *ino, int wait)
1105 {
1106         /* fsync has been called on this file so we need to sync any
1107          * inode updates to the next cluster.  With 'wait' clear we
1108          * queue the update and note in update_cluster what to wait for;
1109          * with 'wait' set we only wait for that and return 0.
1110          * If we cannot create an update record, we wait for a phase
1111          * change (update_cluster == 1), which writes everything out.
1112          */
1113         struct datablock *b;
1114         struct fs *fs = fs_from_inode(ino);
1115         struct update_handle uh;
1116         int err;
1117
1118         if (wait) {
1119                 if (LAFSI(ino)->update_cluster > 1) /* a specific cluster was recorded */
1120                         lafs_cluster_wait(fs, LAFSI(ino)->update_cluster);
1121                 if (LAFSI(ino)->update_cluster == 1) { /* a checkpoint was requested */
1122                         lafs_checkpoint_lock(fs);
1123                         lafs_checkpoint_unlock_wait(fs);
1124                 }
1125                 return 0;
1126         }
1127
1128         LAFSI(ino)->update_cluster = 0; /* nothing to wait for yet */
1129         if (!test_bit(I_Dirty, &LAFSI(ino)->iflags))
1130                 return 0;
1131         b = lafs_inode_dblock(ino, SYNC, MKREF(write_inode));
1132         if (IS_ERR(b))
1133                 return PTR_ERR(b);
1134
1135         lafs_iolock_written(&b->b); /* refresh the block from the inode under its iolock */
1136         lafs_inode_fillblock(ino);
1137         lafs_iounlock_block(&b->b);
1138
1139         err = lafs_cluster_update_prepare(&uh, fs, LAFS_INODE_LOG_SIZE);
1140         if (err)
1141                 lafs_cluster_update_abort(&uh);
1142         else {
1143                 lafs_checkpoint_lock(fs);
1144                 if (lafs_cluster_update_pin(&uh) == 0) {
1145                         if (test_and_clear_bit(B_Dirty, &b->b.flags))
1146                                 lafs_space_return(fs, 1);
1147                         LAFSI(ino)->update_cluster = /* remembered for the wait pass */
1148                                 lafs_cluster_update_commit
1149                                 (&uh, b, LAFS_INODE_LOG_START,
1150                                  LAFS_INODE_LOG_SIZE);
1151                 } else
1152                         lafs_cluster_update_abort(&uh);
1153                 lafs_checkpoint_unlock(fs);
1154         }
1155         if (test_bit(B_Dirty, &b->b.flags)) {
1156                 /* FIXME need to write out the data block...
1157                  * Is that just lafs_cluster_allocate ?
1158                  */
1159         }
1160
1161         if (LAFSI(ino)->update_cluster == 0) { /* no update record was committed */
1162                 lafs_checkpoint_lock(fs);
1163                 if (test_bit(B_Dirty, &b->b.flags))
1164                         LAFSI(ino)->update_cluster = 1;
1165                 lafs_checkpoint_start(fs);
1166                 lafs_checkpoint_unlock(fs);
1167         }
1168         putdref(b, MKREF(write_inode));
1169         return 0; /* FIXME should I return some error message??? */
1170 }
1171
1172 void lafs_inode_fillblock(struct inode *ino)
1173 {
1174         /* Copy the in-memory inode state from 'ino' into its on-disk
1175          * image in li->dblock (little-endian), clearing I_Dirty first. */
1176         struct lafs_inode *li = LAFSI(ino);
1177         struct datablock *db = li->dblock;
1178         struct la_inode *lai;
1179
1180         clear_bit(I_Dirty, &LAFSI(ino)->iflags);
1181
1182         lai = map_dblock(db);
1183         lai->data_blocks = cpu_to_le32(li->cblocks);
1184         lai->index_blocks = cpu_to_le32(li->ciblocks);
1185         lai->generation = cpu_to_le16(ino->i_generation);
1186         lai->trunc_gen = li->trunc_gen;
1187         lai->flags = li->flags;
1188         lai->filetype = li->type;
1189         if (lai->metadata_size != cpu_to_le16(li->metadata_size)) {
1190                 /* Changing metadata size is weird.
1191                  * We will need to handle this somehow for xattrs
1192                  * For now we just want to cope with
1193                  * Dir -> InodeFile changes, and that guarantees us
1194                  * there is no index info - so just clear the index
1195                  * area.
1196                  */
1197                 u16 *s = (u16*)(((char*)lai) + li->metadata_size);
1198                 BUG_ON(li->type != TypeInodeFile);
1199                 lai->metadata_size = cpu_to_le16(li->metadata_size);
1200                 memset(s, 0, ino->i_sb->s_blocksize - li->metadata_size);
1201                 *s = cpu_to_le16(IBLK_INDIRECT);
1202         }
1203         lai->depth = li->depth;
1204
1205         switch (li->type) {
1206         case TypeInodeFile:
1207         {
1208                 struct fs_md *i = &li->md.fs;
1209                 struct fs_metadata *l = &lai->metadata[0].fs;
1210                 /* Room for the name: use the CPU-order size, lai's is le16 */
1211                 int nlen = li->metadata_size - offsetof(struct la_inode,
1212                                                         metadata[0].fs.name);
1213
1214                 l->snapshot_usage_table = cpu_to_le16(i->usagetable);
1215                 l->update_time = cpu_to_le64(encode_time(&ino->i_mtime));
1216                 l->blocks_used = cpu_to_le64(i->cblocks_used);
1217                 l->blocks_allowed = cpu_to_le64(i->blocks_allowed);
1218                 l->creation_age = cpu_to_le64(i->creation_age);
1219                 l->inodes_used = cpu_to_le32(i->inodes_used);
1220                 l->quota_inodes[0] = cpu_to_le32(i->quota_inums[0]);
1221                 l->quota_inodes[1] = cpu_to_le32(i->quota_inums[1]);
1222                 l->quota_inodes[2] = cpu_to_le32(i->quota_inums[2]);
1223                 memset(l->name, 0, nlen);
1224                 if (i->name != NULL) { /* never hand memcpy a NULL source */
1225                         if (strlen(i->name) < nlen)
1226                                 nlen = strlen(i->name);
1227                         memcpy(l->name, i->name, nlen);
1228                 }
1229                 break;
1230         }
1231
1232         case TypeInodeMap:
1233         {
1234                 struct inodemap_md *m = &li->md.inodemap;
1235                 struct inodemap_metadata *s = &lai->metadata[0].inodemap;
1236                 s->size = cpu_to_le32(m->size);
1237                 break;
1238         }
1239
1240         case TypeSegmentMap:
1241         {
1242                 struct su_md *m = &li->md.segmentusage;
1243                 struct su_metadata *s = &lai->metadata[0].segmentusage;
1244                 s->table_size = cpu_to_le32(m->table_size);
1245                 break;
1246         }
1247
1248         case TypeQuota:
1249         {
1250                 struct quota_md *m = &li->md.quota;
1251                 struct quota_metadata *s = &lai->metadata[0].quota;
1252                 s->gracetime = cpu_to_le32(m->gracetime);
1253                 s->graceunits = cpu_to_le32(m->graceunits);
1254                 break;
1255         }
1256         case TypeOrphanList:
1257         case TypeAccessTime:
1258                 break; /* no type-specific metadata is written for these */
1259
1260         default: /* TypeBase or larger */
1261         {
1262                 struct file_md *i = &li->md.file;
1263                 struct file_metadata *l = &lai->metadata[0].file;
1264                 struct dir_metadata *d = &lai->metadata[0].dir;
1265                 struct special_metadata *s = &lai->metadata[0].special;
1266
1267                 if (li->type < TypeBase)
1268                         break;
1269                 l->flags = cpu_to_le16(i->flags);
1270                 l->mode = cpu_to_le16(ino->i_mode);
1271                 l->userid = cpu_to_le32(ino->i_uid);
1272                 l->groupid = cpu_to_le32(ino->i_gid);
1273                 l->treeid = cpu_to_le32(i->treeid);
1274                 l->creationtime = cpu_to_le64(i->creationtime);
1275                 l->modifytime = cpu_to_le64(encode_time(&ino->i_mtime));
1276                 l->ctime = cpu_to_le64(encode_time(&ino->i_ctime));
1277                 l->accesstime = cpu_to_le64(encode_time(&ino->i_atime));
1278                 /* FIXME write 0 to accesstime file */
1279                 l->size = cpu_to_le64(ino->i_size);
1280                 l->parent = cpu_to_le32(i->parent);
1281                 l->linkcount = cpu_to_le32(ino->i_nlink);
1282
1283                 switch (li->type) { /* extra fields for particular file types */
1284                 case TypeFile:
1285                         break;
1286                 case TypeDir:
1287                         d->hash_seed = cpu_to_le32(i->seed);
1288                         break;
1289                 case TypeSymlink:
1290                         break;
1291                 case TypeSpecial:
1292                         s->major = cpu_to_le32(MAJOR(ino->i_rdev));
1293                         s->minor = cpu_to_le32(MINOR(ino->i_rdev));
1294                         break;
1295                 }
1296         }
1297         }
1298         unmap_dblock(db, lai);
1299 }
1300
1301 /*-----------------------------------------------------------------------
1302  * Inode allocate map handling.
1303  * Inode 1 of each fileset is a bitmap of free inode numbers.
1304  * Whenever the file is extended in size, new bits are set to one.  They
1305  * are then cleared when the inode is allocated.  When a block becomes
1306  * full of zeros, we don't need to store it any more.
1307  *
1308  * We don't clear the bit until we are committed to creating an inode
1309  * This means we cannot clear it straight away, so two different threads
1310  * might see the same inode number as being available.  We have two
1311  * approaches to guard against this.
1312  * Firstly we have a 'current' pointer into the inodemap file and
1313  * increase that past the inode we return.  This discourages multiple
1314  * hits but as the pointer would need to be rewound occasionally it
1315  * isn't a guarantee.  The guarantee against multiple allocations is done
1316  * via a flag in the block representing an inode.  This is set
1317  * while an inode is being allocated.
1318  */
1319
1320 /* inode number allocation has the prealloc/pin/commit/abort structure
1321  * so it can be committed effectively
1322  */
1323
1324 static int
1325 choose_free_inum(struct fs *fs, struct super_block *sb, u32 *inump,
1326                  struct datablock **bp, int *restarted)
1327 {
             /* Find a candidate free inode number by scanning the inode map
              * (inode 1 of 'sb') for a set bit, starting from
              * md.inodemap.thisblock / nextbit.
              * Any block left in *bp by an earlier call is dropped first,
              * together with the inode reference that went with it.
              * When the current map block is exhausted we move on, wrap to
              * block 0 once (recorded in *restarted), and after that append
              * a fresh "all free" block to the map.
              * On success returns 0 with the number in *inump and the map
              * block (reference held) in *bp; the reference on 'im' is kept.
              * On failure returns a negative errno, sets *bp to NULL and
              * drops 'im'.
              * NOTE(review): the result of lafs_iget() is used without an
              * error check - confirm it cannot fail here.
              */
1328         struct inode *im = lafs_iget(sb, 1, SYNC);
1329         loff_t bnum;
1330         struct datablock *b;
1331         char *buf;
1332         int err;
1333         int bit;
1334
1335         if (*bp) {
1336                 struct inode *i = (*bp)->b.inode;
1337                 putdref(*bp, MKREF(cfi_map));
1338                 iput(i);
1339                 *bp = NULL;
1340         }
1341
1342         mutex_lock_nested(&im->i_mutex, I_MUTEX_QUOTA);
1343 retry:
1344         bnum = LAFSI(im)->md.inodemap.thisblock;
1345
1346         if (bnum == NoBlock ||
1347             LAFSI(im)->md.inodemap.nextbit >= (fs->blocksize<<3)) {
                     /* no current block, or every bit in it has been looked at */
1348                 if (bnum == NoBlock)
1349                         bnum = LAFSI(im)->md.inodemap.size;
1350
1351                 if (bnum+1 < LAFSI(im)->md.inodemap.size)
1352                         bnum++;
1353                 else if (!*restarted) {
1354                         bnum = 0;
1355                         *restarted = 1;
1356                 } else {
1357                         /* Need to add a new block to the file */
1358                         bnum = LAFSI(im)->md.inodemap.size;
1359                         b = lafs_get_block(im, bnum, NULL, GFP_KERNEL,
1360                                            MKREF(cfi_map));
1361                         err = -ENOMEM;
1362                         if (!b)
1363                                 goto abort;
1364                         lafs_iolock_written(&b->b);
1365                         set_bit(B_PinPending, &b->b.flags);
1366                         lafs_iounlock_block(&b->b);
1367                 retry2:
1368                         lafs_checkpoint_lock(fs);
1369                         err = lafs_pin_dblock(b, NewSpace);
1370                         if (err == -EAGAIN) {
1371                                 lafs_checkpoint_unlock_wait(fs);
1372                                 goto retry2;
1373                         }
1374                         if (err < 0)
1375                                 goto abort_unlock;
1376
1377                         buf = map_dblock(b);
1378                         /* Set block to "all are free" */
1379                         memset(buf, 0xff, fs->blocksize);
1380                         unmap_dblock(b, buf);
1381                         set_bit(B_Valid, &b->b.flags);
1382                         LAFSI(im)->md.inodemap.size = bnum+1;
1383                         lafs_dirty_inode(im);
1384                         lafs_dirty_dblock(b);
1385                         lafs_checkpoint_unlock(fs);
1386                         putdref(b, MKREF(cfi_map));
1387                 }
1388                 b = NULL; /* so that the abort path has nothing to put */
1389                 err = lafs_find_next(im, &bnum);
1390                 if (err < 0)
1391                         goto abort;
1392                 if (err == 0)
1393                         bnum = 0;
1394
1395                 LAFSI(im)->md.inodemap.nextbit = 0;
1396                 LAFSI(im)->md.inodemap.thisblock = bnum;
1397                 goto retry;
1398         }
1399         b = lafs_get_block(im, bnum, NULL, GFP_KERNEL, MKREF(cfi_map));
1400         err = -ENOSPC; /* NOTE(review): the same failure above reports -ENOMEM */
1401         if (!b)
1402                 goto abort;
1403         err = lafs_find_block(b, NOADOPT);
1404         if (err)
1405                 goto abort;
1406         if (b->b.physaddr == 0 && !test_bit(B_Valid, &b->b.flags)) {
                     /* nothing stored for this block: mark it exhausted and move on */
1407                 LAFSI(im)->md.inodemap.nextbit =
1408                         (fs->blocksize<<3) + 1;
1409                 putdref(b,MKREF(cfi_map));
1410                 goto retry;
1411         }
1412         err = lafs_read_block(b);
1413         if (err)
1414                 goto abort;
1415
1416         bit = LAFSI(im)->md.inodemap.nextbit;
1417         LAFSI(im)->md.inodemap.thisblock = bnum;
1418         buf = map_dblock(b);
1419         while (bnum == 0 && bit < 16) {
1420                 /* Never return an inum below 16 - they are special.
                      * NOTE(review): this only clears bits that are already
                      * clear, so its sole effect is to advance 'bit' to 16.
                      */
1421                 if (!generic_test_le_bit(bit, (unsigned long *)buf))
1422                         generic___clear_le_bit(bit, (unsigned long *)buf);
1423                 bit++;
1424         }
1425
1426         bit = generic_find_next_le_bit((unsigned long *)buf,
1427                                        fs->blocksize<<3, bit);
1428         unmap_dblock(b, buf);
1429         LAFSI(im)->md.inodemap.nextbit = bit+1; /* the next search resumes after this bit */
1430         if (bit >= fs->blocksize<<3) {
1431                 putdref(b,MKREF(cfi_map));
1432                 goto retry;
1433         }
1434         mutex_unlock(&im->i_mutex);
1435         *bp = b;
1436         *inump = bit + (bnum << (im->i_blkbits + 3));
1437         return 0;
1438
1439 abort_unlock:
1440         lafs_checkpoint_unlock(fs);
1441 abort:
1442         putdref(b, MKREF(cfi_map));
1443         *bp = NULL;
1444         mutex_unlock(&im->i_mutex);
1445         iput(im);
1446         return err;
1447 }
1448
1449 struct inode_map_new_info { /* state carried from prepare to pin/commit/abort */
1450         struct datablock *ib, *mb; /* ib: claimed block for the new inode; mb: inode-map block, or NULL */
1451 };
1452
1453 static int
1454 inode_map_new_prepare(struct fs *fs, int inum, struct super_block *sb,
1455                       struct inode_map_new_info *imni)
1456 {
1457         /* Claim an inode number: 'inum' itself when non-zero, otherwise a
1458          * free one chosen from the inode map.  On success imni->ib is the
1459          * claimed (B_Claimed) inode block and imni->mb the map block, if any.
1460          * Returns 0, -EEXIST if 'inum' is already claimed, or another -errno. */
1461         u32 choice = inum; /* u32 to match choose_free_inum()'s inump */
1462         int restarted = 0, err = 0;
1463         struct datablock *b;
1464
1465         imni->ib = imni->mb = NULL;
1466 retry:
1467         if (inum == 0) /* choose a possibly-free inode number */
1468                 err = choose_free_inum(fs, sb, &choice,
1469                                        &imni->mb, &restarted);
1470         if (err)
1471                 return err;
1472         b = lafs_get_block(ino_from_sb(sb), choice, NULL, GFP_KERNEL,
1473                            MKREF(cfi_ino));
1474         if (!b)
1475                 return -ENOMEM;
1476         if (test_and_set_bit(B_Claimed, &b->b.flags)) {
1477                 /* someone else holds this number: fail or pick another */
1478                 putdref(b, MKREF(cfi_ino));
1479                 if (inum)
1480                         return -EEXIST;
1481                 goto retry;
1482         }
1483         if (imni->mb) {
1484                 lafs_iolock_written(&imni->mb->b);
1485                 set_bit(B_PinPending, &imni->mb->b.flags);
1486                 lafs_iounlock_block(&imni->mb->b);
1487         }
1488         set_bit(B_PinPending, &b->b.flags);
1489         b->my_inode = NULL;
1490         imni->ib = b;
1490         return 0;
1491 }
1492
1493 static int
1494 inode_map_new_pin(struct inode_map_new_info *imni)
1495 {
1496         /* Pin the map block (if any) first, then the inode block;
1497          * the first failure is returned and ends the sequence. */
1498         int rv = imni->mb ? lafs_pin_dblock(imni->mb, NewSpace) : 0;
1499
1500         return rv ? rv : lafs_pin_dblock(imni->ib, NewSpace);
1501 }
1502
1503 static void
1504 inode_map_new_commit(struct inode_map_new_info *imni)
1505 {
1506         unsigned long *buf;
1507
1508         if (imni->mb) {
1509                 int blksize = imni->ib->b.inode->i_sb->s_blocksize;
1510                 int bit = imni->ib->b.fileaddr & (blksize*8 - 1);
1511                 int hole = 0;
1512                 struct inode *ino = imni->mb->b.inode;
1513
1514                 mutex_lock_nested(&ino->i_mutex, I_MUTEX_QUOTA);
1515                 buf = map_dblock(imni->mb);
1516                 generic___clear_le_bit(bit, buf);
1517                 if (buf[blksize/sizeof(*buf)-1] == 0 &&
1518                     generic_find_next_le_bit(buf, blksize*8, 0) == blksize*8)
1519                         /* block is empty, punch a hole */
1520                         hole = 1;
1521
1522                 unmap_dblock(imni->mb, buf);
1523                 if (hole)
1524                         lafs_erase_dblock(imni->mb);
1525                 else
1526                         lafs_dirty_dblock(imni->mb);
1527
1528                 putdref(imni->mb, MKREF(cfi_map));
1529                 mutex_unlock(&ino->i_mutex);
1530                 iput(ino);
1531         }
1532         putdref(imni->ib, MKREF(cfi_ino));
1533 }
1534
1535 static void
1536 inode_map_new_abort(struct inode_map_new_info *imni)
1537 {
1538         if (imni->ib) {
1539                 clear_bit(B_Claimed, &imni->ib->b.flags);
1540                 clear_bit(B_PinPending, &imni->ib->b.flags);
1541                 lafs_orphan_release(fs_from_inode(imni->ib->b.inode),
1542                                     imni->ib);
1543         }
1544         putdref(imni->ib, MKREF(cfi_ino));
1545         if (imni->mb) {
1546                 struct inode *ino = imni->mb->b.inode;
1547                 putdref(imni->mb, MKREF(cfi_map));
1548                 iput(ino);
1549         }
1550 }
1551
1552 struct inode *
1553 lafs_new_inode(struct fs *fs, struct super_block *sb, struct inode *dir,
1554                int type, int inum, int mode, struct datablock **inodbp)
1555 {
1556         /* allocate and instantiate a new inode.  If inum is non-zero,
1557          * choose any number, otherwise we are creating a special inode
1558          * and have to use the given number.
1559          * This creation is committed independently of any name that might
1560          * subsequently be given to the inode.  So we register it as an
1561          * orphan so that it will be cleaned up if the name isn't
1562          * successfully created
1563          *
1564          */
1565         struct inode *ino;
1566         struct datablock *b;
1567         struct inode_map_new_info imni;
1568         struct update_handle ui;
1569         int err;
1570
1571         err = inode_map_new_prepare(fs, inum, sb, &imni);
1572         err = lafs_cluster_update_prepare(&ui, fs, sizeof(struct la_inode))
1573                 ?: err;
1574         if (err == 0)
1575                 err = lafs_make_orphan(fs, imni.ib);
1576         if (err)
1577                 goto abort;
1578 retry:
1579         lafs_checkpoint_lock(fs);
1580
1581         err = inode_map_new_pin(&imni);
1582
1583         if (err == -EAGAIN) {
1584                 lafs_checkpoint_unlock_wait(fs);
1585                 goto retry;
1586         }
1587         if (err < 0)
1588                 goto abort_unlock;
1589
1590         b = getdref(imni.ib, MKREF(inode_new));
1591
1592         lafs_iolock_block(&b->b); /* make sure we don't race with the cleaner
1593                                    * and zero this inode while trying to load it
1594                                    */
1595         lafs_inode_init(b, type, mode, dir);
1596         lafs_iounlock_block(&b->b);
1597
1598         inode_map_new_commit(&imni);
1599         ino = lafs_iget(sb, b->b.fileaddr, SYNC);
1600         if (IS_ERR(ino)) {
1601                 lafs_cluster_update_abort(&ui);
1602                 LAFS_BUG(1, &b->b);
1603         } else
1604                 lafs_cluster_update_commit(&ui, b, 0,
1605                                            LAFSI(ino)->metadata_size);
1606         LAFS_BUG(LAFSI(ino)->dblock != b, &b->b);
1607         LAFS_BUG(b->my_inode != ino, &b->b);
1608         lafs_checkpoint_unlock(fs);
1609
1610         if (inodbp)
1611                 *inodbp = b;
1612         else
1613                 putdref(b, MKREF(inode_new));
1614         return ino;
1615
1616 abort_unlock:
1617         lafs_checkpoint_unlock(fs);
1618         err = -ENOSPC;
1619 abort:
1620         inode_map_new_abort(&imni);
1621         lafs_cluster_update_abort(&ui);
1622         dprintk("After abort %d: %s\n", err, strblk(&imni.ib->b));
1623         return ERR_PTR(err);
1624 }
1625
1626 static int inode_map_free(struct fs *fs, struct super_block *sb, u32 inum)
1627 {
1628         struct inode *im = lafs_iget(sb, 1, SYNC);
1629         int bit;
1630         unsigned long *buf;
1631         struct datablock *b;
1632         u32 bnum;
1633         int err;
1634
1635         mutex_lock_nested(&im->i_mutex, I_MUTEX_QUOTA);
1636
1637         bnum = inum >> (3 + sb->s_blocksize_bits);
1638         bit = inum - (bnum << (3 + sb->s_blocksize_bits));
1639         b = lafs_get_block(im, bnum, NULL, GFP_KERNEL, MKREF(inode_map_free));
1640         if (!b) {
1641                 mutex_unlock(&im->i_mutex);
1642                 iput(im);
1643                 return -ENOMEM;
1644         }
1645         err = lafs_read_block(b);
1646         if (err) {
1647                 putdref(b, MKREF(inode_map_free));
1648                 mutex_unlock(&im->i_mutex);
1649                 iput(im);
1650                 return err;
1651         }
1652         lafs_iolock_written(&b->b);
1653         set_bit(B_PinPending, &b->b.flags);
1654         lafs_iounlock_block(&b->b);
1655 retry:
1656         lafs_checkpoint_lock(fs);
1657         err = lafs_pin_dblock(b, ReleaseSpace);
1658         if (err == -EAGAIN) {
1659                 lafs_checkpoint_unlock_wait(fs);
1660                 goto retry;
1661         }
1662         BUG_ON(err < 0);
1663         buf = map_dblock(b);
1664         generic___set_le_bit(bit, buf);
1665         unmap_dblock(b, buf);
1666         lafs_dirty_dblock(b);
1667         putdref(b, MKREF(inode_map_free));
1668         lafs_checkpoint_unlock(fs);
1669         mutex_unlock(&im->i_mutex);
1670         iput(im);
1671         return 0;
1672 }
1673
1674 int lafs_inode_inuse(struct fs *fs, struct super_block *sb, u32 inum)
1675 {
1676         /* This is used during roll-forward to register a newly created
1677          * inode in the inode map
1678          */
1679         struct inode *im = lafs_iget(sb, 1, SYNC);
1680         int bit;
1681         unsigned long *buf;
1682         struct datablock *b;
1683         u32 bnum;
1684         int err;
1685
1686         mutex_lock_nested(&im->i_mutex, I_MUTEX_QUOTA);
1687
1688         bnum = inum >> (3 + sb->s_blocksize_bits);
1689         bit = inum - (bnum << (3 + sb->s_blocksize_bits));
1690         if (bnum > LAFSI(im)->md.inodemap.size) {
1691                 /* inum to unbelievably big */
1692                 mutex_unlock(&im->i_mutex);
1693                 iput(im);
1694                 return -EINVAL;
1695         }
1696         b = lafs_get_block(im, bnum, NULL, GFP_KERNEL, MKREF(inode_map_free));
1697         if (!b) {
1698                 mutex_unlock(&im->i_mutex);
1699                 iput(im);
1700                 return -ENOMEM;
1701         }
1702
1703         err = lafs_read_block(b);
1704         if (err) {
1705                 putdref(b, MKREF(inode_map_free));
1706                 mutex_unlock(&im->i_mutex);
1707                 iput(im);
1708                 return err;
1709         }
1710
1711         lafs_iolock_written(&b->b);
1712         set_bit(B_PinPending, &b->b.flags);
1713         lafs_iounlock_block(&b->b);
1714 retry:
1715         lafs_checkpoint_lock(fs);
1716         err = lafs_pin_dblock(b, CleanSpace);
1717         if (err == -EAGAIN) {
1718                 lafs_checkpoint_unlock_wait(fs);
1719                 goto retry;
1720         }
1721         BUG_ON(err < 0);
1722         buf = map_dblock(b);
1723         if (bnum == LAFSI(im)->md.inodemap.size) {
1724                 /* need to add a new block to the file */
1725                 memset(buf, 0xff, fs->blocksize);
1726                 LAFSI(im)->md.inodemap.size = bnum + 1;
1727                 lafs_dirty_inode(im);
1728         }
1729         generic___clear_le_bit(bit, buf);
1730         unmap_dblock(b, buf);
1731         lafs_dirty_dblock(b);
1732         putdref(b, MKREF(inode_map_free));
1733         lafs_checkpoint_unlock(fs);
1734         mutex_unlock(&im->i_mutex);
1735         iput(im);
1736         return 0;
1737 }
1738
1739
1740
1741 int lafs_setattr(struct dentry *dentry, struct iattr *attr)
1742 {
1743         int err;
1744         struct inode *ino = dentry->d_inode;
1745         struct fs *fs = fs_from_inode(ino);
1746         struct datablock *db;
1747
1748         err = inode_change_ok(ino, attr);
1749         db = lafs_inode_dblock(ino, SYNC, MKREF(setattr));
1750         if (IS_ERR(db))
1751                 err = PTR_ERR(db);
1752         if (err)
1753                 return err;
1754
1755         /* We don't need iolock_written here as we don't
1756          * actually change the inode block yet
1757          */
1758         lafs_iolock_block(&db->b);
1759         set_bit(B_PinPending, &db->b.flags);
1760         lafs_iounlock_block(&db->b);
1761
1762         /* FIXME quota stuff */
1763
1764 again:
1765         lafs_checkpoint_lock(fs);
1766         err = lafs_pin_dblock(db, ReleaseSpace);
1767         if (err == -EAGAIN) {
1768                 lafs_checkpoint_unlock_wait(fs);
1769                 goto again;
1770         }
1771         /* inode_setattr calls lafs_dirty_inode, which sets
1772          * I_Dirty so the dblock will get updated.
1773          */
1774         err = err ?: inode_setattr(ino, attr);
1775         if (!err)
1776                 lafs_dirty_dblock(db);
1777         clear_bit(B_PinPending, &db->b.flags);
1778         putdref(db, MKREF(setattr));
1779         lafs_checkpoint_unlock(fs);
1780
1781         return err;
1782 }
1783
1784 void lafs_truncate(struct inode *ino)
1785 {
1786         /* Want to truncate this file.
1787          * i_size has already been changed, and the address space
1788          * has been cleaned up.
1789          * So just start the background truncate
1790          */
1791         struct fs *fs = fs_from_inode(ino);
1792         struct datablock *db = lafs_inode_dblock(ino, SYNC, MKREF(trunc));
1793         loff_t trunc_block;
1794         DEFINE_WAIT(wq);
1795
1796         if (IS_ERR(db))
1797                 return;
1798
1799         trunc_block = ((i_size_read(ino) + fs->blocksize - 1)
1800                        >> fs->blocksize_bits);
1801         /* We hold i_mutex, so regular orphan processing cannot
1802          * contine - we have to push it forward ourselves.
1803          */
1804         while (test_bit(I_Trunc, &LAFSI(ino)->iflags) &&
1805                LAFSI(ino)->trunc_next < trunc_block) {
1806                 prepare_to_wait(&fs->async_complete, &wq,
1807                                 TASK_UNINTERRUPTIBLE);
1808                 lafs_inode_handle_orphan(db);
1809                 if (test_bit(B_Orphan, &db->b.flags))
1810                         schedule();
1811         }
1812         finish_wait(&fs->async_complete, &wq);
1813
1814         /* There is nothing we can do about errors here.  The
1815          * most likely are ENOMEM which itself is very unlikely.
1816          * If this doesn't get registered as an orphan .... maybe
1817          * it will have to wait until something else truncates it.
1818          */
1819         lafs_make_orphan(fs, db);
1820
1821         if (!test_and_set_bit(I_Trunc, &LAFSI(ino)->iflags))
1822                 lafs_igrab_fs(ino);
1823         if (trunc_block == 0)
1824                 LAFSI(ino)->trunc_gen++;
1825         LAFSI(ino)->trunc_next = trunc_block;
1826         putdref(db, MKREF(trunc));
1827 }
1828
1829 const struct inode_operations lafs_special_ino_operations = {
1830         .setattr        = lafs_setattr,
1831         .truncate       = lafs_truncate,
1832 };