roll.c

   1
   2 /*
   3  * fs/lafs/roll.c
   4  * Copyright (C) 2005-2010
   5  * Neil Brown <neilb@suse.de>
   6  * Released under the GPL, version 2
   7  *
   8  * 'rollforward'
   9  *
  10  * This file handles mounting of a filesystem once the superblocks
  11  * have been loaded.
  12  * It loads the root inode (the root of the filesystem, not of the
  13  * directory tree) and then handles roll-forward to pick up and changes
  14  * there are not in the filesystem yet, either due to a crash, or because
  15  * they cannot be consistently stored easily (final segusage/quota info).
  16  *
  17  * Roll-forward reads write-cluster header and handle things as appropriate.
  18  * Data blocks are only processed if they belong to:
  19  *   - regular files
  20  *   - segusage files
  21  *   - quota files.
  22  * A data block in a regular file implies an extension of the file size
  23  * to the end of the block, if it was previously at or before the start
  24  * of the block.  Datablocks that were just moved for cleaning are
  25  * ignored.
  26  *
  27  * Index blocks are always ignored - they need to be recalculated.
  28  *
  29  * 'miniblocks' or 'updates' are always processed - they represent an
  30  * atomic update that might affect multiple files - those files for which
  31  * data blocks are ignored.
  32  * Updates are understood:
  33  * - for inodes.  The update simply over-writes part of the inode metadata,
  34  *       which could affect the link count or size.  Such inodes become
  35  *       orphans in case truncation or deletion is needed.  This can create
  36  *       an inode which might affect the inode usage map.
  37  * - for directories.  The update identifies a name and an inode number.
  38  *       This can imply a change to the inode's link count and again could
  39  *       make it an orphan.  In some cases updates are paired, possibly across
  40  *       different directories.  This is needed for 'rename'.
  41  *
  42  * Each write-cluster has three levels of validation.
  43  *  Firstly, if the header is internally consistent, with correct tag,
  44  *   uuid, and sequence, then we know a write was attempted, and anything that
  45  *   must be written before that was successfully written.
  46  *  Secondly, if the header has a correct checksum, then it is all correct,
  47  *   and the miniblocks are valid.
  48  *  Thirdly, if the next or next-but-one header (depending on verify_type) is
  49  *   internally consistent, than we know that the data blocks in this cluster
  50  *   were all written successfully.
  51  */
  52
  53 #include        "lafs.h"
  54 #include        <linux/slab.h>
  55
  56 static int
  57 roll_valid(struct fs *fs, struct cluster_head *ch, unsigned long long addr)
  58 {
  59         /* return 1 if the cluster_head looks locally valid.
  60          * Don't check checksum as we may not have the whole head
  61          */
  62         if (memcmp(ch->idtag, "LaFSHead", 8) != 0)
  63                 return 0;
  64         if (memcmp(fs->state->uuid, ch->uuid, 16) != 0)
  65                 return 0;
  66         if (le64_to_cpu(ch->this_addr) != addr)
  67                 return 0;
  68         switch (le16_to_cpu(ch->verify_type)) {
  69         case VerifyNull:
  70         case VerifyNext:
  71         case VerifyNext2:
  72                 break;
  73         default:
  74                 return 0;
  75         }
  76         if (ch->pad0 != 0)
  77                 return 0;
  78         if (le16_to_cpu(ch->Clength) > fs->max_segment)
  79                 return 0;
  80         return 1;
  81 }
  82
  83 /*
  84  * roll_locate scopes out the full extent of the required roll-forward.
  85  * It start at the start of the last checkpoint (recorded in the stateblock)
  86  * and checks that the end of the checkpoint exists, and continues following
  87  * the chain as far as valid cluster heads can be found.
  88  * roll_locate returns 0 if proper endpoints were found,
  89  * or -EIO if CheckpointStart and CheckpointEnd weren't found properly
  90  * "next" will contain the address of the next cluster to be written to,
  91  * "last" the cluster before that, and "seq" the seq number for next cluster
  92  * "maxp" will be used to report the maximum size of a cluster head.
  93  */
  94 static int
  95 roll_locate(struct fs *fs, u64 start,
  96             u64 *nextp, u64 *lastp, u64 *seqp,
  97             int *maxp, struct page *p)
  98 {
  99         struct cluster_head *ch;
 100         u64 this, prev, prev2, last, next;
 101         u64 seq = 0;
 102         int max = 0;
 103         int prevtype, prev2type;
 104
 105         ch = (struct cluster_head *)page_address(p);
 106
 107         this = start; prev = start;
 108
 109         /* First we walk through the checkpoint section, which should
 110          * all be valid.
 111          */
 112         do {
 113                 if (lafs_load_page(fs, p, this, 1) != 0) {
 114                         printk(KERN_ERR "LaFS: Could not read cluster %llu\n",
 115                                (unsigned long long) this);
 116                         return -EIO;
 117                 }
 118                 if (!roll_valid(fs, ch, this)) {
 119                         printk(KERN_ERR "LaFS: Bad cluster at %llu\n",
 120                                (unsigned long long) this);
 121                         return -EIO;
 122                 }
 123                 if (this == start) {
 124                         seq = le64_to_cpu(ch->seq);
 125                         if (!(ch->flags & CH_CheckpointStart)) {
 126                                 printk(KERN_ERR "LaFS: Cluster at %llu not CheckpointStart!!\n",
 127                                        (unsigned long long)this);
 128                                 return -EIO;
 129                         }
 130                 } else if (seq != le64_to_cpu(ch->seq)) {
 131                         printk(KERN_ERR "LaFS: Cluster sequence bad at %llu: %llu->%llu\n",
 132                                (unsigned long long)this,
 133                                (unsigned long long)seq,
 134                                (unsigned long long)le64_to_cpu(ch->seq));
 135                         return -EIO;
 136                 }
 137
 138                 if (this != start && le64_to_cpu(ch->prev_addr) != prev) {
 139                         printk(KERN_ERR "LaFS: Cluster Linkage error at %llu: %llu != %llu\n",
 140                                (unsigned long long)this,
 141                                (unsigned long long)le64_to_cpu(ch->prev_addr),
 142                                (unsigned long long)prev);
 143                         return -EIO;
 144                 }
 145                 if (!ch->flags & CH_Checkpoint) {
 146                         printk(KERN_ERR "LaFS: Cluster %llu not a Checkpoint cluster\n",
 147                                (unsigned long long)this);
 148                         return -EIO;
 149                 }
 150                 dprintk("Found seq %llu at %llu\n",
 151                         (unsigned long long)seq, (unsigned long long)this);
 152                 if (le16_to_cpu(ch->Hlength) > max)
 153                         max = le16_to_cpu(ch->Hlength);
 154                 prev = this;
 155                 this = le64_to_cpu(ch->next_addr);
 156                 seq++;
 157         } while (!(ch->flags & CH_CheckpointEnd));
 158
 159         /* 'seq' is sequence number of 'this' */
 160         dprintk("CheckpointEnd found at %llu, seq %llu\n", prev, seq-1);
 161
 162         /* now we need to step forward a bit more carefully, as any
 163          * cluster we find now could easily be bad.
 164          * We keep:
 165          *   this - address of cluster we are now considering
 166          *   prev - address of previous cluster
 167          *   prevtype - verify type of previous cluster
 168          *   prev2 - address of cluster before prev
 169          *   prev2type - verify type of that cluster.
 170          *   start - "next_addr" entry from last known-good cluster
 171          *
 172          *
 173          */
 174
 175         last = prev;
 176         next = this;
 177         prev2 = prev;
 178         prevtype = prev2type = VerifyNull;
 179
 180         while (1) {
 181                 if (lafs_load_page(fs, p, this, 1) != 0)
 182                         break;
 183                 if (!roll_valid(fs, ch, this))
 184                         break;
 185                 if (le64_to_cpu(ch->prev_addr) != prev)
 186                         break;
 187                 if (le64_to_cpu(ch->seq) != seq)
 188                         break;
 189
 190                 /* this head looks valid, so we can possibly verify previous
 191                  * clusters
 192                  */
 193                 if (le16_to_cpu(ch->Hlength) > max)
 194                         max = le16_to_cpu(ch->Hlength);
 195
 196                 if (prev2type == VerifyNext2) {
 197                         next = prev;
 198                         last = prev2;
 199                 }
 200                 if (prevtype == VerifyNext) {
 201                         next = this;
 202                         last = prev;
 203                 }
 204
 205                 /* shift prev info back */
 206                 prev2 = prev;
 207                 prev2type = prevtype;
 208                 prev = this;
 209                 prevtype = le16_to_cpu(ch->verify_type);
 210                 this = le64_to_cpu(ch->next_addr);
 211                 if (prevtype == VerifyNull) {
 212                         next = this;
 213                         last = prev;
 214                 }
 215                 seq++;
 216         }
 217
 218         dprintk("LaFS: Next address to write is %llu\n", next);
 219         *nextp = next;
 220         *lastp = last;
 221         if (next == this)
 222                 *seqp = seq;
 223         else if (next == prev)
 224                 *seqp = seq-1;
 225         else if (next == prev2)
 226                 *seqp = seq-2;
 227         else
 228                 BUG();
 229         *maxp = max;
 230         return 0;
 231 }
 232
 233 static int __must_check
 234 roll_mini(struct fs *fs, int fsnum, int inum, int trunc,
 235           u32 bnum, int offset, int len, char *data)
 236 {
 237         struct inode *inode;
 238         struct inode *fsinode;
 239         struct lafs_inode *li;
 240         struct datablock *db = NULL;
 241         int err = 0;
 242         void *buf;
 243
 244         dprintk("Roll Mini  %d/%d/%lu/%d,%d\n",
 245                 fsnum, inum, (unsigned long) bnum,
 246                 offset, len);
 247
 248         /* The handling of miniblock updates is quite different for
 249          * different objects.
 250          *
 251          * inode-files: meta-data updates, including size, are allowed.
 252          *     index update and data update are not (data update must
 253          *      go through the file).  Implied creation requires
 254          *      orphan handling
 255          * regular-files: just over-write data, possibly extending size
 256          * symlink,dev,pipe: as with reg-files
 257          * directory: add/remove entries.
 258          */
 259
 260         inode = lafs_iget_fs(fs, fsnum, inum, SYNC);
 261         if (IS_ERR(inode))
 262                 return PTR_ERR(inode);
 263
 264         li = LAFSI(inode);
 265
 266         switch (li->type) {
 267         default: /* Any unknown type is an error */
 268                 printk(KERN_WARNING "LAFS impossibly file type for roll-forward: %d\n",
 269                        li->type);
 270                 err = -EIO;
 271                 break;
 272
 273         case TypeInodeFile:
 274
 275                 if (fsnum) {
 276                         printk(KERN_WARNING "LAFS: Ignoring impossible sub-subset\n");
 277                         break;
 278                 }
 279
 280                 fsinode = inode;
 281                 inode = lafs_iget_fs(fs, inum, bnum, SYNC);
 282                 if (IS_ERR(inode)) {
 283                         struct super_block *sb;
 284                         err = PTR_ERR(inode);
 285                         if (err != -ENOENT || offset != 0) {
 286                                 lafs_iput_fs(fsinode);
 287                                 return err;
 288                         }
 289
 290                         db = lafs_get_block(fsinode, bnum, NULL, GFP_KERNEL,
 291                                             MKREF(roll));
 292                         sb = lafs_get_subset_sb(fsinode);
 293                         lafs_inode_inuse(fs, sb, bnum);
 294                         deactivate_super(sb);
 295                         lafs_iput_fs(fsinode);
 296                         if (!db)
 297                                 db = ERR_PTR(-ENOMEM);
 298                 } else {
 299                         lafs_iput_fs(fsinode);
 300                         db = lafs_inode_dblock(inode, SYNC, MKREF(roll));
 301                         if (!IS_ERR(db))
 302                                 /* Make sure block is in-sync with inode */
 303                                 lafs_inode_fillblock(inode);
 304                 }
 305                 if (IS_ERR(db)) {
 306                         err = PTR_ERR(db);
 307                         break;
 308                 }
 309                 /* Should normally iolock the block, but we don't
 310                  * need that during roll-forward */
 311                 set_bit(B_PinPending, &db->b.flags);
 312                 lafs_pin_dblock(db, CleanSpace);
 313                 buf = map_dblock(db);
 314                 memcpy(buf+offset, data, len);
 315                 unmap_dblock(db, buf);
 316                 if (inode)
 317                         err = lafs_import_inode(inode, db);
 318                 else {
 319                         inode = lafs_iget_fs(fs, inum, bnum, SYNC);
 320
 321                 }
 322                 lafs_dirty_dblock(db);
 323                 break;
 324
 325         case TypeDir:
 326                 /* Haven't written this yet FIXME */
 327                 BUG();
 328                 break;
 329         }
 330         /* We borrow the orphan list to keep a reference on
 331          * this inode until all processing is finished
 332          * to make sure inodes that are about to get linked
 333          * don't get deleted early
 334          */
 335         if (inode->i_nlink == 0) {
 336                 if (!db)
 337                         db = lafs_inode_get_dblock(inode, MKREF(roll));
 338                 if (db &&
 339                     list_empty(&db->orphans)) {
 340                         list_add(&db->orphans, &fs->pending_orphans);
 341                         lafs_igrab_fs(inode);
 342                         getdref(db, MKREF(roll_orphan));
 343                 }
 344         }
 345         putdref(db, MKREF(roll));
 346         lafs_iput_fs(inode);
 347         return err;
 348 }
 349
 350 static int __must_check
 351 roll_block(struct fs *fs, int fsnum, int inum, int trunc,
 352            u32 bnum, u64 baddr, int bytes, struct page *p)
 353 {
 354         struct inode *inode;
 355         struct datablock *blk = NULL;
 356         struct lafs_inode *li;
 357         int err = 0;
 358
 359         /* We found this block during roll-forward and need to
 360          * include it in the filesystem.
 361          * If 'bytes' is 0, the this is a 'hole' and we should
 362          * ignore baddr
 363          */
 364         if (bytes == DescHole)
 365                 baddr = 0;
 366
 367         dprintk("Roll Block %d/%d/%lu/%llu\n",
 368                 fsnum, inum, (unsigned long) bnum,
 369                 (unsigned long long)baddr);
 370
 371         /* find/load the inode */
 372         inode = lafs_iget_fs(fs, fsnum, inum, SYNC);
 373         if (IS_ERR(inode))
 374                 return PTR_ERR(inode);
 375
 376         /* check type */
 377         li = LAFSI(inode);
 378
 379         dprintk("Got the inode, type %d %p size %llu\n", li->type,
 380                 inode, inode->i_size);
 381
 382         switch (li->type) {
 383                 struct la_inode *lai;
 384                 int mdsize;
 385
 386         default: /* most filetypes are simply ignored */
 387                 break;
 388
 389         case TypeInodeFile:
 390                 /* The only part of an inode that might be interesting
 391                  * is embedded data: All metadata changes get logged
 392                  * as miniblocks.
 393                  * Further the data can only be interesting for non-directories,
 394                  * as directory updates are also logged as miniblocks.
 395                  * So if this is a depth==0 non-directory inode,
 396                  * treat the data as a miniblock update.
 397                  */
 398                 if (bytes != fs->blocksize)
 399                         break;
 400                 err = lafs_load_page(fs, p, baddr, 1);
 401                 dprintk("inode load page err %d\n", err);
 402                 if (err)
 403                         break;
 404                 lai = (struct la_inode *)page_address(p);
 405                 mdsize = le16_to_cpu(lai->metadata_size);
 406                 if (lai->filetype >= TypeBase &&
 407                     lai->filetype != TypeDir  &&
 408                     lai->depth == 0 &&
 409                     mdsize > 1 && mdsize < fs->blocksize) {
 410                         u64 sz = le64_to_cpu(lai->metadata[0].file.size);
 411                         if (sz <= fs->blocksize - mdsize)
 412                                 err = roll_mini(fs, inum, bnum, -1, 0, 0,
 413                                                 (int)sz,
 414                                                 page_address(p) + mdsize);
 415                 }
 416                 break;
 417
 418         case TypeSegmentMap:
 419         case TypeQuota:
 420                 /* These only get merged while in a checkpoint. */
 421                 if (fs->qphase == fs->phase)
 422                         break;
 423                 /* FALL THROUGH */
 424         case TypeFile:
 425         case TypeSymlink:
 426                 /* merge into the file and possibly extend inode.size
 427                  * Only extend the size if it was before this block.
 428                  * i.e. if size was to the middle of this block, we don't
 429                  * extend the size
 430                  */
 431                 dprintk("FILE type\n");
 432                 err = -ENOMEM;
 433                 blk = lafs_get_block(inode, bnum, NULL, GFP_KERNEL,
 434                                      MKREF(roll));
 435                 if (!blk)
 436                         break;
 437
 438                 err = lafs_find_block(blk, ADOPT);
 439                 if (err)
 440                         break;
 441                 if (blk->b.physaddr == baddr)
 442                         /* already correctly indexed */
 443                         break;
 444
 445                 if (li->type >= TypeBase && bytes != DescHole &&
 446                     inode->i_size <= ((loff_t)bnum << inode->i_blkbits)) {
 447                         inode->i_size = ((loff_t)bnum << inode->i_blkbits) + bytes;
 448                         set_bit(I_Dirty, &LAFSI(inode)->iflags);
 449                 }
 450
 451                 /* FIXME: we pretend this is a dirty, pinned block
 452                  * so the lower-level code doesn't get confused.
 453                  * Is this really the best approach?
 454                  * Do I need to release some space here?
 455                  */
 456                 set_bit(B_PinPending, &blk->b.flags); /* Don't need iolock as no io yet */
 457                 lafs_pin_dblock(blk, CleanSpace); /* cannot fail during ! ->rolled */
 458
 459                 lafs_iolock_block(&blk->b);
 460                 lafs_summary_update(fs, blk->b.inode, blk->b.physaddr, baddr,
 461                                     0, fs->phase, 1);
 462                 blk->b.physaddr = baddr;
 463                 lafs_dirty_iblock(blk->b.parent, 0);
 464                 /* FIXME maybe set Writeback and unlock */
 465                 if (lafs_add_block_address(fs, &blk->b) == 0)
 466                         /* FIXME if the table becomes full, we have a problem... */
 467                         LAFS_BUG(1, &blk->b);
 468                 dprintk("Allocated block %lu to %llu\n",
 469                         (unsigned long)bnum, baddr);
 470                 /* FIXME maybe clear Writeback instead */
 471                 lafs_iounlock_block(&blk->b);
 472
 473                 clear_bit(B_PinPending, &blk->b.flags);
 474                 /* If we had previously read this block for some reason,
 475                  * the contents are now invalid.  If they are dirty,
 476                  * we have a real problem as those changes cannot be saved.
 477                  */
 478                 LAFS_BUG(test_bit(B_Dirty, &blk->b.flags), &blk->b);
 479                 clear_bit(B_Valid, &blk->b.flags);
 480
 481                 break;
 482         }
 483         if (blk)
 484                 putdref(blk, MKREF(roll));
 485
 486         if (inode->i_nlink == 0) {
 487                 struct datablock *db = lafs_inode_get_dblock(inode, MKREF(roll));
 488                 if (db &&
 489                     list_empty(&db->orphans)) {
 490                         list_add(&db->orphans, &fs->pending_orphans);
 491                         lafs_igrab_fs(inode);
 492                         getdref(db, MKREF(roll_orphan));
 493                 }
 494                 putdref(db, MKREF(roll));
 495         }
 496         lafs_iput_fs(inode);
 497         dprintk("leaving with error %d\n", err);
 498         return err;
 499 }
 500
 501 static int __must_check
 502 roll_one(struct fs *fs, u64 *addrp, struct page *p, struct page *pg,
 503          int max)
 504 {
 505         u64 addr = *addrp;
 506         struct cluster_head *ch = (struct cluster_head *)page_address(p);
 507         struct group_head *gh;
 508         struct descriptor *desc;
 509         int i;
 510         u64 baddr = addr;
 511         int err;
 512         int blocksize = fs->blocksize;
 513
 514         /* we "know" buf is big enough */
 515         err = lafs_load_page(fs, p, addr, max/blocksize);
 516         if (err)
 517                 return err;
 518
 519         /* just minimal checks, as we have looked at this already */
 520         if (!roll_valid(fs, ch, addr))
 521                 return -EIO;
 522         if (lafs_calc_cluster_csum(ch) != ch->checksum)
 523                 return -EIO;
 524         *addrp = le64_to_cpu(ch->next_addr);
 525
 526         if (le16_to_cpu(ch->Hlength) > max)
 527                 return -EIO;
 528
 529         baddr += (le16_to_cpu(ch->Hlength) + blocksize - 1) / blocksize;
 530
 531         if (!(ch->flags & CH_Checkpoint))
 532                 fs->qphase = fs->phase;
 533
 534         gh = ch->groups;
 535         i = 0;
 536         while (((char *)gh - (char *)ch) < le16_to_cpu(ch->Hlength)) {
 537                 int j = 0;
 538                 int inum = le32_to_cpu(gh->inum);
 539                 int fsnum = le32_to_cpu(gh->fsnum);
 540                 int trunc = le16_to_cpu(gh->truncatenum_and_flag) & 0x7fff;
 541                 int flg   = le16_to_cpu(gh->truncatenum_and_flag) & 0x8000;
 542
 543                 desc = gh->u.desc;
 544                 while (((char *)desc - (char *)gh) <
 545                        le16_to_cpu(gh->group_size_words)*4) {
 546                         if (le16_to_cpu(desc->block_bytes) <= DescMiniOffset ||
 547                             le16_to_cpu(desc->block_bytes) == DescIndex) {
 548                                 u32 bnum = le32_to_cpu(desc->block_num);
 549                                 int cnt = le16_to_cpu(desc->block_cnt);
 550                                 int bytes = le16_to_cpu(desc->block_bytes);
 551
 552                                 if (le16_to_cpu(desc->block_bytes) == DescIndex
 553                                     && cnt != 1)
 554                                         return -EIO; /* FIXME is this
 555                                                       * the best
 556                                                       * response */
 557                                 /* FIXME range check count */
 558                                 while (!err && cnt--) {
 559                                         if (!flg && bytes != DescIndex)
 560                                                 err = roll_block(fs, fsnum, inum, trunc,
 561                                                                  bnum, baddr,
 562                                                                  cnt == 0 || bytes == DescHole
 563                                                                  ? bytes
 564                                                                  : blocksize,
 565                                                                  pg);
 566                                         bnum++;
 567                                         if (bytes != DescHole)
 568                                                 baddr++;
 569                                 }
 570                                 /* FIXME allow for striping */
 571                                 desc++;
 572                         } else {
 573                                 struct miniblock *mb = (struct miniblock *)desc;
 574                                 u32 bnum = le32_to_cpu(mb->block_num);
 575                                 int offset = le16_to_cpu(mb->block_offset);
 576                                 int len = le16_to_cpu(mb->length)
 577                                         - DescMiniOffset;
 578                                 if (!flg)
 579                                         err = roll_mini(fs, fsnum, inum, trunc,
 580                                                         bnum, offset, len, (char *)(mb+1));
 581
 582                                 mb++;
 583                                 mb = (struct miniblock *)(((char*)mb)
 584                                                           + ROUND_UP(len));
 585                                 desc = (struct descriptor *)mb;
 586                         }
 587                         j++;
 588                         if (err)
 589                                 break;
 590                 }
 591                 gh = (struct group_head *)desc;
 592                 i++;
 593                 if (err)
 594                         break;
 595         }
 596         if (ch->flags & CH_CheckpointEnd)
 597                 fs->qphase = fs->phase;
 598         return err;
 599 }
 600
 601 static int roll_forward(struct fs *fs)
 602 {
 603         u64 first, next = 0, last = 0, seq = 0;
 604         int max = 0;
 605         struct page *p, *pg;
 606         int err;
 607         int blocksize = fs->blocksize;
 608         int dev;
 609         u32 seg;
 610         u32 offset;
 611
 612         fs->phase = 1;
 613         fs->qphase = 0;
 614         fs->checkpointing = CH_Checkpoint;
 615         clear_bit(DelayYouth, &fs->fsstate);
 616
 617         first = fs->checkpointcluster;
 618         p = alloc_page(GFP_KERNEL);
 619         if (!p)
 620                 return -ENOMEM;
 621
 622         err = roll_locate(fs, first, &next, &last, &seq, &max, p);
 623
 624         max = ((max + blocksize - 1) / blocksize) * blocksize;
 625
 626         if (!err && max > PAGE_SIZE)
 627                 err = -EFBIG;
 628         if (err) {
 629                 put_page(p);
 630                 return err;
 631         }
 632
 633         pg = alloc_page(GFP_KERNEL);
 634         if (!pg) {
 635                 put_page(p);
 636                 return -ENOMEM;
 637         }
 638
 639         err = lafs_cluster_init(fs, 0, next, last, seq);
 640         if (err) {
 641                 put_page(p); put_page(pg);
 642                 return err;
 643         }
 644         lafs_cluster_init(fs, 1, 0, 0, 0);
 645
 646         virttoseg(fs, first, &dev, &seg, &offset);
 647
 648         while (first != next) {
 649                 int dev2;
 650                 u32 seg2;
 651
 652                 virttoseg(fs, first, &dev2, &seg2, &offset);
 653                 err = roll_one(fs, &first, p, pg, max);
 654                 if (err)
 655                         break;
 656
 657                 if (fs->qphase == fs->phase &&
 658                     fs->checkpointing) {
 659                         fs->checkpointing = 0;
 660                         clear_bit(DelayYouth, &fs->fsstate);
 661                         lafs_seg_apply_all(fs);
 662                 }
 663
 664                 if (dev2 != dev || seg2 != seg) {
 665                         /* New segment - need to make sure youth is correct */
 666                         dev = dev2;
 667                         seg = seg2;
 668                         /* if fs->checkpointing, seg_apply_all will do the youth
 669                          * update
 670                          */
 671                         if (fs->checkpointing == 0)
 672                                 lafs_update_youth(fs, dev, seg);
 673                 }
 674         }
 675         put_page(p);
 676         put_page(pg);
 677
 678         lafs_add_active(fs, next);
 679
 680         /* Now we release all the nlink==0 inodes that we found */
 681         while (!list_empty(&fs->pending_orphans)) {
 682                 struct datablock *db = list_entry(fs->pending_orphans.next,
 683                                                   struct datablock,
 684                                                   orphans);
 685                 list_del_init(&db->orphans);
 686                 lafs_iput_fs(db->my_inode);
 687                 putdref(db, MKREF(roll_orphan));
 688         }
 689         fs->rolled = 1;
 690         return err;
 691 }
 692
 693 int
 694 lafs_mount(struct fs *fs)
 695 {
 696         struct datablock *b;
 697         struct inode *root;
 698         struct inode *rootdir;
 699         struct dentry *de;
 700         int err;
 701         int d;
 702         struct sb_key *k = fs->prime_sb->s_fs_info;
 703         int orphan_count;
 704
 705         fs->rolled = 0;
 706         fs->ss[0].root = root = iget_locked(fs->prime_sb, 0);
 707         k->root = root;
 708
 709         err = -ENOMEM;
 710         b = lafs_get_block(root, 0, NULL, GFP_KERNEL, MKREF(mount));
 711         if (!b)
 712                 goto err;
 713         set_bit(B_Root, &b->b.flags);
 714         b->b.physaddr = fs->ss[0].root_addr;
 715         set_bit(B_PhysValid, &b->b.flags);
 716         err = lafs_load_block(&b->b, NULL);
 717         if (err)
 718                 goto err;
 719         err = lafs_wait_block(&b->b);
 720         if (err)
 721                 goto err;
 722
 723         err = lafs_import_inode(root, b);
 724         if (err)
 725                 goto err;
 726         putdref(b, MKREF(mount));
 727         b = NULL;
 728
 729         unlock_new_inode(root);
 730         /* FIXME lots of error checking */
 731
 732         rootdir = lafs_iget(fs->prime_sb, 2, SYNC);
 733         err = PTR_ERR(rootdir);
 734         if (IS_ERR(rootdir))
 735                 goto err;
 736         de = d_alloc_root(rootdir);
 737         err = PTR_ERR(de);
 738         if (IS_ERR(de))
 739                 goto err;
 740         fs->prime_sb->s_root = de;
 741
 742         fs->orphans = lafs_iget(fs->prime_sb, 8, SYNC);
 743         for (d = 0; d < fs->devices ; d++) {
 744                 fs->devs[d].segsum = lafs_iget(fs->prime_sb,
 745                                                fs->devs[d].usage_inum,
 746                                                SYNC);
 747                 /* FIXME check this is a segusage file */
 748         }
 749         orphan_count = lafs_count_orphans(fs->orphans);
 750         LAFSI(fs->orphans)->md.orphan.nextfree = orphan_count;
 751
 752         lafs_checkpoint_lock(fs);
 753         err = roll_forward(fs);
 754         lafs_checkpoint_unlock(fs);
 755
 756         lafs_add_orphans(fs, fs->orphans, orphan_count);
 757
 758         for (d = 0; d < 4; d++) {
 759                 fs->cleaner.seg[d].chead = alloc_page(GFP_KERNEL);
 760                 INIT_LIST_HEAD(&fs->cleaner.seg[d].cleaning);
 761         }
 762         return err;
 763
 764 err:
 765         putdref(b, MKREF(mount));
 766         return err;
 767 }