super.c

   1
   2 /*
   3  * fs/lafs/super.c
   4  * Copyright (C) 2005-2009
   5  * Neil Brown <neilb@suse.de>
   6  * Released under the GPL, version 2
   7  */
   8
   9 #include        "lafs.h"
  10 #include        <linux/namei.h>
  11 #include        <linux/crc32.h>
  12 #include        <linux/statfs.h>
  13 #include        <linux/mount.h>
  14 #include        <linux/exportfs.h>
  15 #include        <linux/slab.h>
  16
  17 static struct super_operations lafs_sops;
  18 static const struct export_operations lafs_export_ops;
  19
  20 /*---------------------------------------------------------------------
  21  * Write out state and super blocks
  22  *  The super blocks only need to be written when the geometry of the
  23  *  array changes such as when a device is added, removed, or resized.
  24  *  So we don't bother with that just yet.
  25  *  The state block needs to be written - twice on each device - whenever
  26  *  a checkpoint is completed.  All copies are identical and the writes
  27  *  proceed in parallel.  There are 4 stateblock locations on each device.
  28  *  2 are typically less recent than the other two.  We over-write the
  29  *  less-recent copies.
  30  *  FIXME on a RAID4 we should pad the write to be a full stripe.
  31  *
  32  * Locking issues:  This is called from the checkpoint thread and so
  33  *  it does not race with anything else exclusive to that thread.
  34  *  The nonlog information needs to be reviewed once that functionality
  35  *  is implemented.
  36  */
  37
  38 int lafs_write_state(struct fs *fs)
  39 {
  40         struct lafs_state *st;
  41         int i, d;
  42
  43         fs->seq++;
  44         st = fs->state;
  45         st->seq = cpu_to_le32(fs->seq);
  46         st->nonlog_segment = cpu_to_le32(fs->nonlog_segment);
  47         st->nonlog_dev = cpu_to_le16(fs->nonlog_dev);
  48         st->nonlog_offset = cpu_to_le16(fs->nonlog_offset);
  49         st->nextyouth = cpu_to_le16(fs->youth_next);
  50         st->checkpointcluster = cpu_to_le64(fs->checkpointcluster);
  51         for (i = 0; i < fs->maxsnapshot; i++)
  52                 st->root_inodes[i] = cpu_to_le64(fs->ss[i].root_addr);
  53
  54         st->checksum = 0;
  55         st->checksum = crc32_le(0, (unsigned char *)st, fs->statesize);
  56
  57         for (d = 0; d < fs->devices ; d++)
  58                 for (i = (fs->seq & 1); i < 4 ; i += 2)
  59                         lafs_super_write(fs, d, fs->devs[d].stateaddr[i] >> 9,
  60                                          (char *)st, fs->statesize);
  61         lafs_super_wait(fs);
  62         /* FIXME what about a write error ??? */
  63         return 0;
  64 }
  65
  66 static int
  67 valid_devblock(struct lafs_dev *db, sector_t addr)
  68 {
  69         /* check that this devblock is valid, given that
  70          * it was found at sector 'addr'
  71          */
  72         u32 crc, crc2;
  73         if (strncmp(db->idtag, "LaFS-DeviceBlock", 16) != 0)
  74                 return 0;
  75         if (strncmp(db->version, "AlphaDevel      ", 16) != 0)
  76                 return 0;
  77         /* uuid can be anything */
  78         crc = db->checksum;
  79         db->checksum = 0;
  80         crc2 = crc32_le(0, (unsigned char *)db, LAFS_DEVBLK_SIZE);
  81         db->checksum = crc;
  82         if (crc2 != crc) {
  83                 dprintk("%lx != %lx\n", (unsigned long)crc,
  84                         (unsigned long)crc2);
  85                 return 0;
  86         }
  87
  88         addr = addr << 9; /* convert to byte */
  89         if (le64_to_cpu(db->devaddr[0]) != addr &&
  90             le64_to_cpu(db->devaddr[1]) != addr)
  91                 return 0;
  92
  93         if (db->statebits < 10 || db->statebits > 16)
  94                 return 0;
  95         if (db->blockbits < 9 || db->blockbits > 20)
  96                 return 0;
  97         if (db->width < 1 || db->width > 500)
  98                 return 0;
  99         if (db->stride < 1)
 100                 return 0;
 101         /* devaddr[0] must be early, [1] must be late */
 102         if (le64_to_cpu(db->devaddr[0]) >=
 103             le64_to_cpu(db->segment_offset))
 104                 return 0;
 105
 106         if (le64_to_cpu(db->devaddr[1]) <
 107             le64_to_cpu(db->segment_offset) +
 108             ((((sector_t)le32_to_cpu(db->segment_count)
 109                * le32_to_cpu(db->segment_size)))
 110              << le32_to_cpu(db->blockbits)))
 111                 return 0;
 112
 113         /* we should be fairly flexable about addresses of state blocks,
 114          * we should probably allow more, and we should just make sure
 115          * they do not overlap any true segments....
 116          * FIXME
 117          */
 118
 119         /* 2 is an absolute minimum segment size, a few hundred is more
 120          * likely. We'll put a lower limit of 8, and an upper of 800000
 121          */
 122         if (le32_to_cpu(db->segment_size) < 8 ||
 123             le32_to_cpu(db->segment_size) > 800000)
 124                 return 0;
 125
 126         if (le32_to_cpu(db->segment_offset) >
 127             (le32_to_cpu(db->segment_size)<<db->blockbits) * 10)
 128                 return 0;
 129
 130         /* FIXME should range check segment_count, but need to know
 131          * size for that */
 132         if (le32_to_cpu(db->level) > 10)
 133                 return 0;
 134
 135         /* I guess it look sane enough... */
 136         return 1;
 137 }
 138
 139 static int
 140 compare_dev(struct lafs_dev *orig, struct lafs_dev *new)
 141 {
 142         /* Both these are known to be valid.
 143          * Return:
 144          *   0 if they are for same filesystem, but 'new' is older
 145          *   1 if they are for same filesystem, and 'new' is newer
 146          *  -1 if they are for different filesystems
 147          */
 148         if (memcmp(orig->uuid, new->uuid, 16))
 149                 return -1;
 150         if (u32_after(le32_to_cpu(new->seq),
 151                       le32_to_cpu(orig->seq)))
 152                 return 1;
 153         return 0;
 154 }
 155
 156 static int
 157 valid_stateblock(struct lafs_state *st, struct lafs_dev *dv)
 158 {
 159         /* Given the 'dv' devblock, make sure 'st' is a valid
 160          * and consistent stateblock
 161          */
 162         u32 crc;
 163         if (strncmp(st->idtag, "LaFS-State-Block", 16) != 0)
 164                 return 0;
 165         if (strncmp(st->version, "AlphaDevel      ", 16) != 0)
 166                 return 0;
 167         crc = st->checksum;
 168         st->checksum = 0;
 169         if (crc32_le(0, (unsigned char *)st, 1<<dv->statebits) != crc)
 170                 return 0;
 171         st->checksum = crc;
 172
 173         if (memcmp(st->uuid, dv->uuid, 16))
 174                 return 0;
 175         /* FIXME cannot quite be that big! */
 176         if (le32_to_cpu(st->maxsnapshot) > (1<<(dv->statebits-3)))
 177                 return 0;
 178
 179         return 1;
 180 }
 181
 182 static int
 183 compare_state(struct lafs_state *orig, struct lafs_state *new)
 184 {
 185         /* return 1 if 'new' is actually newer than 'orig'.
 186          * We already know they are both valid and have the same
 187          * uuid... I don't think there is anything else to be checked
 188          */
 189         return u32_after(le32_to_cpu(new->seq), le32_to_cpu(orig->seq));
 190 }
 191
 192 /*
 193  * Mount options.
 194  * As we can have multiple devices, things are slightly non-obvious.
 195  * The 'devname' can be either a device name, starting '/', or
 196  * a filesytem name (not starting '/').
 197  * The 'data' is a standard comma-separated list of options.
 198  * For 'mount' these are:
 199  *    dev=/dev/X
 200  *              - devices in addition to 'dev_name'
 201  *    new=/dev/X
 202  *              - A new device, with a superblock already present, to be added.
 203  *    incomplete
 204  *              - don't complain if not all devices are given
 205  *    ?? quota stuff, cleaning parameters,
 206  *
 207  * For 'remount', options are
 208  *    dev=  - add another device
 209  *    new=  - the device is being added.
 210  *
 211  */
 212
 213 struct options {
 214         int devcnt;
 215         int curr_dev;
 216         int statebits, blockbits;
 217         struct devent {
 218                 const char *dev;
 219                 int is_new;
 220                 int is_name;
 221                 struct block_device *bdev;
 222                 struct lafs_dev *devblock;
 223                 struct lafs_state *stateblock;
 224                 int devchoice, statechoice;
 225         } *devlist;
 226         const char *name;
 227 };
 228 static int
 229 count_devs(const char *name, char *data)
 230 {
 231         int cnt = 0;
 232         if (*name == '/')
 233                 cnt = 1;
 234         while (data && *data) {
 235                 if (strncmp(data, "dev=", 4) == 0)
 236                         cnt++;
 237                 if (strncmp(data, "new=", 4) == 0)
 238                         cnt++;
 239                 data = strchr(data, ',');
 240                 if (data)
 241                         data++;
 242         }
 243         return cnt;
 244 }
 245
 246 static int
 247 parse_opts(struct options *op, const char *name, char *data)
 248 {
 249         int dv = 0;
 250         char *p;
 251
 252         memset(op, 0, sizeof(*op));
 253         op->devcnt = count_devs(name, data);
 254         op->devlist = kzalloc(op->devcnt*sizeof(op->devlist[0]), GFP_KERNEL);
 255
 256         if (!op->devlist)
 257                 return -ENOMEM;
 258
 259         op->name = NULL;
 260         if (*name == '/') {
 261                 op->devlist[dv].is_name = 1;
 262                 op->devlist[dv++].dev = name;
 263         } else
 264                 op->name = name;
 265         while ((p = strsep(&data, ",")) != NULL) {
 266                 if (!*p)
 267                         continue;
 268                 if (strncmp(p, "dev=", 4) == 0)
 269                         op->devlist[dv++].dev = p+4;
 270                 else if (strncmp(p, "new=", 4) == 0) {
 271                         op->devlist[dv].is_new = 1;
 272                         op->devlist[dv++].dev = p+4;
 273                 } else {
 274                         printk(KERN_ERR
 275                                "LaFS: Unrecognised mount option \"%s\"\n", p);
 276                         return -EINVAL;
 277
 278                 }
 279         }
 280         op->devcnt = dv;
 281
 282         return 0;
 283 }
 284
 285 static int
 286 lafs_load_super(struct block_device *bdev, void *opv, int silent)
 287 {
 288         /* Find the devblock and the stateblock for this device
 289
 290          * Only do basic internal consistancy checks.  Inter-device
 291          * checks happen later
 292          */
 293         struct options *op = opv;
 294         struct devent *dv;
 295         struct page *pg;
 296         sector_t sect, dev_addr = 0, state_addr = 0;
 297         int err = 0;
 298         unsigned int n;
 299         int i;
 300         int have_dev = 0, have_state = 0;
 301
 302         dv = &op->devlist[op->curr_dev];
 303         BUG_ON(dv->devblock);
 304         BUG_ON(dv->stateblock);
 305
 306         n = queue_logical_block_size(bdev->bd_disk->queue);
 307         if (n < LAFS_DEVBLK_SIZE)
 308                 n = LAFS_DEVBLK_SIZE;
 309         BUG_ON(n > PAGE_SIZE);
 310         dv->devblock = kmalloc(n, GFP_KERNEL);
 311         if (!dv->devblock)
 312                 return -ENOMEM;
 313         pg = alloc_page(GFP_KERNEL);
 314         if (!pg)
 315                 return -ENOMEM;
 316
 317         /* Now find a devblock, check the first two possible locations,
 318          * and the last two.  If two devblocks are found with different
 319          * uuids, we are confused!
 320          */
 321         sect = 0;
 322         for (i = 0; i < 4; i++) {
 323                 /* try to read block at 'sect' */
 324                 int ok = lafs_sync_page_io(bdev, sect, 0, n, pg, READ);
 325
 326                 if (ok && valid_devblock(page_address(pg), sect)) {
 327                         if (!have_dev) {
 328                                 have_dev = 1;
 329                                 memcpy(dv->devblock, page_address(pg), n);
 330                                 dev_addr = sect;
 331                         } else switch (compare_dev(dv->devblock,
 332                                                    page_address(pg))) {
 333                                 case 0: /* older, do nothing */
 334                                         break;
 335                                 case 1: /* newer, overwrite */
 336                                         memcpy(dv->devblock, page_address(pg), n);
 337                                         dev_addr = sect;
 338                                         break;
 339                                 default: /* inconsistent --- HELP */
 340                                         printk(KERN_ERR "LaFS: inconsistent device-blocks found.\n");
 341                                         err = -EINVAL;
 342                                         goto out;
 343                                 }
 344                 }
 345
 346                 if (i != 1)
 347                         sect += (n>>9);
 348                 else {
 349                         sect = bdev->bd_inode->i_size & ~(sector_t)(n-1);
 350                         sect >>= 9;
 351                         sect -= (n>>9)*2;
 352                 }
 353         }
 354         /* FIXME - we've lost the read error, if it was significant */
 355         err = -EINVAL;
 356         if (!have_dev) {
 357                 if (!silent)
 358                         printk(KERN_ERR "LaFS - no valid devblock found.\n");
 359                 goto out;
 360         }
 361
 362         /* OK, we have a valid devblock, that's nice.
 363          * Now we should be able to find some stateblocks.
 364          * The locations are in the devblock
 365          */
 366         n = le32_to_cpu(1<<dv->devblock->statebits);
 367         if ((n & (n-1)) ||
 368             n < queue_logical_block_size(bdev->bd_disk->queue) ||
 369             n > 128*1024) {
 370                 printk(KERN_ERR "LaFS: statesize of %u not acceptable.\n", n);
 371                 err = -EINVAL;
 372                 goto out;
 373         }
 374         dv->stateblock = kmalloc(n, GFP_KERNEL);
 375         err = -ENOMEM;
 376         if (!dv->stateblock)
 377                 goto out;
 378         for (i = 0; i < 4; i++) {
 379                 int ok;
 380                 sect = le64_to_cpu(dv->devblock->stateaddr[i])>>9;
 381                 ok = lafs_sync_page_io(bdev, sect, 0, n, pg, READ);
 382                 if (ok && valid_stateblock(page_address(pg), dv->devblock)) {
 383                         if (!have_state) {
 384                                 have_state = 1;
 385                                 memcpy(dv->stateblock, page_address(pg), n);
 386                                 state_addr = i;
 387                         } else if (compare_state(dv->stateblock,
 388                                                  page_address(pg))) {
 389                                 memcpy(dv->stateblock, page_address(pg), n);
 390                                 state_addr = i;
 391                         }
 392                 }
 393         }
 394
 395         if (have_state) {
 396                 err = 0;
 397                 dv->devchoice = dev_addr;
 398                 dv->statechoice = state_addr;
 399         } else {
 400                 err = -EINVAL;
 401                 if (!silent)
 402                         printk(KERN_ERR "LaFS: no valid stateblock found.\n");
 403         }
 404 out:
 405         page_cache_release(pg);
 406         return err;
 407 }
 408
 409 static int
 410 check_devs(struct options *op)
 411 {
 412         /* Check we have enough, that they are for the same
 413          * uuid, and they they don't overlap
 414          * Also check that 'seq' number of devblocks
 415          * are within '1'
 416          */
 417         int seqlo = le32_to_cpu(op->devlist[0].devblock->seq);
 418         int seqhi = le32_to_cpu(op->devlist[0].devblock->seq);
 419         int newdev = 0;
 420         int newstate = 0;
 421         int i, j;
 422
 423         for (i = 1; i < op->devcnt; i++) {
 424                 if (memcmp(op->devlist[0].stateblock->uuid,
 425                            op->devlist[i].stateblock->uuid,
 426                            16) != 0)
 427                         return -EINVAL;
 428
 429                 if (le32_to_cpu(op->devlist[i].devblock->seq) == seqlo)
 430                         ;
 431                 else if (le32_to_cpu(op->devlist[i].devblock->seq) == seqlo+1) {
 432                         newdev = i;
 433                         seqhi = seqlo+1;
 434                 } else if (le32_to_cpu(op->devlist[i].devblock->seq) == seqhi-1)
 435                         seqlo = seqhi-1;
 436                 else
 437                         return -EINVAL;
 438
 439                 if (u32_after(le32_to_cpu(op->devlist[i].stateblock->seq),
 440                               le32_to_cpu(op->devlist[newstate].
 441                                           stateblock->seq)))
 442                         newstate = i;
 443         }
 444         if (le32_to_cpu(op->devlist[newstate].stateblock->devices)
 445             != op->devcnt)
 446                 return -EINVAL;
 447
 448         op->statebits = op->devlist[0].devblock->statebits;
 449         op->blockbits = op->devlist[0].devblock->blockbits;
 450
 451         /* Now check devices don't overlap in start/size.
 452          * We do a simple quadratic search
 453          */
 454         for (i = 0; i < op->devcnt; i++)
 455                 for (j = 0; j < op->devcnt; j++)
 456                         if (i != j)
 457                                 if (le64_to_cpu(op->devlist[i].devblock->start) <
 458                                     le64_to_cpu(op->devlist[j].devblock->start) &&
 459
 460                                     le64_to_cpu(op->devlist[i].devblock->start)+
 461                                     le64_to_cpu(op->devlist[i].devblock->size) >
 462                                     le64_to_cpu(op->devlist[j].devblock->start))
 463                                         return -EINVAL;
 464         return newstate;
 465 }
 466
 467 /* we identify lafs superblocks by the filesystem uuid.  This means
 468  * that block-level snapshots cannot be mounted.  You should use
 469  * fs-level snapshots instead.
 470  */
 471 static int sb_test(struct super_block *sb, void *data)
 472 {
 473         struct sb_key *ptn = data;
 474         struct sb_key *sk = sb->s_fs_info;
 475         return memcmp(ptn->fs->state->uuid,
 476                       sk->fs->state->uuid, 16) == 0;
 477 }
 478
 479 static int sb_set(struct super_block *sb, void *data)
 480 {
 481         struct sb_key *ptn = data;
 482         sb->s_fs_info = ptn;
 483         return set_anon_super(sb, NULL);
 484 }
 485
 486
 487 static int
 488 lafs_load(struct fs *fs, struct options *op, int newest)
 489 {
 490         /* We seem to have a full set of devices for the filesystem.
 491          * Time to create our fs_info structure and fill it out.
 492          * This only includes information from the dev and state blocks.
 493          * Finding the root-inode comes a bit later.
 494          */
 495         struct lafs_state *st;
 496         int i;
 497         int err;
 498         struct sb_key *k;
 499
 500         st = fs->state = op->devlist[newest].stateblock;
 501         op->devlist[newest].stateblock = NULL;
 502 #ifdef DUMP
 503         dfs = fs;
 504 #endif
 505
 506         fs->seq = le32_to_cpu(st->seq);
 507         fs->levels = le32_to_cpu(st->levels);
 508         fs->devices = op->devcnt;
 509         fs->devs_loaded = fs->devices; /* FIXME use this or lose this */
 510         fs->statesize = 1 << op->statebits;
 511         fs->blocksize = 1 << op->blockbits;
 512         fs->blocksize_bits = op->blockbits;
 513
 514         fs->nonlog_segment = le32_to_cpu(st->nonlog_segment);
 515         fs->nonlog_dev = le16_to_cpu(st->nonlog_dev);
 516         fs->nonlog_offset = le16_to_cpu(st->nonlog_offset);
 517         fs->youth_next = le16_to_cpu(st->nextyouth);
 518         fs->checkpoint_youth = fs->youth_next;
 519         if (fs->youth_next < 8)
 520                 fs->youth_next = 8;
 521         fs->scan.first_free_pass = 1;
 522         fs->scan.free_dev = -1;
 523
 524         fs->maxsnapshot = le32_to_cpu(st->maxsnapshot);
 525
 526         fs->scan.free_usages = kmalloc(PAGE_SIZE, GFP_KERNEL);
 527         err = lafs_segtrack_init(fs->segtrack);
 528
 529         fs->ss = kzalloc(sizeof(struct snapshot)*fs->maxsnapshot, GFP_KERNEL);
 530         if (!fs->ss || !fs->scan.free_usages || err) {
 531                 if (!err)
 532                         err = -ENOMEM;
 533                 goto abort;
 534         }
 535
 536         fs->checkpointcluster = le64_to_cpu(st->checkpointcluster);
 537         for (i = 0; i < fs->maxsnapshot; i++) {
 538                 fs->ss[i].root_addr =
 539                         le64_to_cpu(st->root_inodes[i]);
 540                 dprintk("root inode %d are %llu\n",
 541                         i, fs->ss[i].root_addr);
 542         }
 543         INIT_LIST_HEAD(&fs->pending_orphans);
 544         INIT_LIST_HEAD(&fs->inode_index);
 545         INIT_LIST_HEAD(&fs->phase_leafs[0]);
 546         INIT_LIST_HEAD(&fs->phase_leafs[1]);
 547         INIT_LIST_HEAD(&fs->clean_leafs);
 548         INIT_LIST_HEAD(&fs->account_leafs);
 549         atomic_set(&fs->sb_writes_pending, 0);
 550         init_waitqueue_head(&fs->sb_writes_wait);
 551         init_waitqueue_head(&fs->async_complete);
 552         init_waitqueue_head(&fs->trunc_wait);
 553         mutex_init(&fs->cleaner.lock);
 554         spin_lock_init(&fs->stable_lock);
 555         spin_lock_init(&fs->alloc_lock);
 556         spin_lock_init(&fs->lock);
 557         init_waitqueue_head(&fs->phase_wait);
 558
 559         INIT_WORK(&fs->done_work, lafs_done_work);
 560
 561         /* FIXME add congention and unplug functions to this bdi */
 562         err = bdi_init(&fs->bdi);
 563         if (err)
 564                 goto abort;
 565
 566
 567         fs->phase_locked = 0;
 568         for (i = 0; i < WC_NUM; i++) {
 569                 int j;
 570                 mutex_init(&fs->wc[i].lock);
 571                 for (j = 0; j < 4 ; j++) {
 572                         atomic_set(&fs->wc[i].pending_cnt[j], 0);
 573                         INIT_LIST_HEAD(&fs->wc[i].pending_blocks[j]);
 574                 }
 575                 init_waitqueue_head(&fs->wc[i].pending_wait);
 576                 fs->wc[i].seg.dev = -1;
 577         }
 578
 579         fs->max_newsegs = 32; /* FIXME this should be configurable */
 580
 581         err = -ENOMEM;
 582         fs->devs = kzalloc(sizeof(struct fs_dev)*fs->devices, GFP_KERNEL);
 583         if (!fs->devs)
 584                 goto abort;
 585
 586         k = kzalloc(sizeof(*k), GFP_KERNEL);
 587         k->fs = fs;
 588         fs->prime_sb = sget(&lafs_fs_type, sb_test, sb_set, k);
 589         if (IS_ERR(fs->prime_sb)) {
 590                 kfree(k);
 591                 err = PTR_ERR(fs->prime_sb);
 592                 goto abort;
 593         }
 594         if (fs->prime_sb->s_root) {
 595                 /* filesystem with this uuid already exists */
 596                 deactivate_locked_super(fs->prime_sb);
 597                 kfree(k);
 598                 fs->prime_sb = NULL;
 599                 err = -EBUSY;
 600                 goto abort;
 601         }
 602         err = bdi_register_dev(&fs->bdi, fs->prime_sb->s_dev);
 603         if (err) {
 604                 deactivate_locked_super(fs->prime_sb);
 605                 kfree(k);
 606                 fs->prime_sb = NULL;
 607                 goto abort;
 608         }
 609         fs->prime_sb->s_bdi = &fs->bdi;
 610
 611         fs->prime_sb->s_blocksize = 1 << op->blockbits;
 612         fs->prime_sb->s_blocksize_bits = op->blockbits;
 613         fs->prime_sb->s_op = &lafs_sops;
 614         fs->prime_sb->s_export_op = &lafs_export_ops;
 615         fs->prime_sb->s_root = NULL;
 616
 617         /* We allow 29 bits for nanosecs, so they must be even. */
 618         fs->prime_sb->s_time_gran = 2;
 619
 620         for (i = 0; i < fs->devices; i++) {
 621                 struct fs_dev *dv = &fs->devs[i];
 622                 struct devent *de = &op->devlist[i];
 623                 int j;
 624                 dv->bdev = de->bdev;
 625                 de->bdev = NULL;
 626
 627                 dv->devblk = de->devblock;
 628                 de->devblock = NULL;
 629
 630                 dv->recent_dev = de->devchoice;
 631                 dv->recent_state = de->statechoice;
 632
 633                 dv->start = le64_to_cpu(dv->devblk->start);
 634                 dv->size = le64_to_cpu(dv->devblk->size);
 635                 dprintk("Dev %d seems to range %llu + %llu\n",
 636                         i, (unsigned long long)dv->start,
 637                         (unsigned long long)dv->size);
 638
 639                 dv->width = le16_to_cpu(dv->devblk->width);
 640                 dv->stride = le32_to_cpu(dv->devblk->stride);
 641                 dv->segment_size = le32_to_cpu(dv->devblk->segment_size);
 642                 dv->segment_offset = le32_to_cpu(dv->devblk->segment_offset);
 643                 dv->segment_count = le32_to_cpu(dv->devblk->segment_count);
 644                 dv->usage_inum = le32_to_cpu(dv->devblk->usage_inum);
 645                 dv->level = le16_to_cpu(dv->devblk->level);
 646
 647                 if (dv->segment_size > fs->max_segment)
 648                         fs->max_segment = dv->segment_size;
 649
 650                 if (dv->width * dv->stride <= dv->segment_size) {
 651                         dv->tables_per_seg = dv->segment_size /
 652                                 dv->width / dv->stride;
 653                         dv->rows_per_table = dv->stride;
 654                         dv->segment_stride = dv->segment_size;
 655                 } else {
 656                         dv->tables_per_seg = 1;
 657                         dv->rows_per_table = dv->segment_size / dv->width;
 658                         dv->segment_stride = dv->rows_per_table;
 659                 }
 660                 /* table size is the number of blocks in the segment usage
 661                  * file per snapshot
 662                  */
 663                 dv->tablesize = (dv->segment_count + (1<<(fs->blocksize_bits-1)) + 1)
 664                         >> (fs->blocksize_bits-1);
 665
 666                 for (j = 0; j < 2; j++)
 667                         dv->devaddr[j] = le64_to_cpu(dv->devblk->devaddr[j]);
 668                 for (j = 0; j < 4; j++)
 669                         dv->stateaddr[j] = le64_to_cpu(dv->devblk->stateaddr[j]);
 670         }
 671         return 0;
 672
 673 abort:
 674         bdi_destroy(&fs->bdi);
 675         kfree(fs->scan.free_usages);
 676         lafs_segtrack_free(fs->segtrack);
 677         kfree(fs->devs);
 678         kfree(fs->ss);
 679         kfree(fs);
 680         return -ENOMEM;
 681 }
 682
 683 static int show_orphans(struct fs *fs)
 684 {
 685         struct datablock *db;
 686         printk("Orphans:\n");
 687         list_for_each_entry(db, &fs->pending_orphans,
 688                             orphans) {
 689                 struct inode *ino = iget_my_inode(db);
 690                 printk("orphan=%s\n", strblk(&db->b));
 691                 if (ino)
 692                         lafs_print_tree(&LAFSI(ino)->iblock->b, 0);
 693                 iput(ino);
 694         }
 695         printk("cleaner active: %d %d\n", fs->cleaner.active,
 696                fs->scan.done);
 697         return 1; /* meaningless, but makes it easy to add to wait_event below */
 698 }
 699
 700 static void lafs_kill_sb(struct super_block *sb)
 701 {
 702         struct fs *fs = fs_from_sb(sb);
 703         /* Release the 'struct fs' */
 704         int i;
 705
 706         /* FIXME should I refcount this when there are multiple
 707          * filesets? How does that work?
 708          */
 709
 710         /* Delay final destruction of the root inode */
 711         /* FIXME all the sbs... */
 712         set_bit(I_Deleting, &LAFSI(fs->ss[0].root)->iflags);
 713
 714         /* FIXME I'm not sure we should be waiting for the
 715          * cleaner.  Maybe we should just release all tc->cleaning
 716          * blocks instead.
 717          */
 718         set_bit(CleanerDisabled, &fs->fsstate);
 719
 720         wait_event(fs->async_complete,
 721                    show_orphans(fs) &&
 722                    !test_bit(OrphansRunning, &fs->fsstate) &&
 723                    list_empty(&fs->pending_orphans) &&
 724                    fs->scan.done == 1 &&
 725                    fs->cleaner.active == 0);
 726
 727         kill_anon_super(fs->prime_sb);
 728
 729         bdi_destroy(&fs->bdi);
 730
 731         for (i = 0; i < fs->devices; i++) {
 732                 struct fs_dev *dv = &fs->devs[i];
 733                 kfree(dv->devblk);
 734                 close_bdev_exclusive(dv->bdev, FMODE_READ|FMODE_WRITE);
 735         }
 736
 737         /* Final checkpoint will have cleared out the leafs lists,
 738          * so they should all be empty.
 739          */
 740         /* Lets see what is on the 'leaf' list? */
 741         for (i = 0; i < 2; i++) {
 742                 struct block *b;
 743                 dprintk("For phase %d\n", i);
 744         retry:
 745                 list_for_each_entry(b, &fs->phase_leafs[i], lru) {
 746                         /* FIXME this only OK for readonly mounts.
 747                          */
 748                         getref(b, MKREF(release));
 749                         lafs_refile(b, 0);
 750                         if (test_bit(B_Pinned, &b->flags)) {
 751                                 /* didn't fix the pincnt !! */
 752                                 printk("This was pinned: %s\n", strblk(b));
 753                                 lafs_print_tree(b, 1);
 754                                 BUG();
 755                         }
 756                         putref(b, MKREF(release));
 757                         goto retry;
 758                 }
 759         }
 760         BUG_ON(!list_empty(&fs->clean_leafs));
 761
 762         flush_scheduled_work();
 763         lafs_stop_thread(fs);
 764
 765         for (i = 0; i < 4; i++)
 766                 if (fs->cleaner.seg[i].chead)
 767                         put_page(fs->cleaner.seg[i].chead);
 768
 769         kfree(fs->state);
 770         kfree(fs->ss);
 771         kfree(fs->devs);
 772         lafs_segtrack_free(fs->segtrack);
 773         kfree(fs->scan.free_usages);
 774         kfree(fs->prime_sb->s_fs_info);
 775         kfree(fs);
 776 }
 777
 778 static void
 779 lafs_put_super(struct super_block *sb)
 780 {
 781         struct fs *fs = fs_from_sb(sb);
 782         int ss;
 783         struct lafs_inode *li;
 784
 785         lafs_checkpoint_lock(fs);
 786         lafs_checkpoint_start(fs);
 787         if (sb == fs->prime_sb)
 788                 /* Don't incorporate any more segusage/quota updates. */
 789                 set_bit(FinalCheckpoint, &fs->fsstate);
 790         lafs_checkpoint_unlock_wait(fs);
 791         lafs_cluster_wait_all(fs);
 792
 793         if (sb == fs->prime_sb) {
 794                 int d;
 795                 /* This is the main sb, not a snapshot or
 796                  * subordinate fs.
 797                  * Now that all inodes have been invalidated we can do
 798                  * the final checkpoint.
 799                  */
 800                 lafs_close_all_segments(fs);
 801                 lafs_empty_segment_table(fs);
 802                 lafs_seg_put_all(fs);
 803
 804                 iput(fs->orphans);
 805                 fs->orphans = NULL;
 806                 for (d=0; d < fs->devices; d++)
 807                         if (fs->devs[d].segsum) {
 808                                 iput(fs->devs[d].segsum);
 809                                 fs->devs[d].segsum = NULL;
 810                         }
 811         }
 812
 813         /* need to break a circular reference... */
 814         for (ss = 0; ss < fs->maxsnapshot; ss++)
 815                 if (fs->ss[ss].root &&
 816                     fs->ss[ss].root->i_sb == sb) {
 817                         dprintk("Putting ss %d\n", ss);
 818                         li = LAFSI(fs->ss[ss].root);
 819                         if (test_bit(B_Realloc, &li->dblock->b.flags))
 820                                 lafs_dump_tree();
 821                         iput(fs->ss[ss].root);
 822                         fs->ss[ss].root = NULL;
 823                         break;
 824                 }
 825 }
 826
 827 static int
 828 lafs_get_devs(struct fs *fs, struct options *op, int flags)
 829 {
 830         int err;
 831         int i;
 832
 833         for (i = 0; i < op->devcnt; i++) {
 834                 struct block_device *bdev;
 835                 op->curr_dev = i;
 836
 837                 bdev = open_bdev_exclusive(op->devlist[i].dev,
 838                                            FMODE_READ|FMODE_WRITE, fs);
 839                 err = PTR_ERR(bdev);
 840                 if (IS_ERR(bdev))
 841                         goto out;
 842                 err = lafs_load_super(bdev, op, flags & MS_SILENT ? 1 : 0);
 843                 if (err < 0)
 844                         goto out;
 845                 op->devlist[i].bdev = bdev;
 846         }
 847         return 0;
 848
 849 out:
 850         return err;
 851 }
 852
 853 static int
 854 lafs_get_sb(struct file_system_type *fs_type,
 855             int flags, const char *dev_name, void *data,
 856             struct vfsmount *mnt)
 857 {
 858         /* as we may have multiple devices, some in 'data', we cannot just
 859          * use get_sb_bdev, we need to roll-our-own.
 860          * We call get_sb_bdev on *each* bdev, and make sure the returned
 861          * superblocks are either all new, or all for the same filesystem.
 862          * If the later, we return the primary.
 863          * If the former, we init the filesystem copying static data
 864          * to all supers.
 865          * First we 'open_bdev_exclusive' each device, exclusive to lafs
 866          * Then we 'sget' a superblock that knows any/all the devices.
 867          * This may be pre-existing, or may be new
 868          * If new, it will be created knowing all devices.
 869          * If pre-existing, and don't have correct device list, error
 870          */
 871         struct options op;
 872         int err;
 873         int newest;
 874         struct fs *fs = kzalloc(sizeof(*fs), GFP_KERNEL);
 875         char *cdata = data;
 876         if (cdata == NULL)
 877                 cdata = "";
 878
 879         err = -ENOMEM;
 880         if (!fs)
 881                 goto out;
 882         err = parse_opts(&op, dev_name, cdata);
 883         if (err)
 884                 goto out;
 885
 886         /* We now have as list of device names.  We call open_bdev_exclusive
 887          * on each to collect some superblocks.
 888          */
 889         err = lafs_get_devs(fs, &op, flags);
 890         if (err)
 891                 goto out;
 892
 893         /* Each device has a valid dev and state block.  Hopefully they
 894          * are all for the same filesystem.  If they don't have the
 895          * same uuid, we will bale-out here.  We also check that we have
 896          * enough, and that they don't overlap.
 897          * While we are looking at state blocks, pick the newest.
 898          */
 899         newest = check_devs(&op);
 900         if (newest < 0) {
 901                 err = newest;
 902                 goto out;
 903         }
 904
 905         /* So they seem to be the same - better create our
 906          * 'fs' structure and fill it in
 907          */
 908         err = lafs_load(fs, &op, newest);
 909         if (err)
 910                 goto out;
 911
 912         /* Well, all the devices check out.  Now we need to find the
 913          * filesystem */
 914         err = lafs_mount(fs);
 915         if (err == 0)
 916                 err = lafs_start_thread(fs);
 917         if (err)
 918                 deactivate_locked_super(fs->prime_sb);
 919         else {
 920                 fs->prime_sb->s_flags |= MS_ACTIVE;
 921                 simple_set_mnt(mnt, fs->prime_sb);
 922         }
 923         /* And there you have it.  Filesystem all mounted, root dir found,
 924          * metadata files initialised, all pigs fed, and ready to fly!!!
 925          */
 926
 927 out:
 928         /* Now we clean up 'options'.  Anything that is wanted has
 929          * been moved into 'fs', so we just discard anything we find
 930          */
 931         if (op.devlist) {
 932                 int i;
 933                 for (i = 0; i < op.devcnt; i++) {
 934                         kfree(op.devlist[i].devblock);
 935                         kfree(op.devlist[i].stateblock);
 936                         if (op.devlist[i].bdev)
 937                                 close_bdev_exclusive(op.devlist[i].bdev,
 938                                                      FMODE_READ|FMODE_WRITE);
 939                 }
 940                 kfree(op.devlist);
 941         }
 942         return err;
 943 }
 944
 945 static int test_subset(struct super_block *sb, void *data)
 946 {
 947         struct sb_key *ptn = data;
 948         struct sb_key *k = sb->s_fs_info;
 949
 950         return ptn->fs == k->fs && ptn->root == k->root;
 951 }
 952
 953 static int set_subset(struct super_block *sb, void *data)
 954 {
 955         sb->s_fs_info = data;
 956         set_anon_super(sb, NULL);
 957         return 0;
 958 }
 959
 960 static struct file_system_type lafs_subset_fs_type;
 961 struct super_block *lafs_get_subset_sb(struct inode *ino)
 962 {
 963         /* ino must be a TypeInodeFile inode in the prime filesystem. */
 964         struct fs *fs = fs_from_inode(ino);
 965         struct super_block *sb;
 966         struct sb_key *k = kmalloc(sizeof(*k), GFP_KERNEL);
 967
 968         if (!k)
 969                 return ERR_PTR(-ENOMEM);
 970
 971         k->fs = fs;
 972         k->root = ino;
 973         sb = sget(&lafs_subset_fs_type, test_subset, set_subset, k);
 974         if (IS_ERR(sb)) {
 975                 kfree(k);
 976         } else if (sb->s_root) {
 977                 /* already allocated */
 978                 kfree(k);
 979         } else {
 980                 struct inode *rootdir, *imapfile;
 981                 int err = 0;
 982
 983                 igrab(ino);
 984                 sb->s_blocksize = fs->blocksize;
 985                 sb->s_blocksize_bits = fs->blocksize_bits;
 986                 sb->s_bdi = fs->prime_sb->s_bdi;
 987                 sb->s_op = &lafs_sops;
 988                 sb->s_export_op = &lafs_export_ops;
 989                 sb->s_time_gran = 2;
 990                 rootdir = lafs_iget(sb, 2, SYNC);
 991                 if (IS_ERR(rootdir) && PTR_ERR(rootdir) == -ENOENT) {
 992                         rootdir = lafs_new_inode(fs, sb, NULL,
 993                                                  TypeDir, 2, 0755, NULL);
 994                         /* FIXME could the inode get written before we set
 995                          * the link count ??*/
 996                         rootdir->i_nlink = 2;
 997                 }
 998                 if (IS_ERR(rootdir))
 999                         err = PTR_ERR(rootdir);
1000                 else {
1001                         sb->s_root = d_alloc_root(rootdir);
1002                         imapfile = lafs_iget(sb, 1, SYNC);
1003                         if (IS_ERR(imapfile) && PTR_ERR(imapfile) == -ENOENT)
1004                                 imapfile = lafs_new_inode(fs, sb, NULL,
1005                                                           TypeInodeMap, 1, 0, NULL);
1006
1007                         if (IS_ERR(imapfile))
1008                                 err = PTR_ERR(imapfile);
1009                         else
1010                                 iput(imapfile);
1011                 }
1012
1013                 if (!err) {
1014                         sb->s_op = fs->prime_sb->s_op;
1015                         sb->s_flags |= MS_ACTIVE;
1016                         atomic_inc(&fs->prime_sb->s_active);
1017                         igrab(ino);
1018                 } else {
1019                         deactivate_locked_super(sb);
1020                         sb = ERR_PTR(err);
1021                 }
1022         }
1023         return sb;
1024 }
1025
1026 static int
1027 lafs_get_subset(struct file_system_type *fs_type,
1028                 int flags, const char *dev_name, void *data,
1029                 struct vfsmount *mnt)
1030 {
1031         /* mount, possibly creating, a sub-fileset.
1032          * dev_name must be an absolute path that leads
1033          * to an object in a lafs file-system (or snapshot).
1034          * The object must be either an InodeFile or
1035          * an empty directory in the main file-system
1036          * with mode 0 (though that rule might change).
1037          * In the latter case we change the object to an
1038          * InodeFile
1039          * FIXME must require readonly for snapshots, and readwrite
1040          * to create.
1041          */
1042
1043         struct nameidata nd;
1044         int err;
1045         struct super_block *sb;
1046         struct inode *ino;
1047         struct fs *fs;
1048
1049         err = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
1050         if (err)
1051                 goto out_noput;
1052         sb = nd.path.dentry->d_sb;
1053         err = -EINVAL;
1054         if (sb->s_type != &lafs_fs_type &&
1055             sb->s_type != &lafs_snap_fs_type)
1056                 goto out;
1057         ino = nd.path.dentry->d_inode;
1058         if (LAFSI(ino)->type != TypeInodeFile &&
1059             LAFSI(ino)->type != TypeDir)
1060                 goto out;
1061         fs = fs_from_sb(sb);
1062         mutex_lock(&ino->i_mutex);
1063         if (LAFSI(ino)->type == TypeDir) {
1064                 struct datablock *inodb;
1065                 /* maybe convert this to TypeInodeFile */
1066                 if (sb->s_type != &lafs_fs_type)
1067                         goto out_unlock;
1068                 if (ino->i_size)
1069                         /* FIXME maybe I should run orphans */
1070                         goto out_unlock;
1071                 if ((ino->i_mode & 07777) != 0)
1072                         goto out_unlock;
1073                 inodb = lafs_inode_dblock(ino, SYNC, MKREF(make_subset));
1074                 err = PTR_ERR(inodb);
1075                 if (IS_ERR(inodb))
1076                         goto out_unlock;
1077                 lafs_iolock_block(&inodb->b);
1078                 set_bit(B_PinPending, &inodb->b.flags);
1079                 lafs_iounlock_block(&inodb->b);
1080                 lafs_checkpoint_lock(fs);
1081                 err = lafs_pin_dblock(inodb, ReleaseSpace);
1082                 if (!err) {
1083                         struct fs_md *md;
1084                         /* OK, we are good to go making this filesystem */
1085                         LAFSI(ino)->type = TypeInodeFile;
1086                         LAFSI(ino)->metadata_size = (sizeof(struct la_inode) +
1087                                                      sizeof(struct fs_metadata));
1088                         ino->i_op = &lafs_subset_ino_operations;
1089                         ino->i_fop = &lafs_subset_file_operations;
1090                         /* FIXME we lose md->parent here - what to do?? */
1091                         md = &LAFSI(ino)->md.fs;
1092                         md->usagetable = 0;
1093                         ino->i_mtime = current_fs_time(sb);
1094                         md->cblocks_used = 0;
1095                         md->pblocks_used = 0;
1096                         md->ablocks_used = 0;
1097                         md->blocks_allowed = 10000; /* FIXME */
1098                         md->blocks_unalloc = 0;
1099                         /* FIXME should I be using inode_init here */
1100                         md->creation_age = fs->wc[0].cluster_seq;
1101                         md->inodes_used = 0;
1102                         md->quota_inums[0] = 0;
1103                         md->quota_inums[1] = 0;
1104                         md->quota_inums[2] = 0;
1105                         md->quota_inodes[0] = NULL;
1106                         md->quota_inodes[1] = NULL;
1107                         md->quota_inodes[2] = NULL;
1108                         md->name = NULL;
1109                         lafs_dirty_dblock(inodb);
1110                         lafs_dirty_inode(ino);
1111                         /* We use a checkpoint to commit this change,
1112                          * it is too unusual to bother logging
1113                          */
1114                         lafs_checkpoint_start(fs);
1115                         lafs_checkpoint_unlock_wait(fs);
1116                 } else {
1117                         lafs_checkpoint_unlock(fs);
1118                 }
1119                 putdref(inodb, MKREF(make_subset));
1120                 if (err)
1121                         goto out_unlock;
1122         }
1123         err = 0;
1124         /* We have a TypeInodeFile so we can make a superblock */
1125         sb = lafs_get_subset_sb(ino);
1126         iput(ino);
1127
1128         if (IS_ERR(sb))
1129                 err = PTR_ERR(sb);
1130         else
1131                 simple_set_mnt(mnt, sb);
1132 out_unlock:
1133         mutex_unlock(&ino->i_mutex);
1134 out:
1135         path_put(&nd.path);
1136 out_noput:
1137         return err;
1138 }
1139
1140 static void lafs_kill_subset(struct super_block *sb)
1141 {
1142         struct sb_key *k = sb->s_fs_info;
1143         kill_anon_super(sb);
1144         iput(k->root);
1145         deactivate_super(k->fs->prime_sb);
1146         kfree(k);
1147 }
1148
1149 const struct file_operations lafs_subset_file_operations = {
1150 };
1151
1152 const struct inode_operations lafs_subset_ino_operations = {
1153 };
1154
1155
1156 struct file_system_type lafs_fs_type = {
1157         .owner          = THIS_MODULE,
1158         .name           = "lafs",
1159         .get_sb         = lafs_get_sb,
1160         .kill_sb        = lafs_kill_sb,
1161         .fs_flags       = FS_REQUIRES_DEV,
1162 };
1163
1164 static struct file_system_type lafs_subset_fs_type = {
1165         .owner          = THIS_MODULE,
1166         .name           = "lafs_subset",
1167         .get_sb         = lafs_get_subset,
1168         .kill_sb        = lafs_kill_subset,
1169 };
1170
1171 static int __init lafs_init(void)
1172 {
1173         int err;
1174
1175         BUILD_BUG_ON(B_NUM_FLAGS > 32);
1176
1177         err = lafs_ihash_init();
1178         err = err ?: register_filesystem(&lafs_fs_type);
1179         err = err ?: register_filesystem(&lafs_snap_fs_type);
1180         err = err ?: register_filesystem(&lafs_subset_fs_type);
1181         if (err)
1182                 goto out;
1183         return 0;
1184
1185 out:
1186         unregister_filesystem(&lafs_fs_type);
1187         unregister_filesystem(&lafs_snap_fs_type);
1188         unregister_filesystem(&lafs_subset_fs_type);
1189         lafs_ihash_free();
1190         return err;
1191 }
1192
1193 static void __exit lafs_exit(void)
1194 {
1195         unregister_filesystem(&lafs_fs_type);
1196         unregister_filesystem(&lafs_snap_fs_type);
1197         unregister_filesystem(&lafs_subset_fs_type);
1198         lafs_ihash_free();
1199 }
1200
1201 static struct inode *lafs_nfs_get_inode(struct super_block *sb,
1202                                         u64 ino, u32 generation)
1203 {
1204         struct inode *inode;
1205
1206         inode = lafs_iget(sb, ino, SYNC);
1207         if (IS_ERR(inode))
1208                 return ERR_CAST(inode);
1209         if (generation && inode->i_generation != generation) {
1210                 iput(inode);
1211                 return ERR_PTR(-ESTALE);
1212         }
1213
1214         return inode;
1215 }
1216
1217 static struct dentry *lafs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1218                                         int fh_len, int fh_type)
1219 {
1220         return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1221                                     lafs_nfs_get_inode);
1222 }
1223
1224 static struct dentry *lafs_fh_to_parent(struct super_block *sb, struct fid *fid,
1225                                         int fh_len, int fh_type)
1226 {
1227         return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1228                                     lafs_nfs_get_inode);
1229 }
1230
1231 static struct dentry *lafs_get_parent(struct dentry *child)
1232 {
1233         ino_t inum = LAFSI(child->d_inode)->md.file.parent;
1234         struct inode *inode = lafs_iget(child->d_inode->i_sb, inum, SYNC);
1235         if (IS_ERR(inode))
1236                 return ERR_CAST(inode);
1237         return d_obtain_alias(inode);
1238 }
1239
1240 static const struct export_operations lafs_export_ops = {
1241         .fh_to_dentry = lafs_fh_to_dentry,
1242         .fh_to_parent = lafs_fh_to_parent,
1243         .get_parent = lafs_get_parent,
1244 };
1245
1246 static struct inode *lafs_alloc_inode(struct super_block *sb)
1247 {
1248         struct lafs_inode *li;
1249         li = kmalloc(sizeof(*li), GFP_NOFS);
1250         if (!li)
1251                 return NULL;
1252         inode_init_once(&li->vfs_inode);
1253         li->vfs_inode.i_data.backing_dev_info = sb->s_bdi;
1254         li->iblock = NULL;
1255         li->dblock = NULL;
1256         li->update_cluster = 0;
1257         li->md.fs.name = NULL;
1258
1259         init_rwsem(&li->ind_sem);
1260         INIT_LIST_HEAD(&li->free_index);
1261
1262         return &li->vfs_inode;
1263 }
1264
1265 static void kfree_inode(struct rcu_head *head)
1266 {
1267         struct lafs_inode *lai = container_of(head, struct lafs_inode,
1268                                               md.rcu);
1269         if (lai->type == TypeInodeFile)
1270                 kfree(lai->md.fs.name);
1271         kfree(lai);
1272 }
1273
1274 void lafs_destroy_inode(struct inode *inode)
1275 {
1276         struct datablock *db;
1277
1278         BUG_ON(!list_empty(&inode->i_sb_list));
1279         // Cannot test i_list as dispose_list just does list_del
1280         db = lafs_inode_get_dblock(inode, MKREF(destroy));
1281
1282         if (db) {
1283                 set_bit(I_Destroyed, &LAFSI(inode)->iflags);
1284                 putdref(db, MKREF(destroy));
1285         } else {
1286                 spin_lock(&inode->i_data.private_lock);
1287                 if (LAFSI(inode)->iblock)
1288                         LAFS_BUG(atomic_read(&LAFSI(inode)->iblock->b.refcnt),
1289                                  &LAFSI(inode)->iblock->b);
1290                 /* FIXME could there be Async blocks keeps a refcount?
1291                  * we should free them
1292                  */
1293                 spin_unlock(&inode->i_data.private_lock);
1294                 lafs_release_index(&LAFSI(inode)->free_index);
1295                 call_rcu(&LAFSI(inode)->md.rcu,
1296                          kfree_inode);
1297         }
1298 }
1299
1300 static int lafs_sync_fs(struct super_block *sb, int wait)
1301 {
1302         if (!wait)
1303                 /* We only reach here if s_dirt was set, so it
1304                  * is reasonable to force a checkpoint.
1305                  */
1306                 lafs_checkpoint_start(fs_from_sb(sb));
1307         else
1308                 printk("FIXME I should wait for the checkpoint to finish\n");
1309         return 0;
1310 }
1311
1312 static int lafs_statfs(struct dentry *de, struct kstatfs *buf)
1313 {
1314         int i;
1315         u32 fsid;
1316         u32 *fsuuid;
1317         struct fs *fs = fs_from_inode(de->d_inode);
1318         struct lafs_inode *root = LAFSI(fs->ss[0].root);
1319
1320         fsid = 0;
1321         fsuuid = (u32 *)fs->state->uuid;
1322         for (i = 0; i < 16 / 4 ; i++)
1323                 fsid ^= le32_to_cpu(fsuuid[i]);
1324
1325         spin_lock(&root->vfs_inode.i_lock);
1326         buf->f_type = 0x4C614654; /* "LaFS" */
1327         buf->f_bsize = fs->blocksize;
1328         buf->f_blocks = root->md.fs.blocks_allowed;
1329         if (buf->f_blocks == 0) {
1330                 /* should subtract usage of all other filesystems...*/
1331                 for (i = 0; i < fs->devs_loaded; i++)
1332                         buf->f_blocks += fs->devs[i].size;
1333         }
1334         /* "bavail" is "blocks we could succeed in adding to the filesystem".
1335          * "bfree" is effectively total blocks - used blocks
1336          */
1337         buf->f_bavail = fs->free_blocks + fs->clean_reserved - fs->allocated_blocks;
1338         buf->f_bfree = buf->f_blocks - (root->md.fs.cblocks_used +
1339                                         root->md.fs.pblocks_used +
1340                                         root->md.fs.ablocks_used);
1341         dprintk("df: tot=%ld free=%ld avail=%ld(%ld-%ld-%ld) cb=%ld pb=%ld ab=%ld\n",
1342                 (long)buf->f_blocks, (long)buf->f_bfree, (long)buf->f_bavail,
1343                 (long)fs->free_blocks, (long)fs->clean_reserved, (long)fs->allocated_blocks,
1344                 (long)root->md.fs.cblocks_used, (long)root->md.fs.pblocks_used,
1345                 (long)root->md.fs.ablocks_used);
1346
1347         buf->f_files = 0;
1348         buf->f_ffree = 0;
1349         buf->f_fsid.val[0] = fsid; /* FIXME */
1350         buf->f_namelen = 255;
1351         buf->f_frsize = 0;
1352         spin_unlock(&root->vfs_inode.i_lock);
1353         return 0;
1354 }
1355
1356 /* FIXME we hold inode_lock while calling drop_inode, so
1357  * extra locking isn't really welcome....???
1358  */
1359 static void lafs_drop_inode(struct inode *inode)
1360 {
1361         struct fs *fs = fs_from_inode(inode);
1362         struct datablock *db;
1363
1364         /* This lock that we now hold on the inode could prevent
1365          * the cleaner from getting the inode.  So after
1366          * the complete the drop we might need to wake the cleaner.
1367          */
1368
1369         db = lafs_inode_get_dblock(inode, MKREF(drop));
1370
1371         generic_drop_inode(inode);
1372         if (db && test_bit(B_Async, &db->b.flags))
1373                 lafs_wake_thread(fs);
1374         if (db)
1375                 putdref(db, MKREF(drop));
1376 }
1377
1378 static struct super_operations lafs_sops = {
1379         .alloc_inode    = lafs_alloc_inode,
1380         .destroy_inode  = lafs_destroy_inode,  /* Inverse of 'alloc_inode' */
1381         /* Don't use read_inode */
1382         .dirty_inode    = lafs_dirty_inode,
1383         /* .write_inode not needed */
1384         /* put_inode ?? */
1385         .drop_inode     = lafs_drop_inode,
1386         /* drop_inode ?? */                     /* default will call delete or forget
1387                                                  * where 'forget' flushes and clears
1388                                                  */
1389
1390         .clear_inode    = lafs_clear_inode,    /* forget internal state of this inode */
1391         .delete_inode   = lafs_delete_inode,   /* remove this inode from filesystem */
1392         .put_super      = lafs_put_super,
1393         .sync_fs        = lafs_sync_fs,
1394         /* write_super_lockfs ?? */
1395         /* unlockfs ?? */
1396         .statfs         = lafs_statfs,
1397         /* remount_fs ?? */
1398 };
1399
1400 MODULE_AUTHOR("Neil Brown");
1401 MODULE_DESCRIPTION("LaFS - Log Structured File System");
1402 MODULE_LICENSE("GPL");
1403 module_init(lafs_init);
1404 module_exit(lafs_exit);
1405 int lafs_trace = 1;
1406 module_param(lafs_trace, int, 0644);
1407
1408 #ifdef DUMP
1409 struct fs *dfs;
1410 static int do_dump(const char *val, struct kernel_param *kp)
1411 {
1412         extern void lafs_dump_orphans(void);
1413         extern void lafs_dump_tree(void);
1414         extern void lafs_dump_cleanable(void);
1415         extern void lafs_dump_usage(void);
1416
1417         printk("Want dump of %s\n", val);
1418         if (strncmp(val, "orphan", 6) == 0)
1419                 lafs_dump_orphans();
1420         if (strncmp(val, "tree", 4) == 0)
1421                 lafs_dump_tree();
1422         if (strncmp(val, "cleanable", 9) == 0)
1423                 lafs_dump_cleanable();
1424         if (strncmp(val, "usage", 5) == 0)
1425                 lafs_dump_usage();
1426         return 0;
1427 }
1428
1429 static int get_dump(char *buffer, struct kernel_param *kp)
1430 {
1431         strcpy(buffer, "orphans,tree,cleanable,usage");
1432         return strlen(buffer);
1433 }
1434
1435 int arg;
1436 module_param_call(dump, do_dump, get_dump, &arg, 0775);
1437 #endif