super.c

   1
   2 /*
   3  * fs/lafs/super.c
   4  * Copyright (C) 2005-2009
   5  * Neil Brown <neilb@suse.de>
   6  * Released under the GPL, version 2
   7  */
   8
   9 #include        "lafs.h"
  10 #include        <linux/namei.h>
  11 #include        <linux/crc32.h>
  12 #include        <linux/statfs.h>
  13 #include        <linux/mount.h>
  14 #include        <linux/exportfs.h>
  15 #include        <linux/slab.h>
  16
  17 static struct super_operations lafs_sops;
  18 static const struct export_operations lafs_export_ops;
  19
  20 /*---------------------------------------------------------------------
  21  * Write out state and super blocks
  22  *  The super blocks only need to be written when the geometry of the
  23  *  array changes such as when a device is added, removed, or resized.
  24  *  So we don't bother with that just yet.
  25  *  The state block needs to be written - twice on each device - whenever
  26  *  a checkpoint is completed.  All copies are identical and the writes
  27  *  proceed in parallel.  There are 4 stateblock locations on each device.
  28  *  2 are typically less recent than the other two.  We over-write the
  29  *  less-recent copies.
  30  *  FIXME on a RAID4 we should pad the write to be a full stripe.
  31  *
  32  * Locking issues:  This is called from the checkpoint thread and so
  33  *  it does not race with anything else exclusive to that thread.
  34  *  The nonlog information needs to be reviewed once that functionality
  35  *  is implemented.
  36  */
  37
  38 int lafs_write_state(struct fs *fs)
  39 {
  40         struct lafs_state *st;
  41         int i, d;
  42
  43         fs->seq++;
  44         st = fs->state;
  45         st->seq = cpu_to_le32(fs->seq);
  46         st->nonlog_segment = cpu_to_le32(fs->nonlog_segment);
  47         st->nonlog_dev = cpu_to_le16(fs->nonlog_dev);
  48         st->nonlog_offset = cpu_to_le16(fs->nonlog_offset);
  49         st->nextyouth = cpu_to_le16(fs->youth_next);
  50         st->checkpointcluster = cpu_to_le64(fs->checkpointcluster);
  51         for (i = 0; i < fs->maxsnapshot; i++)
  52                 st->root_inodes[i] = cpu_to_le64(fs->ss[i].root_addr);
  53
  54         st->checksum = 0;
  55         st->checksum = crc32_le(0, (unsigned char *)st, fs->statesize);
  56
  57         for (d = 0; d < fs->devices ; d++)
  58                 for (i = (fs->seq & 1); i < 4 ; i += 2)
  59                         lafs_super_write(fs, d, fs->devs[d].stateaddr[i] >> 9,
  60                                          (char *)st, fs->statesize);
  61         lafs_super_wait(fs);
  62         /* FIXME what about a write error ??? */
  63         return 0;
  64 }
  65
  66 static int
  67 valid_devblock(struct lafs_dev *db, sector_t addr, sector_t size)
  68 {
  69         /* check that this devblock is valid, given that
  70          * it was found at sector 'addr'
  71          */
  72         u32 crc, crc2;
  73         u64 byteaddr;
  74         sector_t segsize;
  75         int i;
  76
  77         if (strncmp(db->idtag, "LaFS-DeviceBlock", 16) != 0)
  78                 return 0;
  79         if (strncmp(db->version, "AlphaDevel      ", 16) != 0)
  80                 return 0;
  81         /* uuid can be anything */
  82         crc = db->checksum;
  83         db->checksum = 0;
  84         crc2 = crc32_le(0, (unsigned char *)db, LAFS_DEVBLK_SIZE);
  85         db->checksum = crc;
  86         if (crc2 != crc) {
  87                 dprintk("%lx != %lx\n", (unsigned long)crc,
  88                         (unsigned long)crc2);
  89                 return 0;
  90         }
  91
  92         byteaddr = (u64)addr << 9; /* convert to byte */
  93         if (le64_to_cpu(db->devaddr[0]) != byteaddr &&
  94             le64_to_cpu(db->devaddr[1]) != byteaddr)
  95                 return 0;
  96
  97         if (db->statebits < 10 || db->statebits > 16)
  98                 return 0;
  99         if (db->blockbits < 9 || db->blockbits > 20)
 100                 return 0;
 101         if (le16_to_cpu(db->width) < 1 || le16_to_cpu(db->width) >= 512)
 102                 return 0;
 103         if (le32_to_cpu(db->stride) < 1)
 104                 return 0;
 105         /* devaddr[0] must be early, [1] must be late */
 106         if (le64_to_cpu(db->devaddr[0]) >=
 107             le64_to_cpu(db->segment_offset))
 108                 return 0;
 109
 110         if (le64_to_cpu(db->devaddr[1]) <
 111             le64_to_cpu(db->segment_offset) +
 112             ((((sector_t)le32_to_cpu(db->segment_count)
 113                * le32_to_cpu(db->segment_size)))
 114              << db->blockbits))
 115                 return 0;
 116
 117         /* 2 is an absolute minimum segment size, a few hundred is more
 118          * likely. We'll put a lower limit of 8, and an upper of 800000
 119          */
 120         if (le32_to_cpu(db->segment_size) < 8 ||
 121             le32_to_cpu(db->segment_size) > 800000)
 122                 return 0;
 123
 124         if (le32_to_cpu(db->segment_offset) >
 125             (le32_to_cpu(db->segment_size)<<db->blockbits) * 10)
 126                 return 0;
 127
 128         /* The 4 state blocks live before the first or after the last segment.
 129          * The distance from start of first to end of last is either:
 130          * - segment_count * segment_size  if width*stride <= segment_size
 131          * - (width-1) * stride + segment_size / width * segment_count
 132          *                if width * stride > segment_size
 133          */
 134         segsize = le32_to_cpu(db->segment_size);
 135         segsize *= le32_to_cpu(db->segment_count);
 136         if (le16_to_cpu(db->width) *  le32_to_cpu(db->stride)
 137             > le32_to_cpu(db->segment_size)) {
 138                 int stride = le32_to_cpu(db->stride);
 139                 int width = le16_to_cpu(db->width);
 140
 141                 sector_div(segsize, width);
 142                 segsize += (width - 1) * stride;
 143         }
 144         segsize <<= db->blockbits;
 145         for (i = 0; i < 4; i++) {
 146                 sector_t addr = le64_to_cpu(db->stateaddr[i]);
 147                 int offset = le32_to_cpu(db->segment_offset);
 148                 if (addr + (1<<db->statebits) > offset &&
 149                     addr < offset + segsize)
 150                         return 0;
 151                 if (addr + (1<<db->statebits) > (size << db->blockbits))
 152                         return 0;
 153         }
 154
 155         /* Check all segments fit within device */
 156         if (le32_to_cpu(db->segment_offset) + segsize > (size << db->blockbits))
 157                 return 0;
 158
 159         if (le32_to_cpu(db->level) > 10)
 160                 return 0;
 161
 162         /* I guess it look sane enough... */
 163         return 1;
 164 }
 165
 166 static int
 167 compare_dev(struct lafs_dev *orig, struct lafs_dev *new)
 168 {
 169         /* Both these are known to be valid.
 170          * Return:
 171          *   0 if they are for same filesystem, but 'new' is older
 172          *   1 if they are for same filesystem, and 'new' is newer
 173          *  -1 if they are for different filesystems
 174          */
 175         if (memcmp(orig->uuid, new->uuid, 16))
 176                 return -1;
 177         if (u32_after(le32_to_cpu(new->seq),
 178                       le32_to_cpu(orig->seq)))
 179                 return 1;
 180         return 0;
 181 }
 182
 183 static int
 184 valid_stateblock(struct lafs_state *st, struct lafs_dev *dv)
 185 {
 186         /* Given the 'dv' devblock, make sure 'st' is a valid
 187          * and consistent stateblock
 188          */
 189         u32 crc;
 190         if (strncmp(st->idtag, "LaFS-State-Block", 16) != 0)
 191                 return 0;
 192         if (strncmp(st->version, "AlphaDevel      ", 16) != 0)
 193                 return 0;
 194         crc = st->checksum;
 195         st->checksum = 0;
 196         if (crc32_le(0, (unsigned char *)st, 1<<dv->statebits) != crc)
 197                 return 0;
 198         st->checksum = crc;
 199
 200         if (memcmp(st->uuid, dv->uuid, 16))
 201                 return 0;
 202         /* FIXME cannot quite be that big! */
 203         if (le32_to_cpu(st->maxsnapshot) > (1<<(dv->statebits-3)))
 204                 return 0;
 205
 206         return 1;
 207 }
 208
 209 static int
 210 compare_state(struct lafs_state *orig, struct lafs_state *new)
 211 {
 212         /* return 1 if 'new' is actually newer than 'orig'.
 213          * We already know they are both valid and have the same
 214          * uuid... I don't think there is anything else to be checked
 215          */
 216         return u32_after(le32_to_cpu(new->seq), le32_to_cpu(orig->seq));
 217 }
 218
 219 /*
 220  * Mount options.
 221  * As we can have multiple devices, things are slightly non-obvious.
 222  * The 'devname' can be either a device name, starting '/', or
 223  * a filesytem name (not starting '/').
 224  * The 'data' is a standard comma-separated list of options.
 225  * For 'mount' these are:
 226  *    dev=/dev/X
 227  *              - devices in addition to 'dev_name'
 228  *    new=/dev/X
 229  *              - A new device, with a superblock already present, to be added.
 230  *    incomplete
 231  *              - don't complain if not all devices are given
 232  *    ?? quota stuff, cleaning parameters,
 233  *
 234  * For 'remount', options are
 235  *    dev=  - add another device
 236  *    new=  - the device is being added.
 237  *
 238  */
 239
 240 struct options {
 241         int devcnt;
 242         int curr_dev;
 243         int statebits, blockbits;
 244         struct devent {
 245                 const char *dev;
 246                 int is_new;
 247                 int is_name;
 248                 struct block_device *bdev;
 249                 struct lafs_dev *devblock;
 250                 struct lafs_state *stateblock;
 251                 int devchoice, statechoice;
 252         } *devlist;
 253         const char *name;
 254 };
 255 static int
 256 count_devs(const char *name, char *data)
 257 {
 258         int cnt = 0;
 259         if (*name == '/')
 260                 cnt = 1;
 261         while (data && *data) {
 262                 if (strncmp(data, "dev=", 4) == 0)
 263                         cnt++;
 264                 if (strncmp(data, "new=", 4) == 0)
 265                         cnt++;
 266                 data = strchr(data, ',');
 267                 if (data)
 268                         data++;
 269         }
 270         return cnt;
 271 }
 272
 273 static int
 274 parse_opts(struct options *op, const char *name, char *data)
 275 {
 276         int dv = 0;
 277         char *p;
 278
 279         memset(op, 0, sizeof(*op));
 280         op->devcnt = count_devs(name, data);
 281         op->devlist = kzalloc(op->devcnt*sizeof(op->devlist[0]), GFP_KERNEL);
 282
 283         if (!op->devlist)
 284                 return -ENOMEM;
 285
 286         op->name = NULL;
 287         if (*name == '/') {
 288                 op->devlist[dv].is_name = 1;
 289                 op->devlist[dv++].dev = name;
 290         } else
 291                 op->name = name;
 292         while ((p = strsep(&data, ",")) != NULL) {
 293                 if (!*p)
 294                         continue;
 295                 if (strncmp(p, "dev=", 4) == 0)
 296                         op->devlist[dv++].dev = p+4;
 297                 else if (strncmp(p, "new=", 4) == 0) {
 298                         op->devlist[dv].is_new = 1;
 299                         op->devlist[dv++].dev = p+4;
 300                 } else {
 301                         printk(KERN_ERR
 302                                "LaFS: Unrecognised mount option \"%s\"\n", p);
 303                         return -EINVAL;
 304
 305                 }
 306         }
 307         op->devcnt = dv;
 308
 309         return 0;
 310 }
 311
 312 static int
 313 lafs_load_super(struct block_device *bdev, void *opv, int silent)
 314 {
 315         /* Find the devblock and the stateblock for this device
 316
 317          * Only do basic internal consistancy checks.  Inter-device
 318          * checks happen later
 319          */
 320         struct options *op = opv;
 321         struct devent *dv;
 322         struct page *pg;
 323         sector_t sect, dev_addr = 0, state_addr = 0;
 324         int err = 0;
 325         unsigned int n;
 326         int i;
 327         int have_dev = 0, have_state = 0;
 328         sector_t devsize;
 329
 330         dv = &op->devlist[op->curr_dev];
 331         BUG_ON(dv->devblock);
 332         BUG_ON(dv->stateblock);
 333
 334         n = queue_logical_block_size(bdev->bd_disk->queue);
 335         if (n < LAFS_DEVBLK_SIZE)
 336                 n = LAFS_DEVBLK_SIZE;
 337         BUG_ON(n > PAGE_SIZE);
 338         dv->devblock = kmalloc(n, GFP_KERNEL);
 339         if (!dv->devblock)
 340                 return -ENOMEM;
 341         pg = alloc_page(GFP_KERNEL);
 342         if (!pg)
 343                 return -ENOMEM;
 344
 345         devsize = i_size_read(bdev->bd_inode);
 346
 347         /* Now find a devblock, check the first two possible locations,
 348          * and the last two.  If two devblocks are found with different
 349          * uuids, we are confused!
 350          */
 351         sect = 0;
 352         for (i = 0; i < 4; i++) {
 353                 /* try to read block at 'sect' */
 354                 int ok = lafs_sync_page_io(bdev, sect, 0, n, pg, READ);
 355
 356                 if (ok && valid_devblock(page_address(pg), sect, devsize)) {
 357                         if (!have_dev) {
 358                                 have_dev = 1;
 359                                 memcpy(dv->devblock, page_address(pg), n);
 360                                 dev_addr = sect;
 361                         } else switch (compare_dev(dv->devblock,
 362                                                    page_address(pg))) {
 363                                 case 0: /* older, do nothing */
 364                                         break;
 365                                 case 1: /* newer, overwrite */
 366                                         memcpy(dv->devblock, page_address(pg), n);
 367                                         dev_addr = sect;
 368                                         break;
 369                                 default: /* inconsistent --- HELP */
 370                                         printk(KERN_ERR "LaFS: inconsistent device-blocks found.\n");
 371                                         err = -EINVAL;
 372                                         goto out;
 373                                 }
 374                 }
 375
 376                 if (i != 1)
 377                         sect += (n>>9);
 378                 else {
 379                         sect = devsize & ~(sector_t)(n-1);
 380                         sect >>= 9;
 381                         sect -= (n>>9)*2;
 382                 }
 383         }
 384         /* FIXME - we've lost the read error, if it was significant */
 385         err = -EINVAL;
 386         if (!have_dev) {
 387                 if (!silent)
 388                         printk(KERN_ERR "LaFS - no valid devblock found.\n");
 389                 goto out;
 390         }
 391
 392         /* OK, we have a valid devblock, that's nice.
 393          * Now we should be able to find some stateblocks.
 394          * The locations are in the devblock
 395          */
 396         n = le32_to_cpu(1<<dv->devblock->statebits);
 397         if ((n & (n-1)) ||
 398             n < queue_logical_block_size(bdev->bd_disk->queue) ||
 399             n > 128*1024) {
 400                 printk(KERN_ERR "LaFS: statesize of %u not acceptable.\n", n);
 401                 err = -EINVAL;
 402                 goto out;
 403         }
 404         dv->stateblock = kmalloc(n, GFP_KERNEL);
 405         err = -ENOMEM;
 406         if (!dv->stateblock)
 407                 goto out;
 408         for (i = 0; i < 4; i++) {
 409                 int ok;
 410                 sect = le64_to_cpu(dv->devblock->stateaddr[i])>>9;
 411                 ok = lafs_sync_page_io(bdev, sect, 0, n, pg, READ);
 412                 if (ok && valid_stateblock(page_address(pg), dv->devblock)) {
 413                         if (!have_state) {
 414                                 have_state = 1;
 415                                 memcpy(dv->stateblock, page_address(pg), n);
 416                                 state_addr = i;
 417                         } else if (compare_state(dv->stateblock,
 418                                                  page_address(pg))) {
 419                                 memcpy(dv->stateblock, page_address(pg), n);
 420                                 state_addr = i;
 421                         }
 422                 }
 423         }
 424
 425         if (have_state) {
 426                 err = 0;
 427                 dv->devchoice = dev_addr;
 428                 dv->statechoice = state_addr;
 429         } else {
 430                 err = -EINVAL;
 431                 if (!silent)
 432                         printk(KERN_ERR "LaFS: no valid stateblock found.\n");
 433         }
 434 out:
 435         page_cache_release(pg);
 436         return err;
 437 }
 438
 439 static int
 440 check_devs(struct options *op)
 441 {
 442         /* Check we have enough, that they are for the same
 443          * uuid, and they they don't overlap
 444          * Also check that 'seq' number of devblocks
 445          * are within '1'
 446          */
 447         int seqlo = le32_to_cpu(op->devlist[0].devblock->seq);
 448         int seqhi = le32_to_cpu(op->devlist[0].devblock->seq);
 449         int newdev = 0;
 450         int newstate = 0;
 451         int i, j;
 452
 453         for (i = 1; i < op->devcnt; i++) {
 454                 if (memcmp(op->devlist[0].stateblock->uuid,
 455                            op->devlist[i].stateblock->uuid,
 456                            16) != 0)
 457                         return -EINVAL;
 458
 459                 if (le32_to_cpu(op->devlist[i].devblock->seq) == seqlo)
 460                         ;
 461                 else if (le32_to_cpu(op->devlist[i].devblock->seq) == seqlo+1) {
 462                         newdev = i;
 463                         seqhi = seqlo+1;
 464                 } else if (le32_to_cpu(op->devlist[i].devblock->seq) == seqhi-1)
 465                         seqlo = seqhi-1;
 466                 else
 467                         return -EINVAL;
 468
 469                 if (u32_after(le32_to_cpu(op->devlist[i].stateblock->seq),
 470                               le32_to_cpu(op->devlist[newstate].
 471                                           stateblock->seq)))
 472                         newstate = i;
 473         }
 474         if (le32_to_cpu(op->devlist[newstate].stateblock->devices)
 475             != op->devcnt)
 476                 return -EINVAL;
 477
 478         op->statebits = op->devlist[0].devblock->statebits;
 479         op->blockbits = op->devlist[0].devblock->blockbits;
 480
 481         /* Now check devices don't overlap in start/size.
 482          * We do a simple quadratic search
 483          */
 484         for (i = 0; i < op->devcnt; i++)
 485                 for (j = 0; j < op->devcnt; j++)
 486                         if (i != j)
 487                                 if (le64_to_cpu(op->devlist[i].devblock->start) <
 488                                     le64_to_cpu(op->devlist[j].devblock->start) &&
 489
 490                                     le64_to_cpu(op->devlist[i].devblock->start)+
 491                                     le64_to_cpu(op->devlist[i].devblock->size) >
 492                                     le64_to_cpu(op->devlist[j].devblock->start))
 493                                         return -EINVAL;
 494         return newstate;
 495 }
 496
 497 /* we identify lafs superblocks by the filesystem uuid.  This means
 498  * that block-level snapshots cannot be mounted.  You should use
 499  * fs-level snapshots instead.
 500  */
 501 static int sb_test(struct super_block *sb, void *data)
 502 {
 503         struct sb_key *ptn = data;
 504         struct sb_key *sk = sb->s_fs_info;
 505         return memcmp(ptn->fs->state->uuid,
 506                       sk->fs->state->uuid, 16) == 0;
 507 }
 508
 509 static int sb_set(struct super_block *sb, void *data)
 510 {
 511         struct sb_key *ptn = data;
 512         sb->s_fs_info = ptn;
 513         return set_anon_super(sb, NULL);
 514 }
 515
 516
 517 static int
 518 lafs_load(struct fs *fs, struct options *op, int newest)
 519 {
 520         /* We seem to have a full set of devices for the filesystem.
 521          * Time to create our fs_info structure and fill it out.
 522          * This only includes information from the dev and state blocks.
 523          * Finding the root-inode comes a bit later.
 524          */
 525         struct lafs_state *st;
 526         int i;
 527         int err;
 528         struct sb_key *k;
 529
 530         st = fs->state = op->devlist[newest].stateblock;
 531         op->devlist[newest].stateblock = NULL;
 532 #ifdef DUMP
 533         dfs = fs;
 534 #endif
 535
 536         fs->seq = le32_to_cpu(st->seq);
 537         fs->levels = le32_to_cpu(st->levels);
 538         fs->devices = op->devcnt;
 539         fs->devs_loaded = fs->devices; /* FIXME use this or lose this */
 540         fs->statesize = 1 << op->statebits;
 541         fs->blocksize = 1 << op->blockbits;
 542         fs->blocksize_bits = op->blockbits;
 543
 544         fs->nonlog_segment = le32_to_cpu(st->nonlog_segment);
 545         fs->nonlog_dev = le16_to_cpu(st->nonlog_dev);
 546         fs->nonlog_offset = le16_to_cpu(st->nonlog_offset);
 547         fs->youth_next = le16_to_cpu(st->nextyouth);
 548         fs->checkpoint_youth = fs->youth_next;
 549         if (fs->youth_next < 8)
 550                 fs->youth_next = 8;
 551         fs->scan.first_free_pass = 1;
 552         fs->scan.free_dev = -1;
 553
 554         fs->maxsnapshot = le32_to_cpu(st->maxsnapshot);
 555
 556         fs->scan.free_usages = kmalloc(PAGE_SIZE, GFP_KERNEL);
 557         err = lafs_segtrack_init(fs->segtrack);
 558
 559         fs->ss = kzalloc(sizeof(struct snapshot)*fs->maxsnapshot, GFP_KERNEL);
 560         if (!fs->ss || !fs->scan.free_usages || err) {
 561                 if (!err)
 562                         err = -ENOMEM;
 563                 goto abort;
 564         }
 565
 566         fs->checkpointcluster = le64_to_cpu(st->checkpointcluster);
 567         for (i = 0; i < fs->maxsnapshot; i++) {
 568                 fs->ss[i].root_addr =
 569                         le64_to_cpu(st->root_inodes[i]);
 570                 dprintk("root inode %d are %llu\n",
 571                         i, fs->ss[i].root_addr);
 572         }
 573         INIT_LIST_HEAD(&fs->pending_orphans);
 574         INIT_LIST_HEAD(&fs->inode_index);
 575         INIT_LIST_HEAD(&fs->phase_leafs[0]);
 576         INIT_LIST_HEAD(&fs->phase_leafs[1]);
 577         INIT_LIST_HEAD(&fs->clean_leafs);
 578         INIT_LIST_HEAD(&fs->account_leafs);
 579         atomic_set(&fs->sb_writes_pending, 0);
 580         init_waitqueue_head(&fs->sb_writes_wait);
 581         init_waitqueue_head(&fs->async_complete);
 582         init_waitqueue_head(&fs->trunc_wait);
 583         mutex_init(&fs->cleaner.lock);
 584         spin_lock_init(&fs->stable_lock);
 585         spin_lock_init(&fs->alloc_lock);
 586         spin_lock_init(&fs->lock);
 587         init_waitqueue_head(&fs->phase_wait);
 588
 589         INIT_WORK(&fs->done_work, lafs_done_work);
 590
 591         /* FIXME add congention and unplug functions to this bdi */
 592         err = bdi_init(&fs->bdi);
 593         if (err)
 594                 goto abort;
 595
 596
 597         fs->phase_locked = 0;
 598         for (i = 0; i < WC_NUM; i++) {
 599                 int j;
 600                 mutex_init(&fs->wc[i].lock);
 601                 for (j = 0; j < 4 ; j++) {
 602                         atomic_set(&fs->wc[i].pending_cnt[j], 0);
 603                         INIT_LIST_HEAD(&fs->wc[i].pending_blocks[j]);
 604                 }
 605                 init_waitqueue_head(&fs->wc[i].pending_wait);
 606                 fs->wc[i].seg.dev = -1;
 607         }
 608
 609         fs->max_newsegs = 32; /* FIXME this should be configurable */
 610
 611         err = -ENOMEM;
 612         fs->devs = kzalloc(sizeof(struct fs_dev)*fs->devices, GFP_KERNEL);
 613         if (!fs->devs)
 614                 goto abort;
 615
 616         k = kzalloc(sizeof(*k), GFP_KERNEL);
 617         k->fs = fs;
 618         fs->prime_sb = sget(&lafs_fs_type, sb_test, sb_set, k);
 619         if (IS_ERR(fs->prime_sb)) {
 620                 kfree(k);
 621                 err = PTR_ERR(fs->prime_sb);
 622                 goto abort;
 623         }
 624         if (fs->prime_sb->s_root) {
 625                 /* filesystem with this uuid already exists */
 626                 deactivate_locked_super(fs->prime_sb);
 627                 kfree(k);
 628                 fs->prime_sb = NULL;
 629                 err = -EBUSY;
 630                 goto abort;
 631         }
 632         err = bdi_register_dev(&fs->bdi, fs->prime_sb->s_dev);
 633         if (err) {
 634                 deactivate_locked_super(fs->prime_sb);
 635                 kfree(k);
 636                 fs->prime_sb = NULL;
 637                 goto abort;
 638         }
 639         fs->prime_sb->s_bdi = &fs->bdi;
 640
 641         fs->prime_sb->s_blocksize = 1 << op->blockbits;
 642         fs->prime_sb->s_blocksize_bits = op->blockbits;
 643         fs->prime_sb->s_op = &lafs_sops;
 644         fs->prime_sb->s_export_op = &lafs_export_ops;
 645         fs->prime_sb->s_root = NULL;
 646
 647         /* We allow 29 bits for nanosecs, so they must be even. */
 648         fs->prime_sb->s_time_gran = 2;
 649
 650         for (i = 0; i < fs->devices; i++) {
 651                 struct fs_dev *dv = &fs->devs[i];
 652                 struct devent *de = &op->devlist[i];
 653                 int j;
 654                 dv->bdev = de->bdev;
 655                 de->bdev = NULL;
 656
 657                 dv->devblk = de->devblock;
 658                 de->devblock = NULL;
 659
 660                 dv->recent_dev = de->devchoice;
 661                 dv->recent_state = de->statechoice;
 662
 663                 dv->start = le64_to_cpu(dv->devblk->start);
 664                 dv->size = le64_to_cpu(dv->devblk->size);
 665                 dprintk("Dev %d seems to range %llu + %llu\n",
 666                         i, (unsigned long long)dv->start,
 667                         (unsigned long long)dv->size);
 668
 669                 dv->width = le16_to_cpu(dv->devblk->width);
 670                 dv->stride = le32_to_cpu(dv->devblk->stride);
 671                 dv->segment_size = le32_to_cpu(dv->devblk->segment_size);
 672                 dv->segment_offset = le32_to_cpu(dv->devblk->segment_offset);
 673                 dv->segment_count = le32_to_cpu(dv->devblk->segment_count);
 674                 dv->usage_inum = le32_to_cpu(dv->devblk->usage_inum);
 675                 dv->level = le16_to_cpu(dv->devblk->level);
 676
 677                 if (dv->segment_size > fs->max_segment)
 678                         fs->max_segment = dv->segment_size;
 679
 680                 if (dv->width * dv->stride <= dv->segment_size) {
 681                         dv->tables_per_seg = dv->segment_size /
 682                                 dv->width / dv->stride;
 683                         dv->rows_per_table = dv->stride;
 684                         dv->segment_stride = dv->segment_size;
 685                 } else {
 686                         dv->tables_per_seg = 1;
 687                         dv->rows_per_table = dv->segment_size / dv->width;
 688                         dv->segment_stride = dv->rows_per_table;
 689                 }
 690                 /* table size is the number of blocks in the segment usage
 691                  * file per snapshot
 692                  */
 693                 dv->tablesize = (dv->segment_count + (1<<(fs->blocksize_bits-1)) + 1)
 694                         >> (fs->blocksize_bits-1);
 695
 696                 for (j = 0; j < 2; j++)
 697                         dv->devaddr[j] = le64_to_cpu(dv->devblk->devaddr[j]);
 698                 for (j = 0; j < 4; j++)
 699                         dv->stateaddr[j] = le64_to_cpu(dv->devblk->stateaddr[j]);
 700         }
 701         return 0;
 702
 703 abort:
 704         bdi_destroy(&fs->bdi);
 705         kfree(fs->scan.free_usages);
 706         lafs_segtrack_free(fs->segtrack);
 707         kfree(fs->devs);
 708         kfree(fs->ss);
 709         kfree(fs);
 710         return -ENOMEM;
 711 }
 712
 713 static int show_orphans(struct fs *fs)
 714 {
 715         struct datablock *db;
 716         printk("Orphans:\n");
 717         list_for_each_entry(db, &fs->pending_orphans,
 718                             orphans) {
 719                 struct inode *ino = iget_my_inode(db);
 720                 printk("orphan=%s\n", strblk(&db->b));
 721                 if (ino)
 722                         lafs_print_tree(&LAFSI(ino)->iblock->b, 0);
 723                 iput(ino);
 724         }
 725         printk("cleaner active: %d %d\n", fs->cleaner.active,
 726                fs->scan.done);
 727         return 1; /* meaningless, but makes it easy to add to wait_event below */
 728 }
 729
 730 static void lafs_kill_sb(struct super_block *sb)
 731 {
 732         struct fs *fs = fs_from_sb(sb);
 733         /* Release the 'struct fs' */
 734         int i;
 735
 736         /* FIXME should I refcount this when there are multiple
 737          * filesets? How does that work?
 738          */
 739
 740         /* Delay final destruction of the root inode */
 741         /* FIXME all the sbs... */
 742         set_bit(I_Deleting, &LAFSI(fs->ss[0].root)->iflags);
 743
 744         /* FIXME I'm not sure we should be waiting for the
 745          * cleaner.  Maybe we should just release all tc->cleaning
 746          * blocks instead.
 747          */
 748         set_bit(CleanerDisabled, &fs->fsstate);
 749
 750         wait_event(fs->async_complete,
 751                    show_orphans(fs) &&
 752                    !test_bit(OrphansRunning, &fs->fsstate) &&
 753                    list_empty(&fs->pending_orphans) &&
 754                    fs->scan.done == 1 &&
 755                    fs->cleaner.active == 0);
 756
 757         kill_anon_super(fs->prime_sb);
 758
 759         bdi_destroy(&fs->bdi);
 760
 761         for (i = 0; i < fs->devices; i++) {
 762                 struct fs_dev *dv = &fs->devs[i];
 763                 kfree(dv->devblk);
 764                 close_bdev_exclusive(dv->bdev, FMODE_READ|FMODE_WRITE);
 765         }
 766
 767         /* Final checkpoint will have cleared out the leafs lists,
 768          * so they should all be empty.
 769          */
 770         /* Lets see what is on the 'leaf' list? */
 771         for (i = 0; i < 2; i++) {
 772                 struct block *b;
 773                 dprintk("For phase %d\n", i);
 774         retry:
 775                 list_for_each_entry(b, &fs->phase_leafs[i], lru) {
 776                         /* FIXME this only OK for readonly mounts.
 777                          */
 778                         getref(b, MKREF(release));
 779                         lafs_refile(b, 0);
 780                         if (test_bit(B_Pinned, &b->flags)) {
 781                                 /* didn't fix the pincnt !! */
 782                                 printk("This was pinned: %s\n", strblk(b));
 783                                 lafs_print_tree(b, 1);
 784                                 BUG();
 785                         }
 786                         putref(b, MKREF(release));
 787                         goto retry;
 788                 }
 789         }
 790         BUG_ON(!list_empty(&fs->clean_leafs));
 791
 792         flush_scheduled_work();
 793         lafs_stop_thread(fs);
 794
 795         for (i = 0; i < 4; i++)
 796                 if (fs->cleaner.seg[i].chead)
 797                         put_page(fs->cleaner.seg[i].chead);
 798
 799         kfree(fs->state);
 800         kfree(fs->ss);
 801         kfree(fs->devs);
 802         lafs_segtrack_free(fs->segtrack);
 803         kfree(fs->scan.free_usages);
 804         kfree(fs->prime_sb->s_fs_info);
 805         kfree(fs);
 806 }
 807
 808 static void
 809 lafs_put_super(struct super_block *sb)
 810 {
 811         struct fs *fs = fs_from_sb(sb);
 812         int ss;
 813         struct lafs_inode *li;
 814
 815         lafs_checkpoint_lock(fs);
 816         lafs_checkpoint_start(fs);
 817         if (sb == fs->prime_sb)
 818                 /* Don't incorporate any more segusage/quota updates. */
 819                 set_bit(FinalCheckpoint, &fs->fsstate);
 820         lafs_checkpoint_unlock_wait(fs);
 821         lafs_cluster_wait_all(fs);
 822
 823         if (sb == fs->prime_sb) {
 824                 int d;
 825                 /* This is the main sb, not a snapshot or
 826                  * subordinate fs.
 827                  * Now that all inodes have been invalidated we can do
 828                  * the final checkpoint.
 829                  */
 830                 lafs_close_all_segments(fs);
 831                 lafs_empty_segment_table(fs);
 832                 lafs_seg_put_all(fs);
 833
 834                 iput(fs->orphans);
 835                 fs->orphans = NULL;
 836                 for (d=0; d < fs->devices; d++)
 837                         if (fs->devs[d].segsum) {
 838                                 iput(fs->devs[d].segsum);
 839                                 fs->devs[d].segsum = NULL;
 840                         }
 841         }
 842
 843         /* need to break a circular reference... */
 844         for (ss = 0; ss < fs->maxsnapshot; ss++)
 845                 if (fs->ss[ss].root &&
 846                     fs->ss[ss].root->i_sb == sb) {
 847                         dprintk("Putting ss %d\n", ss);
 848                         li = LAFSI(fs->ss[ss].root);
 849                         if (test_bit(B_Realloc, &li->dblock->b.flags))
 850                                 lafs_dump_tree();
 851                         iput(fs->ss[ss].root);
 852                         fs->ss[ss].root = NULL;
 853                         break;
 854                 }
 855 }
 856
 857 static int
 858 lafs_get_devs(struct fs *fs, struct options *op, int flags)
 859 {
 860         int err;
 861         int i;
 862
 863         for (i = 0; i < op->devcnt; i++) {
 864                 struct block_device *bdev;
 865                 op->curr_dev = i;
 866
 867                 bdev = open_bdev_exclusive(op->devlist[i].dev,
 868                                            FMODE_READ|FMODE_WRITE, fs);
 869                 err = PTR_ERR(bdev);
 870                 if (IS_ERR(bdev))
 871                         goto out;
 872                 err = lafs_load_super(bdev, op, flags & MS_SILENT ? 1 : 0);
 873                 if (err < 0)
 874                         goto out;
 875                 op->devlist[i].bdev = bdev;
 876         }
 877         return 0;
 878
 879 out:
 880         return err;
 881 }
 882
 883 static int
 884 lafs_get_sb(struct file_system_type *fs_type,
 885             int flags, const char *dev_name, void *data,
 886             struct vfsmount *mnt)
 887 {
 888         /* as we may have multiple devices, some in 'data', we cannot just
 889          * use get_sb_bdev, we need to roll-our-own.
 890          * We call get_sb_bdev on *each* bdev, and make sure the returned
 891          * superblocks are either all new, or all for the same filesystem.
 892          * If the later, we return the primary.
 893          * If the former, we init the filesystem copying static data
 894          * to all supers.
 895          * First we 'open_bdev_exclusive' each device, exclusive to lafs
 896          * Then we 'sget' a superblock that knows any/all the devices.
 897          * This may be pre-existing, or may be new
 898          * If new, it will be created knowing all devices.
 899          * If pre-existing, and don't have correct device list, error
 900          */
 901         struct options op;
 902         int err;
 903         int newest;
 904         struct fs *fs = kzalloc(sizeof(*fs), GFP_KERNEL);
 905         char *cdata = data;
 906         if (cdata == NULL)
 907                 cdata = "";
 908
 909         err = -ENOMEM;
 910         if (!fs)
 911                 goto out;
 912         err = parse_opts(&op, dev_name, cdata);
 913         if (err)
 914                 goto out;
 915
 916         /* We now have as list of device names.  We call open_bdev_exclusive
 917          * on each to collect some superblocks.
 918          */
 919         err = lafs_get_devs(fs, &op, flags);
 920         if (err)
 921                 goto out;
 922
 923         /* Each device has a valid dev and state block.  Hopefully they
 924          * are all for the same filesystem.  If they don't have the
 925          * same uuid, we will bale-out here.  We also check that we have
 926          * enough, and that they don't overlap.
 927          * While we are looking at state blocks, pick the newest.
 928          */
 929         newest = check_devs(&op);
 930         if (newest < 0) {
 931                 err = newest;
 932                 goto out;
 933         }
 934
 935         /* So they seem to be the same - better create our
 936          * 'fs' structure and fill it in
 937          */
 938         err = lafs_load(fs, &op, newest);
 939         if (err)
 940                 goto out;
 941
 942         /* Well, all the devices check out.  Now we need to find the
 943          * filesystem */
 944         err = lafs_mount(fs);
 945         if (err == 0)
 946                 err = lafs_start_thread(fs);
 947         if (err)
 948                 deactivate_locked_super(fs->prime_sb);
 949         else {
 950                 fs->prime_sb->s_flags |= MS_ACTIVE;
 951                 simple_set_mnt(mnt, fs->prime_sb);
 952         }
 953         /* And there you have it.  Filesystem all mounted, root dir found,
 954          * metadata files initialised, all pigs fed, and ready to fly!!!
 955          */
 956
 957 out:
 958         /* Now we clean up 'options'.  Anything that is wanted has
 959          * been moved into 'fs', so we just discard anything we find
 960          */
 961         if (op.devlist) {
 962                 int i;
 963                 for (i = 0; i < op.devcnt; i++) {
 964                         kfree(op.devlist[i].devblock);
 965                         kfree(op.devlist[i].stateblock);
 966                         if (op.devlist[i].bdev)
 967                                 close_bdev_exclusive(op.devlist[i].bdev,
 968                                                      FMODE_READ|FMODE_WRITE);
 969                 }
 970                 kfree(op.devlist);
 971         }
 972         return err;
 973 }
 974
 975 static int test_subset(struct super_block *sb, void *data)
 976 {
 977         struct sb_key *ptn = data;
 978         struct sb_key *k = sb->s_fs_info;
 979
 980         return ptn->fs == k->fs && ptn->root == k->root;
 981 }
 982
 983 static int set_subset(struct super_block *sb, void *data)
 984 {
 985         sb->s_fs_info = data;
 986         set_anon_super(sb, NULL);
 987         return 0;
 988 }
 989
 990 static struct file_system_type lafs_subset_fs_type;
 991 struct super_block *lafs_get_subset_sb(struct inode *ino)
 992 {
 993         /* ino must be a TypeInodeFile inode in the prime filesystem. */
 994         struct fs *fs = fs_from_inode(ino);
 995         struct super_block *sb;
 996         struct sb_key *k = kmalloc(sizeof(*k), GFP_KERNEL);
 997
 998         if (!k)
 999                 return ERR_PTR(-ENOMEM);
1000
1001         k->fs = fs;
1002         k->root = ino;
1003         sb = sget(&lafs_subset_fs_type, test_subset, set_subset, k);
1004         if (IS_ERR(sb)) {
1005                 kfree(k);
1006         } else if (sb->s_root) {
1007                 /* already allocated */
1008                 kfree(k);
1009         } else {
1010                 struct inode *rootdir, *imapfile;
1011                 int err = 0;
1012
1013                 igrab(ino);
1014                 sb->s_blocksize = fs->blocksize;
1015                 sb->s_blocksize_bits = fs->blocksize_bits;
1016                 sb->s_bdi = fs->prime_sb->s_bdi;
1017                 sb->s_op = &lafs_sops;
1018                 sb->s_export_op = &lafs_export_ops;
1019                 sb->s_time_gran = 2;
1020                 rootdir = lafs_iget(sb, 2, SYNC);
1021                 if (IS_ERR(rootdir) && PTR_ERR(rootdir) == -ENOENT) {
1022                         rootdir = lafs_new_inode(fs, sb, NULL,
1023                                                  TypeDir, 2, 0755, NULL);
1024                         /* FIXME could the inode get written before we set
1025                          * the link count ??*/
1026                         rootdir->i_nlink = 2;
1027                 }
1028                 if (IS_ERR(rootdir))
1029                         err = PTR_ERR(rootdir);
1030                 else {
1031                         sb->s_root = d_alloc_root(rootdir);
1032                         imapfile = lafs_iget(sb, 1, SYNC);
1033                         if (IS_ERR(imapfile) && PTR_ERR(imapfile) == -ENOENT)
1034                                 imapfile = lafs_new_inode(fs, sb, NULL,
1035                                                           TypeInodeMap, 1, 0, NULL);
1036
1037                         if (IS_ERR(imapfile))
1038                                 err = PTR_ERR(imapfile);
1039                         else
1040                                 iput(imapfile);
1041                 }
1042
1043                 if (!err) {
1044                         sb->s_op = fs->prime_sb->s_op;
1045                         sb->s_flags |= MS_ACTIVE;
1046                         atomic_inc(&fs->prime_sb->s_active);
1047                         igrab(ino);
1048                 } else {
1049                         deactivate_locked_super(sb);
1050                         sb = ERR_PTR(err);
1051                 }
1052         }
1053         return sb;
1054 }
1055
1056 static int
1057 lafs_get_subset(struct file_system_type *fs_type,
1058                 int flags, const char *dev_name, void *data,
1059                 struct vfsmount *mnt)
1060 {
1061         /* mount, possibly creating, a sub-fileset.
1062          * dev_name must be an absolute path that leads
1063          * to an object in a lafs file-system (or snapshot).
1064          * The object must be either an InodeFile or
1065          * an empty directory in the main file-system
1066          * with mode 0 (though that rule might change).
1067          * In the latter case we change the object to an
1068          * InodeFile
1069          * FIXME must require readonly for snapshots, and readwrite
1070          * to create.
1071          */
1072
1073         struct nameidata nd;
1074         int err;
1075         struct super_block *sb;
1076         struct inode *ino;
1077         struct fs *fs;
1078
1079         err = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
1080         if (err)
1081                 goto out_noput;
1082         sb = nd.path.dentry->d_sb;
1083         err = -EINVAL;
1084         if (sb->s_type != &lafs_fs_type &&
1085             sb->s_type != &lafs_snap_fs_type)
1086                 goto out;
1087         ino = nd.path.dentry->d_inode;
1088         if (LAFSI(ino)->type != TypeInodeFile &&
1089             LAFSI(ino)->type != TypeDir)
1090                 goto out;
1091         fs = fs_from_sb(sb);
1092         mutex_lock(&ino->i_mutex);
1093         if (LAFSI(ino)->type == TypeDir) {
1094                 struct datablock *inodb;
1095                 /* maybe convert this to TypeInodeFile */
1096                 if (sb->s_type != &lafs_fs_type)
1097                         goto out_unlock;
1098                 if (ino->i_size)
1099                         /* FIXME maybe I should run orphans */
1100                         goto out_unlock;
1101                 if ((ino->i_mode & 07777) != 0)
1102                         goto out_unlock;
1103                 inodb = lafs_inode_dblock(ino, SYNC, MKREF(make_subset));
1104                 err = PTR_ERR(inodb);
1105                 if (IS_ERR(inodb))
1106                         goto out_unlock;
1107                 lafs_iolock_block(&inodb->b);
1108                 set_bit(B_PinPending, &inodb->b.flags);
1109                 lafs_iounlock_block(&inodb->b);
1110                 lafs_checkpoint_lock(fs);
1111                 err = lafs_pin_dblock(inodb, ReleaseSpace);
1112                 if (!err) {
1113                         struct fs_md *md;
1114                         /* OK, we are good to go making this filesystem */
1115                         LAFSI(ino)->type = TypeInodeFile;
1116                         LAFSI(ino)->metadata_size = (sizeof(struct la_inode) +
1117                                                      sizeof(struct fs_metadata));
1118                         ino->i_op = &lafs_subset_ino_operations;
1119                         ino->i_fop = &lafs_subset_file_operations;
1120                         /* FIXME we lose md->parent here - what to do?? */
1121                         md = &LAFSI(ino)->md.fs;
1122                         md->usagetable = 0;
1123                         ino->i_mtime = current_fs_time(sb);
1124                         md->cblocks_used = 0;
1125                         md->pblocks_used = 0;
1126                         md->ablocks_used = 0;
1127                         md->blocks_allowed = 10000; /* FIXME */
1128                         md->blocks_unalloc = 0;
1129                         /* FIXME should I be using inode_init here */
1130                         md->creation_age = fs->wc[0].cluster_seq;
1131                         md->inodes_used = 0;
1132                         md->quota_inums[0] = 0;
1133                         md->quota_inums[1] = 0;
1134                         md->quota_inums[2] = 0;
1135                         md->quota_inodes[0] = NULL;
1136                         md->quota_inodes[1] = NULL;
1137                         md->quota_inodes[2] = NULL;
1138                         md->name = NULL;
1139                         lafs_dirty_dblock(inodb);
1140                         lafs_dirty_inode(ino);
1141                         /* We use a checkpoint to commit this change,
1142                          * it is too unusual to bother logging
1143                          */
1144                         lafs_checkpoint_start(fs);
1145                         lafs_checkpoint_unlock_wait(fs);
1146                 } else {
1147                         lafs_checkpoint_unlock(fs);
1148                 }
1149                 putdref(inodb, MKREF(make_subset));
1150                 if (err)
1151                         goto out_unlock;
1152         }
1153         err = 0;
1154         /* We have a TypeInodeFile so we can make a superblock */
1155         sb = lafs_get_subset_sb(ino);
1156         iput(ino);
1157
1158         if (IS_ERR(sb))
1159                 err = PTR_ERR(sb);
1160         else
1161                 simple_set_mnt(mnt, sb);
1162 out_unlock:
1163         mutex_unlock(&ino->i_mutex);
1164 out:
1165         path_put(&nd.path);
1166 out_noput:
1167         return err;
1168 }
1169
1170 static void lafs_kill_subset(struct super_block *sb)
1171 {
1172         struct sb_key *k = sb->s_fs_info;
1173         kill_anon_super(sb);
1174         iput(k->root);
1175         deactivate_super(k->fs->prime_sb);
1176         kfree(k);
1177 }
1178
1179 const struct file_operations lafs_subset_file_operations = {
1180 };
1181
1182 const struct inode_operations lafs_subset_ino_operations = {
1183 };
1184
1185
1186 struct file_system_type lafs_fs_type = {
1187         .owner          = THIS_MODULE,
1188         .name           = "lafs",
1189         .get_sb         = lafs_get_sb,
1190         .kill_sb        = lafs_kill_sb,
1191         .fs_flags       = FS_REQUIRES_DEV,
1192 };
1193
1194 static struct file_system_type lafs_subset_fs_type = {
1195         .owner          = THIS_MODULE,
1196         .name           = "lafs_subset",
1197         .get_sb         = lafs_get_subset,
1198         .kill_sb        = lafs_kill_subset,
1199 };
1200
1201 static int __init lafs_init(void)
1202 {
1203         int err;
1204
1205         BUILD_BUG_ON(B_NUM_FLAGS > 32);
1206
1207         err = lafs_ihash_init();
1208         err = err ?: register_filesystem(&lafs_fs_type);
1209         err = err ?: register_filesystem(&lafs_snap_fs_type);
1210         err = err ?: register_filesystem(&lafs_subset_fs_type);
1211         if (err)
1212                 goto out;
1213         return 0;
1214
1215 out:
1216         unregister_filesystem(&lafs_fs_type);
1217         unregister_filesystem(&lafs_snap_fs_type);
1218         unregister_filesystem(&lafs_subset_fs_type);
1219         lafs_ihash_free();
1220         return err;
1221 }
1222
1223 static void __exit lafs_exit(void)
1224 {
1225         unregister_filesystem(&lafs_fs_type);
1226         unregister_filesystem(&lafs_snap_fs_type);
1227         unregister_filesystem(&lafs_subset_fs_type);
1228         lafs_ihash_free();
1229 }
1230
1231 static struct inode *lafs_nfs_get_inode(struct super_block *sb,
1232                                         u64 ino, u32 generation)
1233 {
1234         struct inode *inode;
1235
1236         inode = lafs_iget(sb, ino, SYNC);
1237         if (IS_ERR(inode))
1238                 return ERR_CAST(inode);
1239         if (generation && inode->i_generation != generation) {
1240                 iput(inode);
1241                 return ERR_PTR(-ESTALE);
1242         }
1243
1244         return inode;
1245 }
1246
1247 static struct dentry *lafs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1248                                         int fh_len, int fh_type)
1249 {
1250         return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1251                                     lafs_nfs_get_inode);
1252 }
1253
1254 static struct dentry *lafs_fh_to_parent(struct super_block *sb, struct fid *fid,
1255                                         int fh_len, int fh_type)
1256 {
1257         return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1258                                     lafs_nfs_get_inode);
1259 }
1260
1261 static struct dentry *lafs_get_parent(struct dentry *child)
1262 {
1263         ino_t inum = LAFSI(child->d_inode)->md.file.parent;
1264         struct inode *inode = lafs_iget(child->d_inode->i_sb, inum, SYNC);
1265         if (IS_ERR(inode))
1266                 return ERR_CAST(inode);
1267         return d_obtain_alias(inode);
1268 }
1269
1270 static const struct export_operations lafs_export_ops = {
1271         .fh_to_dentry = lafs_fh_to_dentry,
1272         .fh_to_parent = lafs_fh_to_parent,
1273         .get_parent = lafs_get_parent,
1274 };
1275
1276 static struct inode *lafs_alloc_inode(struct super_block *sb)
1277 {
1278         struct lafs_inode *li;
1279         li = kmalloc(sizeof(*li), GFP_NOFS);
1280         if (!li)
1281                 return NULL;
1282         inode_init_once(&li->vfs_inode);
1283         li->vfs_inode.i_data.backing_dev_info = sb->s_bdi;
1284         li->iblock = NULL;
1285         li->dblock = NULL;
1286         li->update_cluster = 0;
1287         li->md.fs.name = NULL;
1288
1289         init_rwsem(&li->ind_sem);
1290         INIT_LIST_HEAD(&li->free_index);
1291
1292         return &li->vfs_inode;
1293 }
1294
1295 static void kfree_inode(struct rcu_head *head)
1296 {
1297         struct lafs_inode *lai = container_of(head, struct lafs_inode,
1298                                               md.rcu);
1299         if (lai->type == TypeInodeFile)
1300                 kfree(lai->md.fs.name);
1301         kfree(lai);
1302 }
1303
1304 void lafs_destroy_inode(struct inode *inode)
1305 {
1306         struct datablock *db;
1307
1308         BUG_ON(!list_empty(&inode->i_sb_list));
1309         // Cannot test i_list as dispose_list just does list_del
1310         db = lafs_inode_get_dblock(inode, MKREF(destroy));
1311
1312         if (db) {
1313                 set_bit(I_Destroyed, &LAFSI(inode)->iflags);
1314                 putdref(db, MKREF(destroy));
1315         } else {
1316                 spin_lock(&inode->i_data.private_lock);
1317                 if (LAFSI(inode)->iblock)
1318                         LAFS_BUG(atomic_read(&LAFSI(inode)->iblock->b.refcnt),
1319                                  &LAFSI(inode)->iblock->b);
1320                 /* FIXME could there be Async blocks keeps a refcount?
1321                  * we should free them
1322                  */
1323                 spin_unlock(&inode->i_data.private_lock);
1324                 lafs_release_index(&LAFSI(inode)->free_index);
1325                 call_rcu(&LAFSI(inode)->md.rcu,
1326                          kfree_inode);
1327         }
1328 }
1329
1330 static int lafs_sync_fs(struct super_block *sb, int wait)
1331 {
1332         if (!wait)
1333                 /* We only reach here if s_dirt was set, so it
1334                  * is reasonable to force a checkpoint.
1335                  */
1336                 lafs_checkpoint_start(fs_from_sb(sb));
1337         else
1338                 printk("FIXME I should wait for the checkpoint to finish\n");
1339         return 0;
1340 }
1341
1342 static int lafs_statfs(struct dentry *de, struct kstatfs *buf)
1343 {
1344         int i;
1345         u32 fsid;
1346         u32 *fsuuid;
1347         struct fs *fs = fs_from_inode(de->d_inode);
1348         struct lafs_inode *root = LAFSI(fs->ss[0].root);
1349
1350         fsid = 0;
1351         fsuuid = (u32 *)fs->state->uuid;
1352         for (i = 0; i < 16 / 4 ; i++)
1353                 fsid ^= le32_to_cpu(fsuuid[i]);
1354
1355         spin_lock(&root->vfs_inode.i_lock);
1356         buf->f_type = 0x4C614654; /* "LaFS" */
1357         buf->f_bsize = fs->blocksize;
1358         buf->f_blocks = root->md.fs.blocks_allowed;
1359         if (buf->f_blocks == 0) {
1360                 /* should subtract usage of all other filesystems...*/
1361                 for (i = 0; i < fs->devs_loaded; i++)
1362                         buf->f_blocks += fs->devs[i].size;
1363         }
1364         /* "bavail" is "blocks we could succeed in adding to the filesystem".
1365          * "bfree" is effectively total blocks - used blocks
1366          */
1367         buf->f_bavail = fs->free_blocks + fs->clean_reserved - fs->allocated_blocks;
1368         buf->f_bfree = buf->f_blocks - (root->md.fs.cblocks_used +
1369                                         root->md.fs.pblocks_used +
1370                                         root->md.fs.ablocks_used);
1371         dprintk("df: tot=%ld free=%ld avail=%ld(%ld-%ld-%ld) cb=%ld pb=%ld ab=%ld\n",
1372                 (long)buf->f_blocks, (long)buf->f_bfree, (long)buf->f_bavail,
1373                 (long)fs->free_blocks, (long)fs->clean_reserved, (long)fs->allocated_blocks,
1374                 (long)root->md.fs.cblocks_used, (long)root->md.fs.pblocks_used,
1375                 (long)root->md.fs.ablocks_used);
1376
1377         buf->f_files = 0;
1378         buf->f_ffree = 0;
1379         buf->f_fsid.val[0] = fsid; /* FIXME */
1380         buf->f_namelen = 255;
1381         buf->f_frsize = 0;
1382         spin_unlock(&root->vfs_inode.i_lock);
1383         return 0;
1384 }
1385
1386 /* FIXME we hold inode_lock while calling drop_inode, so
1387  * extra locking isn't really welcome....???
1388  */
1389 static void lafs_drop_inode(struct inode *inode)
1390 {
1391         struct fs *fs = fs_from_inode(inode);
1392         struct datablock *db;
1393
1394         /* This lock that we now hold on the inode could prevent
1395          * the cleaner from getting the inode.  So after
1396          * the complete the drop we might need to wake the cleaner.
1397          */
1398
1399         db = lafs_inode_get_dblock(inode, MKREF(drop));
1400
1401         generic_drop_inode(inode);
1402         if (db && test_bit(B_Async, &db->b.flags))
1403                 lafs_wake_thread(fs);
1404         if (db)
1405                 putdref(db, MKREF(drop));
1406 }
1407
1408 static struct super_operations lafs_sops = {
1409         .alloc_inode    = lafs_alloc_inode,
1410         .destroy_inode  = lafs_destroy_inode,  /* Inverse of 'alloc_inode' */
1411         /* Don't use read_inode */
1412         .dirty_inode    = lafs_dirty_inode,
1413         /* .write_inode not needed */
1414         /* put_inode ?? */
1415         .drop_inode     = lafs_drop_inode,
1416         /* drop_inode ?? */                     /* default will call delete or forget
1417                                                  * where 'forget' flushes and clears
1418                                                  */
1419
1420         .clear_inode    = lafs_clear_inode,    /* forget internal state of this inode */
1421         .delete_inode   = lafs_delete_inode,   /* remove this inode from filesystem */
1422         .put_super      = lafs_put_super,
1423         .sync_fs        = lafs_sync_fs,
1424         /* write_super_lockfs ?? */
1425         /* unlockfs ?? */
1426         .statfs         = lafs_statfs,
1427         /* remount_fs ?? */
1428 };
1429
1430 MODULE_AUTHOR("Neil Brown");
1431 MODULE_DESCRIPTION("LaFS - Log Structured File System");
1432 MODULE_LICENSE("GPL");
1433 module_init(lafs_init);
1434 module_exit(lafs_exit);
1435 int lafs_trace = 1;
1436 module_param(lafs_trace, int, 0644);
1437
1438 #ifdef DUMP
1439 struct fs *dfs;
1440 static int do_dump(const char *val, struct kernel_param *kp)
1441 {
1442         extern void lafs_dump_orphans(void);
1443         extern void lafs_dump_tree(void);
1444         extern void lafs_dump_cleanable(void);
1445         extern void lafs_dump_usage(void);
1446
1447         printk("Want dump of %s\n", val);
1448         if (strncmp(val, "orphan", 6) == 0)
1449                 lafs_dump_orphans();
1450         if (strncmp(val, "tree", 4) == 0)
1451                 lafs_dump_tree();
1452         if (strncmp(val, "cleanable", 9) == 0)
1453                 lafs_dump_cleanable();
1454         if (strncmp(val, "usage", 5) == 0)
1455                 lafs_dump_usage();
1456         return 0;
1457 }
1458
1459 static int get_dump(char *buffer, struct kernel_param *kp)
1460 {
1461         strcpy(buffer, "orphans,tree,cleanable,usage");
1462         return strlen(buffer);
1463 }
1464
1465 int arg;
1466 module_param_call(dump, do_dump, get_dump, &arg, 0775);
1467 #endif