super.c

   1
   2 /*
   3  * fs/lafs/super.c
   4  * Copyright (C) 2005-2009
   5  * Neil Brown <neilb@suse.de>
   6  * Released under the GPL, version 2
   7  */
   8
   9 #include        "lafs.h"
  10 #include        <linux/namei.h>
  11 #include        <linux/crc32.h>
  12 #include        <linux/statfs.h>
  13 #include        <linux/mount.h>
  14 #include        <linux/exportfs.h>
  15 #include        <linux/slab.h>
  16
  17 static struct super_operations lafs_sops;
  18 static const struct export_operations lafs_export_ops;
  19
  20 /*---------------------------------------------------------------------
  21  * Write out state and super blocks
  22  *  The super blocks only need to be written when the geometry of the
  23  *  array changes such as when a device is added, removed, or resized.
  24  *  So we don't bother with that just yet.
  25  *  The state block needs to be written - twice on each device - whenever
  26  *  a checkpoint is completed.  All copies are identical and the writes
  27  *  proceed in parallel.  There are 4 stateblock locations on each device.
  28  *  2 are typically less recent than the other two.  We over-write the
  29  *  less-recent copies.
  30  *  FIXME on a RAID4 we should pad the write to be a full stripe.
  31  *
  32  * Locking issues:  This is called from the checkpoint thread and so
  33  *  it does not race with anything else exclusive to that thread.
  34  *  The nonlog information needs to be reviewed once that functionality
  35  *  is implemented.
  36  */
  37
  38 int lafs_write_state(struct fs *fs)
  39 {
  40         struct lafs_state *st;
  41         int i, d;
  42
  43         fs->seq++;
  44         st = fs->state;
  45         st->seq = cpu_to_le32(fs->seq);
  46         st->nonlog_segment = cpu_to_le32(fs->nonlog_segment);
  47         st->nonlog_dev = cpu_to_le16(fs->nonlog_dev);
  48         st->nonlog_offset = cpu_to_le16(fs->nonlog_offset);
  49         st->nextyouth = cpu_to_le16(fs->youth_next);
  50         st->checkpointcluster = cpu_to_le64(fs->checkpointcluster);
  51         for (i = 0; i < fs->maxsnapshot; i++)
  52                 st->root_inodes[i] = cpu_to_le64(fs->ss[i].root_addr);
  53
  54         st->checksum = 0;
  55         st->checksum = crc32_le(0, (unsigned char *)st, fs->statesize);
  56
  57         for (d = 0; d < fs->devices ; d++)
  58                 for (i = (fs->seq & 1); i < 4 ; i += 2)
  59                         lafs_super_write(fs, d, fs->devs[d].stateaddr[i] >> 9,
  60                                          (char *)st, fs->statesize);
  61         lafs_super_wait(fs);
  62         /* FIXME what about a write error ??? */
  63         return 0;
  64 }
  65
  66 static int
  67 valid_devblock(struct lafs_dev *db, sector_t addr, sector_t size)
  68 {
  69         /* check that this devblock is valid, given that
  70          * it was found at sector 'addr'
  71          */
  72         u32 crc, crc2;
  73         u64 byteaddr;
  74         sector_t segsize;
  75         int i;
  76
  77         if (strncmp(db->idtag, "LaFS-DeviceBlock", 16) != 0)
  78                 return 0;
  79         if (le32_to_cpu(db->version) != LAFS_DEV_VERS)
  80                 return 0;
  81         /* uuid can be anything */
  82         crc = db->checksum;
  83         db->checksum = 0;
  84         crc2 = crc32_le(0, (unsigned char *)db, LAFS_DEVBLK_SIZE);
  85         db->checksum = crc;
  86         if (crc2 != crc) {
  87                 dprintk("%lx != %lx\n", (unsigned long)crc,
  88                         (unsigned long)crc2);
  89                 return 0;
  90         }
  91
  92         byteaddr = (u64)addr << 9; /* convert to byte */
  93         if (le64_to_cpu(db->devaddr[0]) != byteaddr &&
  94             le64_to_cpu(db->devaddr[1]) != byteaddr)
  95                 return 0;
  96
  97         if (db->statebits < 10 || db->statebits > 16)
  98                 return 0;
  99         if (db->blockbits < 9 || db->blockbits > 20)
 100                 return 0;
 101         if (le16_to_cpu(db->width) < 1 || le16_to_cpu(db->width) >= 512)
 102                 return 0;
 103         if (le32_to_cpu(db->stride) < 1)
 104                 return 0;
 105         /* devaddr[0] must be early, [1] must be late */
 106         if (le64_to_cpu(db->devaddr[0]) >=
 107             le64_to_cpu(db->segment_offset))
 108                 return 0;
 109
 110         if (le64_to_cpu(db->devaddr[1]) <
 111             le64_to_cpu(db->segment_offset) +
 112             ((((sector_t)le32_to_cpu(db->segment_count)
 113                * le32_to_cpu(db->segment_size)))
 114              << db->blockbits))
 115                 return 0;
 116
 117         /* 2 is an absolute minimum segment size, a few hundred is more
 118          * likely. We'll put a lower limit of 8, and an upper of 800000
 119          */
 120         if (le32_to_cpu(db->segment_size) < 8 ||
 121             le32_to_cpu(db->segment_size) > 800000)
 122                 return 0;
 123
 124         if (le32_to_cpu(db->segment_offset) >
 125             (le32_to_cpu(db->segment_size)<<db->blockbits) * 10)
 126                 return 0;
 127
 128         /* The 4 state blocks live before the first or after the last segment.
 129          * The distance from start of first to end of last is either:
 130          * - segment_count * segment_size  if width*stride <= segment_size
 131          * - (width-1) * stride + segment_size / width * segment_count
 132          *                if width * stride > segment_size
 133          */
 134         segsize = le32_to_cpu(db->segment_size);
 135         segsize *= le32_to_cpu(db->segment_count);
 136         if (le16_to_cpu(db->width) *  le32_to_cpu(db->stride)
 137             > le32_to_cpu(db->segment_size)) {
 138                 int stride = le32_to_cpu(db->stride);
 139                 int width = le16_to_cpu(db->width);
 140
 141                 sector_div(segsize, width);
 142                 segsize += (width - 1) * stride;
 143         }
 144         segsize <<= db->blockbits;
 145         for (i = 0; i < 4; i++) {
 146                 sector_t addr = le64_to_cpu(db->stateaddr[i]);
 147                 int offset = le32_to_cpu(db->segment_offset);
 148                 if (addr + (1<<db->statebits) > offset &&
 149                     addr < offset + segsize)
 150                         return 0;
 151                 if (addr + (1<<db->statebits) > (size << db->blockbits))
 152                         return 0;
 153         }
 154
 155         /* Check all segments fit within device */
 156         if (le32_to_cpu(db->segment_offset) + segsize > (size << db->blockbits))
 157                 return 0;
 158
 159         /* I guess it look sane enough... */
 160         return 1;
 161 }
 162
 163 static int
 164 compare_dev(struct lafs_dev *orig, struct lafs_dev *new)
 165 {
 166         /* Both these are known to be valid.
 167          * Return:
 168          *   0 if they are for same filesystem, but 'new' is older
 169          *   1 if they are for same filesystem, and 'new' is newer
 170          *  -1 if they are for different filesystems
 171          */
 172         if (memcmp(orig->uuid, new->uuid, 16))
 173                 return -1;
 174         if (u32_after(le32_to_cpu(new->seq),
 175                       le32_to_cpu(orig->seq)))
 176                 return 1;
 177         return 0;
 178 }
 179
 180 static int
 181 valid_stateblock(struct lafs_state *st, struct lafs_dev *dv)
 182 {
 183         /* Given the 'dv' devblock, make sure 'st' is a valid
 184          * and consistent stateblock
 185          */
 186         u32 crc;
 187         if (strncmp(st->idtag, "LaFS-State-Block", 16) != 0)
 188                 return 0;
 189         if (le32_to_cpu(st->version) != LAFS_STATE_VERS)
 190                 return 0;
 191         crc = st->checksum;
 192         st->checksum = 0;
 193         if (crc32_le(0, (unsigned char *)st, 1<<dv->statebits) != crc)
 194                 return 0;
 195         st->checksum = crc;
 196
 197         if (memcmp(st->uuid, dv->uuid, 16))
 198                 return 0;
 199
 200         if (sizeof(*st) + le32_to_cpu(st->maxsnapshot) * 8
 201             > (1<<dv->statebits))
 202                 return 0;
 203
 204         /* Don't support RO sharing yet. */
 205         if (st->alt_seq)
 206                 return 0;
 207
 208         return 1;
 209 }
 210
 211 static int
 212 compare_state(struct lafs_state *orig, struct lafs_state *new)
 213 {
 214         /* return 1 if 'new' is actually newer than 'orig'.
 215          * We already know they are both valid and have the same
 216          * uuid... I don't think there is anything else to be checked
 217          */
 218         return u32_after(le32_to_cpu(new->seq), le32_to_cpu(orig->seq));
 219 }
 220
 221 /*
 222  * Mount options.
 223  * As we can have multiple devices, things are slightly non-obvious.
 224  * The 'devname' can be either a device name, starting '/', or
 225  * a filesytem name (not starting '/').
 226  * The 'data' is a standard comma-separated list of options.
 227  * For 'mount' these are:
 228  *    dev=/dev/X
 229  *              - devices in addition to 'dev_name'
 230  *    new=/dev/X
 231  *              - A new device, with a superblock already present, to be added.
 232  *    incomplete
 233  *              - don't complain if not all devices are given
 234  *    ?? quota stuff, cleaning parameters,
 235  *
 236  * For 'remount', options are
 237  *    dev=  - add another device
 238  *    new=  - the device is being added.
 239  *
 240  */
 241
 242 struct options {
 243         int devcnt;
 244         int curr_dev;
 245         int statebits, blockbits;
 246         struct devent {
 247                 const char *dev;
 248                 int is_new;
 249                 int is_name;
 250                 struct block_device *bdev;
 251                 struct lafs_dev *devblock;
 252                 struct lafs_state *stateblock;
 253                 int devchoice, statechoice;
 254         } *devlist;
 255         const char *name;
 256 };
 257 static int
 258 count_devs(const char *name, char *data)
 259 {
 260         int cnt = 0;
 261         if (*name == '/')
 262                 cnt = 1;
 263         while (data && *data) {
 264                 if (strncmp(data, "dev=", 4) == 0)
 265                         cnt++;
 266                 if (strncmp(data, "new=", 4) == 0)
 267                         cnt++;
 268                 data = strchr(data, ',');
 269                 if (data)
 270                         data++;
 271         }
 272         return cnt;
 273 }
 274
 275 static int
 276 parse_opts(struct options *op, const char *name, char *data)
 277 {
 278         int dv = 0;
 279         char *p;
 280
 281         memset(op, 0, sizeof(*op));
 282         op->devcnt = count_devs(name, data);
 283         op->devlist = kzalloc(op->devcnt*sizeof(op->devlist[0]), GFP_KERNEL);
 284
 285         if (!op->devlist)
 286                 return -ENOMEM;
 287
 288         op->name = NULL;
 289         if (*name == '/') {
 290                 op->devlist[dv].is_name = 1;
 291                 op->devlist[dv++].dev = name;
 292         } else
 293                 op->name = name;
 294         while ((p = strsep(&data, ",")) != NULL) {
 295                 if (!*p)
 296                         continue;
 297                 if (strncmp(p, "dev=", 4) == 0)
 298                         op->devlist[dv++].dev = p+4;
 299                 else if (strncmp(p, "new=", 4) == 0) {
 300                         op->devlist[dv].is_new = 1;
 301                         op->devlist[dv++].dev = p+4;
 302                 } else {
 303                         printk(KERN_ERR
 304                                "LaFS: Unrecognised mount option \"%s\"\n", p);
 305                         return -EINVAL;
 306
 307                 }
 308         }
 309         op->devcnt = dv;
 310
 311         return 0;
 312 }
 313
 314 static int
 315 lafs_load_super(struct block_device *bdev, void *opv, int silent)
 316 {
 317         /* Find the devblock and the stateblock for this device
 318
 319          * Only do basic internal consistancy checks.  Inter-device
 320          * checks happen later
 321          */
 322         struct options *op = opv;
 323         struct devent *dv;
 324         struct page *pg;
 325         sector_t sect, dev_addr = 0;
 326         int state_addr = 0;
 327         int err = 0;
 328         unsigned int n;
 329         int i;
 330         int have_dev = 0, have_state = 0;
 331         sector_t devsize;
 332
 333         dv = &op->devlist[op->curr_dev];
 334         BUG_ON(dv->devblock);
 335         BUG_ON(dv->stateblock);
 336
 337         n = queue_logical_block_size(bdev->bd_disk->queue);
 338         if (n < LAFS_DEVBLK_SIZE)
 339                 n = LAFS_DEVBLK_SIZE;
 340         BUG_ON(n > PAGE_SIZE);
 341         dv->devblock = kmalloc(n, GFP_KERNEL);
 342         if (!dv->devblock)
 343                 return -ENOMEM;
 344         pg = alloc_page(GFP_KERNEL);
 345         if (!pg)
 346                 return -ENOMEM;
 347
 348         devsize = i_size_read(bdev->bd_inode);
 349
 350         /* Now find a devblock, check the first two possible locations,
 351          * and the last two.  If two devblocks are found with different
 352          * uuids, we are confused!
 353          */
 354         sect = 0;
 355         for (i = 0; i < 4; i++) {
 356                 /* try to read block at 'sect' */
 357                 int ok = lafs_sync_page_io(bdev, sect, 0, n, pg, READ);
 358
 359                 if (ok && valid_devblock(page_address(pg), sect, devsize)) {
 360                         if (!have_dev) {
 361                                 have_dev = 1;
 362                                 memcpy(dv->devblock, page_address(pg), n);
 363                                 dev_addr = sect;
 364                         } else switch (compare_dev(dv->devblock,
 365                                                    page_address(pg))) {
 366                                 case 0: /* older, do nothing */
 367                                         break;
 368                                 case 1: /* newer, overwrite */
 369                                         memcpy(dv->devblock, page_address(pg), n);
 370                                         dev_addr = sect;
 371                                         break;
 372                                 default: /* inconsistent --- HELP */
 373                                         printk(KERN_ERR "LaFS: inconsistent device-blocks found.\n");
 374                                         err = -EINVAL;
 375                                         goto out;
 376                                 }
 377                 }
 378
 379                 if (i != 1)
 380                         sect += (n>>9);
 381                 else {
 382                         sect = devsize & ~(sector_t)(n-1);
 383                         sect >>= 9;
 384                         sect -= (n>>9)*2;
 385                 }
 386         }
 387         /* FIXME - we've lost the read error, if it was significant */
 388         err = -EINVAL;
 389         if (!have_dev) {
 390                 if (!silent)
 391                         printk(KERN_ERR "LaFS - no valid devblock found.\n");
 392                 goto out;
 393         }
 394
 395         /* OK, we have a valid devblock, that's nice.
 396          * Now we should be able to find some stateblocks.
 397          * The locations are in the devblock
 398          */
 399         n = le32_to_cpu(1<<dv->devblock->statebits);
 400         if ((n & (n-1)) ||
 401             n < queue_logical_block_size(bdev->bd_disk->queue) ||
 402             n > 128*1024) {
 403                 printk(KERN_ERR "LaFS: statesize of %u not acceptable.\n", n);
 404                 err = -EINVAL;
 405                 goto out;
 406         }
 407         dv->stateblock = kmalloc(n, GFP_KERNEL);
 408         err = -ENOMEM;
 409         if (!dv->stateblock)
 410                 goto out;
 411         for (i = 0; i < 4; i++) {
 412                 int ok;
 413                 sect = le64_to_cpu(dv->devblock->stateaddr[i])>>9;
 414                 ok = lafs_sync_page_io(bdev, sect, 0, n, pg, READ);
 415                 if (ok && valid_stateblock(page_address(pg), dv->devblock)) {
 416                         if (!have_state) {
 417                                 have_state = 1;
 418                                 memcpy(dv->stateblock, page_address(pg), n);
 419                                 state_addr = i;
 420                         } else if (compare_state(dv->stateblock,
 421                                                  page_address(pg))) {
 422                                 memcpy(dv->stateblock, page_address(pg), n);
 423                                 state_addr = i;
 424                         }
 425                 }
 426         }
 427
 428         if (have_state) {
 429                 err = 0;
 430                 dv->devchoice = dev_addr;
 431                 dv->statechoice = state_addr;
 432         } else {
 433                 err = -EINVAL;
 434                 if (!silent)
 435                         printk(KERN_ERR "LaFS: no valid stateblock found.\n");
 436         }
 437 out:
 438         page_cache_release(pg);
 439         return err;
 440 }
 441
 442 static int
 443 check_devs(struct options *op)
 444 {
 445         /* Check we have enough, that they are for the same
 446          * uuid, and they they don't overlap
 447          * Also check that 'seq' number of devblocks
 448          * are within '1'
 449          */
 450         int seqlo = le32_to_cpu(op->devlist[0].devblock->seq);
 451         int seqhi = le32_to_cpu(op->devlist[0].devblock->seq);
 452         int newdev = 0;
 453         int newstate = 0;
 454         int i, j;
 455
 456         for (i = 1; i < op->devcnt; i++) {
 457                 if (memcmp(op->devlist[0].stateblock->uuid,
 458                            op->devlist[i].stateblock->uuid,
 459                            16) != 0)
 460                         return -EINVAL;
 461
 462                 if (le32_to_cpu(op->devlist[i].devblock->seq) == seqlo)
 463                         ;
 464                 else if (le32_to_cpu(op->devlist[i].devblock->seq) == seqlo+1) {
 465                         newdev = i;
 466                         seqhi = seqlo+1;
 467                 } else if (le32_to_cpu(op->devlist[i].devblock->seq) == seqhi-1)
 468                         seqlo = seqhi-1;
 469                 else
 470                         return -EINVAL;
 471
 472                 if (u32_after(le32_to_cpu(op->devlist[i].stateblock->seq),
 473                               le32_to_cpu(op->devlist[newstate].
 474                                           stateblock->seq)))
 475                         newstate = i;
 476         }
 477         if (le32_to_cpu(op->devlist[newstate].stateblock->devices)
 478             != op->devcnt)
 479                 return -EINVAL;
 480
 481         op->statebits = op->devlist[0].devblock->statebits;
 482         op->blockbits = op->devlist[0].devblock->blockbits;
 483
 484         /* Now check devices don't overlap in start/size.
 485          * We do a simple quadratic search
 486          */
 487         for (i = 0; i < op->devcnt; i++)
 488                 for (j = 0; j < op->devcnt; j++)
 489                         if (i != j)
 490                                 if (le64_to_cpu(op->devlist[i].devblock->start) <
 491                                     le64_to_cpu(op->devlist[j].devblock->start) &&
 492
 493                                     le64_to_cpu(op->devlist[i].devblock->start)+
 494                                     le64_to_cpu(op->devlist[i].devblock->size) >
 495                                     le64_to_cpu(op->devlist[j].devblock->start))
 496                                         return -EINVAL;
 497         return newstate;
 498 }
 499
 500 /* we identify lafs superblocks by the filesystem uuid.  This means
 501  * that block-level snapshots cannot be mounted.  You should use
 502  * fs-level snapshots instead.
 503  */
 504 static int sb_test(struct super_block *sb, void *data)
 505 {
 506         struct sb_key *ptn = data;
 507         struct sb_key *sk = sb->s_fs_info;
 508         return memcmp(ptn->fs->state->uuid,
 509                       sk->fs->state->uuid, 16) == 0;
 510 }
 511
 512 static int sb_set(struct super_block *sb, void *data)
 513 {
 514         struct sb_key *ptn = data;
 515         sb->s_fs_info = ptn;
 516         return set_anon_super(sb, NULL);
 517 }
 518
 519
 520 static int
 521 lafs_load(struct fs *fs, struct options *op, int newest)
 522 {
 523         /* We seem to have a full set of devices for the filesystem.
 524          * Time to create our fs_info structure and fill it out.
 525          * This only includes information from the dev and state blocks.
 526          * Finding the root-inode comes a bit later.
 527          */
 528         struct lafs_state *st;
 529         int i;
 530         int err;
 531         struct sb_key *k;
 532
 533         st = fs->state = op->devlist[newest].stateblock;
 534         op->devlist[newest].stateblock = NULL;
 535 #ifdef DUMP
 536         dfs = fs;
 537 #endif
 538
 539         fs->seq = le32_to_cpu(st->seq);
 540         fs->devices = op->devcnt;
 541         fs->devs_loaded = fs->devices; /* FIXME use this or lose this */
 542         fs->statesize = 1 << op->statebits;
 543         fs->blocksize = 1 << op->blockbits;
 544         fs->blocksize_bits = op->blockbits;
 545
 546         fs->nonlog_segment = le32_to_cpu(st->nonlog_segment);
 547         fs->nonlog_dev = le16_to_cpu(st->nonlog_dev);
 548         fs->nonlog_offset = le16_to_cpu(st->nonlog_offset);
 549         fs->youth_next = le16_to_cpu(st->nextyouth);
 550         fs->checkpoint_youth = fs->youth_next;
 551         if (fs->youth_next < 8)
 552                 fs->youth_next = 8;
 553         fs->scan.first_free_pass = 1;
 554         fs->scan.free_dev = -1;
 555
 556         fs->maxsnapshot = le32_to_cpu(st->maxsnapshot);
 557
 558         fs->scan.free_usages = kmalloc(PAGE_SIZE, GFP_KERNEL);
 559         err = lafs_segtrack_init(fs->segtrack);
 560
 561         fs->ss = kzalloc(sizeof(struct snapshot)*fs->maxsnapshot, GFP_KERNEL);
 562         if (!fs->ss || !fs->scan.free_usages || err) {
 563                 if (!err)
 564                         err = -ENOMEM;
 565                 goto abort;
 566         }
 567
 568         fs->checkpointcluster = le64_to_cpu(st->checkpointcluster);
 569         for (i = 0; i < fs->maxsnapshot; i++) {
 570                 fs->ss[i].root_addr =
 571                         le64_to_cpu(st->root_inodes[i]);
 572                 dprintk("root inode %d are %llu\n",
 573                         i, fs->ss[i].root_addr);
 574         }
 575         INIT_LIST_HEAD(&fs->pending_orphans);
 576         INIT_LIST_HEAD(&fs->inode_index);
 577         INIT_LIST_HEAD(&fs->phase_leafs[0]);
 578         INIT_LIST_HEAD(&fs->phase_leafs[1]);
 579         INIT_LIST_HEAD(&fs->clean_leafs);
 580         INIT_LIST_HEAD(&fs->account_leafs);
 581         atomic_set(&fs->sb_writes_pending, 0);
 582         init_waitqueue_head(&fs->sb_writes_wait);
 583         init_waitqueue_head(&fs->async_complete);
 584         init_waitqueue_head(&fs->trunc_wait);
 585         mutex_init(&fs->cleaner.lock);
 586         spin_lock_init(&fs->stable_lock);
 587         spin_lock_init(&fs->alloc_lock);
 588         spin_lock_init(&fs->lock);
 589         init_waitqueue_head(&fs->phase_wait);
 590
 591         INIT_WORK(&fs->done_work, lafs_done_work);
 592
 593         /* FIXME add congention and unplug functions to this bdi */
 594         err = bdi_init(&fs->bdi);
 595         if (err)
 596                 goto abort;
 597
 598
 599         fs->phase_locked = 0;
 600         for (i = 0; i < WC_NUM; i++) {
 601                 int j;
 602                 mutex_init(&fs->wc[i].lock);
 603                 for (j = 0; j < 4 ; j++) {
 604                         atomic_set(&fs->wc[i].pending_cnt[j], 0);
 605                         INIT_LIST_HEAD(&fs->wc[i].pending_blocks[j]);
 606                 }
 607                 init_waitqueue_head(&fs->wc[i].pending_wait);
 608                 fs->wc[i].seg.dev = -1;
 609         }
 610
 611         fs->max_newsegs = 32; /* FIXME this should be configurable */
 612
 613         err = -ENOMEM;
 614         fs->devs = kzalloc(sizeof(struct fs_dev)*fs->devices, GFP_KERNEL);
 615         if (!fs->devs)
 616                 goto abort;
 617
 618         k = kzalloc(sizeof(*k), GFP_KERNEL);
 619         k->fs = fs;
 620         fs->prime_sb = sget(&lafs_fs_type, sb_test, sb_set, k);
 621         if (IS_ERR(fs->prime_sb)) {
 622                 kfree(k);
 623                 err = PTR_ERR(fs->prime_sb);
 624                 goto abort;
 625         }
 626         if (fs->prime_sb->s_root) {
 627                 /* filesystem with this uuid already exists */
 628                 deactivate_locked_super(fs->prime_sb);
 629                 kfree(k);
 630                 fs->prime_sb = NULL;
 631                 err = -EBUSY;
 632                 goto abort;
 633         }
 634         err = bdi_register_dev(&fs->bdi, fs->prime_sb->s_dev);
 635         if (err) {
 636                 deactivate_locked_super(fs->prime_sb);
 637                 kfree(k);
 638                 fs->prime_sb = NULL;
 639                 goto abort;
 640         }
 641         fs->prime_sb->s_bdi = &fs->bdi;
 642
 643         fs->prime_sb->s_blocksize = 1 << op->blockbits;
 644         fs->prime_sb->s_blocksize_bits = op->blockbits;
 645         fs->prime_sb->s_op = &lafs_sops;
 646         fs->prime_sb->s_export_op = &lafs_export_ops;
 647         fs->prime_sb->s_root = NULL;
 648
 649         /* We allow 29 bits for nanosecs, so they must be even. */
 650         fs->prime_sb->s_time_gran = 2;
 651
 652         for (i = 0; i < fs->devices; i++) {
 653                 struct fs_dev *dv = &fs->devs[i];
 654                 struct devent *de = &op->devlist[i];
 655                 int j;
 656                 dv->bdev = de->bdev;
 657                 de->bdev = NULL;
 658
 659                 dv->devblk = de->devblock;
 660                 de->devblock = NULL;
 661
 662                 dv->recent_dev = de->devchoice;
 663                 dv->recent_state = de->statechoice;
 664
 665                 dv->start = le64_to_cpu(dv->devblk->start);
 666                 dv->size = le64_to_cpu(dv->devblk->size);
 667                 dprintk("Dev %d seems to range %llu + %llu\n",
 668                         i, (unsigned long long)dv->start,
 669                         (unsigned long long)dv->size);
 670
 671                 dv->width = le16_to_cpu(dv->devblk->width);
 672                 dv->stride = le32_to_cpu(dv->devblk->stride);
 673                 dv->segment_size = le32_to_cpu(dv->devblk->segment_size);
 674                 dv->segment_offset = le32_to_cpu(dv->devblk->segment_offset);
 675                 dv->segment_count = le32_to_cpu(dv->devblk->segment_count);
 676                 dv->usage_inum = le32_to_cpu(dv->devblk->usage_inum);
 677
 678                 if (dv->segment_size > fs->max_segment)
 679                         fs->max_segment = dv->segment_size;
 680
 681                 if (dv->width * dv->stride <= dv->segment_size) {
 682                         dv->tables_per_seg = dv->segment_size /
 683                                 dv->width / dv->stride;
 684                         dv->rows_per_table = dv->stride;
 685                         dv->segment_stride = dv->segment_size;
 686                 } else {
 687                         dv->tables_per_seg = 1;
 688                         dv->rows_per_table = dv->segment_size / dv->width;
 689                         dv->segment_stride = dv->rows_per_table;
 690                 }
 691                 /* table size is the number of blocks in the segment usage
 692                  * file per snapshot
 693                  */
 694                 dv->tablesize = (dv->segment_count + (1<<(fs->blocksize_bits-1)) + 1)
 695                         >> (fs->blocksize_bits-1);
 696
 697                 for (j = 0; j < 2; j++)
 698                         dv->devaddr[j] = le64_to_cpu(dv->devblk->devaddr[j]);
 699                 for (j = 0; j < 4; j++)
 700                         dv->stateaddr[j] = le64_to_cpu(dv->devblk->stateaddr[j]);
 701         }
 702         return 0;
 703
 704 abort:
 705         bdi_destroy(&fs->bdi);
 706         kfree(fs->scan.free_usages);
 707         lafs_segtrack_free(fs->segtrack);
 708         kfree(fs->devs);
 709         kfree(fs->ss);
 710         kfree(fs);
 711         return -ENOMEM;
 712 }
 713
 714 static int show_orphans(struct fs *fs)
 715 {
 716         struct datablock *db;
 717         printk("Orphans:\n");
 718         list_for_each_entry(db, &fs->pending_orphans,
 719                             orphans) {
 720                 struct inode *ino = iget_my_inode(db);
 721                 printk("orphan=%s\n", strblk(&db->b));
 722                 if (ino)
 723                         lafs_print_tree(&LAFSI(ino)->iblock->b, 0);
 724                 iput(ino);
 725         }
 726         printk("cleaner active: %d %d\n", fs->cleaner.active,
 727                fs->scan.done);
 728         return 1; /* meaningless, but makes it easy to add to wait_event below */
 729 }
 730
 731 static void lafs_kill_sb(struct super_block *sb)
 732 {
 733         struct fs *fs = fs_from_sb(sb);
 734         /* Release the 'struct fs' */
 735         int i;
 736
 737         /* FIXME should I refcount this when there are multiple
 738          * filesets? How does that work?
 739          */
 740
 741         /* Delay final destruction of the root inode */
 742         /* FIXME all the sbs... */
 743         set_bit(I_Deleting, &LAFSI(fs->ss[0].root)->iflags);
 744
 745         /* FIXME I'm not sure we should be waiting for the
 746          * cleaner.  Maybe we should just release all tc->cleaning
 747          * blocks instead.
 748          */
 749         set_bit(CleanerDisabled, &fs->fsstate);
 750
 751         wait_event(fs->async_complete,
 752                    show_orphans(fs) &&
 753                    !test_bit(OrphansRunning, &fs->fsstate) &&
 754                    list_empty(&fs->pending_orphans) &&
 755                    fs->scan.done == 1 &&
 756                    fs->cleaner.active == 0);
 757
 758         if (LAFSI(fs->ss[0].root)->md.fs.accesstime) {
 759                 struct inode *i = LAFSI(fs->ss[0].root)->md.fs.accesstime;
 760                 LAFSI(fs->ss[0].root)->md.fs.accesstime = NULL;
 761                 iput(i);
 762         }
 763
 764         kill_anon_super(fs->prime_sb);
 765
 766         bdi_destroy(&fs->bdi);
 767
 768         for (i = 0; i < fs->devices; i++) {
 769                 struct fs_dev *dv = &fs->devs[i];
 770                 kfree(dv->devblk);
 771                 blkdev_put(dv->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 772         }
 773
 774         /* Final checkpoint will have cleared out the leafs lists,
 775          * so they should all be empty.
 776          */
 777         /* Lets see what is on the 'leaf' list? */
 778         for (i = 0; i < 2; i++) {
 779                 struct block *b;
 780                 dprintk("For phase %d\n", i);
 781         retry:
 782                 list_for_each_entry(b, &fs->phase_leafs[i], lru) {
 783                         /* FIXME this only OK for readonly mounts.
 784                          */
 785                         getref(b, MKREF(release));
 786                         lafs_refile(b, 0);
 787                         if (test_bit(B_Pinned, &b->flags)) {
 788                                 /* didn't fix the pincnt !! */
 789                                 printk("This was pinned: %s\n", strblk(b));
 790                                 lafs_print_tree(b, 1);
 791                                 BUG();
 792                         }
 793                         putref(b, MKREF(release));
 794                         goto retry;
 795                 }
 796         }
 797         BUG_ON(!list_empty(&fs->clean_leafs));
 798
 799         flush_scheduled_work();
 800         lafs_stop_thread(fs);
 801
 802         for (i = 0; i < 4; i++)
 803                 if (fs->cleaner.seg[i].chead)
 804                         put_page(fs->cleaner.seg[i].chead);
 805
 806         kfree(fs->state);
 807         kfree(fs->ss);
 808         kfree(fs->devs);
 809         lafs_segtrack_free(fs->segtrack);
 810         kfree(fs->scan.free_usages);
 811         kfree(fs->prime_sb->s_fs_info);
 812         kfree(fs);
 813 }
 814
 815 static void
 816 lafs_put_super(struct super_block *sb)
 817 {
 818         struct fs *fs = fs_from_sb(sb);
 819         int ss;
 820         struct lafs_inode *li;
 821
 822         /* If !fs->thread, we never really mounted the fs, so this
 823          * cleanup is inappropriate .. and cannot work anyway.
 824          */
 825         if (fs->thread) {
 826                 lafs_checkpoint_lock(fs);
 827                 lafs_checkpoint_start(fs);
 828                 if (sb == fs->prime_sb)
 829                         /* Don't incorporate any more segusage/quota updates. */
 830                         set_bit(FinalCheckpoint, &fs->fsstate);
 831                 lafs_checkpoint_unlock_wait(fs);
 832                 lafs_cluster_wait_all(fs);
 833         }
 834
 835         if (sb == fs->prime_sb) {
 836                 int d;
 837                 /* This is the main sb, not a snapshot or
 838                  * subordinate fs.
 839                  * Now that all inodes have been invalidated we can do
 840                  * the final checkpoint.
 841                  */
 842                 lafs_close_all_segments(fs);
 843                 lafs_empty_segment_table(fs);
 844                 lafs_seg_put_all(fs);
 845
 846                 iput(fs->orphans);
 847                 fs->orphans = NULL;
 848                 for (d=0; d < fs->devices; d++)
 849                         if (fs->devs[d].segsum) {
 850                                 iput(fs->devs[d].segsum);
 851                                 fs->devs[d].segsum = NULL;
 852                         }
 853         }
 854
 855         /* need to break a circular reference... */
 856         for (ss = 0; ss < fs->maxsnapshot; ss++)
 857                 if (fs->ss[ss].root &&
 858                     fs->ss[ss].root->i_sb == sb) {
 859                         dprintk("Putting ss %d\n", ss);
 860                         li = LAFSI(fs->ss[ss].root);
 861                         if (test_bit(B_Realloc, &li->dblock->b.flags))
 862                                 lafs_dump_tree();
 863                         iput(fs->ss[ss].root);
 864                         fs->ss[ss].root = NULL;
 865                         break;
 866                 }
 867 }
 868
 869 static int
 870 lafs_get_devs(struct fs *fs, struct options *op, int flags)
 871 {
 872         int err;
 873         int i;
 874
 875         for (i = 0; i < op->devcnt; i++) {
 876                 struct block_device *bdev;
 877                 op->curr_dev = i;
 878
 879                 bdev = blkdev_get_by_path(op->devlist[i].dev,
 880                                           FMODE_READ|FMODE_WRITE|FMODE_EXCL,
 881                                           fs);
 882                 err = PTR_ERR(bdev);
 883                 if (IS_ERR(bdev))
 884                         goto out;
 885                 err = lafs_load_super(bdev, op, flags & MS_SILENT ? 1 : 0);
 886                 if (err < 0)
 887                         goto out;
 888                 op->devlist[i].bdev = bdev;
 889         }
 890         return 0;
 891
 892 out:
 893         return err;
 894 }
 895
 896 static int
 897 lafs_get_sb(struct file_system_type *fs_type,
 898             int flags, const char *dev_name, void *data,
 899             struct vfsmount *mnt)
 900 {
 901         /* as we may have multiple devices, some in 'data', we cannot just
 902          * use get_sb_bdev, we need to roll-our-own.
 903          * We call get_sb_bdev on *each* bdev, and make sure the returned
 904          * superblocks are either all new, or all for the same filesystem.
 905          * If the later, we return the primary.
 906          * If the former, we init the filesystem copying static data
 907          * to all supers.
 908          * First we 'blkdev_get_by_path' each device, exclusive to lafs
 909          * Then we 'sget' a superblock that knows any/all the devices.
 910          * This may be pre-existing, or may be new
 911          * If new, it will be created knowing all devices.
 912          * If pre-existing, and don't have correct device list, error
 913          */
 914         struct options op;
 915         int err;
 916         int newest;
 917         struct fs *fs = kzalloc(sizeof(*fs), GFP_KERNEL);
 918         char *cdata = data;
 919         if (cdata == NULL)
 920                 cdata = "";
 921
 922         err = -ENOMEM;
 923         if (!fs)
 924                 goto out;
 925         err = parse_opts(&op, dev_name, cdata);
 926         if (err)
 927                 goto out;
 928
 929         /* We now have as list of device names.  We call blkdev_get_by_path
 930          * on each to collect some superblocks.
 931          */
 932         err = lafs_get_devs(fs, &op, flags);
 933         if (err)
 934                 goto out;
 935
 936         /* Each device has a valid dev and state block.  Hopefully they
 937          * are all for the same filesystem.  If they don't have the
 938          * same uuid, we will bale-out here.  We also check that we have
 939          * enough, and that they don't overlap.
 940          * While we are looking at state blocks, pick the newest.
 941          */
 942         newest = check_devs(&op);
 943         if (newest < 0) {
 944                 err = newest;
 945                 goto out;
 946         }
 947
 948         /* So they seem to be the same - better create our
 949          * 'fs' structure and fill it in
 950          */
 951         err = lafs_load(fs, &op, newest);
 952         if (err)
 953                 goto out;
 954
 955         /* Well, all the devices check out.  Now we need to find the
 956          * filesystem */
 957         err = lafs_mount(fs);
 958         if (err == 0)
 959                 err = lafs_start_thread(fs);
 960         if (err) {
 961                 /* Don't wait for any scan to finish ... */
 962                 fs->scan.done = 1;
 963                 fs->checkpointing = 0;
 964                 deactivate_locked_super(fs->prime_sb);
 965         } else {
 966                 fs->prime_sb->s_flags |= MS_ACTIVE;
 967                 simple_set_mnt(mnt, fs->prime_sb);
 968         }
 969         /* And there you have it.  Filesystem all mounted, root dir found,
 970          * metadata files initialised, all pigs fed, and ready to fly!!!
 971          */
 972
 973 out:
 974         /* Now we clean up 'options'.  Anything that is wanted has
 975          * been moved into 'fs', so we just discard anything we find
 976          */
 977         if (op.devlist) {
 978                 int i;
 979                 for (i = 0; i < op.devcnt; i++) {
 980                         kfree(op.devlist[i].devblock);
 981                         kfree(op.devlist[i].stateblock);
 982                         if (op.devlist[i].bdev)
 983                                 blkdev_put(op.devlist[i].bdev,
 984                                            FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 985                 }
 986                 kfree(op.devlist);
 987         }
 988         return err;
 989 }
 990
 991 static struct dentry *lafs_get_subset_root(struct inode *ino)
 992 {
 993         /* ino must be a TypeInodeFile inode in the prime filesystem. */
 994         struct fs *fs = fs_from_inode(ino);
 995         struct super_block *sb;
 996         int err = 0;
 997         struct inode *rootdir, *imapfile;
 998         struct dentry *root = NULL;
 999
1000         sb = fs->prime_sb;
1001
1002         rootdir = lafs_iget(ino, 2, SYNC);
1003         if (IS_ERR(rootdir) && PTR_ERR(rootdir) == -ENOENT) {
1004                 rootdir = lafs_new_inode(fs, ino, NULL,
1005                                          TypeDir, 2, 0755, NULL);
1006                 /* FIXME could the inode get written before we set
1007                  * the link count ??*/
1008                 rootdir->i_nlink = 2;
1009         }
1010         if (IS_ERR(rootdir))
1011                 err = PTR_ERR(rootdir);
1012         else {
1013                 root = d_alloc_root(rootdir);
1014                 imapfile = lafs_iget(ino, 1, SYNC);
1015                 if (IS_ERR(imapfile) && PTR_ERR(imapfile) == -ENOENT)
1016                         imapfile = lafs_new_inode(fs, ino, NULL,
1017                                                   TypeInodeMap, 1, 0, NULL);
1018
1019                 if (IS_ERR(imapfile))
1020                         err = PTR_ERR(imapfile);
1021                 else
1022                         iput(imapfile);
1023         }
1024
1025         if (!err) {
1026                 struct inode *atime = lafs_iget(ino, 3, SYNC);
1027                 if (!IS_ERR(atime)) {
1028                         if (LAFSI(atime)->type != TypeAccessTime) {
1029                                 iput(atime);
1030                                 err = -EINVAL;
1031                         } else
1032                                 LAFSI(ino)->md.fs.accesstime = atime;
1033                 } else if (PTR_ERR(atime) != -ENOENT)
1034                         err = PTR_ERR(ino);
1035         }
1036         if (err) {
1037                 dput(root);
1038                 return ERR_PTR(err);
1039         } else
1040                 return root;
1041 }
1042
1043 static int
1044 lafs_get_subset(struct file_system_type *fs_type,
1045                 int flags, const char *dev_name, void *data,
1046                 struct vfsmount *mnt)
1047 {
1048         /* mount, possibly creating, a sub-fileset.
1049          * dev_name must be an absolute path that leads
1050          * to an object in a lafs file-system (or snapshot).
1051          * The object must be either an InodeFile or
1052          * an empty directory in the main file-system
1053          * with mode 0 (though that rule might change).
1054          * In the latter case we change the object to an
1055          * InodeFile
1056          * FIXME must require readonly for snapshots, and readwrite
1057          * to create.
1058          */
1059
1060         struct nameidata nd;
1061         int err;
1062         struct super_block *sb;
1063         struct inode *ino;
1064         struct fs *fs;
1065         struct dentry *root;
1066
1067         err = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
1068         if (err)
1069                 goto out_noput;
1070         sb = nd.path.dentry->d_sb;
1071         err = -EINVAL;
1072         if (sb->s_type != &lafs_fs_type &&
1073             sb->s_type != &lafs_snap_fs_type)
1074                 goto out;
1075         /* FIXME test not a subset filesystem */
1076         ino = nd.path.dentry->d_inode;
1077         if (LAFSI(ino)->type != TypeInodeFile &&
1078             LAFSI(ino)->type != TypeDir)
1079                 goto out;
1080         fs = fs_from_sb(sb);
1081         down_write(&sb->s_umount);
1082         mutex_lock(&ino->i_mutex);
1083         err = 0;
1084         if (LAFSI(ino)->type == TypeDir) {
1085                 struct datablock *inodb;
1086                 /* maybe convert this to TypeInodeFile */
1087                 err = -EINVAL;
1088                 if (sb->s_type != &lafs_fs_type)
1089                         goto out_unlock;
1090                 if (ino->i_size)
1091                         /* FIXME maybe I should run orphans */
1092                         goto out_unlock;
1093                 if ((ino->i_mode & 07777) != 0)
1094                         goto out_unlock;
1095                 inodb = lafs_inode_dblock(ino, SYNC, MKREF(make_subset));
1096                 err = PTR_ERR(inodb);
1097                 if (IS_ERR(inodb))
1098                         goto out_unlock;
1099                 lafs_iolock_block(&inodb->b);
1100                 set_bit(B_PinPending, &inodb->b.flags);
1101                 lafs_iounlock_block(&inodb->b);
1102                 lafs_checkpoint_lock(fs);
1103                 err = lafs_pin_dblock(inodb, ReleaseSpace);
1104                 if (!err) {
1105                         struct fs_md *md;
1106                         u32 parent = LAFSI(ino)->md.file.parent;
1107                         /* OK, we are good to go making this filesystem */
1108                         LAFSI(ino)->type = TypeInodeFile;
1109                         LAFSI(ino)->metadata_size = (sizeof(struct la_inode) +
1110                                                      sizeof(struct fs_metadata));
1111                         ino->i_op = &lafs_subset_ino_operations;
1112                         ino->i_fop = &lafs_subset_file_operations;
1113                         md = &LAFSI(ino)->md.fs;
1114                         md->usagetable = 0;
1115                         ino->i_mtime = current_fs_time(sb);
1116                         md->cblocks_used = 0;
1117                         md->pblocks_used = 0;
1118                         md->ablocks_used = 0;
1119                         md->blocks_allowed = 10000; /* FIXME */
1120                         md->blocks_unalloc = 0;
1121                         /* FIXME should I be using inode_init here */
1122                         md->creation_age = fs->wc[0].cluster_seq;
1123                         md->inodes_used = 0;
1124                         md->parent = parent;
1125                         md->quota_inums[0] = 0;
1126                         md->quota_inums[1] = 0;
1127                         md->quota_inums[2] = 0;
1128                         md->quota_inodes[0] = NULL;
1129                         md->quota_inodes[1] = NULL;
1130                         md->quota_inodes[2] = NULL;
1131                         md->accesstime = NULL;
1132                         md->name = NULL;
1133                         lafs_dirty_dblock(inodb);
1134                         lafs_dirty_inode(ino);
1135                         /* We use a checkpoint to commit this change,
1136                          * it is too unusual to bother logging
1137                          */
1138                         lafs_checkpoint_start(fs);
1139                         lafs_checkpoint_unlock_wait(fs);
1140                 } else {
1141                         lafs_checkpoint_unlock(fs);
1142                 }
1143                 putdref(inodb, MKREF(make_subset));
1144                 if (err)
1145                         goto out_unlock;
1146         }
1147         /* We have a TypeInodeFile so we can make a superblock */
1148         root = lafs_get_subset_root(ino);
1149         iput(ino);
1150
1151         if (IS_ERR(root))
1152                 err = PTR_ERR(root);
1153         else {
1154                 mnt->mnt_sb = root->d_sb;
1155                 atomic_inc(&mnt->mnt_sb->s_active);
1156                 mnt->mnt_root = root;
1157         }
1158 out_unlock:
1159         mutex_unlock(&ino->i_mutex);
1160         if (err)
1161                 up_write(&ino->i_sb->s_umount);
1162 out:
1163         path_put(&nd.path);
1164 out_noput:
1165         return err;
1166 }
1167
1168 static struct dentry *subset_lookup(struct inode *dir, struct dentry *dentry,
1169                                     struct nameidata *nd)
1170 {
1171         d_add(dentry, NULL);
1172         return NULL;
1173 }
1174
1175 static int subset_readdir(struct file *filp, void *dirent, filldir_t filldir)
1176 {
1177         struct dentry *dentry = filp->f_dentry;
1178         struct lafs_inode *lai = LAFSI(dentry->d_inode);
1179         ino_t ino;
1180         loff_t i = filp->f_pos;
1181
1182         switch (i) {
1183         case 0:
1184                 ino = dentry->d_inode->i_ino;
1185                 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
1186                         break;
1187                 filp->f_pos ++;
1188                 i++;
1189                 /* fallthrough */
1190         case 1:
1191                 ino = lai->md.fs.parent;
1192                 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
1193                         break;
1194                 filp->f_pos++;
1195                 i++;
1196                 /* fallthrough */
1197         default:
1198                 break;
1199         }
1200         return 0;
1201 }
1202
1203 const struct file_operations lafs_subset_file_operations = {
1204         .readdir        = subset_readdir,
1205 };
1206
1207 const struct inode_operations lafs_subset_ino_operations = {
1208         .lookup         = subset_lookup,
1209 };
1210
1211
1212 struct file_system_type lafs_fs_type = {
1213         .owner          = THIS_MODULE,
1214         .name           = "lafs",
1215         .get_sb         = lafs_get_sb,
1216         .kill_sb        = lafs_kill_sb,
1217         .fs_flags       = FS_REQUIRES_DEV,
1218 };
1219
1220 static struct file_system_type lafs_subset_fs_type = {
1221         .owner          = THIS_MODULE,
1222         .name           = "lafs_subset",
1223         .get_sb         = lafs_get_subset,
1224 };
1225
1226 static int __init lafs_init(void)
1227 {
1228         int err;
1229
1230         BUILD_BUG_ON(B_NUM_FLAGS > 32);
1231
1232         err = lafs_ihash_init();
1233         err = err ?: register_filesystem(&lafs_fs_type);
1234         err = err ?: register_filesystem(&lafs_snap_fs_type);
1235         err = err ?: register_filesystem(&lafs_subset_fs_type);
1236         if (err)
1237                 goto out;
1238         return 0;
1239
1240 out:
1241         unregister_filesystem(&lafs_fs_type);
1242         unregister_filesystem(&lafs_snap_fs_type);
1243         unregister_filesystem(&lafs_subset_fs_type);
1244         lafs_ihash_free();
1245         return err;
1246 }
1247
1248 static void __exit lafs_exit(void)
1249 {
1250         unregister_filesystem(&lafs_fs_type);
1251         unregister_filesystem(&lafs_snap_fs_type);
1252         unregister_filesystem(&lafs_subset_fs_type);
1253         lafs_ihash_free();
1254 }
1255
1256 static struct inode *lafs_nfs_get_inode(struct super_block *sb,
1257                                         u64 ino, u32 generation)
1258 {
1259         struct fs *fs = fs_from_sb(sb);
1260         struct inode *inode;
1261
1262         inode = lafs_iget(fs->ss[0].root, ino, SYNC);
1263         if (IS_ERR(inode))
1264                 return ERR_CAST(inode);
1265         if (generation && inode->i_generation != generation) {
1266                 iput(inode);
1267                 return ERR_PTR(-ESTALE);
1268         }
1269
1270         return inode;
1271 }
1272
1273 static struct dentry *lafs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1274                                         int fh_len, int fh_type)
1275 {
1276         return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1277                                     lafs_nfs_get_inode);
1278 }
1279
1280 static struct dentry *lafs_fh_to_parent(struct super_block *sb, struct fid *fid,
1281                                         int fh_len, int fh_type)
1282 {
1283         return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1284                                     lafs_nfs_get_inode);
1285 }
1286
1287 static struct dentry *lafs_get_parent(struct dentry *child)
1288 {
1289         ino_t inum = 0;
1290         struct inode *inode;
1291         switch(LAFSI(child->d_inode)->type) {
1292         case TypeFile:
1293                 inum = LAFSI(child->d_inode)->md.file.parent;
1294                 break;
1295         case TypeInodeFile:
1296                 inum = LAFSI(child->d_inode)->md.fs.parent;
1297                 break;
1298         }
1299         inode = lafs_iget(LAFSI(child->d_inode)->filesys,
1300                           inum, SYNC);
1301         if (IS_ERR(inode))
1302                 return ERR_CAST(inode);
1303         return d_obtain_alias(inode);
1304 }
1305
1306 static const struct export_operations lafs_export_ops = {
1307         .fh_to_dentry = lafs_fh_to_dentry,
1308         .fh_to_parent = lafs_fh_to_parent,
1309         .get_parent = lafs_get_parent,
1310 };
1311
1312 static struct inode *lafs_alloc_inode(struct super_block *sb)
1313 {
1314         struct lafs_inode *li;
1315         li = kmalloc(sizeof(*li), GFP_NOFS);
1316         if (!li)
1317                 return NULL;
1318         inode_init_once(&li->vfs_inode);
1319         li->vfs_inode.i_data.backing_dev_info = sb->s_bdi;
1320         li->iblock = NULL;
1321         li->dblock = NULL;
1322         li->update_cluster = 0;
1323         li->md.fs.name = NULL;
1324
1325         init_rwsem(&li->ind_sem);
1326         INIT_LIST_HEAD(&li->free_index);
1327
1328         return &li->vfs_inode;
1329 }
1330
1331 static void kfree_inode(struct rcu_head *head)
1332 {
1333         struct lafs_inode *lai = container_of(head, struct lafs_inode,
1334                                               md.rcu);
1335         if (lai->type == TypeInodeFile)
1336                 kfree(lai->md.fs.name);
1337         kfree(lai);
1338 }
1339
1340 void lafs_destroy_inode(struct inode *inode)
1341 {
1342         struct datablock *db;
1343         struct inode *fsys = LAFSI(inode)->filesys;
1344
1345         BUG_ON(!list_empty(&inode->i_sb_list));
1346         // Cannot test i_list as dispose_list just does list_del
1347         db = lafs_inode_get_dblock(inode, MKREF(destroy));
1348
1349         if (db) {
1350                 set_bit(I_Destroyed, &LAFSI(inode)->iflags);
1351                 if (test_and_clear_bit(B_Async, &db->b.flags))
1352                         lafs_wake_thread(fs_from_inode(inode));
1353                 putdref(db, MKREF(destroy));
1354         } else {
1355                 if (fsys != inode)
1356                         iput(fsys);
1357                 LAFSI(inode)->filesys = NULL;
1358
1359                 spin_lock(&inode->i_data.private_lock);
1360                 if (LAFSI(inode)->iblock)
1361                         LAFS_BUG(atomic_read(&LAFSI(inode)->iblock->b.refcnt),
1362                                  &LAFSI(inode)->iblock->b);
1363                 /* FIXME could there be Async blocks keeps a refcount?
1364                  * we should free them
1365                  */
1366                 spin_unlock(&inode->i_data.private_lock);
1367                 lafs_release_index(&LAFSI(inode)->free_index);
1368                 call_rcu(&LAFSI(inode)->md.rcu,
1369                          kfree_inode);
1370         }
1371 }
1372
1373 static int lafs_sync_fs(struct super_block *sb, int wait)
1374 {
1375         if (fs_from_sb(sb)->thread == NULL)
1376                 /* Filesystem in not active - nothing to sync */
1377                 return 0;
1378         if (!wait)
1379                 /* We only reach here if s_dirt was set, so it
1380                  * is reasonable to force a checkpoint.
1381                  */
1382                 lafs_checkpoint_start(fs_from_sb(sb));
1383         else
1384                 lafs_checkpoint_wait(fs_from_sb(sb));
1385         return 0;
1386 }
1387
1388 static int lafs_statfs(struct dentry *de, struct kstatfs *buf)
1389 {
1390         int i;
1391         u32 fsid;
1392         u32 *fsuuid;
1393         struct fs *fs = fs_from_inode(de->d_inode);
1394         struct lafs_inode *fsroot = LAFSI(LAFSI(de->d_inode)->filesys);
1395         struct lafs_inode *laroot = LAFSI(fs->ss[0].root);
1396
1397         fsid = 0;
1398         fsuuid = (u32 *)fs->state->uuid;
1399         for (i = 0; i < 16 / 4 ; i++) {
1400                 fsid ^= le32_to_cpu(fsuuid[i]);
1401                 buf->f_fsid.val[i/2] = fsid;
1402         }
1403         buf->f_fsid.val[1] ^= fsroot->vfs_inode.i_ino;
1404         buf->f_type = 0x4C614654; /* "LaFS" */
1405         buf->f_bsize = fs->blocksize;
1406         buf->f_blocks = fsroot->md.fs.blocks_allowed;
1407         if (buf->f_blocks == 0) {
1408                 /* should subtract usage of all other filesystems...*/
1409                 for (i = 0; i < fs->devs_loaded; i++)
1410                         buf->f_blocks += fs->devs[i].size;
1411         }
1412
1413         buf->f_files = 0;
1414         buf->f_ffree = 0;
1415         buf->f_namelen = 255;
1416         buf->f_frsize = 0;
1417
1418         spin_lock(&laroot->vfs_inode.i_lock);
1419         /* "bavail" is "blocks we could succeed in adding to the filesystem".
1420          * "bfree" is effectively total blocks - used blocks
1421          */
1422         buf->f_bavail = fs->free_blocks + fs->clean_reserved - fs->allocated_blocks;
1423         spin_unlock(&laroot->vfs_inode.i_lock);
1424         spin_lock(&fsroot->vfs_inode.i_lock);
1425         buf->f_bfree = buf->f_blocks - (fsroot->md.fs.cblocks_used +
1426                                         fsroot->md.fs.pblocks_used +
1427                                         fsroot->md.fs.ablocks_used);
1428         if (buf->f_bfree < buf->f_bavail)
1429                 buf->f_bavail = buf->f_bfree;
1430         dprintk("df: tot=%ld free=%ld avail=%ld(%ld-%ld-%ld) cb=%ld pb=%ld ab=%ld\n",
1431                 (long)buf->f_blocks, (long)buf->f_bfree, (long)buf->f_bavail,
1432                 (long)fs->free_blocks, (long)fs->clean_reserved,
1433                 (long)fs->allocated_blocks,
1434                 (long)fsroot->md.fs.cblocks_used, (long)fsroot->md.fs.pblocks_used,
1435                 (long)fsroot->md.fs.ablocks_used);
1436         spin_unlock(&fsroot->vfs_inode.i_lock);
1437         return 0;
1438 }
1439
1440 static struct super_operations lafs_sops = {
1441         .alloc_inode    = lafs_alloc_inode,
1442         .destroy_inode  = lafs_destroy_inode,  /* Inverse of 'alloc_inode' */
1443         /* Don't use read_inode */
1444         .dirty_inode    = lafs_dirty_inode,
1445         /* .write_inode not needed */
1446         /* put_inode ?? */
1447
1448         .evict_inode    = lafs_evict_inode,
1449         .put_super      = lafs_put_super,
1450         .sync_fs        = lafs_sync_fs,
1451         /* write_super_lockfs ?? */
1452         /* unlockfs ?? */
1453         .statfs         = lafs_statfs,
1454         /* remount_fs ?? */
1455 };
1456
1457 MODULE_AUTHOR("Neil Brown");
1458 MODULE_DESCRIPTION("LaFS - Log Structured File System");
1459 MODULE_LICENSE("GPL");
1460 module_init(lafs_init);
1461 module_exit(lafs_exit);
1462 int lafs_trace = 1;
1463 module_param(lafs_trace, int, 0644);
1464
1465 #ifdef DUMP
1466 struct fs *dfs;
1467 static int do_dump(const char *val, struct kernel_param *kp)
1468 {
1469         extern void lafs_dump_orphans(void);
1470         extern void lafs_dump_tree(void);
1471         extern void lafs_dump_cleanable(void);
1472         extern void lafs_dump_usage(void);
1473
1474         printk("Want dump of %s\n", val);
1475         if (strncmp(val, "orphan", 6) == 0)
1476                 lafs_dump_orphans();
1477         if (strncmp(val, "tree", 4) == 0)
1478                 lafs_dump_tree();
1479         if (strncmp(val, "cleanable", 9) == 0)
1480                 lafs_dump_cleanable();
1481         if (strncmp(val, "usage", 5) == 0)
1482                 lafs_dump_usage();
1483         return 0;
1484 }
1485
1486 static int get_dump(char *buffer, struct kernel_param *kp)
1487 {
1488         strcpy(buffer, "orphans,tree,cleanable,usage");
1489         return strlen(buffer);
1490 }
1491
1492 int arg;
1493 module_param_call(dump, do_dump, get_dump, &arg, 0775);
1494 #endif