1
2 /*
3  * fs/lafs/super.c
4  * Copyright (C) 2005-2009
5  * Neil Brown <neilb@suse.de>
6  * Released under the GPL, version 2
7  */
8
9 #include        "lafs.h"
10 #include        <linux/namei.h>
11 #include        <linux/crc32.h>
12 #include        <linux/statfs.h>
13 #include        <linux/mount.h>
14 #include        <linux/exportfs.h>
15 #include        <linux/slab.h>
16
17 static struct super_operations lafs_sops;
18 static const struct export_operations lafs_export_ops;
19
20 /*---------------------------------------------------------------------
21  * Write out state and super blocks
22  *  The super blocks only need to be written when the geometry of the
23  *  array changes such as when a device is added, removed, or resized.
24  *  So we don't bother with that just yet.
25  *  The state block needs to be written - twice on each device - whenever
26  *  a checkpoint is completed.  All copies are identical and the writes
27  *  proceed in parallel.  There are 4 stateblock locations on each device.
28  *  2 are typically less recent than the other two.  We over-write the
29  *  less-recent copies.
30  *  FIXME on a RAID4 we should pad the write to be a full stripe.
31  *
32  * Locking issues:  This is called from the checkpoint thread and so
33  *  it does not race with anything else exclusive to that thread.
34  *  The nonlog information needs to be reviewed once that functionality
35  *  is implemented.
36  */
37
38 int lafs_write_state(struct fs *fs)
39 {
40         struct lafs_state *st;
41         int i, d;
42
43         fs->seq++;
44         st = fs->state;
45         st->seq = cpu_to_le32(fs->seq);
46         st->nonlog_segment = cpu_to_le32(fs->nonlog_segment);
47         st->nonlog_dev = cpu_to_le16(fs->nonlog_dev);
48         st->nonlog_offset = cpu_to_le16(fs->nonlog_offset);
49         st->nextyouth = cpu_to_le16(fs->youth_next);
50         st->checkpointcluster = cpu_to_le64(fs->checkpointcluster);
51         for (i = 0; i < fs->maxsnapshot; i++)
52                 st->root_inodes[i] = cpu_to_le64(fs->ss[i].root_addr);
53
54         st->checksum = 0;
55         st->checksum = crc32_le(0, (unsigned char *)st, fs->statesize);
56
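        /* Slots 0/2 hold one generation of the state and slots 1/3 the
         * other, so (seq & 1) selects the pair that is now less recent
         * and can safely be over-written on every device.
         */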
57         for (d = 0; d < fs->devices ; d++)
58                 for (i = (fs->seq & 1); i < 4 ; i += 2)
59                         lafs_super_write(fs, d, fs->devs[d].stateaddr[i] >> 9,
60                                          (char *)st, fs->statesize);
61         lafs_super_wait(fs);
62         /* FIXME what about a write error ??? */
63         return 0;
64 }
65
66 static int
67 valid_devblock(struct lafs_dev *db, sector_t addr, sector_t size)
68 {
69         /* check that this devblock is valid, given that
70          * it was found at sector 'addr'
71          */
72         u32 crc, crc2;
73         u64 byteaddr;
74         sector_t segsize;
75         int i;
76
77         if (strncmp(db->idtag, "LaFS-DeviceBlock", 16) != 0)
78                 return 0;
79         if (strncmp(db->version, "AlphaDevel      ", 16) != 0)
80                 return 0;
81         /* uuid can be anything */
82         crc = db->checksum;
83         db->checksum = 0;
84         crc2 = crc32_le(0, (unsigned char *)db, LAFS_DEVBLK_SIZE);
85         db->checksum = crc;
86         if (crc2 != crc) {
87                 dprintk("%lx != %lx\n", (unsigned long)crc,
88                         (unsigned long)crc2);
89                 return 0;
90         }
91
92         byteaddr = (u64)addr << 9; /* convert to byte */
93         if (le64_to_cpu(db->devaddr[0]) != byteaddr &&
94             le64_to_cpu(db->devaddr[1]) != byteaddr)
95                 return 0;
96
97         if (db->statebits < 10 || db->statebits > 16)
98                 return 0;
99         if (db->blockbits < 9 || db->blockbits > 20)
100                 return 0;
101         if (le16_to_cpu(db->width) < 1 || le16_to_cpu(db->width) >= 512)
102                 return 0;
103         if (le32_to_cpu(db->stride) < 1)
104                 return 0;
105         /* devaddr[0] must be early, [1] must be late */
106         if (le64_to_cpu(db->devaddr[0]) >=
107             le64_to_cpu(db->segment_offset))
108                 return 0;
109
110         if (le64_to_cpu(db->devaddr[1]) <
111             le64_to_cpu(db->segment_offset) +
112             ((((sector_t)le32_to_cpu(db->segment_count)
113                * le32_to_cpu(db->segment_size)))
114              << db->blockbits))
115                 return 0;
116
117         /* 2 is an absolute minimum segment size, a few hundred is more
118          * likely. We'll put a lower limit of 8, and an upper of 800000
119          */
120         if (le32_to_cpu(db->segment_size) < 8 ||
121             le32_to_cpu(db->segment_size) > 800000)
122                 return 0;
123
124         if (le32_to_cpu(db->segment_offset) >
125             (le32_to_cpu(db->segment_size)<<db->blockbits) * 10)
126                 return 0;
127
128         /* The 4 state blocks live before the first or after the last segment.
129          * The distance from start of first to end of last is either:
130          * - segment_count * segment_size  if width*stride <= segment_size
131          * - (width-1) * stride + segment_size * segment_count / width
132          *                if width * stride > segment_size
133          */
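        /* For example (hypothetical numbers): with width=4, stride=1024
         * and segment_size=2048 blocks, width*stride (4096) exceeds
         * segment_size, so the span works out to
         * segment_count * segment_size / 4 + 3 * 1024 blocks.
         */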
134         segsize = le32_to_cpu(db->segment_size);
135         segsize *= le32_to_cpu(db->segment_count);
136         if (le16_to_cpu(db->width) *  le32_to_cpu(db->stride)
137             > le32_to_cpu(db->segment_size)) {
138                 int stride = le32_to_cpu(db->stride);
139                 int width = le16_to_cpu(db->width);
140
141                 sector_div(segsize, width);
142                 segsize += (width - 1) * stride;
143         }
144         segsize <<= db->blockbits;
145         for (i = 0; i < 4; i++) {
146                 sector_t addr = le64_to_cpu(db->stateaddr[i]);
147                 int offset = le32_to_cpu(db->segment_offset);
148                 if (addr + (1<<db->statebits) > offset &&
149                     addr < offset + segsize)
150                         return 0;
151                 if (addr + (1<<db->statebits) > (size << db->blockbits))
152                         return 0;
153         }
154
155         /* Check all segments fit within device */
156         if (le32_to_cpu(db->segment_offset) + segsize > (size << db->blockbits))
157                 return 0;
158
159         if (le32_to_cpu(db->level) > 10)
160                 return 0;
161
162         /* I guess it looks sane enough... */
163         return 1;
164 }
165
166 static int
167 compare_dev(struct lafs_dev *orig, struct lafs_dev *new)
168 {
169         /* Both these are known to be valid.
170          * Return:
171          *   0 if they are for same filesystem, but 'new' is older
172          *   1 if they are for same filesystem, and 'new' is newer
173          *  -1 if they are for different filesystems
174          */
175         if (memcmp(orig->uuid, new->uuid, 16))
176                 return -1;
177         if (u32_after(le32_to_cpu(new->seq),
178                       le32_to_cpu(orig->seq)))
179                 return 1;
180         return 0;
181 }
182
183 static int
184 valid_stateblock(struct lafs_state *st, struct lafs_dev *dv)
185 {
186         /* Given the 'dv' devblock, make sure 'st' is a valid
187          * and consistent stateblock
188          */
189         u32 crc;
190         if (strncmp(st->idtag, "LaFS-State-Block", 16) != 0)
191                 return 0;
192         if (strncmp(st->version, "AlphaDevel      ", 16) != 0)
193                 return 0;
194         crc = st->checksum;
195         st->checksum = 0;
196         if (crc32_le(0, (unsigned char *)st, 1<<dv->statebits) != crc)
197                 return 0;
198         st->checksum = crc;
199
200         if (memcmp(st->uuid, dv->uuid, 16))
201                 return 0;
202
203         if (sizeof(*st) + le32_to_cpu(st->maxsnapshot) * 8
204             > (1<<dv->statebits))
205                 return 0;
206
207         return 1;
208 }
209
210 static int
211 compare_state(struct lafs_state *orig, struct lafs_state *new)
212 {
213         /* return 1 if 'new' is actually newer than 'orig'.
214          * We already know they are both valid and have the same
215          * uuid... I don't think there is anything else to be checked
216          */
217         return u32_after(le32_to_cpu(new->seq), le32_to_cpu(orig->seq));
218 }
219
220 /*
221  * Mount options.
222  * As we can have multiple devices, things are slightly non-obvious.
223  * The 'devname' can be either a device name, starting '/', or
224  * a filesystem name (not starting '/').
225  * The 'data' is a standard comma-separated list of options.
226  * For 'mount' these are:
227  *    dev=/dev/X
228  *              - devices in addition to 'dev_name'
229  *    new=/dev/X
230  *              - A new device, with a superblock already present, to be added.
231  *    incomplete
232  *              - don't complain if not all devices are given
233  *    ?? quota stuff, cleaning parameters,
234  *
235  * For 'remount', options are
236  *    dev=  - add another device
237  *    new=  - the device is being added.
238  *
239  */
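/* Example invocation (illustrative only - the device names and mount
 * point are assumptions, not taken from this code):
 *    mount -t lafs /dev/sdb1 -o dev=/dev/sdc1,dev=/dev/sdd1 /mnt/lafs
 * mounts a three-device filesystem: one device given as the mount
 * source and the other two via 'dev=' options.
 */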
240
241 struct options {
242         int devcnt;
243         int curr_dev;
244         int statebits, blockbits;
245         struct devent {
246                 const char *dev;
247                 int is_new;
248                 int is_name;
249                 struct block_device *bdev;
250                 struct lafs_dev *devblock;
251                 struct lafs_state *stateblock;
252                 int devchoice, statechoice;
253         } *devlist;
254         const char *name;
255 };
256 static int
257 count_devs(const char *name, char *data)
258 {
259         int cnt = 0;
260         if (*name == '/')
261                 cnt = 1;
262         while (data && *data) {
263                 if (strncmp(data, "dev=", 4) == 0)
264                         cnt++;
265                 if (strncmp(data, "new=", 4) == 0)
266                         cnt++;
267                 data = strchr(data, ',');
268                 if (data)
269                         data++;
270         }
271         return cnt;
272 }
273
274 static int
275 parse_opts(struct options *op, const char *name, char *data)
276 {
277         int dv = 0;
278         char *p;
279
280         memset(op, 0, sizeof(*op));
281         op->devcnt = count_devs(name, data);
282         op->devlist = kzalloc(op->devcnt*sizeof(op->devlist[0]), GFP_KERNEL);
283
284         if (!op->devlist)
285                 return -ENOMEM;
286
287         op->name = NULL;
288         if (*name == '/') {
289                 op->devlist[dv].is_name = 1;
290                 op->devlist[dv++].dev = name;
291         } else
292                 op->name = name;
293         while ((p = strsep(&data, ",")) != NULL) {
294                 if (!*p)
295                         continue;
296                 if (strncmp(p, "dev=", 4) == 0)
297                         op->devlist[dv++].dev = p+4;
298                 else if (strncmp(p, "new=", 4) == 0) {
299                         op->devlist[dv].is_new = 1;
300                         op->devlist[dv++].dev = p+4;
301                 } else {
302                         printk(KERN_ERR
303                                "LaFS: Unrecognised mount option \"%s\"\n", p);
304                         return -EINVAL;
305
306                 }
307         }
308         op->devcnt = dv;
309
310         return 0;
311 }
312
313 static int
314 lafs_load_super(struct block_device *bdev, void *opv, int silent)
315 {
316         /* Find the devblock and the stateblock for this device
317          *
318          * Only do basic internal consistency checks.  Inter-device
319          * checks happen later.
320          */
321         struct options *op = opv;
322         struct devent *dv;
323         struct page *pg;
324         sector_t sect, dev_addr = 0, state_addr = 0;
325         int err = 0;
326         unsigned int n;
327         int i;
328         int have_dev = 0, have_state = 0;
329         sector_t devsize;
330
331         dv = &op->devlist[op->curr_dev];
332         BUG_ON(dv->devblock);
333         BUG_ON(dv->stateblock);
334
335         n = queue_logical_block_size(bdev->bd_disk->queue);
336         if (n < LAFS_DEVBLK_SIZE)
337                 n = LAFS_DEVBLK_SIZE;
338         BUG_ON(n > PAGE_SIZE);
339         dv->devblock = kmalloc(n, GFP_KERNEL);
340         if (!dv->devblock)
341                 return -ENOMEM;
342         pg = alloc_page(GFP_KERNEL);
343         if (!pg)
344                 return -ENOMEM;
345
346         devsize = i_size_read(bdev->bd_inode);
347
348         /* Now find a devblock, check the first two possible locations,
349          * and the last two.  If two devblocks are found with different
350          * uuids, we are confused!
351          */
352         sect = 0;
353         for (i = 0; i < 4; i++) {
354                 /* try to read block at 'sect' */
355                 int ok = lafs_sync_page_io(bdev, sect, 0, n, pg, READ);
356
357                 if (ok && valid_devblock(page_address(pg), sect, devsize)) {
358                         if (!have_dev) {
359                                 have_dev = 1;
360                                 memcpy(dv->devblock, page_address(pg), n);
361                                 dev_addr = sect;
362                         } else switch (compare_dev(dv->devblock,
363                                                    page_address(pg))) {
364                                 case 0: /* older, do nothing */
365                                         break;
366                                 case 1: /* newer, overwrite */
367                                         memcpy(dv->devblock, page_address(pg), n);
368                                         dev_addr = sect;
369                                         break;
370                                 default: /* inconsistent --- HELP */
371                                         printk(KERN_ERR "LaFS: inconsistent device-blocks found.\n");
372                                         err = -EINVAL;
373                                         goto out;
374                                 }
375                 }
376
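                /* Advance to the next candidate slot: the second copy
                 * follows the first; after i==1 jump to the two
                 * block-aligned slots at the very end of the device.
                 */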
377                 if (i != 1)
378                         sect += (n>>9);
379                 else {
380                         sect = devsize & ~(sector_t)(n-1);
381                         sect >>= 9;
382                         sect -= (n>>9)*2;
383                 }
384         }
385         /* FIXME - we've lost the read error, if it was significant */
386         err = -EINVAL;
387         if (!have_dev) {
388                 if (!silent)
389                         printk(KERN_ERR "LaFS - no valid devblock found.\n");
390                 goto out;
391         }
392
393         /* OK, we have a valid devblock, that's nice.
394          * Now we should be able to find some stateblocks.
395          * The locations are in the devblock
396          */
397         n = 1 << dv->devblock->statebits;
398         if ((n & (n-1)) ||
399             n < queue_logical_block_size(bdev->bd_disk->queue) ||
400             n > 128*1024) {
401                 printk(KERN_ERR "LaFS: statesize of %u not acceptable.\n", n);
402                 err = -EINVAL;
403                 goto out;
404         }
405         dv->stateblock = kmalloc(n, GFP_KERNEL);
406         err = -ENOMEM;
407         if (!dv->stateblock)
408                 goto out;
409         for (i = 0; i < 4; i++) {
410                 int ok;
411                 sect = le64_to_cpu(dv->devblock->stateaddr[i])>>9;
412                 ok = lafs_sync_page_io(bdev, sect, 0, n, pg, READ);
413                 if (ok && valid_stateblock(page_address(pg), dv->devblock)) {
414                         if (!have_state) {
415                                 have_state = 1;
416                                 memcpy(dv->stateblock, page_address(pg), n);
417                                 state_addr = i;
418                         } else if (compare_state(dv->stateblock,
419                                                  page_address(pg))) {
420                                 memcpy(dv->stateblock, page_address(pg), n);
421                                 state_addr = i;
422                         }
423                 }
424         }
425
426         if (have_state) {
427                 err = 0;
428                 dv->devchoice = dev_addr;
429                 dv->statechoice = state_addr;
430         } else {
431                 err = -EINVAL;
432                 if (!silent)
433                         printk(KERN_ERR "LaFS: no valid stateblock found.\n");
434         }
435 out:
436         page_cache_release(pg);
437         return err;
438 }
439
440 static int
441 check_devs(struct options *op)
442 {
443         /* Check that we have enough devices, that they are all for
444          * the same uuid, and that they don't overlap.
445          * Also check that the 'seq' numbers of the devblocks
446          * are within '1' of each other.
447          */
448         int seqlo = le32_to_cpu(op->devlist[0].devblock->seq);
449         int seqhi = le32_to_cpu(op->devlist[0].devblock->seq);
450         int newdev = 0;
451         int newstate = 0;
452         int i, j;
453
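        /* The devblock 'seq' numbers across devices must agree to within
         * one; a device whose seq is one ahead was presumably written
         * (e.g. added) most recently.
         */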
454         for (i = 1; i < op->devcnt; i++) {
455                 if (memcmp(op->devlist[0].stateblock->uuid,
456                            op->devlist[i].stateblock->uuid,
457                            16) != 0)
458                         return -EINVAL;
459
460                 if (le32_to_cpu(op->devlist[i].devblock->seq) == seqlo)
461                         ;
462                 else if (le32_to_cpu(op->devlist[i].devblock->seq) == seqlo+1) {
463                         newdev = i;
464                         seqhi = seqlo+1;
465                 } else if (le32_to_cpu(op->devlist[i].devblock->seq) == seqhi-1)
466                         seqlo = seqhi-1;
467                 else
468                         return -EINVAL;
469
470                 if (u32_after(le32_to_cpu(op->devlist[i].stateblock->seq),
471                               le32_to_cpu(op->devlist[newstate].
472                                           stateblock->seq)))
473                         newstate = i;
474         }
475         if (le32_to_cpu(op->devlist[newstate].stateblock->devices)
476             != op->devcnt)
477                 return -EINVAL;
478
479         op->statebits = op->devlist[0].devblock->statebits;
480         op->blockbits = op->devlist[0].devblock->blockbits;
481
482         /* Now check devices don't overlap in start/size.
483          * We do a simple quadratic search
484          */
485         for (i = 0; i < op->devcnt; i++)
486                 for (j = 0; j < op->devcnt; j++)
487                         if (i != j)
488                                 if (le64_to_cpu(op->devlist[i].devblock->start) <
489                                     le64_to_cpu(op->devlist[j].devblock->start) &&
490
491                                     le64_to_cpu(op->devlist[i].devblock->start)+
492                                     le64_to_cpu(op->devlist[i].devblock->size) >
493                                     le64_to_cpu(op->devlist[j].devblock->start))
494                                         return -EINVAL;
495         return newstate;
496 }
497
498 /* we identify lafs superblocks by the filesystem uuid.  This means
499  * that block-level snapshots cannot be mounted.  You should use
500  * fs-level snapshots instead.
501  */
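/* (A block-level copy presents the same uuid, so sget() would match the
 *  already-mounted filesystem in sb_test() and lafs_load() would refuse
 *  the mount with -EBUSY.)
 */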
502 static int sb_test(struct super_block *sb, void *data)
503 {
504         struct sb_key *ptn = data;
505         struct sb_key *sk = sb->s_fs_info;
506         return memcmp(ptn->fs->state->uuid,
507                       sk->fs->state->uuid, 16) == 0;
508 }
509
510 static int sb_set(struct super_block *sb, void *data)
511 {
512         struct sb_key *ptn = data;
513         sb->s_fs_info = ptn;
514         return set_anon_super(sb, NULL);
515 }
516
517
518 static int
519 lafs_load(struct fs *fs, struct options *op, int newest)
520 {
521         /* We seem to have a full set of devices for the filesystem.
522          * Time to create our fs_info structure and fill it out.
523          * This only includes information from the dev and state blocks.
524          * Finding the root-inode comes a bit later.
525          */
526         struct lafs_state *st;
527         int i;
528         int err;
529         struct sb_key *k;
530
531         st = fs->state = op->devlist[newest].stateblock;
532         op->devlist[newest].stateblock = NULL;
533 #ifdef DUMP
534         dfs = fs;
535 #endif
536
537         fs->seq = le32_to_cpu(st->seq);
538         fs->levels = le32_to_cpu(st->levels);
539         fs->devices = op->devcnt;
540         fs->devs_loaded = fs->devices; /* FIXME use this or lose this */
541         fs->statesize = 1 << op->statebits;
542         fs->blocksize = 1 << op->blockbits;
543         fs->blocksize_bits = op->blockbits;
544
545         fs->nonlog_segment = le32_to_cpu(st->nonlog_segment);
546         fs->nonlog_dev = le16_to_cpu(st->nonlog_dev);
547         fs->nonlog_offset = le16_to_cpu(st->nonlog_offset);
548         fs->youth_next = le16_to_cpu(st->nextyouth);
549         fs->checkpoint_youth = fs->youth_next;
550         if (fs->youth_next < 8)
551                 fs->youth_next = 8;
552         fs->scan.first_free_pass = 1;
553         fs->scan.free_dev = -1;
554
555         fs->maxsnapshot = le32_to_cpu(st->maxsnapshot);
556
557         fs->scan.free_usages = kmalloc(PAGE_SIZE, GFP_KERNEL);
558         err = lafs_segtrack_init(fs->segtrack);
559
560         fs->ss = kzalloc(sizeof(struct snapshot)*fs->maxsnapshot, GFP_KERNEL);
561         if (!fs->ss || !fs->scan.free_usages || err) {
562                 if (!err)
563                         err = -ENOMEM;
564                 goto abort;
565         }
566
567         fs->checkpointcluster = le64_to_cpu(st->checkpointcluster);
568         for (i = 0; i < fs->maxsnapshot; i++) {
569                 fs->ss[i].root_addr =
570                         le64_to_cpu(st->root_inodes[i]);
571                 dprintk("root inode %d are %llu\n",
572                         i, fs->ss[i].root_addr);
573         }
574         INIT_LIST_HEAD(&fs->pending_orphans);
575         INIT_LIST_HEAD(&fs->inode_index);
576         INIT_LIST_HEAD(&fs->phase_leafs[0]);
577         INIT_LIST_HEAD(&fs->phase_leafs[1]);
578         INIT_LIST_HEAD(&fs->clean_leafs);
579         INIT_LIST_HEAD(&fs->account_leafs);
580         atomic_set(&fs->sb_writes_pending, 0);
581         init_waitqueue_head(&fs->sb_writes_wait);
582         init_waitqueue_head(&fs->async_complete);
583         init_waitqueue_head(&fs->trunc_wait);
584         mutex_init(&fs->cleaner.lock);
585         spin_lock_init(&fs->stable_lock);
586         spin_lock_init(&fs->alloc_lock);
587         spin_lock_init(&fs->lock);
588         init_waitqueue_head(&fs->phase_wait);
589
590         INIT_WORK(&fs->done_work, lafs_done_work);
591
592         /* FIXME add congestion and unplug functions to this bdi */
593         err = bdi_init(&fs->bdi);
594         if (err)
595                 goto abort;
596         
597
598         fs->phase_locked = 0;
599         for (i = 0; i < WC_NUM; i++) {
600                 int j;
601                 mutex_init(&fs->wc[i].lock);
602                 for (j = 0; j < 4 ; j++) {
603                         atomic_set(&fs->wc[i].pending_cnt[j], 0);
604                         INIT_LIST_HEAD(&fs->wc[i].pending_blocks[j]);
605                 }
606                 init_waitqueue_head(&fs->wc[i].pending_wait);
607                 fs->wc[i].seg.dev = -1;
608         }
609
610         fs->max_newsegs = 32; /* FIXME this should be configurable */
611
612         err = -ENOMEM;
613         fs->devs = kzalloc(sizeof(struct fs_dev)*fs->devices, GFP_KERNEL);
614         if (!fs->devs)
615                 goto abort;
616
617         k = kzalloc(sizeof(*k), GFP_KERNEL);
618         k->fs = fs;
619         fs->prime_sb = sget(&lafs_fs_type, sb_test, sb_set, k);
620         if (IS_ERR(fs->prime_sb)) {
621                 kfree(k);
622                 err = PTR_ERR(fs->prime_sb);
623                 goto abort;
624         }
625         if (fs->prime_sb->s_root) {
626                 /* filesystem with this uuid already exists */
627                 deactivate_locked_super(fs->prime_sb);
628                 kfree(k);
629                 fs->prime_sb = NULL;
630                 err = -EBUSY;
631                 goto abort;
632         }
633         err = bdi_register_dev(&fs->bdi, fs->prime_sb->s_dev);
634         if (err) {
635                 deactivate_locked_super(fs->prime_sb);
636                 kfree(k);
637                 fs->prime_sb = NULL;
638                 goto abort;
639         }
640         fs->prime_sb->s_bdi = &fs->bdi;
641
642         fs->prime_sb->s_blocksize = 1 << op->blockbits;
643         fs->prime_sb->s_blocksize_bits = op->blockbits;
644         fs->prime_sb->s_op = &lafs_sops;
645         fs->prime_sb->s_export_op = &lafs_export_ops;
646         fs->prime_sb->s_root = NULL;
647
648         /* We allow 29 bits for nanosecs, so they must be even. */
649         fs->prime_sb->s_time_gran = 2;
650
651         for (i = 0; i < fs->devices; i++) {
652                 struct fs_dev *dv = &fs->devs[i];
653                 struct devent *de = &op->devlist[i];
654                 int j;
655                 dv->bdev = de->bdev;
656                 de->bdev = NULL;
657
658                 dv->devblk = de->devblock;
659                 de->devblock = NULL;
660
661                 dv->recent_dev = de->devchoice;
662                 dv->recent_state = de->statechoice;
663
664                 dv->start = le64_to_cpu(dv->devblk->start);
665                 dv->size = le64_to_cpu(dv->devblk->size);
666                 dprintk("Dev %d seems to range %llu + %llu\n",
667                         i, (unsigned long long)dv->start,
668                         (unsigned long long)dv->size);
669
670                 dv->width = le16_to_cpu(dv->devblk->width);
671                 dv->stride = le32_to_cpu(dv->devblk->stride);
672                 dv->segment_size = le32_to_cpu(dv->devblk->segment_size);
673                 dv->segment_offset = le32_to_cpu(dv->devblk->segment_offset);
674                 dv->segment_count = le32_to_cpu(dv->devblk->segment_count);
675                 dv->usage_inum = le32_to_cpu(dv->devblk->usage_inum);
676                 dv->level = le16_to_cpu(dv->devblk->level);
677
678                 if (dv->segment_size > fs->max_segment)
679                         fs->max_segment = dv->segment_size;
680
681                 if (dv->width * dv->stride <= dv->segment_size) {
682                         dv->tables_per_seg = dv->segment_size /
683                                 dv->width / dv->stride;
684                         dv->rows_per_table = dv->stride;
685                         dv->segment_stride = dv->segment_size;
686                 } else {
687                         dv->tables_per_seg = 1;
688                         dv->rows_per_table = dv->segment_size / dv->width;
689                         dv->segment_stride = dv->rows_per_table;
690                 }
691                 /* table size is the number of blocks in the segment usage
692                  * file per snapshot
693                  */
694                 dv->tablesize = (dv->segment_count + (1<<(fs->blocksize_bits-1)) + 1)
695                         >> (fs->blocksize_bits-1);
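                /* This assumes the usage file stores two bytes per
                 * segment, i.e. blocksize/2 counters per block - an
                 * inference from the (blocksize_bits-1) shift above.
                 */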
696
697                 for (j = 0; j < 2; j++)
698                         dv->devaddr[j] = le64_to_cpu(dv->devblk->devaddr[j]);
699                 for (j = 0; j < 4; j++)
700                         dv->stateaddr[j] = le64_to_cpu(dv->devblk->stateaddr[j]);
701         }
702         return 0;
703
704 abort:
705         bdi_destroy(&fs->bdi);
706         kfree(fs->scan.free_usages);
707         lafs_segtrack_free(fs->segtrack);
708         kfree(fs->devs);
709         kfree(fs->ss);
710         kfree(fs);
711         return err;
712 }
713
714 static int show_orphans(struct fs *fs)
715 {
716         struct datablock *db;
717         printk("Orphans:\n");
718         list_for_each_entry(db, &fs->pending_orphans,
719                             orphans) {
720                 struct inode *ino = iget_my_inode(db);
721                 printk("orphan=%s\n", strblk(&db->b));
722                 if (ino)
723                         lafs_print_tree(&LAFSI(ino)->iblock->b, 0);
724                 iput(ino);
725         }
726         printk("cleaner active: %d %d\n", fs->cleaner.active,
727                fs->scan.done);
728         return 1; /* meaningless, but makes it easy to add to wait_event below */
729 }
730
731 static void lafs_kill_sb(struct super_block *sb)
732 {
733         struct fs *fs = fs_from_sb(sb);
734         /* Release the 'struct fs' */
735         int i;
736
737         /* FIXME should I refcount this when there are multiple
738          * filesets? How does that work?
739          */
740
741         /* Delay final destruction of the root inode */
742         /* FIXME all the sbs... */
743         set_bit(I_Deleting, &LAFSI(fs->ss[0].root)->iflags);
744
745         /* FIXME I'm not sure we should be waiting for the
746          * cleaner.  Maybe we should just release all tc->cleaning
747          * blocks instead.
748          */
749         set_bit(CleanerDisabled, &fs->fsstate);
750
751         wait_event(fs->async_complete,
752                    show_orphans(fs) &&
753                    !test_bit(OrphansRunning, &fs->fsstate) &&
754                    list_empty(&fs->pending_orphans) &&
755                    fs->scan.done == 1 &&
756                    fs->cleaner.active == 0);
757
758         if (LAFSI(fs->ss[0].root)->md.fs.accesstime) {
759                 struct inode *i = LAFSI(fs->ss[0].root)->md.fs.accesstime;
760                 LAFSI(fs->ss[0].root)->md.fs.accesstime = NULL;
761                 iput(i);
762         }
763
764         kill_anon_super(fs->prime_sb);
765
766         bdi_destroy(&fs->bdi);
767
768         for (i = 0; i < fs->devices; i++) {
769                 struct fs_dev *dv = &fs->devs[i];
770                 kfree(dv->devblk);
771                 close_bdev_exclusive(dv->bdev, FMODE_READ|FMODE_WRITE);
772         }
773
774         /* Final checkpoint will have cleared out the leafs lists,
775          * so they should all be empty.
776          */
777         /* Let's see what is on the 'leaf' list. */
778         for (i = 0; i < 2; i++) {
779                 struct block *b;
780                 dprintk("For phase %d\n", i);
781         retry:
782                 list_for_each_entry(b, &fs->phase_leafs[i], lru) {
783                         /* FIXME this only OK for readonly mounts.
784                          */
785                         getref(b, MKREF(release));
786                         lafs_refile(b, 0);
787                         if (test_bit(B_Pinned, &b->flags)) {
788                                 /* didn't fix the pincnt !! */
789                                 printk("This was pinned: %s\n", strblk(b));
790                                 lafs_print_tree(b, 1);
791                                 BUG();
792                         }
793                         putref(b, MKREF(release));
794                         goto retry;
795                 }
796         }
797         BUG_ON(!list_empty(&fs->clean_leafs));
798
799         flush_scheduled_work();
800         lafs_stop_thread(fs);
801
802         for (i = 0; i < 4; i++)
803                 if (fs->cleaner.seg[i].chead)
804                         put_page(fs->cleaner.seg[i].chead);
805
806         kfree(fs->state);
807         kfree(fs->ss);
808         kfree(fs->devs);
809         lafs_segtrack_free(fs->segtrack);
810         kfree(fs->scan.free_usages);
811         kfree(fs->prime_sb->s_fs_info);
812         kfree(fs);
813 }
814
815 static void
816 lafs_put_super(struct super_block *sb)
817 {
818         struct fs *fs = fs_from_sb(sb);
819         int ss;
820         struct lafs_inode *li;
821
822         lafs_checkpoint_lock(fs);
823         lafs_checkpoint_start(fs);
824         if (sb == fs->prime_sb)
825                 /* Don't incorporate any more segusage/quota updates. */
826                 set_bit(FinalCheckpoint, &fs->fsstate);
827         lafs_checkpoint_unlock_wait(fs);
828         lafs_cluster_wait_all(fs);
829
830         if (sb == fs->prime_sb) {
831                 int d;
832                 /* This is the main sb, not a snapshot or
833                  * subordinate fs.
834                  * Now that all inodes have been invalidated we can do
835                  * the final checkpoint.
836                  */
837                 lafs_close_all_segments(fs);
838                 lafs_empty_segment_table(fs);
839                 lafs_seg_put_all(fs);
840
841                 iput(fs->orphans);
842                 fs->orphans = NULL;
843                 for (d=0; d < fs->devices; d++)
844                         if (fs->devs[d].segsum) {
845                                 iput(fs->devs[d].segsum);
846                                 fs->devs[d].segsum = NULL;
847                         }
848         }
849
850         /* need to break a circular reference... */
851         for (ss = 0; ss < fs->maxsnapshot; ss++)
852                 if (fs->ss[ss].root &&
853                     fs->ss[ss].root->i_sb == sb) {
854                         dprintk("Putting ss %d\n", ss);
855                         li = LAFSI(fs->ss[ss].root);
856                         if (test_bit(B_Realloc, &li->dblock->b.flags))
857                                 lafs_dump_tree();
858                         iput(fs->ss[ss].root);
859                         fs->ss[ss].root = NULL;
860                         break;
861                 }
862 }
863
864 static int
865 lafs_get_devs(struct fs *fs, struct options *op, int flags)
866 {
867         int err;
868         int i;
869
870         for (i = 0; i < op->devcnt; i++) {
871                 struct block_device *bdev;
872                 op->curr_dev = i;
873                 
874                 bdev = open_bdev_exclusive(op->devlist[i].dev,
875                                            FMODE_READ|FMODE_WRITE, fs);
876                 err = PTR_ERR(bdev);
877                 if (IS_ERR(bdev))
878                         goto out;
879                 err = lafs_load_super(bdev, op, flags & MS_SILENT ? 1 : 0);
880                 if (err < 0)
881                         goto out;
882                 op->devlist[i].bdev = bdev;
883         }
884         return 0;
885
886 out:
887         return err;
888 }
889
890 static int
891 lafs_get_sb(struct file_system_type *fs_type,
892             int flags, const char *dev_name, void *data,
893             struct vfsmount *mnt)
894 {
895         /* as we may have multiple devices, some in 'data', we cannot just
896          * use get_sb_bdev; we need to roll our own.
897          * We call get_sb_bdev on *each* bdev, and make sure the returned
898          * superblocks are either all new, or all for the same filesystem.
899          * If the latter, we return the primary.
900          * If the former, we init the filesystem copying static data
901          * to all supers.
902          * First we 'open_bdev_exclusive' each device, exclusive to lafs
903          * Then we 'sget' a superblock that knows any/all the devices.
904          * This may be pre-existing, or may be new
905          * If new, it will be created knowing all devices.
906          * If pre-existing, and don't have correct device list, error
907          */
908         struct options op = {0};    /* zeroed so cleanup at 'out' is safe on early failure */
909         int err;
910         int newest;
911         struct fs *fs = kzalloc(sizeof(*fs), GFP_KERNEL);
912         char *cdata = data;
913         if (cdata == NULL)
914                 cdata = "";
915
916         err = -ENOMEM;
917         if (!fs)
918                 goto out;
919         err = parse_opts(&op, dev_name, cdata);
920         if (err)
921                 goto out;
922
923         /* We now have a list of device names.  We call open_bdev_exclusive
924          * on each to collect some superblocks.
925          */
926         err = lafs_get_devs(fs, &op, flags);
927         if (err)
928                 goto out;
929
930         /* Each device has a valid dev and state block.  Hopefully they
931          * are all for the same filesystem.  If they don't have the
932          * same uuid, we will bail out here.  We also check that we have
933          * enough, and that they don't overlap.
934          * While we are looking at state blocks, pick the newest.
935          */
936         newest = check_devs(&op);
937         if (newest < 0) {
938                 err = newest;
939                 goto out;
940         }
941
942         /* So they seem to be the same - better create our
943          * 'fs' structure and fill it in
944          */
945         err = lafs_load(fs, &op, newest);
946         if (err)
947                 goto out;
948
949         /* Well, all the devices check out.  Now we need to find the
950          * filesystem */
951         err = lafs_mount(fs);
952         if (err == 0)
953                 err = lafs_start_thread(fs);
954         if (err)
955                 deactivate_locked_super(fs->prime_sb);
956         else {
957                 fs->prime_sb->s_flags |= MS_ACTIVE;
958                 simple_set_mnt(mnt, fs->prime_sb);
959         }
960         /* And there you have it.  Filesystem all mounted, root dir found,
961          * metadata files initialised, all pigs fed, and ready to fly!!!
962          */
963
964 out:
965         /* Now we clean up 'options'.  Anything that is wanted has
966          * been moved into 'fs', so we just discard anything we find
967          */
968         if (op.devlist) {
969                 int i;
970                 for (i = 0; i < op.devcnt; i++) {
971                         kfree(op.devlist[i].devblock);
972                         kfree(op.devlist[i].stateblock);
973                         if (op.devlist[i].bdev)
974                                 close_bdev_exclusive(op.devlist[i].bdev,
975                                                      FMODE_READ|FMODE_WRITE);
976                 }
977                 kfree(op.devlist);
978         }
979         return err;
980 }
981
982 static int test_subset(struct super_block *sb, void *data)
983 {
984         struct sb_key *ptn = data;
985         struct sb_key *k = sb->s_fs_info;
986
987         return ptn->fs == k->fs && ptn->root == k->root;
988 }
989
990 static int set_subset(struct super_block *sb, void *data)
991 {
992         sb->s_fs_info = data;
993         set_anon_super(sb, NULL);
994         return 0;
995 }
996
997 static struct file_system_type lafs_subset_fs_type;
998 struct super_block *lafs_get_subset_sb(struct inode *ino)
999 {
1000         /* ino must be a TypeInodeFile inode in the prime filesystem. */
1001         struct fs *fs = fs_from_inode(ino);
1002         struct super_block *sb;
1003         struct sb_key *k = kmalloc(sizeof(*k), GFP_KERNEL);
1004
1005         if (!k)
1006                 return ERR_PTR(-ENOMEM);
1007
1008         k->fs = fs;
1009         k->root = ino;
1010         sb = sget(&lafs_subset_fs_type, test_subset, set_subset, k);
1011         if (IS_ERR(sb)) {
1012                 kfree(k);
1013         } else if (sb->s_root) {
1014                 /* already allocated */
1015                 kfree(k);
1016         } else {
1017                 struct inode *rootdir, *imapfile;
1018                 int err = 0;
1019
1020                 igrab(ino);
1021                 sb->s_blocksize = fs->blocksize;
1022                 sb->s_blocksize_bits = fs->blocksize_bits;
1023                 sb->s_bdi = fs->prime_sb->s_bdi;
1024                 sb->s_op = &lafs_sops;
1025                 sb->s_export_op = &lafs_export_ops;
1026                 sb->s_time_gran = 2;
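                /* Fixed inode numbers within a subset: 1 is the inode
                 * map, 2 the root directory, 3 the (optional) access-time
                 * file; missing root/imap inodes are created below.
                 */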
1027                 rootdir = lafs_iget(sb, 2, SYNC);
1028                 if (IS_ERR(rootdir) && PTR_ERR(rootdir) == -ENOENT) {
1029                         rootdir = lafs_new_inode(fs, sb, NULL,
1030                                                  TypeDir, 2, 0755, NULL);
1031                         /* FIXME could the inode get written before we set
1032                          * the link count ??*/
1033                         rootdir->i_nlink = 2;
1034                 }
1035                 if (IS_ERR(rootdir))
1036                         err = PTR_ERR(rootdir);
1037                 else {
1038                         sb->s_root = d_alloc_root(rootdir);
1039                         imapfile = lafs_iget(sb, 1, SYNC);
1040                         if (IS_ERR(imapfile) && PTR_ERR(imapfile) == -ENOENT)
1041                                 imapfile = lafs_new_inode(fs, sb, NULL,
1042                                                           TypeInodeMap, 1, 0, NULL);
1043
1044                         if (IS_ERR(imapfile))
1045                                 err = PTR_ERR(imapfile);
1046                         else
1047                                 iput(imapfile);
1048                 }
1049
1050                 if (!err) {
1051                         struct inode *atime = lafs_iget(sb, 3, SYNC);
1052                         if (!IS_ERR(atime)) {
1053                                 if (LAFSI(atime)->type != TypeAccessTime) {
1054                                         iput(atime);
1055                                         err = -EINVAL;
1056                                 } else
1057                                         LAFSI(ino)->md.fs.accesstime = atime;
1058                         } else if (PTR_ERR(atime) != -ENOENT)
1059                                 err = PTR_ERR(atime);
1060                 }
1061
1062                 if (!err) {
1063                         sb->s_op = fs->prime_sb->s_op;
1064                         sb->s_flags |= MS_ACTIVE;
1065                         atomic_inc(&fs->prime_sb->s_active);
1066                         igrab(ino);
1067                 } else {
1068                         deactivate_locked_super(sb);
1069                         sb = ERR_PTR(err);
1070                 }
1071         }
1072         return sb;
1073 }
1074
1075 static int
1076 lafs_get_subset(struct file_system_type *fs_type,
1077                 int flags, const char *dev_name, void *data,
1078                 struct vfsmount *mnt)
1079 {
1080         /* mount, possibly creating, a sub-fileset.
1081          * dev_name must be an absolute path that leads
1082          * to an object in a lafs file-system (or snapshot).
1083          * The object must be either an InodeFile or
1084          * an empty directory in the main file-system
1085          * with mode 0 (though that rule might change).
1086          * In the latter case we change the object to an
1087          * InodeFile.
1088          * FIXME must require readonly for snapshots, and readwrite
1089          * to create.
1090          */
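        /* Illustrative usage (the paths are assumptions):
         *    mount -t lafs_subset /mnt/lafs/projects /mnt/projects
         * where /mnt/lafs/projects is an empty, mode-0 directory (or an
         * existing InodeFile) inside an already-mounted lafs filesystem.
         */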
1091
1092         struct nameidata nd;
1093         int err;
1094         struct super_block *sb;
1095         struct inode *ino;
1096         struct fs *fs;
1097
1098         err = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
1099         if (err)
1100                 goto out_noput;
1101         sb = nd.path.dentry->d_sb;
1102         err = -EINVAL;
1103         if (sb->s_type != &lafs_fs_type &&
1104             sb->s_type != &lafs_snap_fs_type)
1105                 goto out;
1106         ino = nd.path.dentry->d_inode;
1107         if (LAFSI(ino)->type != TypeInodeFile &&
1108             LAFSI(ino)->type != TypeDir)
1109                 goto out;
1110         fs = fs_from_sb(sb);
1111         mutex_lock(&ino->i_mutex);
1112         if (LAFSI(ino)->type == TypeDir) {
1113                 struct datablock *inodb;
1114                 /* maybe convert this to TypeInodeFile */
1115                 if (sb->s_type != &lafs_fs_type)
1116                         goto out_unlock;
1117                 if (ino->i_size)
1118                         /* FIXME maybe I should run orphans */
1119                         goto out_unlock;
1120                 if ((ino->i_mode & 07777) != 0)
1121                         goto out_unlock;
1122                 inodb = lafs_inode_dblock(ino, SYNC, MKREF(make_subset));
1123                 err = PTR_ERR(inodb);
1124                 if (IS_ERR(inodb))
1125                         goto out_unlock;
1126                 lafs_iolock_block(&inodb->b);
1127                 set_bit(B_PinPending, &inodb->b.flags);
1128                 lafs_iounlock_block(&inodb->b);
1129                 lafs_checkpoint_lock(fs);
1130                 err = lafs_pin_dblock(inodb, ReleaseSpace);
1131                 if (!err) {
1132                         struct fs_md *md;
1133                         /* OK, we are good to go making this filesystem */
1134                         LAFSI(ino)->type = TypeInodeFile;
1135                         LAFSI(ino)->metadata_size = (sizeof(struct la_inode) +
1136                                                      sizeof(struct fs_metadata));
1137                         ino->i_op = &lafs_subset_ino_operations;
1138                         ino->i_fop = &lafs_subset_file_operations;
1139                         /* FIXME we lose md->parent here - what to do?? */
1140                         md = &LAFSI(ino)->md.fs;
1141                         md->usagetable = 0;
1142                         ino->i_mtime = current_fs_time(sb);
1143                         md->cblocks_used = 0;
1144                         md->pblocks_used = 0;
1145                         md->ablocks_used = 0;
1146                         md->blocks_allowed = 10000; /* FIXME */
1147                         md->blocks_unalloc = 0;
1148                         /* FIXME should I be using inode_init here */
1149                         md->creation_age = fs->wc[0].cluster_seq;
1150                         md->inodes_used = 0;
1151                         md->quota_inums[0] = 0;
1152                         md->quota_inums[1] = 0;
1153                         md->quota_inums[2] = 0;
1154                         md->quota_inodes[0] = NULL;
1155                         md->quota_inodes[1] = NULL;
1156                         md->quota_inodes[2] = NULL;
1157                         md->accesstime = NULL;
1158                         md->name = NULL;
1159                         lafs_dirty_dblock(inodb);
1160                         lafs_dirty_inode(ino);
1161                         /* We use a checkpoint to commit this change,
1162                          * it is too unusual to bother logging
1163                          */
1164                         lafs_checkpoint_start(fs);
1165                         lafs_checkpoint_unlock_wait(fs);
1166                 } else {
1167                         lafs_checkpoint_unlock(fs);
1168                 }
1169                 putdref(inodb, MKREF(make_subset));
1170                 if (err)
1171                         goto out_unlock;
1172         }
1173         err = 0;
1174         /* We have a TypeInodeFile so we can make a superblock */
1175         sb = lafs_get_subset_sb(ino);
1176         iput(ino);
1177
1178         if (IS_ERR(sb))
1179                 err = PTR_ERR(sb);
1180         else
1181                 simple_set_mnt(mnt, sb);
1182 out_unlock:
1183         mutex_unlock(&ino->i_mutex);
1184 out:
1185         path_put(&nd.path);
1186 out_noput:
1187         return err;
1188 }
1189
1190 static void lafs_kill_subset(struct super_block *sb)
1191 {
1192         struct sb_key *k = sb->s_fs_info;
1193         if (LAFSI(k->root)->md.fs.accesstime) {
1194                 iput(LAFSI(k->root)->md.fs.accesstime);
1195                 LAFSI(k->root)->md.fs.accesstime = NULL;
1196         }
1197         kill_anon_super(sb);
1198         iput(k->root);
1199         deactivate_super(k->fs->prime_sb);
1200         kfree(k);
1201 }
1202
1203 const struct file_operations lafs_subset_file_operations = {
1204 };
1205
1206 const struct inode_operations lafs_subset_ino_operations = {
1207 };
1208
1209
1210 struct file_system_type lafs_fs_type = {
1211         .owner          = THIS_MODULE,
1212         .name           = "lafs",
1213         .get_sb         = lafs_get_sb,
1214         .kill_sb        = lafs_kill_sb,
1215         .fs_flags       = FS_REQUIRES_DEV,
1216 };
1217
1218 static struct file_system_type lafs_subset_fs_type = {
1219         .owner          = THIS_MODULE,
1220         .name           = "lafs_subset",
1221         .get_sb         = lafs_get_subset,
1222         .kill_sb        = lafs_kill_subset,
1223 };
1224
1225 static int __init lafs_init(void)
1226 {
1227         int err;
1228
1229         BUILD_BUG_ON(B_NUM_FLAGS > 32);
1230
1231         err = lafs_ihash_init();
1232         err = err ?: register_filesystem(&lafs_fs_type);
1233         err = err ?: register_filesystem(&lafs_snap_fs_type);
1234         err = err ?: register_filesystem(&lafs_subset_fs_type);
1235         if (err)
1236                 goto out;
1237         return 0;
1238
1239 out:
1240         unregister_filesystem(&lafs_fs_type);
1241         unregister_filesystem(&lafs_snap_fs_type);
1242         unregister_filesystem(&lafs_subset_fs_type);
1243         lafs_ihash_free();
1244         return err;
1245 }
1246
1247 static void __exit lafs_exit(void)
1248 {
1249         unregister_filesystem(&lafs_fs_type);
1250         unregister_filesystem(&lafs_snap_fs_type);
1251         unregister_filesystem(&lafs_subset_fs_type);
1252         lafs_ihash_free();
1253 }
1254
1255 static struct inode *lafs_nfs_get_inode(struct super_block *sb,
1256                                         u64 ino, u32 generation)
1257 {
1258         struct inode *inode;
1259
1260         inode = lafs_iget(sb, ino, SYNC);
1261         if (IS_ERR(inode))
1262                 return ERR_CAST(inode);
1263         if (generation && inode->i_generation != generation) {
1264                 iput(inode);
1265                 return ERR_PTR(-ESTALE);
1266         }
1267
1268         return inode;
1269 }
1270
1271 static struct dentry *lafs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1272                                         int fh_len, int fh_type)
1273 {
1274         return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1275                                     lafs_nfs_get_inode);
1276 }
1277
1278 static struct dentry *lafs_fh_to_parent(struct super_block *sb, struct fid *fid,
1279                                         int fh_len, int fh_type)
1280 {
1281         return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1282                                     lafs_nfs_get_inode);
1283 }
1284
1285 static struct dentry *lafs_get_parent(struct dentry *child)
1286 {
1287         ino_t inum = LAFSI(child->d_inode)->md.file.parent;
1288         struct inode *inode = lafs_iget(child->d_inode->i_sb, inum, SYNC);
1289         if (IS_ERR(inode))
1290                 return ERR_CAST(inode);
1291         return d_obtain_alias(inode);
1292 }
1293
1294 static const struct export_operations lafs_export_ops = {
1295         .fh_to_dentry = lafs_fh_to_dentry,
1296         .fh_to_parent = lafs_fh_to_parent,
1297         .get_parent = lafs_get_parent,
1298 };
1299
1300 static struct inode *lafs_alloc_inode(struct super_block *sb)
1301 {
1302         struct lafs_inode *li;
1303         li = kmalloc(sizeof(*li), GFP_NOFS);
1304         if (!li)
1305                 return NULL;
1306         inode_init_once(&li->vfs_inode);
1307         li->vfs_inode.i_data.backing_dev_info = sb->s_bdi;
1308         li->iblock = NULL;
1309         li->dblock = NULL;
1310         li->update_cluster = 0;
1311         li->md.fs.name = NULL;
1312
1313         init_rwsem(&li->ind_sem);
1314         INIT_LIST_HEAD(&li->free_index);
1315
1316         return &li->vfs_inode;
1317 }
1318
1319 static void kfree_inode(struct rcu_head *head)
1320 {
1321         struct lafs_inode *lai = container_of(head, struct lafs_inode,
1322                                               md.rcu);
1323         if (lai->type == TypeInodeFile)
1324                 kfree(lai->md.fs.name);
1325         kfree(lai);
1326 }
1327
1328 void lafs_destroy_inode(struct inode *inode)
1329 {
1330         struct datablock *db;
1331
1332         BUG_ON(!list_empty(&inode->i_sb_list));
1333         /* Cannot test i_list as dispose_list just does list_del */
1334         db = lafs_inode_get_dblock(inode, MKREF(destroy));
1335
1336         if (db) {
1337                 set_bit(I_Destroyed, &LAFSI(inode)->iflags);
1338                 putdref(db, MKREF(destroy));
1339         } else {
1340                 spin_lock(&inode->i_data.private_lock);
1341                 if (LAFSI(inode)->iblock)
1342                         LAFS_BUG(atomic_read(&LAFSI(inode)->iblock->b.refcnt),
1343                                  &LAFSI(inode)->iblock->b);
1344                 /* FIXME could there be Async blocks that keep a refcount?
1345                  * We should free them.
1346                  */
1347                 spin_unlock(&inode->i_data.private_lock);
1348                 lafs_release_index(&LAFSI(inode)->free_index);
1349                 call_rcu(&LAFSI(inode)->md.rcu,
1350                          kfree_inode);
1351         }
1352 }
1353
1354 static int lafs_sync_fs(struct super_block *sb, int wait)
1355 {
1356         if (!wait)
1357                 /* We only reach here if s_dirt was set, so it
1358                  * is reasonable to force a checkpoint.
1359                  */
1360                 lafs_checkpoint_start(fs_from_sb(sb));
1361         else
1362                 lafs_checkpoint_wait(fs_from_sb(sb));
1363         return 0;
1364 }
1365
1366 static int lafs_statfs(struct dentry *de, struct kstatfs *buf)
1367 {
1368         int i;
1369         u32 fsid;
1370         u32 *fsuuid;
1371         struct fs *fs = fs_from_inode(de->d_inode);
1372         struct lafs_inode *fsroot = LAFSI(ino_from_sb(de->d_inode->i_sb));
1373         struct lafs_inode *laroot = LAFSI(fs->ss[0].root);
1374
1375         fsid = 0;
1376         fsuuid = (u32 *)fs->state->uuid;
1377         for (i = 0; i < 16 / 4 ; i++) {
1378                 fsid ^= le32_to_cpu(fsuuid[i]);
1379                 buf->f_fsid.val[i/2] = fsid;
1380         }
1381         buf->f_fsid.val[1] ^= fsroot->vfs_inode.i_ino;
1382         buf->f_type = 0x4C614654; /* "LaFS" */
1383         buf->f_bsize = fs->blocksize;
1384         buf->f_blocks = fsroot->md.fs.blocks_allowed;
1385         if (buf->f_blocks == 0) {
1386                 /* should subtract usage of all other filesystems...*/
1387                 for (i = 0; i < fs->devs_loaded; i++)
1388                         buf->f_blocks += fs->devs[i].size;
1389         }
1390
1391         buf->f_files = 0;
1392         buf->f_ffree = 0;
1393         buf->f_namelen = 255;
1394         buf->f_frsize = 0;
1395
1396         spin_lock(&laroot->vfs_inode.i_lock);
1397         /* "bavail" is "blocks we could succeed in adding to the filesystem".
1398          * "bfree" is effectively total blocks - used blocks
1399          */
1400         buf->f_bavail = fs->free_blocks + fs->clean_reserved - fs->allocated_blocks;
1401         spin_unlock(&laroot->vfs_inode.i_lock);
1402         spin_lock(&fsroot->vfs_inode.i_lock);
1403         buf->f_bfree = buf->f_blocks - (fsroot->md.fs.cblocks_used +
1404                                         fsroot->md.fs.pblocks_used +
1405                                         fsroot->md.fs.ablocks_used);
1406         dprintk("df: tot=%ld free=%ld avail=%ld(%ld-%ld-%ld) cb=%ld pb=%ld ab=%ld\n",
1407                 (long)buf->f_blocks, (long)buf->f_bfree, (long)buf->f_bavail,
1408                 (long)fs->free_blocks, (long)fs->clean_reserved,
1409                 (long)fs->allocated_blocks,
1410                 (long)fsroot->md.fs.cblocks_used, (long)fsroot->md.fs.pblocks_used,
1411                 (long)fsroot->md.fs.ablocks_used);
1412         spin_unlock(&fsroot->vfs_inode.i_lock);
1413         return 0;
1414 }
1415
1416 /* FIXME we hold inode_lock while calling drop_inode, so
1417  * extra locking isn't really welcome....???
1418  */
1419 static void lafs_drop_inode(struct inode *inode)
1420 {
1421         struct fs *fs = fs_from_inode(inode);
1422         struct datablock *db;
1423
1424         /* This lock that we now hold on the inode could prevent
1425          * the cleaner from getting the inode.  So after
1426          * the complete the drop we might need to wake the cleaner.
1427          */
1428
1429         db = lafs_inode_get_dblock(inode, MKREF(drop));
1430
1431         generic_drop_inode(inode);
1432         if (db && test_bit(B_Async, &db->b.flags))
1433                 lafs_wake_thread(fs);
1434         if (db)
1435                 putdref(db, MKREF(drop));
1436 }
1437
1438 static struct super_operations lafs_sops = {
1439         .alloc_inode    = lafs_alloc_inode,
1440         .destroy_inode  = lafs_destroy_inode,  /* Inverse of 'alloc_inode' */
1441         /* Don't use read_inode */
1442         .dirty_inode    = lafs_dirty_inode,
1443         /* .write_inode not needed */
1444         /* put_inode ?? */
1445         .drop_inode     = lafs_drop_inode,
1446         /* drop_inode ?? */                     /* default will call delete or forget
1447                                                  * where 'forget' flushes and clears
1448                                                  */
1449
1450         .clear_inode    = lafs_clear_inode,    /* forget internal state of this inode */
1451         .delete_inode   = lafs_delete_inode,   /* remove this inode from filesystem */
1452         .put_super      = lafs_put_super,
1453         .sync_fs        = lafs_sync_fs,
1454         /* write_super_lockfs ?? */
1455         /* unlockfs ?? */
1456         .statfs         = lafs_statfs,
1457         /* remount_fs ?? */
1458 };
1459
1460 MODULE_AUTHOR("Neil Brown");
1461 MODULE_DESCRIPTION("LaFS - Log Structured File System");
1462 MODULE_LICENSE("GPL");
1463 module_init(lafs_init);
1464 module_exit(lafs_exit);
1465 int lafs_trace = 1;
1466 module_param(lafs_trace, int, 0644);
1467
1468 #ifdef DUMP
1469 struct fs *dfs;
1470 static int do_dump(const char *val, struct kernel_param *kp)
1471 {
1472         extern void lafs_dump_orphans(void);
1473         extern void lafs_dump_tree(void);
1474         extern void lafs_dump_cleanable(void);
1475         extern void lafs_dump_usage(void);
1476
1477         printk("Want dump of %s\n", val);
1478         if (strncmp(val, "orphan", 6) == 0)
1479                 lafs_dump_orphans();
1480         if (strncmp(val, "tree", 4) == 0)
1481                 lafs_dump_tree();
1482         if (strncmp(val, "cleanable", 9) == 0)
1483                 lafs_dump_cleanable();
1484         if (strncmp(val, "usage", 5) == 0)
1485                 lafs_dump_usage();
1486         return 0;
1487 }
1488
1489 static int get_dump(char *buffer, struct kernel_param *kp)
1490 {
1491         strcpy(buffer, "orphans,tree,cleanable,usage");
1492         return strlen(buffer);
1493 }
1494
1495 int arg;
1496 module_param_call(dump, do_dump, get_dump, &arg, 0775);
1497 #endif