1
2 /*
3  * fs/lafs/inode.c
4  * Copyright (C) 2005-2009
5  * Neil Brown <neilb@suse.de>
6  * Released under the GPL, version 2
7  *
8  * generic inode handling
9  *
10  */
11
12 #include        "lafs.h"
13 #include <linux/random.h>
14 #include <linux/delay.h>
15 #include <linux/slab.h>
16
17 static void check_atime_ref(struct inode *ino, int async);
18
19 /* Supporting an async 'iget' - as required by the cleaner -
20  * is slightly non-trivial.
21  * iget*_locked will normally wait for any inode with one
22  * of the flags I_FREEING I_CLEAR I_WILL_FREE I_NEW
23  * to either be unhashed or have the flag cleared.
24  * We cannot afford that wait in the cleaner as we could deadlock.
25  * So we use iget5_locked and provide a test function that fails
26  * if it finds the inode with any of those flags set.
27  * If it does see such an inode, it sets a flag in the 'ikey'
28  * that is passed in by reference, both so that later tests keep
29  * failing (for consistency) and so that the 'set' function
30  * we provide knows to fail the 'set'.
31  * The result of this is that if iget finds an inode it would
32  * have to wait on, a flag is set and NULL is returned.
33  * An unfortunate side effect is that an inode will be allocated
34  * and then destroyed to no avail.
35  * This is avoided by calling ilookup5 first.  This also allows
36  * us to only allocate/load the data block if there really seems
37  * to be a need.
38  */
39 struct ikey {
40         ino_t inum;
41         struct inode *fsys;
42         bool was_busy;
43 };
44
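/* Test function for ilookup5/iget5_locked: an inode matches only if
 * both the inode number and the containing filesystem match the key.
 */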
45 static int sync_itest(struct inode *inode, void *data)
46 {
47         struct ikey *ik = data;
48
49         if (inode->i_ino != ik->inum ||
50             LAFSI(inode)->filesys != ik->fsys)
51                 return 0;
52         return 1;
53 }
54
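/* As above, but refuse to match an inode that is being freed or is
 * still I_NEW, and remember in ik->was_busy that we saw one so that
 * the whole lookup fails consistently.
 */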
55 static int async_itest(struct inode *inode, void *data)
56 {
57         struct ikey *ik = data;
58
59         if (ik->was_busy)
60                 /* found and is freeing */
61                 return 0;
62         if (!sync_itest(inode, data))
63                 return 0;
64         if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) {
65                 ik->was_busy = true;
66                 return 0;
67         }
68         return 1;
69 }
70
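/* 'set' function for iget5_locked: initialise a freshly allocated
 * inode from the key.  If the lookup already saw a busy inode we
 * return -EBUSY so the new inode is discarded rather than hashed.
 */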
71 static int iset(struct inode *inode, void *data)
72 {
73         struct ikey *ik = data;
74         if (ik->was_busy)
75                 return -EBUSY;
76         inode->i_ino = ik->inum;
77         LAFSI(inode)->filesys = ik->fsys;
78         return 0;
79 }
80
81 struct inode *
82 lafs_iget(struct inode *fsys, ino_t inum, int async)
83 {
84         /* find, and load if needed, this inum */
85         struct inode *ino = NULL;
86         struct inode *oldino;
87         struct datablock *b = NULL;
88         struct ikey ik = { .inum = inum, .fsys = fsys, };
89         int err = 0;
90         struct super_block *sb = fsys->i_sb;
91
92         if (async) {
93                 /* We cannot afford to block on 'freeing_inode'
94                  * So use iget5_locked and refuse to match such
95                  * inodes.
96                  * If the inode is 'freeing', was_busy gets set and we fail with -EAGAIN.
97                  * ilookup5 is used first to avoid an unnecessary
98                  * alloc/free if the inode is locked in some way.
99                  */
100                 while (!ino) {
101
102                         err = 0;
103                         ino = ilookup5(sb, inum, async_itest, &ik);
104                         if (ino)
105                                 break;
106
107                         if (ik.was_busy)
108                                 err = -EAGAIN;
109
110                         /* For async we will always want the dblock loaded,
111                          * and we need to load it first as we cannot afford
112                          * to fail -EAGAIN once we have an I_NEW inode.
113                          */
114                         if (!b)
115                                 b = lafs_get_block(fsys, inum, NULL,
116                                                    GFP_NOFS, MKREF(iget));
117                         if (!b)
118                                 return ERR_PTR(-ENOMEM);
119
120                         if (!err)
121                                 err = lafs_read_block_async(b);
122
123                         if (!err) {
124                                 /* Have the block, so safe to iget */
125                                 ino = iget5_locked(sb, inum,
126                                                    async_itest, iset,
127                                                    &ik);
128                                 if (!ino) {
129                                         if (ik.was_busy)
130                                                 err = -EAGAIN;
131                                         else
132                                                 err = -ENOMEM;
133                                 }
134                         }
135                         if (err) {
136                                 if (test_and_set_bit(B_Async, &b->b.flags)) {
137                                         putdref(b, MKREF(iget));
138                                         return ERR_PTR(err);
139                                 }
140                                 getdref(b, MKREF(async));
141                         }
142                 }
143         } else
144                 ino = iget5_locked(sb, inum, sync_itest, iset, &ik);
145
146         if (!ino) {
147                 putdref(b, MKREF(iget));
148                 return ERR_PTR(-ENOMEM);
149         }
150
151         if (!(ino->i_state & I_NEW)) {
152                 putdref(b, MKREF(iget));
153                 if (ino->i_mode) {
154                         check_atime_ref(ino, async);
155                         return ino;
156                 }
157                 iput(ino);
158                 return ERR_PTR(-ENOENT);
159         }
160
161         igrab(LAFSI(ino)->filesys);
162
163         /* surprisingly the inode bdi does not default to the
164          * super_block's bdi...
165          */
166         ino->i_data.backing_dev_info = sb->s_bdi;
167         /* Need to load block 'inum' from an inode file...
168          */
169         if (!b) {
170                 b = lafs_get_block(fsys, inum, NULL, GFP_KERNEL, MKREF(iget));
171                 if (!b)
172                         err = -ENOMEM;
173                 else
174                         err = lafs_read_block(b);
175         }
176         if (err)
177                 goto err;
178
179         oldino = rcu_my_inode(b);
180         if (oldino) {
181                 /* The inode is new, but the block thinks it has an
182                  * old inode, so we must be in the process of destroying
183                  * the old one.
184                  * So fail the lookup without even looking at the content
185                  * of the block (Which might not be clear yet).
186                  */
187                 spin_lock(&oldino->i_data.private_lock);
188                 if (!test_bit(I_Deleting, &LAFSI(oldino)->iflags)) {
189                         b->my_inode = NULL;
190                         LAFSI(oldino)->dblock = NULL;
191                         LAFSI(oldino)->iblock = NULL;
192                 }
193                 spin_unlock(&oldino->i_data.private_lock);
194         }
195         rcu_iput(oldino);
196         if (b->my_inode) {
197                 err = -ENOENT;
198                 goto err;
199         }
200
201         err = lafs_import_inode(ino, b);
202         if (err) {
203                 if (err != -ENOENT)
204                         printk("lafs_import_inode failed %d\n", err);
205                 goto err;
206         }
207         check_atime_ref(ino, async);
208         unlock_new_inode(ino);
209 out:
210         if (b && test_and_clear_bit(B_Async, &b->b.flags)) {
211                 putdref(b, MKREF(async));
212                 lafs_wake_thread(fs_from_inode(fsys));
213         }
214         putdref(b, MKREF(iget));
215         return ino;
216 err:
217         ino->i_nlink = 0;
218         unlock_new_inode(ino);
219         iput(ino);
220         ino = ERR_PTR(err);
221         goto out;
222 }
223
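/* Look up an inode by (filesystem number, inode number).
 * fsnum == 0 means the prime filesystem rooted at fs->ss[0].root;
 * otherwise the root inode of the subset filesystem is loaded first
 * and the lookup continues inside it.  Lookups in the prime
 * filesystem take an extra s_active reference on the superblock,
 * which lafs_iput_fs() presumably drops again.
 */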
224 struct inode *
225 lafs_iget_fs(struct fs *fs, int fsnum, int inum, int async)
226 {
227         struct super_block *sb;
228         struct inode *rv;
229
230         sb = fs->prime_sb;
231
232         if (fsnum) {
233                 /* Need to locate or load the superblock for this
234                  * subordinate filesystem
235                  */
236                 struct inode *filesys;
237
238                 filesys = lafs_iget(fs->ss[0].root, fsnum, async);
239                 if (IS_ERR(filesys))
240                         return filesys;
241                 if (LAFSI(filesys)->type != TypeInodeFile) {
242                         iput(filesys);
243                         return ERR_PTR(-ENOENT);
244                 }
245                 rv = lafs_iget(filesys, inum, async);
246         } else if (inum) {
247                 rv = lafs_iget(fs->ss[0].root, inum, async);
248                 if (!IS_ERR(rv))
249                         atomic_inc(&sb->s_active);
250         } else {
251                 rv = igrab(fs->ss[0].root);
252                 atomic_inc(&sb->s_active);
253         }
254         return rv;
255 }
256
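/* Fill the in-core inode from the on-disk la_inode held in
 * datablock 'b'.  Returns 0 on success, -ENOENT if no inode exists
 * here (filetype of zero), or another -ve error (-EINVAL, -ENOMEM)
 * on failure.
 */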
257 int __must_check
258 lafs_import_inode(struct inode *ino, struct datablock *b)
259 {
260         struct la_inode *lai = map_dblock(b);
261         struct lafs_inode *li = LAFSI(ino);
262         int err = -ENOENT;
263
264         if (lai->filetype == 0) {
265                 li->type = 0;
266                 ino->i_mode = 0;
267                 ino->i_nlink = 0;
268                 goto out;
269         }
270
271         ino->i_mode = S_IFREG;
272         ino->i_nlink = 1; /* For special files, set nlink so they
273                            * never appear unlinked */
274
275         err = -EINVAL;
276
277         LAFS_BUG(ino->i_ino != b->b.fileaddr, &b->b);
278         li->cblocks = le32_to_cpu(lai->data_blocks);
279         li->pblocks = li->ablocks = 0;
280         li->vfs_inode.i_blocks = ((blkcnt_t)li->cblocks
281                                   << (ino->i_sb->s_blocksize_bits - 9));
282         li->ciblocks = le32_to_cpu(lai->index_blocks);
283         li->piblocks = 0;
284         li->iflags = 0;
285
286         ino->i_generation = le16_to_cpu(lai->generation);
287         li->trunc_gen = lai->trunc_gen;
288         li->flags = lai->flags;
289         li->type = lai->filetype;
290         li->metadata_size = le16_to_cpu(lai->metadata_size);
291         li->depth = lai->depth;
292
293         dprintk("inode %lu type is %d\n", (unsigned long)ino->i_ino, li->type);
294
295         ino->i_data.a_ops = &lafs_file_aops;
296         li->trunc_next = 0;
297
298         switch (li->type) {
299         case TypeInodeFile:
300         {
301                 struct fs_md *i = &li->md.fs;
302                 struct fs_metadata *l = &lai->metadata[0].fs;
303                 int nlen;
304
305                 i->usagetable = le16_to_cpu(l->snapshot_usage_table);
306                 decode_time(&ino->i_mtime, le64_to_cpu(l->update_time));
307                 i->cblocks_used = le64_to_cpu(l->blocks_used);
308                 i->pblocks_used = i->ablocks_used = 0;
309                 i->blocks_allowed = le64_to_cpu(l->blocks_allowed);
310                 i->blocks_unalloc = 0;
311                 i->creation_age = le64_to_cpu(l->creation_age);
312                 i->inodes_used = le32_to_cpu(l->inodes_used);
313                 i->parent = le32_to_cpu(l->parent);
314                 i->quota_inums[0] = le32_to_cpu(l->quota_inodes[0]);
315                 i->quota_inums[1] = le32_to_cpu(l->quota_inodes[1]);
316                 i->quota_inums[2] = le32_to_cpu(l->quota_inodes[2]);
317                 i->quota_inodes[0] = i->quota_inodes[1]
318                         = i->quota_inodes[2] = NULL;
319                 nlen = li->metadata_size - offsetof(struct la_inode,
320                                                     metadata[0].fs.name);
321                 i->accesstime = NULL;
322                 if (i->name)
323                         kfree(i->name);
324                 if (nlen == 0)
325                         i->name = NULL;
326                 else {
327                         /* Need to unmap the dblock before calling
328                          * kmalloc because the mapping makes us 'atomic'.
329                          */
330                         unmap_dblock(b, lai);
331                         i->name = kmalloc(nlen+1, GFP_KERNEL);
332                         lai = map_dblock(b);
333                         l = &lai->metadata[0].fs;
334
335                         err = -ENOMEM;
336                         if (!i->name)
337                                 goto out;
338                         memcpy(i->name, l->name, nlen);
339                         i->name[nlen] = 0;
340                 }
341                 /* Make this look like a directory */
342                 ino->i_mode = S_IFDIR;
343                 ino->i_uid = 0;
344                 ino->i_gid = 0;
345                 ino->i_size = 0;
346                 ino->i_op = &lafs_subset_ino_operations;
347                 ino->i_fop = &lafs_subset_file_operations;
348                 break;
349         }
350
351         case TypeInodeMap:
352         {
353                 struct inodemap_md *m = &li->md.inodemap;
354                 struct inodemap_metadata *s = &lai->metadata[0].inodemap;
355                 m->size = le32_to_cpu(s->size);
356                 m->thisblock = NoBlock;
357                 m->nextbit = 0;
358                 break;
359         }
360
361         case TypeSegmentMap:
362         {
363                 struct su_md *m = &li->md.segmentusage;
364                 struct su_metadata *s = &lai->metadata[0].segmentusage;
365                 m->table_size = le32_to_cpu(s->table_size);
366                 break;
367         }
368
369         case TypeQuota:
370         {
371                 struct quota_md *m = &li->md.quota;
372                 struct quota_metadata *s = &lai->metadata[0].quota;
373                 m->gracetime = le32_to_cpu(s->gracetime);
374                 m->graceunits = le32_to_cpu(s->graceunits);
375                 break;
376         }
377         case TypeOrphanList:
378         {
379                 struct orphan_md *m = &li->md.orphan;
380                 /* This will be set via lafs_count_orphans */
381                 m->nextfree = 0;
382                 m->reserved = 0;
383                 break;
384         }
385         case TypeAccessTime:
386                 break;
387
388         default: /* TypeBase or larger */
389         {
390                 struct file_md *i = &li->md.file;
391                 struct file_metadata *l = &lai->metadata[0].file;
392                 struct dir_metadata *d = &lai->metadata[0].dir;
393                 struct special_metadata *s = &lai->metadata[0].special;
394
395                 if (li->type < TypeBase)
396                         goto out;
397                 i->flags = le16_to_cpu(l->flags);
398                 ino->i_mode = le16_to_cpu(l->mode);
399                 ino->i_uid = le32_to_cpu(l->userid);
400                 ino->i_gid = le32_to_cpu(l->groupid);
401                 i->treeid = le32_to_cpu(l->treeid);
402                 i->creationtime = le64_to_cpu(l->creationtime);
403                 decode_time(&ino->i_mtime, le64_to_cpu(l->modifytime));
404                 decode_time(&ino->i_ctime, le64_to_cpu(l->ctime));
405                 decode_time(&i->i_accesstime, le64_to_cpu(l->accesstime));
406                 ino->i_atime = i->i_accesstime;
407                 i->atime_offset = 0; /* Will be filled-in later probably */
408                 lafs_add_atime_offset(&ino->i_atime, i->atime_offset);
409                 ino->i_size = le64_to_cpu(l->size);
410                 i->parent = le32_to_cpu(l->parent);
411                 ino->i_nlink = le32_to_cpu(l->linkcount);
412                 if (ino->i_nlink == 0 && list_empty(&b->orphans) &&
413                     fs_from_inode(ino)->rolled) {
414                         /* This block should already be on the orphan
415                          * list, otherwise there is a filesystem
416                          * inconsistency.
417                          * Either the orphan file is wrong, or the
418                          * linkcount is wrong.
419                          * It is safest to assume the latter - either
420                          * way an FS check would be needed to fix it.
421                          * Note: while roll-forward is happening, this
422                          * situation is perfectly possible and is handled
423                          * correctly.
424                          */
425                         /* FIXME set a superblock flag requesting
426                          * directory linkage checking
427                          */
428                         ino->i_nlink = 1;
429                 }
430
431                 dprintk("  mode = 0%o uid %d size %lld\n",
432                         ino->i_mode, ino->i_uid, ino->i_size);
433                 switch (li->type) {
434                 case TypeFile:
435                         ino->i_op = &lafs_file_ino_operations;
436                         ino->i_fop = &lafs_file_file_operations;
437                         ino->i_mode = (ino->i_mode & 07777)  | S_IFREG;
438                         break;
439                 case TypeDir:
440                         i->seed = le32_to_cpu(d->hash_seed);
441                         ino->i_op = &lafs_dir_ino_operations;
442                         ino->i_fop = &lafs_dir_file_operations;
443                         ino->i_mode = (ino->i_mode & 07777)  | S_IFDIR;
444                         {
445                                 u32 *b = (u32 *)lai;
446                                 dprintk("Hmm. %d %d %d\n",
447                                         (int)b[24],
448                                         (int)b[25],
449                                         (int)b[26]);
450                         }
451                         break;
452                 case TypeSymlink:
453                         ino->i_op = &lafs_link_ino_operations;
454                         ino->i_mode = (ino->i_mode & 07777)  | S_IFLNK;
455                         break;
456                 case TypeSpecial:
457                         /* the data had better be in the inode ... */
458                         ino->i_rdev = MKDEV(le32_to_cpu(s->major),
459                                             le32_to_cpu(s->minor));
460                         ino->i_op = &lafs_special_ino_operations;
461                         init_special_inode(ino, ino->i_mode, ino->i_rdev);
462                         break;
463                 }
464                 break;
465         }
466         }
467
468         ino->i_blkbits = ino->i_sb->s_blocksize_bits;
469         /* FIXME i_blocks and i_byte - used for quota?? */
470         err = 0;
471
472         /* Note: no refcount yet.  Either will remove the reference to the
473          * other when freed
474          */
475         li->dblock = b;
476         rcu_assign_pointer(b->my_inode, ino);
477
478 out:
479         if (err && li->type)
480                 printk("inode %lu type is %d\n",
481                        (unsigned long)ino->i_ino, li->type);
482         unmap_dblock(b, lai);
483         return err;
484 }
485
486 static void check_atime_ref(struct inode *ino, int async)
487 {
488         /* If there is an atime file in this filesystem the inode
489          * should hold a reference to the relevant block in
490          * that file.
491          */
492         struct inode *root, *at;
493         u32 bnum;
494         struct datablock *b;
495         if (async)
496                 /* Never bother for async lookups */
497                 return;
498         if (LAFSI(ino)->type < TypeBase)
499                 return;
500         if (test_bit(I_AccessTime, &LAFSI(ino)->iflags))
501                 return;
502         root = LAFSI(ino)->filesys;
503         at = LAFSI(root)->md.fs.accesstime;
504         if (at == NULL)
505                 return;
506
507         if (LAFSI(ino)->md.file.atime_offset)
508                 LAFSI(ino)->md.file.atime_offset = 0;
509
510         /* "* 2" to get byte number, then shift to get block
511          * number.  So just shift by 1 less than blkbits.
512          */
513         bnum = ino->i_ino >> (at->i_blkbits-1);
514         b = lafs_get_block(at, bnum, NULL, GFP_NOFS, MKREF(atime));
515         if (b) {
516                 if (lafs_read_block(b) == 0) {
517                         u16 *atp;
518                         int i;
519                         atp = map_dblock(b);
520                         i = ino->i_ino & ((1 << (at->i_blkbits-1)) - 1); /* u16 index in block */
521                         LAFSI(ino)->md.file.atime_offset = le16_to_cpu(atp[i]);
522                         set_bit(I_AccessTime, &LAFSI(ino)->iflags);
523                         unmap_dblock(b, atp);
524                         lafs_add_atime_offset(&ino->i_atime,
525                                               LAFSI(ino)->md.file.atime_offset);
526                 } else
527                         putdref(b, MKREF(atime));
528         }
529 }
530
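/* The atime delta stored in the accesstime file is a 16-bit value:
 * the low 5 bits are an exponent and the high 11 bits a mantissa,
 * with an implied twelfth mantissa bit whenever the exponent is
 * non-zero.  Exponents 0-10 scale milliseconds, 11 and above scale
 * seconds.  For example a delta of 3000ms is stored as exponent 1
 * with mantissa 0x3b8; the implied bit restores 0xbb8 == 3000.
 */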
531 void lafs_add_atime_offset(struct timespec *atime, int offset)
532 {
533         int expon;
534         time_t mantissa;
535         if (offset == 0)
536                 return;
537
538         expon = offset & 0x1f;
539         if (expon)
540                 mantissa = (offset >> 5) | 0x800;
541         else
542                 mantissa = (offset >> 5);
543         if (expon >= 11) {
544                 /* seconds */
545                 mantissa <<= expon-11;
546                 atime->tv_sec += mantissa;
547         } else {
548                 /* milliseconds */
549                 if (expon)
550                         mantissa <<= expon-1;
551                 timespec_add_ns(atime, (s64)mantissa * 1000000);
552         }
553 }
554
555 static int normalise(int *mantissa)
556 {
557         /* Shift down until value can be stored in 12 bits:
558          * Top bit will be '1', so only 11 bits needed.
559          * Not used on values below 2048.
560          */
561         int shift = 0;
562         while (*mantissa >= 4096) {
563                 *mantissa >>= 1;
564                 shift++;
565         }
566         return shift;
567 }
568
569 static int update_atime_delta(struct inode *ino)
570 {
571         /* calculate new delta to show the difference between
572          * i_atime and i_accesstime
573          */
574         int rv;
575         if (LAFSI(ino)->type < TypeBase)
576                 return 0;
577         if (timespec_compare(&ino->i_atime,
578                              &LAFSI(ino)->md.file.i_accesstime) <= 0) {
579                 /* We cannot store negative delta so if i_atime is in the
580                  * past, just store zero
581                  */
582                 rv = 0;
583         } else {
584                 struct timespec diff;
585                 int shift;
586
587                 diff = timespec_sub(ino->i_atime,
588                                     LAFSI(ino)->md.file.i_accesstime);
589                 if (diff.tv_sec >= 2048) {
590                         /* Just store the seconds */
591                         rv = diff.tv_sec;
592                         shift = normalise(&rv) + 11;
593                 } else {
594                         /* Store the milliseconds */
595                         rv = diff.tv_nsec / 1000000;
596                         rv += diff.tv_sec * 1000;
597                         if (rv >= 2048)
598                                 shift = normalise(&rv) + 1;
599                         else
600                                 shift = 0;
601                 }
602                 if (shift > 31)
603                         rv = 0xFFFF;
604                 else {
605                         rv &= 0x7ff;
606                         rv <<= 5;
607                         rv |= shift;
608                 }
609         }
610         if (LAFSI(ino)->md.file.atime_offset == rv)
611                 return 0;
612
613         LAFSI(ino)->md.file.atime_offset = rv;
614         return 1;
615 }
616
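/* Push the inode's current atime_offset out to the accesstime file.
 * Only useful once I_AccessTime is set: check_atime_ref() has then
 * already taken a reference on the relevant accesstime block, so the
 * block lookup here cannot fail.
 */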
617 static void store_atime_delta(struct inode *ino)
618 {
619         struct inode *at;
620         u32 bnum;
621         struct datablock *b;
622         u16 *atp;
623         int i;
624
625         if (!test_bit(I_AccessTime, &LAFSI(ino)->iflags))
626                 /* sorry, nothing we can do here */
627                 return;
628
629         /* We own a reference, so this lookup must succeed */
630         at = LAFSI(LAFSI(ino)->filesys)->md.fs.accesstime;
631         bnum = ino->i_ino >> (at->i_blkbits-1);
632         b = lafs_get_block(at, bnum, NULL, GFP_NOFS, MKREF(store_atime));
633         BUG_ON(!b);
634         atp = map_dblock(b);
635         i = ino->i_ino & ((1 << (at->i_blkbits-1)) - 1); /* u16 index in block */
636         if (le16_to_cpu(atp[i]) != LAFSI(ino)->md.file.atime_offset) {
637                 atp[i] = cpu_to_le16(LAFSI(ino)->md.file.atime_offset);
638                 /* FIXME - I could lose an update here - do I care? */
639                 /* Can only reserve NewSpace with checkpoint locked... */
640                 lafs_checkpoint_lock(fs_from_inode(ino));
641                 if (lafs_reserve_block(&b->b, NewSpace) == 0)
642                         lafs_dirty_dblock(b);
643                 lafs_checkpoint_unlock(fs_from_inode(ino));
644         }
645         unmap_dblock(b, atp);
646         putdref(b, MKREF(store_atime));
647 }
648
649 void lafs_inode_checkpin(struct inode *ino)
650 {
651         /* Make sure I_Pinned is set correctly.
652          * It should be set precisely if i_nlink is non-zero,
653          * and ->iblock is B_Pinned.
654          * When it is set, we own a reference to the inode.
655          *
656          * This needs to be called whenever we change
657          * i_nlink, and whenever we pin or unpin an InoIdx
658          * block.
659          */
660         if (ino->i_nlink == 0) {
661                 /* I_Pinned should not be set */
662                 if (test_and_clear_bit(I_Pinned, &LAFSI(ino)->iflags)) {
663                         if (ino->i_sb->s_type == &lafs_fs_type)
664                                 iput(ino);
665                         else
666                                 lafs_iput_fs(ino);
667                 }
668         } else {
669                 /* Need to check if iblock is Pinned. */
670                 struct indexblock *ib = NULL;
671                 if (LAFSI(ino)->iblock) {
672                         spin_lock(&ino->i_data.private_lock);
673                         ib = LAFSI(ino)->iblock;
674                         if (ib && !test_bit(B_Pinned, &ib->b.flags))
675                                 ib = NULL;
676                         spin_unlock(&ino->i_data.private_lock);
677                 }
678                 if (ib) {
679                         if (!test_and_set_bit(I_Pinned, &LAFSI(ino)->iflags)) {
680                                 if (ino->i_sb->s_type == &lafs_fs_type)
681                                         igrab(ino);
682                                 else
683                                         lafs_igrab_fs(ino);
684                         }
685                 } else {
686                         if (test_and_clear_bit(I_Pinned, &LAFSI(ino)->iflags)) {
687                                 if (ino->i_sb->s_type == &lafs_fs_type)
688                                         iput(ino);
689                                 else
690                                         lafs_iput_fs(ino);
691                         }
692                 }
693         }
694 }
695
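/* Return the inode's cached dblock (or NULL), taking a reference
 * while holding ino's i_data.private_lock.  If the dblock actually
 * belongs to a different inode (the inode file rather than 'ino'
 * itself), that inode's private_lock is taken nested as well.
 */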
696 struct datablock *lafs_inode_get_dblock(struct inode *ino, REFARG)
697 {
698         struct datablock *db;
699
700         spin_lock(&ino->i_data.private_lock);
701         db = LAFSI(ino)->dblock;
702         if (db) {
703                 if (db->b.inode == ino)
704                         getdref_locked(db, REF);
705                 else {
706                         spin_lock_nested(&db->b.inode->i_data.private_lock, 1);
707                         getdref_locked(db, REF);
708                         spin_unlock(&db->b.inode->i_data.private_lock);
709                 }
710         }
711         spin_unlock(&ino->i_data.private_lock);
712         return db;
713 }
714
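/* Find or load the datablock holding this inode's on-disk image,
 * read it in (synchronously or asynchronously as requested) and
 * return it with a reference held, or an ERR_PTR on failure.
 */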
715 struct datablock *lafs_inode_dblock(struct inode *ino, int async, REFARG)
716 {
717         struct datablock *db;
718         int err;
719
720         db = lafs_inode_get_dblock(ino, REF);
721
722         if (!db)
723                 db = lafs_get_block(LAFSI(ino)->filesys, ino->i_ino, NULL,
724                                     GFP_KERNEL, REF);
725         if (!db)
726                 return ERR_PTR(-ENOMEM);
727
728         LAFSI(ino)->dblock = db;
729         rcu_assign_pointer(db->my_inode, ino);
730         if (async)
731                 err = lafs_read_block_async(db);
732         else
733                 err = lafs_read_block(db);
734         if (err == 0)
735                 return db;
736
737         putdref(db, REF);
738         return ERR_PTR(err);
739 }
740
741 void lafs_inode_init(struct datablock *b, int type, int mode, struct inode *dir)
742 {
743         /* A new block has been allocated in an inode file to hold an
744          * inode.  We get to fill in initial values so that when
745          * 'iget' calls lafs_import_inode, the correct inode is
746          * loaded.
747          */
748         struct fs *fs = fs_from_inode(b->b.inode);
749         struct la_inode *lai = map_dblock(b);
750         int size;
751
752         lai->data_blocks = cpu_to_le32(0);
753         lai->index_blocks = cpu_to_le32(0);
754         get_random_bytes(&lai->generation, sizeof(lai->generation));
755         lai->depth = 1;
756         lai->trunc_gen = 0;
757         lai->filetype = type;
758         lai->flags = 0;
759
760         switch(type) {
761         case TypeInodeFile:
762         {
763                 struct fs_metadata *l = &lai->metadata[0].fs;
764                 size = sizeof(struct fs_metadata);
765                 l->update_time = 0;
766                 l->blocks_used = 0;
767                 l->blocks_allowed = 0;
768                 l->creation_age = fs->wc[0].cluster_seq;
769                 l->inodes_used = 0;
770                 l->parent = 0;
771                 l->quota_inodes[0] = 0;
772                 l->quota_inodes[1] = 0;
773                 l->quota_inodes[2] = 0;
774                 l->snapshot_usage_table = 0;
775                 l->pad = 0;
776                 /* name will be zero length and not used */
777                 break;
778         }
779         case TypeInodeMap:
780         {
781                 struct inodemap_metadata *l = &lai->metadata[0].inodemap;
782                 l->size = 0;
783                 size = sizeof(struct inodemap_metadata);
784                 break;
785         }
786         case TypeSegmentMap:
787                 size = sizeof(struct su_metadata);
788                 break;
789         case TypeQuota:
790                 size = sizeof(struct quota_metadata);
791                 break;
792         case TypeOrphanList:
793                 size = 0;
794                 break;
795         case TypeAccessTime:
796                 size = 0;
797                 break;
798         default:
799         {
800                 struct file_metadata *l = &lai->metadata[0].file;
801                 struct timespec now = CURRENT_TIME;
802
803                 l->flags = cpu_to_le16(0);
804                 l->userid = cpu_to_le32(current->cred->fsuid);
805                 if (dir && (dir->i_mode & S_ISGID)) {
806                         l->groupid = cpu_to_le32(dir->i_gid);
807                         if (type == TypeDir)
808                                 mode |= S_ISGID;
809                 } else
810                         l->groupid = cpu_to_le32(current->cred->fsgid);
811                 if (dir && LAFSI(dir)->md.file.treeid)
812                         l->treeid = cpu_to_le32(LAFSI(dir)->md.file.treeid);
813                 else
814                         l->treeid = l->userid;
815
816                 l->mode = cpu_to_le16(mode);
817                 l->creationtime = encode_time(&now);
818                 l->modifytime = l->creationtime;
819                 l->ctime = l->creationtime;
820                 l->accesstime = l->creationtime;
821                 l->size = 0;
822                 l->parent = dir ? cpu_to_le32(dir->i_ino) : 0;
823                 l->linkcount = 0;
824                 l->attrinode = 0;
825                 if (type == TypeDir) {
826                         struct dir_metadata *l = &lai->metadata[0].dir;
827                         u32 seed;
828                         get_random_bytes(&seed,
829                                          sizeof(seed));
830                         seed = (seed & ~7) | 1;
831                         l->hash_seed = cpu_to_le32(seed);
832                         size = sizeof(struct dir_metadata);
833                 } else if (type == TypeSpecial) {
834                         struct special_metadata *s = &lai->metadata[0].special;
835                         s->major = s->minor = 0;
836                         size = sizeof(struct special_metadata);
837                 } else
838                         size = sizeof(struct file_metadata);
839         }
840         }
841         size += sizeof(struct la_inode);
842         lai->metadata_size = cpu_to_le16(size);
843         memset(((char *)lai)+size, 0, fs->blocksize-size);
844         *(u16 *)(((char *)lai)+size) = cpu_to_le16(IBLK_EXTENT);
845
846         unmap_dblock(b, lai);
847         set_bit(B_Valid, &b->b.flags);
848         LAFS_BUG(!test_bit(B_Pinned, &b->b.flags), &b->b);
849         lafs_dirty_dblock(b);
850 }
851
852 static int inode_map_free(struct fs *fs, struct inode *fsys, u32 inum);
853
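/* Final iput processing.  For an unlinked inode we start the
 * deferred truncate/delete work - claim the block, put it on the
 * orphan list and free the inode number - otherwise we just drop
 * the page cache and break the inode<->dblock linkage.
 */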
854 void lafs_evict_inode(struct inode *ino)
855 {
856         struct fs *fs = fs_from_inode(ino);
857         struct lafs_inode *li = LAFSI(ino);
858
859         if (ino->i_mode == 0) {
860                 /* There never was an inode here,
861                  * so nothing to do.
862                  * We just call end_writeback to get the
863                  * flags set properly.
864                  */
865                 end_writeback(ino);
866                 return;
867         }
868
869         dprintk("EVICT INODE %d\n", (int)ino->i_ino);
870
871
872         /* Normal truncation holds an igrab, so we cannot be
873          * deleted until any truncation finishes
874          */
875         BUG_ON(test_bit(I_Trunc, &LAFSI(ino)->iflags));
876
877         if (ino->i_nlink == 0) {
878                 struct datablock *b =
879                         lafs_inode_dblock(ino, SYNC, MKREF(delete_inode));
880                 i_size_write(ino, 0);
881                 truncate_inode_pages(&ino->i_data, 0);
882                 LAFSI(ino)->trunc_next = 0;
883                 set_bit(I_Deleting, &LAFSI(ino)->iflags);
884                 set_bit(I_Trunc, &LAFSI(ino)->iflags);
885                 lafs_igrab_fs(ino);
886                 if (!IS_ERR(b)) {
887                         set_bit(B_Claimed, &b->b.flags);
888                         lafs_add_orphan(fs, b);
889                         dprintk("PUNCH hole for %d\n", (int)b->b.fileaddr);
890                         putdref(b, MKREF(delete_inode));
891                 }
892                 inode_map_free(fs, LAFSI(ino)->filesys,  ino->i_ino);
893         } else
894                 truncate_inode_pages(&ino->i_data, 0);
895         end_writeback(ino);
896
897         dprintk("CLEAR INODE %d\n", (int)ino->i_ino);
898
899         li->type = 0;
900
901         /* Now is a good time to break the linkage between
902          * inode and dblock - but not if the file is
903          * being deleted
904          */
905         if (!test_bit(I_Deleting, &li->iflags)) {
906                 struct datablock *db;
907                 spin_lock(&ino->i_data.private_lock);
908                 db = li->dblock;
909                 if (db) {
910                         struct indexblock *ib = li->iblock;
911                         LAFS_BUG(ib && atomic_read(&ib->b.refcnt), &db->b);
912                         db->my_inode = NULL;
913                         li->dblock = NULL;
914                         li->iblock = NULL;
915                 }
916                 spin_unlock(&ino->i_data.private_lock);
917         }
918
919         /* FIXME release quota inodes if filesystem */
920 }
921
922 static int prune(void *data, u32 addr, u64 paddr, int len)
923 {
924         /* This whole index block is being pruned, just account
925          * for everything and it will be cleared afterwards
926          */
927         struct indexblock *ib = data;
928         struct inode *ino = ib->b.inode;
929         struct fs *fs = fs_from_inode(ino);
930         int ph = !!test_bit(B_Phase1, &ib->b.flags);
931         int i;
932         dprintk("PRUNE %d for %d at %lld\n", addr, len, (long long)paddr);
933         if (paddr == 0 || len == 0)
934                 return 0;
935         for (i = 0 ; i < len ; i++)
936                 lafs_summary_update(fs, ino, paddr+i, 0, 0, ph, 0);
937         return len;
938 }
939
940 static int prune_some(void *data, u32 addr, u64 paddr, int len)
941 {
942         /* Part of this index block is being pruned.  Copy
943          * what addresses we can into uninc_table so that
944          * it can be 'incorporated'
945          * We should probably share some code with
946          * lafs_allocated_block??
947          */
948         struct indexblock *ib = data;
949         struct inode *ino = ib->b.inode;
950         struct fs *fs = fs_from_inode(ino);
951         int ph = !!test_bit(B_Phase1, &ib->b.flags);
952         int i;
953
954         if (paddr == 0 || len == 0)
955                 return 0;
956         dprintk("PRUNE2 %d for %d at %lld\n", addr, len, (long long)paddr);
957         for (i = 0 ; i < len ; i++) {
958                 /* FIXME should allow longer truncation ranges in uninc_table
959                  * as they are easy to handle.
960                  */
961                 struct addr *a;
962                 if (addr + i < LAFSI(ino)->trunc_next)
963                         continue;
964                 spin_lock(&ino->i_data.private_lock);
965                 a = &ib->uninc_table.pending_addr
966                         [ib->uninc_table.pending_cnt - 1];
967                 if (ib->uninc_table.pending_cnt <
968                     ARRAY_SIZE(ib->uninc_table.pending_addr)) {
969                         a++;
970                         a->fileaddr = addr + i;
971                         a->physaddr = 0;
972                         a->cnt = 1;
973                         LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
974                         ib->uninc_table.pending_cnt++;
975                 } else {
976                         spin_unlock(&ino->i_data.private_lock);
977                         break;
978                 }
979                 spin_unlock(&ino->i_data.private_lock);
980                 lafs_summary_update(fs, ino, paddr+i, 0, 0, ph, 0);
981         }
982         return i;
983 }
984
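/* Perform one step of the deferred truncate/delete work for an
 * orphaned inode block.  Returns 0 when there is nothing more to do;
 * -EAGAIN, -EBUSY or -ERESTARTSYS mean the orphan needs to be
 * visited again later.
 */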
985 int lafs_inode_handle_orphan(struct datablock *b)
986 {
987         /* Don't need rcu protection for my_inode; run_orphan
988          * holds a reference.
989          */
990         struct indexblock *ib, *ib2;
991         struct inode *ino = b->my_inode;
992         struct fs *fs = fs_from_inode(ino);
993         u32 trunc_next, next_trunc;
994         int loop_cnt = 20;
995         int err = -ENOMEM;
996
997         if (!test_bit(I_Trunc, &LAFSI(ino)->iflags)) {
998                 if (test_bit(I_Deleting, &LAFSI(ino)->iflags)) {
999                         LAFS_BUG(ino->i_nlink, &b->b);
1000                         if (LAFSI(ino)->cblocks +
1001                             LAFSI(ino)->pblocks +
1002                             LAFSI(ino)->ablocks +
1003                             LAFSI(ino)->ciblocks +
1004                             LAFSI(ino)->piblocks)
1005                                 printk("Deleting inode %lu: %ld+%ld+%ld %ld+%ld\n",
1006                                        ino->i_ino,
1007                                        LAFSI(ino)->cblocks,
1008                                        LAFSI(ino)->pblocks,
1009                                        LAFSI(ino)->ablocks,
1010                                        LAFSI(ino)->ciblocks,
1011                                        LAFSI(ino)->piblocks);
1012                         BUG_ON(LAFSI(ino)->cblocks +
1013                                LAFSI(ino)->pblocks +
1014                                LAFSI(ino)->ablocks +
1015                                LAFSI(ino)->ciblocks +
1016                                LAFSI(ino)->piblocks);
1017                         if (lafs_erase_dblock_async(b))
1018                                 lafs_orphan_release(fs, b);
1019                 } else if (ino->i_nlink || LAFSI(ino)->type == 0)
1020                         lafs_orphan_release(fs, b);
1021                 else
1022                         lafs_orphan_forget(fs, b);
1023                 return 0;
1024         }
1025
1026         ib = lafs_make_iblock(ino, ADOPT, SYNC, MKREF(inode_handle_orphan));
1027         if (IS_ERR(ib))
1028                 return PTR_ERR(ib);
1029
1030         /* Here is the guts of 'truncate'.  We find the next leaf index
1031          * block and discard all the addresses there-in.
1032          */
1033         trunc_next = LAFSI(ino)->trunc_next;
1034
1035         if (trunc_next == 0xFFFFFFFF) {
1036                 /* truncate has finished in that all data blocks
1037                  * have been removed and all index block are either
1038                  * gone or pending incorporation at which point they will
1039                  * go.
1040                  * If we hit a phase change, we will need to postpone
1041                  * the rest of the cleaning until it completes.
1042                  * If a checkpoint is happening, it will do all
1043                  * the work that we could do now, so just
1044                  * let it.
1045                  */
1046                 struct indexblock *tmp;
1047                 struct indexblock *next;
1048                 u32 lastaddr;
1049
1050                 if (!test_bit(B_Pinned, &ib->b.flags)) {
1051                         /* must be finished */
1052                         LAFS_BUG(test_bit(B_Dirty, &ib->b.flags), &ib->b);
1053                         clear_bit(I_Trunc, &LAFSI(ino)->iflags);
1054                         lafs_iput_fs(ino);
1055                         wake_up(&fs->trunc_wait);
1056                         err = -ERESTARTSYS;
1057                         goto out2;
1058                 }
1059                 if (fs->checkpointing) {
1060                         /* This cannot happen with current code,
1061                          * but leave it in case we ever have
1062                          * orphan handling parallel with checkpoints
1063                          */
1064                         err = -EBUSY; /* Try again after the checkpoint */
1065                         goto out2;
1066                 }
1067
1068                 lastaddr = (i_size_read(ino) +
1069                             fs->blocksize - 1)
1070                         >> fs->blocksize_bits;
1071                 /* Find a Pinned descendent of ib which has no
1072                  * Pinned descendents and no PrimaryRef dependent
1073                  * (so take the last).
1074                  * Prefer blocks that are beyond EOF (again, take the last).
1075                  * If there are none, descend the last block that
1076                  * is not after EOF and look at its children.
1077                  */
1078                 ib2 = next = ib;
1079                 spin_lock(&ib->b.inode->i_data.private_lock);
1080                 while (next) {
1081                         ib2 = next;
1082                         next = NULL;
1083                         list_for_each_entry(tmp, &ib2->children, b.siblings) {
1084                                 if (!test_bit(B_Index, &tmp->b.flags) ||
1085                                     !test_bit(B_Pinned, &tmp->b.flags))
1086                                         continue;
1087                                 if (next == NULL ||
1088                                     tmp->b.fileaddr > next->b.fileaddr)
1089                                         next = tmp;
1090                         }
1091                 }
1092                 if (ib2->b.fileaddr < lastaddr) {
1093                         /* Must be all done */
1094                         spin_unlock(&ib->b.inode->i_data.private_lock);
1095                         clear_bit(I_Trunc, &LAFSI(ino)->iflags);
1096                         lafs_iput_fs(ino);
1097                         wake_up(&fs->trunc_wait);
1098                         err = -ERESTARTSYS;
1099                         goto out2;
1100                 }
1101                 getiref(ib2, MKREF(inode_handle_orphan2));
1102                 spin_unlock(&ib->b.inode->i_data.private_lock);
1103
1104                 /* ib2 is an index block beyond EOF with no
1105                  * Pinned children.
1106                  * Incorporating it should unpin it.
1107                  */
1108                 if (!list_empty(&ib2->children)) {
1109                         lafs_print_tree(&ib2->b, 3);
1110                         LAFS_BUG(1, &ib2->b);
1111                 }
1112
1113                 if (!lafs_iolock_written_async(&ib2->b)) {
1114                         putiref(ib2, MKREF(inode_handle_orphan2));
1115                         err = -EAGAIN;
1116                         goto out2;
1117                 }
1118                 while (ib2->uninc_table.pending_cnt || ib2->uninc)
1119                         lafs_incorporate(fs, ib2);
1120
1121                 if (test_bit(B_Dirty, &ib2->b.flags) ||
1122                     test_bit(B_Realloc, &ib2->b.flags))
1123                         lafs_cluster_allocate(&ib2->b, 0);
1124                 else
1125                         lafs_iounlock_block(&ib2->b);
1126
1127                 if (!list_empty(&ib2->b.siblings)) {
1128                         printk("looping on %s\n", strblk(&ib2->b));
1129                         loop_cnt--;
1130                         if (loop_cnt < 0)
1131                                 BUG();
1132                 }
1133                 putiref(ib2, MKREF(inode_handle_orphan2));
1134                 err = -ERESTARTSYS;
1135                 if (ib->uninc) {
1136                         if (lafs_iolock_written_async(&ib->b)) {
1137                                 while (ib->uninc)
1138                                         lafs_incorporate(fs, ib);
1139                                 lafs_iounlock_block(&ib->b);
1140                         } else
1141                                 err = -EAGAIN;
1142                 }
1143         out2:
1144                 putiref(ib, MKREF(inode_handle_orphan));
1145                 return err;
1146         }
1147
1148         putiref(ib, MKREF(inode_handle_orphan));
1149
1150         ib = lafs_leaf_find(ino, trunc_next, ADOPT, &next_trunc,
1151                             ASYNC, MKREF(inode_handle_orphan3));
1152         if (IS_ERR(ib))
1153                 return PTR_ERR(ib);
1154         /* now hold an iolock on ib */
1155
1156         /* Ok, trunc_next seems to refer to a block that exists.
1157          * We need to erase it..
1158          *
1159          * So we open up the index block ourselves, call
1160          * lafs_summary_update with each block address, and then
1161          * erase the block.
1162          */
1163
1164         if (LAFSI(ino)->depth == 0) {
1165                 /* Nothing to truncate */
1166                 clear_bit(I_Trunc, &LAFSI(ino)->iflags);
1167                 lafs_iput_fs(ino);
1168                 if (test_bit(B_Pinned, &ib->b.flags))
1169                         /* Need to move the dirtiness which keeps this
1170                          * pinned to the data block.
1171                          */
1172                         lafs_cluster_allocate(&ib->b, 0);
1173                 else
1174                         lafs_iounlock_block(&ib->b);
1175                 err = -ERESTARTSYS;
1176                 goto out_put;
1177         }
1178
1179         lafs_checkpoint_lock(fs);
1180         err = lafs_reserve_block(&ib->b, ReleaseSpace);
1181         if (err < 0)
1182                 goto out;
1183
1184         if (!test_bit(B_Valid, &ib->b.flags) &&
1185             test_bit(B_InoIdx, &ib->b.flags)) {
1186                 /* still invalid, just re-erase to remove
1187                  * pinning */
1188                 LAFSI(ino)->trunc_next = next_trunc;
1189                 lafs_cluster_allocate(&ib->b, 0);
1190                 err = -ERESTARTSYS;
1191                 goto out_unlocked;
1192         }
1193
1194         lafs_pin_block(&ib->b);
1195
1196         /* It might be that this can happen, in which case
1197          * we simply update trunc_next and loop.  But I'd like
1198          * to be sure before I implement that
1199          */
1200         if (!test_bit(B_Valid, &ib->b.flags)) {
1201                 printk("Not Valid: %s\n", strblk(&ib->b));
1202                 printk("depth = %d\n", LAFSI(ino)->depth);
1203                 if (test_bit(B_InoIdx, &ib->b.flags))
1204                         printk("DB: %s\n", strblk(&LAFSI(ib->b.inode)->dblock->b));
1205                 LAFSI(ino)->trunc_next = next_trunc;
1206                 //BUG_ON(!test_bit(B_Valid, &ib->b.flags));
1207                 err = -ERESTARTSYS;
1208                 goto out;
1209         }
1210
1211         if (ib->b.fileaddr < trunc_next &&
1212             lafs_leaf_next(ib, 0) < trunc_next) {
1213                 /* We only want to truncate part of this index block.
1214                  * So we copy addresses into uninc_table and then
1215                  * call lafs_incorporate.
1216                  * This might cause the index tree to grow, so we
1217                  * cannot trust next_trunc
1218                  */
1219                 if (ib->uninc_table.pending_cnt == 0 &&
1220                     ib->uninc == NULL) {
1221                         lafs_dirty_iblock(ib, 0);
1222                         /* FIXME this just removes 8 blocks at a time,
1223                          * which is not enough
1224                          */
1225                         lafs_walk_leaf_index(ib, prune_some, ib);
1226                 }
1227                 if (test_bit(B_Dirty, &ib->b.flags))
1228                         lafs_incorporate(fs, ib);
1229                 err = -ERESTARTSYS;
1230                 goto out;
1231         }
1232         LAFSI(ino)->trunc_next = next_trunc;
1233
1234         while (ib->uninc_table.pending_cnt || ib->uninc) {
1235                 /* There should be no Realloc data blocks here
1236                  * but index blocks might be realloc still.
1237                  */
1238                 LAFS_BUG(!test_bit(B_Dirty, &ib->b.flags) &&
1239                          !test_bit(B_Realloc, &ib->b.flags), &ib->b);
1240                 lafs_incorporate(fs, ib);
1241         }
1242         if (test_bit(B_InoIdx, &ib->b.flags) ||
1243             !test_bit(B_PhysValid, &ib->b.flags) ||
1244             ib->b.physaddr != 0) {
1245                 lafs_walk_leaf_index(ib, prune, ib);
1246                 lafs_clear_index(ib);
1247                 lafs_dirty_iblock(ib, 0);
1248         }
1249         if (test_bit(B_Dirty, &ib->b.flags))
1250                 lafs_incorporate(fs, ib);
1251         if (!list_empty(&ib->children))
1252                 lafs_print_tree(&ib->b, 2);
1253         LAFS_BUG(!list_empty(&ib->children), &ib->b);
1254         err = -ERESTARTSYS;
1255 out:
1256         lafs_iounlock_block(&ib->b);
1257 out_unlocked:
1258         lafs_checkpoint_unlock(fs);
1259 out_put:
1260         putiref(ib, MKREF(inode_handle_orphan3));
1261         return err;
1262 }
1263
1264 void lafs_dirty_inode(struct inode *ino)
1265 {
1266         /* this is called in one of three cases:
1267          * 1/ by lafs internally when dblock or iblock is pinned and
1268          *    ready to be dirtied
1269          * 2/ by writeout before requesting a write - to update mtime
1270          * 3/ by read to update atime
1271          *
1272          * We want to handle atime updates carefully as they may not change
1273          * the stored inode itself.
1274          * For all other updates, the inode dblock exists and is pinned.
1275          * In those cases we will be updating the inode and so can store
1276          * the atime exactly.
1277          * For an atime update, the dblock may not exists, or may not be
1278          * Pinned.  If it isn't then we don't want to make the inode dirty
1279          * but only want to update the delta stored in the atime file.
1280          * The block for that should already be pinned.
1281          *
1282          *
1283          * We mustn't update the data block as it could be in
1284          * writeout and we cannot always wait safely.
1285          * So require that anyone who really cares, dirties the datablock
1286          * or a child themselves.
1287          * When cluster_allocate eventually gets called, it will update
1288          * the datablock from the inode.
1289          * If an update has to wait for the next phase, lock_dblock
1290          * (e.g. in setattr) will do that.
1291          *
1292          * We also use this opportunity to update the filesystem modify time.
1293          */
1294         struct timespec now;
1295         struct inode *filesys;
1296         int atime_only = 1;
1297
1298         if (LAFSI(ino)->dblock) {
1299                 struct datablock *db;
1300                 spin_lock(&ino->i_data.private_lock);
1301                 db = LAFSI(ino)->dblock;
1302                 if (db && test_bit(B_Pinned, &db->b.flags))
1303                         atime_only = 0;
1304                 spin_unlock(&ino->i_data.private_lock);
1305         }
1306
1307         if (atime_only) {
1308                 if (update_atime_delta(ino))
1309                         store_atime_delta(ino);
1310                 return;
1311         }
1312
1313         set_bit(I_Dirty, &LAFSI(ino)->iflags);
1314         ino->i_sb->s_dirt = 1;
1315
1316         if (LAFSI(ino)->type < TypeBase)
1317                 return;
1318         LAFSI(ino)->md.file.i_accesstime = ino->i_atime;
1319         if (LAFSI(ino)->md.file.atime_offset) {
1320                 LAFSI(ino)->md.file.atime_offset = 0;
1321                 store_atime_delta(ino);
1322         }
1323
1324         now = current_fs_time(ino->i_sb);
1325         filesys = LAFSI(ino)->filesys;
1326         if (!timespec_equal(&filesys->i_mtime, &now)) {
1327                 filesys->i_mtime = now;
1328                 set_bit(I_Dirty, &LAFSI(filesys)->iflags);
1329         }
1330 }
1331
1332 int lafs_sync_inode(struct inode *ino, int wait)
1333 {
1334         /* fsync has been called on this file so we need
1335          * to sync any inode updates to the next cluster.
1336          *
1337          * If we cannot create an update record,
1338          * we wait for a phase change, which writes everything
1339          * out.
1340          */
1341         struct datablock *b;
1342         struct fs *fs = fs_from_inode(ino);
1343         struct update_handle uh;
1344         int err;
1345
1346         if (wait) {
1347                 if (LAFSI(ino)->update_cluster > 1)
1348                         lafs_cluster_wait(fs, LAFSI(ino)->update_cluster);
1349                 if (LAFSI(ino)->update_cluster == 1) {
1350                         lafs_checkpoint_lock(fs);
1351                         lafs_checkpoint_unlock_wait(fs);
1352                 }
1353                 return 0;
1354         }
1355
1356         LAFSI(ino)->update_cluster = 0;
1357         if (!test_bit(I_Dirty, &LAFSI(ino)->iflags))
1358                 return 0;
1359         b = lafs_inode_dblock(ino, SYNC, MKREF(write_inode));
1360         if (IS_ERR(b))
1361                 return PTR_ERR(b);
1362
1363         lafs_iolock_written(&b->b);
1364         lafs_inode_fillblock(ino);
1365         lafs_iounlock_block(&b->b);
1366
1367         err = lafs_cluster_update_prepare(&uh, fs, LAFS_INODE_LOG_SIZE);
1368         if (err)
1369                 lafs_cluster_update_abort(&uh);
1370         else {
1371                 lafs_checkpoint_lock(fs);
1372                 if (lafs_cluster_update_pin(&uh) == 0) {
1373                         if (test_and_clear_bit(B_Dirty, &b->b.flags))
1374                                 lafs_space_return(fs, 1);
1375                         LAFSI(ino)->update_cluster =
1376                                 lafs_cluster_update_commit
1377                                 (&uh, b, LAFS_INODE_LOG_START,
1378                                  LAFS_INODE_LOG_SIZE);
1379                 } else  
1380                         lafs_cluster_update_abort(&uh);
1381                 lafs_checkpoint_unlock(fs);
1382         }
1383         if (test_bit(B_Dirty, &b->b.flags)) {
1384                 /* FIXME need to write out the data block...
1385                  * Is that just lafs_cluster_allocate ?
1386                  */
1387         }
1388
1389         if (LAFSI(ino)->update_cluster == 0) {
1390                 lafs_checkpoint_lock(fs);
1391                 if (test_bit(B_Dirty, &b->b.flags))
1392                         LAFSI(ino)->update_cluster = 1;
1393                 lafs_checkpoint_start(fs);
1394                 lafs_checkpoint_unlock(fs);
1395         }
1396         putdref(b, MKREF(write_inode));
1397         return 0; /* FIXME should I return some error message??? */
1398 }
1399
1400 void lafs_inode_fillblock(struct inode *ino)
1401 {
1402         /* copy data from ino into the related data block */
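        /* Multi-byte on-disc fields are stored little-endian; the
         * type-specific part of the metadata area is a union chosen by
         * li->type, which the switch below fills in.
         */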
1403
1404         struct lafs_inode *li = LAFSI(ino);
1405         struct datablock *db = li->dblock;
1406         struct la_inode *lai;
1407
1408         clear_bit(I_Dirty, &LAFSI(ino)->iflags);
1409
1410         lai = map_dblock(db);
1411         lai->data_blocks = cpu_to_le32(li->cblocks);
1412         lai->index_blocks = cpu_to_le32(li->ciblocks);
1413         lai->generation = cpu_to_le16(ino->i_generation);
1414         lai->trunc_gen = li->trunc_gen;
1415         lai->flags = li->flags;
1416         lai->filetype = li->type;
1417         if (lai->metadata_size != cpu_to_le16(li->metadata_size)) {
1418                 /* Changing the metadata size is weird.
1419                  * We will need to handle this somehow for xattrs.
1420                  * For now we just want to cope with
1421                  * Dir -> InodeFile changes, and that guarantees us
1422                  * there is no index info - so just clear the index
1423                  * area.
1424                  */
1425                 u16 *s = (u16 *)(((char *)lai) + li->metadata_size);
1426                 BUG_ON(li->type != TypeInodeFile);
1427                 lai->metadata_size = cpu_to_le16(li->metadata_size);
1428                 memset(s, 0, ino->i_sb->s_blocksize - li->metadata_size);
1429                 *s = cpu_to_le16(IBLK_INDIRECT);
1430         }
1431         lai->depth = li->depth;
1432
1433         switch (li->type) {
1434         case TypeInodeFile:
1435         {
1436                 struct fs_md *i = &li->md.fs;
1437                 struct fs_metadata *l = &lai->metadata[0].fs;
1438                 int nlen;
1439
1440                 l->snapshot_usage_table = cpu_to_le16(i->usagetable);
1441                 l->update_time = cpu_to_le64(encode_time(&ino->i_mtime));
1442                 l->blocks_used = cpu_to_le64(i->cblocks_used);
1443                 l->blocks_allowed = cpu_to_le64(i->blocks_allowed);
1444                 l->creation_age = cpu_to_le64(i->creation_age);
1445                 l->inodes_used = cpu_to_le32(i->inodes_used);
1446                 l->parent = cpu_to_le32(i->parent);
1447                 l->quota_inodes[0] = cpu_to_le32(i->quota_inums[0]);
1448                 l->quota_inodes[1] = cpu_to_le32(i->quota_inums[1]);
1449                 l->quota_inodes[2] = cpu_to_le32(i->quota_inums[2]);
1450                 nlen = lai->metadata_size - offsetof(struct la_inode,
1451                                                      metadata[0].fs.name);
1452                 memset(l->name, 0, nlen);
1453                 if (i->name == NULL)
1454                         nlen = 0;
1455                 else if (strlen(i->name) < nlen)
1456                         nlen = strlen(i->name);
1457                 memcpy(l->name, i->name, nlen);
1458                 break;
1459         }
1460
1461         case TypeInodeMap:
1462         {
1463                 struct inodemap_md *m = &li->md.inodemap;
1464                 struct inodemap_metadata *s = &lai->metadata[0].inodemap;
1465                 s->size = cpu_to_le32(m->size);
1466                 break;
1467         }
1468
1469         case TypeSegmentMap:
1470         {
1471                 struct su_md *m = &li->md.segmentusage;
1472                 struct su_metadata *s = &lai->metadata[0].segmentusage;
1473                 s->table_size = cpu_to_le32(m->table_size);
1474                 break;
1475         }
1476
1477         case TypeQuota:
1478         {
1479                 struct quota_md *m = &li->md.quota;
1480                 struct quota_metadata *s = &lai->metadata[0].quota;
1481                 s->gracetime = cpu_to_le32(m->gracetime);
1482                 s->graceunits = cpu_to_le32(m->graceunits);
1483                 break;
1484         }
1485         case TypeOrphanList:
1486         case TypeAccessTime:
1487                 break;
1488
1489         default: /* TypeBase or larger */
1490         {
1491                 struct file_md *i = &li->md.file;
1492                 struct file_metadata *l = &lai->metadata[0].file;
1493                 struct dir_metadata *d = &lai->metadata[0].dir;
1494                 struct special_metadata *s = &lai->metadata[0].special;
1495
1496                 if (li->type < TypeBase)
1497                         break;
1498                 l->flags = cpu_to_le16(i->flags);
1499                 l->mode = cpu_to_le16(ino->i_mode);
1500                 l->userid = cpu_to_le32(ino->i_uid);
1501                 l->groupid = cpu_to_le32(ino->i_gid);
1502                 l->treeid = cpu_to_le32(i->treeid);
1503                 l->creationtime = cpu_to_le64(i->creationtime);
1504                 l->modifytime = cpu_to_le64(encode_time(&ino->i_mtime));
1505                 l->ctime = cpu_to_le64(encode_time(&ino->i_ctime));
1506                 l->accesstime = cpu_to_le64(encode_time(&i->i_accesstime));
1507                 l->size = cpu_to_le64(ino->i_size);
1508                 l->parent = cpu_to_le32(i->parent);
1509                 l->linkcount = cpu_to_le32(ino->i_nlink);
1510
1511                 switch (li->type) {
1512                 case TypeFile:
1513                         break;
1514                 case TypeDir:
1515                         d->hash_seed = cpu_to_le32(i->seed);
1516                         break;
1517                 case TypeSymlink:
1518                         break;
1519                 case TypeSpecial:
1520                         s->major = cpu_to_le32(MAJOR(ino->i_rdev));
1521                         s->minor = cpu_to_le32(MINOR(ino->i_rdev));
1522                         break;
1523                 }
1524         }
1525         }
1526         unmap_dblock(db, lai);
1527 }
1528
1529 /*-----------------------------------------------------------------------
1530  * Inode allocate map handling.
1531  * Inode 1 of each fileset is a bitmap of free inode numbers.
1532  * Whenever the file is extended in size, new bits are set to one.  They
1533  * are then cleared when the inode is allocated.  When a block becomes
1534  * full of zeros, we don't need to store it any more.
1535  *
1536  * We don't clear the bit until we are committed to creating an inode.
1537  * This means we cannot clear it straight away, so two different threads
1538  * might see the same inode number as being available.  We have two
1539  * approaches to guard against this.
1540  * Firstly we have a 'current' pointer into the inodemap file and
1541  * increase that past the inode we return.  This discourages multiple
1542  * hits but, as the pointer would need to be rewound occasionally, it
1543  * isn't a guarantee.  The guarantee against multiple allocations is done
1544  * via a flag in the block representing an inode.  This is set
1545  * while an inode is being allocated.
1546  */
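/* The mapping between an inode number and its place in the map is simple
 * arithmetic:  bnum = inum >> (3 + blocksize_bits)  and
 * bit = inum & (8 * blocksize - 1).
 * For example, with 4096-byte blocks (32768 bits per block), inum 40000
 * lives in block 1 at bit 7232.  A set bit means "free"; newly added map
 * blocks are initialised to all ones.
 */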
1547
1548 /* inode number allocation has the prealloc/pin/commit/abort structure
1549  * so it can be committed effectively
1550  */
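/* The four stages are implemented by inode_map_new_prepare(),
 * inode_map_new_pin(), inode_map_new_commit() and inode_map_new_abort()
 * below, with choose_free_inum() as the helper that finds a candidate bit.
 */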
1551
1552 static int
1553 choose_free_inum(struct fs *fs, struct inode *fsys, u32 *inump,
1554                  struct datablock **bp, int *restarted)
1555 {
1556         struct inode *im = lafs_iget(fsys, 1, SYNC);
1557         loff_t bnum;
1558         struct datablock *b;
1559         char *buf;
1560         int err;
1561         int bit;
1562
1563         if (*bp) {
1564                 struct inode *i = (*bp)->b.inode;
1565                 putdref(*bp, MKREF(cfi_map));
1566                 iput(i);
1567                 *bp = NULL;
1568         }
1569
1570         mutex_lock_nested(&im->i_mutex, I_MUTEX_QUOTA);
1571 retry:
1572         bnum = LAFSI(im)->md.inodemap.thisblock;
1573
1574         if (bnum == NoBlock ||
1575             LAFSI(im)->md.inodemap.nextbit >= (fs->blocksize<<3)) {
1576                 if (bnum == NoBlock)
1577                         bnum = LAFSI(im)->md.inodemap.size;
1578
1579                 if (bnum+1 < LAFSI(im)->md.inodemap.size)
1580                         bnum++;
1581                 else if (!*restarted) {
1582                         bnum = 0;
1583                         *restarted = 1;
1584                 } else {
1585                         /* Need to add a new block to the file */
1586                         bnum = LAFSI(im)->md.inodemap.size;
1587                         b = lafs_get_block(im, bnum, NULL, GFP_KERNEL,
1588                                            MKREF(cfi_map));
1589                         err = -ENOMEM;
1590                         if (!b)
1591                                 goto abort;
1592                         lafs_iolock_written(&b->b);
1593                         set_bit(B_PinPending, &b->b.flags);
1594                         lafs_iounlock_block(&b->b);
1595                 retry2:
1596                         lafs_checkpoint_lock(fs);
1597                         err = lafs_pin_dblock(b, NewSpace);
1598                         if (err == -EAGAIN) {
1599                                 lafs_checkpoint_unlock_wait(fs);
1600                                 goto retry2;
1601                         }
1602                         if (err < 0)
1603                                 goto abort_unlock;
1604
1605                         buf = map_dblock(b);
1606                         /* Set block to "all are free" */
1607                         memset(buf, 0xff, fs->blocksize);
1608                         unmap_dblock(b, buf);
1609                         set_bit(B_Valid, &b->b.flags);
1610                         LAFSI(im)->md.inodemap.size = bnum+1;
1611                         lafs_dirty_inode(im);
1612                         lafs_dirty_dblock(b);
1613                         lafs_checkpoint_unlock(fs);
1614                         putdref(b, MKREF(cfi_map));
1615                 }
1616                 b = NULL;
1617                 err = lafs_find_next(im, &bnum);
1618                 if (err < 0)
1619                         goto abort;
1620                 if (err == 0)
1621                         bnum = 0;
1622
1623                 LAFSI(im)->md.inodemap.nextbit = 0;
1624                 LAFSI(im)->md.inodemap.thisblock = bnum;
1625                 goto retry;
1626         }
1627         b = lafs_get_block(im, bnum, NULL, GFP_KERNEL, MKREF(cfi_map));
1628         err = -ENOSPC;
1629         if (!b)
1630                 goto abort;
1631         err = lafs_find_block(b, NOADOPT);
1632         if (err)
1633                 goto abort;
1634         if (b->b.physaddr == 0 && !test_bit(B_Valid, &b->b.flags)) {
1635                 LAFSI(im)->md.inodemap.nextbit =
1636                         (fs->blocksize<<3) + 1;
1637                 putdref(b, MKREF(cfi_map));
1638                 goto retry;
1639         }
1640         err = lafs_read_block(b);
1641         if (err)
1642                 goto abort;
1643
1644         bit = LAFSI(im)->md.inodemap.nextbit;
1645         LAFSI(im)->md.inodemap.thisblock = bnum;
1646         buf = map_dblock(b);
1647         while (bnum == 0 && bit < 16) {
1648                 /* Never return an inum below 16 - they are special */
1649                 if (!generic_test_le_bit(bit, (unsigned long *)buf))
1650                         generic___clear_le_bit(bit, (unsigned long *)buf);
1651                 bit++;
1652         }
1653
1654         bit = generic_find_next_le_bit((unsigned long *)buf,
1655                                        fs->blocksize<<3, bit);
1656         unmap_dblock(b, buf);
1657         LAFSI(im)->md.inodemap.nextbit = bit+1;
1658         if (bit >= fs->blocksize<<3) {
1659                 putdref(b, MKREF(cfi_map));
1660                 goto retry;
1661         }
1662         mutex_unlock(&im->i_mutex);
1663         *bp = b;
1664         *inump = bit + (bnum << (im->i_blkbits + 3));
1665         return 0;
1666
1667 abort_unlock:
1668         lafs_checkpoint_unlock(fs);
1669 abort:
1670         putdref(b, MKREF(cfi_map));
1671         *bp = NULL;
1672         mutex_unlock(&im->i_mutex);
1673         iput(im);
1674         return err;
1675 }
1676
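/* State carried between the prepare/pin/commit/abort stages:
 * 'ib' is the block that will hold the new inode itself, and 'mb' (only
 * set when the caller asked us to choose the number) is the inode-map
 * block covering the number picked by choose_free_inum().
 */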
1677 struct inode_map_new_info {
1678         struct datablock *ib, *mb;
1679 };
1680
1681 static int
1682 inode_map_new_prepare(struct fs *fs, int inum, struct inode *fsys,
1683                       struct inode_map_new_info *imni)
1684 {
1685         int choice = inum;
1686         int restarted = 0;
1687         int err = 0;
1688         struct datablock *b;
1689
1690         imni->ib = imni->mb = NULL;
1691 retry:
1692         if (inum == 0)
1693                 /* choose a possibly-free inode number */
1694                 err = choose_free_inum(fs, fsys, &choice,
1695                                        &imni->mb, &restarted);
1696         if (err)
1697                 return err;
1698
1699         b = lafs_get_block(fsys, choice, NULL, GFP_KERNEL,
1700                            MKREF(cfi_ino));
1701         if (!b)
1702                 return -ENOMEM;
1703
1704         if (test_and_set_bit(B_Claimed, &b->b.flags)) {
1705                 putdref(b, MKREF(cfi_ino));
1706                 if (inum)
1707                         return -EEXIST;
1708                 goto retry;
1709         }
1710         if (imni->mb) {
1711                 lafs_iolock_written(&imni->mb->b);
1712                 set_bit(B_PinPending, &imni->mb->b.flags);
1713                 lafs_iounlock_block(&imni->mb->b);
1714         }
1715         set_bit(B_PinPending, &b->b.flags);
1716         b->my_inode = NULL;
1717         imni->ib = b;
1718         return 0;
1719 }
1720
1721 static int
1722 inode_map_new_pin(struct inode_map_new_info *imni)
1723 {
1724         int err = 0;
1725         if (imni->mb)
1726                 err = lafs_pin_dblock(imni->mb, NewSpace);
1727         err = err ?: lafs_pin_dblock(imni->ib, NewSpace);
1728         return err;
1729 }
1730
1731 static void
1732 inode_map_new_commit(struct inode_map_new_info *imni)
1733 {
1734         unsigned long *buf;
1735
1736         if (imni->mb) {
1737                 int blksize = imni->ib->b.inode->i_sb->s_blocksize;
1738                 int bit = imni->ib->b.fileaddr & (blksize*8 - 1);
1739                 int hole = 0;
1740                 struct inode *ino = imni->mb->b.inode;
1741
1742                 mutex_lock_nested(&ino->i_mutex, I_MUTEX_QUOTA);
1743                 buf = map_dblock(imni->mb);
1744                 generic___clear_le_bit(bit, buf);
1745                 if (buf[blksize/sizeof(*buf)-1] == 0 &&
1746                     generic_find_next_le_bit(buf, blksize*8, 0) == blksize*8)
1747                         /* block is empty, punch a hole */
1748                         hole = 1;
1749
1750                 unmap_dblock(imni->mb, buf);
1751                 if (hole)
1752                         lafs_erase_dblock(imni->mb);
1753                 else
1754                         lafs_dirty_dblock(imni->mb);
1755
1756                 putdref(imni->mb, MKREF(cfi_map));
1757                 mutex_unlock(&ino->i_mutex);
1758                 iput(ino);
1759         }
1760         putdref(imni->ib, MKREF(cfi_ino));
1761 }
1762
1763 static void
1764 inode_map_new_abort(struct inode_map_new_info *imni)
1765 {
1766         if (imni->ib) {
1767                 clear_bit(B_Claimed, &imni->ib->b.flags);
1768                 clear_bit(B_PinPending, &imni->ib->b.flags);
1769                 lafs_orphan_release(fs_from_inode(imni->ib->b.inode),
1770                                     imni->ib);
1771         }
1772         putdref(imni->ib, MKREF(cfi_ino));
1773         if (imni->mb) {
1774                 struct inode *ino = imni->mb->b.inode;
1775                 putdref(imni->mb, MKREF(cfi_map));
1776                 iput(ino);
1777         }
1778 }
1779
1780 struct inode *
1781 lafs_new_inode(struct fs *fs, struct inode *fsys, struct inode *dir,
1782                int type, int inum, int mode, struct datablock **inodbp)
1783 {
1784         /* allocate and instantiate a new inode.  If inum is zero,
1785          * choose any free number; otherwise we are creating a special inode
1786          * and have to use the given number.
1787          * This creation is committed independently of any name that might
1788          * subsequently be given to the inode.  So we register it as an
1789          * orphan so that it will be cleaned up if the name isn't
1790          * successfully created.
1791          *
1792          */
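        /* Illustrative only: a caller creating an ordinary file would
         * normally pass inum == 0 and let us choose the number, e.g.
         *   ino = lafs_new_inode(fs, fsys, dir, TypeFile, 0, mode, NULL);
         * while the special low-numbered internal files pass an explicit
         * inum.
         */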
1793         struct inode *ino;
1794         struct datablock *b;
1795         struct inode_map_new_info imni;
1796         struct update_handle ui;
1797         int err;
1798
1799         err = inode_map_new_prepare(fs, inum, fsys, &imni);
1800         err = lafs_cluster_update_prepare(&ui, fs, sizeof(struct la_inode))
1801                 ?: err;
1802         if (err == 0)
1803                 err = lafs_make_orphan(fs, imni.ib);
1804         if (err)
1805                 goto abort;
1806 retry:
1807         lafs_checkpoint_lock(fs);
1808
1809         err = inode_map_new_pin(&imni);
1810
1811         if (err == -EAGAIN) {
1812                 lafs_checkpoint_unlock_wait(fs);
1813                 goto retry;
1814         }
1815         if (err < 0)
1816                 goto abort_unlock;
1817
1818         b = getdref(imni.ib, MKREF(inode_new));
1819
1820         lafs_iolock_block(&b->b); /* make sure we don't race with the cleaner
1821                                    * and zero this inode while trying to load it
1822                                    */
1823         lafs_inode_init(b, type, mode, dir);
1824         lafs_iounlock_block(&b->b);
1825
1826         inode_map_new_commit(&imni);
1827         ino = lafs_iget(fsys, b->b.fileaddr, SYNC);
1828         if (IS_ERR(ino)) {
1829                 lafs_cluster_update_abort(&ui);
1830                 LAFS_BUG(1, &b->b);
1831         } else
1832                 lafs_cluster_update_commit(&ui, b, 0,
1833                                            LAFSI(ino)->metadata_size);
1834         LAFS_BUG(LAFSI(ino)->dblock != b, &b->b);
1835         LAFS_BUG(b->my_inode != ino, &b->b);
1836         lafs_checkpoint_unlock(fs);
1837
1838         if (inodbp)
1839                 *inodbp = b;
1840         else
1841                 putdref(b, MKREF(inode_new));
1842         return ino;
1843
1844 abort_unlock:
1845         lafs_checkpoint_unlock(fs);
1846         err = -ENOSPC;
1847 abort:
1848         inode_map_new_abort(&imni);
1849         lafs_cluster_update_abort(&ui);
1850         dprintk("After abort %d: %s\n", err, strblk(&imni.ib->b));
1851         return ERR_PTR(err);
1852 }
1853
1854 static int inode_map_free(struct fs *fs, struct inode *fsys, u32 inum)
1855 {
1856         struct inode *im = lafs_iget(fsys, 1, SYNC);
1857         int bit;
1858         unsigned long *buf;
1859         struct datablock *b;
1860         u32 bnum;
1861         int err;
1862
1863         mutex_lock_nested(&im->i_mutex, I_MUTEX_QUOTA);
1864
1865         bnum = inum >> (3 + fs->blocksize_bits);
1866         bit = inum - (bnum << (3 + fs->blocksize_bits));
1867         b = lafs_get_block(im, bnum, NULL, GFP_KERNEL, MKREF(inode_map_free));
1868         if (!b) {
1869                 mutex_unlock(&im->i_mutex);
1870                 iput(im);
1871                 return -ENOMEM;
1872         }
1873         err = lafs_read_block(b);
1874         if (err) {
1875                 putdref(b, MKREF(inode_map_free));
1876                 mutex_unlock(&im->i_mutex);
1877                 iput(im);
1878                 return err;
1879         }
1880         lafs_iolock_written(&b->b);
1881         set_bit(B_PinPending, &b->b.flags);
1882         lafs_iounlock_block(&b->b);
1883 retry:
1884         lafs_checkpoint_lock(fs);
1885         err = lafs_pin_dblock(b, ReleaseSpace);
1886         if (err == -EAGAIN) {
1887                 lafs_checkpoint_unlock_wait(fs);
1888                 goto retry;
1889         }
1890         BUG_ON(err < 0);
1891         buf = map_dblock(b);
1892         generic___set_le_bit(bit, buf);
1893         unmap_dblock(b, buf);
1894         lafs_dirty_dblock(b);
1895         putdref(b, MKREF(inode_map_free));
1896         lafs_checkpoint_unlock(fs);
1897         mutex_unlock(&im->i_mutex);
1898         iput(im);
1899         return 0;
1900 }
1901
1902 int lafs_inode_inuse(struct fs *fs, struct inode *fsys, u32 inum)
1903 {
1904         /* This is used during roll-forward to register a newly created
1905          * inode in the inode map
1906          */
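        /* As in inode_map_new_commit(), clearing the bit marks the number
         * as in use.  If the map has to grow, the brand-new block is first
         * filled with ones ("all free") before the single bit is cleared.
         */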
1907         struct inode *im = lafs_iget(fsys, 1, SYNC);
1908         int bit;
1909         unsigned long *buf;
1910         struct datablock *b;
1911         u32 bnum;
1912         int err;
1913
1914         mutex_lock_nested(&im->i_mutex, I_MUTEX_QUOTA);
1915
1916         bnum = inum >> (3 + fs->blocksize_bits);
1917         bit = inum - (bnum << (3 + fs->blocksize_bits));
1918         if (bnum > LAFSI(im)->md.inodemap.size) {
1919                 /* inum is unbelievably big */
1920                 mutex_unlock(&im->i_mutex);
1921                 iput(im);
1922                 return -EINVAL;
1923         }
1924         b = lafs_get_block(im, bnum, NULL, GFP_KERNEL, MKREF(inode_map_free));
1925         if (!b) {
1926                 mutex_unlock(&im->i_mutex);
1927                 iput(im);
1928                 return -ENOMEM;
1929         }
1930
1931         err = lafs_read_block(b);
1932         if (err) {
1933                 putdref(b, MKREF(inode_map_free));
1934                 mutex_unlock(&im->i_mutex);
1935                 iput(im);
1936                 return err;
1937         }
1938
1939         lafs_iolock_written(&b->b);
1940         set_bit(B_PinPending, &b->b.flags);
1941         lafs_iounlock_block(&b->b);
1942 retry:
1943         lafs_checkpoint_lock(fs);
1944         err = lafs_pin_dblock(b, CleanSpace);
1945         if (err == -EAGAIN) {
1946                 lafs_checkpoint_unlock_wait(fs);
1947                 goto retry;
1948         }
1949         BUG_ON(err < 0);
1950         buf = map_dblock(b);
1951         if (bnum == LAFSI(im)->md.inodemap.size) {
1952                 /* need to add a new block to the file */
1953                 memset(buf, 0xff, fs->blocksize);
1954                 LAFSI(im)->md.inodemap.size = bnum + 1;
1955                 lafs_dirty_inode(im);
1956         }
1957         generic___clear_le_bit(bit, buf);
1958         unmap_dblock(b, buf);
1959         lafs_dirty_dblock(b);
1960         putdref(b, MKREF(inode_map_free));
1961         lafs_checkpoint_unlock(fs);
1962         mutex_unlock(&im->i_mutex);
1963         iput(im);
1964         return 0;
1965 }
1966
1967
1968
1969 int lafs_setattr(struct dentry *dentry, struct iattr *attr)
1970 {
1971         int err;
1972         struct inode *ino = dentry->d_inode;
1973         struct fs *fs = fs_from_inode(ino);
1974         struct datablock *db;
1975
1976         err = inode_change_ok(ino, attr);
1977         db = lafs_inode_dblock(ino, SYNC, MKREF(setattr));
1978         if (IS_ERR(db))
1979                 err = PTR_ERR(db);
1980         if (err)
1981                 return err;
1982
1983         /* We don't need iolock_written here as we don't
1984          * actually change the inode block yet
1985          */
1986         lafs_iolock_block(&db->b);
1987         set_bit(B_PinPending, &db->b.flags);
1988         lafs_iounlock_block(&db->b);
1989
1990         /* FIXME quota stuff */
1991
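        /* Pin the inode's data block before applying the change.  As
         * elsewhere in this file, -EAGAIN from lafs_pin_dblock() is handled
         * by waiting for the current checkpoint to complete and retrying.
         */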
1992 again:
1993         lafs_checkpoint_lock(fs);
1994         err = lafs_pin_dblock(db, ReleaseSpace);
1995         if (err == -EAGAIN) {
1996                 lafs_checkpoint_unlock_wait(fs);
1997                 goto again;
1998         }
1999
2000         if (!err) {
2001                 if ((attr->ia_valid & ATTR_SIZE) &&
2002                     attr->ia_size != i_size_read(ino))
2003                         truncate_setsize(ino, attr->ia_size);
2004                 setattr_copy(ino, attr);
2005                 mark_inode_dirty(ino);
2006
2007                 lafs_dirty_dblock(db);
2008         }
2009         clear_bit(B_PinPending, &db->b.flags);
2010         putdref(db, MKREF(setattr));
2011         lafs_checkpoint_unlock(fs);
2012
2013         return err;
2014 }
2015
2016 void lafs_truncate(struct inode *ino)
2017 {
2018         /* Want to truncate this file.
2019          * i_size has already been changed, and the address space
2020          * has been cleaned up.
2021          * So just start the background truncate
2022          */
2023         struct fs *fs = fs_from_inode(ino);
2024         struct datablock *db = lafs_inode_dblock(ino, SYNC, MKREF(trunc));
2025         loff_t trunc_block;
2026         DEFINE_WAIT(wq);
2027
2028         if (IS_ERR(db))
2029                 return;
2030
2031         trunc_block = ((i_size_read(ino) + fs->blocksize - 1)
2032                        >> fs->blocksize_bits);
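        /* trunc_block is i_size rounded up to whole blocks: with 4096-byte
         * blocks, for example, an i_size of 5000 gives trunc_block == 2,
         * the first block index wholly beyond the new size.
         */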
2033         /* We hold i_mutex, so regular orphan processing cannot
2034          * continue - we have to push it forward ourselves.
2035          */
2036         while (test_bit(I_Trunc, &LAFSI(ino)->iflags) &&
2037                LAFSI(ino)->trunc_next < trunc_block) {
2038                 prepare_to_wait(&fs->async_complete, &wq,
2039                                 TASK_UNINTERRUPTIBLE);
2040                 lafs_inode_handle_orphan(db);
2041                 if (test_bit(B_Orphan, &db->b.flags))
2042                         schedule();
2043         }
2044         finish_wait(&fs->async_complete, &wq);
2045
2046         /* There is nothing we can do about errors here.  The
2047          * most likely is ENOMEM, which is itself very unlikely.
2048          * If this doesn't get registered as an orphan .... maybe
2049          * it will have to wait until something else truncates it.
2050          */
2051         lafs_make_orphan(fs, db);
2052
2053         if (!test_and_set_bit(I_Trunc, &LAFSI(ino)->iflags))
2054                 lafs_igrab_fs(ino);
2055         if (trunc_block == 0)
2056                 LAFSI(ino)->trunc_gen++;
2057         LAFSI(ino)->trunc_next = trunc_block;
2058         putdref(db, MKREF(trunc));
2059 }
2060
2061 const struct inode_operations lafs_special_ino_operations = {
2062         .setattr        = lafs_setattr,
2063         .getattr        = lafs_getattr,
2064         .truncate       = lafs_truncate,
2065 };