1
2 /*
3  * fs/lafs/inode.c
4  * Copyright (C) 2005-2009
5  * Neil Brown <neilb@suse.de>
6  * Released under the GPL, version 2
7  *
8  * generic inode handling
9  *
10  */
11
12 #include        "lafs.h"
13 #include <linux/random.h>
14 #include <linux/delay.h>
15 #include <linux/slab.h>
16
17 /* Supporting an async 'iget' - as required by the cleaner -
18  * is slightly non-trivial.
19  * iget*_locked will normally wait for any inode with one
20  * of the flags I_FREEING I_CLEAR I_WILL_FREE I_NEW
21  * to either be unhashed or have the flag cleared.
22  * We cannot afford that wait in the cleaner as we could deadlock.
23  * So we use iget5_locked and provide a test function that fails
24  * if it finds the inode with any of those flags set.
25  * If it does see the inode in that state, it clears the inum
26  * that is passed in (by reference) so that it knows to continue
27  * failing (for consistency) and so that the 'set' function
28  * we provide knows to fail the 'set'.
29  * The result of this is that if iget finds an inode it would
30  * have to wait on, the inum is cleared and NULL is returned.
31  * An unfortunate side effect is that an inode will be allocated
32  * and then destroyed to no avail.
33  * This is avoided by calling ilookup5 first.  This also allows
34  * us to only allocate/load the data block if there really seems
35  * to be a need.
36  */
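/* Illustration (a minimal sketch, not part of LaFS proper): an async
 * caller such as the cleaner would typically drive this as
 *
 *	struct inode *ino = lafs_iget(sb, inum, ASYNC);
 *	if (IS_ERR(ino)) {
 *		if (PTR_ERR(ino) == -EAGAIN)
 *			;	// busy - requeue and retry later, don't block
 *		else
 *			;	// hard failure, e.g. -ENOMEM or -ENOENT
 *	} else {
 *		// ... use ino ..., then iput(ino)
 *	}
 *
 * The handling shown for -EAGAIN is an assumption about the caller's
 * policy; the function itself only promises not to block in that case.
 */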
37 #define NO_INO (~(ino_t)0)
38 static int async_itest(struct inode *inode, void *data)
39 {
40         ino_t *inump = data;
41         ino_t inum = *inump;
42
43         if (inum == NO_INO)
44                 /* found and is freeing */
45                 return 0;
46         if (inode->i_ino != inum)
47                 return 0;
48         if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) {
49                 *inump = NO_INO;
50                 return 0;
51         }
52         return 1;
53 }
54
55 static int async_iset(struct inode *inode, void *data)
56 {
57         ino_t *inump = data;
58         if (!*inump)
59                 return -EBUSY;
60         inode->i_ino = *inump;
61         return 0;
62 }
63
64 struct inode *
65 lafs_iget(struct super_block *sb, ino_t inum, int async)
66 {
67         /* find, and load if needed, this inum */
68         struct inode *ino = NULL;
69         struct inode *oldino;
70         struct datablock *b = NULL;
71         struct inode *inodefile;
72         struct sb_key *k;
73         int err = 0;
74
75         BUG_ON(inum == NO_INO);
76
77         k = sb->s_fs_info;
78         inodefile = k->root;
79
80         if (async) {
81                 /* We cannot afford to block on 'freeing_inode',
82                  * so use iget5_locked and refuse to match such
83                  * inodes.
84                  * If the inode is 'freeing', inum gets set to NO_INO.
85                  * ilookup5 is used first to avoid an unnecessary
86                  * alloc/free if the inode is locked in some way.
87                  */
88                 while (!ino) {
89                         ino_t inum2 = inum;
90                         err = 0;
91                         ino = ilookup5(sb, inum, async_itest, &inum2);
92                         if (ino)
93                                 break;
94
95                         if (inum2 == NO_INO)
96                                 err = -EAGAIN;
97
98                         /* For async we will always want the dblock loaded,
99                          * and we need to load it first as we cannot afford
100                          * to fail -EAGAIN once we have an I_NEW inode.
101                          */
102                         if (!b)
103                                 b = lafs_get_block(inodefile, inum, NULL,
104                                                    GFP_NOFS, MKREF(iget));
105                         if (!b)
106                                 return ERR_PTR(-ENOMEM);
107
108                         if (!err)
109                                 err = lafs_read_block_async(b);
110
111                         if (!err) {
112                                 /* Have the block, so safe to iget */
113                                 inum2 = inum;
114                                 ino = iget5_locked(sb, inum,
115                                                    async_itest, async_iset,
116                                                    &inum2);
117                                 if (!ino) {
118                                         if (inum2 == NO_INO)
119                                                 err = -EAGAIN;
120                                         else
121                                                 err = -ENOMEM;
122                                 }
123                         }
124                         if (err) {
125                                 if (test_and_set_bit(B_Async, &b->b.flags)) {
126                                         putdref(b, MKREF(iget));
127                                         return ERR_PTR(err);
128                                 }
129                                 getdref(b, MKREF(async));
130                         }
131                 }
132         } else
133                 ino = iget_locked(sb, inum);
134
135         if (!ino) {
136                 putdref(b, MKREF(iget));
137                 return ERR_PTR(-ENOMEM);
138         }
139
140         if (!(ino->i_state & I_NEW)) {
141                 putdref(b, MKREF(iget));
142                 if (ino->i_mode)
143                         return ino;
144                 iput(ino);
145                 return ERR_PTR(-ENOENT);
146         }
147
148         /* Need to load block 'inum' from an inode file...
149          */
150         if (!b) {
151                 b = lafs_get_block(inodefile, inum, NULL, GFP_KERNEL, MKREF(iget));
152                 if (!b)
153                         err = -ENOMEM;
154                 else
155                         err = lafs_read_block(b);
156         }
157         if (err)
158                 goto err;
159
160         oldino = rcu_my_inode(b);
161         if (oldino) {
162                 /* The inode is new, but the block thinks it has an
163                  * old inode, so we must be in the process of destroying
164                  * the old one.
165                  * So fail the lookup without even looking at the content
166                  * of the block (which might not be clear yet).
167                  */
168                 spin_lock(&oldino->i_data.private_lock);
169                 if (!test_bit(I_Deleting, &LAFSI(oldino)->iflags)) {
170                         b->my_inode = NULL;
171                         LAFSI(oldino)->dblock = NULL;
172                         LAFSI(oldino)->iblock = NULL;
173                 }
174                 spin_unlock(&oldino->i_data.private_lock);
175         }
176         rcu_iput(oldino);
177         if (b->my_inode) {
178                 err = -ENOENT;
179                 goto err;
180         }
181
182         err = lafs_import_inode(ino, b);
183         if (err) {
184                 if (err != -ENOENT)
185                         printk("lafs_import_inode failed %d\n", err);
186                 goto err;
187         }
188         unlock_new_inode(ino);
189 out:
190         if (b && test_and_clear_bit(B_Async, &b->b.flags)) {
191                 putdref(b, MKREF(async));
192                 lafs_wake_thread(fs_from_sb(sb));
193         }
194         putdref(b, MKREF(iget));
195         return ino;
196 err:
197         ino->i_nlink = 0;
198         unlock_new_inode(ino);
199         iput(ino);
200         ino = ERR_PTR(err);
201         goto out;
202 }
203
204 struct inode *
205 lafs_iget_fs(struct fs *fs, int fsnum, int inum, int async)
206 {
207         struct super_block *sb;
208         struct inode *rv;
209
210         sb = fs->prime_sb;
211
212         if (fsnum) {
213                 /* Need to locate or load the superblock for this
214                  * subordinate filesystem
215                  * FIXME
216                  */
217                 struct inode *filesys;
218
219                 printk("get filesys %d\n", fsnum);
220                 BUG();
221                 filesys = lafs_iget(sb, fsnum, async);
222                 if (IS_ERR(filesys))
223                         return filesys;
224                 if (filesys->i_sb == sb) {
225                         /* No mounted fs here.
226                          * Need to mount one FIXME
227                          */
228                         BUG();
229                         return ERR_PTR(-ENOENT);
230                 }
231                 printk("get inode %d\n", inum);
232                 rv = lafs_iget(filesys->i_sb, inum, async);
233                 iput(filesys);
234         } else {
235                 rv = lafs_iget(sb, inum, async);
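                /* Take an active reference on the superblock as well as
                 * the inode reference; the caller is expected to drop
                 * both when it has finished with them.
                 */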
236                 atomic_inc(&sb->s_active);
237         }
238         return rv;
239 }
240
241 int __must_check
242 lafs_import_inode(struct inode *ino, struct datablock *b)
243 {
244         struct la_inode *lai = map_dblock(b);
245         struct lafs_inode *li = LAFSI(ino);
246         int err = -ENOENT;
247
248         if (lai->filetype == 0) {
249                 li->type = 0;
250                 ino->i_mode = 0;
251                 ino->i_nlink = 0;
252                 goto out;
253         }
254
255         ino->i_mode = S_IFREG;
256         ino->i_nlink = 1; /* For special files, set nlink so they
257                            * never appear unlinked */
258
259         err = -EINVAL;
260
261         LAFS_BUG(ino->i_ino != b->b.fileaddr, &b->b);
262         li->cblocks = le32_to_cpu(lai->data_blocks);
263         li->pblocks = li->ablocks = 0;
264         li->vfs_inode.i_blocks = ((blkcnt_t)li->cblocks
265                                   << (ino->i_sb->s_blocksize_bits - 9));
266         li->ciblocks = le32_to_cpu(lai->index_blocks);
267         li->piblocks = 0;
268         li->iflags = 0;
269
270         ino->i_generation = le16_to_cpu(lai->generation);
271         li->trunc_gen = lai->trunc_gen;
272         li->flags = lai->flags;
273         li->type = lai->filetype;
274         li->metadata_size = le16_to_cpu(lai->metadata_size);
275         li->depth = lai->depth;
276
277         dprintk("inode %lu type is %d\n", (unsigned long)ino->i_ino, li->type);
278
279         ino->i_data.a_ops = &lafs_file_aops;
280         li->trunc_next = 0;
281
282         switch (li->type) {
283         case TypeInodeFile:
284         {
285                 struct fs_md *i = &li->md.fs;
286                 struct fs_metadata *l = &lai->metadata[0].fs;
287                 int nlen;
288
289                 i->usagetable = le16_to_cpu(l->snapshot_usage_table);
290                 decode_time(&ino->i_mtime, le64_to_cpu(l->update_time));
291                 i->cblocks_used = le64_to_cpu(l->blocks_used);
292                 i->pblocks_used = i->ablocks_used = 0;
293                 i->blocks_allowed = le64_to_cpu(l->blocks_allowed);
294                 i->blocks_unalloc = 0;
295                 i->creation_age = le64_to_cpu(l->creation_age);
296                 i->inodes_used = le32_to_cpu(l->inodes_used);
297                 i->quota_inums[0] = le32_to_cpu(l->quota_inodes[0]);
298                 i->quota_inums[1] = le32_to_cpu(l->quota_inodes[1]);
299                 i->quota_inums[2] = le32_to_cpu(l->quota_inodes[2]);
300                 i->quota_inodes[0] = i->quota_inodes[1]
301                         = i->quota_inodes[2] = NULL;
302                 nlen = li->metadata_size - offsetof(struct la_inode,
303                                                     metadata[0].fs.name);
304                 if (nlen == 0)
305                         i->name = NULL;
306                 else {
307                         /* Need to unmap the dblock before the kmalloc
308                          * because holding the mapping makes us 'atomic'
309                          */
310                         unmap_dblock(b, lai);
311                         i->name = kmalloc(nlen+1, GFP_KERNEL);
312                         lai = map_dblock(b);
313                         l = &lai->metadata[0].fs;
314
315                         err = -ENOMEM;
316                         if (!i->name)
317                                 goto out;
318                         memcpy(i->name, l->name, nlen);
319                         i->name[nlen] = 0;
320                 }
321                 /* Make this look like a directory */
322                 ino->i_mode = S_IFDIR;
323                 ino->i_uid = 0;
324                 ino->i_gid = 0;
325                 ino->i_size = 0;
326                 ino->i_op = &lafs_subset_ino_operations;
327                 ino->i_fop = &lafs_subset_file_operations;
328                 break;
329         }
330
331         case TypeInodeMap:
332         {
333                 struct inodemap_md *m = &li->md.inodemap;
334                 struct inodemap_metadata *s = &lai->metadata[0].inodemap;
335                 m->size = le32_to_cpu(s->size);
336                 m->thisblock = NoBlock;
337                 m->nextbit = 0;
338                 break;
339         }
340
341         case TypeSegmentMap:
342         {
343                 struct su_md *m = &li->md.segmentusage;
344                 struct su_metadata *s = &lai->metadata[0].segmentusage;
345                 m->table_size = le32_to_cpu(s->table_size);
346                 break;
347         }
348
349         case TypeQuota:
350         {
351                 struct quota_md *m = &li->md.quota;
352                 struct quota_metadata *s = &lai->metadata[0].quota;
353                 m->gracetime = le32_to_cpu(s->gracetime);
354                 m->graceunits = le32_to_cpu(s->graceunits);
355                 break;
356         }
357         case TypeOrphanList:
358         {
359                 struct orphan_md *m = &li->md.orphan;
360                 /* This will be set via lafs_count_orphans */
361                 m->nextfree = 0;
362                 m->reserved = 0;
363                 break;
364         }
365         case TypeAccessTime:
366                 break;
367
368         default: /* TypeBase or larger */
369         {
370                 struct file_md *i = &li->md.file;
371                 struct file_metadata *l = &lai->metadata[0].file;
372                 struct dir_metadata *d = &lai->metadata[0].dir;
373                 struct special_metadata *s = &lai->metadata[0].special;
374
375                 if (li->type < TypeBase)
376                         goto out;
377                 i->flags = le16_to_cpu(l->flags);
378                 ino->i_mode = le16_to_cpu(l->mode);
379                 ino->i_uid = le32_to_cpu(l->userid);
380                 ino->i_gid = le32_to_cpu(l->groupid);
381                 i->treeid = le32_to_cpu(l->treeid);
382                 i->creationtime = le64_to_cpu(l->creationtime);
383                 decode_time(&ino->i_mtime, le64_to_cpu(l->modifytime));
384                 decode_time(&ino->i_ctime, le64_to_cpu(l->ctime));
385                 decode_time(&i->i_accesstime, le64_to_cpu(l->accesstime));
386                 ino->i_atime = i->i_accesstime; /* FIXME load from
387                                                  * accesstime file */
388                 ino->i_size = le64_to_cpu(l->size);
389                 i->parent = le32_to_cpu(l->parent);
390                 ino->i_nlink = le32_to_cpu(l->linkcount);
391                 if (ino->i_nlink == 0 && list_empty(&b->orphans)) {
392                         /* This block should already be on the orphan
393                          * list, otherwise there is a filesystem
394                          * inconsistency.
395                          * Either the orphan file is wrong, or the
396                          * linkcount is wrong.
397                          * It is safest to assume the latter - either
398                          * way an FS check would be needed to fix it.
399                          */
400                         /* FIXME set a superblock flag requesting
401                          * directory linkage checking
402                          */
403                         ino->i_nlink = 1;
404                 }
405
406                 dprintk("  mode = 0%o uid %d size %lld\n",
407                         ino->i_mode, ino->i_uid, ino->i_size);
408                 switch (li->type) {
409                 case TypeFile:
410                         ino->i_op = &lafs_file_ino_operations;
411                         ino->i_fop = &lafs_file_file_operations;
412                         ino->i_mode = (ino->i_mode & 07777)  | S_IFREG;
413                         break;
414                 case TypeDir:
415                         i->seed = le32_to_cpu(d->hash_seed);
416                         ino->i_op = &lafs_dir_ino_operations;
417                         ino->i_fop = &lafs_dir_file_operations;
418                         ino->i_mode = (ino->i_mode & 07777)  | S_IFDIR;
419                         {
420                                 u32 *b = (u32 *)lai;
421                                 dprintk("Hmm. %d %d %d\n",
422                                         (int)b[24],
423                                         (int)b[25],
424                                         (int)b[26]);
425                         }
426                         break;
427                 case TypeSymlink:
428                         ino->i_op = &lafs_link_ino_operations;
429                         ino->i_mode = (ino->i_mode & 07777)  | S_IFLNK;
430                         break;
431                 case TypeSpecial:
432                         /* the data had better be in the inode ... */
433                         ino->i_rdev = MKDEV(le32_to_cpu(s->major),
434                                             le32_to_cpu(s->minor));
435                         ino->i_op = &lafs_special_ino_operations;
436                         init_special_inode(ino, ino->i_mode, ino->i_rdev);
437                         break;
438                 }
439                 break;
440         }
441         }
442
443         ino->i_blkbits = ino->i_sb->s_blocksize_bits;
444         /* FIXME i_blocks and i_byte - used for quota?? */
445         err = 0;
446
447         /* Note: no refcount is taken yet.  Either one will remove the
448          * reference to the other when freed.
449          */
450         li->dblock = b;
451         rcu_assign_pointer(b->my_inode, ino);
452
453 out:
454         if (err && li->type)
455                 printk("inode %lu type is %d\n",
456                        (unsigned long)ino->i_ino, li->type);
457         unmap_dblock(b, lai);
458         return err;
459 }
460
461 void lafs_inode_checkpin(struct inode *ino)
462 {
463         /* Make sure I_Pinned is set correctly.
464          * It should be set precisely when i_nlink is non-zero
465          * and ->iblock is B_Pinned.
466          * When it is set, we own a reference to the inode.
467          *
468          * This needs to be called whenever we change
469          * i_nlink, and whenever we pin or unpin an InoIdx
470          * block.
471          */
472         if (ino->i_nlink == 0) {
473                 /* I_Pinned should not be set */
474                 if (test_and_clear_bit(I_Pinned, &LAFSI(ino)->iflags))
475                         iput(ino);
476         } else {
477                 /* Need to check if iblock is Pinned. */
478                 struct indexblock *ib = NULL;
479                 if (LAFSI(ino)->iblock) {
480                         spin_lock(&ino->i_data.private_lock);
481                         ib = LAFSI(ino)->iblock;
482                         if (ib && !test_bit(B_Pinned, &ib->b.flags))
483                                 ib = NULL;
484                         spin_unlock(&ino->i_data.private_lock);
485                 }
486                 if (ib) {
487                         if (!test_and_set_bit(I_Pinned, &LAFSI(ino)->iflags))
488                                 igrab(ino);
489                 } else {
490                         if (test_and_clear_bit(I_Pinned, &LAFSI(ino)->iflags))
491                                 iput(ino);
492                 }
493         }
494 }
495
496 struct datablock *lafs_inode_get_dblock(struct inode *ino, REFARG)
497 {
498         struct datablock *db;
499
500         spin_lock(&ino->i_data.private_lock);
501         db = LAFSI(ino)->dblock;
502         if (db) {
503                 if (db->b.inode == ino)
504                         getdref_locked(db, REF);
505                 else {
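                        /* The dblock normally belongs to the inode file's
                         * mapping rather than to ino's own mapping, so take
                         * that other mapping's private_lock nested inside
                         * ours (lockdep subclass 1) while getting the ref.
                         */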
506                         spin_lock_nested(&db->b.inode->i_data.private_lock, 1);
507                         getdref_locked(db, REF);
508                         spin_unlock(&db->b.inode->i_data.private_lock);
509                 }
510         }
511         spin_unlock(&ino->i_data.private_lock);
512         return db;
513 }
514
515 struct datablock *lafs_inode_dblock(struct inode *ino, int async, REFARG)
516 {
517         struct datablock *db;
518         int err;
519
520         db = lafs_inode_get_dblock(ino, REF);
521
522         if (!db)
523                 db = lafs_get_block(ino_from_sb(ino->i_sb), ino->i_ino, NULL,
524                                     GFP_KERNEL, REF);
525         if (IS_ERR(db))
526                 return db;
527
528         LAFSI(ino)->dblock = db;
529         rcu_assign_pointer(db->my_inode, ino);
530         if (async)
531                 err = lafs_read_block_async(db);
532         else
533                 err = lafs_read_block(db);
534         if (err == 0)
535                 return db;
536
537         putdref(db, REF);
538         return ERR_PTR(err);
539 }
540
541 void lafs_inode_init(struct datablock *b, int type, int mode, struct inode *dir)
542 {
543         /* A new block has been allocated in an inode file to hold an
544          * inode.  We get to fill in initial values so that when
545          * 'iget' calls lafs_import_inode, the correct inode is
546          * loaded.
547          */
548         struct fs *fs = fs_from_inode(b->b.inode);
549         struct la_inode *lai = map_dblock(b);
550         int size;
551
552         lai->data_blocks = cpu_to_le32(0);
553         lai->index_blocks = cpu_to_le32(0);
554         get_random_bytes(&lai->generation, sizeof(lai->generation));
555         lai->depth = 1;
556         lai->trunc_gen = 0;
557         lai->filetype = type;
558         lai->flags = 0;
559
560         switch(type) {
561         case TypeInodeFile:
562         {
563                 struct fs_metadata *l = &lai->metadata[0].fs;
564                 size = sizeof(struct fs_metadata);
565                 l->update_time = 0;
566                 l->blocks_used = 0;
567                 l->blocks_allowed = 0;
568                 l->creation_age = fs->wc[0].cluster_seq;
569                 l->inodes_used = 0;
570                 l->quota_inodes[0] = 0;
571                 l->quota_inodes[1] = 0;
572                 l->quota_inodes[2] = 0;
573                 l->snapshot_usage_table = 0;
574                 l->pad = 0;
575                 /* name will be zero length and not used */
576                 break;
577         }
578         case TypeInodeMap:
579         {
580                 struct inodemap_metadata *l = &lai->metadata[0].inodemap;
581                 l->size = 0;
582                 size = sizeof(struct inodemap_metadata);
583                 break;
584         }
585         case TypeSegmentMap:
586                 size = sizeof(struct su_metadata);
587                 break;
588         case TypeQuota:
589                 size = sizeof(struct quota_metadata);
590                 break;
591         case TypeOrphanList:
592                 size = 0;
593                 break;
594         case TypeAccessTime:
595                 size = 0;
596                 break;
597         default:
598         {
599                 struct file_metadata *l = &lai->metadata[0].file;
600                 struct timespec now = CURRENT_TIME;
601
602                 l->flags = cpu_to_le16(0);
603                 l->userid = cpu_to_le32(current->cred->fsuid);
604                 if (dir && (dir->i_mode & S_ISGID)) {
605                         l->groupid = cpu_to_le32(dir->i_gid);
606                         if (type == TypeDir)
607                                 mode |= S_ISGID;
608                 } else
609                         l->groupid = cpu_to_le32(current->cred->fsgid);
610                 if (dir && LAFSI(dir)->md.file.treeid)
611                         l->treeid = cpu_to_le32(LAFSI(dir)->md.file.treeid);
612                 else
613                         l->treeid = l->userid;
614
615                 l->mode = cpu_to_le16(mode);
616                 l->creationtime = encode_time(&now);
617                 l->modifytime = l->creationtime;
618                 l->ctime = l->creationtime;
619                 l->accesstime = l->creationtime;
620                 l->size = 0;
621                 l->parent = dir ? cpu_to_le32(dir->i_ino) : 0;
622                 l->linkcount = 0;
623                 l->attrinode = 0;
624                 if (type == TypeDir) {
625                         struct dir_metadata *l = &lai->metadata[0].dir;
626                         u32 seed;
627                         get_random_bytes(&seed,
628                                          sizeof(seed));
629                         seed = (seed & ~7) | 1;
630                         l->hash_seed = cpu_to_le32(seed);
631                         size = sizeof(struct dir_metadata);
632                 } else if (type == TypeSpecial) {
633                         struct special_metadata *s = &lai->metadata[0].special;
634                         s->major = s->minor = 0;
635                         size = sizeof(struct special_metadata);
636                 } else
637                         size = sizeof(struct file_metadata);
638         }
639         }
640         size += sizeof(struct la_inode);
641         lai->metadata_size = cpu_to_le16(size);
642         memset(((char *)lai)+size, 0, fs->blocksize-size);
643         *(u16 *)(((char *)lai)+size) = cpu_to_le16(IBLK_EXTENT);
644
645         unmap_dblock(b, lai);
646         set_bit(B_Valid, &b->b.flags);
647         LAFS_BUG(!test_bit(B_Pinned, &b->b.flags), &b->b);
648         lafs_dirty_dblock(b);
649 }
650
651 void lafs_clear_inode(struct inode *ino)
652 {
653         struct lafs_inode *li = LAFSI(ino);
654         dprintk("CLEAR INODE %d\n", (int)ino->i_ino);
655
656         li->type = 0;
657
658         /* Now is a good time to break the linkage between
659          * inode and dblock - but not if the file is
660          * being deleted
661          */
662         if (!test_bit(I_Deleting, &LAFSI(ino)->iflags)) {
663                 struct datablock *db;
664                 spin_lock(&ino->i_data.private_lock);
665                 db = LAFSI(ino)->dblock;
666                 if (db) {
667                         struct indexblock *ib = LAFSI(ino)->iblock;
668                         LAFS_BUG(ib && atomic_read(&ib->b.refcnt), &db->b);
669                         db->my_inode = NULL;
670                         LAFSI(ino)->dblock = NULL;
671                         LAFSI(ino)->iblock = NULL;
672                 }
673                 spin_unlock(&ino->i_data.private_lock);
674         }
675
676         /* FIXME release quota inodes if filesystem */
677 }
678
679 static int inode_map_free(struct fs *fs, struct super_block *sb, u32 inum);
680
681 void lafs_delete_inode(struct inode *ino)
682 {
683         struct fs *fs = fs_from_inode(ino);
684         struct datablock *b;
685
686         if (ino->i_mode == 0) {
687                 /* There never was an inode here,
688                  * so nothing to do.
689                  */
690                 clear_inode(ino);
691                 return;
692         }
693         dprintk("DELETE INODE %d\n", (int)ino->i_ino);
694
695         /* Normal truncation holds an igrab, so we cannot be
696          * deleted until any truncation finishes
697          */
698         BUG_ON(test_bit(I_Trunc, &LAFSI(ino)->iflags));
699
700         b = lafs_inode_dblock(ino, SYNC, MKREF(delete_inode));
701
702         i_size_write(ino, 0);
703         truncate_inode_pages(&ino->i_data, 0);
704         LAFSI(ino)->trunc_next = 0;
705         set_bit(I_Trunc, &LAFSI(ino)->iflags);
706
707         set_bit(I_Deleting, &LAFSI(ino)->iflags);
708         if (!IS_ERR(b)) {
709                 set_bit(B_Claimed, &b->b.flags);
710                 lafs_add_orphan(fs, b);
711                 dprintk("PUNCH hole for %d\n", (int)b->b.fileaddr);
712                 putdref(b, MKREF(delete_inode));
713         }
714         inode_map_free(fs, ino->i_sb,  ino->i_ino);
715
716         clear_inode(ino);
717 }
718
719 static int prune(void *data, u32 addr, u64 paddr, int len)
720 {
721         /* This whole index block is being pruned, just account
722          * for everything and it will be cleared afterwards
723          */
724         struct indexblock *ib = data;
725         struct inode *ino = ib->b.inode;
726         struct fs *fs = fs_from_inode(ino);
727         int ph = !!test_bit(B_Phase1, &ib->b.flags);
728         int i;
729         dprintk("PRUNE %d for %d at %lld\n", addr, len, (long long)paddr);
730         if (paddr == 0 || len == 0)
731                 return 0;
732         for (i = 0 ; i < len ; i++)
733                 lafs_summary_update(fs, ino, paddr+i, 0, 0, ph, 0);
734         return len;
735 }
736
737 static int prune_some(void *data, u32 addr, u64 paddr, int len)
738 {
739         /* Part of this index block is being pruned.  Copy
740          * what addresses we can into uninc_table so that
741          * it can be 'incorporated'.
742          * We should probably share some code with
743          * lafs_allocated_block??
744          */
745         struct indexblock *ib = data;
746         struct inode *ino = ib->b.inode;
747         struct fs *fs = fs_from_inode(ino);
748         int ph = !!test_bit(B_Phase1, &ib->b.flags);
749         int i;
750
751         if (paddr == 0 || len == 0)
752                 return 0;
753         dprintk("PRUNE2 %d for %d at %lld\n", addr, len, (long long)paddr);
754         for (i = 0 ; i < len ; i++) {
755                 /* FIXME should allow longer truncation ranges in uninc_table
756                  * as they are easy to handle.
757                  */
758                 struct addr *a;
759                 if (addr + i < LAFSI(ino)->trunc_next)
760                         continue;
761                 spin_lock(&ino->i_data.private_lock);
762                 a = &ib->uninc_table.pending_addr
763                         [ib->uninc_table.pending_cnt - 1];
764                 if (ib->uninc_table.pending_cnt <
765                     ARRAY_SIZE(ib->uninc_table.pending_addr)) {
766                         a++;
767                         a->fileaddr = addr + i;
768                         a->physaddr = 0;
769                         a->cnt = 1;
770                         LAFS_BUG(!test_bit(B_Pinned, &ib->b.flags), &ib->b);
771                         ib->uninc_table.pending_cnt++;
772                 } else {
773                         spin_unlock(&ino->i_data.private_lock);
774                         break;
775                 }
776                 spin_unlock(&ino->i_data.private_lock);
777                 lafs_summary_update(fs, ino, paddr+i, 0, 0, ph, 0);
778         }
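        /* Return how many addresses were handled.  This can be fewer than
         * 'len' when uninc_table fills up; the remainder is presumably
         * dealt with on a later pass once the table has been incorporated.
         */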
779         return i;
780 }
781
782 int lafs_inode_handle_orphan(struct datablock *b)
783 {
784         /* Don't need rcu protection for my_inode; run_orphan
785          * holds a reference.
786          */
787         struct indexblock *ib, *ib2;
788         struct inode *ino = b->my_inode;
789         struct fs *fs = fs_from_inode(ino);
790         u32 trunc_next, next_trunc;
791         int loop_cnt = 20;
792         int err = -ENOMEM;
793
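        /* Apparent return convention (inferred from the error paths below):
         *   0            - the orphan has been fully dealt with
         *   -ERESTARTSYS - some progress was made; call again to continue
         *   -EAGAIN      - a needed lock could not be taken without
         *                  blocking; retry later
         *   -EBUSY       - a checkpoint is running; retry after it completes
         * Other negative errors are passed through from block
         * lookup/reservation.
         */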
794         if (!test_bit(I_Trunc, &LAFSI(ino)->iflags)) {
795                 if (test_bit(I_Deleting, &LAFSI(ino)->iflags)) {
796                         LAFS_BUG(ino->i_nlink, &b->b);
797                         if (LAFSI(ino)->cblocks +
798                             LAFSI(ino)->pblocks +
799                             LAFSI(ino)->ablocks +
800                             LAFSI(ino)->ciblocks +
801                             LAFSI(ino)->piblocks)
802                         printk("Deleting inode %lu: %ld+%ld+%ld %ld+%ld\n",
803                                ino->i_ino,
804                                LAFSI(ino)->cblocks,
805                                LAFSI(ino)->pblocks,
806                                LAFSI(ino)->ablocks,
807                                LAFSI(ino)->ciblocks,
808                                LAFSI(ino)->piblocks);
809                         BUG_ON(LAFSI(ino)->cblocks +
810                                LAFSI(ino)->pblocks +
811                                LAFSI(ino)->ablocks +
812                                LAFSI(ino)->ciblocks +
813                                LAFSI(ino)->piblocks);
814                         if (lafs_erase_dblock_async(b))
815                                 lafs_orphan_release(fs, b, NULL);
816                 } else if (ino->i_nlink || LAFSI(ino)->type == 0)
817                         lafs_orphan_release(fs, b, NULL);
818                 else
819                         lafs_orphan_forget(fs, b);
820                 return 0;
821         }
822
823         ib = lafs_make_iblock(ino, ADOPT, SYNC, MKREF(inode_handle_orphan));
824         if (IS_ERR(ib))
825                 return PTR_ERR(ib);
826
827         /* Here is the guts of 'truncate'.  We find the next leaf index
828          * block and discard all the addresses therein.
829          */
830         trunc_next = LAFSI(ino)->trunc_next;
831
832         if (trunc_next == 0xFFFFFFFF) {
833                 /* truncate has finished, in that all data blocks
834                  * have been removed and all index blocks are either
835                  * gone or pending incorporation, at which point they
836                  * will go.
837                  * If we hit a phase change, we will need to postpone
838                  * the rest of the cleaning until it completes.
839                  * If there is a checkpoint happening, it will do all
840                  * the work that we could do now.  So just
841                  * let it.
842                  */
843                 struct indexblock *tmp;
844                 struct indexblock *next;
845                 u32 lastaddr;
846
847                 if (!test_bit(B_Pinned, &ib->b.flags)) {
848                         /* must be finished */
849                         LAFS_BUG(test_bit(B_Dirty, &ib->b.flags), &ib->b);
850                         clear_bit(I_Trunc, &LAFSI(ino)->iflags);
851                         if (!test_bit(I_Deleting, &LAFSI(ino)->iflags))
852                                 iput(ino);
853                         wake_up(&fs->trunc_wait);
854                         err = -ERESTARTSYS;
855                         goto out2;
856                 }
857                 if (fs->checkpointing) {
858                         /* This cannot happen with current code,
859                          * but leave it in case we ever have
860                          * orphan handling parallel with checkpoints
861                          */
862                         err = -EBUSY; /* Try again after the checkpoint */
863                         goto out2;
864                 }
865
866                 lastaddr = (i_size_read(ino) +
867                             fs->blocksize - 1)
868                         >> fs->blocksize_bits;
869                 /* Find a Pinned descendant of ib which has no
870                  * Pinned descendants and no PrimaryRef dependent
871                  * (so take the last).
872                  * Prefer blocks that are beyond EOF (again, take the last).
873                  * If there are none, descend the last block that
874                  * is not after EOF and look at its children.
875                  */
876                 ib2 = next = ib;
877                 spin_lock(&ib->b.inode->i_data.private_lock);
878                 while (next) {
879                         ib2 = next;
880                         next = NULL;
881                         list_for_each_entry(tmp, &ib2->children, b.siblings) {
882                                 if (!test_bit(B_Index, &tmp->b.flags) ||
883                                     !test_bit(B_Pinned, &tmp->b.flags))
884                                         continue;
885                                 if (next == NULL ||
886                                     tmp->b.fileaddr > next->b.fileaddr)
887                                         next = tmp;
888                         }
889                 }
890                 if (ib2->b.fileaddr < lastaddr) {
891                         /* Must be all done */
892                         spin_unlock(&ib->b.inode->i_data.private_lock);
893                         clear_bit(I_Trunc, &LAFSI(ino)->iflags);
894                         if (!test_bit(I_Deleting, &LAFSI(ino)->iflags))
895                                 iput(ino);
896                         wake_up(&fs->trunc_wait);
897                         err = -ERESTARTSYS;
898                         goto out2;
899                 }
900                 getiref(ib2, MKREF(inode_handle_orphan2));
901                 spin_unlock(&ib->b.inode->i_data.private_lock);
902
903                 /* ib2 is an index block beyond EOF with no
904                  * Pinned children.
905                  * Incorporating it should unpin it.
906                  */
907                 if (!list_empty(&ib2->children)) {
908                         lafs_print_tree(&ib2->b, 3);
909                         LAFS_BUG(1, &ib2->b);
910                 }
911
912                 if (!lafs_iolock_written_async(&ib2->b)) {
913                         putiref(ib2, MKREF(inode_handle_orphan2));
914                         err = -EAGAIN;
915                         goto out2;
916                 }
917                 while (ib2->uninc_table.pending_cnt || ib2->uninc)
918                         lafs_incorporate(fs, ib2);
919
920                 if (test_bit(B_Dirty, &ib2->b.flags) ||
921                     test_bit(B_Realloc, &ib2->b.flags))
922                         lafs_cluster_allocate(&ib2->b, 0);
923                 else
924                         lafs_iounlock_block(&ib2->b);
925
926                 if (!list_empty(&ib2->b.siblings)) {
927                         printk("looping on %s\n", strblk(&ib2->b));
928                         loop_cnt--;
929                         if (loop_cnt < 0)
930                                 BUG();
931                 }
932                 putiref(ib2, MKREF(inode_handle_orphan2));
933                 err = -ERESTARTSYS;
934                 if (ib->uninc) {
935                         if (lafs_iolock_written_async(&ib->b)) {
936                                 while (ib->uninc)
937                                         lafs_incorporate(fs, ib);
938                                 lafs_iounlock_block(&ib->b);
939                         } else
940                                 err = -EAGAIN;
941                 }
942         out2:
943                 putiref(ib, MKREF(inode_handle_orphan));
944                 return err;
945         }
946
947         putiref(ib, MKREF(inode_handle_orphan));
948
949         ib = lafs_leaf_find(ino, trunc_next, ADOPT, &next_trunc,
950                             ASYNC, MKREF(inode_handle_orphan3));
951         if (IS_ERR(ib))
952                 return PTR_ERR(ib);
953         /* now hold an iolock on ib */
954
955         /* Ok, trunc_next seems to refer to a block that exists.
956          * We need to erase it.
957          *
958          * So we open up the index block ourselves, call
959          * lafs_summary_update with each block address, and then
960          * erase the block.
961          */
962
963         if (LAFSI(ino)->depth == 0) {
964                 /* Nothing to truncate */
965                 clear_bit(I_Trunc, &LAFSI(ino)->iflags);
966                 if (!test_bit(I_Deleting, &LAFSI(ino)->iflags))
967                         iput(ino);
968                 if (test_bit(B_Pinned, &ib->b.flags))
969                         /* Need to move the dirtiness which keeps this
970                          * pinned to the data block.
971                          */
972                         lafs_cluster_allocate(&ib->b, 0);
973                 else
974                         lafs_iounlock_block(&ib->b);
975                 err = -ERESTARTSYS;
976                 goto out_put;
977         }
978
979         lafs_checkpoint_lock(fs);
980         err = lafs_reserve_block(&ib->b, ReleaseSpace);
981         if (err < 0)
982                 goto out;
983
984         if (!test_bit(B_Valid, &ib->b.flags) &&
985             test_bit(B_InoIdx, &ib->b.flags)) {
986                 /* still invalid, just re-erase to remove
987                  * pinning */
988                 LAFSI(ino)->trunc_next = next_trunc;
989                 lafs_cluster_allocate(&ib->b, 0);
990                 err = -ERESTARTSYS;
991                 goto out_unlocked;
992         }
993
994         lafs_pin_block(&ib->b);
995
996         /* It might be that this can happen, in which case
997          * we simply update trunc_next and loop.  But I'd like
998          * to be sure before I implement that
999          */
1000         if (!test_bit(B_Valid, &ib->b.flags)) {
1001                 printk("Not Valid: %s\n", strblk(&ib->b));
1002                 printk("depth = %d\n", LAFSI(ino)->depth);
1003                 if (test_bit(B_InoIdx, &ib->b.flags))
1004                         printk("DB: %s\n", strblk(&LAFSI(ib->b.inode)->dblock->b));
1005                 LAFSI(ino)->trunc_next = next_trunc;
1006                 //BUG_ON(!test_bit(B_Valid, &ib->b.flags));
1007                 err = -ERESTARTSYS;
1008                 goto out;
1009         }
1010
1011         if (ib->b.fileaddr < trunc_next &&
1012             lafs_leaf_next(ib, 0) < trunc_next) {
1013                 /* We only want to truncate part of this index block.
1014                  * So we copy addresses into uninc_table and then
1015                  * call lafs_incorporate.
1016                  * This might cause the index tree to grow, so we
1017                  * cannot trust next_trunc
1018                  */
1019                 if (ib->uninc_table.pending_cnt == 0 &&
1020                     ib->uninc == NULL) {
1021                         lafs_dirty_iblock(ib, 0);
1022                         /* FIXME this just removes 8 blocks at a time,
1023                          * which is not enough
1024                          */
1025                         lafs_walk_leaf_index(ib, prune_some, ib);
1026                 }
1027                 if (test_bit(B_Dirty, &ib->b.flags))
1028                         lafs_incorporate(fs, ib);
1029                 err = -ERESTARTSYS;
1030                 goto out;
1031         }
1032         LAFSI(ino)->trunc_next = next_trunc;
1033
1034         while (ib->uninc_table.pending_cnt || ib->uninc) {
1035                 /* There should be no Realloc data blocks here
1036                  * but index blocks might still be Realloc.
1037                  */
1038                 LAFS_BUG(!test_bit(B_Dirty, &ib->b.flags) &&
1039                          !test_bit(B_Realloc, &ib->b.flags), &ib->b);
1040                 lafs_incorporate(fs, ib);
1041         }
1042         if (test_bit(B_InoIdx, &ib->b.flags) ||
1043             !test_bit(B_PhysValid, &ib->b.flags) ||
1044             ib->b.physaddr != 0) {
1045                 lafs_walk_leaf_index(ib, prune, ib);
1046                 lafs_clear_index(ib);
1047                 lafs_dirty_iblock(ib, 0);
1048         }
1049         if (test_bit(B_Dirty, &ib->b.flags))
1050                 lafs_incorporate(fs, ib);
1051         if (!list_empty(&ib->children))
1052                 lafs_print_tree(&ib->b, 2);
1053         LAFS_BUG(!list_empty(&ib->children), &ib->b);
1054         err = -ERESTARTSYS;
1055 out:
1056         lafs_iounlock_block(&ib->b);
1057 out_unlocked:
1058         lafs_checkpoint_unlock(fs);
1059 out_put:
1060         putiref(ib, MKREF(inode_handle_orphan3));
1061         return err;
1062 }
1063
1064 void lafs_dirty_inode(struct inode *ino)
1065 {
1066         /* this is called in one of three cases:
1067          * 1/ by lafs internally when dblock or iblock is pinned and
1068          *    ready to be dirtied
1069          * 2/ by writeout before requesting a write - to update mtime
1070          * 3/ by read to update atime
1071          *
1072          * As we don't know which, there is not much we can do.
1073          * We mustn't update the data block as it could be in
1074          * writeout and we cannot always wait safely.
1075          * So we require that anyone who really cares dirties the datablock
1076          * or a child themselves.
1077          * When cluster_allocate eventually gets called, it will update
1078          * the datablock from the inode.
1079          * If an update has to wait for the next phase, lock_dblock
1080          * (e.g. in setattr) will do that.
1081          *
1082          * We also use this opportunity to update the filesystem modify time.
1083          */
1084         struct timespec now;
1085         struct inode *filesys;
1086         set_bit(I_Dirty, &LAFSI(ino)->iflags);
1087         ino->i_sb->s_dirt = 1;
1088
1089         now = current_fs_time(ino->i_sb);
1090         filesys = ino_from_sb(ino->i_sb);
1091         if (!timespec_equal(&filesys->i_mtime, &now)) {
1092                 filesys->i_mtime = now;
1093                 set_bit(I_Dirty, &LAFSI(filesys)->iflags);
1094         }
1095 }
1096
1097 int lafs_sync_inode(struct inode *ino, int wait)
1098 {
1099         /* fsync has been called on this file so we need
1100          * to sync any inode updates to the next cluster.
1101          *
1102          * If we cannot create an update record,
1103          * we wait for a phase change, which writes everything
1104          * out.
1105          */
1106         struct datablock *b;
1107         struct fs *fs = fs_from_inode(ino);
1108         struct update_handle uh;
1109         int err;
1110
1111         if (wait) {
1112                 if (LAFSI(ino)->update_cluster > 1)
1113                         lafs_cluster_wait(fs, LAFSI(ino)->update_cluster);
1114                 if (LAFSI(ino)->update_cluster == 1) {
1115                         lafs_checkpoint_lock(fs);
1116                         lafs_checkpoint_unlock_wait(fs);
1117                 }
1118                 return 0;
1119         }
1120
1121         LAFSI(ino)->update_cluster = 0;
1122         if (!test_bit(I_Dirty, &LAFSI(ino)->iflags))
1123                 return 0;
1124         b = lafs_inode_dblock(ino, SYNC, MKREF(write_inode));
1125         if (IS_ERR(b))
1126                 return PTR_ERR(b);
1127
1128         lafs_iolock_written(&b->b);
1129         lafs_inode_fillblock(ino);
1130         lafs_iounlock_block(&b->b);
1131
1132         err = lafs_cluster_update_prepare(&uh, fs, LAFS_INODE_LOG_SIZE);
1133         if (err)
1134                 lafs_cluster_update_abort(&uh);
1135         else {
1136                 lafs_checkpoint_lock(fs);
1137                 if (lafs_cluster_update_pin(&uh) == 0) {
1138                         if (test_and_clear_bit(B_Dirty, &b->b.flags))
1139                                 lafs_space_return(fs, 1);
1140                         LAFSI(ino)->update_cluster =
1141                                 lafs_cluster_update_commit
1142                                 (&uh, b, LAFS_INODE_LOG_START,
1143                                  LAFS_INODE_LOG_SIZE);
1144                 } else  
1145                         lafs_cluster_update_abort(&uh);
1146                 lafs_checkpoint_unlock(fs);
1147         }
1148         if (test_bit(B_Dirty, &b->b.flags)) {
1149                 /* FIXME need to write out the data block...
1150                  * Is that just lafs_cluster_allocate ?
1151                  */
1152         }
1153
1154         if (LAFSI(ino)->update_cluster == 0) {
1155                 lafs_checkpoint_lock(fs);
1156                 if (test_bit(B_Dirty, &b->b.flags))
1157                         LAFSI(ino)->update_cluster = 1;
1158                 lafs_checkpoint_start(fs);
1159                 lafs_checkpoint_unlock(fs);
1160         }
1161         putdref(b, MKREF(write_inode));
1162         return 0; /* FIXME should I return some error message??? */
1163 }
1164
1165 void lafs_inode_fillblock(struct inode *ino)
1166 {
1167         /* copy data from ino into the related data block */
1168
1169         struct lafs_inode *li = LAFSI(ino);
1170         struct datablock *db = li->dblock;
1171         struct la_inode *lai;
1172
1173         clear_bit(I_Dirty, &LAFSI(ino)->iflags);
1174
1175         lai = map_dblock(db);
1176         lai->data_blocks = cpu_to_le32(li->cblocks);
1177         lai->index_blocks = cpu_to_le32(li->ciblocks);
1178         lai->generation = cpu_to_le16(ino->i_generation);
1179         lai->trunc_gen = li->trunc_gen;
1180         lai->flags = li->flags;
1181         lai->filetype = li->type;
1182         if (lai->metadata_size != cpu_to_le16(li->metadata_size)) {
1183                 /* Changing metadata size is weird.
1184                  * We will need to handle this somehow for xattrs.
1185                  * For now we just want to cope with
1186                  * Dir -> InodeFile changes, and that guarantees us
1187                  * there is no index info - so just clear the index 
1188                  * area.
1189                  */
1190                 u16 *s = (u16*)(((char*)lai) + li->metadata_size);
1191                 BUG_ON(li->type != TypeInodeFile);
1192                 lai->metadata_size = cpu_to_le16(li->metadata_size);
1193                 memset(s, 0, ino->i_sb->s_blocksize - li->metadata_size);
1194                 *s = cpu_to_le16(IBLK_INDIRECT);
1195         }
1196         lai->depth = li->depth;
1197
1198         switch (li->type) {
1199         case TypeInodeFile:
1200         {
1201                 struct fs_md *i = &li->md.fs;
1202                 struct fs_metadata *l = &lai->metadata[0].fs;
1203                 int nlen;
1204
1205                 l->snapshot_usage_table = cpu_to_le16(i->usagetable);
1206                 l->update_time = cpu_to_le64(encode_time(&ino->i_mtime));
1207                 l->blocks_used = cpu_to_le64(i->cblocks_used);
1208                 l->blocks_allowed = cpu_to_le64(i->blocks_allowed);
1209                 l->creation_age = cpu_to_le64(i->creation_age);
1210                 l->inodes_used = cpu_to_le32(i->inodes_used);
1211                 l->quota_inodes[0] = cpu_to_le32(i->quota_inums[0]);
1212                 l->quota_inodes[1] = cpu_to_le32(i->quota_inums[1]);
1213                 l->quota_inodes[2] = cpu_to_le32(i->quota_inums[2]);
1214                 nlen = li->metadata_size - offsetof(struct la_inode,
1215                                                      metadata[0].fs.name);
1216                 memset(l->name, 0, nlen);
1217                 if (i->name == NULL)
1218                         nlen = 0;
1219                 else if (strlen(i->name) < nlen)
1220                         nlen = strlen(i->name);
1221                 memcpy(l->name, i->name, nlen);
1222                 break;
1223         }
1224
1225         case TypeInodeMap:
1226         {
1227                 struct inodemap_md *m = &li->md.inodemap;
1228                 struct inodemap_metadata *s = &lai->metadata[0].inodemap;
1229                 s->size = cpu_to_le32(m->size);
1230                 break;
1231         }
1232
1233         case TypeSegmentMap:
1234         {
1235                 struct su_md *m = &li->md.segmentusage;
1236                 struct su_metadata *s = &lai->metadata[0].segmentusage;
1237                 s->table_size = cpu_to_le32(m->table_size);
1238                 break;
1239         }
1240
1241         case TypeQuota:
1242         {
1243                 struct quota_md *m = &li->md.quota;
1244                 struct quota_metadata *s = &lai->metadata[0].quota;
1245                 s->gracetime = cpu_to_le32(m->gracetime);
1246                 s->graceunits = cpu_to_le32(m->graceunits);
1247                 break;
1248         }
1249         case TypeOrphanList:
1250         case TypeAccessTime:
1251                 break;
1252
1253         default: /* TypeBase or larger */
1254         {
1255                 struct file_md *i = &li->md.file;
1256                 struct file_metadata *l = &lai->metadata[0].file;
1257                 struct dir_metadata *d = &lai->metadata[0].dir;
1258                 struct special_metadata *s = &lai->metadata[0].special;
1259
1260                 if (li->type < TypeBase)
1261                         break;
1262                 l->flags = cpu_to_le16(i->flags);
1263                 l->mode = cpu_to_le16(ino->i_mode);
1264                 l->userid = cpu_to_le32(ino->i_uid);
1265                 l->groupid = cpu_to_le32(ino->i_gid);
1266                 l->treeid = cpu_to_le32(i->treeid);
1267                 l->creationtime = cpu_to_le64(i->creationtime);
1268                 l->modifytime = cpu_to_le64(encode_time(&ino->i_mtime));
1269                 l->ctime = cpu_to_le64(encode_time(&ino->i_ctime));
1270                 l->accesstime = cpu_to_le64(encode_time(&ino->i_atime));
1271                 /* FIXME write 0 to accesstime file */
1272                 l->size = cpu_to_le64(ino->i_size);
1273                 l->parent = cpu_to_le32(i->parent);
1274                 l->linkcount = cpu_to_le32(ino->i_nlink);
1275
1276                 switch (li->type) {
1277                 case TypeFile:
1278                         break;
1279                 case TypeDir:
1280                         d->hash_seed = cpu_to_le32(i->seed);
1281                         break;
1282                 case TypeSymlink:
1283                         break;
1284                 case TypeSpecial:
1285                         s->major = cpu_to_le32(MAJOR(ino->i_rdev));
1286                         s->minor = cpu_to_le32(MINOR(ino->i_rdev));
1287                         break;
1288                 }
1289         }
1290         }
1291         unmap_dblock(db, lai);
1292 }
1293
1294 /*-----------------------------------------------------------------------
1295  * Inode allocation map handling.
1296  * Inode 1 of each fileset is a bitmap of free inode numbers.
1297  * Whenever the file is extended in size, new bits are set to one.  They
1298  * are then cleared when the inode is allocated.  When a block becomes
1299  * full of zeros, we don't need to store it any more.
1300  *
1301  * We don't clear the bit until we are committed to creating an inode.
1302  * This means we cannot clear it straight away, so two different threads
1303  * might see the same inode number as being available.  We have two
1304  * approaches to guard against this.
1305  * Firstly we have a 'current' pointer into the inodemap file and
1306  * increase that past the inode we return.  This discourages multiple
1307  * hits but as the pointer would need to be rewound occasionally it
1308  * isn't a guarantee.  The guarantee against multiple allocations is done
1309  * via a flag in the block representing an inode.  This is set
1310  * while an inode is being allocated.
1311  */
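/* Worked example (illustrative only, assuming the natural linear mapping
 * of bits to inode numbers): with a 4096-byte block there are 4096*8 =
 * 32768 bits per inodemap block, so bit 7 of block 2 would describe inum
 * 2*32768 + 7 = 65543.  The bit is 1 while that inum is free and is
 * cleared only once an inode has actually been created there.
 */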
1312
1313 /* inode number allocation has the prealloc/pin/commit/abort structure
1314  * so it can be committed effectively
1315  */
1316
1317 static int
1318 choose_free_inum(struct fs *fs, struct super_block *sb, u32 *inump,
1319                  struct datablock **bp, int *restarted)
1320 {
1321         struct inode *im = lafs_iget(sb, 1, SYNC);
1322         loff_t bnum;
1323         struct datablock *b;
1324         char *buf;
1325         int err;
1326         int bit;
1327
1328         if (*bp) {
1329                 struct inode *i = (*bp)->b.inode;
1330                 putdref(*bp, MKREF(cfi_map));
1331                 iput(i);
1332                 *bp = NULL;
1333         }
1334
1335         mutex_lock_nested(&im->i_mutex, I_MUTEX_QUOTA);
1336 retry:
1337         bnum = LAFSI(im)->md.inodemap.thisblock;
1338
1339         if (bnum == NoBlock ||
1340             LAFSI(im)->md.inodemap.nextbit >= (fs->blocksize<<3)) {
1341                 if (bnum == NoBlock)
1342                         bnum = LAFSI(im)->md.inodemap.size;
1343
1344                 if (bnum+1 < LAFSI(im)->md.inodemap.size)
1345                         bnum++;
1346                 else if (!*restarted) {
1347                         bnum = 0;
1348                         *restarted = 1;
1349                 } else {
1350                         /* Need to add a new block to the file */
1351                         bnum = LAFSI(im)->md.inodemap.size;
1352                         b = lafs_get_block(im, bnum, NULL, GFP_KERNEL,
1353                                            MKREF(cfi_map));
1354                         err = -ENOMEM;
1355                         if (!b)
1356                                 goto abort;
1357                         lafs_iolock_written(&b->b);
1358                         set_bit(B_PinPending, &b->b.flags);
1359                         lafs_iounlock_block(&b->b);
1360                 retry2:
1361                         lafs_checkpoint_lock(fs);
1362                         err = lafs_pin_dblock(b, NewSpace);
1363                         if (err == -EAGAIN) {
1364                                 lafs_checkpoint_unlock_wait(fs);
1365                                 goto retry2;
1366                         }
1367                         if (err < 0)
1368                                 goto abort_unlock;
1369
1370                         buf = map_dblock(b);
1371                         /* Set block to "all are free" */
1372                         memset(buf, 0xff, fs->blocksize);
1373                         unmap_dblock(b, buf);
1374                         set_bit(B_Valid, &b->b.flags);
1375                         LAFSI(im)->md.inodemap.size = bnum+1;
1376                         lafs_dirty_inode(im);
1377                         lafs_dirty_dblock(b);
1378                         lafs_checkpoint_unlock(fs);
1379                         putdref(b, MKREF(cfi_map));
1380                 }
1381                 b = NULL;
1382                 err = lafs_find_next(im, &bnum);
1383                 if (err < 0)
1384                         goto abort;
1385                 if (err == 0)
1386                         bnum = 0;
1387
1388                 LAFSI(im)->md.inodemap.nextbit = 0;
1389                 LAFSI(im)->md.inodemap.thisblock = bnum;
1390                 goto retry;
1391         }
1392         b = lafs_get_block(im, bnum, NULL, GFP_KERNEL, MKREF(cfi_map));
1393         err = -ENOSPC;
1394         if (!b)
1395                 goto abort;
1396         err = lafs_find_block(b, NOADOPT);
1397         if (err)
1398                 goto abort;
1399         if (b->b.physaddr == 0 && !test_bit(B_Valid, &b->b.flags)) {
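                /* An unstored block means "all bits zero", i.e. no free
                 * inums here, so move straight on to the next block.
                 */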
1400                 LAFSI(im)->md.inodemap.nextbit =
1401                         (fs->blocksize<<3) + 1;
1402                 putdref(b, MKREF(cfi_map));
1403                 goto retry;
1404         }
1405         err = lafs_read_block(b);
1406         if (err)
1407                 goto abort;
1408
1409         bit = LAFSI(im)->md.inodemap.nextbit;
1410         LAFSI(im)->md.inodemap.thisblock = bnum;
1411         buf = map_dblock(b);
1412         while (bnum == 0 && bit < 16) {
1413                 /* Never return an inum below 16 - they are special */
1414                 if (!generic_test_le_bit(bit, (unsigned long *)buf))
1415                         generic___clear_le_bit(bit, (unsigned long *)buf);
1416                 bit++;
1417         }
1418
1419         bit = generic_find_next_le_bit((unsigned long *)buf,
1420                                        fs->blocksize<<3, bit);
1421         unmap_dblock(b, buf);
1422         LAFSI(im)->md.inodemap.nextbit = bit+1;
1423         if (bit >= fs->blocksize<<3) {
1424                 putdref(b, MKREF(cfi_map));
1425                 goto retry;
1426         }
1427         mutex_unlock(&im->i_mutex);
1428         *bp = b;
1429         *inump = bit + (bnum << (im->i_blkbits + 3));
1430         return 0;
1431
1432 abort_unlock:
1433         lafs_checkpoint_unlock(fs);
1434 abort:
1435         putdref(b, MKREF(cfi_map));
1436         *bp = NULL;
1437         mutex_unlock(&im->i_mutex);
1438         iput(im);
1439         return err;
1440 }
1441
1442 struct inode_map_new_info {
1443         struct datablock *ib, *mb;
1444 };
1445
1446 static int
1447 inode_map_new_prepare(struct fs *fs, int inum, struct super_block *sb,
1448                       struct inode_map_new_info *imni)
1449 {
1450         u32 choice = inum;
1451         int restarted = 0;
1452         int err = 0;
1453         struct datablock *b;
1454
1455         imni->ib = imni->mb = NULL;
1456 retry:
1457         if (inum == 0)
1458                 /* choose a possibly-free inode number */
1459                 err = choose_free_inum(fs, sb, &choice,
1460                                        &imni->mb, &restarted);
1461         if (err)
1462                 return err;
1463
1464         b = lafs_get_block(ino_from_sb(sb), choice, NULL, GFP_KERNEL,
1465                            MKREF(cfi_ino));
1466         if (!b)
1467                 return -ENOMEM;
1468
1469         if (test_and_set_bit(B_Claimed, &b->b.flags)) {
1470                 putdref(b, MKREF(cfi_ino));
1471                 if (inum)
1472                         return -EEXIST;
1473                 goto retry;
1474         }
1475         if (imni->mb) {
1476                 lafs_iolock_written(&imni->mb->b);
1477                 set_bit(B_PinPending, &imni->mb->b.flags);
1478                 lafs_iounlock_block(&imni->mb->b);
1479         }
1480         set_bit(B_PinPending, &b->b.flags);
1481         b->my_inode = NULL;
1482         imni->ib = b;
1483         return 0;
1484 }
1485
1486 static int
1487 inode_map_new_pin(struct inode_map_new_info *imni)
1488 {
1489         int err = 0;
1490         if (imni->mb)
1491                 err = lafs_pin_dblock(imni->mb, NewSpace);
1492         err = err ?: lafs_pin_dblock(imni->ib, NewSpace);
1493         return err;
1494 }
1495
1496 static void
1497 inode_map_new_commit(struct inode_map_new_info *imni)
1498 {
1499         unsigned long *buf;
1500
1501         if (imni->mb) {
1502                 int blksize = imni->ib->b.inode->i_sb->s_blocksize;
1503                 int bit = imni->ib->b.fileaddr & (blksize*8 - 1);
1504                 int hole = 0;
1505                 struct inode *ino = imni->mb->b.inode;
1506
1507                 mutex_lock_nested(&ino->i_mutex, I_MUTEX_QUOTA);
1508                 buf = map_dblock(imni->mb);
1509                 generic___clear_le_bit(bit, buf);
1510                 if (buf[blksize/sizeof(*buf)-1] == 0 &&
1511                     generic_find_next_le_bit(buf, blksize*8, 0) == blksize*8)
1512                         /* block is empty, punch a hole */
1513                         hole = 1;
1514
1515                 unmap_dblock(imni->mb, buf);
1516                 if (hole)
1517                         lafs_erase_dblock(imni->mb);
1518                 else
1519                         lafs_dirty_dblock(imni->mb);
1520
1521                 putdref(imni->mb, MKREF(cfi_map));
1522                 mutex_unlock(&ino->i_mutex);
1523                 iput(ino);
1524         }
1525         putdref(imni->ib, MKREF(cfi_ino));
1526 }
1527
1528 static void
1529 inode_map_new_abort(struct inode_map_new_info *imni)
1530 {
1531         if (imni->ib) {
1532                 clear_bit(B_Claimed, &imni->ib->b.flags);
1533                 clear_bit(B_PinPending, &imni->ib->b.flags);
1534                 lafs_orphan_release(fs_from_inode(imni->ib->b.inode),
1535                                     imni->ib, NULL);
1536         }
1537         putdref(imni->ib, MKREF(cfi_ino));
1538         if (imni->mb) {
1539                 struct inode *ino = imni->mb->b.inode;
1540                 putdref(imni->mb, MKREF(cfi_map));
1541                 iput(ino);
1542         }
1543 }
1544
1545 struct inode *
1546 lafs_new_inode(struct fs *fs, struct super_block *sb, struct inode *dir,
1547                int type, int inum, int mode, struct datablock **inodbp)
1548 {
1549         /* allocate and instantiate a new inode.  If inum is zero,
1550          * choose any free number; otherwise we are creating a special
1551          * inode and have to use the given number.
1552          * This creation is committed independently of any name that might
1553          * subsequently be given to the inode.  So we register it as an
1554          * orphan so that it will be cleaned up if the name isn't
1555          * successfully created.
1556          *
1557          */
1558         struct inode *ino;
1559         struct datablock *b;
1560         struct inode_map_new_info imni;
1561         struct update_handle ui;
1562         int err;
1563
1564         err = inode_map_new_prepare(fs, inum, sb, &imni);
1565         err = lafs_cluster_update_prepare(&ui, fs, sizeof(struct la_inode))
1566                 ?: err;
1567         if (err == 0)
1568                 err = lafs_make_orphan(fs, imni.ib, NULL);
1569         if (err)
1570                 goto abort;
1571 retry:
1572         lafs_checkpoint_lock(fs);
1573
1574         err = inode_map_new_pin(&imni);
1575
1576         if (err == -EAGAIN) {
1577                 lafs_checkpoint_unlock_wait(fs);
1578                 goto retry;
1579         }
1580         if (err < 0)
1581                 goto abort_unlock;
1582
1583         b = getdref(imni.ib, MKREF(inode_new));
1584
1585         lafs_iolock_block(&b->b); /* make sure we don't race with the cleaner
1586                                    * and zero this inode while trying to load it
1587                                    */
1588         lafs_inode_init(b, type, mode, dir);
1589         lafs_iounlock_block(&b->b);
1590
1591         inode_map_new_commit(&imni);
1592         ino = lafs_iget(sb, b->b.fileaddr, SYNC);
1593         if (IS_ERR(ino)) {
1594                 lafs_cluster_update_abort(&ui);
1595                 LAFS_BUG(1, &b->b);
1596         } else
1597                 lafs_cluster_update_commit(&ui, b, 0,
1598                                            LAFSI(ino)->metadata_size);
1599         LAFS_BUG(LAFSI(ino)->dblock != b, &b->b);
1600         LAFS_BUG(b->my_inode != ino, &b->b);
1601         lafs_checkpoint_unlock(fs);
1602
1603         if (inodbp)
1604                 *inodbp = b;
1605         else
1606                 putdref(b, MKREF(inode_new));
1607         return ino;
1608
1609 abort_unlock:
1610         lafs_checkpoint_unlock(fs);
1611         err = -ENOSPC;
1612 abort:
1613         inode_map_new_abort(&imni);
1614         lafs_cluster_update_abort(&ui);
1615         dprintk("After abort %d: %s\n", err, strblk(&imni.ib->b));
1616         return ERR_PTR(err);
1617 }
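
/* Hypothetical caller sketch (not part of LaFS): create an ordinary file
 * inode, letting the filesystem choose the inode number.  'TypeFile' and
 * the lafs_new_inode() calling convention are taken from this file; the
 * helper name and mode are invented for illustration.  Callers must check
 * the result with IS_ERR().
 */
static inline struct inode *example_new_unnamed_file(struct fs *fs,
                                                     struct super_block *sb,
                                                     struct inode *dir)
{
        /* inum == 0 asks lafs_new_inode() to pick a free number itself.
         * The new inode is registered as an orphan, so if no name is ever
         * linked to it, normal orphan handling will clean it up again.
         */
        return lafs_new_inode(fs, sb, dir, TypeFile, 0, S_IFREG | 0644, NULL);
}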
1618
1619 static int inode_map_free(struct fs *fs, struct super_block *sb, u32 inum)
1620 {
1621         struct inode *im = lafs_iget(sb, 1, SYNC);
1622         int bit;
1623         unsigned long *buf;
1624         struct datablock *b;
1625         u32 bnum;
1626         int err;
1627
1628         mutex_lock_nested(&im->i_mutex, I_MUTEX_QUOTA);
1629
1630         bnum = inum >> (3 + sb->s_blocksize_bits);
1631         bit = inum - (bnum << (3 + sb->s_blocksize_bits));
1632         b = lafs_get_block(im, bnum, NULL, GFP_KERNEL, MKREF(inode_map_free));
1633         if (!b) {
1634                 mutex_unlock(&im->i_mutex);
1635                 iput(im);
1636                 return -ENOMEM;
1637         }
1638         err = lafs_read_block(b);
1639         if (err) {
1640                 putdref(b, MKREF(inode_map_free));
1641                 mutex_unlock(&im->i_mutex);
1642                 iput(im);
1643                 return err;
1644         }
1645         lafs_iolock_written(&b->b);
1646         set_bit(B_PinPending, &b->b.flags);
1647         lafs_iounlock_block(&b->b);
1648 retry:
1649         lafs_checkpoint_lock(fs);
1650         err = lafs_pin_dblock(b, ReleaseSpace);
1651         if (err == -EAGAIN) {
1652                 lafs_checkpoint_unlock_wait(fs);
1653                 goto retry;
1654         }
1655         BUG_ON(err < 0);
1656         buf = map_dblock(b);
1657         generic___set_le_bit(bit, buf);
1658         unmap_dblock(b, buf);
1659         lafs_dirty_dblock(b);
1660         putdref(b, MKREF(inode_map_free));
1661         lafs_checkpoint_unlock(fs);
1662         mutex_unlock(&im->i_mutex);
1663         iput(im);
1664         return 0;
1665 }
1666
1667 int lafs_setattr(struct dentry *dentry, struct iattr *attr)
1668 {
1669         int err;
1670         struct inode *ino = dentry->d_inode;
1671         struct fs *fs = fs_from_inode(ino);
1672         struct datablock *db;
1673
1674         err = inode_change_ok(ino, attr);
1675         if (err)
1676                 return err;
1677         db = lafs_inode_dblock(ino, SYNC, MKREF(setattr));
1678         if (IS_ERR(db))
1679                 return PTR_ERR(db);
1680
1681         /* We don't need iolock_written here as we don't
1682          * actually change the inode block yet
1683          */
1684         lafs_iolock_block(&db->b);
1685         set_bit(B_PinPending, &db->b.flags);
1686         lafs_iounlock_block(&db->b);
1687
1688         /* FIXME quota stuff */
1689
1690 again:
1691         lafs_checkpoint_lock(fs);
1692         err = lafs_pin_dblock(db, ReleaseSpace);
1693         if (err == -EAGAIN) {
1694                 lafs_checkpoint_unlock_wait(fs);
1695                 goto again;
1696         }
1697         /* inode_setattr calls lafs_dirty_inode, which sets
1698          * I_Dirty so the dblock will get updated.
1699          */
1700         err = err ?: inode_setattr(ino, attr);
1701         if (!err)
1702                 lafs_dirty_dblock(db);
1703         clear_bit(B_PinPending, &db->b.flags);
1704         putdref(db, MKREF(setattr));
1705         lafs_checkpoint_unlock(fs);
1706
1707         return err;
1708 }
1709
1710 void lafs_truncate(struct inode *ino)
1711 {
1712         /* Want to truncate this file.
1713          * i_size has already been changed, and the address space
1714          * has been cleaned up.
1715          * So just start the background truncate
1716          */
1717         struct fs *fs = fs_from_inode(ino);
1718         struct datablock *db = lafs_inode_dblock(ino, SYNC, MKREF(trunc));
1719         loff_t trunc_block;
1720         DEFINE_WAIT(wq);
1721
1722         if (IS_ERR(db))
1723                 return;
1724
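        /* Round i_size up to whole blocks: trunc_block is the first block
         * index lying entirely beyond the new EOF; blocks from there up
         * are what the background truncate will remove.
         */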
1725         trunc_block = ((i_size_read(ino) + fs->blocksize - 1)
1726                        >> fs->blocksize_bits);
1727         /* We hold i_mutex, so regular orphan processing cannot
1728          * continue - we have to push it forward ourselves.
1729          */
1730         while (test_bit(I_Trunc, &LAFSI(ino)->iflags) &&
1731                LAFSI(ino)->trunc_next < trunc_block) {
1732                 prepare_to_wait(&fs->async_complete, &wq,
1733                                 TASK_UNINTERRUPTIBLE);
1734                 lafs_inode_handle_orphan(db);
1735                 if (test_bit(B_Orphan, &db->b.flags))
1736                         schedule();
1737         }
1738         finish_wait(&fs->async_complete, &wq);
1739
1740         /* There is nothing we can do about errors here.  The
1741          * most likely is ENOMEM, which is itself very unlikely.
1742          * If this doesn't get registered as an orphan, it may
1743          * have to wait until something else truncates it.
1744          */
1745         lafs_make_orphan(fs, db, NULL);
1746
1747         if (!test_and_set_bit(I_Trunc, &LAFSI(ino)->iflags))
1748                 igrab(ino);
1749         if (trunc_block == 0)
1750                 LAFSI(ino)->trunc_gen++;
1751         LAFSI(ino)->trunc_next = trunc_block;
1752         putdref(db, MKREF(trunc));
1753 }
1754
1755 const struct inode_operations lafs_special_ino_operations = {
1756         .setattr        = lafs_setattr,
1757         .truncate       = lafs_truncate,
1758 };