/*
 * fs/lafs/roll.c
 * Copyright (C) 2005-2009
 * Neil Brown <neilb@suse.de>
 * Released under the GPL, version 2
 *
 * 'rollforward'
 */

#include        "lafs.h"
#include        <linux/slab.h>

static int
roll_valid(struct fs *fs, struct cluster_head *ch, unsigned long long addr)
{
        /* return 1 if the cluster_head looks locally valid.
         * Don't check checksum as we may not have the whole head
         */
        if (memcmp(ch->idtag, "LaFSHead", 8) != 0)
                return 0;
        if (memcmp(fs->state->uuid, ch->uuid, 16) != 0)
                return 0;
        if (le64_to_cpu(ch->this_addr) != addr)
                return 0;
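        /* verify_type records how this head can later be confirmed:
         * judging by the second loop in roll_locate() below, VerifyNull
         * needs no confirmation, while VerifyNext and VerifyNext2 are
         * confirmed by the next one or two valid heads respectively.
         * Anything else is treated as corruption.
         */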
        switch (le16_to_cpu(ch->verify_type)) {
        case VerifyNull:
        case VerifyNext:
        case VerifyNext2:
                break;
        default:
                return 0;
        }
        if (ch->pad0 != 0)
                return 0;
        if (le16_to_cpu(ch->Clength) > fs->max_segment)
                return 0;
        return 1;
}

/*
 * roll_locate returns 0 if proper endpoints were found,
 * or -EIO if CheckpointStart and CheckpointEnd weren't found properly.
 * "next" will contain the address of the next cluster to be written to,
 * "last" the cluster before that, and "seq" the seq number for the next
 * cluster.
 */
static int
roll_locate(struct fs *fs, u64 start,
            u64 *next, u64 *lastp, u64 *seqp,
            int *maxp, struct page *p)
{
        struct cluster_head *ch;
        u64 this, prev, prev2, last;
        u64 seq = 0;
        int max = 0;
        int prevtype, prev2type;

        ch = (struct cluster_head *)page_address(p);

        this = start; prev = start;

        do {
                if (lafs_load_page(fs, p, this, 1) != 0) {
                        printk(KERN_ERR "LaFS: Could not read cluster %llu\n",
                               (unsigned long long) this);
                        return -EIO;
                }
                if (!roll_valid(fs, ch, this)) {
                        printk(KERN_ERR "LaFS: Bad cluster at %llu\n",
                               (unsigned long long) this);
                        return -EIO;
                }
                if (this == start) {
                        seq = le64_to_cpu(ch->seq);
                        if (!(ch->flags & CH_CheckpointStart)) {
                                printk(KERN_ERR "LaFS: Cluster at %llu not CheckpointStart!!\n",
                                       (unsigned long long)this);
                                return -EIO;
                        }
                } else if (seq != le64_to_cpu(ch->seq)) {
                        printk(KERN_ERR "LaFS: Cluster sequence bad at %llu: %llu->%llu\n",
                               (unsigned long long)this,
                               (unsigned long long)seq,
                               (unsigned long long)le64_to_cpu(ch->seq));
                        return -EIO;
                }

                if (this != start && le64_to_cpu(ch->prev_addr) != prev) {
                        printk(KERN_ERR "LaFS: Cluster Linkage error at %llu: %llu != %llu\n",
                               (unsigned long long)this,
                               (unsigned long long)le64_to_cpu(ch->prev_addr),
                               (unsigned long long)prev);
                        return -EIO;
                }
                if (!(ch->flags & CH_Checkpoint)) {
                        printk(KERN_ERR "LaFS: Cluster %llu not a Checkpoint cluster\n",
                               (unsigned long long)this);
                        return -EIO;
                }
                dprintk("Found seq %llu at %llu\n",
                        (unsigned long long)seq, (unsigned long long)this);
                if (le16_to_cpu(ch->Hlength) > max)
                        max = le16_to_cpu(ch->Hlength);
                prev = this;
                this = le64_to_cpu(ch->next_addr);
                seq++;
        } while (!(ch->flags & CH_CheckpointEnd));

        /* 'seq' is sequence number of 'this' */
        dprintk("CheckpointEnd found at %llu, seq %llu\n",
                (unsigned long long)prev,
                (unsigned long long)(seq-1));
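        /* At this point "prev" is the address of the CheckpointEnd
         * cluster itself; "this" and "seq" already refer to its
         * successor, which may never have been written.
         */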

        /* now we need to step forward a bit more carefully, as any
         * cluster we find now could easily be bad.
         * We keep:
         *   this - address of cluster we are now considering
         *   prev - address of previous cluster
         *   prevtype - verify type of previous cluster
         *   prev2 - address of cluster before prev
         *   prev2type - verify type of that cluster
         *   start - "next_addr" entry from last known-good cluster
         */

        last = prev;
        start = this;
        prev2 = prev;
        prevtype = prev2type = VerifyNull;

        while (1) {
                if (lafs_load_page(fs, p, this, 1) != 0)
                        break;
                if (!roll_valid(fs, ch, this))
                        break;
                if (le64_to_cpu(ch->prev_addr) != prev)
                        break;
                if (le64_to_cpu(ch->seq) != seq)
                        break;
                /* FIXME check checksum, and possibly VerifySum */
                /* this head looks valid, so we can possibly verify previous
                 * clusters
                 */
                if (le16_to_cpu(ch->Hlength) > max)
                        max = le16_to_cpu(ch->Hlength);

                if (prev2type == VerifyNext2) {
                        start = prev;
                        last = prev2;
                }
                if (prevtype == VerifyNext) {
                        start = this;
                        last = prev;
                }

                /* shift prev info back */
                prev2 = prev; prev2type = prevtype;
                prev = this; prevtype = le16_to_cpu(ch->verify_type);
                this = le64_to_cpu(ch->next_addr);
                if (prevtype == VerifyNull) {
                        start = this;
                        last = prev;
                }
                seq++;
        }

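        /* "start" is where writing must resume: the first cluster that
         * could not be confirmed.  It can only be "this", "prev" or
         * "prev2", so its sequence number is recovered from "seq"
         * below.
         */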
        dprintk("LaFS: Next address to write is %llu\n",
                (unsigned long long)start);
        *next = start;
        *lastp = last;
        if (start == this)
                *seqp = seq;
        else if (start == prev)
                *seqp = seq-1;
        else if (start == prev2)
                *seqp = seq-2;
        else
                BUG();
        *maxp = max;
        return 0;
}

static int __must_check
roll_mini(struct fs *fs, int fsnum, int inum, int trunc, int flg,
          u32 bnum, int offset, int len, char *data)
{
        struct inode *inode;
        struct lafs_inode *li;
        struct datablock *db;
        int err = 0;
        void *buf;

        dprintk("Roll Mini  %d/%d/%d/%lu/%d,%d\n",
                fsnum, inum, flg, (unsigned long) bnum,
                offset, len);

        /* The handling of miniblock updates is quite different for
         * different objects.
         *
         * inode-files: meta-data updates, including size, are allowed.
         *      index update and data update are not (data update must
         *      go through the file).  Implied creation requires
         *      orphan handling.
         * regular-files: just over-write data, possibly extending size
         * symlink,dev,pipe: as with reg-files
         * directory: add/remove entries.
         */

        if (flg)
                return 0; /* old stuff isn't interesting, or even possible */

        inode = lafs_iget_fs(fs, fsnum, inum, SYNC);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        li = LAFSI(inode);

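        /* Of the object types described above, only inode-file updates
         * are handled here so far; for everything else the switch falls
         * through and the inode is simply released again.
         */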
        switch (li->type) {
        case TypeInodeFile:

                BUG_ON(fsnum); /* FIXME should be more careful */
                lafs_iput_fs(inode);
                inode = lafs_iget_fs(fs, inum, bnum, SYNC);
                if (IS_ERR(inode)) {
                        err = PTR_ERR(inode);
                        if (err == -EIO && offset == 0) {
                                /* creating new inode */
                        }
                        return err;
                }
                db = lafs_inode_dblock(inode, SYNC, MKREF(roll));
                buf = map_dblock(db);
                /* FIXME do I sync the inode back to the datablock first? */
                memcpy(buf+offset, data, len);
                unmap_dblock(db, buf);
                err = lafs_import_inode(inode, db);
                /* We borrow the orphan list to keep a reference on
                 * this inode until all processing is finished,
                 * to make sure inodes that are about to get linked
                 * don't get deleted early.
                 */
                if (list_empty(&db->orphans)) {
                        list_add(&db->orphans, &fs->pending_orphans);
                        lafs_igrab_fs(inode);
                        getdref(db, MKREF(roll_orphan));
                }
                putdref(db, MKREF(roll));
                break;
        }
        lafs_iput_fs(inode);
        return err;
}

static int __must_check
roll_block(struct fs *fs, int fsnum, int inum, int trunc, int flg,
           u32 bnum, u64 baddr, int type, struct page *p)
{
        struct inode *inode;
        struct datablock *blk = NULL;
        struct lafs_inode *li;
        int err = 0;

        if (flg)
                return 0; /* "old" blocks aren't interesting */
        if (type == DescIndex)
                return 0; /* index blocks aren't interesting either */
        if (type == DescHole)
                return 0; /* FIXME should I punch a hole here? */

        dprintk("Roll Block %d/%d/%d/%lu/%llu\n",
                fsnum, inum, flg, (unsigned long) bnum,
                (unsigned long long)baddr);

        /* find/load the inode */
        inode = lafs_iget_fs(fs, fsnum, inum, SYNC);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        /* FIXME do I need to 'lock' the inode in any way? */

        /* check type */
        li = LAFSI(inode);

        dprintk("Got the inode, type %d %p size %llu\n", li->type,
                inode, (unsigned long long)inode->i_size);

        switch (li->type) {
                struct la_inode *lai;
                int mdsize;

        default: /* most filetypes are simply ignored */
                break;

        case TypeInodeFile:
                /* The only part of an inode that might be interesting
                 * is embedded data: all metadata changes get logged
                 * as miniblocks.
                 * Further, the data can only be interesting for
                 * non-directories, as directory updates are also logged
                 * as miniblocks.
                 * So if this is a depth==0 non-directory inode,
                 * treat the data as a miniblock update.
                 */
                err = lafs_load_page(fs, p, baddr, 1);
                dprintk("inode load page err %d\n", err);
                if (err)
                        break;
                lai = (struct la_inode *)page_address(p);
                mdsize = le16_to_cpu(lai->metadata_size);
                if (lai->filetype >= TypeBase &&
                    lai->filetype != TypeDir  &&
                    lai->depth == 0 &&
                    mdsize > 1 && mdsize < fs->blocksize) {
                        u64 sz = le64_to_cpu(lai->metadata[0].file.size);
                        if (sz <= fs->blocksize - mdsize)
                                err = roll_mini(fs, inum, bnum, -1, flg, 0, 0,
                                                (int)sz,
                                                page_address(p) + mdsize);
                }
                break;

        case TypeSegmentMap:
        case TypeQuota:
                /* These only get merged while in a checkpoint. */
                if (fs->qphase == fs->phase)
                        break;
                /* FALL THROUGH */
        case TypeFile:
        case TypeSymlink:
                /* merge into the file and possibly extend inode->i_size.
                 * Only extend the size if it was before this block;
                 * i.e. if the size pointed into the middle of this block,
                 * we don't extend it.
                 */
                dprintk("FILE type\n");
                err = -ENOMEM;
                blk = lafs_get_block(inode, bnum, NULL, GFP_KERNEL,
                                     MKREF(roll));
                if (!blk)
                        break;

                err = lafs_find_block(blk, ADOPT);
                if (err)
                        break;
                if (blk->b.physaddr == baddr)
                        /* already correctly indexed */
                        break;

                /* FIXME do I need to dirty the inode to flush
                 * this change into the datablock?
                 */
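                /* Note: "type" here is the number of valid bytes in
                 * this block (roll_one() passes a full blocksize for
                 * all but the final block of a descriptor), so the
                 * assignment below sets i_size to the end of the
                 * replayed data.
                 */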
                if (li->type >= TypeBase &&
                    inode->i_size <= ((loff_t)bnum << inode->i_blkbits))
                        inode->i_size = ((loff_t)bnum << inode->i_blkbits) + type;

                /* FIXME: we pretend this is a dirty, pinned block
                 * so the lower-level code doesn't get confused.
                 * Is this really the best approach?
                 * Do I need to release some space here?
                 */
                set_bit(B_PinPending, &blk->b.flags); /* Don't need iolock as no io yet */
                lafs_pin_dblock(blk, CleanSpace); /* cannot fail while ->rolled is clear */

                lafs_iolock_block(&blk->b);
                lafs_summary_update(fs, blk->b.inode, blk->b.physaddr, baddr,
                                    0, fs->phase, 1);
                blk->b.physaddr = baddr;
                lafs_dirty_iblock(blk->b.parent, 0);
                /* FIXME maybe set Writeback and unlock */
                if (lafs_add_block_address(fs, &blk->b) == 0)
                        /* FIXME if the table becomes full, we have a problem... */
                        LAFS_BUG(1, &blk->b);
                dprintk("Allocated block %lu to %llu\n",
                        (unsigned long)bnum, (unsigned long long)baddr);
                /* FIXME maybe clear Writeback instead */
                lafs_iounlock_block(&blk->b);

                clear_bit(B_PinPending, &blk->b.flags);
                /* If we had previously read this block for some reason,
                 * the contents are now invalid.  If they are dirty,
                 * we have a real problem as those changes cannot be saved.
                 */
                LAFS_BUG(test_bit(B_Dirty, &blk->b.flags), &blk->b);
                clear_bit(B_Valid, &blk->b.flags);

                break;
        }
        if (blk)
                putdref(blk, MKREF(roll));
        lafs_iput_fs(inode);
        dprintk("leaving with error %d\n", err);
        return err;
}

static int __must_check
roll_one(struct fs *fs, u64 *addrp, struct page *p, struct page *pg,
         int max)
{
        u64 addr = *addrp;
        struct cluster_head *ch = (struct cluster_head *)page_address(p);
        struct group_head *gh;
        struct descriptor *desc;
        int i;
        u64 baddr = addr;
        int err;
        int blocksize = fs->blocksize;

        /* we "know" buf is big enough */
        err = lafs_load_page(fs, p, addr, max/blocksize);
        if (err)
                return err;

        /* just minimal checks, as we have looked at this already */
        if (!roll_valid(fs, ch, addr))
                return -EIO;
        if (lafs_calc_cluster_csum(ch) != ch->checksum)
                return -EIO;
        *addrp = le64_to_cpu(ch->next_addr);

        if (le16_to_cpu(ch->Hlength) > max)
                return -EIO;

        baddr += (le16_to_cpu(ch->Hlength) + blocksize - 1) / blocksize;

        if (!(ch->flags & CH_Checkpoint))
                fs->qphase = fs->phase;

        gh = ch->groups;
        i = 0;
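        /* The cluster head is followed by a sequence of group_heads,
         * one per inode.  Each group_head holds block descriptors
         * and/or miniblock updates; the two are distinguished below by
         * the block_bytes field (values above DescMiniOffset, other
         * than DescIndex, are miniblocks).
         */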
        while (((char *)gh - (char *)ch) < le16_to_cpu(ch->Hlength)) {
                int j = 0;
                int inum = le32_to_cpu(gh->inum);
                int fsnum = le32_to_cpu(gh->fsnum);
                int trunc = le16_to_cpu(gh->truncatenum_and_flag) & 0x7fff;
                int flg   = le16_to_cpu(gh->truncatenum_and_flag) & 0x8000;

                desc = gh->u.desc;
                while (((char *)desc - (char *)gh) <
                       le16_to_cpu(gh->group_size_words)*4) {
                        if (le16_to_cpu(desc->block_bytes) <= DescMiniOffset ||
                            le16_to_cpu(desc->block_bytes) == DescIndex) {
                                u32 bnum = le32_to_cpu(desc->block_num);
                                int cnt = le16_to_cpu(desc->block_cnt);

                                if (le16_to_cpu(desc->block_bytes) == DescIndex
                                    && cnt != 1)
                                        return -EIO; /* FIXME is this the
                                                      * best response? */
                                /* FIXME range check count */
                                while (!err && cnt--) {
                                        err = roll_block(fs, fsnum, inum, trunc,
                                                         flg, bnum, baddr,
                                                         cnt == 0
                                                         ? le16_to_cpu(desc->block_bytes)
                                                         : blocksize,
                                                         pg);
                                        bnum++; baddr++;
                                }
                                /* FIXME allow for striping */
                                desc++;
                        } else {
                                struct miniblock *mb = (struct miniblock *)desc;
                                u32 bnum = le32_to_cpu(mb->block_num);
                                int offset = le16_to_cpu(mb->block_offset);
                                int len = le16_to_cpu(mb->length)
                                        - DescMiniOffset;
                                err = roll_mini(fs, fsnum, inum, trunc, flg,
                                                bnum, offset, len, (char *)(mb+1));

                                mb++;
                                mb = (struct miniblock *)(((char *)mb)
                                                          + ROUND_UP(len));
                                desc = (struct descriptor *)mb;
                        }
                        j++;
                        if (err)
                                break;
                }
                gh = (struct group_head *)desc;
                i++;
                if (err)
                        break;
        }
        if (ch->flags & CH_CheckpointEnd)
                fs->qphase = fs->phase;
        return err;
}

static int roll_forward(struct fs *fs)
{
        u64 first, next = 0, last = 0, seq = 0;
        int max = 0;
        struct page *p, *pg;
        int err;
        int blocksize = fs->blocksize;
        int dev;
        u32 seg;
        u32 offset;

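        /* Replay starts in phase 1 with qphase 0.  roll_one() lifts
         * qphase up to phase once the tail of the checkpoint has been
         * passed; the loop below then drops "checkpointing" and applies
         * the deferred segment updates.
         */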
        fs->phase = 1;
        fs->qphase = 0;
        fs->checkpointing = CH_Checkpoint;
        clear_bit(DelayYouth, &fs->fsstate);

        first = fs->checkpointcluster;
        p = alloc_page(GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        err = roll_locate(fs, first, &next, &last, &seq, &max, p);

        max = ((max + blocksize - 1) / blocksize) * blocksize;

        if (!err && max > PAGE_SIZE)
                err = -EFBIG;
        if (err) {
                put_page(p);
                return err;
        }

        pg = alloc_page(GFP_KERNEL);
        if (!pg) {
                put_page(p);
                return -ENOMEM;
        }

        err = lafs_cluster_init(fs, 0, next, last, seq);
        if (err) {
                put_page(p);
                put_page(pg);
                return err;
        }
        lafs_cluster_init(fs, 1, 0, 0, 0);

        virttoseg(fs, first, &dev, &seg, &offset);

        while (first != next) {
                int dev2;
                u32 seg2;

                virttoseg(fs, first, &dev2, &seg2, &offset);
                err = roll_one(fs, &first, p, pg, max);
                if (err)
                        break;

                if (fs->qphase == fs->phase &&
                    fs->checkpointing) {
                        fs->checkpointing = 0;
                        clear_bit(DelayYouth, &fs->fsstate);
                        lafs_seg_apply_all(fs);
                }

                if (dev2 != dev || seg2 != seg) {
                        /* New segment - need to make sure youth is correct */
                        dev = dev2;
                        seg = seg2;
                        /* if fs->checkpointing, seg_apply_all will do the youth
                         * update
                         */
                        if (fs->checkpointing == 0)
                                lafs_update_youth(fs, dev, seg);
                }
        }
        put_page(p);
        put_page(pg);

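        /* Writing resumes at "next"; make that segment active */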
        lafs_add_active(fs, next);

        /* Now we release all the nlink==0 inodes that we found */
        while (!list_empty(&fs->pending_orphans)) {
                struct datablock *db = list_entry(fs->pending_orphans.next,
                                                  struct datablock,
                                                  orphans);
                list_del_init(&db->orphans);
                lafs_iput_fs(db->my_inode);
                putdref(db, MKREF(roll_orphan));
        }
        fs->rolled = 1;
        return err;
}

int
lafs_mount(struct fs *fs)
{
        struct datablock *b;
        struct inode *root;
        struct inode *rootdir;
        struct dentry *de;
        int err;
        int d;
        struct sb_key *k = fs->prime_sb->s_fs_info;
        int orphan_count;

        fs->rolled = 0;
        fs->ss[0].root = root = iget_locked(fs->prime_sb, 0);
        k->root = root;

        err = -ENOMEM;
        b = lafs_get_block(root, 0, NULL, GFP_KERNEL, MKREF(mount));
        if (!b)
                goto err;
        set_bit(B_Root, &b->b.flags);
        b->b.physaddr = fs->ss[0].root_addr;
        set_bit(B_PhysValid, &b->b.flags);
        err = lafs_load_block(&b->b, NULL);
        if (err)
                goto err;
        err = lafs_wait_block(&b->b);
        if (err)
                goto err;

        err = lafs_import_inode(root, b);
        if (err)
                goto err;
        putdref(b, MKREF(mount));
        b = NULL;

        unlock_new_inode(root);
        /* FIXME lots of error checking */

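        /* inode 2 is the root directory; graft it in as s_root */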
        rootdir = lafs_iget(fs->prime_sb, 2, SYNC);
        err = PTR_ERR(rootdir);
        if (IS_ERR(rootdir))
                goto err;
        de = d_alloc_root(rootdir);
        err = PTR_ERR(de);
        if (IS_ERR(de))
                goto err;
        fs->prime_sb->s_root = de;

        fs->orphans = lafs_iget(fs->prime_sb, 8, SYNC);
        for (d = 0; d < fs->devices ; d++) {
                fs->devs[d].segsum = lafs_iget(fs->prime_sb,
                                               fs->devs[d].usage_inum,
                                               SYNC);
                /* FIXME check this is a segusage file */
        }
        orphan_count = lafs_count_orphans(fs->orphans);
        LAFSI(fs->orphans)->md.orphan.nextfree = orphan_count;

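        /* Orphans recorded before the crash are counted now, but are
         * only processed (lafs_add_orphans() below) once roll_forward()
         * has brought the filesystem up to date.
         */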
        lafs_checkpoint_lock(fs);
        err = roll_forward(fs);
        lafs_checkpoint_unlock(fs);

        lafs_add_orphans(fs, fs->orphans, orphan_count);

        for (d = 0; d < 4; d++) {
                fs->cleaner.seg[d].chead = alloc_page(GFP_KERNEL);
                INIT_LIST_HEAD(&fs->cleaner.seg[d].cleaning);
        }
        return err;

err:
        putdref(b, MKREF(mount));
        return err;
}